#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import csv
def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # HTML parsers further down the pipeline.
    response.raise_for_status()
    return response.text
def get_total_pages(html):
    """Return the number of result pages advertised by a listing page.

    The count is read from the ``href`` of the last ``a.pagination-page``
    link inside ``div.pagination-pages``: the text between the first ``=``
    and the following ``&`` (or the end of the string) is parsed as an int.

    Args:
        html: Markup of a listing page.

    Returns:
        The page number found in the last pagination link, or ``1`` when
        the page has no pagination block or no pagination links.

    Raises:
        AttributeError, IndexError, ValueError: If the last pagination link
            has no ``href`` or the ``href`` does not contain ``=<number>``.
    """
    soup = BeautifulSoup(html, 'lxml')

    pagination = soup.find('div', class_='pagination-pages')
    if pagination is None:
        # No pagination block at all: treat the listing as a single page
        # rather than crashing on ``None.find_all``.
        return 1

    links = pagination.find_all('a', class_='pagination-page')
    if not links:
        return 1

    last_href = links[-1].get('href')
    total_pages = last_href.split('=')[1].split('&')[0]
    return int(total_pages)
def write_csv(date):
    """Append one ad record to ``avito.csv`` in the current directory.

    Args:
        date: Mapping with the keys ``'title'``, ``'price'``, ``'metro'``
            and ``'url'``; their values are written as one CSV row in
            that order.

    Raises:
        KeyError: If one of the four keys is missing.
        OSError: If the file cannot be opened or written.
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # Windows output gets an extra blank line after every row); the explicit
    # encoding keeps non-ASCII text writable regardless of the locale.
    with open('avito.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((
            date['title'],
            date['price'],
            date['metro'],
            date['url'],
        ))
def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or '' if *tag* is None."""
    return tag.text.strip() if tag is not None else ''


def get_page_date(html):
    """Record every ad on a listing page whose title mentions 'htc'.

    Each ``div.item_table`` inside ``div.catalog-list`` is inspected. When
    the lower-cased text of its ``h3`` heading contains ``'htc'``, the
    title, price, metro and URL are collected and passed to ``write_csv``.
    A field whose element is missing is written as an empty string.

    Args:
        html: Markup of a listing page.

    Returns:
        None. Matching ads are written out via ``write_csv``.
    """
    soup = BeautifulSoup(html, 'lxml')

    catalog = soup.find('div', class_='catalog-list')
    if catalog is None:
        # No listing container on this page: nothing to record.
        return

    for ad in catalog.find_all('div', class_='item_table'):
        description = ad.find('div', class_='description')
        heading = description.find('h3') if description is not None else None
        if heading is None:
            # Without a heading there is no title to match against 'htc';
            # skip the ad instead of aborting the whole page.
            continue

        title = heading.text.strip()
        if 'htc' not in title.lower():
            continue

        link = heading.find('a')
        href = link.get('href') if link is not None else None
        if href is not None:
            url = 'https://w...content-available-to-author-only...o.ru/' + href
        else:
            url = ''

        price = _stripped_text(ad.find('div', class_='about'))

        data_div = ad.find('div', class_='data')
        metro = _stripped_text(data_div.find('p') if data_div is not None else None)

        date = {
            'title': title,
            'price': price,
            'metro': metro,
            'url': url,
        }
        write_csv(date)
def main():
    """Scrape every page of the HTC phone listing.

    Reads the page count from the first listing page, then fetches each
    numbered page and hands its HTML to ``get_page_date``.
    """
    url = 'https://w...content-available-to-author-only...o.ru/moskva/telefony/htc'
    base_url = 'https://w...content-available-to-author-only...o.ru/moskva/telefony/htc?p='

    total_pages = get_total_pages(get_html(url))

    # range() excludes its upper bound, so add 1 to include the last page.
    for i in range(1, total_pages + 1):
        url_gen = base_url + str(i)
        html = get_html(url_gen)
        get_page_date(html)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
IyEvdXNyL2Jpbi9lbnYgcHl0aG9uMwoKaW1wb3J0IHJlcXVlc3RzCmZyb20gYnM0IGltcG9ydCBCZWF1dGlmdWxTb3VwCmltcG9ydCBjc3YKCmRlZiBnZXRfaHRtbCh1cmwpOgogICAgciA9IHJlcXVlc3RzLmdldCh1cmwpCiAgICByZXR1cm4gci50ZXh0CgpkZWYgZ2V0X3RvdGFsX3BhZ2VzKGh0bWwpOgogICAgc291cCA9IEJlYXV0aWZ1bFNvdXAoaHRtbCwgJ2x4bWwnKQogICAgcGFnZXMgPSBzb3VwLmZpbmQoJ2RpdicsIGNsYXNzXz0ncGFnaW5hdGlvbi1wYWdlcycpLmZpbmRfYWxsKCdhJywgY2xhc3NfPSdwYWdpbmF0aW9uLXBhZ2UnKVstMV0uZ2V0KCdocmVmJykKICAgIHRvdGFsX3BhZ2VzID0gcGFnZXMuc3BsaXQoJz0nKVsxXS5zcGxpdCgnJicpWzBdCiAgICByZXR1cm4gaW50KHRvdGFsX3BhZ2VzKQoKZGVmIHdyaXRlX2NzdihkYXRlKToKICAgIHdpdGggb3BlbignYXZpdG8uY3N2JywgJ2EnKSBhcyBmOgogICAgICAgIHdyaXRlciA9IGNzdi53cml0ZXIoZikKICAgICAgICB3cml0ZXIud3JpdGVyb3coKGRhdGVbJ3RpdGxlJ10sCiAgICAgICAgICAgICAgICAgICAgZGF0ZVsncHJpY2UnXSwKICAgICAgICAgICAgICAgICAgICBkYXRlWydtZXRybyddLAogICAgICAgICAgICAgICAgICAgIGRhdGVbJ3VybCddKSkKCmRlZiBnZXRfcGFnZV9kYXRlKGh0bWwpOgogICAgc291cCA9IEJlYXV0aWZ1bFNvdXAoaHRtbCwgJ2x4bWwnKQogICAgYWRzID0gc291cC5maW5kKCdkaXYnLCBjbGFzc189J2NhdGFsb2ctbGlzdCcpLmZpbmRfYWxsKCdkaXYnLCBjbGFzc189J2l0ZW1fdGFibGUnKQogICAgZm9yIGFkIGluIGFkczoKICAgICAgICBuYW1lID0gYWQuZmluZCgnZGl2JywgY2xhc3NfPSdkZXNjcmlwdGlvbicpLmZpbmQoJ2gzJykudGV4dC5zdHJpcCgpLmxvd2VyKCkKICAgICAgICBpZiAnaHRjJyBpbiBuYW1lOgogICAgICAgICAgICB0cnk6CiAgICAgICAgICAgICAgICB0aXRsZSA9IGFkLmZpbmQoJ2RpdicsIGNsYXNzXz0nZGVzY3JpcHRpb24nKS5maW5kKCdoMycpLnRleHQuc3RyaXAoKQogICAgICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgICAgICB0aXRsZT0gJycKICAgICAgICAgICAgdHJ5OgogICAgICAgICAgICAgICAgdXJsID0gJ2h0dHBzOi8vdy4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4uby5ydS8nICsgYWQuZmluZCgnZGl2JywgY2xhc3NfPSdkZXNjcmlwdGlvbicpLmZpbmQoJ2gzJykuZmluZCgnYScpLmdldCgnaHJlZicpCiAgICAgICAgICAgIGV4Y2VwdDoKICAgICAgICAgICAgICAgIHVybCA9ICcnCiAgICAgICAgICAgIHRyeToKICAgICAgICAgICAgICAgIHByaWNlID0gYWQuZmluZCgnZGl2JywgY2xhc3NfPSdhYm91dCcpLnRleHQuc3RyaXAoKQogICAgICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgICAgICBwcmljZSA9ICcnCiAgICAgICAgICAgIHRyeToKICAgICAgICAgICAgICAgIG1ldHJvID0gYWQuZmluZCgnZGl2JywgY2xhc3NfPSdkYXRhJykuZmluZCgncCcpLnRleHQuc3Ry
aXAoKQogICAgICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgICAgICBtZXRybyA9ICcnCiAgICAgICAgICAgIGRhdGUgPSB7J3RpdGxlJzogdGl0bGUsCiAgICAgICAgICAgICAgICAgICAgJ3ByaWNlJzogcHJpY2UsCiAgICAgICAgICAgICAgICAgICAgJ21ldHJvJzogbWV0cm8sCiAgICAgICAgICAgICAgICAgICAgJ3VybCc6IHVybH0KICAgICAgICAgICAgd3JpdGVfY3N2KGRhdGUpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgY29udGludWUKCmRlZiBtYWluKCk6CiAgICB1cmwgPSAnaHR0cHM6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi5vLnJ1L21vc2t2YS90ZWxlZm9ueS9odGMnCiAgICBiYXNlX3VybCA9ICdodHRwczovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLm8ucnUvbW9za3ZhL3RlbGVmb255L2h0Yz9wPScKICAgIHRvdGFsX3BhZ2VzID0gZ2V0X3RvdGFsX3BhZ2VzKGdldF9odG1sKHVybCkpCiAgICBmb3IgaSBpbiByYW5nZSgxLCB0b3RhbF9wYWdlcyk6CiAgICAgICAgdXJsX2dlbiA9IGJhc2VfdXJsICsgc3RyKGkpCiAgICAgICAgaHRtbCA9IGdldF9odG1sKHVybF9nZW4pCiAgICAgICAgZ2V0X3BhZ2VfZGF0ZShodG1sKQoKaWYgX19uYW1lX18gPT0gJ19fbWFpbl9fJzoKICAgIG1haW4oKe+7vw==
compilation info
Traceback (most recent call last):
File "/usr/lib/python3.5/py_compile.py", line 125, in compile
_optimize=optimize)
File "<frozen importlib._bootstrap_external>", line 735, in source_to_code
File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
File "./prog.py", line 65
main()
^
SyntaxError: invalid character in identifier
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.5/py_compile.py", line 129, in compile
raise py_exc
py_compile.PyCompileError: File "./prog.py", line 65
main()
^
SyntaxError: invalid character in identifier
stdout