# -*- coding: utf-8 -*-
import time
import thread
import urllib2
import logging
import lxml.html
# Crawl configuration.  The two key strings are pipe-separated and are
# split into lists by the __main__ block before any worker runs.
good_key_in_url = 'profile|search|user'
bad_key_in_url = 'Hello|2ch|ororo'

timeout = 10          # per-request socket timeout, seconds
max_threads = 100     # dispatch limit for the main loop
first_url = 'http://content-available-to-author-only.ru'

# Shared mutable state, touched by worker threads (guarded by a mutex
# allocated in __main__).
out = []              # urls queued for fetching
received_page = []    # urls already fetched (crawl-loop protection)

# XPath expressions used when scanning a downloaded page.
path = {'path_to_all_links': ".//*/a/@href"}

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
def worker(myID, url):
    """Fetch *url*, log the outcome, and queue any interesting links.

    Runs as a thread body started by the main loop.  Shared state
    (``out``, ``received_page``, the logger) is protected with the
    module-level ``mutex`` allocated in ``__main__``.

    :param myID: numeric id assigned by the spawner (identification only).
    :param url: absolute URL to download and scan for ``<a href>`` links.
    """
    global out
    global received_page
    mutex.acquire()
    logging.info('%s | %s' % ('get', url))
    mutex.release()
    try:
        # Download the HTML page; ``timeout`` is the module socket timeout.
        page = urllib2.urlopen(url, None, timeout).read()
    except Exception as e:
        # Network/HTTP failure: log it and give up on this url.
        mutex.acquire()
        logging.error('%s | %s' % (url, e))
        mutex.release()
    else:
        # Remember the url so we do not crawl it again (loop escape).
        # FIX: the main thread reassigns received_page under the mutex,
        # so both the append and the list we filter against must be
        # taken under the same lock; the original accessed it unlocked.
        mutex.acquire()
        received_page.append(url)
        seen = list(received_page)
        mutex.release()
        # ETree object from HTML
        doc = lxml.html.document_fromstring(page)
        # Get all '<a href' links from page
        etree_links = doc.xpath(path['path_to_all_links'])
        if etree_links:
            # Sort links
            for link in etree_links:
                link = link.encode('utf-8')
                # Check to link loops: skip urls already fetched.
                if link not in seen:
                    # Checking for entry key in the url
                    for key in bad_key_in_url:
                        # Bad links: any blacklist key rejects the link.
                        if key in link:
                            break
                    else:
                        for key in good_key_in_url:
                            # Good links: first whitelist hit enqueues it.
                            if key in link:
                                mutex.acquire()
                                out.append(link)
                                mutex.release()
                                break
if __name__ == '__main__':
    # Turn the pipe-separated filter strings into lists once, up front.
    good_key_in_url = good_key_in_url.split('|')
    bad_key_in_url = bad_key_in_url.split('|')
    out_file = open('out.txt', 'w')
    mutex = thread.allocate_lock()
    logging.info('run')
    i = 0  # running id handed to each spawned worker
    out.append(first_url)
    while True:
        if out and len(out) <= max_threads:
            # Dispatch the next queued url to a fresh worker thread.
            i += 1
            mutex.acquire()
            url = out[0]
            del out[0]
            if url != first_url:
                out_file.write(url + '\n')
            # Cap memory: periodically forget the fetched-url history.
            if len(received_page) > 10000:
                received_page = []
                i = 0
            mutex.release()
            thread.start_new_thread(worker, (i, url))
        elif thread._count() < 1:
            # FIX: the original guarded this exit with the SAME condition
            # as the sleep branch above it, so it was unreachable and the
            # crawler could never terminate.  No queued urls and no live
            # workers means the crawl is finished.
            out_file.close()
            logging.info('exit')
            quit()
        else:
            # Workers are still running (or the queue is over the
            # dispatch limit) but nothing can be spawned right now:
            # wait instead of busy-spinning.
            time.sleep(5)