# -*- coding: utf-8 -*-

import time
import thread
import urllib2
import logging

import lxml.html

# --- Crawl settings ---------------------------------------------------------
# Page the crawl starts from.
first_url = 'http://content-available-to-author-only.ru'
# Passed to urllib2.urlopen() as its timeout argument.
timeout = 10
# Dispatch limit consulted by the main loop.
max_threads = 100
# '|'-separated substrings: a link is kept only if it contains none of the
# "bad" keys and at least one of the "good" keys.
good_key_in_url = 'profile|search|user'
bad_key_in_url = 'Hello|2ch|ororo'

# --- XPath expressions ------------------------------------------------------
path = dict(path_to_all_links=".//*/a/@href")

# --- State shared between the main loop and the workers ---------------------
# Links waiting to be fetched.
out = list()
# URLs that have already been downloaded.
received_page = list()

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def worker(myID, url):
    """Download ``url`` and queue the links on it that pass the key filter.

    The page is fetched with ``urllib2.urlopen`` (using the module-level
    ``timeout``), parsed with lxml, and every ``<a href>`` value is examined.
    A link is appended to the shared ``out`` list when it

    * is not already listed in ``received_page``,
    * contains none of the entries of ``bad_key_in_url``, and
    * contains at least one entry of ``good_key_in_url``.

    Links are queued exactly as they appear in the page; relative hrefs are
    not resolved against ``url``.

    Fetch and parse failures are logged and end the worker quietly; nothing
    is raised to the caller.

    Args:
        myID: Identifier supplied by the dispatcher. Not used by the worker.
        url: Address of the page to download.
    """
    with mutex:
        logging.info('%s | %s', 'get', url)

    try:
        # Get HTML page
        page = urllib2.urlopen(url, None, timeout).read()
    except Exception as e:
        # Thread boundary: report the failure instead of letting it escape.
        with mutex:
            logging.error('%s | %s', url, e)
        return

    # Remember the URL so that links pointing back to it are not queued again
    received_page.append(url)

    try:
        # ETree object from HTML
        doc = lxml.html.document_fromstring(page)
        # Get all '<a href' links from page
        hrefs = doc.xpath(path['path_to_all_links'])
    except Exception as e:
        # A page that cannot be parsed is reported the same way as a page
        # that cannot be fetched.
        with mutex:
            logging.error('%s | %s', url, e)
        return

    for link in hrefs:
        link = link.encode('utf-8')
        # Skip pages that were already downloaded (loop protection)
        if link in received_page:
            continue
        # Bad links
        if any(key in link for key in bad_key_in_url):
            continue
        # Good links
        if any(key in link for key in good_key_in_url):
            with mutex:
                out.append(link)

if __name__ == '__main__':
    # Crawler driver: starting from first_url, pop queued links one at a time,
    # record each of them in out.txt and hand it to a worker thread. Workers
    # push newly discovered links back onto ``out``. The script ends once the
    # queue is empty and no worker is running.
    good_key_in_url = good_key_in_url.split('|')
    bad_key_in_url = bad_key_in_url.split('|')
    # Module-level lock shared with worker()
    mutex = thread.allocate_lock()

    i = 0
    out.append(first_url)
    with open('out.txt', 'w') as out_file:
        logging.info('run')
        while True:
            # Throttle on the number of running workers, not on the queue
            # length: a long queue must not stop the dispatching.
            if out and thread._count() < max_threads:
                i += 1
                with mutex:
                    url = out.pop(0)
                    if url != first_url:
                        out_file.write(url + '\n')
                    # Keep the visited list from growing without bound
                    if len(received_page) > 10000:
                        received_page = []
                        i = 0
                thread.start_new_thread(worker, (i, url))
                # NOTE(review): presumably this gives a freshly started worker
                # time to show up in thread._count() before the exit check
                # below can run - confirm.
                if thread._count() < 1:
                    time.sleep(5)
            elif thread._count() < 1 and not out:
                # No worker is running and the queue is still empty after the
                # last one finished: the crawl is complete. ``out`` is read
                # after the thread count so links added by a worker that has
                # just exited are not lost.
                break
            else:
                # Workers are busy (or the thread limit is reached); wait a
                # little instead of spinning.
                time.sleep(0.1)
    logging.info('exit')