fork download
  1. # -*- coding: utf-8 -*-
  2.  
import logging
import thread
import time
import urllib2

import lxml.etree
import lxml.html
  9.  
  10. good_key_in_url = 'profile|search|user'
  11. bad_key_in_url = 'Hello|2ch|ororo'
  12. timeout = 10
  13. max_threads = 100
  14. first_url = 'http://content-available-to-author-only.ru'
  15.  
  16. out = []
  17. received_page = []
  18. path = {'path_to_all_links': ".//*/a/@href"}
  19. logging.basicConfig(
  20. level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  21.  
  22. def worker(myID, url):
  23. global out
  24. global received_page
  25.  
  26. mutex.acquire()
  27. logging.info('%s | %s' % ('get', url))
  28. mutex.release()
  29.  
  30. try:
  31. # Get HTML page
  32. page = urllib2.urlopen(url, None, timeout).read()
  33. except Exception as e:
  34. mutex.acquire()
  35. logging.error('%s | %s' % (url, e))
  36. mutex.release()
  37. else:
  38. # Save url for escape loop
  39. received_page.append(url)
  40. # ETree object form HTML
  41. doc = lxml.html.document_fromstring(page)
  42. # Get all '<a href' links from page
  43. etree_links = doc.xpath(path['path_to_all_links'])
  44. if etree_links:
  45. # Sort links
  46. for link in etree_links:
  47. link = link.encode('utf-8')
  48. # Check to link loops
  49. if link not in received_page:
  50. # Checking for entry key in the url
  51. for key in bad_key_in_url:
  52. # Bad links
  53. if key in link:
  54. break
  55. else:
  56. for key in good_key_in_url:
  57. # Good links
  58. if key in link:
  59. mutex.acquire()
  60. out.append(link)
  61. mutex.release()
  62. break
  63.  
  64. if __name__ == '__main__':
  65. good_key_in_url = good_key_in_url.split('|')
  66. bad_key_in_url = bad_key_in_url.split('|')
  67. out_file = open('out.txt', 'w')
  68. mutex = thread.allocate_lock()
  69. logging.info('run')
  70.  
  71. i = 0
  72. out.append(first_url)
  73. while True:
  74. if out and len(out) <= max_threads:
  75. i += 1
  76. mutex.acquire()
  77. url = out[0]
  78. del out[0]
  79. if url != first_url:
  80. out_file.write(url+'\n')
  81. if len(received_page) > 10000:
  82. received_page = []
  83. i = 0
  84. mutex.release()
  85. thread.start_new_thread(worker, (i, url))
  86. if thread._count() < 1:
  87. time.sleep(5)
  88. elif thread._count() < 1:
  89. out_file.close()
  90. logging.info('exit')
  91. quit()
Runtime error #stdin #stdout #stderr 0.24s 17120KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 68, in <module>
IOError: [Errno 13] Permission denied: 'out.txt'