# (paste-site UI residue, not part of the program: "fork download")
  1. # -*- coding: utf-8 -*-
  2.  
  3. '''Asynchronous checker get part url's from input file, where one url in string,
  4. get responses from server through http proxy and sort it in output files.
  5. '''
  6. import eventlet
  7. from erequests import AsyncRequest, AsyncRequestFactory
  8.  
  9. urls = []
  10. max_number_items_in_urls = 100 # max url in one cycle
  11. requests_in_time = 50 # specifies the number of requests
  12. # to make at a time
  13.  
  14. timeout_for_req = 10.001 # exception if no bytes have been received
  15. # on the underlying socket for timeout seconds
  16. url_for_proxy_tests = 'http://content-available-to-author-only.ru'
  17. proxies_file_name = 'proxy.txt'# 1 proxy in string http://109.202.19.26:8080
  18. input_file_name = 'input.txt'
  19. output_file_names = ['1xx.txt', '2xx.txt', '3xx.txt',
  20. '4xx.txt', '5xx.txt', '6xx.txt' 'error.txt']
  21. # Delete old if exist and create new output files
  22. output_files = dict(
  23. [(file_name, open(file_name, 'w')) for file_name in output_file_names])
  24. # Get http proxies from file
  25. proxy_list = [proxy.strip() for proxy in open(proxies_file_name)]
  26.  
  27.  
  28. class ProxyError(Exception):
  29. pass
  30.  
  31.  
  32. def simple_proxy_test(proxy, url=url_for_proxy_tests, timeout=timeout_for_req):
  33. '''Get url through proxy and return True if passed,
  34. or False if has exceptont on timeout, socket error etc.
  35. '''
  36. try:
  37. AsyncRequest.send(
  38. AsyncRequestFactory.head(
  39. url,
  40. proxies={'http': proxy, 'https': ''},
  41. timeout=timeout_for_req))
  42. except Exception:
  43. test_passed = False
  44. else:
  45. test_passed = True
  46. return test_passed
  47.  
  48. def get_proxy():
  49. '''The function get proxy form queue,
  50. check and if they do not work - excludes from the queue else returns
  51. and transferring it to the end queue.'''
  52. while proxy_list:
  53. proxy = proxy_list[0]
  54. if simple_proxy_test(proxy=proxy):
  55. proxy_list.append(proxy_list[0])
  56. del proxy_list[0]
  57. return proxy
  58. elif not simple_proxy_test(proxy=proxy, timeout=30.0):
  59. # Re-test proxy with big timeout and delete if test not passed
  60. del proxy_list[0]
  61. else:
  62. raise ProxyError, 'Run out proxy'
  63.  
  64. def imap(requests, size=10):
  65. """Concurrently converts a sequence of AsyncRequest objects to Responses.
  66. This function from erequests where I added url in exception object.
  67. """
  68.  
  69. pool = eventlet.GreenPool(size)
  70.  
  71. def send(r):
  72. try:
  73. return r.send()
  74. except Exception as e:
  75. # Add url in exception object because in message I only see proxy
  76. e.url = r.url
  77. return e
  78.  
  79. for r in pool.imap(send, requests):
  80. yield r
  81.  
  82. def create_request(urls, proxy={'http': ''}):
  83. # Prepare request and written response in file
  84. for req in imap(
  85. (AsyncRequestFactory.head(url, timeout=timeout_for_req, proxies=proxy)
  86. for url in urls), requests_in_time):
  87. try:
  88. req.status_code
  89. except AttributeError:
  90. # Write in error.txt url and exception
  91. print '%s | %s' % (req.url, req)
  92. output_files['error.txt'].write('%s | %s\n' % (req.url, req))
  93. else:
  94. # Write status and url in output files)
  95. print '%s | %s' % (req.status_code, req.url)
  96. file_name = str(req.status_code)[0] + 'xx.txt'
  97. output_files[
  98. file_name].write('%s | %s\n' % (req.status_code, req.url))
  99.  
  100. with open(input_file_name) as input_file:
  101. for url in input_file:
  102. # Get part urls from file
  103. if len(urls) < max_number_items_in_urls:
  104. url = url.strip()
  105. urls.append(url)
  106. else:
  107. # If run out proxy - read proxy.txt again
  108. try:
  109. proxy = { 'http': get_proxy(), 'https': '' }
  110. except ProxyError:
  111. print 'Run out proxy'
  112. proxy_list = [
  113. proxy.strip() for proxy in open(proxies_file_name)]
  114. proxy = { 'http': get_proxy(), 'https': '' }
  115. # Request and write result
  116. create_request(urls, proxy)
  117. urls = []
  118. else:
  119. # For remainder urls
  120. create_request(urls)
# --- Captured output of the original run on the paste site (not part of the program) ---
# Runtime error  #stdin #stdout #stderr  0.01s 7892KB
# stdin: standard input is empty
# stdout: standard output is empty
# stderr:
#   Traceback (most recent call last):
#     File "prog.py", line 6, in <module>
#   ImportError: No module named eventlet