# -*- coding: utf-8 -*-

'''Async checker that takes URLs from an input file (one URL per line),
builds an async queue in which each URL is paired with one proxy (one
item per queue entry), fetches responses from the servers through HTTP
proxies and sorts the results into output files.
'''
import logging
from eventlet import GreenPool
from erequests import AsyncRequestFactory
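
# Note: eventlet and erequests are third-party packages (both on PyPI) and
# must be installed before running, e.g. `pip install eventlet erequests`;
# the traceback at the end of this paste shows the ImportError raised
# when eventlet is missing.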

urls_and_proxies = []
requests_at_time = 1000  # number of requests to make at a time
                         # (open connections)
timeout_for_req = 10.001  # raise an exception if no bytes have been received
                          # on the underlying socket for this many seconds
timeout_for_proxy_test = 30.0  # 30 sec
url_for_proxy_tests = 'http://content-available-to-author-only.ru'
proxies_file_name = 'proxy.txt'  # one proxy per line, e.g. http://109.202.19.26:8080
input_file_name = 'input.txt'
output_file_names = ['1xx.txt', '2xx.txt', '3xx.txt',
                     '4xx.txt', '5xx.txt', 'error.txt']
logging.basicConfig(
    level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
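
# Example proxy.txt contents (hypothetical addresses, one proxy per line):
#   http://109.202.19.26:8080
#   http://203.0.113.5:3128
# Example input.txt contents (hypothetical URLs, one per line):
#   http://example.com/
#   http://example.org/page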

class ProxyError(Exception):
    pass


def test_proxies_from_file(proxies_file_name):
    '''Build a list of proxies from the file and check each one;
    proxies that do not work are dropped from both the list and the file.
    '''
    # Get HTTP proxies from the file
    logging.warning('test proxies')
    proxy_from_response = []
    proxy_checked = []
    proxy_from_file = [proxy.strip() for proxy in open(proxies_file_name)]
    requests = async_reqest([(url_for_proxy_tests, {'http': proxy})
                             for proxy in proxy_from_file],
                            timeout_for_req=timeout_for_proxy_test)
    # Keep only the proxies that produced a real response
    for response in requests:
        try:
            response.status_code
        except AttributeError:
            pass
        else:
            proxy_from_response.append(
                list(response.connection.proxy_manager.keys())[0])
    for from_response in proxy_from_response:
        for from_file in proxy_from_file:
            if from_response in from_file:
                proxy_checked.append(from_file)
                break
    # Write the good proxies back to the file, or raise if none are alive
    if not proxy_checked:
        raise ProxyError('Ran out of proxies')
    elif len(proxy_checked) != len(proxy_from_file):
        with open(proxies_file_name, 'w') as proxies_file:
            for proxy in proxy_checked:
                proxies_file.write(proxy + '\n')
    logging.warning('end test proxies')
    return proxy_checked

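# Hypothetical usage (assumes proxy.txt exists and at least one proxy is alive):
#   live_proxies = test_proxies_from_file('proxy.txt')
#   # live_proxies is now e.g. ['http://109.202.19.26:8080', ...]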

def imap(requests, size=10):
    '''Concurrently convert a sequence of AsyncRequest objects to Responses.
    This is the imap from erequests, modified to attach the URL to the
    exception object.
    '''
    pool = GreenPool(size)

    def send(r):
        try:
            return r.send()
        except Exception as e:
            # Attach the URL to the exception object, because the exception
            # message itself only shows the proxy
            e.url = r.url
            return e

    for r in pool.imap(send, requests):
        yield r

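# Note the contract: each yielded item is either a Response or an Exception
# carrying a .url attribute; consumers tell them apart by probing for
# status_code, as sort_responses_to_files does below.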

def sort_responses_to_files(requests):
    '''Take the first digit of the HTTP status code and write the response
    to the file whose name starts with that digit (e.g. 404 -> 4xx.txt);
    if the response is an exception object, write it to error.txt instead.
    '''
    logging.warning('get and sort responses')
    for response in requests:
        try:
            response.status_code
        except AttributeError:
            # An exception object: write the URL and the error to error.txt
            logging.warning('%s | %s' % (response.url, response))
            output_files['error.txt'].write(
                '%s | %s\n' % (response.url, response))
        else:
            # A real response: write the status and URL to the matching file
            logging.warning('%s | %s' % (response.status_code, response.url))
            file_name = str(response.status_code)[0] + 'xx.txt'
            output_files[file_name].write(
                '%s | %s\n' % (response.status_code, response.url))


def async_reqest(urls_and_proxies,
                 timeout_for_req=timeout_for_req,
                 number_of_reqests=requests_at_time):
    '''Receive a list whose items are (url, {'http': 'ip:port'}) pairs
    and return an async queue of requests.
    '''
    logging.warning('create async requests')
    requests = imap(
        (AsyncRequestFactory.head(
            url, timeout=timeout_for_req, proxies=proxy)
         for url, proxy in urls_and_proxies), number_of_reqests)
    return requests

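# Hypothetical usage (placeholder URL and proxy), a single queue item:
#   queue = async_reqest([('http://example.com/',
#                          {'http': 'http://203.0.113.5:3128'})])
#   for item in queue: ...  # each item is a Response or an Exception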

if __name__ == '__main__':
    # Truncate old output files if they exist and create new ones
    output_files = dict(
        [(file_name, open(file_name, 'w')) for file_name in output_file_names])
    # Get all proxies from the file and test them
    proxies_list = test_proxies_from_file(proxies_file_name)
    with open(input_file_name) as input_file:
        logging.warning('create list urls for async reqest')
        for url in input_file:
            # Pair each URL from the file with a proxy while one proxy
            # per URL is still available
            if not proxies_list:
                # Out of proxies: send the queued requests and write the results
                logging.warning('run queue')
                requests = async_reqest(urls_and_proxies)
                sort_responses_to_files(requests)
                urls_and_proxies = []
                # Reload all proxies from the file and test them again
                proxies_list = test_proxies_from_file(proxies_file_name)
                logging.warning('create list urls for async reqest')
            # Queue the current URL in both cases, so no URL is silently
            # dropped when the proxy list runs out
            proxy = {'http': proxies_list[0]}
            urls_and_proxies.append((url.strip(), proxy))
            del proxies_list[0]
        else:
            # For the remaining URLs (this for-else always runs after
            # the loop, since the loop has no break)
            logging.warning('run remainder queue')
            proxies_list = test_proxies_from_file(proxies_file_name)
            requests = async_reqest(urls_and_proxies)
            sort_responses_to_files(requests)
Runtime error (0.05s, 8816KB)
stderr:
Traceback (most recent call last):
  File "prog.py", line 9, in <module>
ImportError: No module named eventlet
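
The run fails on the import line before any of the code above executes,
because eventlet is not installed in the paste's sandbox. On a machine you
control, installing the two third-party dependencies should clear the
ImportError, e.g. `pip install eventlet erequests`; both libraries date
from the Python 2 era, so a Python 2 interpreter is the safest assumption.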