# -*- coding: utf-8 -*-

'''Async checker that takes URLs from an input file (one URL per line),
pairs each URL with one proxy to build an async queue (one item per pair),
fetches the responses through the HTTP proxies and sorts them into output
files.
'''
import logging
from eventlet import GreenPool
from erequests import AsyncRequestFactory

urls_and_proxies = []
requests_at_time = 1000  # number of requests to make at a time (open connections)
timeout_for_req = 10.001  # exception if no bytes have been received
                          # on the underlying socket for timeout seconds
timeout_for_proxy_test = 30.0  # 30 sec
url_for_proxy_tests = 'http://content-available-to-author-only.ru'
proxies_file_name = 'proxy.txt'  # one proxy per line, e.g. http://109.202.19.26:8080
bad_proxies_file_name = 'bad_proxy.txt'
input_file_name = 'input.txt'
output_file_names = ['1xx.txt', '2xx.txt', '3xx.txt',
                     '4xx.txt', '5xx.txt', 'error.txt']
logging.basicConfig(
    level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')

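# Illustrative sketch (not part of the original script): the expected layout of
# the two input files. proxy.txt holds one proxy per line and input.txt holds
# one URL per line; the sample values below are hypothetical placeholders.
#
#   proxy.txt:
#       http://109.202.19.26:8080
#       http://<proxy-ip>:<port>
#
#   input.txt:
#       http://example.com/page-to-check
#       http://example.com/another-page
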
class ProxyError(Exception):
    pass


def test_proxies_from_file(proxies_file_name):
    '''Build and return a list of proxies read from a file.
    Proxies that do not answer the test request are dropped from the list
    and written to the bad-proxies file.
    '''
    # Get http proxies from file
    logging.warning('test proxies')
    proxy_from_response = []
    proxy_checked = []
    proxy_from_file = [proxy.strip() for proxy in open(proxies_file_name)]
    requests = async_reqest([(url_for_proxy_tests, {'http': proxy})
                             for proxy in proxy_from_file],
                            timeout_for_req=timeout_for_proxy_test)
    # Keep only the proxies whose test request returned a response
    for response in requests:
        try:
            response.status_code
        except AttributeError:
            pass
        else:
            proxy_from_response.append(
                next(iter(response.connection.proxy_manager.keys())))
    for from_response in proxy_from_response:
        for from_file in proxy_from_file:
            if from_response in from_file:
                proxy_checked.append(from_file)
                break
    # Write the bad proxies to a file, or raise if no proxy is alive
    if not proxy_checked:
        raise ProxyError('Ran out of proxies')
    elif len(proxy_checked) != len(proxy_from_file):
        with open(bad_proxies_file_name, 'w') as proxies_file:
            for proxy in [bad_proxy for bad_proxy in proxy_from_file
                          if bad_proxy not in proxy_checked]:
                proxies_file.write(proxy + '\n')
    logging.warning('end test proxies')
    return proxy_checked

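# A minimal usage sketch (added for illustration, never called by the script):
# test_proxies_from_file() returns only the proxies that answered the test URL,
# writes the dead ones to bad_proxy.txt and raises ProxyError when none respond.
def _example_test_proxies():
    try:
        live_proxies = test_proxies_from_file(proxies_file_name)
    except ProxyError:
        logging.warning('no live proxies in %s', proxies_file_name)
    else:
        logging.warning('%d live proxies found', len(live_proxies))
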
def imap(requests, size=10):
    '''Concurrently convert a sequence of AsyncRequest objects to Responses.
    Taken from erequests; the only change is that the url is added to the
    exception object.
    '''

    pool = GreenPool(size)

    def send(r):
        try:
            return r.send()
        except Exception as e:
            # Add the url to the exception object, because the message
            # alone only shows the proxy
            e.url = r.url
            return e

    for r in pool.imap(send, requests):
        yield r

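# Illustration only (an added sketch, not part of the original script): because
# send() returns the exception instead of raising it, a consumer of imap() can
# tell a real response from a failure by checking for status_code, and a failed
# request still carries the offending URL in its .url attribute.
def _example_consume(async_requests):
    for item in imap(async_requests, size=10):
        if hasattr(item, 'status_code'):
            logging.warning('%s answered with %s', item.url, item.status_code)
        else:
            logging.warning('%s failed: %s', item.url, item)
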
def sort_responses_to_files(requests):
    '''Take the first digit of each HTTP status code and write the response
    to the output file whose name starts with that digit; responses that are
    exception objects go to error.txt.
    '''
    logging.warning('get and sort responses')
    for response in requests:
        try:
            response.status_code
        except AttributeError:
            # Write the url and the exception to error.txt
            logging.warning('%s | %s' % (response.url, response))
            output_files['error.txt'].write(
                '%s | %s\n' % (response.url, response))
        else:
            # Write the status and the url to the matching output file
            logging.warning('%s | %s' % (response.status_code, response.url))
            file_name = str(response.status_code)[0] + 'xx.txt'
            output_files[file_name].write(
                '%s | %s\n' % (response.status_code, response.url))

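# Worked example of the mapping above (illustrative): a 404 response gives
# str(404)[0] + 'xx.txt' == '4xx.txt', a 200 goes to '2xx.txt', and anything
# that failed during send() (an exception object) goes to 'error.txt'.
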
def async_reqest(urls_and_proxies,
                 timeout_for_req=timeout_for_req,
                 number_of_reqests=requests_at_time):
    '''Receive a list whose items are (url, {'http': 'ip:port'}) pairs
    and return an async queue of requests.
    '''
    logging.warning('create async requests')
    requests = imap(
        (AsyncRequestFactory.head(
            url, timeout=timeout_for_req, proxies=proxy)
         for url, proxy in urls_and_proxies), number_of_reqests)
    return requests

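# Hypothetical call, shown for illustration only (the URL and proxy below are
# placeholders, and the function is never called by the script): pair each URL
# with one proxy dict and hand the list to async_reqest(); iterating the result
# is what actually sends the HEAD requests.
def _example_async_reqest():
    pairs = [('http://example.com', {'http': 'http://109.202.19.26:8080'})]
    for response in async_reqest(pairs, timeout_for_req=5.0,
                                 number_of_reqests=10):
        logging.warning(getattr(response, 'status_code', response))
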
if __name__ == '__main__':
    # Delete the old output files if they exist and create new ones
    output_files = dict(
        [(file_name, open(file_name, 'w')) for file_name in output_file_names])
    # Get all proxies from the file and test them
    proxies_list = test_proxies_from_file(proxies_file_name)
    with open(input_file_name) as input_file:
        logging.warning('create list urls for async reqest')
        for url in input_file:
            # Add urls from the file to the queue while one proxy
            # is still available for each url
            if proxies_list:
                proxy = {'http': proxies_list[0]}
                url = url.strip()
                urls_and_proxies.append((url, proxy))
                del proxies_list[0]
            else:
                # Out of proxies - send the queued requests and write the results
                logging.warning('run queue')
                requests = async_reqest(urls_and_proxies)
                sort_responses_to_files(requests)
                urls_and_proxies = []
                # Reload all proxies from the file and test them again
                proxies_list = test_proxies_from_file(proxies_file_name)
                logging.warning('create list urls for async reqest')
                # Keep the current url: pair it with one of the fresh proxies
                urls_and_proxies.append((url.strip(), {'http': proxies_list[0]}))
                del proxies_list[0]
        else:
            # Send the remaining urls
            logging.warning('run remainder queue')
            requests = async_reqest(urls_and_proxies)
            sort_responses_to_files(requests)
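
# Example of the resulting output (hypothetical lines, added for illustration):
# each Nxx.txt file receives "status | url" lines and error.txt receives
# "url | exception" lines, for instance:
#   2xx.txt:   200 | http://example.com/page-to-check
#   error.txt: http://example.com/another-page | ConnectTimeout(...)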