# -*- coding: utf-8 -*-
'''Async checker: read URLs from an input file (one URL per line),
build an async queue pairing each URL with one proxy (one item per pair),
fetch responses from the servers through HTTP proxies,
and sort the results into output files.
'''
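# Expected inputs (a sketch of the assumed layout, not enforced by the script):
#   input.txt -- one URL per line
#   proxy.txt -- one HTTP proxy per line, e.g. http://109.202.19.26:8080
# Responses are sorted by status class into 1xx.txt ... 5xx.txt;
# failed requests and unrecognised statuses go to error.txt.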
import logging
from eventlet import GreenPool
from erequests import AsyncRequestFactory
urls_and_proxies = []
timeout_for_req = 10.001 # exception if no bytes have been received
# on the underlying socket for timeout seconds
timeout_for_proxy_test = 15.0 # 15 sec
url_for_proxy_tests = 'http://content-available-to-author-only.ru'
proxies_file_name = 'proxy.txt' # one proxy per line, e.g. http://109.202.19.26:8080
bad_proxies_file_name = 'bad_proxy.txt'
input_file_name = 'input.txt'
output_file_names = ['1xx.txt', '2xx.txt', '3xx.txt',
'4xx.txt', '5xx.txt', 'error.txt']
logging.basicConfig(
level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')


class ProxyError(Exception):
    pass


def test_proxies_from_file(proxies_file_name):
    '''Build and return the list of working proxies from a file.
    Proxies that fail the test request are dropped from the list
    and written to the bad-proxies file.
    '''
# Get http proxies from file
logging.warning('test proxies')
proxy_from_response = []
proxy_checked = []
    with open(proxies_file_name) as proxy_file:
        proxy_from_file = [proxy.strip() for proxy in proxy_file]
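    # Fire one concurrent test request per proxy at url_for_proxy_tests
    # (HEAD requests built and sent via async_reqest below)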
requests = async_reqest(
[(url_for_proxy_tests, { 'http': proxy }) for proxy in proxy_from_file],
number_of_reqests=len(proxy_from_file),
timeout_for_req=timeout_for_proxy_test)
# Get only good proxy
for response in requests:
try:
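            # Exception objects yielded by imap() have no status_code attribute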
response.status_code
except AttributeError:
pass
else:
            proxy_from_response.append(
                list(response.connection.proxy_manager.keys())[0])
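    # Map each proxy URL reported by the response back to its original line
    # in the proxies file (substring match)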
for from_response in proxy_from_response:
for from_file in proxy_from_file:
if from_response in from_file:
proxy_checked.append(from_file)
break
    # Write bad proxies to a file, or raise if no live proxies remain
    if not proxy_checked:
        raise ProxyError('Ran out of proxies')
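    # 'w' mode rewrites the bad-proxies file on every test pass, so it only
    # holds the failures from the most recent check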
    elif len(proxy_checked) != len(proxy_from_file):
        with open(bad_proxies_file_name, 'w') as proxies_file:
            for proxy in proxy_from_file:
                if proxy not in proxy_checked:
                    proxies_file.write(proxy + '\n')
logging.warning('end test proxies')
return proxy_checked


def imap(requests, size=10):
    '''Concurrently convert a sequence of AsyncRequest objects to Responses.
    Adapted from erequests, with the request URL attached to any exception
    raised while sending.
    '''
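    # size caps how many requests are in flight at once via eventlet's GreenPool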
pool = GreenPool(size)
def send(r):
try:
return r.send()
except Exception as e:
            # Attach the URL to the exception; its message only shows the proxy
e.url = r.url
return e
for r in pool.imap(send, requests):
yield r


def sort_responses_to_files(requests):
    '''Take the first digit of each HTTP status code and append the response
    to the output file whose name starts with that digit; if the response is
    an exception object, write it to error.txt instead.
    '''
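    # e.g. a 404 response is appended to '4xx.txt'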
logging.warning('get and sort responses')
for response in requests:
try:
response.status_code
except AttributeError:
            # Write the URL and the exception to error.txt
            logging.warning('%s | %s', response.url, response)
            output_files['error.txt'].write(
                '%s | %s\n' % (response.url, response))
else:
            # Write the status code and URL to the matching output file
            logging.warning('%s | %s', response.status_code, response.url)
            file_name = str(response.status_code)[0] + 'xx.txt'
            if file_name in output_files:
                output_files[file_name].write(
                    '%s | %s\n' % (response.status_code, response.url))
            else:
                output_files['error.txt'].write(
                    '%s | %s\n' % (response.url, response))


def async_reqest(urls_and_proxies,
                 number_of_reqests,
                 timeout_for_req=timeout_for_req):
    '''Receive a list whose items are (url, {'http': 'ip:port'}) pairs
    and return an async queue of requests.
    '''
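    # Example item: ('http://example.com', { 'http': 'http://109.202.19.26:8080' })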
logging.warning('create async requests')
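    # AsyncRequestFactory.head only builds the HEAD requests here; they are
    # actually sent as the generator returned by imap() is iterated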
requests = imap(
(AsyncRequestFactory.head(
url, timeout=timeout_for_req, proxies=proxy)
for url, proxy in urls_and_proxies), number_of_reqests)
return requests


if __name__ == '__main__':
    # Truncate any existing output files and create new ones
    output_files = {file_name: open(file_name, 'w')
                    for file_name in output_file_names}
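    # output_files is read as a global by sort_responses_to_files(),
    # so it has to exist before any responses are processed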
# Get and test all proxies from file
proxies_list = test_proxies_from_file(proxies_file_name)
with open(input_file_name) as input_file:
        logging.warning('create list urls for async request')
for url in input_file:
            # Queue URLs from the file while a proxy is available for each one
if proxies_list:
proxy = { 'http': proxies_list[0]}
url = url.strip()
urls_and_proxies.append((url, proxy))
del proxies_list[0]
            else:
                # Out of proxies: send the queued requests and write the results
                logging.warning('run queue')
                requests = async_reqest(urls_and_proxies,
                                        number_of_reqests=len(urls_and_proxies))
                sort_responses_to_files(requests)
                urls_and_proxies = []
                # Re-read and re-test all proxies from the file, then queue the
                # current URL so it is not skipped
                proxies_list = test_proxies_from_file(proxies_file_name)
                urls_and_proxies.append(
                    (url.strip(), { 'http': proxies_list[0] }))
                del proxies_list[0]
                logging.warning('create list urls for async request')
        # Send whatever is still queued once the input file is exhausted
        logging.warning('run remainder queue')
        requests = async_reqest(urls_and_proxies,
                                number_of_reqests=len(urls_and_proxies))
        sort_responses_to_files(requests)