# -*- coding: utf-8 -*-
'''Asynchronous checker: reads URLs from an input file (one URL per line),
fetches them through HTTP proxies and sorts the responses into output
files by status code.
'''
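# Assumed plain-text layout of the two input files (example values are hypothetical):
#   input.txt -> one url per line,   e.g. http://example.com/page
#   proxy.txt -> one proxy per line, e.g. http://109.202.19.26:8080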
import eventlet
from erequests import AsyncRequest, AsyncRequestFactory
urls = []
max_number_items_in_urls = 100  # max urls processed in one batch
requests_in_time = 50 # specifies the number of requests
# to make at a time
timeout_for_req = 10.001 # exception if no bytes have been received
# on the underlying socket for timeout seconds
url_for_proxy_tests = 'http://content-available-to-author-only.ru'
proxies_file_name = 'proxy.txt'  # one proxy per line, e.g. http://109.202.19.26:8080
input_file_name = 'input.txt'
output_file_names = ['1xx.txt', '2xx.txt', '3xx.txt',
                     '4xx.txt', '5xx.txt', '6xx.txt', 'error.txt']
# Create the output files, truncating old ones if they exist
output_files = dict(
[(file_name, open(file_name, 'w')) for file_name in output_file_names])
# Get http proxies from file
proxy_list = [proxy.strip() for proxy in open(proxies_file_name)]
class ProxyError(Exception):
pass
def simple_proxy_test(proxy, url=url_for_proxy_tests, timeout=timeout_for_req):
    '''Fetch url through proxy and return True if the request succeeds,
    or False if an exception is raised (timeout, socket error, etc.).
    '''
try:
AsyncRequest.send(
AsyncRequestFactory.head(
url,
proxies={'http': proxy, 'https': ''},
                timeout=timeout))
except Exception:
test_passed = False
else:
test_passed = True
return test_passed
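# A quick sanity check of the helper above could look like this
# (commented out; the proxy address is a hypothetical example):
#   if simple_proxy_test('http://127.0.0.1:3128'):
#       print 'proxy works'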
def get_proxy():
    '''Take the proxy at the head of the queue and test it.
    A working proxy is moved to the end of the queue and returned;
    a proxy that also fails a re-test with a big timeout is dropped
    from the queue. Raises ProxyError when the queue is exhausted.'''
    while proxy_list:
        proxy = proxy_list[0]
        if simple_proxy_test(proxy=proxy):
            # Working proxy: rotate it to the end of the queue and return it
            proxy_list.append(proxy_list.pop(0))
            return proxy
        elif not simple_proxy_test(proxy=proxy, timeout=30.0):
            # Re-test with a big timeout and drop the proxy if that fails too
            del proxy_list[0]
        else:
            # The slow re-test passed: keep the proxy, rotate and return it
            proxy_list.append(proxy_list.pop(0))
            return proxy
    raise ProxyError('Ran out of proxies')
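# Queue rotation example (hypothetical addresses): with
#   proxy_list = ['http://10.0.0.1:3128', 'http://10.0.0.2:3128']
# a successful test of the first proxy returns it and leaves the queue as
#   ['http://10.0.0.2:3128', 'http://10.0.0.1:3128']
# so repeated calls cycle through the working proxies.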
def imap(requests, size=10):
"""Concurrently converts a sequence of AsyncRequest objects to Responses.
This function from erequests where I added url in exception object.
"""
pool = eventlet.GreenPool(size)
def send(r):
try:
return r.send()
except Exception as e:
            # Attach the url to the exception; its message only shows the proxy
e.url = r.url
return e
for r in pool.imap(send, requests):
yield r
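# Sketch of how imap is used below (commented out; the URLs are placeholders):
#   reqs = (AsyncRequestFactory.head(u, timeout=timeout_for_req)
#           for u in ['http://example.com', 'http://example.org'])
#   for resp in imap(reqs, size=2):
#       print getattr(resp, 'status_code', resp)  # Response or the raised exception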
def create_request(urls, proxy={'http': ''}):
    # Send a HEAD request for every url and write each result to an output file
for req in imap(
(AsyncRequestFactory.head(url, timeout=timeout_for_req, proxies=proxy)
for url in urls), requests_in_time):
        try:
            # req is either a Response or the exception raised for this url
            req.status_code
        except AttributeError:
            # Exception objects have no status_code: log the url and the error
            print '%s | %s' % (req.url, req)
            output_files['error.txt'].write('%s | %s\n' % (req.url, req))
        else:
            # Write the status code and url to the matching output file
            print '%s | %s' % (req.status_code, req.url)
            file_name = str(req.status_code)[0] + 'xx.txt'
            output_files[file_name].write(
                '%s | %s\n' % (req.status_code, req.url))
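# Each output line pairs the result with its url, e.g. (hypothetical values):
#   2xx.txt   -> 200 | http://example.com/
#   error.txt -> http://example.com/ | ConnectionError(...)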
with open(input_file_name) as input_file:
    for url in input_file:
        # Collect a batch of urls from the file
        urls.append(url.strip())
        if len(urls) >= max_number_items_in_urls:
            # If we have run out of proxies, read proxy.txt again
            try:
                proxy = {'http': get_proxy(), 'https': ''}
            except ProxyError:
                print 'Ran out of proxies, reloading %s' % proxies_file_name
                proxy_list = [
                    line.strip() for line in open(proxies_file_name)]
                proxy = {'http': get_proxy(), 'https': ''}
            # Request the batch and write the results
            create_request(urls, proxy)
            urls = []
    # Request the urls left over in the last, incomplete batch
    if urls:
        create_request(urls)