# your code goes here
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Print the first movie listed on a yearly-chart page and its weekend data.

    Downloads one page of the 2015 yearly chart, picks the first movie link
    on it, prints the link text and hands the movie's absolute URL to
    ``getRank``.  If the page contains no matching link, nothing is printed.

    Args:
        max_pages: Value placed in the ``page=`` query parameter.  Despite
            the name, exactly one page (this one) is fetched.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    site = 'http://w...content-available-to-author-only...o.com'
    url = (site + '/yearly/chart/?page=' + str(max_pages)
           + '&view=releasedate&view2=domestic&yr=2015&p=.htm')

    # requests has no default timeout; without one a stalled server would
    # block this call forever.
    response = requests.get(url, timeout=30)

    # NOTE: no parser is named here, so Beautiful Soup picks whichever one
    # is installed; the resulting tree may differ between machines.
    soup = BeautifulSoup(response.text)

    # The attribute value must be quoted: "/movies/?" is not a valid bare
    # CSS identifier, and soupsieve-backed Beautiful Soup releases reject
    # the unquoted form.  The quoted form is accepted by older releases too.
    movie_links = soup.select('td > b > font > a[href^="/movies/?"]')
    if not movie_links:
        return

    # Only the first movie on the page is processed.
    first_link = movie_links[0]
    movie_url = site + first_link.get('href')
    print(first_link.text)
    getRank(movie_url)
def getRank(item_url):
    """Fetch a movie's weekend page and print each table row, cell by cell.

    ``page=weekend&`` is inserted at the start of the URL's query string,
    the resulting page is downloaded, and for every row matched by
    ``.chart-wide tr`` except the first (treated as the header row) each
    cell is printed as ``<column name> : <cell text>``.

    Args:
        item_url: Absolute URL of a movie page; expected to contain a ``?``
            that starts its query string.

    Returns:
        None.  All results are written to standard output.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Split on the first "?" instead of a hard-coded character offset, so
    # the parameter lands at the start of the query string whatever the
    # length of the host name.
    base, _, query = item_url.partition('?')
    href = base + '?page=weekend&' + query

    # requests has no default timeout; without one a stalled server would
    # block this call forever.
    response = requests.get(href, timeout=30)
    print("%s for %s" % (response.status_code, href))
    soup = BeautifulSoup(response.content)  # or BeautifulSoup(response.content, "html5lib")

    # Column names, in table order; built once rather than once per row.
    headers = "Date Rank WeekendGross Change Theaters Change/Avg GrossToDate Week".split()

    # The first matched row is the header row, so skip it.
    for row in soup.select('.chart-wide tr')[1:]:
        # Iterate direct child *tags* only: ``row.children`` also yields the
        # whitespace text nodes between cells, which would shift every
        # value under the wrong column name.
        cells = row.find_all(True, recursive=False)
        for header, cell in zip(headers, cells):
            print("%s : %s" % (header, cell.text))

    # NOTE(review): the rank is hard-coded, not read from the table; this
    # looks like a placeholder and is kept so the output is unchanged.
    rank = 5
    print(rank)
# Script entry: process page 1 of the yearly chart.
spider(max_pages=1)
IyB5b3VyIGNvZGUgZ29lcyBoZXJlCgojIC0qLSBjb2Rpbmc6IHV0Zi04IC0qLQoKaW1wb3J0IHJlcXVlc3RzCmZyb20gYnM0IGltcG9ydCBCZWF1dGlmdWxTb3VwCgpkZWYgc3BpZGVyKG1heF9wYWdlcyk6CiAgICB1cmwgPSAnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLm8uY29tL3llYXJseS9jaGFydC8/cGFnZT0nICsgc3RyKG1heF9wYWdlcykgKyAnJnZpZXc9cmVsZWFzZWRhdGUmdmlldzI9ZG9tZXN0aWMmeXI9MjAxNSZwPS5odG0nCiAgICBzb3VyY2VfY29kZSA9IHJlcXVlc3RzLmdldCh1cmwpCiAgICBwbGFpbl90ZXh0ID0gc291cmNlX2NvZGUudGV4dAogICAgc291cCA9IEJlYXV0aWZ1bFNvdXAocGxhaW5fdGV4dCkKICAgIAogICAgZm9yIGxpbmsgaW4gc291cC5zZWxlY3QoJ3RkID4gYiA+IGZvbnQgPiBhW2hyZWZePS9tb3ZpZXMvP10nKToKICAgICAgICBocmVmID0gJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi5vLmNvbScgKyBsaW5rLmdldCgnaHJlZicpCiAgICAgICAgcHJpbnQgbGluay50ZXh0CiAgICAgICAgZ2V0UmFuayhocmVmKQogICAgICAgIGJyZWFrCgpkZWYgZ2V0UmFuayhpdGVtX3VybCk6CiAgICBocmVmID0gaXRlbV91cmxbOjM3XSsicGFnZT13ZWVrZW5kJiIgKyBpdGVtX3VybFszNzpdCiAgICByZXNwb25zZSA9IHJlcXVlc3RzLmdldChocmVmKQogICAgcHJpbnQgcmVzcG9uc2Uuc3RhdHVzX2NvZGUsICJmb3IiLCBocmVmCiAgICBzb3VwID0gQmVhdXRpZnVsU291cChyZXNwb25zZS5jb250ZW50KSAgIyBvciBCZWF1dGlmdWxTb3VwKHJlc3BvbnNlLmNvbnRlbnQsICJodG1sNWxpYiIpCiAgICAKICAgIHJvd3MgPSBzb3VwLnNlbGVjdCgnLmNoYXJ0LXdpZGUgdHInKQoKICAgIGhlYWRlcl9za2lwcGVkID0gRmFsc2UKICAgIGZvciByb3cgaW4gcm93czoKICAgICAgICBpZiBub3QgaGVhZGVyX3NraXBwZWQ6CiAgICAgICAgICAgIGhlYWRlcl9za2lwcGVkID0gVHJ1ZQogICAgICAgICAgICBjb250aW51ZQoKICAgICAgICBoZWFkZXJzID0gIkRhdGUgUmFuayBXZWVrZW5kR3Jvc3MgQ2hhbmdlIFRoZWF0ZXJzIENoYW5nZS9BdmcgR3Jvc3NUb0RhdGUgV2VlayIuc3BsaXQoKQoKICAgICAgICBmb3IgaGVhZGVyLCBjaGlsZCBpbiB6aXAoaGVhZGVycywgcm93LmNoaWxkcmVuKToKICAgICAgICAgICAgcHJpbnQgaGVhZGVyLCAiOiIsIGNoaWxkLnRleHQKCiAgICByYW5rID0gNQogICAgcHJpbnQgcmFuawoKc3BpZGVyKDEpCg==