#!/usr/bin/env python
import scraperwiki
import requests
from bs4 import BeautifulSoup
import string
# Scrape the "new" discussion listing page by page and store every post.
#
# Each page is fetched with requests, parsed with BeautifulSoup, and every
# element with class "post list" that has a score, a date and a usable link is
# saved through scraperwiki.sql.save (keyed on 'id').  The next page is
# requested with "?after=t3_<id of the last saved post>".  The loop stops when
# a page contains no posts or when the pagination cursor stops advancing.

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3: str is already the text type

BASE_URI = "http://l...content-available-to-author-only...g.com/r/discussion/new/"

# Column(s) that identify a row for scraperwiki.sql.save.
unique_keys = ['id']

# Every saved post as [points, title, posted_on, id], in scrape order.
allitems = []

# Cursor used for the previous "?after=" request; None before the first one.
last_cursor = None

uri = BASE_URI
html = requests.get(uri)
soup = BeautifulSoup(html.content)
items = soup.find_all(class_="post list")

while items:
    for item in items:
        # Guard clauses: skip posts that lack the markup we read from.
        if item is None or item.div is None or item.div.span is None:
            continue

        points = item.div.span.span
        itemdate = item.find(class_="date")
        if points is None or itemdate is None:
            continue

        # The original dereferenced item.h2.a unconditionally, so a single
        # post without a heading link aborted the whole scrape.
        link = item.h2.a if item.h2 is not None else None
        if link is None or not link.get('href'):
            continue

        # The post id is the fifth '/'-separated segment of the link target.
        href_parts = link['href'].split('/')
        if len(href_parts) < 5:
            continue
        post_id = href_parts[4]
        title = link.string

        allitems.append([points.string, title, itemdate.string, post_id])
        data = {
            'id': text_type(post_id),
            'title': text_type(title),
            'points': int(points.string),
            'posted_on': text_type(itemdate.string),
        }
        scraperwiki.sql.save(unique_keys, data)

    # Nothing has ever been saved, so there is no id to paginate from
    # (the original raised IndexError on allitems[-1] here).
    if not allitems:
        break

    # If the cursor did not move (no new post saved on this page, or the site
    # served the same page again) the next request would be identical to the
    # previous one and the loop would never terminate.
    cursor = allitems[-1][3]
    if cursor == last_cursor:
        break
    last_cursor = cursor

    uri = BASE_URI + "?after=t3_" + cursor
    html = requests.get(uri)
    soup = BeautifulSoup(html.content)
    items = soup.find_all(class_="post list")

    # Progress line: the URL just fetched and the running total of saved posts.
    print("%s %d" % (uri, len(allitems)))
IyEvdXNyL2Jpbi9lbnYgcHl0aG9uCgppbXBvcnQgc2NyYXBlcndpa2kKaW1wb3J0IHJlcXVlc3RzCmZyb20gYnM0IGltcG9ydCBCZWF1dGlmdWxTb3VwCmltcG9ydCBzdHJpbmcKCmFsbGl0ZW1zID0gW10KCnVyaSA9ICJodHRwOi8vbC4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4uZy5jb20vci9kaXNjdXNzaW9uL25ldy8iCmh0bWwgPSByZXF1ZXN0cy5nZXQodXJpKQpzb3VwID0gQmVhdXRpZnVsU291cChodG1sLmNvbnRlbnQpCml0ZW1zID0gc291cC5maW5kX2FsbChjbGFzc189InBvc3QgbGlzdCIpCgp3aGlsZSBsZW4oaXRlbXMpID4gMDoKICAgIGZvciBpdGVtIGluIGl0ZW1zOgogICAgICAgIAogICAgICAgIGlmIChpdGVtIGlzIG5vdCBOb25lKSBhbmQgKGl0ZW0uZGl2IGlzIG5vdCBOb25lKSBhbmQgKGl0ZW0uZGl2LnNwYW4gaXMgbm90IE5vbmUpOgogICAgICAgICAgICBwb2ludHMgPSBpdGVtLmRpdi5zcGFuLnNwYW4KICAgICAgICAgICAgaXRlbWRhdGUgPSBpdGVtLmZpbmQoY2xhc3NfPSJkYXRlIikKICAgICAgICAgICAgaWQgPSBzdHJpbmcuc3BsaXQoaXRlbS5oMi5hWydocmVmJ10sICcvJylbNF0KICAgICAgICAgICAgdGl0bGUgPSBpdGVtLmgyLmEuc3RyaW5nCiAgICAgICAgICAgIGlmIChwb2ludHMgaXMgbm90IE5vbmUpIGFuZCAoaXRlbWRhdGUgaXMgbm90IE5vbmUpOgogICAgICAgICAgICAgICAgYWxsaXRlbXMuYXBwZW5kKFtwb2ludHMuc3RyaW5nLCB0aXRsZSwgaXRlbWRhdGUuc3RyaW5nLCBpZF0pCiAgICAgICAgICAgICAgICB1bmlxdWVfa2V5cyA9IFsgJ2lkJyBdCiAgICAgICAgICAgICAgICBkYXRhID0geyAnaWQnOnVuaWNvZGUoaWQpLCAndGl0bGUnOnVuaWNvZGUodGl0bGUpLCAncG9pbnRzJzppbnQocG9pbnRzLnN0cmluZyksICdwb3N0ZWRfb24nOnVuaWNvZGUoaXRlbWRhdGUuc3RyaW5nKX0KICAgICAgICAgICAgICAgIHNjcmFwZXJ3aWtpLnNxbC5zYXZlKHVuaXF1ZV9rZXlzLCBkYXRhKQogICAgCiAgICB1cmkgPSAiaHR0cDovL2wuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLmcuY29tL3IvZGlzY3Vzc2lvbi9uZXcvP2FmdGVyPXQzXyIgKyBhbGxpdGVtc1stMV1bM10KICAgIGh0bWwgPSByZXF1ZXN0cy5nZXQodXJpKQogICAgc291cCA9IEJlYXV0aWZ1bFNvdXAoaHRtbC5jb250ZW50KQogICAgaXRlbXMgPSBzb3VwLmZpbmRfYWxsKGNsYXNzXz0icG9zdCBsaXN0IikKICAgIAogICAgcHJpbnQgdXJpLCBsZW4oYWxsaXRlbXMp