fork download
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3.  
  4. import codecs
  5. from datetime import datetime
  6. from optparse import OptionParser
  7. import re
  8. import time
  9. import urllib, urllib2
  10.  
  11. from BeautifulSoup import BeautifulSoup, NavigableString
  12.  
  13.  
  14. class Message:
  15. def __init__(self, thread_url, sender, recipient, timestamp, subject, content):
  16. self.thread_url = thread_url
  17. self.sender = sender
  18. self.recipient = recipient
  19. self.timestamp = timestamp
  20. self.subject = subject
  21. self.content = content
  22. def __str__(self):
  23. return """
  24. URL: %s
  25. From: %s
  26. To: %s
  27. Date: %s
  28. Subject: %s
  29. Content-Length: %d
  30.  
  31. %s
  32.  
  33. """ % ( self.thread_url,
  34. self.sender,
  35. self.recipient,
  36. self.timestamp,
  37. self.subject.strip(),
  38. len(self.content),
  39. self.content
  40. )
  41.  
  42.  
  43. class ArrowFetcher:
  44. base_url = 'http://w...content-available-to-author-only...d.com'
  45. sleep_duration = 3.0 # time to wait after each HTTP request
  46.  
  47. def __init__(self, username, password):
  48. self.username = username
  49. self.thread_urls = []
  50. opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
  51. urllib2.install_opener(opener)
  52. params = urllib.urlencode(dict(username=username, password=password))
  53. f = opener.open(self.base_url + '/login', params)
  54. f.close()
  55.  
  56. def _safely_soupify(self, f):
  57. f = f.partition("function autocoreError")[0] + '</body></html>' # wtf okc with the weirdly encoded "</scr' + 'ipt>'"-type statements in your javascript
  58. return(BeautifulSoup(f))
  59.  
  60. def _request_read_sleep(self, url):
  61. f = urllib2.urlopen(url).read()
  62. time.sleep(self.sleep_duration)
  63. return f
  64.  
  65. def queue_threads(self):
  66. self.thread_urls = []
  67. for folder in range(1,4): # Inbox, Sent, Smiles
  68. page = 0;
  69. while (True):
  70. print "queuing folder %s, page %s" % (folder, page)
  71. f = self._request_read_sleep(self.base_url + '/messages?folder=' + str(folder) + '&low=' + str((page * 30) + 1))
  72. soup = self._safely_soupify(f)
  73. end_pattern = re.compile('&folder=\d\';')
  74. threads = [
  75. re.sub(end_pattern, '', li.find('p')['onclick'].partition("window.location='")[2])
  76. for li in soup.find('ul', {'id': 'messages'}).findAll('li')
  77. ]
  78. if len(threads) == 0: # break out of the infinite loop when we reach the end and there are no threads on the page
  79. break
  80. else:
  81. self.thread_urls.extend(threads)
  82. page = page + 1
  83.  
  84. def dedupe_threads(self):
  85. print "removing duplicate URLs"
  86. self.thread_urls = list(set(self.thread_urls))
  87.  
  88. def fetch_threads(self):
  89. self.messages = []
  90. for thread_url in self.thread_urls:
  91. self.messages.extend(self._fetch_thread(thread_url))
  92.  
  93. def strptime(self, string, format='%b %d, %Y &ndash; %I:%M%p'):
  94. return datetime.strptime(string.strip(), format)
  95.  
  96. def write_messages(self, file_name):
  97. self.messages.sort(key = lambda message: message.timestamp) # sort by time
  98. f = codecs.open(file_name, encoding='utf-8', mode='w') # ugh, otherwise i think it will try to write ascii
  99. for message in self.messages:
  100. print "writing message for thread: " + message.thread_url
  101. f.write(unicode(message))
  102. f.close()
  103.  
  104. def _fetch_thread(self, thread_url):
  105. message_list = []
  106. print "fetching thread: " + self.base_url + thread_url
  107. f = self._request_read_sleep(self.base_url + thread_url)
  108. soup = self._safely_soupify(f)
  109. try:
  110. subject = soup.find('strong', {'id': 'message_heading'}).contents[0]
  111. except AttributeError:
  112. subject = ''
  113. try:
  114. other_user = soup.find('a', {'class': 'buddyname'}).contents[0]
  115. except AttributeError:
  116. try:
  117. # messages from OkCupid itself are a special case
  118. other_user = soup.find('ul', {'id': 'thread'}).find('p', 'signature').contents[0].partition('Message from ')[2]
  119. except AttributeError:
  120. other_user = ''
  121. for message in soup.find('ul', {'id': 'thread'}).findAll('li'):
  122. body_contents = message.find('div', 'message_body')
  123. if body_contents:
  124. body = self._strip_tags(body_contents.renderContents()).renderContents().strip()
  125. for pair in [ ('<br />', '\n'),
  126. ('&amp;', '&'),
  127. ('&lt;', '<'),
  128. ('&gt;', '>'),
  129. ('&quot;', '"'),
  130. ('&#39;', "'"),
  131. ('&mdash;', "—")]:
  132. body = body.replace(pair[0], pair[1])
  133. timestamp = message.find('span','timestamp')
  134. if timestamp.decodeContents and timestamp.decodeContents():
  135. timestamp = self.strptime(timestamp.decodeContents().strip())
  136. else:
  137. timestamp = self.strptime(timestamp.text.strip())
  138. sender = other_user
  139. recipient = self.username
  140. if message['class'].replace('preview', '').strip() == 'from_me':
  141. recipient = other_user
  142. sender = self.username
  143. message_list.append(Message(self.base_url + thread_url,
  144. unicode(sender),
  145. unicode(recipient),
  146. timestamp,
  147. unicode(subject),
  148. body.decode('utf-8')))
  149. else:
  150. continue # control elements are also <li>'s in their html, so non-messages
  151. return message_list
  152.  
  153. # http://stackoverflow.com/questions/1765848/remove-a-tag-using-beautifulsoup-but-keep-its-contents/1766002#1766002
  154. def _strip_tags(self, html, invalid_tags=['a', 'span', 'strong', 'div']):
  155. soup = BeautifulSoup(html)
  156. for tag in soup.findAll(True):
  157. if tag.name in invalid_tags:
  158. s = ""
  159. for c in tag.contents:
  160. if type(c) != NavigableString:
  161. c = self._strip_tags(unicode(c), invalid_tags)
  162. s += unicode(c).strip()
  163. else:
  164. s += unicode(c)
  165. tag.replaceWith(s)
  166. return soup
  167.  
  168.  
  169. def main():
  170. parser = OptionParser()
  171. parser.add_option("-u", "--username", dest="username",
  172. help="your OkCupid username")
  173. parser.add_option("-p", "--password", dest="password",
  174. help="your OkCupid password")
  175. parser.add_option("-f", "--filename", dest="filename",
  176. help="the file to which you want to write the data")
  177. (options, args) = parser.parse_args()
  178. if not options.username:
  179. print "Please specify your OkCupid username with either '-u' or '--username'"
  180. if not options.password:
  181. print "Please specify your OkCupid password with either '-p' or '--password'"
  182. if not options.filename:
  183. print "Please specify the destination file with either '-f' or '--filename'"
  184. if options.username and options.password and options.filename:
  185. arrow_fetcher = ArrowFetcher(options.username, options.password)
  186. arrow_fetcher.queue_threads()
  187. arrow_fetcher.dedupe_threads()
  188. arrow_fetcher.fetch_threads()
  189. arrow_fetcher.write_messages(options.filename)
  190.  
  191. if __name__ == '__main__':
  192. main()
  193.  
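# --- Run transcript captured by the paste site (not part of the program) ---
# Result: runtime error (0.02s, 5852KB)
# stdin:  okc_arrow_fetcher.py -u <username> -p <password> -f c:\users\Jon\output.txt
# stdout: (empty)
# NOTE: real account credentials appeared here in the original paste; they are
# redacted above and should be changed.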