fork download
  1. # -*- coding: utf-8 -*-
  2.  
  3. import codecs, re, time, tnttzinfo, gethttp, tntvnlang
  4. from datetime import datetime
  5. from pprint import pprint
  6.  
  7. VN_TZ = tnttzinfo.UTCTimeZone(+420, False, "GMT+07")
  8. VN_TZ_Y = tnttzinfo.UTCTimeZone(-17*60, False, "GMT-17")
  9.  
  10. def unescape_html(s):
  11. s = s.replace('>','>')
  12. s = s.replace('&lt;','<')
  13. s = s.replace('&quot;','"')
  14. s = s.replace('&#39;',"'")
  15. s = s.replace('&amp;','&')
  16. return s
  17.  
  18. def GetAllPosts(thread_id, page=1):
  19. ##Get page source
  20. url = 'http://f...content-available-to-author-only...n.com/showthread.php?%s/page%s' % (thread_id, page)
  21. c = gethttp.GetHttpSource(url).decode('utf8')
  22. if not c:
  23. yield None
  24. ##Get current & yesterday datetime
  25. currentdate = datetime.now(VN_TZ).strftime("%d-%m-%Y")
  26. yesterdaydate = datetime.now(VN_TZ_Y).strftime("%d-%m-%Y")
  27. ##Make list of posts
  28. pl = re.compile('<li class="postbit (.+?)<hr />(.+?)</li>', re.S|re.U)
  29. pl = [c[m.span()[0]:m.span()[1]] for m in pl.finditer(c)]
  30. ##Process list of posts (return new list)
  31. npl = []
  32. for p0 in pl:
  33. ##Get post id
  34. pid = int( re.findall('class="postcounter">#(\d+)</a>', p0, re.U)[0] )
  35. ##Get post created and post last edited's timestamp
  36. pdate = re.findall(u'<span class="date">(.+?)<span', p0, re.U)[0]
  37. if pdate.find(u'Hôm nay') != -1:
  38. pdate = currentdate
  39. elif pdate.find(u'Hôm qua') != -1:
  40. pdate = yesterdaydate
  41. else:
  42. pdate = re.findall(u'\d\d-\d\d-\d\d\d\d', pdate)[0]
  43. pdate = [int(i) for i in pdate.split('-')]
  44. ptime = re.findall(u'<span class="time">(.+?)</span', p0, re.U)[0]
  45. ptime = [int(i) for i in ptime.split(':')]
  46. pcreated = datetime(pdate[2], pdate[1], pdate[0], ptime[0], ptime[1], tzinfo=VN_TZ)
  47. pedit = re.findall(u'<blockquote class="postcontent lastedited">(.+?)</blockquote>',
  48. p0, re.U|re.S)
  49. if pedit:
  50. pedit = pedit[0].strip()
  51. pedittime = re.findall(u'<span class="time">(.+?)</span', pedit, re.U)[0]
  52. if pedit.find(u'Hôm nay') != -1:
  53. pedit = currentdate
  54. elif pedit.find(u'Hôm qua') != -1:
  55. pedit = yesterdaydate
  56. else:
  57. pedit = re.findall(u'\d\d-\d\d-\d\d\d\d', pedit)[0]
  58. pedit += '-'+pedittime
  59. pedit = re.sub(u':', u'-', pedit)
  60. pedit = [int(i) for i in pedit.split('-')]
  61. plastedited = datetime(pedit[2],pedit[1],pedit[0],pedit[3],pedit[4],tzinfo=VN_TZ)
  62. else: plastedited = pcreated
  63. ##Get post content
  64. raw_content = re.findall(u'<blockquote class="postcontent restore">(.+?)</blockquote>',
  65. p0, re.U|re.S)[0]
  66. pcontent = '\n'.join(li for li in [re.sub(u'<(.+?)>', '', line, re.U).strip()
  67. for line in raw_content.splitlines()] if li)
  68. ##Get poster's info: name, rank, join date, post count
  69. raw_name = re.findall(u'<div class="popupmenu memberaction">(.+?)</a>',
  70. p0, re.U|re.S)[0]
  71. uname = re.sub(u'<(.+?)>', '', raw_name, re.U).strip()
  72. raw_rank = re.findall(u'<span class="rank">(.+?)</span>', p0, re.U)[0]
  73. urank = re.sub(u'<(.+?)>', '', raw_rank, re.U).strip()
  74. ustats = re.findall(u'<dl class="userstats">(.+?)</dl>', p0, re.U|re.S)[0]
  75. ujoindate = re.findall(u'<dt>Ngày tham gia</dt> <dd>(.+?)</dd>', ustats, re.U)[0]
  76. ujoindate = [int(i) for i in ujoindate.split('-')]
  77. ujoindate = datetime(ujoindate[2], ujoindate[1], ujoindate[0], tzinfo=VN_TZ)
  78. upostcount = re.findall(u'<dt>Bài vi&#7871;t</dt> <dd>(.+?)</dd>', ustats, re.U)[0]
  79. upostcount = int(re.sub(u',', u'', upostcount))
  80.  
  81. uname = unescape_html(uname)
  82. urank = unescape_html(urank)
  83. pcontent = unescape_html(pcontent)
  84.  
  85. # int float float str str str float int
  86. yield (pid, pcreated, plastedited, pcontent, uname, urank, ujoindate, upostcount)
  87.  
Runtime error #stdin #stdout #stderr 0.08s 8832KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 3, in <module>
    import codecs, re, time, tnttzinfo, gethttp, tntvnlang
ImportError: No module named tnttzinfo