# -*- coding: utf-8 -*-
import codecs , re , time , tnttzinfo, gethttp, tntvnlang
from datetime import datetime
from pprint import pprint
# Time zone used for every timestamp parsed from the forum.
# NOTE(review): the first argument looks like a UTC offset in minutes
# (7 * 60 == 420, labelled "GMT+07") -- confirm against tnttzinfo.
VN_TZ = tnttzinfo.UTCTimeZone(7 * 60, False, "GMT+07")
# Zone 24 hours behind VN_TZ (+7h - 24h == -17h); GetAllPosts formats "now"
# in this zone to obtain yesterday's calendar date.
VN_TZ_Y = tnttzinfo.UTCTimeZone(-17 * 60, False, "GMT-17")
def unescape_html(s):
    """Decode the basic HTML entities in *s* and return the plain text.

    Handles ``&gt;``, ``&lt;``, ``&quot;``, ``&#39;`` and ``&amp;``; any other
    entity is left untouched.

    Args:
        s: Text that may contain HTML entities.

    Returns:
        A copy of *s* with the five entities above replaced by the characters
        they stand for.
    """
    # '&amp;' must be decoded last: decoding it first would turn an escaped
    # entity such as '&amp;lt;' into '&lt;' and then, wrongly, into '<'.
    replacements = (
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&quot;', '"'),
        ('&#39;', "'"),
        ('&amp;', '&'),
    )
    for entity, char in replacements:
        s = s.replace(entity, char)
    return s
def _strip_tags(markup):
    """Remove every ``<...>`` tag from *markup* and trim surrounding whitespace."""
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is ``count``, so passing re.U there silently caps the number of
    # substitutions at 32 instead of setting a flag.
    return re.sub(u'<(.+?)>', u'', markup, flags=re.U).strip()


def _forum_timestamp(date_text, time_text, today, yesterday):
    """Build a VN_TZ datetime from the forum's date and time fragments.

    *date_text* either contains the relative labels 'Hôm nay' / 'Hôm qua'
    (resolved to *today* / *yesterday*, both ``dd-mm-YYYY`` strings) or an
    explicit ``dd-mm-YYYY`` date. *time_text* is ``HH:MM``.
    """
    if u'Hôm nay' in date_text:
        date_text = today
    elif u'Hôm qua' in date_text:
        date_text = yesterday
    else:
        date_text = re.findall(r'\d\d-\d\d-\d\d\d\d', date_text)[0]
    day, month, year = [int(part) for part in date_text.split('-')]
    clock = [int(part) for part in time_text.split(':')]
    return datetime(year, month, day, clock[0], clock[1], tzinfo=VN_TZ)


def GetAllPosts(thread_id, page=1):
    """Yield one tuple per post found on a page of a forum thread.

    Args:
        thread_id: Thread identifier interpolated into the thread URL.
        page: Page number of the thread (defaults to 1).

    Yields:
        ``(pid, pcreated, plastedited, pcontent, uname, urank, ujoindate,
        upostcount)`` where ``pid`` and ``upostcount`` are ints, ``pcreated``,
        ``plastedited`` and ``ujoindate`` are datetimes in ``VN_TZ``, and the
        remaining items are text. ``plastedited`` equals ``pcreated`` when the
        post carries no "last edited" block.

        A single ``None`` is yielded when the page source is empty.

    Raises:
        IndexError: if an expected fragment is missing from a post's markup.
    """
    # Get page source
    url = 'http://f...content-available-to-author-only...n.com/showthread.php?%s/page%s' % (thread_id, page)
    raw = gethttp.GetHttpSource(url)
    if not raw:
        # Check before decoding so a None/empty response does not crash, and
        # stop here: there is nothing to parse.
        yield None
        return
    c = raw.decode('utf8')

    # Current and yesterday's date as the forum prints them (dd-mm-YYYY)
    today = datetime.now(VN_TZ).strftime("%d-%m-%Y")
    yesterday = datetime.now(VN_TZ_Y).strftime("%d-%m-%Y")

    time_pattern = u'<span class="time">(.+?)</span'
    post_pattern = re.compile('<li class="postbit (.+?)<hr />(.+?)</li>', re.S | re.U)

    for match in post_pattern.finditer(c):
        p0 = match.group(0)

        # Post id
        pid = int(re.findall(r'class="postcounter">#(\d+)</a>', p0, re.U)[0])

        # Creation timestamp
        date_fragment = re.findall(u'<span class="date">(.+?)<span', p0, re.U)[0]
        time_fragment = re.findall(time_pattern, p0, re.U)[0]
        pcreated = _forum_timestamp(date_fragment, time_fragment, today, yesterday)

        # Last-edited timestamp (falls back to the creation time)
        edit_blocks = re.findall(
            u'<blockquote class="postcontent lastedited">(.+?)</blockquote>',
            p0, re.U | re.S)
        if edit_blocks:
            edit_block = edit_blocks[0].strip()
            edit_time = re.findall(time_pattern, edit_block, re.U)[0]
            plastedited = _forum_timestamp(edit_block, edit_time, today, yesterday)
        else:
            plastedited = pcreated

        # Post content: tags stripped line by line, blank lines dropped
        raw_content = re.findall(
            u'<blockquote class="postcontent restore">(.+?)</blockquote>',
            p0, re.U | re.S)[0]
        stripped_lines = (_strip_tags(line) for line in raw_content.splitlines())
        pcontent = u'\n'.join(line for line in stripped_lines if line)

        # Poster's info: name, rank, join date, post count
        raw_name = re.findall(
            u'<div class="popupmenu memberaction">(.+?)</a>',
            p0, re.U | re.S)[0]
        uname = _strip_tags(raw_name)
        raw_rank = re.findall(u'<span class="rank">(.+?)</span>', p0, re.U)[0]
        urank = _strip_tags(raw_rank)

        ustats = re.findall(u'<dl class="userstats">(.+?)</dl>', p0, re.U | re.S)[0]
        join_text = re.findall(
            u'<dt>Ngày tham gia</dt> <dd>(.+?)</dd>', ustats, re.U)[0]
        join_day, join_month, join_year = [int(part) for part in join_text.split('-')]
        ujoindate = datetime(join_year, join_month, join_day, tzinfo=VN_TZ)

        # The markup may spell the "ế" of "Bài viết" as the numeric entity
        # &#7871;, so accept either form.
        count_text = re.findall(
            u'<dt>Bài vi(?:ế|&#7871;)t</dt> <dd>(.+?)</dd>', ustats, re.U)[0]
        upostcount = int(count_text.replace(u',', u''))

        uname = unescape_html(uname)
        urank = unescape_html(urank)
        pcontent = unescape_html(pcontent)

        yield (pid, pcreated, plastedited, pcontent, uname, urank, ujoindate, upostcount)
IyAtKi0gY29kaW5nOiB1dGYtOCAtKi0KCmltcG9ydCBjb2RlY3MsIHJlLCB0aW1lLCB0bnR0emluZm8sIGdldGh0dHAsIHRudHZubGFuZwpmcm9tIGRhdGV0aW1lIGltcG9ydCBkYXRldGltZQpmcm9tIHBwcmludCBpbXBvcnQgcHByaW50CgpWTl9UWiA9IHRudHR6aW5mby5VVENUaW1lWm9uZSgrNDIwLCBGYWxzZSwgIkdNVCswNyIpClZOX1RaX1kgPSB0bnR0emluZm8uVVRDVGltZVpvbmUoLTE3KjYwLCBGYWxzZSwgIkdNVC0xNyIpCgpkZWYgdW5lc2NhcGVfaHRtbChzKToKICAgIHMgPSBzLnJlcGxhY2UoJyZndDsnLCc+JykKICAgIHMgPSBzLnJlcGxhY2UoJyZsdDsnLCc8JykKICAgIHMgPSBzLnJlcGxhY2UoJyZxdW90OycsJyInKQogICAgcyA9IHMucmVwbGFjZSgnJiMzOTsnLCInIikKICAgIHMgPSBzLnJlcGxhY2UoJyZhbXA7JywnJicpCiAgICByZXR1cm4gcwoKZGVmIEdldEFsbFBvc3RzKHRocmVhZF9pZCwgcGFnZT0xKToKICAgICMjR2V0IHBhZ2Ugc291cmNlCiAgICB1cmwgPSAnaHR0cDovL2YuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLm4uY29tL3Nob3d0aHJlYWQucGhwPyVzL3BhZ2UlcycgJSAodGhyZWFkX2lkLCBwYWdlKQogICAgYyA9IGdldGh0dHAuR2V0SHR0cFNvdXJjZSh1cmwpLmRlY29kZSgndXRmOCcpCiAgICBpZiBub3QgYzoKICAgIAl5aWVsZCBOb25lCiAgICAjI0dldCBjdXJyZW50ICYgeWVzdGVyZGF5IGRhdGV0aW1lCiAgICBjdXJyZW50ZGF0ZSA9IGRhdGV0aW1lLm5vdyhWTl9UWikuc3RyZnRpbWUoIiVkLSVtLSVZIikKICAgIHllc3RlcmRheWRhdGUgPSBkYXRldGltZS5ub3coVk5fVFpfWSkuc3RyZnRpbWUoIiVkLSVtLSVZIikKICAgICMjTWFrZSBsaXN0IG9mIHBvc3RzCiAgICBwbCA9IHJlLmNvbXBpbGUoJzxsaSBjbGFzcz0icG9zdGJpdCAoLis/KTxociAvPiguKz8pPC9saT4nLCByZS5TfHJlLlUpCiAgICBwbCA9IFtjW20uc3BhbigpWzBdOm0uc3BhbigpWzFdXSBmb3IgbSBpbiBwbC5maW5kaXRlcihjKV0KICAgICMjUHJvY2VzcyBsaXN0IG9mIHBvc3RzIChyZXR1cm4gbmV3IGxpc3QpCiAgICBucGwgPSBbXQogICAgZm9yIHAwIGluIHBsOgogICAgICAgICMjR2V0IHBvc3QgaWQKICAgICAgICBwaWQgPSBpbnQoIHJlLmZpbmRhbGwoJ2NsYXNzPSJwb3N0Y291bnRlciI+IyhcZCspPC9hPicsIHAwLCByZS5VKVswXSApCiAgICAgICAgIyNHZXQgcG9zdCBjcmVhdGVkIGFuZCBwb3N0IGxhc3QgZWRpdGVkJ3MgdGltZXN0YW1wCiAgICAgICAgcGRhdGUgPSByZS5maW5kYWxsKHUnPHNwYW4gY2xhc3M9ImRhdGUiPiguKz8pPHNwYW4nLCBwMCwgcmUuVSlbMF0KICAgICAgICBpZiBwZGF0ZS5maW5kKHUnSMO0bSBuYXknKSAhPSAtMToKICAgICAgICAgICAgcGRhdGUgPSBjdXJyZW50ZGF0ZQogICAgICAgIGVsaWYgcGRhdGUuZmluZCh1J0jDtG0gcXVhJykgIT0gLTE6CiAgICAgICAgICAgIHBkYXRlID0geWVzdGVyZGF5ZGF0ZQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHBk
YXRlID0gcmUuZmluZGFsbCh1J1xkXGQtXGRcZC1cZFxkXGRcZCcsIHBkYXRlKVswXQogICAgICAgIHBkYXRlID0gW2ludChpKSBmb3IgaSBpbiBwZGF0ZS5zcGxpdCgnLScpXQogICAgICAgIHB0aW1lID0gcmUuZmluZGFsbCh1JzxzcGFuIGNsYXNzPSJ0aW1lIj4oLis/KTwvc3BhbicsIHAwLCByZS5VKVswXQogICAgICAgIHB0aW1lID0gW2ludChpKSBmb3IgaSBpbiBwdGltZS5zcGxpdCgnOicpXQogICAgICAgIHBjcmVhdGVkID0gZGF0ZXRpbWUocGRhdGVbMl0sIHBkYXRlWzFdLCBwZGF0ZVswXSwgcHRpbWVbMF0sIHB0aW1lWzFdLCB0emluZm89Vk5fVFopCiAgICAgICAgcGVkaXQgPSByZS5maW5kYWxsKHUnPGJsb2NrcXVvdGUgY2xhc3M9InBvc3Rjb250ZW50IGxhc3RlZGl0ZWQiPiguKz8pPC9ibG9ja3F1b3RlPicsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHAwLCByZS5VfHJlLlMpCiAgICAgICAgaWYgcGVkaXQ6CiAgICAgICAgICAgIHBlZGl0ID0gcGVkaXRbMF0uc3RyaXAoKQogICAgICAgICAgICBwZWRpdHRpbWUgPSByZS5maW5kYWxsKHUnPHNwYW4gY2xhc3M9InRpbWUiPiguKz8pPC9zcGFuJywgcGVkaXQsIHJlLlUpWzBdCiAgICAgICAgICAgIGlmIHBlZGl0LmZpbmQodSdIw7RtIG5heScpICE9IC0xOgogICAgICAgICAgICAgICAgcGVkaXQgPSBjdXJyZW50ZGF0ZQogICAgICAgICAgICBlbGlmIHBlZGl0LmZpbmQodSdIw7RtIHF1YScpICE9IC0xOgogICAgICAgICAgICAgICAgcGVkaXQgPSB5ZXN0ZXJkYXlkYXRlCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwZWRpdCA9IHJlLmZpbmRhbGwodSdcZFxkLVxkXGQtXGRcZFxkXGQnLCBwZWRpdClbMF0KICAgICAgICAgICAgcGVkaXQgKz0gJy0nK3BlZGl0dGltZQogICAgICAgICAgICBwZWRpdCA9IHJlLnN1Yih1JzonLCB1Jy0nLCBwZWRpdCkKICAgICAgICAgICAgcGVkaXQgPSBbaW50KGkpIGZvciBpIGluIHBlZGl0LnNwbGl0KCctJyldCiAgICAgICAgICAgIHBsYXN0ZWRpdGVkID0gZGF0ZXRpbWUocGVkaXRbMl0scGVkaXRbMV0scGVkaXRbMF0scGVkaXRbM10scGVkaXRbNF0sdHppbmZvPVZOX1RaKQogICAgICAgIGVsc2U6IHBsYXN0ZWRpdGVkID0gcGNyZWF0ZWQKICAgICAgICAjI0dldCBwb3N0IGNvbnRlbnQKICAgICAgICByYXdfY29udGVudCA9IHJlLmZpbmRhbGwodSc8YmxvY2txdW90ZSBjbGFzcz0icG9zdGNvbnRlbnQgcmVzdG9yZSI+KC4rPyk8L2Jsb2NrcXVvdGU+JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcDAsIHJlLlV8cmUuUylbMF0KICAgICAgICBwY29udGVudCA9ICdcbicuam9pbihsaSBmb3IgbGkgaW4gW3JlLnN1Yih1JzwoLis/KT4nLCAnJywgbGluZSwgcmUuVSkuc3RyaXAoKQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9yIGxpbmUgaW4gcmF3X2NvbnRlbnQuc3BsaXRsaW5lcygpXSBpZiBsaSkKICAgICAgICAjI0dldCBwb3N0ZXIncyBpbmZvOiBuYW1lLCBy
YW5rLCBqb2luIGRhdGUsIHBvc3QgY291bnQKICAgICAgICByYXdfbmFtZSA9IHJlLmZpbmRhbGwodSc8ZGl2IGNsYXNzPSJwb3B1cG1lbnUgbWVtYmVyYWN0aW9uIj4oLis/KTwvYT4nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICBwMCwgcmUuVXxyZS5TKVswXQogICAgICAgIHVuYW1lID0gcmUuc3ViKHUnPCguKz8pPicsICcnLCByYXdfbmFtZSwgcmUuVSkuc3RyaXAoKQogICAgICAgIHJhd19yYW5rID0gcmUuZmluZGFsbCh1JzxzcGFuIGNsYXNzPSJyYW5rIj4oLis/KTwvc3Bhbj4nLCBwMCwgcmUuVSlbMF0KICAgICAgICB1cmFuayA9IHJlLnN1Yih1JzwoLis/KT4nLCAnJywgcmF3X3JhbmssIHJlLlUpLnN0cmlwKCkKICAgICAgICB1c3RhdHMgPSByZS5maW5kYWxsKHUnPGRsIGNsYXNzPSJ1c2Vyc3RhdHMiPiguKz8pPC9kbD4nLCBwMCwgcmUuVXxyZS5TKVswXQogICAgICAgIHVqb2luZGF0ZSA9IHJlLmZpbmRhbGwodSc8ZHQ+TmfDoHkgdGhhbSBnaWE8L2R0PiA8ZGQ+KC4rPyk8L2RkPicsIHVzdGF0cywgcmUuVSlbMF0KICAgICAgICB1am9pbmRhdGUgPSBbaW50KGkpIGZvciBpIGluIHVqb2luZGF0ZS5zcGxpdCgnLScpXQogICAgICAgIHVqb2luZGF0ZSA9IGRhdGV0aW1lKHVqb2luZGF0ZVsyXSwgdWpvaW5kYXRlWzFdLCB1am9pbmRhdGVbMF0sIHR6aW5mbz1WTl9UWikKICAgICAgICB1cG9zdGNvdW50ID0gcmUuZmluZGFsbCh1JzxkdD5Cw6BpIHZpJiM3ODcxO3Q8L2R0PiA8ZGQ+KC4rPyk8L2RkPicsIHVzdGF0cywgcmUuVSlbMF0KICAgICAgICB1cG9zdGNvdW50ID0gaW50KHJlLnN1Yih1JywnLCB1JycsIHVwb3N0Y291bnQpKQoKICAgICAgICB1bmFtZSA9IHVuZXNjYXBlX2h0bWwodW5hbWUpCiAgICAgICAgdXJhbmsgPSB1bmVzY2FwZV9odG1sKHVyYW5rKQogICAgICAgIHBjb250ZW50ID0gdW5lc2NhcGVfaHRtbChwY29udGVudCkKCiAgICAgICAgIyAgICAgIGludCAgIGZsb2F0ICAgICAgIGZsb2F0ICAgICAgIHN0ciAgICAgIHN0ciAgICBzdHIgICAgIGZsb2F0ICAgICAgIGludAogICAgICAgIHlpZWxkIChwaWQsIHBjcmVhdGVkLCBwbGFzdGVkaXRlZCwgcGNvbnRlbnQsIHVuYW1lLCB1cmFuaywgdWpvaW5kYXRlLCB1cG9zdGNvdW50KQo=