# your code goes here
# -*- coding: utf-8 -*-
import re
import urllib .request as req
from random import choice, sample
### Settings
# Regexes used to pull post bodies out of the downloaded thread HTML.

# One whole <blockquote ...>...</blockquote> element; DOTALL lets it span lines.
re_blockquote = re.compile('<blockquote.*?.>.*?.</blockquote>', flags=re.DOTALL)
# A single opening or closing blockquote tag.
re_remove_blockquote = re.compile('<[/]*blockquote.*?>', flags=re.DOTALL)
# NOTE(review): the inner `.*` is greedy and DOTALL is off, so one match runs
# from the first `<a` to the last `</a>` found on the same line.
re_remove_a = re.compile('<a.*?.>.*.</a>')
# NOTE(review): presumably meant to drop HTML entities such as `&#8212;`, but
# the leading greedy `.*` also swallows the text in front of the entity on
# that line -- confirm this is intended before relying on it.
re_remove_u_symbols = re.compile('.*&#*.*;')

# Ordered [old, new] rewrite rules. The consumer re-applies each rule until
# `old` no longer occurs in the text, so `old` must never be contained in its
# own `new` (a one-space -> one-space rule, for instance, would never finish).
replace_list = [
    # Whitespace and line-break markup.
    ['\n', ''],
    ['\t', ' '],
    ['<br>', ''],
    ['<blockquote>', ''],
    ['</blockquote>', ''],
    # Inline markup is turned into a sentence break ('.') or a word gap (' ').
    ['<span class="spoiler">', '.'],
    ['<span class="unkfunc">', '.'],
    ['<span class="u">', '.'],
    ['</span>', '.'],
    ['<strong>', ' '],
    ['</strong>', ' '],
    ['<em>', '.'],
    ['</em>', '.'],
    ['<sub>', '.'],
    ['</sub>', '.'],
    ['<sup>', '.'],
    ['</sup>', '.'],
    # Disabled rules, kept for reference:
    # ~ [';', '.'],
    # ~ [',', ' '],
    # ~ ['-', ' '],
    # Punctuation: '.' becomes the only sentence terminator.
    ['(', ','],
    [')', ','],
    ['!', '.'],
    ['?', '.'],
    # Normalisation: collapse runs of spaces, trim spaces around '.', and
    # merge consecutive terminators.
    ['  ', ' '],
    ['. ', '.'],
    [' .', '.'],
    ['..', '.'],
]
### Parser
# Keep prompting until the answer looks like an absolute http(s) URL; a bare
# "contains 'http'" check would let through text that urlopen cannot open.
url = ''
while not url.lower().startswith(('http://', 'https://')):
    url = input('full thread url:>').strip()

print(">Downloading thread %s..." % (url,))

# The connection is only needed for the download itself; leaving the `with`
# block closes it, so no explicit close() call is required.
with req.urlopen(url) as threadf:
    thread = threadf.read().decode('utf-8')

# Strip links from the page, collect every blockquote element, then remove
# the blockquote tags and the entity-like fragments from each one. The result
# is a real list (not a one-shot iterator) so it can be inspected or re-read.
raw_messages = [
    re_remove_u_symbols.sub('', re_remove_blockquote.sub('', quote))
    for quote in re_blockquote.findall(re_remove_a.sub('', thread))
]

# The full page text is no longer needed once the posts are extracted.
del thread
### Replacing
# Normalise every raw post with the rules in replace_list, in order, and
# collect the stripped results in `messages`.
print(">Replacing...")
messages = []

for raw in raw_messages:
    text = raw
    for old, new in replace_list:
        if old in new:
            # Re-applying such a rule can never get rid of `old`, so the
            # "repeat until gone" loop below would spin forever. A single
            # pass is all that can be done for it.
            text = text.replace(old, new)
            continue
        # One pass can leave new occurrences behind (e.g. three spaces after
        # collapsing two into one), hence the repetition.
        while old in text:
            text = text.replace(old, new)
    messages.append(text.strip())
### Building dict 4 Markov
# Cut every message into sentences at '.', then every sentence into words.
s = [j for i in messages for j in i.split('.')]

# Build the word lists without empty tokens and without empty sentences.
# Filtering into new lists (rather than removing items from the list being
# iterated) guarantees that no element is skipped and that every '' is gone.
s1 = [[word for word in sentence.split(' ') if word] for sentence in s]
s1 = [sentence for sentence in s1 if sentence]

# words maps each word to the list of words seen right after it (duplicates
# are kept, so frequent successors are picked more often by random choice).
# The '.' key holds the words that start a sentence, and '.' appears as the
# successor of every word that ends one.
words = {'.': []}

for sentence in s1:
    words['.'].append(sentence[0])
    # Pair each word with the one that follows it in the sentence.
    for current, following in zip(sentence, sentence[1:]):
        words.setdefault(current, []).append(following)
    # The last word of the sentence can be followed by the terminator.
    words.setdefault(sentence[-1], []).append('.')
### Building sentences 4 Markov
# Sentences start from a title-cased word (the '.' key is never title-cased,
# so it is excluded automatically).
big = [word for word in words if word.istitle()]
if not big:
    # choice() on an empty list raises a bare IndexError; stop with a clear
    # message instead when the thread yielded nothing to start from.
    raise SystemExit(">No title-cased words found, nothing to generate.")

print()
print()

# Print 100 random sentences: start from a random title-cased word and keep
# appending a random recorded successor until the '.' terminator is drawn.
for _ in range(100):
    res = [choice(big)]
    while res[-1] != '.':
        res.append(choice(words[res[-1]]))
    print(' '.join(res))
IyB5b3VyIGNvZGUgZ29lcyBoZXJlCiMgLSotIGNvZGluZzogdXRmLTggLSotCgppbXBvcnQgcmUKaW1wb3J0IHVybGxpYi5yZXF1ZXN0IGFzIHJlcQpmcm9tIHJhbmRvbSBpbXBvcnQgY2hvaWNlLCBzYW1wbGUKCiMjIyBTZXR0aW5ncwpyZV9ibG9ja3F1b3RlID0gcmUuY29tcGlsZSgnPGJsb2NrcXVvdGUuKj8uPi4qPy48L2Jsb2NrcXVvdGU+JywgZmxhZ3M9cmUuRE9UQUxMKQpyZV9yZW1vdmVfYmxvY2txdW90ZSA9IHJlLmNvbXBpbGUoJzxbL10qYmxvY2txdW90ZS4qPz4nLCBmbGFncz1yZS5ET1RBTEwpCnJlX3JlbW92ZV9hID0gcmUuY29tcGlsZSgnPGEuKj8uPi4qLjwvYT4nKQpyZV9yZW1vdmVfdV9zeW1ib2xzID0gcmUuY29tcGlsZSgnLiomIyouKjsnKQoKcmVwbGFjZV9saXN0ID0gW1snXG4nLCAnJ10sCiAgICAgICAgICAgICAgICBbJ1x0JywgJyAnXSwKICAgICAgICAgICAgICAgIFsnPGJyPicsICcnXSwKICAgICAgICAgICAgICAgIFsnPGJsb2NrcXVvdGU+JywgJyddLAogICAgICAgICAgICAgICAgWyc8L2Jsb2NrcXVvdGU+JywgJyddLAogICAgICAgICAgICAgICAgWyc8c3BhbiBjbGFzcz1cInNwb2lsZXJcIj4nLCAnLiddLAogICAgICAgICAgICAgICAgWyc8c3BhbiBjbGFzcz1cInVua2Z1bmNcIj4nLCAnLiddLAogICAgICAgICAgICAgICAgWyc8c3BhbiBjbGFzcz1cInVcIj4nLCAnLiddLAogICAgICAgICAgICAgICAgWyc8L3NwYW4+JywgJy4nXSwKICAgICAgICAgICAgICAgIFsnPHN0cm9uZz4nLCAnICddLAogICAgICAgICAgICAgICAgWyc8L3N0cm9uZz4nLCAnICddLAogICAgICAgICAgICAgICAgWyc8ZW0+JywgJy4nXSwKICAgICAgICAgICAgICAgIFsnPC9lbT4nLCAnLiddLAogICAgICAgICAgICAgICAgWyc8c3ViPicsICcuJ10sCiAgICAgICAgICAgICAgICBbJzwvc3ViPicsICcuJ10sCiAgICAgICAgICAgICAgICBbJzxzdXA+JywgJy4nXSwKICAgICAgICAgICAgICAgIFsnPC9zdXA+JywgJy4nXSwKICAgICAgICAgICAgICAgICMgfiBbJzsnLCAnLiddLAogICAgICAgICAgICAgICAgIyB+IFsnLCcsICcgJ10sCiAgICAgICAgICAgICAgICAjIH4gWyctJywgJyAnXSwKICAgICAgICAgICAgICAgIFsnKCcsICcsJ10sCiAgICAgICAgICAgICAgICBbJyknLCAnLCddLAogICAgICAgICAgICAgICAgWychJywgJy4nXSwKICAgICAgICAgICAgICAgIFsnPycsICcuJ10sCiAgICAgICAgICAgICAgICBbJyAgJywgJyAnXSwKICAgICAgICAgICAgICAgIFsnLiAnLCAnLiddLAogICAgICAgICAgICAgICAgWycgLicsICcuJ10sCiAgICAgICAgICAgICAgICBbJy4uJywgJy4nXSwKICAgICAgICAgICAgICAgIF0KCiMjIyBQYXJzZXIKdXJsID0gJycKd2hpbGUgbm90ICdodHRwJyBpbiB1cmw6CiAgICB1cmwgPSBzdHIoaW5wdXQoJ2Z1bGwgdGhyZWFkIHVybDo+JykpCgpwcmludCgiPkRvd25sb2FkaW5nIHRocmVhZCAlcy4uLiIgJSh1cmwsKSkKCndpdGggcmVxLnVybG9wZW4odXJsKSBhcyB0aHJlYWRmOgogICAg
CiAgICB0aHJlYWQgPSB0aHJlYWRmLnJlYWQoKS5kZWNvZGUoJ3V0Zi04JykKICAgIAogICAgcmVzID0gcmVfcmVtb3ZlX2Euc3ViKCcnLCB0aHJlYWQpCiAgICByZXMgPSByZV9ibG9ja3F1b3RlLmZpbmRhbGwocmVzKQogICAgcmVzID0gbWFwKGxhbWJkYSB4OiByZV9yZW1vdmVfYmxvY2txdW90ZS5zdWIoJycsIHgpLCByZXMpCiAgICByZXMgPSBtYXAobGFtYmRhIHg6IHJlX3JlbW92ZV91X3N5bWJvbHMuc3ViKCcnLCB4KSwgcmVzKQogICAgCiAgICByYXdfbWVzc2FnZXMgPSByZXMKICAgIAogICAgdGhyZWFkZi5jbG9zZSgpCiAgICBkZWwodGhyZWFkKQogICAgZGVsKHJlcykKICAgIAoKIyMjIFJlcGxhY2luZwpwcmludCgiPlJlcGxhY2luZy4uLiIpCm1lc3NhZ2VzID0gW10KCmZvciBqIGluIGVudW1lcmF0ZShyYXdfbWVzc2FnZXMpOgogICAgcyA9IGpbMV0KICAgIGZvciBpIGluIGVudW1lcmF0ZShyZXBsYWNlX2xpc3QpOgogICAgICAgIHdoaWxlIGlbMV1bMF0gaW4gczoKICAgICAgICAgICAgcyA9IHMucmVwbGFjZShpWzFdWzBdLCBpWzFdWzFdKQogICAgbWVzc2FnZXMuYXBwZW5kKHMuc3RyaXAoKSkKCiMjIyBCdWlsZGluZyBkaWN0IDQgTWFya292CgpzID0gW2ogZm9yIGkgaW4gbWVzc2FnZXMgZm9yIGogaW4gaS5zcGxpdCgnLicpIF0KCnMxID0gW2kuc3BsaXQoJyAnKSBmb3IgaSBpbiBzXQoKZm9yIGkgaW4gZW51bWVyYXRlKHMxKToKICAgIGlmICcnIGluIGlbMV06CiAgICAgICAgczFbaVswXV0ucmVtb3ZlKCcnKQogICAgaWYgczFbaVswXV0gPT0gW106CiAgICAgICAgczEucmVtb3ZlKFtdKQoKd29yZHMgPSB7Jy4nOiBbXSwgCiAgICAgICAgICMgfiAnISc6IFtdLAogICAgICAgICAjIH4gJz8nOiBbXQogICAgICAgICB9Cgpmb3IgaSBpbiBzMToKICAgIAogICAgbCA9IGxlbihpKQogICAgCiAgICB3b3Jkc1snLiddLmFwcGVuZChpWzBdKQogICAgIyB+IHdvcmRzWychJ10uYXBwZW5kKGlbMF0pCiAgICAjIH4gd29yZHNbJz8nXS5hcHBlbmQoaVswXSkKICAgIGZvciBqIGluIGVudW1lcmF0ZShpWzotMV0pOgogICAgICAgICAgICAKICAgICAgICAgICAgaWYgbm90IChqWzFdIGluIHdvcmRzKToKICAgICAgICAgICAgICAgIHdvcmRzW2pbMV1dID0gW10KICAgICAgICAgICAgd29yZHNbalsxXV0uYXBwZW5kKGlbalswXSsxXSkKCiAgICBpZiBub3QgaVstMV0gaW4gd29yZHM6CiAgICAgICAgd29yZHNbaVstMV1dID0gWycuJywKICAgICAgICAgICAgICAgICAgICAgICAgIyB+ICchJywKICAgICAgICAgICAgICAgICAgICAgICAgIyB+ICc/JwogICAgICAgICAgICAgICAgICAgICAgICBdCiAgICBlbHNlOgogICAgICAgIHdvcmRzW2lbLTFdXS5leHRlbmQoWycuJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIH4gJyEnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgfiAnPycKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBdKQoKIyMjIEJ1aWxkaW5nIHNlbnRlbmNlcyA0IE1hcmtvdgoKYmln
ID0gbGlzdChmaWx0ZXIobGFtYmRhIHg6IHguaXN0aXRsZSgpLCBsaXN0KHdvcmRzLmtleXMoKSkpKQoKcHJpbnQoKQpwcmludCgpCgpmb3IgaiBpbiByYW5nZSgxMDApOgoKICAgIHJlcyA9IFtjaG9pY2UoYmlnKSwgXQoKICAgIHdoaWxlIG5vdChyZXNbLTFdPT0nLicpOgoKICAgICAgICByZXMuYXBwZW5kKGNob2ljZSh3b3Jkc1tyZXNbLTFdXSkpCiAgICBwcmludCgnICcuam9pbihyZXMpKQoK
stdout
full thread url:>>Downloading thread https://2...content-available-to-author-only...h.hk/gd/res/172753.html...
stderr
Traceback (most recent call last):
File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1090, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1128, in _send_request
self.endheaders(body)
File "/usr/lib/python3.4/http/client.py", line 1086, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.4/http/client.py", line 924, in _send_output
self.send(msg)
File "/usr/lib/python3.4/http/client.py", line 859, in send
self.connect()
File "/usr/lib/python3.4/http/client.py", line 1225, in connect
super().connect()
File "/usr/lib/python3.4/http/client.py", line 836, in connect
self.timeout, self.source_address)
File "/usr/lib/python3.4/socket.py", line 494, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "/usr/lib/python3.4/socket.py", line 533, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -2] Name or service not known
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./prog.py", line 51, in <module>
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 463, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1225, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -2] Name or service not known>