fork download
  1. # your code goes here
  2. # -*- coding: utf-8 -*-
  3.  
  4. import re
  5. import urllib.request as req
  6. from random import choice, sample
  7.  
  ### Settings
# Regular expressions applied to the downloaded thread HTML.  Attributes are
# matched with ``[^>]*`` and element content with a lazy ``.*?`` so that every
# match stops at its own closing tag / terminator instead of running on to a
# later one and swallowing the text in between.

# One complete <blockquote ...>...</blockquote> element (DOTALL: may span lines).
re_blockquote = re.compile(r'<blockquote\b[^>]*>.*?</blockquote>', flags=re.DOTALL)
# A single opening or closing blockquote tag.
re_remove_blockquote = re.compile(r'</?blockquote\b[^>]*>')
# One <a ...>...</a> element, link text included.
re_remove_a = re.compile(r'<a\b[^>]*>.*?</a>')
# One HTML character reference such as "&#8212;" or "&quot;" (dropped, not decoded).
re_remove_u_symbols = re.compile(r'&#?\w+;')

# Ordered [old, new] substitutions applied to every message.  Each pair is
# applied repeatedly until `old` no longer occurs, so `old` must never be
# contained in its own `new` (an entry such as [' ', ' '] would never finish).
replace_list = [
    ['\n', ''],
    ['\t', ' '],
    ['<br>', ''],
    ['<blockquote>', ''],
    ['</blockquote>', ''],
    ['<span class="spoiler">', '.'],
    ['<span class="unkfunc">', '.'],
    ['<span class="u">', '.'],
    ['</span>', '.'],
    ['<strong>', ' '],
    ['</strong>', ' '],
    ['<em>', '.'],
    ['</em>', '.'],
    ['<sub>', '.'],
    ['</sub>', '.'],
    ['<sup>', '.'],
    ['</sup>', '.'],
    # Disabled rules, kept for reference:
    # [';', '.'],
    # [',', ' '],
    # ['-', ' '],
    ['(', ','],
    [')', ','],
    # Only '.' is kept as a sentence terminator.
    ['!', '.'],
    ['?', '.'],
    # Collapse runs of spaces (two spaces -> one).
    ['  ', ' '],
    # Normalise spacing around, and repetition of, the terminator.
    ['. ', '.'],
    [' .', '.'],
    ['..', '.'],
]
  43.  
  ### Parser
def _ask_thread_url():
    """Prompt on stdin until the answer starts with http:// or https://.

    Returns the accepted URL with surrounding whitespace removed.
    Exits the program if stdin is exhausted before a usable URL is entered.
    """
    while True:
        try:
            answer = input('full thread url:>').strip()
        except EOFError:
            raise SystemExit('>No thread url given.')
        if answer.lower().startswith(('http://', 'https://')):
            return answer


def _download_posts(thread_url):
    """Download the thread page and return the text of every post as a list.

    Links are removed from the whole page first; then each blockquote element
    is extracted and stripped of its own tags and of HTML character
    references.  Exits the program with a short message if the page cannot
    be fetched.
    """
    try:
        # `with` closes the response; no explicit close() is needed.
        with req.urlopen(thread_url) as threadf:
            # Undecodable bytes are replaced instead of aborting the run.
            page = threadf.read().decode('utf-8', errors='replace')
    except OSError as err:
        # urllib's URLError/HTTPError and socket failures all derive from OSError.
        raise SystemExit('>Cannot download thread %s: %s' % (thread_url, err))

    page = re_remove_a.sub('', page)
    posts = []
    for quote in re_blockquote.findall(page):
        quote = re_remove_blockquote.sub('', quote)
        posts.append(re_remove_u_symbols.sub('', quote))
    return posts


url = _ask_thread_url()

print(">Downloading thread %s..." % (url,))

# A real list (not a lazy map object), so it can be iterated more than once.
raw_messages = _download_posts(url)
  65.  
  66.  
  ### Replacing
def _apply_replacements(text, replacements):
    """Return `text` after applying every (old, new) pair in order, stripped.

    A pair is re-applied until `old` no longer occurs, so that e.g. '...'
    fully collapses under ['..', '.'].  Pairs that could never converge are
    handled specially so the loop always terminates: an empty or unchanged
    `old` is skipped, and a pair whose `new` contains `old` is applied once.
    """
    for old, new in replacements:
        if not old or old == new:
            continue
        if old in new:
            # Repeating would re-create `old` forever; one pass only.
            text = text.replace(old, new)
            continue
        while old in text:
            text = text.replace(old, new)
    return text.strip()


print(">Replacing...")
messages = [_apply_replacements(raw, replace_list) for raw in raw_messages]
  77.  
  ### Building dict 4 Markov
def _build_chain(texts):
    """Build the word-transition table used by the sentence generator.

    Every text is split into sentences on '.', and every sentence into words
    on single spaces.  Empty words and empty sentences are discarded.

    Returns a dict mapping each word to the list of words seen right after
    it (duplicates kept, so frequent successors are drawn more often).  The
    key '.' lists the first word of every sentence, and the last word of
    every sentence has '.' among its successors.
    """
    chain = {'.': []}
    for text in texts:
        for sentence in text.split('.'):
            tokens = [token for token in sentence.split(' ') if token]
            if not tokens:
                continue
            chain['.'].append(tokens[0])
            for current, following in zip(tokens, tokens[1:]):
                chain.setdefault(current, []).append(following)
            chain.setdefault(tokens[-1], []).append('.')
    return chain


words = _build_chain(messages)
  118.  
  ### Building sentences 4 Markov
def _random_sentence(chain, starters):
    """Random-walk `chain` from a random starter until '.' is drawn.

    Returns the visited words as a list whose last element is '.'.
    `starters` must be non-empty and every word reached must be a key of
    `chain`.
    """
    tokens = [choice(starters)]
    while tokens[-1] != '.':
        tokens.append(choice(chain[tokens[-1]]))
    return tokens


# Title-cased words open a sentence; if the thread has none, fall back to the
# recorded sentence openers so choice() is not handed an empty list.
big = [word for word in words if word.istitle()] or list(words['.'])

print()
print()

if big:
    for _ in range(100):
        print(' '.join(_random_sentence(words, big)))
else:
    print(">No words found, nothing to generate.")
  134.  
  135.  
Runtime error #stdin #stdout #stderr 0.12s 13928KB
stdin
https://2...content-available-to-author-only...h.hk/gd/res/172753.html
stdout
full thread url:>>Downloading thread https://2...content-available-to-author-only...h.hk/gd/res/172753.html...
stderr
Traceback (most recent call last):
  File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.4/http/client.py", line 1090, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.4/http/client.py", line 1128, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.4/http/client.py", line 1086, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.4/http/client.py", line 924, in _send_output
    self.send(msg)
  File "/usr/lib/python3.4/http/client.py", line 859, in send
    self.connect()
  File "/usr/lib/python3.4/http/client.py", line 1225, in connect
    super().connect()
  File "/usr/lib/python3.4/http/client.py", line 836, in connect
    self.timeout, self.source_address)
  File "/usr/lib/python3.4/socket.py", line 494, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
  File "/usr/lib/python3.4/socket.py", line 533, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "./prog.py", line 51, in <module>
  File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.4/urllib/request.py", line 463, in open
    response = self._open(req, data)
  File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
    '_open', req)
  File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.4/urllib/request.py", line 1225, in https_open
    context=self._context, check_hostname=self._check_hostname)
  File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -2] Name or service not known>