# Markov-chain sentence generator fed by the <blockquote> texts of a downloaded thread page.
# -*- coding: utf-8 -*-

import re
import urllib.request as req
from random import choice, sample

### Settings
# Patterns used to pull the post texts out of the downloaded page.  Every
# quantifier that crosses markup is either non-greedy or bounded by ``[^>]``
# so that a match cannot run past the element it started in.

# One complete <blockquote ...>...</blockquote> element.  The body may be
# empty and may span several lines (hence DOTALL).
re_blockquote = re.compile(r'<blockquote\b[^>]*>.*?</blockquote>',
                           flags=re.DOTALL)
# A lone opening or closing blockquote tag, used to strip the wrapper.
re_remove_blockquote = re.compile(r'<[/]*blockquote.*?>', flags=re.DOTALL)
# A single <a ...>...</a> element on one line.  ``\b`` keeps other tags that
# merely start with "a" (e.g. <abbr>) from matching, and the lazy body stops
# at the first </a> instead of the last one on the line.
re_remove_a = re.compile(r'<a\b[^>]*>.*?</a>')
# One HTML character reference (&name;, &#123; or &#x1F;) and nothing else;
# the text surrounding it is preserved.
re_remove_u_symbols = re.compile(r'&#?[0-9A-Za-z]+;')

# Ordered [old, new] substitution rules.  Order matters: markup is turned
# into sentence breaks first, then punctuation is normalised, and the last
# four rules collapse the doubled spaces and dots the earlier ones leave.
# Disabled rules kept for reference: ';' -> '.', ',' -> ' ', '-' -> ' '.
replace_list = [
    # Line structure and block-level markup.
    ["\n", ""],
    ["\t", " "],
    ["<br>", ""],
    ["<blockquote>", ""],
    ["</blockquote>", ""],
    # Inline markup: most tags become a sentence break, <strong> a space.
    ['<span class="spoiler">', "."],
    ['<span class="unkfunc">', "."],
    ['<span class="u">', "."],
    ["</span>", "."],
    ["<strong>", " "],
    ["</strong>", " "],
    ["<em>", "."],
    ["</em>", "."],
    ["<sub>", "."],
    ["</sub>", "."],
    ["<sup>", "."],
    ["</sup>", "."],
    # Punctuation: brackets become commas, '!' and '?' end a sentence.
    ["(", ","],
    [")", ","],
    ["!", "."],
    ["?", "."],
    # Final clean-up of repeated separators.
    ["  ", " "],
    [". ", "."],
    [" .", "."],
    ["..", "."],
]

### Parser
# Keep asking until the answer looks like an absolute http(s) URL.  A string
# that merely contains "http" somewhere would only fail later in urlopen().
url = ''
while not url.lower().startswith(('http://', 'https://')):
    url = input('full thread url:>').strip()

print(">Downloading thread %s..." % (url,))

# The context manager closes the response, so no explicit close() is needed.
with req.urlopen(url) as threadf:
    thread = threadf.read().decode('utf-8')

# Links are dropped first, then each post's <blockquote> element is isolated.
res = re_blockquote.findall(re_remove_a.sub('', thread))

# Strip the blockquote wrapper and the character references from every post.
# A real list (rather than a one-shot map iterator) lets the posts be
# iterated more than once.
raw_messages = [
    re_remove_u_symbols.sub('', re_remove_blockquote.sub('', post))
    for post in res
]

# The page source and the intermediate matches are no longer needed.
del thread
del res
    

### Replacing
# Run every substitution rule over every raw post, in rule order.
print(">Replacing...")
messages = []

for raw_text in raw_messages:
    cleaned = raw_text
    for old, new in replace_list:
        # One replace() pass can leave fresh occurrences behind
        # ("..." -> ".."), so repeat until the pattern is gone.
        while old in cleaned:
            cleaned = cleaned.replace(old, new)
    messages.append(cleaned.strip())

### Splitting the messages into sentences and words for the Markov dict

# Every '.'-separated fragment of every cleaned message is one sentence.
s = [sentence for message in messages for sentence in message.split('.')]

# One list of words per sentence.  split() without an argument discards all
# empty tokens (not just the first one), and sentences that end up with no
# words are filtered out while building a new list, so nothing is removed
# from a list that is being iterated.
s1 = [tokens for tokens in (sentence.split() for sentence in s) if tokens]

# Transition table: word -> list of the words seen directly after it.
# Duplicates are kept on purpose so that a uniform random pick from a list
# follows the observed frequencies.  The key '.' collects the first word of
# every sentence, and the value '.' marks "the sentence ended here".
words = {'.': []}

for sentence in s1:
    if not sentence:
        # Nothing to record; sentence[0] below would raise IndexError.
        continue

    words['.'].append(sentence[0])

    # Each word is followed by its right-hand neighbour ...
    for current, following in zip(sentence, sentence[1:]):
        words.setdefault(current, []).append(following)

    # ... and the last word is followed by the end-of-sentence marker.
    words.setdefault(sentence[-1], []).append('.')

### Building sentences 4 Markov

# Sentence openers: title-cased words if there are any, otherwise the words
# that actually started a sentence.  Without the fallback, choice() raises
# IndexError on text that contains no title-cased word.
big = [word for word in words if word.istitle()] or list(words.get('.', []))

print()
print()

if not big:
    # Empty thread: there is nothing to build a chain from.
    print(">No words found, nothing to generate.")
else:
    for _ in range(100):
        chain = [choice(big)]
        # Follow random transitions until the end-of-sentence marker is drawn.
        while chain[-1] != '.':
            chain.append(choice(words[chain[-1]]))
        print(' '.join(chain))

