import json
import lxml.html
def first(iterable, default=None):
    """Return the first item of *iterable*, or *default* if it is empty.

    Args:
        iterable: Any object accepted by ``iter()``.
        default: Value returned when *iterable* yields nothing.
            Defaults to ``None``.

    Returns:
        The first item produced by iterating *iterable*, or *default* when
        iteration ends immediately.
    """
    # The two-argument form of next() returns the fallback instead of
    # raising StopIteration, so the caller-supplied default is honoured.
    return next(iter(iterable), default)
def get_tree(node):
    """Build a list describing every direct child of *node*.

    Each child yielded by ``node.iterchildren()`` is converted with
    ``get_child``; the results are returned in the order they were yielded.
    """
    return [get_child(element) for element in node.iterchildren()]
def get_child(child_node):
    """Convert a single ``ul`` or ``li`` node into a dict.

    * ``ul`` -> ``{'subtree': [...]}`` built from the node's own children.
    * ``li`` -> ``{'title': ..., 'link': ...}`` where the title is the text
      content of the node's ``span`` child and the link is the first
      ``span/a/@href`` match (``None`` when there is no match); a
      ``'subtree'`` key is added when the ``li`` has a ``ul`` child.

    Raises:
        Exception: if the node's tag is neither ``ul`` nor ``li``.
    """
    kind = child_node.tag

    # A bare list: its only payload is the converted children.
    if kind == 'ul':
        return {'subtree': get_tree(child_node)}

    # Anything other than a list item is rejected outright.
    if kind != 'li':
        raise Exception("Unexpected tag: {}".format(kind))

    # Key order (title, link, subtree) is kept because it determines the
    # order of keys in the serialized output.
    entry = {
        'title': child_node.find('span').text_content(),
        'link': first(child_node.xpath('span/a/@href')),
    }
    nested = child_node.find('ul')
    if nested is not None:
        entry['subtree'] = get_tree(nested)
    return entry
def main():
    """Convert the saved forum-map HTML page into a nested JSON file.

    Reads ``./rutracker_cr_forum_map.html``, looks up the element whose id
    is ``f-map``, converts its children with ``get_tree`` and writes the
    resulting list to ``rutracker_cr_forum_map.json`` with 4-space
    indentation.
    """
    # NOTE(review): the input is decoded with the platform default encoding;
    # confirm the saved page's actual encoding before pinning one here.
    with open('./rutracker_cr_forum_map.html') as fin:
        html = lxml.html.fromstring(fin.read())

    fmap = html.get_element_by_id('f-map')
    tree = get_tree(fmap)

    # ensure_ascii=False writes non-ASCII characters verbatim, so the output
    # encoding is pinned to UTF-8 instead of depending on the locale (which
    # could raise UnicodeEncodeError or yield non-UTF-8 JSON).
    with open('rutracker_cr_forum_map.json', 'w', encoding='utf-8') as fout:
        json.dump(tree, fout, ensure_ascii=False, indent=4)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
aW1wb3J0IGpzb24KCmltcG9ydCBseG1sLmh0bWwKCgpkZWYgZmlyc3QoaXRlcmFibGUsIGRlZmF1bHQ9Tm9uZSk6CiAgICB0cnk6CiAgICAgICAgcmV0dXJuIG5leHQoaXRlcihpdGVyYWJsZSkpCiAgICBleGNlcHQgU3RvcEl0ZXJhdGlvbjoKICAgICAgICByZXR1cm4gTm9uZQoKCmRlZiBnZXRfdHJlZShub2RlKToKICAgIGNoaWxkcyA9IFtdCiAgICBmb3IgY2hpbGRfbm9kZSBpbiBub2RlLml0ZXJjaGlsZHJlbigpOgogICAgICAgIGNoaWxkID0gZ2V0X2NoaWxkKGNoaWxkX25vZGUpCiAgICAgICAgY2hpbGRzLmFwcGVuZChjaGlsZCkKICAgIHJldHVybiBjaGlsZHMKCgpkZWYgZ2V0X2NoaWxkKGNoaWxkX25vZGUpOgogICAgY2hpbGQgPSB7fQogICAgdGFnID0gY2hpbGRfbm9kZS50YWcKICAgIGlmIHRhZyA9PSAndWwnOgogICAgICAgIGNoaWxkWydzdWJ0cmVlJ10gPSBnZXRfdHJlZShjaGlsZF9ub2RlKQogICAgZWxpZiB0YWcgPT0gJ2xpJzoKICAgICAgICBjaGlsZFsndGl0bGUnXSA9IGNoaWxkX25vZGUuZmluZCgnc3BhbicpLnRleHRfY29udGVudCgpCiAgICAgICAgY2hpbGRbJ2xpbmsnXSA9IGZpcnN0KGNoaWxkX25vZGUueHBhdGgoJ3NwYW4vYS9AaHJlZicpKQogICAgICAgIHVsID0gY2hpbGRfbm9kZS5maW5kKCd1bCcpCiAgICAgICAgaWYgdWwgaXMgbm90IE5vbmU6CiAgICAgICAgICAgIGNoaWxkWydzdWJ0cmVlJ10gPSBnZXRfdHJlZSh1bCkKICAgIGVsc2U6CiAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJVbmV4cGVjdGVkIHRhZzoge30iLmZvcm1hdCh0YWcpKQogICAgcmV0dXJuIGNoaWxkCgoKZGVmIG1haW4oKToKICAgIHdpdGggb3BlbignLi9ydXRyYWNrZXJfY3JfZm9ydW1fbWFwLmh0bWwnKSBhcyBmaW46CiAgICAgICAgaHRtbCA9IGx4bWwuaHRtbC5mcm9tc3RyaW5nKGZpbi5yZWFkKCkpCiAgICBmbWFwID0gaHRtbC5nZXRfZWxlbWVudF9ieV9pZCgnZi1tYXAnKQogICAgdHJlZSA9IGdldF90cmVlKGZtYXApCiAgICB3aXRoIG9wZW4oJ3J1dHJhY2tlcl9jcl9mb3J1bV9tYXAuanNvbicsICd3JykgYXMgZm91dDoKICAgICAgICBqc29uLmR1bXAodHJlZSwgZm91dCwgZW5zdXJlX2FzY2lpPUZhbHNlLCBpbmRlbnQ9NCkKCgppZiBfX25hbWVfXyA9PSAiX19tYWluX18iOgogICAgbWFpbigpCg==