import json
import attr
import lxml.html
@attr.s
class Child:
    """One node of the converted tree: a title, a link and nested children."""

    # Text of the entry; remains None unless it is assigned after creation.
    title = attr.ib(default=None)
    # Link target of the entry; remains None unless it is assigned.
    link = attr.ib(default=None)
    # Nested entries. A Factory builds a fresh list for every instance, so
    # instances created with the default never share one list object.
    subtree = attr.ib(default=attr.Factory(list))
def first(iterable, default=None):
    """Return the first item of *iterable*, or *default* when it is empty.

    Args:
        iterable: Any iterable; only its first item is consumed.
        default: Value returned when *iterable* yields nothing.

    Returns:
        The first item produced by *iterable*, otherwise *default*.
    """
    # Two-argument next() returns the fallback instead of raising
    # StopIteration, so the caller-supplied default is actually honoured.
    return next(iter(iterable), default)
def get_tree(node):
    """Convert each direct child of *node* with get_child().

    Returns:
        A list holding the converted children in iteration order.
    """
    return [get_child(element) for element in node.iterchildren()]
def get_child(child_node):
    """Convert one ``ul`` or ``li`` element into a plain dict.

    A ``ul`` contributes only its converted children as ``subtree``. An
    ``li`` contributes the text content of its first ``span`` child as
    ``title``, the first ``span/a`` href as ``link`` and, when it directly
    contains a ``ul``, that list's converted children as ``subtree``.

    Args:
        child_node: Element exposing ``tag``, ``find()`` and ``xpath()``.

    Returns:
        The populated ``Child`` converted with ``attr.asdict()``.

    Raises:
        Exception: If the element's tag is neither ``ul`` nor ``li``.
    """
    child = Child()
    tag = child_node.tag
    if tag == 'ul':
        child.subtree = get_tree(child_node)
    elif tag == 'li':
        span = child_node.find('span')
        # An <li> without a <span> keeps title=None, mirroring how a missing
        # link is tolerated, instead of failing with AttributeError.
        if span is not None:
            child.title = span.text_content()
        child.link = first(child_node.xpath('span/a/@href'))
        ul = child_node.find('ul')
        if ul is not None:
            child.subtree = get_tree(ul)
    else:
        # The message must be passed to the exception in the same statement;
        # otherwise the offending tag is never reported.
        raise Exception("Unexpected tag: {}".format(tag))
    return attr.asdict(child)
def main():
    """Convert the saved forum-map HTML page into a nested JSON file.

    Reads ``./rutracker_cr_forum_map.html``, converts the children of the
    element with id ``f-map`` via get_tree() and writes the result to
    ``rutracker_cr_forum_map.json`` with 4-space indentation.
    """
    # NOTE(review): the page is decoded with the platform default encoding;
    # confirm the saved file's encoding before pinning one here.
    with open('./rutracker_cr_forum_map.html') as fin:
        html = lxml.html.fromstring(fin.read())
    fmap = html.get_element_by_id('f-map')
    tree = get_tree(fmap)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the output
    # encoding is pinned to UTF-8 rather than left to the locale default.
    with open('rutracker_cr_forum_map.json', 'w', encoding='utf-8') as fout:
        json.dump(tree, fout, ensure_ascii=False, indent=4)
# Run the conversion only when this file is executed as a script; importing
# the module must not trigger any file I/O.
if __name__ == "__main__":
    main()
aW1wb3J0IGpzb24KCmltcG9ydCBhdHRyCmltcG9ydCBseG1sLmh0bWwKCgpAYXR0ci5zCmNsYXNzIENoaWxkOgogICAgdGl0bGUgPSBhdHRyLmliKGRlZmF1bHQ9Tm9uZSkKICAgIGxpbmsgPSBhdHRyLmliKGRlZmF1bHQ9Tm9uZSkKICAgIHN1YnRyZWUgPSBhdHRyLmliKGRlZmF1bHQ9W10pCgoKZGVmIGZpcnN0KGl0ZXJhYmxlLCBkZWZhdWx0PU5vbmUpOgogICAgdHJ5OgogICAgICAgIHJldHVybiBuZXh0KGl0ZXIoaXRlcmFibGUpKQogICAgZXhjZXB0IFN0b3BJdGVyYXRpb246CiAgICAgICAgcmV0dXJuIE5vbmUKCgpkZWYgZ2V0X3RyZWUobm9kZSk6CiAgICBjaGlsZHMgPSBbXQogICAgZm9yIGNoaWxkX25vZGUgaW4gbm9kZS5pdGVyY2hpbGRyZW4oKToKICAgICAgICBjaGlsZCA9IGdldF9jaGlsZChjaGlsZF9ub2RlKQogICAgICAgIGNoaWxkcy5hcHBlbmQoY2hpbGQpCiAgICByZXR1cm4gY2hpbGRzCgoKZGVmIGdldF9jaGlsZChjaGlsZF9ub2RlKToKICAgIGNoaWxkID0gQ2hpbGQoKQogICAgdGFnID0gY2hpbGRfbm9kZS50YWcKICAgIGlmIHRhZyA9PSAndWwnOgogICAgICAgIGNoaWxkLnN1YnRyZWUgPSBnZXRfdHJlZShjaGlsZF9ub2RlKQogICAgZWxpZiB0YWcgPT0gJ2xpJzoKICAgICAgICBjaGlsZC50aXRsZSA9IGNoaWxkX25vZGUuZmluZCgnc3BhbicpLnRleHRfY29udGVudCgpCiAgICAgICAgY2hpbGQubGluayA9IGZpcnN0KGNoaWxkX25vZGUueHBhdGgoJ3NwYW4vYS9AaHJlZicpKQogICAgICAgIHVsID0gY2hpbGRfbm9kZS5maW5kKCd1bCcpCiAgICAgICAgaWYgdWwgaXMgbm90IE5vbmU6CiAgICAgICAgICAgIGNoaWxkLnN1YnRyZWUgPSBnZXRfdHJlZSh1bCkKICAgIGVsc2U6CiAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJVbmV4cGVjdGVkIHRhZzoge30iLmZvcm1hdCh0YWcpKQogICAgcmV0dXJuIGF0dHIuYXNkaWN0KGNoaWxkKQoKCmRlZiBtYWluKCk6CiAgICB3aXRoIG9wZW4oJy4vcnV0cmFja2VyX2NyX2ZvcnVtX21hcC5odG1sJykgYXMgZmluOgogICAgICAgIGh0bWwgPSBseG1sLmh0bWwuZnJvbXN0cmluZyhmaW4ucmVhZCgpKQogICAgZm1hcCA9IGh0bWwuZ2V0X2VsZW1lbnRfYnlfaWQoJ2YtbWFwJykKICAgIHRyZWUgPSBnZXRfdHJlZShmbWFwKQogICAgd2l0aCBvcGVuKCdydXRyYWNrZXJfY3JfZm9ydW1fbWFwLmpzb24nLCAndycpIGFzIGZvdXQ6CiAgICAgICAganNvbi5kdW1wKHRyZWUsIGZvdXQsIGVuc3VyZV9hc2NpaT1GYWxzZSwgaW5kZW50PTQpCgoKaWYgX19uYW1lX18gPT0gIl9fbWFpbl9fIjoKICAgIG1haW4oKQo=