#!/usr/bin/env python
import fileinput
import re
from collections import defaultdict
from pprint import pprint

occ_list = []
observed = defaultdict(list)
for line in fileinput.input():
    m = re.search(r"(\S+)\s+([^:]+:[^:]+:\S+)\s+(\S+)\s+(\d+)", line)
    if m:
       word, rel, wirt, occ = m.groups()
       occ = int(occ)
       occ_list.append(occ)
       observed[word, rel, wirt].append(occ / 1064542.0)

pprint(occ_list)
pprint(dict(observed))
