#!/usr/bin/env python
"""Collect occurrence counts from whitespace-separated records.

Each matching input line has four fields: a word, a colon-separated
relation with three parts (the middle part may contain spaces, e.g.
``a:because of:n``), a second token, and an integer count.  Lines are
read from the files named on the command line, or from standard input
when none are given (``fileinput`` semantics).

The script prints the list of counts in input order, followed by a dict
mapping ``(word, rel, wirt)`` to the list of counts divided by
``NORMALIZER``.
"""
import fileinput
import re
from collections import defaultdict
from pprint import pprint

# Divisor applied to every raw count.
# NOTE(review): presumably the total number of occurrences in the source
# data -- confirm before reusing with another data set.
NORMALIZER = 1064542.0

# word, relation (x:y:z, middle part may contain spaces), token, count.
# Compiled once at module level instead of being looked up per line.
RECORD_RE = re.compile(r"(\S+)\s+([^:]+:[^:]+:\S+)\s+(\S+)\s+(\d+)")


def parse_lines(lines, total=NORMALIZER):
    """Parse records from *lines* and return ``(occ_list, observed)``.

    ``occ_list`` holds the integer count of every matching line, in input
    order.  ``observed`` is a plain dict mapping ``(word, rel, wirt)`` to
    the list of ``count / total`` values seen for that key.  Lines that do
    not match the record pattern are skipped.
    """
    # float() keeps true division even if a caller passes an int total
    # under Python 2.
    total = float(total)
    occ_list = []
    observed = defaultdict(list)
    for line in lines:
        m = RECORD_RE.search(line)
        if not m:
            continue
        word, rel, wirt, occ = m.groups()
        occ = int(occ)
        occ_list.append(occ)
        observed[word, rel, wirt].append(occ / total)
    # Return a plain dict so later lookups cannot create empty entries.
    return occ_list, dict(observed)


def main():
    """Read records via ``fileinput`` and pretty-print both results."""
    occ_list, observed = parse_lines(fileinput.input())
    pprint(occ_list)
    pprint(observed)


if __name__ == "__main__":
    main()
abroad a:at:n request 1 abroad a:at:n silence 1 abroad a:at:n time 6 abroad a:because of:n schedule 1 abroad a:by:n american 1 abroad a:by:n bank 1 abroad a:by:n blow 1
[1, 1, 6, 1, 1, 1, 1] {('abroad', 'a:at:n', 'request'): [9.393711098293914e-07], ('abroad', 'a:at:n', 'silence'): [9.393711098293914e-07], ('abroad', 'a:at:n', 'time'): [5.636226658976349e-06], ('abroad', 'a:because of:n', 'schedule'): [9.393711098293914e-07], ('abroad', 'a:by:n', 'american'): [9.393711098293914e-07], ('abroad', 'a:by:n', 'bank'): [9.393711098293914e-07], ('abroad', 'a:by:n', 'blow'): [9.393711098293914e-07]}