fork download
  1. #!/usr/bin/env python
  2. import fileinput
  3. import re
  4. from collections import defaultdict
  5. from pprint import pprint
  6.  
  7. occ_list = []
  8. observed = defaultdict(list)
  9. for line in fileinput.input():
  10. m = re.search(r"(\S+)\s+([^:]+:[^:]+:\S+)\s+(\S+)\s+(\d+)", line)
  11. if m:
  12. word, rel, wirt, occ = m.groups()
  13. occ = int(occ)
  14. occ_list.append(occ)
  15. observed[word, rel, wirt].append(occ / 1064542.0)
  16.  
  17. pprint(occ_list)
  18. pprint(dict(observed))
  19.  
Success #stdin #stdout 0.03s 5908KB
stdin
abroad a:at:n request 1
abroad a:at:n silence 1
abroad a:at:n time 6
abroad a:because of:n schedule 1
abroad a:by:n american 1
abroad a:by:n bank 1
abroad a:by:n blow 1
stdout
[1, 1, 6, 1, 1, 1, 1]
{('abroad', 'a:at:n', 'request'): [9.393711098293914e-07],
 ('abroad', 'a:at:n', 'silence'): [9.393711098293914e-07],
 ('abroad', 'a:at:n', 'time'): [5.636226658976349e-06],
 ('abroad', 'a:because of:n', 'schedule'): [9.393711098293914e-07],
 ('abroad', 'a:by:n', 'american'): [9.393711098293914e-07],
 ('abroad', 'a:by:n', 'bank'): [9.393711098293914e-07],
 ('abroad', 'a:by:n', 'blow'): [9.393711098293914e-07]}