fork download
  import re

  # Sample input: whitespace-separated "word_TAG" tokens.
  text = "When_WRB it_PRP 's_VBZ time_NN for_IN their_PRP$ biannual_JJ powwow_NN ,_, the_DT nation_NN 's_POS manufacturing_NN titans_NNS typically_RB jet_VBP off_RP to_TO the_DT sunny_JJ confines_NNS of_IN resort_NN towns_NNS like_IN Boca_NNP Raton_NNP and_CC Hot_NNP Springs_NNP ._."
  # Mapping table, one "SOURCE_TAG TARGET_TAG" pair per line.
  # NOTE(review): "EX DT" maps EX to DT while every other determiner maps to
  # DET -- possibly a typo in the table; confirm before changing the data.
  pos_file = "CC CONJ \nCD NUM \nDT DET \nEX DT \nFW X \nIN ADP \nJJ ADJ \nJJR ADJ \nJJS ADJ \nLS X \nMD VERB \nNN NOUN \nNNS NOUN \nNNP NOUN \nNNPS NOUN \nPDT DET \nPOS PRT \nPRP PRON \nPRP$ PRON \nRB ADV \nRBR ADV \nRBS ADV \nRP PRT \nSYM X \nTO PRT \nUH X \nVB VERB \nVBZ VERB \nVBP VERB \nVBD VERB \nVBN VERB \nVBG VERB \nWDT DET \nWP PRON \nWP$ PRON \nWRB ADV \n. . \n, . \n: . \n( . \n) . "

  # Group the source tags under their target tag:
  # {target_tag: [source_tag, ...]}, in table order.
  dict_pos = {}
  for line in pos_file.splitlines():
      fields = line.split()
      if len(fields) < 2:
          # Skip blank or incomplete rows instead of raising IndexError.
          continue
      dict_pos.setdefault(fields[1], []).append(fields[0])


  def to_regex(x):
      """Return a regex fragment matching the literal *x* with adaptive boundaries.

      The literal is escaped with ``re.escape``. On each side the boundary
      depends on the character at that edge of *x*:

      * edge character is alphanumeric or ``_``: on the left, a lookbehind
        forbids a preceding alphanumeric character (a preceding ``_`` is
        allowed); on the right, ``\\b`` is appended;
      * edge character is punctuation but *x* contains an alphanumeric or
        ``_`` somewhere (e.g. ``PRP$``): ``\\B`` is added on that side;
      * *x* is pure punctuation: no boundary is added.

      *x* must be a non-empty string (an empty string raises ``IndexError``).
      """
      def is_word_char(ch):
          return ch.isalnum() or ch == '_'

      has_word_char = any(is_word_char(ch) for ch in x)
      parts = []
      if is_word_char(x[0]):
          parts.append(r'(?<![^\W_])')
      elif has_word_char:
          parts.append(r'\B')
      parts.append(re.escape(x))
      if is_word_char(x[-1]):
          parts.append(r'\b')
      elif has_word_char:
          parts.append(r'\B')
      return "".join(parts)


  # Flat lookup {source_tag: target_tag} so that all tags can be replaced in a
  # single pass; replaced text is never rescanned, so a mapping such as
  # EX -> DT cannot be chained into DT -> DET.
  tag_to_universal = {
      tag: universal for universal, tags in dict_pos.items() for tag in tags
  }

  # One alternation over every source tag, longest fragment first so that the
  # longer of two tags sharing a prefix is tried first.  The match is
  # anchored to the tag position of a token: right after "_" and right before
  # whitespace or the end of the text.  This keeps the word half of a token
  # intact (",_," becomes ",_." rather than "._.", and "3,000_CD" keeps its
  # comma).
  tag_rx = re.compile(
      r"(?<=_)(?:"
      + "|".join(sorted((to_regex(tag) for tag in tag_to_universal), key=len, reverse=True))
      + r")(?!\S)"
  )


  def universal_tags(tagged):
      """Return *tagged* with each known tag replaced by its mapped tag.

      *tagged* is a string of whitespace-separated ``word_TAG`` tokens.  Only
      the tag after the final ``_`` of a token is rewritten; tags that are not
      in the mapping table are left unchanged.
      """
      return tag_rx.sub(
          lambda m: tag_to_universal.get(m.group(), m.group()), tagged
      )


  text = universal_tags(text)

  print(text)
Success #stdin #stdout 0.03s 9392KB
stdin
Standard input is empty
stdout
When_ADV it_PRON 's_VERB time_NOUN for_ADP their_PRON biannual_ADJ powwow_NOUN ._. the_DET nation_NOUN 's_PRT manufacturing_NOUN titans_NOUN typically_ADV jet_VERB off_PRT to_PRT the_DET sunny_ADJ confines_NOUN of_ADP resort_NOUN towns_NOUN like_ADP Boca_NOUN Raton_NOUN and_CONJ Hot_NOUN Springs_NOUN ._.