import re
# Sample input: whitespace-separated tokens, each written as ``word_TAG``.
text = "When_WRB it_PRP 's_VBZ time_NN for_IN their_PRP$ biannual_JJ powwow_NN ,_, the_DT nation_NN 's_POS manufacturing_NN titans_NNS typically_RB jet_VBP off_RP to_TO the_DT sunny_JJ confines_NNS of_IN resort_NN towns_NNS like_IN Boca_NNP Raton_NNP and_CC Hot_NNP Springs_NNP ._."

# Mapping table, one "<source tag> <replacement tag>" pair per line
# (presumably Penn Treebank tags -> a coarse universal tagset -- confirm).
# NOTE(review): the row "EX DT" maps EX to "DT", which is itself a source tag
# in the row just above it; this looks like a typo for "EX DET". The data is
# left untouched here -- confirm against the intended tagset before changing.
pos_file = "CC CONJ \nCD NUM \nDT DET \nEX DT \nFW X \nIN ADP \nJJ ADJ \nJJR ADJ \nJJS ADJ \nLS X \nMD VERB \nNN NOUN \nNNS NOUN \nNNP NOUN \nNNPS NOUN \nPDT DET \nPOS PRT \nPRP PRON \nPRP$ PRON \nRB ADV \nRBR ADV \nRBS ADV \nRP PRT \nSYM X \nTO PRT \nUH X \nVB VERB \nVBZ VERB \nVBP VERB \nVBD VERB \nVBN VERB \nVBG VERB \nWDT DET \nWP PRON \nWP$ PRON \nWRB ADV \n. . \n, . \n: . \n( . \n) . "

# Invert the table: replacement tag -> list of source tags, both in the order
# they first appear in ``pos_file``.
dict_pos = {}
for line in pos_file.splitlines():
    fields = line.split()
    if not fields:
        # A blank line carries no mapping; skip it instead of failing on
        # fields[1] below.
        continue
    # Columns beyond the first two are ignored. A line with a single column
    # still raises IndexError, so a malformed table is not silently accepted.
    dict_pos.setdefault(fields[1], []).append(fields[0])
def to_regex(x):
    """Return regex source that matches the literal string ``x`` as a whole tag.

    "Word character" below means a character for which ``str.isalnum()`` is
    true, or an underscore.

    * If ``x`` contains no word character at all (e.g. ``"."`` or ``"("``),
      the result is just ``re.escape(x)`` with no boundary guards.
    * Otherwise the match must not be preceded by an alphanumeric character.
      An underscore before the match is allowed, because tags are written as
      ``word_TAG``. The match must end at ``\\b`` when ``x`` ends in a word
      character, or at ``\\B`` when it ends in punctuation.

    The leading guard is the same lookbehind whether ``x`` starts with a word
    character or with punctuation. A plain ``\\B`` in front of a tag such as
    ``-LRB-`` could never match directly after the ``_`` separator, since
    ``_`` followed by punctuation is a word boundary.

    Args:
        x: Non-empty literal text to match.

    Returns:
        A regular-expression source string (not a compiled pattern).

    Raises:
        IndexError: If ``x`` is empty.
    """
    def is_word(ch):
        return ch == '_' or ch.isalnum()

    # Indexing first keeps the IndexError for an empty string; returning an
    # empty pattern instead would match at every position of the input.
    last = x[-1]

    literal = re.escape(x)
    if not any(map(is_word, x)):
        # Pure punctuation: nothing to anchor against.
        return literal

    leading = r'(?<![^\W_])'
    trailing = r'\b' if is_word(last) else r'\B'
    return leading + literal + trailing
# Flatten dict_pos (replacement -> [source tags]) into source tag ->
# replacement. Iterating in dict order with setdefault means that, if a tag
# were listed under two replacements, the earlier key wins.
tag_lookup = {}
for key, val in dict_pos.items():
    for tag in val:
        tag_lookup.setdefault(tag, key)

# Rewrite every tag in ONE pass over the text. Running a separate substitution
# per replacement key makes the result depend on dict iteration order whenever
# a replacement is itself a source tag (the table maps EX -> DT and DT -> DET):
# text produced by one pass can be rewritten again by a later one.
if tag_lookup:  # an empty alternation would match the empty string everywhere
    # Longest tags first so that "PRP$" is tried before "PRP"; the "\b" that
    # follows "PRP" is satisfied in front of "$".
    tag_rx = re.compile(
        "|".join(to_regex(tag) for tag in sorted(tag_lookup, key=len, reverse=True))
    )
    # The patterns only add zero-width guards around the escaped literal, so
    # m.group() is exactly the tag text. A callable replacement is inserted
    # verbatim, so no backslash escaping of the replacement is needed.
    text = tag_rx.sub(lambda m: tag_lookup[m.group()], text)

print(text)
aW1wb3J0IHJlCnRleHQgPSAiV2hlbl9XUkIgaXRfUFJQICdzX1ZCWiB0aW1lX05OIGZvcl9JTiB0aGVpcl9QUlAkIGJpYW5udWFsX0pKIHBvd3dvd19OTiAsXywgdGhlX0RUIG5hdGlvbl9OTiAnc19QT1MgbWFudWZhY3R1cmluZ19OTiB0aXRhbnNfTk5TIHR5cGljYWxseV9SQiBqZXRfVkJQIG9mZl9SUCB0b19UTyB0aGVfRFQgc3VubnlfSkogY29uZmluZXNfTk5TIG9mX0lOIHJlc29ydF9OTiB0b3duc19OTlMgbGlrZV9JTiBCb2NhX05OUCBSYXRvbl9OTlAgYW5kX0NDIEhvdF9OTlAgU3ByaW5nc19OTlAgLl8uIgpwb3NfZmlsZSA9ICJDQyAgQ09OSiAgXG5DRCAgTlVNICBcbkRUICBERVQgIFxuRVggIERUICBcbkZXICBYICBcbklOICBBRFAgICAgXG5KSiAgQURKICAgIFxuSkpSIEFESiAgIFxuSkpTIEFESiAgICBcbkxTICBYICAgIFxuTUQgIFZFUkIgICAgXG5OTiAgTk9VTiAgXG5OTlMgTk9VTiAgXG5OTlAgTk9VTiAgXG5OTlBTICAgIE5PVU4gIFxuUERUIERFVCBcblBPUyBQUlQgIFxuUFJQIFBST04gIFxuUFJQJCAgICBQUk9OICBcblJCICBBRFYgIFxuUkJSIEFEViAgXG5SQlMgQURWICBcblJQICBQUlQgIFxuU1lNIFggIFxuVE8gIFBSVCAgXG5VSCAgWCAgXG5WQiAgVkVSQiAgXG5WQlogVkVSQiAgXG5WQlAgVkVSQiAgXG5WQkQgVkVSQiAgXG5WQk4gVkVSQiAgXG5WQkcgVkVSQiAgXG5XRFQgREVUICBcbldQICBQUk9OICBcbldQJCBQUk9OICBcbldSQiBBRFYgIFxuLiAgIC4gIFxuLCAgIC4gXG46ICAgLiAgXG4oICAgLiAgXG4pICAgLiAgIgpkaWN0X3BvcyA9IHt9CmZvciBsaW5lIGluIHBvc19maWxlLnNwbGl0bGluZXMoKToKICAgIGMgPSBsaW5lLnN0cmlwKCkuc3BsaXQoKQogICAgZGljdF9wb3NbY1sxXV0gPSBkaWN0X3Bvcy5nZXQoY1sxXSwgbGlzdCgpKSArIFtjWzBdXQoKZGVmIHRvX3JlZ2V4KHgpOgoJciA9IFtdCglpZiB4WzBdLmlzYWxudW0oKSBvciB4WzBdID09ICdfJzoKCQlyLmFwcGVuZChyJyg/PCFbXlxXX10pJykKCWVsc2U6CgkJaWYgYW55KGwuaXNhbG51bSgpIG9yIGw9PSdfJyBmb3IgbCBpbiB4KToKCQkJci5hcHBlbmQocidcQicpCglyLmFwcGVuZChyZS5lc2NhcGUoeCkpCglpZiB4Wy0xXS5pc2FsbnVtKCkgb3IgeFstMV0gPT0gJ18nOgoJCXIuYXBwZW5kKHInXGInKQoJZWxzZToKCQlpZiBhbnkobC5pc2FsbnVtKCkgb3IgbD09J18nIGZvciBsIGluIHgpOgoJCQlyLmFwcGVuZChyJ1xCJykKCXJldHVybiAiIi5qb2luKHIpCgpyeF9kY3R2YWxzID0ge30KZm9yIGtleSwgdmFsIGluIGRpY3RfcG9zLml0ZW1zKCk6CiAgICByeF9kY3R2YWxzW3JlLmNvbXBpbGUoInwiLmpvaW4oc29ydGVkKFt0b19yZWdleCh2KSBmb3IgdiBpbiB2YWxdLCBrZXk9bGVuLCByZXZlcnNlPVRydWUpKSldID0ga2V5Cgpmb3IgcngsIHJlcGwgaW4gcnhfZGN0dmFscy5pdGVtcygpOgoJdGV4dCA9IHJ4LnN1YihyZXBsLnJlcGxhY2UoJ1xcJywgJ1xcXFwnKSwgdGV4dCkKCnByaW50KHRleHQp