import re
# Input: the sentence to rewrite, and the replacement table.
text = (
    "I've been bad but I aspire to be a better person, "
    "and behave like my dog and cat :)"
)

# Maps each replacement label to the phrases it stands for.
# A '*' inside a phrase is a wildcard for any run of word characters.
a = {
    "animal": [
        "dog",
        "cat",
        "dog and cat",
    ],
    "XXX": [
        "I've been",
        "asp*",
        ":)",
    ],
}
class Trie:
    r"""Regex::Trie in Python.

    Creates a trie out of a list of words. The trie can be exported to a
    regex pattern; the corresponding regex should match much faster than a
    simple regex union of the same words.

    A ``*`` inside a word is a wildcard rendered as ``\w*``; every other
    character is matched literally.
    """

    def __init__(self):
        # Nested dicts keyed by character; the key '' marks the end of a word.
        self.data = {}

    def add(self, word):
        """Insert *word* into the trie, one character per nesting level."""
        ref = self.data
        for char in word:
            ref = ref.setdefault(char, {})
        ref[''] = 1

    def dump(self):
        """Return the nested-dict representation of the trie."""
        return self.data

    def quote(self, char):
        r"""Return the regex fragment for a single trie character.

        ``*`` expands to ``\w*``; anything else goes through ``re.escape``.
        """
        if char == '*':
            return r'\w*'
        return re.escape(char)

    def _pattern(self, pData):
        """Return the regex for the sub-trie *pData*.

        Returns ``None`` when the node holds nothing but the end-of-word
        marker, i.e. there is nothing left to match below it.
        """
        data = pData
        if "" in data and len(data) == 1:
            return None

        alt = []   # alternatives that are full sub-patterns
        cc = []    # single literal characters, merged into one [...] class
        q = False  # True when a word may also end at this node
        for char in sorted(data):
            child = data[char]
            if not isinstance(child, dict):
                # Only the '' end-of-word marker maps to a non-dict value.
                q = True
                continue
            quoted = self.quote(char)
            recurse = self._pattern(child)
            if recurse is not None:
                alt.append(quoted + recurse)
            elif quoted == re.escape(char):
                cc.append(quoted)
            else:
                # quote() expanded the character into a sub-pattern (the
                # wildcard); inside [...] it would be read as a set of single
                # characters, so it has to stay a standalone alternative.
                alt.append(quoted)
        # Decided before the class is appended: True means the result is a
        # single character or class, which can take a bare '?'.
        cconly = not alt

        if cc:
            if len(cc) == 1:
                alt.append(cc[0])
            else:
                alt.append('[' + ''.join(cc) + ']')

        if len(alt) == 1:
            result = alt[0]
        else:
            result = "(?:" + "|".join(alt) + ")"

        if q:
            if cconly:
                result += "?"
            else:
                result = "(?:%s)?" % result
        return result

    def pattern(self):
        """Return the regex pattern matching every word added so far.

        Returns ``None`` when the trie holds only the empty word.
        """
        return self._pattern(self.dump())
# Creating patterns: one compiled, case-insensitive regex per label, built
# from a trie of that label's phrases.
a2 = {}
for k, v in a.items():
    trie = Trie()
    for w in v:
        trie.add(w)
    # The lookarounds reject matches that start or end inside a longer word.
    a2[k] = re.compile(fr"(?<!\w){trie.pattern()}(?!\w)", re.I)

# Substitutions run one label at a time, in the order the labels appear in
# `a`, so a later pattern also sees the labels written by earlier ones.
for k, r in a2.items():
    # A callable replacement inserts the label verbatim; a plain string would
    # have its backslash escapes and group references expanded by re.sub.
    text = r.sub(lambda _match, label=k: label, text)

print(text)
aW1wb3J0IHJlCgojIElucHV0CnRleHQgPSAiSSd2ZSBiZWVuIGJhZCBidXQgSSBhc3BpcmUgdG8gYmUgYSBiZXR0ZXIgcGVyc29uLCBhbmQgYmVoYXZlIGxpa2UgbXkgZG9nIGFuZCBjYXQgOikiCmEgPSB7ImFuaW1hbCI6IFsgImRvZyIsICJjYXQiLCAiZG9nIGFuZCBjYXQiXSwgIlhYWCI6IFsiSSd2ZSBiZWVuIiwgImFzcCoiLCAiOikiXX0KCmNsYXNzIFRyaWUoKToKICAgICIiIlJlZ2V4OjpUcmllIGluIFB5dGhvbi4gQ3JlYXRlcyBhIFRyaWUgb3V0IG9mIGEgbGlzdCBvZiB3b3Jkcy4gVGhlIHRyaWUgY2FuIGJlIGV4cG9ydGVkIHRvIGEgUmVnZXggcGF0dGVybi4KICAgIFRoZSBjb3JyZXNwb25kaW5nIFJlZ2V4IHNob3VsZCBtYXRjaCBtdWNoIGZhc3RlciB0aGFuIGEgc2ltcGxlIFJlZ2V4IHVuaW9uLiIiIgogICAgZGVmIF9faW5pdF9fKHNlbGYpOgogICAgICAgIHNlbGYuZGF0YSA9IHt9CgogICAgZGVmIGFkZChzZWxmLCB3b3JkKToKICAgICAgICByZWYgPSBzZWxmLmRhdGEKICAgICAgICBmb3IgY2hhciBpbiB3b3JkOgogICAgICAgICAgICByZWZbY2hhcl0gPSBjaGFyIGluIHJlZiBhbmQgcmVmW2NoYXJdIG9yIHt9CiAgICAgICAgICAgIHJlZiA9IHJlZltjaGFyXQogICAgICAgIHJlZlsnJ10gPSAxCgogICAgZGVmIGR1bXAoc2VsZik6CiAgICAgICAgcmV0dXJuIHNlbGYuZGF0YQoKICAgIGRlZiBxdW90ZShzZWxmLCBjaGFyKToKICAgICAgICBpZiBjaGFyID09ICcqJzoKICAgICAgICAgICAgcmV0dXJuIHInXHcqJwogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiByZS5lc2NhcGUoY2hhcikKCiAgICBkZWYgX3BhdHRlcm4oc2VsZiwgcERhdGEpOgogICAgICAgIGRhdGEgPSBwRGF0YQogICAgICAgIGlmICIiIGluIGRhdGEgYW5kIGxlbihkYXRhLmtleXMoKSkgPT0gMToKICAgICAgICAgICAgcmV0dXJuIE5vbmUKCiAgICAgICAgYWx0ID0gW10KICAgICAgICBjYyA9IFtdCiAgICAgICAgcSA9IDAKICAgICAgICBmb3IgY2hhciBpbiBzb3J0ZWQoZGF0YS5rZXlzKCkpOgogICAgICAgICAgICBpZiBpc2luc3RhbmNlKGRhdGFbY2hhcl0sIGRpY3QpOgogICAgICAgICAgICAgICAgdHJ5OgogICAgICAgICAgICAgICAgICAgIHJlY3Vyc2UgPSBzZWxmLl9wYXR0ZXJuKGRhdGFbY2hhcl0pCiAgICAgICAgICAgICAgICAgICAgYWx0LmFwcGVuZChzZWxmLnF1b3RlKGNoYXIpICsgcmVjdXJzZSkKICAgICAgICAgICAgICAgIGV4Y2VwdDoKICAgICAgICAgICAgICAgICAgICBjYy5hcHBlbmQoc2VsZi5xdW90ZShjaGFyKSkKICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHEgPSAxCiAgICAgICAgY2Nvbmx5ID0gbm90IGxlbihhbHQpID4gMAoKICAgICAgICBpZiBsZW4oY2MpID4gMDoKICAgICAgICAgICAgaWYgbGVuKGNjKSA9PSAxOgogICAgICAgICAgICAgICAgYWx0LmFwcGVuZChjY1swXSkKICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIGFsdC5hcHBlbmQoJ1snICsgJycuam9pbihjYykg
KyAnXScpCgogICAgICAgIGlmIGxlbihhbHQpID09IDE6CiAgICAgICAgICAgIHJlc3VsdCA9IGFsdFswXQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJlc3VsdCA9ICIoPzoiICsgInwiLmpvaW4oYWx0KSArICIpIgoKICAgICAgICBpZiBxOgogICAgICAgICAgICBpZiBjY29ubHk6CiAgICAgICAgICAgICAgICByZXN1bHQgKz0gIj8iCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICByZXN1bHQgPSAiKD86JXMpPyIgJSByZXN1bHQKICAgICAgICByZXR1cm4gcmVzdWx0CgogICAgZGVmIHBhdHRlcm4oc2VsZik6CiAgICAgICAgcmV0dXJuIHNlbGYuX3BhdHRlcm4oc2VsZi5kdW1wKCkpCgojIENyZWF0aW5nIHBhdHRlcm5zCmEyID0ge30KZm9yIGssdiBpbiBhLml0ZW1zKCk6CiAgICB0cmllID0gVHJpZSgpCiAgICBmb3IgdyBpbiB2OgogICAgICAgIHRyaWUuYWRkKHcpCiAgICBhMltrXSA9IHJlLmNvbXBpbGUoZnIiKD88IVx3KXt0cmllLnBhdHRlcm4oKX0oPyFcdykiLCByZS5JKQoKZm9yIGssciBpbiBhMi5pdGVtcygpOgoJdGV4dCA9IHIuc3ViKGssIHRleHQpCgkKcHJpbnQodGV4dCkK