fork download
  1. import re
  2.  
  class Trie:
      """Regex::Trie in Python: a character trie that exports a regex.

      Words are stored one character per level in nested dicts. The trie can
      be exported to a regex pattern that matches exactly the stored words,
      with shared prefixes factored out; the intent is that this matches
      faster than a plain ``word1|word2|...`` union.
      """

      def __init__(self) -> None:
          # Nested dicts keyed by single characters. The empty-string key
          # (mapped to 1) marks that a stored word ends at that node.
          self.data = {}

      def add(self, word: str) -> None:
          """Insert *word* into the trie."""
          node = self.data
          for char in word:
              # Reuse the existing child node, or create it on first sight.
              node = node.setdefault(char, {})
          node[''] = 1

      def dump(self) -> dict:
          """Return the underlying nested-dict structure (not a copy)."""
          return self.data

      def quote(self, char: str) -> str:
          """Return *char* escaped for literal use inside a regex."""
          return re.escape(char)

      def _pattern(self, pData):
          """Build the regex fragment for the sub-trie *pData*.

          Returns ``None`` when the node holds nothing but the end-of-word
          marker, i.e. there is nothing left to match below it.
          """
          data = pData
          if "" in data and len(data) == 1:
              return None

          alt = []          # alternatives that continue past this character
          cc = []           # characters that end a word with no continuation
          optional = False  # True when a word also ends at this very node
          for char in sorted(data):
              child = data[char]
              if not isinstance(child, dict):
                  # The end-of-word marker: everything below is optional.
                  optional = True
                  continue
              sub = self._pattern(child)
              if sub is None:
                  cc.append(self.quote(char))
              else:
                  alt.append(self.quote(char) + sub)

          # Must be evaluated before the character class joins `alt`: a lone
          # character or class can take a bare '?' without a wrapping group.
          cconly = not alt

          if cc:
              alt.append(cc[0] if len(cc) == 1 else '[' + ''.join(cc) + ']')

          if len(alt) == 1:
              result = alt[0]
          else:
              result = "(?:" + "|".join(alt) + ")"

          if optional:
              result = result + "?" if cconly else f"(?:{result})?"
          return result

      def pattern(self):
          """Return the regex pattern string matching the stored words.

          An empty trie yields ``'(?:)'``; a trie holding only the empty
          string yields ``None``.
          """
          return self._pattern(self.dump())
  62.  
  63.  
  # Demo: find every keyword that occurs in `text` as a standalone token.
  text = r'FIND ANY MATCHING WORD BY THIS VERY LONG REGEX PATTERN FIGHT FIGHTER PARROT PARROT_ING'
  keywords = [
      'FIND', 'ANY', 'MATCHING', 'WORD', 'BY', 'THIS', 'VERY', 'LONG',
      'REGEX', 'PATTERN', 'PARROT', 'FIGHT',
  ]

  trie = Trie()
  for word in keywords:
      trie.add(word)

  # The lookarounds reject a match that touches a letter or digit on either
  # side; an adjacent underscore is allowed, so PARROT matches in PARROT_ING
  # while FIGHT does not match inside FIGHTER.
  pattern = fr'(?<![^\W_])({trie.pattern()})(?![^\W_])'
  matcher = re.compile(pattern)
  print(matcher.findall(text))
Success #stdin #stdout 0.03s 9412KB
stdin
Standard input is empty
stdout
['FIND', 'ANY', 'MATCHING', 'WORD', 'BY', 'THIS', 'VERY', 'LONG', 'REGEX', 'PATTERN', 'FIGHT', 'PARROT', 'PARROT']