fork download
  1. import re,json
  2.  
  3. def findTokenOffset(text, pattern):
  4. items = []
  5. for m in pattern.finditer(text):
  6. item = {}
  7. #item['index']= NO!! #// word index starts from 1
  8. item['word']=m.group()
  9. item['characterOffsetBegin'] = m.start()
  10. item['characterOffsetEnd'] = m.end()
  11. items.append(item)
  12. return items
  13.  
  14. text = "George Washington came to Washington Washington.com"
  15. tokens = ["George Washington", "Washington"]
  16. pattern = re.compile(fr'(?<!\w)(?:{"|".join(sorted(map(re.escape, tokens), key=len, reverse=True))})(?!\w)(?!\.\b)', re.I )
  17. offsets = findTokenOffset(text,pattern)
  18. print(json.dumps(offsets, indent=2))
Success #stdin #stdout 0.03s 9672KB
stdin
Standard input is empty
stdout
[
  {
    "word": "George Washington",
    "characterOffsetBegin": 0,
    "characterOffsetEnd": 17
  },
  {
    "word": "Washington",
    "characterOffsetBegin": 26,
    "characterOffsetEnd": 36
  }
]