import re,json
def findTokenOffset(text, pattern):
    """Return a record for every match of *pattern* in *text*.

    Parameters
    ----------
    text : str
        The string to scan.
    pattern : re.Pattern
        A compiled regular expression whose matches mark the tokens.

    Returns
    -------
    list[dict]
        One dict per match, in left-to-right order, with keys:
        'word' (the matched substring), 'characterOffsetBegin'
        (start offset) and 'characterOffsetEnd' (end offset,
        exclusive, as returned by re.Match.start()/end()).
    """
    # Build each record as a dict literal inside a comprehension rather
    # than appending key-by-key in a manual loop.
    return [
        {
            'word': m.group(),
            'characterOffsetBegin': m.start(),
            'characterOffsetEnd': m.end(),
        }
        for m in pattern.finditer(text)
    ]
text = "George Washington came to Washington Washington.com"
tokens = ["George Washington", "Washington"]

# Longest tokens first, so multi-word tokens beat their own substrings
# (e.g. "George Washington" wins over the bare "Washington").
alternation = "|".join(sorted(map(re.escape, tokens), key=len, reverse=True))
# (?<!\w) / (?!\w): the match may not sit inside a larger word.
# (?!\.\b): reject a token directly followed by ".<word>" (e.g. "Washington.com").
pattern = re.compile(rf'(?<!\w)(?:{alternation})(?!\w)(?!\.\b)', re.I)

offsets = findTokenOffset(text, pattern)
print(json.dumps(offsets, indent=2))
# NOTE(review): the base64 blob below is an accidentally pasted encoded duplicate
# of this very script; commented out so the module remains importable.
# aW1wb3J0IHJlLGpzb24KCmRlZiBmaW5kVG9rZW5PZmZzZXQodGV4dCwgcGF0dGVybik6CiAgICBpdGVtcyA9IFtdCiAgICBmb3IgbSBpbiBwYXR0ZXJuLmZpbmRpdGVyKHRleHQpOgogICAgICAgIGl0ZW0gPSB7fQogICAgICAgICNpdGVtWydpbmRleCddPSBOTyEhICMvLyB3b3JkIGluZGV4IHN0YXJ0cyBmcm9tIDEKICAgICAgICBpdGVtWyd3b3JkJ109bS5ncm91cCgpCiAgICAgICAgaXRlbVsnY2hhcmFjdGVyT2Zmc2V0QmVnaW4nXSA9IG0uc3RhcnQoKQogICAgICAgIGl0ZW1bJ2NoYXJhY3Rlck9mZnNldEVuZCddID0gbS5lbmQoKQogICAgICAgIGl0ZW1zLmFwcGVuZChpdGVtKQogICAgcmV0dXJuIGl0ZW1zCgp0ZXh0ID0gIkdlb3JnZSBXYXNoaW5ndG9uIGNhbWUgdG8gV2FzaGluZ3RvbiBXYXNoaW5ndG9uLmNvbSIKdG9rZW5zID0gWyJHZW9yZ2UgV2FzaGluZ3RvbiIsICJXYXNoaW5ndG9uIl0KcGF0dGVybiA9IHJlLmNvbXBpbGUoZnInKD88IVx3KSg/OnsifCIuam9pbihzb3J0ZWQobWFwKHJlLmVzY2FwZSwgdG9rZW5zKSwga2V5PWxlbiwgcmV2ZXJzZT1UcnVlKSl9KSg/IVx3KSg/IVwuXGIpJywgcmUuSSApCm9mZnNldHMgPSBmaW5kVG9rZW5PZmZzZXQodGV4dCxwYXR0ZXJuKQpwcmludChqc29uLmR1bXBzKG9mZnNldHMsIGluZGVudD0yKSkg