fork download
  1. import re
  2. import json
  3.  
  4. # a input document of sentences
  5. document="These are oranges and apples and and pears, but not pinapples\nThese are oranges and apples and pears, but not pinapples"
  6.  
  7.  
  8. # uncomment to test UNICODE
  9. document="तुम मुझे दोस्त कहते कहते हो"
  10.  
  11. sentences=[] # sentences
  12. seen = {} # map if a token has been see already!
  13.  
  14. # split into sentences
  15. lines=document.splitlines()
  16.  
  17. for index,line in enumerate(lines):
  18.  
  19. print("Line:%d %s" % (index,line))
  20.  
  21. # split token that are words
  22. # LP: (for Simon ;P we do not care of punct at all!
  23. rgx = re.compile("([\w][\w']*\w)")
  24. tokens=rgx.findall(line)
  25.  
  26. # uncomment to test UNICODE
  27. tokens=["तुम","मुझे","दोस्त","कहते","कहते","हो"]
  28.  
  29. print("Tokens:",tokens)
  30.  
  31. sentence={} # a sentence
  32. items=[] # word tokens
  33.  
  34. # for each token word
  35. for index_word,word in enumerate(tokens):
  36.  
  37. # uncomment to test UNICODE
  38. my_regex = r"(?<!\S){}(?!\S)".format(re.escape(word))
  39. #my_regex = r"\b(?=\w)" + re.escape(word) + r"\b(?!\w)"
  40. r = re.compile(my_regex, flags=re.I | re.X | re.UNICODE)
  41.  
  42. item = {}
  43. # for each matched token in sentence
  44. for m in r.finditer(document):
  45.  
  46. token=m.group()
  47. characterOffsetBegin=m.start()
  48. characterOffsetEnd=characterOffsetBegin+len(m.group()) - 1 # LP: star from 0
  49.  
  50. print ("word:%s characterOffsetBegin:%d characterOffsetEnd:%d" % (token, characterOffsetBegin, characterOffsetEnd) )
  51.  
  52. found=-1
  53. if word in seen:
  54. found=seen[word]
  55.  
  56. if characterOffsetBegin > found:
  57. # store last word has been seen
  58. seen[word] = characterOffsetBegin
  59. item['index']=index_word+1 #// word index starts from 1
  60. item['word']=token
  61. item['characterOffsetBegin'] = characterOffsetBegin;
  62. item['characterOffsetEnd'] = characterOffsetEnd;
  63. items.append(item)
  64. break
  65.  
  66. sentence['text']=line
  67. sentence['tokens']=items
  68. sentences.append(sentence)
  69.  
  70. print(json.dumps(sentences, indent=4, sort_keys=True))
  71.  
  72. print("------ testing ------")
  73. text=''
  74. for sentence in sentences:
  75. for token in sentence['tokens']:
  76. # LP: we get the token from a slice in original text
  77. text = text + document[token['characterOffsetBegin']:token['characterOffsetEnd']+1] + " "
  78. text = text + '\n'
  79. print(text)
Success #stdin #stdout 0.02s 30192KB
stdin
Standard input is empty
stdout
Line:0 तुम मुझे दोस्त कहते कहते हो
Tokens: ['तुम', 'मुझे', 'दोस्त', 'कहते', 'कहते', 'हो']
word:तुम characterOffsetBegin:0 characterOffsetEnd:2
word:मुझे characterOffsetBegin:4 characterOffsetEnd:7
word:दोस्त characterOffsetBegin:9 characterOffsetEnd:13
word:कहते characterOffsetBegin:15 characterOffsetEnd:18
word:कहते characterOffsetBegin:15 characterOffsetEnd:18
word:कहते characterOffsetBegin:20 characterOffsetEnd:23
word:हो characterOffsetBegin:25 characterOffsetEnd:26
[
    {
        "text": "\u0924\u0941\u092e \u092e\u0941\u091d\u0947 \u0926\u094b\u0938\u094d\u0924 \u0915\u0939\u0924\u0947 \u0915\u0939\u0924\u0947 \u0939\u094b",
        "tokens": [
            {
                "characterOffsetBegin": 0,
                "characterOffsetEnd": 2,
                "index": 1,
                "word": "\u0924\u0941\u092e"
            },
            {
                "characterOffsetBegin": 4,
                "characterOffsetEnd": 7,
                "index": 2,
                "word": "\u092e\u0941\u091d\u0947"
            },
            {
                "characterOffsetBegin": 9,
                "characterOffsetEnd": 13,
                "index": 3,
                "word": "\u0926\u094b\u0938\u094d\u0924"
            },
            {
                "characterOffsetBegin": 15,
                "characterOffsetEnd": 18,
                "index": 4,
                "word": "\u0915\u0939\u0924\u0947"
            },
            {
                "characterOffsetBegin": 20,
                "characterOffsetEnd": 23,
                "index": 5,
                "word": "\u0915\u0939\u0924\u0947"
            },
            {
                "characterOffsetBegin": 25,
                "characterOffsetEnd": 26,
                "index": 6,
                "word": "\u0939\u094b"
            }
        ]
    }
]
------ testing ------
तुम मुझे दोस्त कहते कहते हो