import re
# Spelled-out numbers 0-19; each word's index in the list is its value.
number_words = [
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
]
# Spelled-out multiples of ten, twenty through ninety.
number_words_tens = [
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
]

# Matches a single number word delimited by word boundaries: either an
# optional tens word immediately followed by a 0-19 word, or a tens word on
# its own. The pattern must not contain literal spaces, otherwise the match
# would require (and swallow) the blanks around each word.
number_words_rx = re.compile(
    r'\b(?:(?:{0})?(?:{1})|(?:{0}))\b'.format(
        "|".join(number_words_tens), "|".join(number_words)
    )
)

# Matches a run of three or more digit groups separated only by whitespace,
# each separator optionally containing the word "and". Leading whitespace is
# part of the match.
main_rx = re.compile(r'\s*\d+(?:\s+(?:and\s+)?\d+){2,}')

# Lookup table from an integer 0-99 to its spelled-out form (index == value).
# Built from a copy so that extending the table does not also grow
# number_words, which would happen if the two names shared one list.
numbers_1_99 = list(number_words)
numbers_1_99.extend(
    tens if ones == "zero" else tens + "-" + ones  # stackoverflow.com/a/8982279/3832970
    for tens in number_words_tens
    for ones in number_words[:10]
)
def text2int(textnum, numwords=None):  # stackoverflow.com/a/493788/3832970
    """Convert spelled-out number words into the sum of their values.

    ``textnum`` is split on whitespace and the value of every word is added
    up, so ``"twenty one"`` gives 21 and ``"forty and two"`` gives 42 ("and"
    counts as zero). A token that is not a known word but consists of known
    words joined by hyphens (``"twenty-one"``) is summed part by part.

    Args:
        textnum: Text made up of number words ("zero" through "nineteen",
            "twenty" through "ninety", and "and").
        numwords: Optional dict mapping a word to a ``(scale, increment)``
            tuple. It is filled in place with the standard words (overwriting
            entries with the same key) and then used for the lookup, so extra
            entries supplied by the caller are honoured. Only the increment
            is used by this function. When omitted, a fresh dict is created
            on each call.

    Returns:
        The integer sum of the increments of all words; 0 for empty text.

    Raises:
        ValueError: If a token is neither a known word nor a hyphenated
            combination of known words.
    """
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    # A None default avoids one dict being shared by every call.
    if numwords is None:
        numwords = {}

    numwords["and"] = (1, 0)
    for idx, word in enumerate(units):
        numwords[word] = (1, idx)
    # "twenty" is 2 * 10, so counting starts at 2; no empty-string
    # placeholder keys are inserted for the missing 0 and 10 slots.
    for idx, word in enumerate(tens, start=2):
        numwords[word] = (1, idx * 10)

    total = 0
    for token in textnum.split():
        # Prefer an exact entry; only fall back to hyphen splitting when the
        # token itself is unknown.
        parts = [token] if token in numwords else token.split("-")
        for part in parts:
            if part not in numwords:
                raise ValueError("Illegal word: " + token)
            total += numwords[part][1]
    return total
# Demo input: free text containing a spoken run of number words.
sample1 = "hello my name is sofie my social security number is thirteen zero four five and seventy eighteen seven and forty and I live on mountain street number twelve"

# Step 1: replace every match of number_words_rx with the decimal string of
# the value text2int returns for it.
sample1 = number_words_rx.sub(lambda x: str(text2int(x.group())), sample1)

# Step 2: delete every match of main_rx, i.e. places where 3 or more numbers
# occur separated only by whitespace or "and".
re_results = main_rx.sub('', sample1)

# Step 3: spell the remaining one- or two-digit groups back out by indexing
# numbers_1_99 with their value. The pattern must be \d{1,2} with no blank
# before the quantifier; with a blank it would match a digit followed by
# spaces instead of a one- or two-digit number.
print(re.sub(r'\d{1,2}', lambda x: numbers_1_99[int(x.group())], re_results))
aW1wb3J0IHJlCgpudW1iZXJfd29yZHMgPSBbICJ6ZXJvIiwgIm9uZSIsICJ0d28iLCAidGhyZWUiLCAiZm91ciIsICJmaXZlIiwgInNpeCIsICJzZXZlbiIsICJlaWdodCIsICJuaW5lIiwgInRlbiIsICJlbGV2ZW4iLCAidHdlbHZlIiwgInRoaXJ0ZWVuIiwgImZvdXJ0ZWVuIiwgImZpZnRlZW4iLCAic2l4dGVlbiIsICJzZXZlbnRlZW4iLCAiZWlnaHRlZW4iLCAibmluZXRlZW4iXQpudW1iZXJfd29yZHNfdGVucyA9WyAidHdlbnR5IiwgInRoaXJ0eSIsICJmb3J0eSIsICJmaWZ0eSIsICJzaXh0eSIsICJzZXZlbnR5IiwgImVpZ2h0eSIsICJuaW5ldHkiIF0KbnVtYmVyX3dvcmRzX3J4ID0gcmUuY29tcGlsZShyJ1xiKD86KD86ezB9KT8oPzp7MX0pfCg/OnswfSkpXGInLmZvcm1hdCgifCIuam9pbihudW1iZXJfd29yZHNfdGVucyksInwiLmpvaW4obnVtYmVyX3dvcmRzKSkpCm1haW5fcnggPSByZS5jb21waWxlKHInXHMqXGQrKD86XHMrKD86YW5kXHMrKT9cZCspezIsfScpCm51bWJlcnNfMV85OSA9IG51bWJlcl93b3JkcwpudW1iZXJzXzFfOTkuZXh0ZW5kKHRlbnMgaWYgb25lcyA9PSAiemVybyIgZWxzZSAodGVucyArICItIiArIG9uZXMpICMgc3RhY2tvdmVyZmxvdy5jb20vYS84OTgyMjc5LzM4MzI5NzAKICAgIGZvciB0ZW5zIGluICJ0d2VudHkgdGhpcnR5IGZvcnR5IGZpZnR5IHNpeHR5IHNldmVudHkgZWlnaHR5IG5pbmV0eSIuc3BsaXQoKQogICAgZm9yIG9uZXMgaW4gbnVtYmVyc18xXzk5WzA6MTBdKQogICAgCmRlZiB0ZXh0MmludCh0ZXh0bnVtLCBudW13b3Jkcz17fSk6ICMgc3RhY2tvdmVyZmxvdy5jb20vYS80OTM3ODgvMzgzMjk3MAogICAgdW5pdHMgPSBbCiAgICAgICAgInplcm8iLCAib25lIiwgInR3byIsICJ0aHJlZSIsICJmb3VyIiwgImZpdmUiLCAic2l4IiwgInNldmVuIiwgImVpZ2h0IiwKICAgICAgICAibmluZSIsICJ0ZW4iLCAiZWxldmVuIiwgInR3ZWx2ZSIsICJ0aGlydGVlbiIsICJmb3VydGVlbiIsICJmaWZ0ZWVuIiwKICAgICAgICAic2l4dGVlbiIsICJzZXZlbnRlZW4iLCAiZWlnaHRlZW4iLCAibmluZXRlZW4iLAogICAgXQogICAgdGVucyA9IFsiIiwgIiIsICJ0d2VudHkiLCAidGhpcnR5IiwgImZvcnR5IiwgImZpZnR5IiwgInNpeHR5IiwgInNldmVudHkiLCAiZWlnaHR5IiwgIm5pbmV0eSJdCiAgICBudW13b3Jkc1siYW5kIl0gPSAoMSwgMCkKICAgIGZvciBpZHgsIHdvcmQgaW4gZW51bWVyYXRlKHVuaXRzKToKICAgICAgICBudW13b3Jkc1t3b3JkXSA9ICgxLCBpZHgpCiAgICBmb3IgaWR4LCB3b3JkIGluIGVudW1lcmF0ZSh0ZW5zKToKICAgICAgCW51bXdvcmRzW3dvcmRdID0gKDEsIGlkeCAqIDEwKQogICAgY3VycmVudCA9IHJlc3VsdCA9IDAKICAgIGZvciB3b3JkIGluIHRleHRudW0uc3BsaXQoKToKICAgICAgICBpZiB3b3JkIG5vdCBpbiBudW13b3JkczoKICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiSWxsZWdhbCB3b3JkOiAiICsgd29yZCkKCiAgICAgICAgc2NhbGUsIGluY3Jl
bWVudCA9IG51bXdvcmRzW3dvcmRdCiAgICAgICAgY3VycmVudCA9IGN1cnJlbnQgKyBpbmNyZW1lbnQKCiAgICByZXR1cm4gcmVzdWx0ICsgY3VycmVudApzYW1wbGUxID0gImhlbGxvIG15IG5hbWUgaXMgc29maWUgbXkgc29jaWFsIHNlY3VyaXR5IG51bWJlciBpcyB0aGlydGVlbiB6ZXJvIGZvdXIgZml2ZSBhbmQgc2V2ZW50eSBlaWdodGVlbiBzZXZlbiBhbmQgZm9ydHkgYW5kIEkgbGl2ZSBvbiBtb3VudGFpbiBzdHJlZXQgbnVtYmVyIHR3ZWx2ZSIKc2FtcGxlMSA9IG51bWJlcl93b3Jkc19yeC5zdWIobGFtYmRhIHg6IHN0cih0ZXh0MmludCh4Lmdyb3VwKCkpKSwgc2FtcGxlMSkKIzMgb3IgbW9yZSBudW1iZXJzIG9jY3VyIHdpdGggb25seSB3aGl0ZXNwYWNlIG9yICJhbmQiCnJlX3Jlc3VsdHMgPSBtYWluX3J4LnN1YignJywgc2FtcGxlMSkKcHJpbnQoIHJlLnN1YihyJ1xkezEsMn0nLCBsYW1iZGEgeDogbnVtYmVyc18xXzk5W2ludCh4Lmdyb3VwKCkpXSwgcmVfcmVzdWx0cykgKQ==