fork(1) download
  1. from nltk.tokenize import RegexpTokenizer
  2.  
  3. def regex_tokenize(text="The cost of a youth pass for Caltrain costs $4.50"):
  4.  
  5. pattern = r'''(?x) # set flag to allow verbose regexps
  6. ([A-Z]\.)+ # abbreviations, e.g. U.S.A.
  7. | \$?\d+(\.\d+)?%? # numbers, incl. currency and percentages
  8. | \w+([-']\w+)* # words w/ optional internal hyphens/apostrophe
  9. | @((\w)+([-']\w+))*
  10. | [+/\-@&*] # special characters with meanings
  11. '''
  12.  
  13. #pattern = r'[+/\-@&*#](\w+)|(\w+)'
  14.  
  15. tokenizer = RegexpTokenizer(pattern)
  16. token_list = tokenizer.tokenize(text)
  17.  
  18. #print token_list
  19.  
  20. return token_list
  21.  
Runtime error #stdin #stdout #stderr 0.01s 7732KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 1, in <module>
ImportError: No module named nltk.tokenize