from nltk.tokenize import RegexpTokenizer
def regex_tokenize(text="The cost of a youth pass for Caltrain costs $4.50"):
    """Split *text* into tokens with an NLTK ``RegexpTokenizer``.

    The pattern's alternatives are tried in order at each position:
    abbreviations (``U.S.A.``), numbers with an optional ``$`` prefix,
    decimal part and ``%`` suffix (``$4.50``), words with internal hyphens
    or apostrophes (``don't``), ``@`` handles (``@user``, ``@first-last``),
    and finally a handful of stand-alone special characters.

    Args:
        text: The string to tokenize. Defaults to a sample sentence.

    Returns:
        The list of matched token strings, in order of appearance. Text
        matched by no alternative (e.g. whitespace) is not returned.
    """
    # Every group is non-capturing, ``(?:...)``: RegexpTokenizer collects
    # tokens with ``re.findall``, which returns the *group contents* instead
    # of the whole match as soon as the pattern has a capturing group.
    pattern = r'''(?x)               # set flag to allow verbose regexps
          (?:[A-Z]\.)+               # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?         # numbers, incl. currency and percentages
        | \w+(?:[-']\w+)*            # words w/ optional internal hyphens/apostrophe
        | @\w+(?:[-']\w+)*           # @handles, e.g. @user or @first-last
        | [+/\-@&*]                  # special characters with meanings
    '''
    return RegexpTokenizer(pattern).tokenize(text)
ZnJvbSBubHRrLnRva2VuaXplIGltcG9ydCBSZWdleHBUb2tlbml6ZXIKCmRlZiByZWdleF90b2tlbml6ZSh0ZXh0PSJUaGUgY29zdCBvZiBhIHlvdXRoIHBhc3MgZm9yIENhbHRyYWluIGNvc3RzICQ0LjUwIik6CgogICAgcGF0dGVybiA9IHInJycoP3gpICAgICAgICAgICAgICAgIyBzZXQgZmxhZyB0byBhbGxvdyB2ZXJib3NlIHJlZ2V4cHMKICAgICAgICAgICAgICAgICAgKFtBLVpdXC4pKyAgICAgICAgICMgYWJicmV2aWF0aW9ucywgZS5nLiBVLlMuQS4KICAgICAgICAgICAgICAgICAgfCBcJD9cZCsoXC5cZCspPyU/ICMgbnVtYmVycywgaW5jbC4gY3VycmVuY3kgYW5kIHBlcmNlbnRhZ2VzCiAgICAgICAgICAgICAgICAgIHwgXHcrKFstJ11cdyspKiAgICAjIHdvcmRzIHcvIG9wdGlvbmFsIGludGVybmFsIGh5cGhlbnMvYXBvc3Ryb3BoZQogICAgICAgICAgICAgICAgICB8IEAoKFx3KSsoWy0nXVx3KykpKgogICAgICAgICAgICAgICAgICB8IFsrL1wtQCYqXSAgICAgICAgIyBzcGVjaWFsIGNoYXJhY3RlcnMgd2l0aCBtZWFuaW5ncwogICAgICAgICAgICAgICAgJycnCgogICAgI3BhdHRlcm4gPSByJ1srL1wtQCYqI10oXHcrKXwoXHcrKScKCiAgICB0b2tlbml6ZXIgPSBSZWdleHBUb2tlbml6ZXIocGF0dGVybikKICAgIHRva2VuX2xpc3QgPSB0b2tlbml6ZXIudG9rZW5pemUodGV4dCkKCiAgICAjcHJpbnQgdG9rZW5fbGlzdAoKICAgIHJldHVybiB0b2tlbl9saXN0Cg==