import re

emoticon_string = r"""
    (?P<EMOTICON>
        [<>]?
        [:;=8]                      # eyes
        [-o*']?                     # optional nose
        [][()dDpP/:{}@|\\]          # mouth
        |
        [][()dDpP/:}{@|\\]          # mouth
        [-o*']?                     # optional nose
        [:;=8]                      # eyes
        [<>]?
    )"""

regex_strings = (
    # URL:
    r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$@.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)""",
    # Twitter username:
    r"""(?P<USER>@\w+)""",
    # Hashtags:
    r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)""",
    # Cashtags:
    r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)""",
    # Remaining word types:
    r"""
    (?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?)    # Numbers, including fractions and decimals.
    |
    (?P<WORD>\w+)                               # Words without apostrophes or dashes.
    |
    (?P<ELLIPSIS>\.(?:\s*\.)+)                  # Ellipsis dots.
    |
    (?P<ELSE>\S)                                # Everything else that isn't whitespace.
    """
)

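# word_re ORs the emoticon pattern together with every entry of regex_strings.
# Alternatives are tried left to right, so the specific token types (EMOTICON,
# URL, USER, HASHTAG, CASHTAG, NUMBER) come before the catch-all WORD/ELSE
# patterns; otherwise the ELSE pattern would eat the leading "@", "#", or "$".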
word_re = re.compile(r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)),
                     re.VERBOSE | re.I | re.UNICODE)
# Standalone emoticon matcher, compiled from emoticon_string (not from
# regex_strings, which holds the other token types).
emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
######################################################################

class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        # Accept bytes as well as str input.
        if isinstance(s, bytes):
            s = s.decode('utf-8', errors='backslashreplace')
        else:
            s = str(s)
        # Tokenize:
        words = []
        for x in word_re.finditer(s):
            for key, val in x.groupdict().items():
                if val:
                    if not self.preserve_case:
                        val = val.lower()
                    if key in ['WORD', 'ELLIPSIS', 'ELSE']:
                        words.append(val)
                    elif key in ['HASHTAG', 'CASHTAG', 'USER', 'URL']:  # Add more here if needed
                        words.append("{{{}|{}}}".format(key, re.sub(r'^[#@$]+', '', val)))
                    else:
                        words.append("{{{}|{}}}".format(key, val))
        return words

if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://s...content-available-to-author-only...e.com'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
stdout
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://s...content-available-to-author-only...e.com}
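A minimal usage sketch (not part of the run above; the sample string is invented for illustration and assumes the definitions from the script are in scope):

tok = Tokenizer(preserve_case=True)
print(tok.tokenize("Loving $TSLA today :-D #winning"))
# expected: ['Loving', '{CASHTAG|TSLA}', 'today', '{EMOTICON|:-D}', '{HASHTAG|winning}']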