fork download
  1. import re, string
  2.  
  3. tokenized_doc1 = [['The', 'intelligent', 'directory', 'enquiry', 'assistant', '(', 'YPA', ')', 'project', 'is', 'an', 'example', '(', 'going', 'back', 'quite', 'a', 'few', 'years', 'now', '...', ')', 'where', 'the', 'extraction', 'of', 'information', 'from', 'partially', 'structured', 'data', 'together', 'with', 'engineering', 'issues', 'played', 'major', 'roles', 'in', 'making', 'the', 'YPA', 'a', 'usable', 'online', 'system', '.'], ['I', 'am', 'developing', 'techniques', 'that', 'allow', 'the', 'extraction', 'of', 'conceptual', 'information', 'from', 'document', 'collections', 'and', 'the', 'utilization', 'of', 'such', 'knowledge', 'in', 'retrieval', 'tasks', '.'], ['The', 'type', 'of', 'documents', 'can', 'range', 'from', 'Web', 'pages', 'to', 'newspaper', 'articles', 'or', 'other', 'forms', 'of', 'vaguely/partially', 'structured', 'data', '.']]
  4. punct_rx = re.compile('[%s]' % re.escape(string.punctuation.replace(".", "").replace("-", "")))
  5. token_without_punctuation = []
  6.  
  7. for x in tokenized_doc1:
  8. y = []
  9. for token in x:
  10. tokens = punct_rx.sub("", token)
  11. y.append(tokens)
  12. token_without_punctuation.append(y)
  13.  
  14. print(token_without_punctuation)
Success #stdin #stdout 0.02s 27784KB
stdin
Standard input is empty
stdout
[['The', 'intelligent', 'directory', 'enquiry', 'assistant', '', 'YPA', '', 'project', 'is', 'an', 'example', '', 'going', 'back', 'quite', 'a', 'few', 'years', 'now', '...', '', 'where', 'the', 'extraction', 'of', 'information', 'from', 'partially', 'structured', 'data', 'together', 'with', 'engineering', 'issues', 'played', 'major', 'roles', 'in', 'making', 'the', 'YPA', 'a', 'usable', 'online', 'system', '.'], ['I', 'am', 'developing', 'techniques', 'that', 'allow', 'the', 'extraction', 'of', 'conceptual', 'information', 'from', 'document', 'collections', 'and', 'the', 'utilization', 'of', 'such', 'knowledge', 'in', 'retrieval', 'tasks', '.'], ['The', 'type', 'of', 'documents', 'can', 'range', 'from', 'Web', 'pages', 'to', 'newspaper', 'articles', 'or', 'other', 'forms', 'of', 'vaguelypartially', 'structured', 'data', '.']]