import re
import fileinput

# Output lists: one entry per sentence.
words = []       # tuples of tokens
positions = []   # lists of (start, end, token) triples
# Temporary per-sentence accumulators.
w_buffer = []
p_buffer = []

# Matches either a "(start, end, token)" triple with arbitrary spaces/tabs
# around each field, or a completely blank line (the whole group is optional).
# Blank lines act as sentence separators. (Python 2 source: note the ur'' prefix.)
pattern = re.compile(ur'^(?:[ \t]*[(][ \t]*(\d+)[ \t]*,[ \t]*(\d+)[ \t]*,[ \t]*(\S+)[ \t]*[)][ \t]*)?$', re.UNICODE)

for line in fileinput.input():
    for (start, end, token) in pattern.findall(line):
        if start:
            # Token line: accumulate into the current sentence.
            w_buffer.append(token)
            p_buffer.append((int(start), int(end), token))
        elif w_buffer:
            # Blank line: close the current sentence. The w_buffer guard
            # skips consecutive blank lines instead of emitting empty sentences.
            words.append(tuple(w_buffer))
            w_buffer = []
            positions.append(p_buffer)
            p_buffer = []

# Flush the last sentence if the input did not end with a blank line.
if w_buffer:
    words.append(tuple(w_buffer))
    positions.append(p_buffer)

# An optional prettified output
import pprint as pp
pp.pprint(words)
pp.pprint(positions)
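
A quick sanity check of the pattern (a hypothetical interpreter session, not part of the captured run below): a token line yields its three fields as strings, while a blank separator line matches with empty groups, which is what drives the sentence flush.

>>> pattern.findall('( 0,    12,   Tokenization  )')
[('0', '12', 'Tokenization')]
>>> pattern.findall('')
[('', '', '')]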
Success #stdin #stdout 0.03s 9024KB
stdin
( 0,    12,   Tokenization  )   
(  13  , 15  	, is )   
  (  16, 22, widely)
( 23, 31, regarded )
(32,  	 34,   as  )
(35, 36, a)  
(37, 43, solved )  
(44, 51, problem)
(52, 55, due)
(56, 58, to)
(59, 62, the)
(63, 67, high)
(68, 76, accuracy)
(77, 81, that)
(82, 91, rulebased)
(92, 102, tokenizers)
(103, 110, achieve)
(110, 111, .)

(0, 3, But)
(4, 14, rule-based)
(15, 25, tokenizers)
(26, 29, are)
(30, 34, hard)
(35, 37, to)
(38, 46, maintain)
(47, 50, and)
(51, 56, their)
(57, 62, rules)
(63, 71, language)
(72, 80, specific)
(80, 81, .)

(0, 2, We)
(3, 7, show)
(8, 12, that)
(13, 17, high)
(18, 26, accuracy)
(27, 31, word)
(32, 35, and)
(36, 44, sentence)
(45, 57, segmentation)
(58, 61, can)
(62, 64, be)
(65, 73, achieved)
(74, 76, by)
(77, 82, using)
(83, 93, supervised)
(94, 102, sequence)
(103, 111, labeling)
(112, 114, on)
(115, 118, the)
(119, 128, character)
(129, 134, level)
(135, 143, combined)
(144, 148, with)
(149, 161, unsupervised)
(162, 169, feature)
(170, 178, learning)
(178, 179, .)

(0, 2, We)
(3, 12, evaluated)
(13, 16, our)
(17, 23, method)
(24, 26, on)
(27, 32, three)
(33, 42, languages)
(43, 46, and)
(47, 55, obtained)
(56, 61, error)
(62, 67, rates)
(68, 70, of)
(71, 75, 0.27)
(76, 77, ‰)
(78, 79, ()
(79, 86, English)
(86, 87, ))
(87, 88, ,)
(89, 93, 0.35)
(94, 95, ‰)
(96, 97, (     )
(97, 102, Dutch)
(102, 103, )     )
(104, 107, and)
(108, 112, 0.76)
(113, 114, ‰  )
(115, 116,   (    )
(116, 123 ,  Italian)
(123, 124, )   )
(125, 128, for)
(129, 132, our)
(133, 137, best)
(138, 144, models)
(144, 145, .)
stdout
[('Tokenization',
  'is',
  'widely',
  'regarded',
  'as',
  'a',
  'solved',
  'problem',
  'due',
  'to',
  'the',
  'high',
  'accuracy',
  'that',
  'rulebased',
  'tokenizers',
  'achieve',
  '.'),
 ('But',
  'rule-based',
  'tokenizers',
  'are',
  'hard',
  'to',
  'maintain',
  'and',
  'their',
  'rules',
  'language',
  'specific',
  '.'),
 ('We',
  'show',
  'that',
  'high',
  'accuracy',
  'word',
  'and',
  'sentence',
  'segmentation',
  'can',
  'be',
  'achieved',
  'by',
  'using',
  'supervised',
  'sequence',
  'labeling',
  'on',
  'the',
  'character',
  'level',
  'combined',
  'with',
  'unsupervised',
  'feature',
  'learning',
  '.'),
 ('We',
  'evaluated',
  'our',
  'method',
  'on',
  'three',
  'languages',
  'and',
  'obtained',
  'error',
  'rates',
  'of',
  '0.27',
  '\xe2\x80\xb0',
  '(',
  'English',
  ')',
  ',',
  '0.35',
  '\xe2\x80\xb0',
  '(',
  'Dutch',
  ')',
  'and',
  '0.76',
  '\xe2\x80\xb0',
  '(',
  'Italian',
  ')',
  'for',
  'our',
  'best',
  'models',
  '.')]
[[(0, 12, 'Tokenization'),
  (13, 15, 'is'),
  (16, 22, 'widely'),
  (23, 31, 'regarded'),
  (32, 34, 'as'),
  (35, 36, 'a'),
  (37, 43, 'solved'),
  (44, 51, 'problem'),
  (52, 55, 'due'),
  (56, 58, 'to'),
  (59, 62, 'the'),
  (63, 67, 'high'),
  (68, 76, 'accuracy'),
  (77, 81, 'that'),
  (82, 91, 'rulebased'),
  (92, 102, 'tokenizers'),
  (103, 110, 'achieve'),
  (110, 111, '.')],
 [(0, 3, 'But'),
  (4, 14, 'rule-based'),
  (15, 25, 'tokenizers'),
  (26, 29, 'are'),
  (30, 34, 'hard'),
  (35, 37, 'to'),
  (38, 46, 'maintain'),
  (47, 50, 'and'),
  (51, 56, 'their'),
  (57, 62, 'rules'),
  (63, 71, 'language'),
  (72, 80, 'specific'),
  (80, 81, '.')],
 [(0, 2, 'We'),
  (3, 7, 'show'),
  (8, 12, 'that'),
  (13, 17, 'high'),
  (18, 26, 'accuracy'),
  (27, 31, 'word'),
  (32, 35, 'and'),
  (36, 44, 'sentence'),
  (45, 57, 'segmentation'),
  (58, 61, 'can'),
  (62, 64, 'be'),
  (65, 73, 'achieved'),
  (74, 76, 'by'),
  (77, 82, 'using'),
  (83, 93, 'supervised'),
  (94, 102, 'sequence'),
  (103, 111, 'labeling'),
  (112, 114, 'on'),
  (115, 118, 'the'),
  (119, 128, 'character'),
  (129, 134, 'level'),
  (135, 143, 'combined'),
  (144, 148, 'with'),
  (149, 161, 'unsupervised'),
  (162, 169, 'feature'),
  (170, 178, 'learning'),
  (178, 179, '.')],
 [(0, 2, 'We'),
  (3, 12, 'evaluated'),
  (13, 16, 'our'),
  (17, 23, 'method'),
  (24, 26, 'on'),
  (27, 32, 'three'),
  (33, 42, 'languages'),
  (43, 46, 'and'),
  (47, 55, 'obtained'),
  (56, 61, 'error'),
  (62, 67, 'rates'),
  (68, 70, 'of'),
  (71, 75, '0.27'),
  (76, 77, '\xe2\x80\xb0'),
  (78, 79, '('),
  (79, 86, 'English'),
  (86, 87, ')'),
  (87, 88, ','),
  (89, 93, '0.35'),
  (94, 95, '\xe2\x80\xb0'),
  (96, 97, '('),
  (97, 102, 'Dutch'),
  (102, 103, ')'),
  (104, 107, 'and'),
  (108, 112, '0.76'),
  (113, 114, '\xe2\x80\xb0'),
  (115, 116, '('),
  (116, 123, 'Italian'),
  (123, 124, ')'),
  (125, 128, 'for'),
  (129, 132, 'our'),
  (133, 137, 'best'),
  (138, 144, 'models'),
  (144, 145, '.')]]