[('Tokenization',
'is',
'widely',
'regarded',
'as',
'a',
'solved',
'problem',
'due',
'to',
'the',
'high',
'accuracy',
'that',
'rulebased',
'tokenizers',
'achieve',
'.'),
('But',
'rule-based',
'tokenizers',
'are',
'hard',
'to',
'maintain',
'and',
'their',
'rules',
'language',
'specific',
'.'),
('We',
'show',
'that',
'high',
'accuracy',
'word',
'and',
'sentence',
'segmentation',
'can',
'be',
'achieved',
'by',
'using',
'supervised',
'sequence',
'labeling',
'on',
'the',
'character',
'level',
'combined',
'with',
'unsupervised',
'feature',
'learning',
'.'),
('We',
'evaluated',
'our',
'method',
'on',
'three',
'languages',
'and',
'obtained',
'error',
'rates',
'of',
'0.27',
'\xe2\x80\xb0',
'(',
'English',
')',
',',
'0.35',
'\xe2\x80\xb0',
'(',
'Dutch',
')',
'and',
'0.76',
'\xe2\x80\xb0',
'(',
'Italian',
')',
'for',
'our',
'best',
'models',
'.')]
[[(0, 12, 'Tokenization'),
(13, 15, 'is'),
(16, 22, 'widely'),
(23, 31, 'regarded'),
(32, 34, 'as'),
(35, 36, 'a'),
(37, 43, 'solved'),
(44, 51, 'problem'),
(52, 55, 'due'),
(56, 58, 'to'),
(59, 62, 'the'),
(63, 67, 'high'),
(68, 76, 'accuracy'),
(77, 81, 'that'),
(82, 91, 'rulebased'),
(92, 102, 'tokenizers'),
(103, 110, 'achieve'),
(110, 111, '.')],
[(0, 3, 'But'),
(4, 14, 'rule-based'),
(15, 25, 'tokenizers'),
(26, 29, 'are'),
(30, 34, 'hard'),
(35, 37, 'to'),
(38, 46, 'maintain'),
(47, 50, 'and'),
(51, 56, 'their'),
(57, 62, 'rules'),
(63, 71, 'language'),
(72, 80, 'specific'),
(80, 81, '.')],
[(0, 2, 'We'),
(3, 7, 'show'),
(8, 12, 'that'),
(13, 17, 'high'),
(18, 26, 'accuracy'),
(27, 31, 'word'),
(32, 35, 'and'),
(36, 44, 'sentence'),
(45, 57, 'segmentation'),
(58, 61, 'can'),
(62, 64, 'be'),
(65, 73, 'achieved'),
(74, 76, 'by'),
(77, 82, 'using'),
(83, 93, 'supervised'),
(94, 102, 'sequence'),
(103, 111, 'labeling'),
(112, 114, 'on'),
(115, 118, 'the'),
(119, 128, 'character'),
(129, 134, 'level'),
(135, 143, 'combined'),
(144, 148, 'with'),
(149, 161, 'unsupervised'),
(162, 169, 'feature'),
(170, 178, 'learning'),
(178, 179, '.')],
[(0, 2, 'We'),
(3, 12, 'evaluated'),
(13, 16, 'our'),
(17, 23, 'method'),
(24, 26, 'on'),
(27, 32, 'three'),
(33, 42, 'languages'),
(43, 46, 'and'),
(47, 55, 'obtained'),
(56, 61, 'error'),
(62, 67, 'rates'),
(68, 70, 'of'),
(71, 75, '0.27'),
(76, 77, '\xe2\x80\xb0'),
(78, 79, '('),
(79, 86, 'English'),
(86, 87, ')'),
(87, 88, ','),
(89, 93, '0.35'),
(94, 95, '\xe2\x80\xb0'),
(96, 97, '('),
(97, 102, 'Dutch'),
(102, 103, ')'),
(104, 107, 'and'),
(108, 112, '0.76'),
(113, 114, '\xe2\x80\xb0'),
(115, 116, '('),
(116, 123, 'Italian'),
(123, 124, ')'),
(125, 128, 'for'),
(129, 132, 'our'),
(133, 137, 'best'),
(138, 144, 'models'),
(144, 145, '.')]]