fork(4) download
  1. /*
  2.   English Sentence Splitter
  3.  
  4.   Input (stdin): English text.
  5.  
  6.   Output (stdout): Numbered list of individual sentences, each in a separate line.
  7.  
  8.   Algorithm:
  9.   (a) Read the input word-by-word.
  10.   (b) For each pair of adjacent words, estimate the probability that it's a sentence boundary:
  11.   (b-1) Estimate the probability that the previous word ends a sentence.
  12.   (b-2) Estimate the probability that the current word starts a sentence.
  13.   (b-3) These are two different estimates of the SAME probability, so to calculate
  14.   the boundary probability, we take their arithmetic mean.
  15.   (c) If that probability is over 0.5, put the current sentence in list and start a new one.
  16.   Note: estimates are based on author's intuition and not on real life data.
  17.  
  18.   Featurs and bugs:
  19.   (1) All multiple whitespace in sentences will be converted to a single space. So, for example, if the original text was split by line length (e.g. 80 chars per line), the split will NOT be copied to the output.
  20.   (2) I at end of sentence might be mistaken as an initial:
  21.   "Of course", said I. Milne is a great author.
  22.   (3) Quotations may be split into two sentences:
  23.   "Of course. Milne is a great author".
  24.   (4) Three-dots might split a sentence, even if they are within a quotation:
  25.   "This is a quote by the great author... A. A. Milne".
  26.  
  27.   Author: Erel Segal
  28.   Date: 2010-09-13
  29. */
  30.  
  31. #undef DEBUG
  32.  
  33. #include <iostream>
  34. #include <vector>
  35. #include <string>
  36. using namespace std;
  37.  
  38.  
  39. float probability_that_word_ends_a_sentence(string word) {
  40. // Calculate the conditional probability:
  41. // Pr(sentence_end|word)
  42. // NOTE: numbers are based on author's intuition and not on real-life data. if (!word.length())
  43.  
  44. if (!word.length())
  45. return 1.0; // empty word - possible only at beginning and end of text
  46.  
  47. char last = word[word.length()-1];
  48. if (last=='?' || last=='!') {
  49. return 0.99;
  50. } else if (last=='.') {
  51. // A dot may end a sentence, but may also mark an initial ("A. A. Milne"):
  52. if (word.length()>2) {
  53. return 0.97;
  54. } else {
  55. return 0.2;
  56. // Possible only if the sentence ends with I, such as:
  57. // "'Of course!' said I. Milne is a great author!"
  58. }
  59. } else if (last==':') { // A colon may sometimes end a sentence
  60. return 0.8;
  61. } else if (last=='"') { // there may be delimiters at the end of a quotation.
  62. if (word.length()>1) {
  63. char before_last = word[word.length()-2];
  64. if (before_last=='.' || before_last=='?' || before_last=='!') {
  65. return 0.9;
  66. } else {
  67. return 0.01; // may be a sentence end only if there is a typo.
  68. }
  69. } else {
  70. return 0.02; // may be a sentence end only if there is a typo.
  71. }
  72. } else {
  73. return 0.02; // may be a sentence end only if there is a typo.
  74. }
  75. }
  76.  
  77.  
  78. float probability_that_word_starts_a_sentence(string word) {
  79. // Calculate the conditional probability:
  80. // Pr(sentence_start|word)
  81. // NOTE: numbers are based on author's intuition and not on real-life data.
  82.  
  83. if (!word.length())
  84. return 1.0; // empty word - possible only at beginning and end of text
  85.  
  86. char first = word[0];
  87. if ('A' <= first && first <= 'Z')
  88. return 0.4; // may be a sentence start, but also a proper name.
  89. else if ('a' <= first && first <= 'z')
  90. return 0.02; // may be a sentence start only if there is a typo.
  91. else
  92. return 0.1; // may be a number or another symbol that starts a sentence rarely.
  93. }
  94.  
  95.  
  96. float probability_of_sentence_boundary(string previous_word, string current_word) {
  97. // Calculate the conditional probability:
  98. // Pr(sentence_boundary|previous_word current_word)
  99. // We have two different estimates to the same probability:
  100. // Pr(sentence_end|previous_word)
  101. // Pr(sentence_start|current_word)
  102. // so we use their arithmetic mean as a fused estimate.
  103.  
  104. float p_end = probability_that_word_ends_a_sentence(previous_word);
  105. float p_start = probability_that_word_starts_a_sentence(current_word);
  106. #ifdef DEBUG
  107. cout << previous_word << " " << p_end << " " << current_word << " " << p_start << endl;
  108. #endif
  109. return 0.5*(p_end+p_start);
  110. }
  111.  
  112.  
  113. void read_sentences_from_cin (vector<string>& sentences) {
  114. string previous_word="", current_word=""; // process two words at a time
  115. string sentence=""; // calculate one sentence at a time
  116.  
  117. // Loop until end of input:
  118. for (;;) {
  119.  
  120. // Read next word and check for end of file:
  121. current_word = "";
  122. cin >> current_word;
  123. if (!current_word.length()) {
  124. // At end of file, add the current sentence, although it may not end in a sentence-delimiter:
  125. if (sentence.length()) sentences.push_back(sentence);
  126. break;
  127. }
  128.  
  129. // If end of sentence, save the current sentence content and initialize it:
  130. if (probability_of_sentence_boundary(previous_word, current_word) > 0.5) {
  131. if (sentence.length()) sentences.push_back(sentence);
  132. sentence = "";
  133. }
  134.  
  135. // Add the current word to the current sentence:
  136. if (sentence.length()>0)
  137. sentence += " ";
  138. sentence += current_word;
  139.  
  140. previous_word = current_word;
  141. }
  142. }
  143.  
  144.  
  145. void print_sentences_to_cout (vector<string> sentences) {
  146. for (size_t i=0; i<sentences.size(); ++i) {
  147. cout << "(" << (i+1) << ") " << sentences[i] << endl;
  148. }
  149. }
  150.  
  151.  
  152. int main() {
  153. vector<string> sentences;
  154. read_sentences_from_cin (sentences);
  155. print_sentences_to_cout (sentences);
  156. }
  157.  
  158.  
  159.  
  160. /*
  161.   Test case:
  162.  
  163.  
  164.   the first sentence may be missing a capital, while leading and inside whitespace is ignored.
  165.  
  166. Here is my test case:
  167.  
  168. Newline should
  169. not break a sentence. Newline is not required to start a sentence. Floating
  170. point numbers such as Pi = 3.14 should not break a sentence. Nor should they
  171. when they are at the end such as e=2.71.
  172. A. A. Milne was a great author, his name should be totally contained in a single sentence.
  173. "Of course", said I. Milne is a great author!
  174. "Of course", I said. Milne is a great author, but the word order is also important.
  175. "Of course. I. Milne is a great author".
  176.  
  177. What happens when there is a delimiter and no capital afterwards? let's check. it depends.
  178.  
  179. "This is a quote with three dots... it ends here".
  180. "This is another quote by... Milne".
  181.  
  182. I said: "This is a quote that ends with a dot." I hope it will be considered a full sentence.
  183.  
  184. The last sentence may be missing a delimiter
  185. */
Success #stdin #stdout 0s 2864KB
stdin

   the first sentence may be missing a capital, while leading    and    inside whitespace is ignored.

Here is my test case:

Newline should
not break a sentence. Newline is not required to start a sentence. Floating
point numbers such as Pi = 3.14 should not break a sentence. Nor should they
when they are at the end such as e=2.71. 
A. A. Milne was a great author, his name should be totally contained in a single sentence.
"Of course", said I. Milne is a great author!
"Of course", I said. Milne is a great author, but the word order is also important.
"Of course. I. Milne is a great author".

What happens when there is a delimiter and no capital afterwards? let's check. it depends.

"This is a quote with three dots... it ends here".
"This is another quote by... Milne".

I said: "This is a quote that ends with a dot." I hope it will be considered a full sentence.

The last sentence may be missing a delimiter
stdout
(1)  the first sentence may be missing a capital, while leading and inside whitespace is ignored.
(2)  Here is my test case:
(3)  Newline should not break a sentence.
(4)  Newline is not required to start a sentence.
(5)  Floating point numbers such as Pi = 3.14 should not break a sentence.
(6)  Nor should they when they are at the end such as e=2.71.
(7)  A. A. Milne was a great author, his name should be totally contained in a single sentence.
(8)  "Of course", said I. Milne is a great author!
(9)  "Of course", I said.
(10)  Milne is a great author, but the word order is also important.
(11)  "Of course.
(12)  I. Milne is a great author".
(13)  What happens when there is a delimiter and no capital afterwards?
(14)  let's check. it depends.
(15)  "This is a quote with three dots... it ends here".
(16)  "This is another quote by...
(17)  Milne".
(18)  I said: "This is a quote that ends with a dot."
(19)  I hope it will be considered a full sentence.
(20)  The last sentence may be missing a delimiter