fork(2) download
  1. <?php
  2.  
  3. function sentence_split($text) {
  4. $before_regexes = array('/(?:(?:[\'\"„][\.!?…][\'\"”]\s)|(?:[^\.]\s[A-Z]\.\s)|(?:\b(?:St|Gen|Hon|Vol|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\.\s)|(?:\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\.\s[A-Z]\.\s)|(?:\bApr\.\s)|(?:\bAug\.\s)|(?:\bBros\.\s)|(?:\bCo\.\s)|(?:\bCorp\.\s)|(?:\bDec\.\s)|(?:\bDist\.\s)|(?:\bFeb\.\s)|(?:\bInc\.\s)|(?:\bJan\.\s)|(?:\bJul\.\s)|(?:\bJun\.\s)|(?:\bMar\.\s)|(?:\bNov\.\s)|(?:\bOct\.\s)|(?:\bPh\.?D\.\s)|(?:\bSept?\.\s)|(?:\b\p{Lu}\.\p{Lu}\.\s)|(?:\b\p{Lu}\.\s\p{Lu}\.\s)|(?:\bcf\.\s)|(?:\be\.g\.\s)|(?:\besp\.\s)|(?:\bet\b\s\bal\.\s)|(?:\bvs\.\s)|(?:\p{Ps}[!?]+\p{Pe} ))\Z/mu',
  5. '/(?:(?:[\.\s]\p{L}{1,2}\.\s))\Z/mu',
  6. '/(?:(?:[\[\(]*\.\.\.[\]\)]* ))\Z/mu',
  7. '/(?:(?:\b(?:pp|[Vv]iz|i\.?\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\.?\s*f|vs)\.\s))\Z/mu',
  8. '/(?:(?:\b[Ee]tc\.\s))\Z/mu',
  9. '/(?:(?:[\.!?…]+\p{Pe} )|(?:[\[\(]*…[\]\)]* ))\Z/mu',
  10. '/(?:(?:\b\p{L}\.))\Z/mu',
  11. '/(?:(?:\b\p{L}\.\s))\Z/mu',
  12. '/(?:(?:\b[Ff]igs?\.\s)|(?:\b[nN]o\.\s))\Z/mu',
  13. '/(?:(?:[\"”\']\s*))\Z/mu',
  14. '/(?:(?:[\.!?…][\x{00BB}\x{2019}\x{201D}\x{203A}\"\'\p{Pe}\x{0002}]*\s)|(?:\r?\n))\Z/mu',
  15. '/(?:(?:[\.!?…][\'\"\x{00BB}\x{2019}\x{201D}\x{203A}\p{Pe}\x{0002}]*))\Z/mu',
  16. '/(?:(?:\s\p{L}[\.!?…]\s))\Z/mu');
  17. $after_regexes = array('/\A(?:)/mu',
  18. '/\A(?:[\p{N}\p{Ll}])/mu',
  19. '/\A(?:[^\p{Lu}])/mu',
  20. '/\A(?:[^\p{Lu}]|I)/mu',
  21. '/\A(?:[^p{Lu}])/mu',
  22. '/\A(?:\p{Ll})/mu',
  23. '/\A(?:\p{L}\.)/mu',
  24. '/\A(?:\p{L}\.\s)/mu',
  25. '/\A(?:\p{N})/mu',
  26. '/\A(?:\s*\p{Ll})/mu',
  27. '/\A(?:)/mu',
  28. '/\A(?:\p{Lu}[^\p{Lu}])/mu',
  29. '/\A(?:\p{Lu}\p{Ll})/mu');
  30. $is_sentence_boundary = array(false, false, false, false, false, false, false, false, false, false, true, true, true);
  31. $count = 13;
  32.  
  33. $sentences = array();
  34. $sentence = '';
  35. $before = '';
  36. $after = substr($text, 0, 10);
  37. $text = substr($text, 10);
  38.  
  39. while($text != '') {
  40. for($i = 0; $i < $count; $i++) {
  41. if(preg_match($before_regexes[$i], $before) && preg_match($after_regexes[$i], $after)) {
  42. if($is_sentence_boundary[$i]) {
  43. array_push($sentences, $sentence);
  44. $sentence = '';
  45. }
  46. break;
  47. }
  48. }
  49.  
  50. $first_from_text = $text[0];
  51. $text = substr($text, 1);
  52. $first_from_after = $after[0];
  53. $after = substr($after, 1);
  54. $before .= $first_from_after;
  55. $sentence .= $first_from_after;
  56. $after .= $first_from_text;
  57. }
  58.  
  59. if($sentence != '' && $after != '') {
  60. array_push($sentences, $sentence.$after);
  61. }
  62.  
  63. return $sentences;
  64. }
  65.  
  66. $text = "Entertainment media properties.&Acirc;&nbsp; Fairy Tail and Tokyo Ghoul.";
  67. print_r(sentence_split($text));
  68.  
  69.  
  70.  
  71.  
Success #stdin #stdout 0.03s 52472KB
stdin
Standard input is empty
stdout
Array
(
    [0] => Entertainment media properties.&Acirc;&nbsp; Fairy Tail and Tokyo Ghoul.
)