fork(4) download
  1. <?php
  2.  
  // Stopword lookup table: every word that should be removed is stored as an
  // array KEY (the value 1 is only a placeholder), so membership can be tested
  // with isset() instead of searching a list.
  $stopwords = array_fill_keys(array('der', 'die', 'das', 'the'), 1);
  10.  
  /**
   * Remove stopwords from a string.
   *
   * The input is broken into words (runs of letters, digits, "_", "-" and "'").
   * Every word whose lower-cased form is a key of the global $stopwords map is
   * dropped, and the surviving words are joined with single spaces.
   *
   * Stopwords are only removed when the input contains at least two words, and
   * when nothing would be left (e.g. "the the") the original string is returned
   * unchanged.
   *
   * @param string $str Text to filter. Valid UTF-8 is split Unicode-aware; any
   *                    other input falls back to byte-wise splitting.
   * @return string The filtered text, or $str itself when no word survives or
   *                the input cannot be split.
   */
  function strip_stopwords($str = "")
  {
      global $stopwords;

      // A missing or malformed global list simply means "no stopwords".
      $lookup = is_array($stopwords) ? $stopwords : array();

      // 1.) Break the string into words.
      // Try a Unicode-aware split first: without the /u flag \w works on single
      // bytes and cuts multi-byte letters apart ("Mädchen" became "M dchen").
      $words = preg_split('/[^-\w\p{L}\p{M}\p{N}\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);
      $isUtf8 = ($words !== false);

      if (!$isUtf8) {
          // /u rejects input that is not valid UTF-8; use the byte-wise pattern,
          // where [^-\w\'] matches everything that is not [0-9a-zA-Z_-'].
          $words = preg_split('/[^-\w\']+/', $str, -1, PREG_SPLIT_NO_EMPTY);
      }

      if ($words === false) {
          // The regex engine failed on this input; hand it back untouched.
          return $str;
      }

      // 2.) If we have at least 2 words, remove stopwords.
      if (count($words) > 1) {
          // Multibyte-aware lower-casing is only meaningful for valid UTF-8 and
          // only possible when the mbstring extension is loaded.
          $useMb = $isUtf8 && function_exists('mb_strtolower');

          $kept = array();
          foreach ($words as $word) {
              $key = $useMb ? mb_strtolower($word, 'UTF-8') : strtolower($word);
              if (!isset($lookup[$key])) {
                  $kept[] = $word;
              }
          }
          $words = $kept;
      }

      // 3.) Guard against removing too much: "the the" would otherwise come back
      // as an empty string, so return the original input instead.
      if (empty($words)) {
          return $str;
      }

      return implode(" ", $words);
  }
  35.  
  // Demo run: print the sample sentence with its stopwords stripped.
  print(strip_stopwords("The Hobbit das foo, der"));
  38.  
Success #stdin #stdout 0.02s 52472KB
stdin
Standard input is empty
stdout
Hobbit foo