fork(1) download
  1. <?php
  2.  
  /**
   * Replace every HTML character entity in the input with a plain-text label
   * of the form "HTML Entity #<decimal code point>".
   *
   * Named entities are first rewritten to decimal numeric entities through
   * convertNamedHTMLEntitiesToNumeric(); afterwards every "&#<digits>;"
   * sequence is turned into its label. Hexadecimal entities ("&#x..;") are
   * not matched and pass through unchanged.
   *
   * NOTE(review): despite its name, this function does not repair or
   * re-encode UTF-8; it only relabels entities.
   *
   * @param array $args Expects the text to process under the 'input' key.
   * @return string The text with decimal numeric entities replaced by labels.
   */
  function FixUTF8($args) {
      $output = convertNamedHTMLEntitiesToNumeric(['input' => $args['input']]);

      // One regex pass replaces all occurrences. The previous implementation
      // de-duplicated matches with `!$found[$word]`, which read an unset
      // array key and raised an "Undefined index" notice for every new entity.
      $new_output = preg_replace('/&#([0-9]+);/', 'HTML Entity #$1', $output);

      // preg_replace() returns null on a PCRE error; fall back to the
      // unlabelled text rather than returning null.
      if ($new_output === null) {
          return $output;
      }

      return $new_output;
  }
  30.  
  /**
   * Rewrite named HTML entities (e.g. "&egrave;") as decimal numeric
   * entities (e.g. "&#232;").
   *
   * Each "&name;" token is decoded with the HTML5 entity table and every
   * resulting character at or above U+0080 is re-encoded as "&#<decimal>;".
   * Names that html_entity_decode() does not recognise are left untouched.
   *
   * NOTE: entities that decode to ASCII characters (for example "&amp;")
   * are emitted as the raw character, because the conversion map starts at
   * 0x80.
   *
   * @param array $args Expects the text to convert under the 'input' key.
   * @return string|null The converted text; preg_replace_callback() returns
   *                     null if a PCRE error occurs.
   */
  function convertNamedHTMLEntitiesToNumeric($args) {
      $input = $args['input'];

      return preg_replace_callback('/(&[a-zA-Z][a-zA-Z0-9]*;)/', function ($m) {
          $decoded = html_entity_decode($m[0], ENT_HTML5, 'UTF-8');

          // Cover the whole non-ASCII Unicode range. The previous upper bound
          // of 0xffff left HTML5 entities that map to supplementary-plane
          // characters (e.g. "&Afr;" = U+1D504) as raw UTF-8 characters.
          // The mask must be all ones across 21 bits so the code point is
          // emitted unmodified.
          $convmap = [0x80, 0x10ffff, 0, 0x1fffff];

          return mb_encode_numericentity($decoded, $convmap, 'UTF-8');
      }, $input);
  }
  41.  
  // Demo run: label the entities in a sample Italian sentence and write the result to stdout.
  $sample = "Oggi &egrave; un bel&nbsp;giorno";
  echo FixUTF8(['input' => $sample]);
Success #stdin #stdout #stderr 0.02s 26660KB
stdin
Standard input is empty
stdout
Oggi HTML Entity #232 un belHTML Entity #160giorno
stderr
PHP Notice:  Undefined index: &#232; in /home/TEPVwT/prog.php on line 18
PHP Notice:  Undefined index: &#160; in /home/TEPVwT/prog.php on line 18