fork download
  1. use strict;
  2. use warnings;
  3.  
  4. # allowed tag and attribute names
  5.  
  6. my $allowed_tags_open = 'p|body|b|u|em|strong|ul|ol|li|h1|h2|h3|h4|h5|h6|a|tr|td|table|tbody|label|div|sup|sub|caption';
  7.  
  8. my $allowed_tags_self_closing = 'img|br|hr';
  9.  
  10. my $allowed_attributes = 'alt|href|tcmuri|title|height|width|align|valign|rowspan|colspan|src|summary|class|id|name|title|target|nowrap|scope|axis|cellpadding|cellspacing|dir|lang|rel';
  11.  
  12. $allowed_attributes .= '|style'; # for testing
  13.  
  14.  
  15. # definitions for matching allowed tag and attribute names
  16.  
  17. my $re_tags = qr~(?(DEFINE)
  18. (?<tags_open>
  19. /?+
  20. (?>
  21. (?: $allowed_tags_open )
  22. (?! [^\s>/] ) # from (?&tagname)
  23. )
  24. )
  25. (?<tags_self_closing>
  26. (?>
  27. (?: $allowed_tags_self_closing )
  28. (?! [^\s>/] ) # from (?&tagname)
  29. )
  30. )
  31. (?<tags> (?> (?&tags_open) | (?&tags_self_closing) ) )
  32. (?<attribs>
  33. (?>
  34. (?: $allowed_attributes )
  35. (?! [^\s=/>] ) # from (?&attname)
  36. )
  37. )
  38. )~xi;
  39.  
  40.  
  41. # definitions for matching the tags
  42. # trying to follow compatible tokenization characteristics of modern browsers
  43.  
  44. my $re_defs = qr~(?(DEFINE)
  45. (?<tagname> [a-z/][^\s>/]*+ ) # will match the leading / in closing tags
  46. (?<attname> [^\s>/][^\s=/>]*+ ) # first char can be pretty much anything, including =
  47. (?<attval> (?>
  48. "[^"]*+" |
  49. \'[^\']*+\' |
  50. [^\s>]*+ # unquoted values can contain quotes, = and /
  51. )
  52. )
  53. (?<attrib> (?&attname)
  54. (?: \s*+
  55. = \s*+
  56. (?&attval)
  57. )?+
  58. )
  59. (?<crap> (?!/>)[^\s>] ) # most crap inside tag is ignored, but don't eat the last / in self closing tags
  60. (?<tag> <(?&tagname)
  61. (?: \s*+ # spaces between attributes not required: <b/foo=">"style=color:red>bold red text</b>
  62. (?>
  63. (?&attrib) | # order matters
  64. (?&crap) # if not an attribute, eat the crap
  65. )
  66. )*+
  67. \s*+ /?+
  68. >
  69. )
  70. )~xi;
  71.  
  72.  
  73.  
  74. sub sanitize_html{
  75. my $str = shift;
  76. $str =~ s/(?&tag) $re_defs/ sanitize_tag($&) /gexo;
  77. return $str;
  78. }
  79.  
  80.  
  81. sub sanitize_tag{
  82. my $tag = shift;
  83.  
  84. my ($name, $attr, $end) =
  85. $tag =~ /^ < ((?&tags)) (.*?) ( \/?+ > ) $ $re_tags/xo
  86. or return ''; # return empty string if not allowed tag
  87.  
  88. # return a new clean closing tag if it's a closing tag
  89. return "<$name>" if substr($name, 0, 1) eq '/';
  90.  
  91. # clean attributes
  92. return "<$name" . sanitize_attributes($attr) . $end;
  93. }
  94.  
  95.  
  96. sub sanitize_attributes{
  97. my $attr = shift;
  98. my $new = '';
  99.  
  100. $attr =~ s{
  101. \G
  102. \s*+ # spaces between attributes not required
  103. (?>
  104. ( (?&attrib) ) | # order matters
  105. (?&crap) # if not an attribute, eat the crap
  106. )
  107.  
  108. $re_defs
  109. }{
  110. my $att = $1;
  111. $new .= " $att" if $att && $att =~ /^(?&attribs) $re_tags/xo;
  112. '';
  113. }gexo;
  114.  
  115. return $new;
  116. }
  117.  
  118.  
  119. ### test
  120.  
  121. my $test = <<'_TEST_';
  122. <b>simple</b>
  123. self <img>closing</img>
  124.  
  125. <abc id="test">new tag and known attribute</abc>
  126. <a id="test" xyz="testattr" href="/foo">one unknown attr</a>
  127. <a id="foo">attr in closing tag</a id="foo">
  128.  
  129. <b/#ñ%&/()!¢º`=">="">crap be gone</b> not bold<br/x"/>
  130. <b/style=color:red;background:url("x.gif");/*="still.CSS*/ id="x"zz"<script class="x">tricky</b/ x=">"//> not bold
  131. _TEST_
  132.  
  133. print $test, "\n";
  134. print '-' x 70, "\n";
  135. print sanitize_html $test;
  136.  
Success #stdin #stdout 0s 4728KB
stdin
Standard input is empty
stdout
<b>simple</b>
self <img>closing</img>

<abc id="test">new tag and known attribute</abc>
<a id="test" xyz="testattr" href="/foo">one unknown attr</a>
<a id="foo">attr in closing tag</a id="foo">

<b/#ñ%&/()!¢º`=">="">crap be gone</b> not bold<br/x"/>
<b/style=color:red;background:url("x.gif");/*="still.CSS*/ id="x"zz"<script class="x">tricky</b/ x=">"//> not bold

----------------------------------------------------------------------
<b>simple</b>
self <img>closing

new tag and known attribute
<a id="test" href="/foo">one unknown attr</a>
<a id="foo">attr in closing tag</a>

<b>crap be gone</b> not bold<br/>
<b style=color:red;background:url("x.gif");/*="still.CSS*/ id="x" class="x">tricky</b> not bold