use strict;
use warnings;
# allowed tag and attribute names
my $allowed_tags_open = 'p|body|b|u|em|strong|ul|ol|li|h1|h2|h3|h4|h5|h6|a|tr|td|table|tbody|label|div|sup|sub|caption';
my $allowed_tags_self_closing = 'img|br|hr';
my $allowed_attributes = 'alt|href|tcmuri|title|height|width|align|valign|rowspan|colspan|src|summary|class|id|name|title|target|nowrap|scope|axis|cellpadding|cellspacing|dir|lang|rel';
$allowed_attributes .= '|style'; # for testing
# definitions for matching allowed tag and attribute names
my $re_tags = qr~(?(DEFINE
) (?<tags_open>
/?+
(?>
(?: $allowed_tags_open )
(?! [^\s>/] ) # from (?&tagname)
)
)
(?<tags_self_closing>
(?>
(?: $allowed_tags_self_closing )
(?! [^\s>/] ) # from (?&tagname)
)
)
(?<tags> (?> (?&tags_open) | (?&tags_self_closing) ) )
(?<attribs>
(?>
(?: $allowed_attributes )
(?! [^\
s=/>] ) # from (?&attname) )
)
)~xi;
# definitions for matching the tags
# trying to follow compatible tokenization characteristics of modern browsers
my $re_defs = qr~(?(DEFINE
) (?<tagname> [a-z/][^\s>/]*+ ) # will match the leading / in closing tags
(?<attname> [^\s>/][^\
s=/>]*+ ) # first char can be pretty much anything, including = (?<attval> (?>
"[^"]*+" |
\'[^\']*+\' |
[^\s>]*+ # unquoted values can contain quotes, = and /
)
)
(?<attrib> (?&attname)
(?: \s*+
= \s*+
(?&attval)
)?+
)
(?<crap> (?!/>)[^\s>] ) # most crap inside tag is ignored, but don't eat the last / in self closing tags
(?<tag> <(?&tagname)
(?: \s*+ # spaces between attributes not required: <b/foo=">"style=color:red>bold red text</b>
(?>
(?&attrib) | # order matters
(?&crap) # if not an attribute, eat the crap
)
)*+
\s*+ /?+
>
)
)~xi;
sub sanitize_html{
my $str = shift;
$str =~ s/(?&tag) $re_defs/ sanitize_tag($&) /gexo;
return $str;
}
sub sanitize_tag{
my $tag = shift;
my ($name, $attr, $end) =
$tag =~ /^ < ((?&tags)) (.*?) ( \/?+ > ) $ $re_tags/xo
or return ''; # return empty string if not allowed tag
# return a new clean closing tag if it's a closing tag
return "<$name>" if substr($name, 0, 1) eq '/';
# clean attributes
return "<$name" . sanitize_attributes($attr) . $end;
}
sub sanitize_attributes{
my $attr = shift;
my $new = '';
$attr =~ s{
\G
\s*+ # spaces between attributes not required
(?>
( (?&attrib) ) | # order matters
(?&crap) # if not an attribute, eat the crap
)
$re_defs
}{
my $att = $1;
$new .= " $att" if $att && $att =~ /^(?&attribs) $re_tags/xo;
'';
}gexo;
return $new;
}
### test
my $test = <<'_TEST_';
<b>simple</b>
self <img>closing</img>
<abc id="test">new tag and known attribute</abc>
<a id="test" xyz="testattr" href="/foo">one unknown attr</a>
<a id="foo">attr in closing tag</a id="foo">
<b/#ñ%&/()!¢º`=">="">crap be gone</b> not bold<br/x"/>
<b/style=color:red;background:url("x.gif");/*="still.CSS*/ id="x"zz"<script class="x">tricky</b/ x=">"//> not bold
_TEST_
print $test, "\n";
print '-' x 70, "\n";
print sanitize_html $test;