use strict; use warnings; # allowed tag and attribute names my $allowed_tags_open = 'p|body|b|u|em|strong|ul|ol|li|h1|h2|h3|h4|h5|h6|a|tr|td|table|tbody|label|div|sup|sub|caption'; my $allowed_tags_self_closing = 'img|br|hr'; my $allowed_attributes = 'alt|href|tcmuri|title|height|width|align|valign|rowspan|colspan|src|summary|class|id|name|title|target|nowrap|scope|axis|cellpadding|cellspacing|dir|lang|rel'; $allowed_attributes .= '|style'; # for testing # definitions for matching allowed tag and attribute names my $re_tags = qr~(?(DEFINE) (? /?+ (?> (?: $allowed_tags_open ) (?! [^\s>/] ) # from (?&tagname) ) ) (? (?> (?: $allowed_tags_self_closing ) (?! [^\s>/] ) # from (?&tagname) ) ) (? (?> (?&tags_open) | (?&tags_self_closing) ) ) (? (?> (?: $allowed_attributes ) (?! [^\s=/>] ) # from (?&attname) ) ) )~xi; # definitions for matching the tags # trying to follow compatible tokenization characteristics of modern browsers my $re_defs = qr~(?(DEFINE) (? [a-z/][^\s>/]*+ ) # will match the leading / in closing tags (? [^\s>/][^\s=/>]*+ ) # first char can be pretty much anything, including = (? (?> "[^"]*+" | \'[^\']*+\' | [^\s>]*+ # unquoted values can contain quotes, = and / ) ) (? (?&attname) (?: \s*+ = \s*+ (?&attval) )?+ ) (? (?!/>)[^\s>] ) # most crap inside tag is ignored, but don't eat the last / in self closing tags (? <(?&tagname) (?: \s*+ # spaces between attributes not required: bold red text (?> (?&attrib) | # order matters (?&crap) # if not an attribute, eat the crap ) )*+ \s*+ /?+ > ) )~xi; sub sanitize_html{ my $str = shift; $str =~ s/(?&tag) $re_defs/ sanitize_tag($&) /gexo; return $str; } sub sanitize_tag{ my $tag = shift; my ($name, $attr, $end) = $tag =~ /^ < ((?&tags)) (.*?) ( \/?+ > ) $ $re_tags/xo or return ''; # return empty string if not allowed tag # return a new clean closing tag if it's a closing tag return "<$name>" if substr($name, 0, 1) eq '/'; # clean attributes return "<$name" . sanitize_attributes($attr) . $end; } sub sanitize_attributes{ my $attr = shift; my $new = ''; $attr =~ s{ \G \s*+ # spaces between attributes not required (?> ( (?&attrib) ) | # order matters (?&crap) # if not an attribute, eat the crap ) $re_defs }{ my $att = $1; $new .= " $att" if $att && $att =~ /^(?&attribs) $re_tags/xo; ''; }gexo; return $new; } ### test my $test = <<'_TEST_'; simple self closing new tag and known attribute one unknown attr attr in closing tag crap be gone not bold
tricky not bold _TEST_ print $test, "\n"; print '-' x 70, "\n"; print sanitize_html $test;