1Scrubber(3) User Contributed Perl Documentation Scrubber(3)
2
3
4
6 HTML::Scrubber - Perl extension for scrubbing/sanitizing html
7
9 #!/usr/bin/perl -w
10 use HTML::Scrubber;
11 use strict;
12 #
13 my $html = q[
14 <style type="text/css"> BAD { background: #666; color: #666;} </style>
15 <script language="javascript"> alert("Hello, I am EVIL!"); </script>
16 <HR>
17 a => <a href=1>link </a>
18 br => <br>
19 b => <B> bold </B>
20 u => <U> UNDERLINE </U>
21 ];
22 #
23 my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); #
24 #
25 print $scrubber->scrub($html); #
26 #
27 $scrubber->deny( qw[ p b i u hr br ] ); #
28 #
29 print $scrubber->scrub($html); #
30 #
31
33 If you wanna "scrub" or "sanitize" html input in a reliable an flexible
34 fashion, then this module is for you.
35
36 I wasn't satisfied with HTML::Sanitizer because it is based on
37 HTML::TreeBuilder, so I thought I'd write something similar that works
38 directly with HTML::Parser.
39
41 First a note on documentation: just study the EXAMPLE below. It's all
42 the documentation you could need
43
44 Also, be sure to read all the comments as well as How does it work?.
45
46 If you're new to perl, good luck to you.
47
48 comment
49 warn "comments are ", $p->comment ? 'allowed' : 'not allowed';
50 $p->comment(0); # off by default
51
52 process
53 warn "process instructions are ", $p->process ? 'allowed' : 'not allowed';
54 $p->process(0); # off by default
55
56 script
57 warn "script tags (and everything in between) are supressed"
58 if $p->script; # off by default
59 $p->script( 0 || 1 );
60
61 ** Please note that this is implemented using HTML::Parser's
62 ignore_elements function, so if "script" is set to true, all script
63 tags encountered will be validated like all other tags.
64
65 style
66 warn "style tags (and everything in between) are supressed"
67 if $p->style; # off by default
68 $p->style( 0 || 1 );
69
70 ** Please note that this is implemented using HTML::Parser's
71 ignore_elements function, so if "style" is set to true, all style tags
72 encountered will be validated like all other tags.
73
74 allow
75 $p->allow(qw[ t a g s ]);
76
77 deny
78 $p->deny(qw[ t a g s ]);
79
80 rules
81 $p->rules(
82 img => {
83 src => qr{^(?!http://)}i, # only relative image links allowed
84 alt => 1, # alt attribute allowed
85 '*' => 0, # deny all other attributes
86 },
87 b => 1,
88 ...
89 );
90
91 default
92 print "default is ", $p->default();
93 $p->default(1); # allow tags by default
94 $p->default(
95 undef, # don't change
96 { # default attribute rules
97 '*' => 1, # allow attributes by default
98 }
99 );
100
101 scrub_file
102 $html = $scrubber->scrub_file('foo.html'); ## returns giant string
103 die "Eeek $!" unless defined $html; ## opening foo.html may have failed
104 $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!";
105 $scrubber->scrub_file('foo.html', *STDOUT)
106 or die "Eeek $!"
107 if fileno STDOUT;
108
109 scrub
110 print $scrubber->scrub($html); ## returns giant string
111 $scrubber->scrub($html, 'new.html') or die "Eeek $!";
112 $scrubber->scrub($html', *STDOUT)
113 or die "Eeek $!"
114 if fileno STDOUT;
115
117 When a tag is encountered, HTML::Scrubber allows/denies the tag using
118 the explicit rule if one exists.
119
120 If no explicit rule exists, Scrubber applies the default rule.
121
122 If an explicit rule exists, but it's a simple rule(1), the default
123 attribute rule is applied.
124
125 EXAMPLE
126 #!/usr/bin/perl -w
127 use HTML::Scrubber;
128 use strict;
129 #
130 my @allow = qw[ br hr b a ];
131 #
132 my @rules = (
133 script => 0,
134 img => {
135 src => qr{^(?!http://)}i, # only relative image links allowed
136 alt => 1, # alt attribute allowed
137 '*' => 0, # deny all other attributes
138 },
139 );
140 #
141 my @default = (
142 0 => # default rule, deny all tags
143 {
144 '*' => 1, # default rule, allow all attributes
145 'href' => qr{^(?!(?:java)?script)}i,
146 'src' => qr{^(?!(?:java)?script)}i,
147 # If your perl doesn't have qr
148 # just use a string with length greater than 1
149 'cite' => '(?i-xsm:^(?!(?:java)?script))',
150 'language' => 0,
151 'name' => 1, # could be sneaky, but hey ;)
152 'onblur' => 0,
153 'onchange' => 0,
154 'onclick' => 0,
155 'ondblclick' => 0,
156 'onerror' => 0,
157 'onfocus' => 0,
158 'onkeydown' => 0,
159 'onkeypress' => 0,
160 'onkeyup' => 0,
161 'onload' => 0,
162 'onmousedown' => 0,
163 'onmousemove' => 0,
164 'onmouseout' => 0,
165 'onmouseover' => 0,
166 'onmouseup' => 0,
167 'onreset' => 0,
168 'onselect' => 0,
169 'onsubmit' => 0,
170 'onunload' => 0,
171 'src' => 0,
172 'type' => 0,
173 }
174 );
175 #
176 my $scrubber = HTML::Scrubber->new();
177 $scrubber->allow( @allow );
178 $scrubber->rules( @rules ); # key/value pairs
179 $scrubber->default( @default );
180 $scrubber->comment(1); # 1 allow, 0 deny
181 #
182 ## preferred way to create the same object
183 $scrubber = HTML::Scrubber->new(
184 allow => \@allow,
185 rules => \@rules,
186 default => \@default,
187 comment => 1,
188 process => 0,
189 );
190 #
191 require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV;
192 #
193 my $it = q[
194 <?php echo(" EVIL EVIL EVIL "); ?> <!-- asdf -->
195 <hr>
196 <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br>
197 <B> IN BOLD </B><br>
198 <A NAME="evil">
199 <A HREF="javascript:alert('die die die');">HREF=JAVA <!></A>
200 <br>
201 <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');">
202 <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT">
203 </A>
204 </A> <br>
205 ];
206 #
207 print "#original text",$/, $it, $/;
208 print
209 "#scrubbed text (default ",
210 $scrubber->default(), # no arguments returns the current value
211 " comment ",
212 $scrubber->comment(),
213 " process ",
214 $scrubber->process(),
215 " )",
216 $/,
217 $scrubber->scrub($it),
218 $/;
219 #
220 $scrubber->default(1); # allow all tags by default
221 $scrubber->comment(0); # deny comments
222 #
223 print
224 "#scrubbed text (default ",
225 $scrubber->default(),
226 " comment ",
227 $scrubber->comment(),
228 " process ",
229 $scrubber->process(),
230 " )",
231 $/,
232 $scrubber->scrub($it),
233 $/;
234 #
235 $scrubber->process(1); # allow process instructions (dangerous)
236 $default[0] = 1; # allow all tags by default
237 $default[1]->{'*'} = 0; # deny all attributes by default
238 $scrubber->default(@default); # set the default again
239 #
240 print
241 "#scrubbed text (default ",
242 $scrubber->default(),
243 " comment ",
244 $scrubber->comment(),
245 " process ",
246 $scrubber->process(),
247 " )",
248 $/,
249 $scrubber->scrub($it),
250 $/;
251
252 FUN
253 If you have Test::Inline (and you've installed HTML::Scrubber), try
254
255 pod2test Scrubber.pm >scrubber.t
256 perl scrubber.t
257
259 HTML::Parser, Test::Inline, HTML::Sanitizer.
260
262 Please use https://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber to
263 report bugs/additions/etc or send mail to
264 <bug-HTML-Scrubber#rt.cpan.org>.
265
267 D. H. (PodMaster)
268
270 Copyright (c) 2003-2004 by D.H. (PodMaster). All rights reserved.
271
272 This module is free software; you can redistribute it and/or modify it
273 under the same terms as Perl itself. The LICENSE file contains the
274 full text of the license.
275
276
277
278perl v5.12.0 2010-05-05 Scrubber(3)