1Scrubber(3) User Contributed Perl Documentation Scrubber(3)
2
3
4
6 HTML::Scrubber - Perl extension for scrubbing/sanitizing html
7
9 #!/usr/bin/perl -w
10 use HTML::Scrubber;
11 use strict;
12 #
13 my $html = q[
14 <style type="text/css"> BAD { background: #666; color: #666;} </style>
15 <script language="javascript"> alert("Hello, I am EVIL!"); </script>
16 <HR>
17 a => <a href=1>link </a>
18 br => <br>
19 b => <B> bold </B>
20 u => <U> UNDERLINE </U>
21 ];
22 #
23 my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); #
24 #
25 print $scrubber->scrub($html); #
26 #
27 $scrubber->deny( qw[ p b i u hr br ] ); #
28 #
29 print $scrubber->scrub($html); #
30 #
31
33 If you wanna "scrub" or "sanitize" html input in a reliable an flexible
34 fashion, then this module is for you.
35
36 I wasn't satisfied with HTML::Sanitizer because it is based on
37 HTML::TreeBuilder, so I thought I'd write something similar that works
38 directly with HTML::Parser.
39
41 First a note on documentation: just study the EXAMPLE below. It's all
42 the documentation you could need
43
44 Also, be sure to read all the comments as well as How does it work?.
45
46 If you're new to perl, good luck to you.
47
48 comment
49
50 warn "comments are ", $p->comment ? 'allowed' : 'not allowed';
51 $p->comment(0); # off by default
52
53 process
54
55 warn "process instructions are ", $p->process ? 'allowed' : 'not allowed';
56 $p->process(0); # off by default
57
58 script
59
60 warn "script tags (and everything in between) are supressed"
61 if $p->script; # off by default
62 $p->script( 0 ⎪⎪ 1 );
63
64 ** Please note that this is implemented using HTML::Parser's
65 ignore_elements function, so if "script" is set to true, all script
66 tags encountered will be validated like all other tags.
67
68 style
69
70 warn "style tags (and everything in between) are supressed"
71 if $p->style; # off by default
72 $p->style( 0 ⎪⎪ 1 );
73
74 ** Please note that this is implemented using HTML::Parser's
75 ignore_elements function, so if "style" is set to true, all style tags
76 encountered will be validated like all other tags.
77
78 allow
79
80 $p->allow(qw[ t a g s ]);
81
82 deny
83
84 $p->deny(qw[ t a g s ]);
85
86 rules
87
88 $p->rules(
89 img => {
90 src => qr{^(?!http://)}i, # only relative image links allowed
91 alt => 1, # alt attribute allowed
92 '*' => 0, # deny all other attributes
93 },
94 b => 1,
95 ...
96 );
97
98 default
99
100 print "default is ", $p->default();
101 $p->default(1); # allow tags by default
102 $p->default(
103 undef, # don't change
104 { # default attribute rules
105 '*' => 1, # allow attributes by default
106 }
107 );
108
109 scrub_file
110
111 $html = $scrubber->scrub_file('foo.html'); ## returns giant string
112 die "Eeek $!" unless defined $html; ## opening foo.html may have failed
113 $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!";
114 $scrubber->scrub_file('foo.html', *STDOUT)
115 or die "Eeek $!"
116 if fileno STDOUT;
117
118 scrub
119
120 print $scrubber->scrub($html); ## returns giant string
121 $scrubber->scrub($html, 'new.html') or die "Eeek $!";
122 $scrubber->scrub($html', *STDOUT)
123 or die "Eeek $!"
124 if fileno STDOUT;
125
127 When a tag is encountered, HTML::Scrubber allows/denies the tag using
128 the explicit rule if one exists.
129
130 If no explicit rule exists, Scrubber applies the default rule.
131
132 If an explicit rule exists, but it's a simple rule(1), the default
133 attribute rule is applied.
134
135 EXAMPLE
136
137 #!/usr/bin/perl -w
138 use HTML::Scrubber;
139 use strict;
140 #
141 my @allow = qw[ br hr b a ];
142 #
143 my @rules = (
144 script => 0,
145 img => {
146 src => qr{^(?!http://)}i, # only relative image links allowed
147 alt => 1, # alt attribute allowed
148 '*' => 0, # deny all other attributes
149 },
150 );
151 #
152 my @default = (
153 0 => # default rule, deny all tags
154 {
155 '*' => 1, # default rule, allow all attributes
156 'href' => qr{^(?!(?:java)?script)}i,
157 'src' => qr{^(?!(?:java)?script)}i,
158 # If your perl doesn't have qr
159 # just use a string with length greater than 1
160 'cite' => '(?i-xsm:^(?!(?:java)?script))',
161 'language' => 0,
162 'name' => 1, # could be sneaky, but hey ;)
163 'onblur' => 0,
164 'onchange' => 0,
165 'onclick' => 0,
166 'ondblclick' => 0,
167 'onerror' => 0,
168 'onfocus' => 0,
169 'onkeydown' => 0,
170 'onkeypress' => 0,
171 'onkeyup' => 0,
172 'onload' => 0,
173 'onmousedown' => 0,
174 'onmousemove' => 0,
175 'onmouseout' => 0,
176 'onmouseover' => 0,
177 'onmouseup' => 0,
178 'onreset' => 0,
179 'onselect' => 0,
180 'onsubmit' => 0,
181 'onunload' => 0,
182 'src' => 0,
183 'type' => 0,
184 }
185 );
186 #
187 my $scrubber = HTML::Scrubber->new();
188 $scrubber->allow( @allow );
189 $scrubber->rules( @rules ); # key/value pairs
190 $scrubber->default( @default );
191 $scrubber->comment(1); # 1 allow, 0 deny
192 #
193 ## preferred way to create the same object
194 $scrubber = HTML::Scrubber->new(
195 allow => \@allow,
196 rules => \@rules,
197 default => \@default,
198 comment => 1,
199 process => 0,
200 );
201 #
202 require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV;
203 #
204 my $it = q[
205 <?php echo(" EVIL EVIL EVIL "); ?> <!-- asdf -->
206 <hr>
207 <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br>
208 <B> IN BOLD </B><br>
209 <A NAME="evil">
210 <A HREF="javascript:alert('die die die');">HREF=JAVA <!></A>
211 <br>
212 <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');">
213 <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT">
214 </A>
215 </A> <br>
216 ];
217 #
218 print "#original text",$/, $it, $/;
219 print
220 "#scrubbed text (default ",
221 $scrubber->default(), # no arguments returns the current value
222 " comment ",
223 $scrubber->comment(),
224 " process ",
225 $scrubber->process(),
226 " )",
227 $/,
228 $scrubber->scrub($it),
229 $/;
230 #
231 $scrubber->default(1); # allow all tags by default
232 $scrubber->comment(0); # deny comments
233 #
234 print
235 "#scrubbed text (default ",
236 $scrubber->default(),
237 " comment ",
238 $scrubber->comment(),
239 " process ",
240 $scrubber->process(),
241 " )",
242 $/,
243 $scrubber->scrub($it),
244 $/;
245 #
246 $scrubber->process(1); # allow process instructions (dangerous)
247 $default[0] = 1; # allow all tags by default
248 $default[1]->{'*'} = 0; # deny all attributes by default
249 $scrubber->default(@default); # set the default again
250 #
251 print
252 "#scrubbed text (default ",
253 $scrubber->default(),
254 " comment ",
255 $scrubber->comment(),
256 " process ",
257 $scrubber->process(),
258 " )",
259 $/,
260 $scrubber->scrub($it),
261 $/;
262
263 FUN
264
265 If you have Test::Inline (and you've installed HTML::Scrubber), try
266
267 pod2test Scrubber.pm >scrubber.t
268 perl scrubber.t
269
271 HTML::Parser, Test::Inline, HTML::Sanitizer.
272
274 Please use https://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber to
275 report bugs/additions/etc or send mail to <bug-HTML-Scrub‐
276 ber#rt.cpan.org>.
277
279 D. H. (PodMaster)
280
282 Copyright (c) 2003-2004 by D.H. (PodMaster). All rights reserved.
283
284 This module is free software; you can redistribute it and/or modify it
285 under the same terms as Perl itself. The LICENSE file contains the
286 full text of the license.
287
288
289
290perl v5.8.8 2006-09-08 Scrubber(3)