1Scrubber(3)           User Contributed Perl Documentation          Scrubber(3)
2
3
4

NAME

6       HTML::Scrubber - Perl extension for scrubbing/sanitizing html
7

SYNOPSIS

9           #!/usr/bin/perl -w
10           use HTML::Scrubber;
11           use strict;
12                                                                                   #
13           my $html = q[
14           <style type="text/css"> BAD { background: #666; color: #666;} </style>
15           <script language="javascript"> alert("Hello, I am EVIL!");    </script>
16           <HR>
17               a   => <a href=1>link </a>
18               br  => <br>
19               b   => <B> bold </B>
20               u   => <U> UNDERLINE </U>
21           ];
22                                                                                   #
23           my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); #
24                                                                                   #
25           print $scrubber->scrub($html);                                          #
26                                                                                   #
27           $scrubber->deny( qw[ p b i u hr br ] );                                 #
28                                                                                   #
29           print $scrubber->scrub($html);                                          #
30                                                                                   #
31

DESCRIPTION

33       If you wanna "scrub" or "sanitize" html input in a reliable an flexible
34       fashion, then this module is for you.
35
36       I wasn't satisfied with HTML::Sanitizer because it is based on
37       HTML::TreeBuilder, so I thought I'd write something similar that works
38       directly with HTML::Parser.
39

METHODS

41       First a note on documentation: just study the EXAMPLE below.  It's all
42       the documentation you could need
43
44       Also, be sure to read all the comments as well as How does it work?.
45
46       If you're new to perl, good luck to you.
47
48       comment
49
50           warn "comments are  ", $p->comment ? 'allowed' : 'not allowed';
51           $p->comment(0);  # off by default
52
53       process
54
55           warn "process instructions are  ", $p->process ? 'allowed' : 'not allowed';
56           $p->process(0);  # off by default
57
58       script
59
60           warn "script tags (and everything in between) are supressed"
61               if $p->script;      # off by default
62           $p->script( 0 ⎪⎪ 1 );
63
64       ** Please note that this is implemented using HTML::Parser's
65       ignore_elements function, so if "script" is set to true, all script
66       tags encountered will be validated like all other tags.
67
68       style
69
70           warn "style tags (and everything in between) are supressed"
71               if $p->style;       # off by default
72           $p->style( 0 ⎪⎪ 1 );
73
74       ** Please note that this is implemented using HTML::Parser's
75       ignore_elements function, so if "style" is set to true, all style tags
76       encountered will be validated like all other tags.
77
78       allow
79
80           $p->allow(qw[ t a g s ]);
81
82       deny
83
84           $p->deny(qw[ t a g s ]);
85
86       rules
87
88           $p->rules(
89               img => {
90                   src => qr{^(?!http://)}i, # only relative image links allowed
91                   alt => 1,                 # alt attribute allowed
92                   '*' => 0,                 # deny all other attributes
93               },
94               b => 1,
95               ...
96           );
97
98       default
99
100           print "default is ", $p->default();
101           $p->default(1);      # allow tags by default
102           $p->default(
103               undef,           # don't change
104               {                # default attribute rules
105                   '*' => 1,    # allow attributes by default
106               }
107           );
108
109       scrub_file
110
111           $html = $scrubber->scrub_file('foo.html');   ## returns giant string
112           die "Eeek $!" unless defined $html;  ## opening foo.html may have failed
113           $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!";
114           $scrubber->scrub_file('foo.html', *STDOUT)
115               or die "Eeek $!"
116                   if fileno STDOUT;
117
118       scrub
119
120           print $scrubber->scrub($html);  ## returns giant string
121           $scrubber->scrub($html, 'new.html') or die "Eeek $!";
122           $scrubber->scrub($html', *STDOUT)
123               or die "Eeek $!"
124                   if fileno STDOUT;
125

How does it work?

127       When a tag is encountered, HTML::Scrubber allows/denies the tag using
128       the explicit rule if one exists.
129
130       If no explicit rule exists, Scrubber applies the default rule.
131
132       If an explicit rule exists, but it's a simple rule(1), the default
133       attribute rule is applied.
134
135       EXAMPLE
136
137           #!/usr/bin/perl -w
138           use HTML::Scrubber;
139           use strict;
140                                                                                   #
141           my @allow = qw[ br hr b a ];
142                                                                                   #
143           my @rules = (
144               script => 0,
145               img => {
146                   src => qr{^(?!http://)}i, # only relative image links allowed
147                   alt => 1,                 # alt attribute allowed
148                   '*' => 0,                 # deny all other attributes
149               },
150           );
151                                                                                   #
152           my @default = (
153               0   =>    # default rule, deny all tags
154               {
155                   '*'           => 1, # default rule, allow all attributes
156                   'href'        => qr{^(?!(?:java)?script)}i,
157                   'src'         => qr{^(?!(?:java)?script)}i,
158           #   If your perl doesn't have qr
159           #   just use a string with length greater than 1
160                   'cite'        => '(?i-xsm:^(?!(?:java)?script))',
161                   'language'    => 0,
162                   'name'        => 1, # could be sneaky, but hey ;)
163                   'onblur'      => 0,
164                   'onchange'    => 0,
165                   'onclick'     => 0,
166                   'ondblclick'  => 0,
167                   'onerror'     => 0,
168                   'onfocus'     => 0,
169                   'onkeydown'   => 0,
170                   'onkeypress'  => 0,
171                   'onkeyup'     => 0,
172                   'onload'      => 0,
173                   'onmousedown' => 0,
174                   'onmousemove' => 0,
175                   'onmouseout'  => 0,
176                   'onmouseover' => 0,
177                   'onmouseup'   => 0,
178                   'onreset'     => 0,
179                   'onselect'    => 0,
180                   'onsubmit'    => 0,
181                   'onunload'    => 0,
182                   'src'         => 0,
183                   'type'        => 0,
184               }
185           );
186                                                                                   #
187           my $scrubber = HTML::Scrubber->new();
188           $scrubber->allow( @allow );
189           $scrubber->rules( @rules ); # key/value pairs
190           $scrubber->default( @default );
191           $scrubber->comment(1); # 1 allow, 0 deny
192                                                                                   #
193           ## preferred way to create the same object
194           $scrubber = HTML::Scrubber->new(
195               allow   => \@allow,
196               rules   => \@rules,
197               default => \@default,
198               comment => 1,
199               process => 0,
200           );
201                                                                                   #
202           require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV;
203                                                                                   #
204           my $it = q[
205               <?php   echo(" EVIL EVIL EVIL "); ?>    <!-- asdf -->
206               <hr>
207               <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br>
208               <B> IN BOLD </B><br>
209               <A NAME="evil">
210                   <A HREF="javascript:alert('die die die');">HREF=JAVA &lt;!&gt;</A>
211                   <br>
212                   <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');">
213                       <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT">
214                   </A>
215               </A> <br>
216           ];
217                                                                                   #
218           print "#original text",$/, $it, $/;
219           print
220               "#scrubbed text (default ",
221               $scrubber->default(), # no arguments returns the current value
222               " comment ",
223               $scrubber->comment(),
224               " process ",
225               $scrubber->process(),
226               " )",
227               $/,
228               $scrubber->scrub($it),
229               $/;
230                                                                                   #
231           $scrubber->default(1); # allow all tags by default
232           $scrubber->comment(0); # deny comments
233                                                                                   #
234           print
235               "#scrubbed text (default ",
236               $scrubber->default(),
237               " comment ",
238               $scrubber->comment(),
239               " process ",
240               $scrubber->process(),
241               " )",
242               $/,
243               $scrubber->scrub($it),
244               $/;
245                                                                                   #
246           $scrubber->process(1);        # allow process instructions (dangerous)
247           $default[0] = 1;              # allow all tags by default
248           $default[1]->{'*'} = 0;       # deny all attributes by default
249           $scrubber->default(@default); # set the default again
250                                                                                   #
251           print
252               "#scrubbed text (default ",
253               $scrubber->default(),
254               " comment ",
255               $scrubber->comment(),
256               " process ",
257               $scrubber->process(),
258               " )",
259               $/,
260               $scrubber->scrub($it),
261               $/;
262
263       FUN
264
265       If you have Test::Inline (and you've installed HTML::Scrubber), try
266
267           pod2test Scrubber.pm >scrubber.t
268           perl scrubber.t
269

SEE ALSO

271       HTML::Parser, Test::Inline, HTML::Sanitizer.
272

BUGS/SUGGESTIONS/ETC

274       Please use https://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber to
275       report bugs/additions/etc or send mail to <bug-HTML-Scrub‐
276       ber#rt.cpan.org>.
277

AUTHOR

279       D. H. (PodMaster)
280

LICENSE

282       Copyright (c) 2003-2004 by D.H. (PodMaster). All rights reserved.
283
284       This module is free software; you can redistribute it and/or modify it
285       under the same terms as Perl itself.  The LICENSE file contains the
286       full text of the license.
287
288
289
290perl v5.8.8                       2006-09-08                       Scrubber(3)
Impressum