1Scrubber(3)           User Contributed Perl Documentation          Scrubber(3)
2
3
4

NAME

6       HTML::Scrubber - Perl extension for scrubbing/sanitizing html
7

SYNOPSIS

9           #!/usr/bin/perl -w
10           use HTML::Scrubber;
11           use strict;
12                                                                                   #
13           my $html = q[
14           <style type="text/css"> BAD { background: #666; color: #666;} </style>
15           <script language="javascript"> alert("Hello, I am EVIL!");    </script>
16           <HR>
17               a   => <a href=1>link </a>
18               br  => <br>
19               b   => <B> bold </B>
20               u   => <U> UNDERLINE </U>
21           ];
22                                                                                   #
23           my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); #
24                                                                                   #
25           print $scrubber->scrub($html);                                          #
26                                                                                   #
27           $scrubber->deny( qw[ p b i u hr br ] );                                 #
28                                                                                   #
29           print $scrubber->scrub($html);                                          #
30                                                                                   #
31

DESCRIPTION

33       If you wanna "scrub" or "sanitize" html input in a reliable an flexible
34       fashion, then this module is for you.
35
36       I wasn't satisfied with HTML::Sanitizer because it is based on
37       HTML::TreeBuilder, so I thought I'd write something similar that works
38       directly with HTML::Parser.
39

METHODS

41       First a note on documentation: just study the EXAMPLE below.  It's all
42       the documentation you could need
43
44       Also, be sure to read all the comments as well as How does it work?.
45
46       If you're new to perl, good luck to you.
47
48   comment
49           warn "comments are  ", $p->comment ? 'allowed' : 'not allowed';
50           $p->comment(0);  # off by default
51
52   process
53           warn "process instructions are  ", $p->process ? 'allowed' : 'not allowed';
54           $p->process(0);  # off by default
55
56   script
57           warn "script tags (and everything in between) are supressed"
58               if $p->script;      # off by default
59           $p->script( 0 || 1 );
60
61       ** Please note that this is implemented using HTML::Parser's
62       ignore_elements function, so if "script" is set to true, all script
63       tags encountered will be validated like all other tags.
64
65   style
66           warn "style tags (and everything in between) are supressed"
67               if $p->style;       # off by default
68           $p->style( 0 || 1 );
69
70       ** Please note that this is implemented using HTML::Parser's
71       ignore_elements function, so if "style" is set to true, all style tags
72       encountered will be validated like all other tags.
73
74   allow
75           $p->allow(qw[ t a g s ]);
76
77   deny
78           $p->deny(qw[ t a g s ]);
79
80   rules
81           $p->rules(
82               img => {
83                   src => qr{^(?!http://)}i, # only relative image links allowed
84                   alt => 1,                 # alt attribute allowed
85                   '*' => 0,                 # deny all other attributes
86               },
87               b => 1,
88               ...
89           );
90
91   default
92           print "default is ", $p->default();
93           $p->default(1);      # allow tags by default
94           $p->default(
95               undef,           # don't change
96               {                # default attribute rules
97                   '*' => 1,    # allow attributes by default
98               }
99           );
100
101   scrub_file
102           $html = $scrubber->scrub_file('foo.html');   ## returns giant string
103           die "Eeek $!" unless defined $html;  ## opening foo.html may have failed
104           $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!";
105           $scrubber->scrub_file('foo.html', *STDOUT)
106               or die "Eeek $!"
107                   if fileno STDOUT;
108
109   scrub
110           print $scrubber->scrub($html);  ## returns giant string
111           $scrubber->scrub($html, 'new.html') or die "Eeek $!";
112           $scrubber->scrub($html', *STDOUT)
113               or die "Eeek $!"
114                   if fileno STDOUT;
115

How does it work?

117       When a tag is encountered, HTML::Scrubber allows/denies the tag using
118       the explicit rule if one exists.
119
120       If no explicit rule exists, Scrubber applies the default rule.
121
122       If an explicit rule exists, but it's a simple rule(1), the default
123       attribute rule is applied.
124
125   EXAMPLE
126           #!/usr/bin/perl -w
127           use HTML::Scrubber;
128           use strict;
129                                                                                   #
130           my @allow = qw[ br hr b a ];
131                                                                                   #
132           my @rules = (
133               script => 0,
134               img => {
135                   src => qr{^(?!http://)}i, # only relative image links allowed
136                   alt => 1,                 # alt attribute allowed
137                   '*' => 0,                 # deny all other attributes
138               },
139           );
140                                                                                   #
141           my @default = (
142               0   =>    # default rule, deny all tags
143               {
144                   '*'           => 1, # default rule, allow all attributes
145                   'href'        => qr{^(?!(?:java)?script)}i,
146                   'src'         => qr{^(?!(?:java)?script)}i,
147           #   If your perl doesn't have qr
148           #   just use a string with length greater than 1
149                   'cite'        => '(?i-xsm:^(?!(?:java)?script))',
150                   'language'    => 0,
151                   'name'        => 1, # could be sneaky, but hey ;)
152                   'onblur'      => 0,
153                   'onchange'    => 0,
154                   'onclick'     => 0,
155                   'ondblclick'  => 0,
156                   'onerror'     => 0,
157                   'onfocus'     => 0,
158                   'onkeydown'   => 0,
159                   'onkeypress'  => 0,
160                   'onkeyup'     => 0,
161                   'onload'      => 0,
162                   'onmousedown' => 0,
163                   'onmousemove' => 0,
164                   'onmouseout'  => 0,
165                   'onmouseover' => 0,
166                   'onmouseup'   => 0,
167                   'onreset'     => 0,
168                   'onselect'    => 0,
169                   'onsubmit'    => 0,
170                   'onunload'    => 0,
171                   'src'         => 0,
172                   'type'        => 0,
173               }
174           );
175                                                                                   #
176           my $scrubber = HTML::Scrubber->new();
177           $scrubber->allow( @allow );
178           $scrubber->rules( @rules ); # key/value pairs
179           $scrubber->default( @default );
180           $scrubber->comment(1); # 1 allow, 0 deny
181                                                                                   #
182           ## preferred way to create the same object
183           $scrubber = HTML::Scrubber->new(
184               allow   => \@allow,
185               rules   => \@rules,
186               default => \@default,
187               comment => 1,
188               process => 0,
189           );
190                                                                                   #
191           require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV;
192                                                                                   #
193           my $it = q[
194               <?php   echo(" EVIL EVIL EVIL "); ?>    <!-- asdf -->
195               <hr>
196               <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br>
197               <B> IN BOLD </B><br>
198               <A NAME="evil">
199                   <A HREF="javascript:alert('die die die');">HREF=JAVA &lt;!&gt;</A>
200                   <br>
201                   <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');">
202                       <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT">
203                   </A>
204               </A> <br>
205           ];
206                                                                                   #
207           print "#original text",$/, $it, $/;
208           print
209               "#scrubbed text (default ",
210               $scrubber->default(), # no arguments returns the current value
211               " comment ",
212               $scrubber->comment(),
213               " process ",
214               $scrubber->process(),
215               " )",
216               $/,
217               $scrubber->scrub($it),
218               $/;
219                                                                                   #
220           $scrubber->default(1); # allow all tags by default
221           $scrubber->comment(0); # deny comments
222                                                                                   #
223           print
224               "#scrubbed text (default ",
225               $scrubber->default(),
226               " comment ",
227               $scrubber->comment(),
228               " process ",
229               $scrubber->process(),
230               " )",
231               $/,
232               $scrubber->scrub($it),
233               $/;
234                                                                                   #
235           $scrubber->process(1);        # allow process instructions (dangerous)
236           $default[0] = 1;              # allow all tags by default
237           $default[1]->{'*'} = 0;       # deny all attributes by default
238           $scrubber->default(@default); # set the default again
239                                                                                   #
240           print
241               "#scrubbed text (default ",
242               $scrubber->default(),
243               " comment ",
244               $scrubber->comment(),
245               " process ",
246               $scrubber->process(),
247               " )",
248               $/,
249               $scrubber->scrub($it),
250               $/;
251
252   FUN
253       If you have Test::Inline (and you've installed HTML::Scrubber), try
254
255           pod2test Scrubber.pm >scrubber.t
256           perl scrubber.t
257

SEE ALSO

259       HTML::Parser, Test::Inline, HTML::Sanitizer.
260

BUGS/SUGGESTIONS/ETC

262       Please use https://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber to
263       report bugs/additions/etc or send mail to
264       <bug-HTML-Scrubber#rt.cpan.org>.
265

AUTHOR

267       D. H. (PodMaster)
268

LICENSE

270       Copyright (c) 2003-2004 by D.H. (PodMaster). All rights reserved.
271
272       This module is free software; you can redistribute it and/or modify it
273       under the same terms as Perl itself.  The LICENSE file contains the
274       full text of the license.
275
276
277
278perl v5.12.0                      2010-05-05                       Scrubber(3)
Impressum