XML::TokeParser(3pm)

1TokeParser(3)         User Contributed Perl Documentation        TokeParser(3)
2
3
4

NAME

6       XML::TokeParser - Simplified interface to XML::Parser
7

SYNOPSIS

9           use XML::TokeParser;
10                                                                           #
11           #parse from file
12           my $p = XML::TokeParser->new('file.xml');
13                                                                           #
14           #parse from open handle
15           open IN, 'file.xml' or die $!;
16           my $p = XML::TokeParser->new( \*IN, Noempty => 1 );
17                                                                           #
18           #parse literal text
19           my $text = '<tag xmlns="http://www.omsdev.com">text</tag>';
20           my $p    = XML::TokeParser->new( \$text, Namespaces => 1 );
21                                                                           #
22           #read next token
23           my $token = $p->get_token();
24                                                                           #
25           #skip to <title> and read text
26           $p->get_tag('title');
27           $p->get_text();
28                                                                           #
29           #read text of next <para>, ignoring any internal markup
30           $p->get_tag('para');
31           $p->get_trimmed_text('/para');
32                                                                           #
33           #process <para> if interesting text
34           $t = $p->get_tag('para');
35           $p->begin_saving($t);
36           if ( $p->get_trimmed_text('/para') =~ /interesting stuff/ ) {
37               $p->restore_saved();
38               process_para($p);
39           }
40

DESCRIPTION

42       XML::TokeParser provides a procedural ("pull mode") interface to
43       XML::Parser in much the same way that Gisle Aas' HTML::TokeParser
44       provides a procedural interface to HTML::Parser.  XML::TokeParser
45       splits its XML input up into "tokens," each corresponding to an
46       XML::Parser event.
47
48       A token is a bless'd reference to an array whose first element is an
49       event-type string and whose last element is the literal text of the XML
50       input that generated the event, with intermediate elements varying
51       according to the event type.
52
53       Each token is an object of type XML::TokeParser::Token.  Read
54       "XML::TokeParser::Token" to learn what methods are available for
55       inspecting the token, and retrieving data from it.
56

METHODS

58       $p = XML::TokeParser->new($input, [options])
59           Creates a new parser, specifying the input source and any options.
60           If $input is a string, it is the name of the file to parse.  If
61           $input is a reference to a string, that string is the actual text
62           to parse.  If $input is a reference to a typeglob or an IO::Handle
63           object corresponding to an open file or socket, the text read from
64           the handle will be parsed.
65
66           Options are name=>value pairs and can be any of the following:
67
68           Namespaces
69               If set to a true value, namespace processing is enabled.
70
71           ParseParamEnt
72               This option is passed on to the underlying XML::Parser object;
73               see that module's documentation for details.
74
75           Noempty
76               If set to a true value, text tokens consisting of only
77               whitespace (such as those created by indentation and line
78               breaks in between tags) will be ignored.
79
80           Latin
81               If set to a true value, all text other than the literal text
82               elements of tokens will be translated into the ISO 8859-1
83               (Latin-1) character encoding rather than the normal UTF-8
84               encoding.
85
86           Catalog
87               The value is the URI of a catalog file used to resolve PUBLIC
88               and SYSTEM identifiers.  See XML::Catalog for details.
89
90       $token = $p->get_token()
91           Returns the next token, as an array reference, from the input.
92           Returns undef if there are no remaining tokens.
93
94       $p->unget_token($token,...)
95           Pushes tokens back so they will be re-read.  Useful if you've read
96           one or more tokens too far.  Correctly handles "partial" tokens
97           returned by get_tag().
98
99       $token = $p->get_tag( [$token] )
100           If no argument is given, skips tokens until the next start tag or
101           end tag token.  If an argument is given, skips tokens until the
102           start tag or end tag (if the argument begins with '/') for the
103           named element.  The returned token does not include an event type
104           code; its first element is the element name, prefixed by a '/' if
105           the token is for an end tag.
106
107       $text = $p->get_text( [$token] )
108           If no argument is given, returns the text at the current position,
109           or an empty string if the next token is not a 'T' token.  If an
110           argument is given, gathers up all text between the current position
111           and the specified start or end tag, stripping out any intervening
112           tags (much like the way a typical Web browser deals with unknown
113           tags).
114
115       $text = $p->get_trimmed_text( [$token] )
116           Like get_text(), but deletes any leading or trailing whitespace
117           and collapses multiple whitespace (including newlines) into single
118           spaces.
119
120       $p->begin_saving( [$token] )
121           Causes subsequent calls to get_token(), get_tag(), get_text(), and
122           get_trimmed_text() to save the returned tokens.  In conjunction
123           with restore_saved(), allows you to "back up" within a token
124           stream.  If an argument is supplied, it is placed at the beginning
125           of the list of saved tokens (useful because you often won't know
126           you want to begin saving until you've already read the first token
127           you want saved).
128
129       $p->restore_saved()
130           Pushes all the tokens saved by begin_saving() back onto the token
131           stream.  Stops saving tokens.  To cancel saving without backing up,
132           call begin_saving() and restore_saved() in succession.
133
134   XML::TokeParser::Token
135       A token is a blessed array reference, that you acquire using
136       "$p->get_token" or "$p->get_tag", and that might look like:
137
138           ["S",  $tag, $attr, $attrseq, $raw]
139           ["E",  $tag, $raw]
140           ["T",  $text, $raw]
141           ["C",  $text, $raw]
142           ["PI", $target, $data, $raw]
143
144       If you don't like remembering array indices (you're a real programmer),
145       you may access the attributes of a token like:
146
147       "$t->tag", "$t->attr", "$t->attrseq", "$t->raw", "$t->text",
148       "$t->target", "$t->data".
149
150       ****Please note that this may change in the future, so that there will
151       be 4 token types, XML::TokeParser::Token::StartTag ....
152
153       What kind of token is it?
154
155       To find out, inspect your token using any of these is_* methods (1 ==
156       true, 0 == false, d'oh):
157
158       is_text
159       is_comment
160       is_pi which is short for is_process_instruction
161       is_start_tag
162       is_end_tag
163       is_tag
164
165       What's that token made of?  To retrieve data from your token, use any
166       of the following methods, depending on the kind of token you have:
167
168       target
169           only for process instructions
170
171       data
172           only for process instructions
173
174       raw for all tokens
175
176       attr
177           only for start tags, returns a hashref ( "print "#link ",
178           $t->attr->{href}" ).
179
180       my $attrseq = $t->attrseq
181           only for start tags, returns an array ref of the keys found in
182           "$t->attr" in the order they originally appeared in.
183
184       my $tagname = $t->tag
185           only for tags ( "print "opening ", $t->tag if
186           $t->is_start_tag" ).
187
188       my $text = $token->text
189           only for tokens of type text and comment
190
191       Here's more detailed info about the tokens.
192
193       Start tag
194           The token has five elements: 'S', the element's name, a reference
195           to a hash of attribute values keyed by attribute names, a reference
196           to an array of attribute names in the order in which they appeared
197           in the tag, and the literal text.
198
199       End tag
200           The token has three elements: 'E', the element's name, and the
201           literal text.
202
203       Character data (text)
204           The token has three elements: 'T', the parsed text, and the literal
205           text.  All contiguous runs of text are gathered into single tokens;
206           there will never be two 'T' tokens in a row.
207
208       Comment
209           The token has three elements: 'C', the parsed text of the comment,
210           and the literal text.
211
212       Processing instruction
213           The token has four elements: 'PI', the target, the data, and the
214           literal text.
215
216       The literal text includes any markup delimiters (pointy brackets,
217       <![CDATA[, etc.), entity references, and numeric character references
218       and is in the XML document's original character encoding.  All other
219       text is in UTF-8 (unless the Latin option is set, in which case it's in
220       ISO-8859-1) regardless of the original encoding, and all entity and
221       character references are expanded.
222
223       If the Namespaces option is set, element and attribute names are
224       prefixed by their (possibly empty) namespace URIs enclosed in curly
225       brackets and xmlns:* attributes do not appear in 'S' tokens.
226

DIFFERENCES FROM HTML::TokeParser

228       Uses a true XML parser rather than a modified HTML parser.
229
230       Text and comment tokens include extracted text as well as literal text.
231
232       PI tokens include target and data as well as literal text.
233
234       No tokens for declarations.
235
236       No "textify" hash.
237
238       unget_token correctly handles partial tokens returned by get_tag().
239
240       begin_saving() and restore_saved()
241

EXAMPLES

243       Example:
244
245           use XML::TokeParser;
246           use strict;
247                                                                                      #
248           my $text = '<tag foo="bar" foy="floy"> some text <!--comment--></tag>';
249           my $p    = XML::TokeParser->new( \$text );
250                                                                                      #
251           print $/;
252                                                                                      #
253           while( defined( my $t = $p->get_token() ) ){
254               local $\="\n";
255               print '         raw = ', $t->raw;
256                                                                                      #
257               if( $t->tag ){
258                   print '         tag = ', $t->tag;
259                                                                                      #
260                   if( $t->is_start_tag ) {
261                       print '        attr = ', join ',', %{$t->attr};
262                       print '     attrseq = ', join ',', @{$t->attrseq};
263                   }
264                                                                                      #
265                   print 'is_tag       ', $t->is_tag;
266                   print 'is_start_tag ', $t->is_start_tag;
267                   print 'is_end_tag   ', $t->is_end_tag;
268               }
269               elsif( $t->is_pi ){
270                   print '      target = ', $t->target;
271                   print '        data = ', $t->data;
272                   print 'is_pi        ', $t->is_pi;
273               }
274               else {
275                   print '        text = ', $t->text;
276                   print 'is_text      ', $t->is_text;
277                   print 'is_comment   ', $t->is_comment;
278               }
279                                                                                      #
280               print $/;
281           }
282           __END__
283
284       Output:
285
286                    raw = <tag foo="bar" foy="floy">
287                    tag = tag
288                   attr = foo,bar,foy,floy
289                attrseq = foo,foy
290           is_tag       1
291           is_start_tag 1
292           is_end_tag   0
293
294
295                    raw =  some text
296                   text =  some text
297           is_text      1
298           is_comment   0
299
300
301                    raw = <!--comment-->
302                   text = comment
303           is_text      0
304           is_comment   1
305
306
307                    raw = </tag>
308                    tag = tag
309           is_tag       1
310           is_start_tag 0
311           is_end_tag   1
312

BUGS

314       To report bugs, go to
315       <http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-TokeParser> or send mail
316       to <bug-XML-Tokeparser@rt.cpan.org>
317

AUTHOR

319       Copyright (c) 2003 D.H. aka PodMaster (current maintainer).  Copyright
320       (c) 2001 Eric Bohlman (original author).
321
322       All rights reserved.  This program is free software; you can
323       redistribute it and/or modify it under the same terms as Perl itself.
324       If you don't know what this means, visit <http://perl.com/> or
325       <http://cpan.org/>.
326