1KinoSearch1::Docs::Tutorial(3)   User Contributed Perl Documentation   KinoSearch1::Docs::Tutorial(3)
2
3
4
6 KinoSearch1::Docs::Tutorial - sample indexing and search applications
7
9 The following sample code for invindexer.plx and search.cgi can be used
10 to create a simple search engine. It requires the html presentation of
11 the US Constitution included in the distribution for KinoSearch1, under
12 "t/us_constitution".
13
14 Note that a proper indexer for html documents would not rely on quick-
15 n-dirty regular expressions for stripping tags, as this one does for
16 the sake of brevity -- it would use a dedicated parsing module such as
17 HTML::Parser.
18
19 invindexer.plx
#!/usr/bin/perl
use strict;
use warnings;

use File::Spec;
use KinoSearch1::InvIndexer;
use KinoSearch1::Analysis::PolyAnalyzer;

### invindexer.plx -- build an invindex from the html files found in
### $source_dir, adding one document (title, bodytext, url) per file.
###
### In order for invindexer.plx to work correctly, you must modify
### $source_dir, $path_to_invindex, and possibly $base_url.
###
### $source_dir must lead to the directory containing the US
### Constitution html files.
###
### $path_to_invindex is the future location of the invindex.
###
### $base_url should reflect the location of the us_constitution directory
### when accessed via a web browser.
my $source_dir       = '';
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

opendir( my $source_dh, $source_dir )
    or die "Couldn't opendir '$source_dir': $!";

# Anchor the extension at the end of the name so that only files ending in
# ".html" are indexed; an unanchored match would also pick up names such
# as "article.html.bak".
my @filenames = grep {/\.html\z/} readdir $source_dh;
closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

### STEP 1: Choose an Analyzer.
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
    language => 'en',
);

### STEP 2: Create an InvIndexer object.
my $invindexer = KinoSearch1::InvIndexer->new(
    analyzer => $analyzer,
    invindex => $path_to_invindex,
    create   => 1,
);

### STEP 3: Define fields.
$invindexer->spec_field( name => 'title' );
$invindexer->spec_field(
    name       => 'bodytext',
    vectorized => 1,
);
$invindexer->spec_field(
    name    => 'url',
    indexed => 0,
);

foreach my $filename (@filenames) {

    # index.html is skipped rather than indexed as a document.
    next if $filename eq 'index.html';

    # Slurp the whole file so the regexes below can match across lines.
    my $filepath = File::Spec->catfile( $source_dir, $filename );
    open( my $fh, '<', $filepath )
        or die "couldn't open file '$filepath': $!";
    my $content = do { local $/; <$fh> };
    close $fh or die "couldn't close file '$filepath': $!";

    ### STEP 4: Start a new document.
    my $doc = $invindexer->new_doc;

    $content =~ m#<title>(.*?)</title>#s
        or die "couldn't isolate title in '$filepath'";
    my $title = $1;
    $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
        or die "couldn't isolate bodytext in '$filepath'";
    my $bodytext = $1;
    $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

    ### STEP 5: Set the value for each field.
    $doc->set_value( url      => "$base_url/$filename" );
    $doc->set_value( title    => $title );
    $doc->set_value( bodytext => $bodytext );

    ### STEP 6: Add the document to the invindex.
    $invindexer->add_doc($doc);

    ### STEP 7: Repeat steps 4-6 for each document in the collection.
}

### STEP 8: Finalize the invindex.
$invindexer->finish;
101
102 search.cgi
#!/usr/bin/perl -T
use strict;
use warnings;

use CGI;
use List::Util qw( max min );
use POSIX qw( ceil );
use KinoSearch1::Searcher;
use KinoSearch1::Analysis::PolyAnalyzer;
use KinoSearch1::Highlight::Highlighter;

### search.cgi -- run the query supplied in the 'q' CGI parameter against
### the invindex and display one page of results, starting at the hit
### number given by the 'offset' CGI parameter.
my $cgi           = CGI->new;
my $q             = $cgi->param('q');
my $offset        = $cgi->param('offset');
my $hits_per_page = 10;
$q = '' unless defined $q;

# 'offset' arrives from the browser and is used in arithmetic and passed to
# seek() below, so accept only a short run of digits (untainting it via the
# capture) and fall back to the first page for anything else.
if ( defined $offset and $offset =~ /\A([0-9]{1,9})\z/ ) {
    $offset = $1 + 0;
}
else {
    $offset = 0;
}

### In order for search.cgi to work, $path_to_invindex must be modified so
### that it points to the invindex created by invindexer.plx, and
### $base_url may have to change to reflect where a web-browser should
### look for the us_constitution directory.
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

### STEP 1: Specify the same Analyzer used to create the invindex.
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
    language => 'en',
);

### STEP 2: Create a Searcher object.
my $searcher = KinoSearch1::Searcher->new(
    invindex => $path_to_invindex,
    analyzer => $analyzer,
);

### STEP 3: Feed a query to the Searcher object.
my $hits = $searcher->search($q);

### STEP 4: Arrange for highlighted excerpts to be created.
my $highlighter = KinoSearch1::Highlight::Highlighter->new(
    excerpt_field => 'bodytext' );
$hits->create_excerpts( highlighter => $highlighter );

### STEP 5: Process the search.
$hits->seek( $offset, $hits_per_page );
149
### STEP 6: Format the results however you like.

# create result list
my $report = '';
while ( my $hit = $hits->fetch_hit_hashref ) {
    my $score = sprintf( "%0.3f", $hit->{score} );

    # The url is plain text stored in the invindex, so escape it before
    # placing it in an attribute or in element content.  The title and the
    # excerpt are interpolated unchanged, as in the original script.
    # NOTE(review): presumably both already hold HTML markup (title taken
    # from the source document, excerpt from the Highlighter) -- confirm.
    my $url = CGI::escapeHTML( $hit->{url} );
    $report .= qq|
        <p>
            <a href="$url"><strong>$hit->{title}</strong></a>
            <em>$score</em>
            <br>
            $hit->{excerpt}
            <br>
            <span class="excerptURL">$url</span>
        </p>
        |;
}

# From here on $q is only used for display, so make it safe for HTML.
$q = CGI::escapeHTML($q);

# display info about the number of hits, paging links
my $total_hits = $hits->total_hits;
my $num_hits_info;
if ( !length $q ) {
    # no query, no display
    $num_hits_info = '';
}
elsif ( $total_hits == 0 ) {
    # alert the user that their search failed
    $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
}
else {
    # calculate the nums for the first and last hit to display
    my $last_result  = min( ( $offset + $hits_per_page ), $total_hits );
    my $first_result = min( ( $offset + 1 ), $last_result );

    # display the result nums, start paging info
    $num_hits_info = qq|
        <p>
            Results <strong>$first_result-$last_result</strong>
            of <strong>$total_hits</strong> for <strong>$q</strong>.
        </p>
        <p>
            Results Page:
        |;

    # calculate first and last hits pages to display / link to
    my $current_page = int( $first_result / $hits_per_page ) + 1;
    my $last_page    = ceil( $total_hits / $hits_per_page );
    my $first_page   = max( 1, ( $current_page - 9 ) );
    $last_page = min( $last_page, ( $current_page + 10 ) );

    # create a url for use in paging links
    my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
    $href .= ";offset=0" unless $href =~ /offset=/;

    # Build the HTML-escaped link for a given offset.  The whole existing
    # offset value is replaced (not just a run of digits), so the links stay
    # usable even when the incoming offset parameter was not a number.
    my $href_for = sub {
        my $new_offset = shift;
        ( my $link = $href ) =~ s/(?<=offset=)[^;&]*/$new_offset/;
        return CGI::escapeHTML($link);
    };

    # generate the "Prev" link;
    if ( $current_page > 1 ) {
        my $link = $href_for->( ( $current_page - 2 ) * $hits_per_page );
        $num_hits_info .= qq|<a href="$link">&lt;= Prev</a>\n|;
    }

    # generate paging links
    for my $page_num ( $first_page .. $last_page ) {
        if ( $page_num == $current_page ) {
            # the current page is shown as plain text, not as a link
            $num_hits_info .= qq|$page_num \n|;
        }
        else {
            my $link = $href_for->( ( $page_num - 1 ) * $hits_per_page );
            $num_hits_info .= qq|<a href="$link">$page_num</a>\n|;
        }
    }

    # generate the "Next" link
    if ( $current_page != $last_page ) {
        my $link = $href_for->( $current_page * $hits_per_page );
        $num_hits_info .= qq|<a href="$link">Next =&gt;</a>\n|;
    }

    # finish paging links
    $num_hits_info .= "</p>\n";
}

# blast it all out
print "Content-type: text/html\n\n";
print <<END_HTML;
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
    <meta http-equiv="Content-type"
        content="text/html;charset=ISO-8859-1">
    <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
    <title>KinoSearch: $q</title>
</head>

<body>

    <div id="navigation">
        <form id="usconSearch" action="">
            <strong>
                Search the <a href="$base_url/index.html">US Constitution</a>:
            </strong>
            <input type="text" name="q" id="q" value="$q">
            <input type="submit" value="=>">
            <input type="hidden" name="offset" value="0">
        </form>
    </div><!--navigation-->

    <div id="bodytext">

    $report

    $num_hits_info

    <p style="font-size: smaller; color: #666">
        <em>Powered by
            <a href="http://www.rectangular.com/kinosearch/">
                KinoSearch
            </a>
        </em>
    </p>
    </div><!--bodytext-->

</body>

</html>
END_HTML
281
283 Copyright 2005-2010 Marvin Humphrey
284
286 See KinoSearch1 version 1.00.
287
288
289
290perl v5.12.2 2010-10-05 KinoSearch1::Docs::Tutorial(3)