KinoSearch1::Docs::Tutorial(3pm)

1KinoSearch1::Docs::Tutorial(3)   User Contributed Perl Documentation   KinoSearch1::Docs::Tutorial(3)
2
3
4

NAME

6       KinoSearch1::Docs::Tutorial - sample indexing and search applications
7

DESCRIPTION

9       The following sample code for invindexer.plx and search.cgi can be used
10       to create a simple search engine. It requires the html presentation of
11       the US Constitution included in the distribution for KinoSearch1, under
12       "t/us_constitution".
13
14       Note that a proper indexer for html documents would not rely on quick-
15       n-dirty regular expressions for stripping tags, as this one does for
16       the sake of brevity -- it would use a dedicated parsing module such as
17       HTML::Parser.
18
19   invindexer.plx
#!/usr/bin/perl
use strict;
use warnings;

use File::Spec;
use KinoSearch1::InvIndexer;
use KinoSearch1::Analysis::PolyAnalyzer;

### invindexer.plx -- build an invindex from the html presentation of the
### US Constitution, one document per html file.
###
### In order for invindexer.plx to work correctly, you must modify
### $source_dir, $path_to_invindex, and possibly $base_url.
###
### $source_dir must lead to the directory containing the US
### Constitution html files.
###
### $path_to_invindex is the future location of the invindex.
###
### $base_url should reflect the location of the us_constitution directory
### when accessed via a web browser.
my $source_dir       = '';
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

# Collect the names of the html files.  The pattern is anchored at the end
# of the name so that editor backups such as "foo.html~" or "foo.html.bak"
# are not indexed, and the list is sorted because readdir returns entries
# in no particular order.
opendir( my $source_dh, $source_dir )
    or die "Couldn't opendir '$source_dir': $!";
my @filenames = sort grep {/\.html\z/} readdir $source_dh;
closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

### STEP 1: Choose an Analyzer.
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
    language => 'en',
);

### STEP 2: Create an InvIndexer object.
my $invindexer = KinoSearch1::InvIndexer->new(
    analyzer => $analyzer,
    invindex => $path_to_invindex,
    create   => 1,
);

### STEP 3: Define fields.
$invindexer->spec_field( name => 'title' );
$invindexer->spec_field(
    name       => 'bodytext',
    vectorized => 1,
);
$invindexer->spec_field(
    name    => 'url',
    indexed => 0,
);

foreach my $filename (@filenames) {
    # index.html is skipped; only the other html files become documents.
    next if $filename eq 'index.html';

    # Slurp the whole file, then release the handle right away instead of
    # leaving it open until the next loop iteration.
    my $filepath = File::Spec->catfile( $source_dir, $filename );
    open( my $fh, '<', $filepath )
        or die "couldn't open file '$filepath': $!";
    my $content = do { local $/; <$fh> };
    close $fh or die "couldn't close file '$filepath': $!";

    ### STEP 4: Start a new document.
    my $doc = $invindexer->new_doc;

    # Isolate the title and the body text.  Each capture is copied out of
    # $1 immediately, before the next match can overwrite it.
    $content =~ m#<title>(.*?)</title>#s
        or die "couldn't isolate title in '$filepath'";
    my $title = $1;
    $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
        or die "couldn't isolate bodytext in '$filepath'";
    my $bodytext = $1;
    $bodytext =~ s/<.*?>/ /gs;    # quick and dirty tag stripping

    ### STEP 5: Set the value for each field.
    $doc->set_value( url      => "$base_url/$filename" );
    $doc->set_value( title    => $title );
    $doc->set_value( bodytext => $bodytext );

    ### STEP 6: Add the document to the invindex.
    $invindexer->add_doc($doc);

    ### STEP 7: Repeat steps 4-6 for each document in the collection.
}

### STEP 8: Finalize the invindex.
$invindexer->finish;
101
102   search.cgi
#!/usr/bin/perl -T
use strict;
use warnings;

use CGI;
use List::Util qw( max min );
use POSIX qw( ceil );
use KinoSearch1::Searcher;
use KinoSearch1::Analysis::PolyAnalyzer;
use KinoSearch1::Highlight::Highlighter;

### search.cgi -- run the query given in the "q" parameter against the
### invindex and print one page of results, starting at the hit number
### given in the "offset" parameter.

my $cgi           = CGI->new;
my $q             = $cgi->param('q');
my $offset        = $cgi->param('offset');
my $hits_per_page = 10;
$q = '' unless defined $q;

# "offset" arrives straight from the browser.  Accept only a short run of
# ASCII digits (taking the value from the capture also untaints it under
# -T); anything else -- missing, negative, non-numeric -- means "start at
# the first hit".
$offset = ( defined $offset && $offset =~ /\A([0-9]{1,9})\z/ ) ? $1 : 0;

### In order for search.cgi to work, $path_to_invindex must be modified so
### that it points to the invindex created by invindexer.plx, and
### $base_url may have to change to reflect where a web-browser should
### look for the us_constitution directory.
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

### STEP 1: Specify the same Analyzer used to create the invindex.
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
    language => 'en',
);

### STEP 2: Create a Searcher object.
my $searcher = KinoSearch1::Searcher->new(
    invindex => $path_to_invindex,
    analyzer => $analyzer,
);

### STEP 3: Feed a query to the Searcher object.
my $hits = $searcher->search($q);

### STEP 4: Arrange for highlighted excerpts to be created.
my $highlighter = KinoSearch1::Highlight::Highlighter->new(
    excerpt_field => 'bodytext' );
$hits->create_excerpts( highlighter => $highlighter );

### STEP 5: Process the search.
$hits->seek( $offset, $hits_per_page );

### STEP 6: Format the results however you like.

# create result list
# NOTE(review): url, title and excerpt are interpolated without HTML
# escaping.  They come from the invindex rather than from the request;
# confirm that the indexed collection is trusted.
my $report = '';
while ( my $hit = $hits->fetch_hit_hashref ) {
    my $score = sprintf( "%0.3f", $hit->{score} );
    $report .= qq|
        <p>
            <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
            <em>$score</em>
            <br>
            $hit->{excerpt}
            <br>
            <span class="excerptURL">$hit->{url}</span>
        </p>
        |;
}

# From here on $q is only used for display, so make it safe for HTML.
$q = CGI::escapeHTML($q);

# display info about the number of hits, paging links
my $total_hits = $hits->total_hits;
my $num_hits_info;
if ( !length $q ) {
    # no query, no display
    $num_hits_info = '';
}
elsif ( $total_hits == 0 ) {
    # alert the user that their search failed
    $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
}
else {
    # calculate the nums for the first and last hit to display
    my $last_result  = min( ( $offset + $hits_per_page ), $total_hits );
    my $first_result = min( ( $offset + 1 ), $last_result );

    # display the result nums, start paging info
    $num_hits_info = qq|
        <p>
            Results <strong>$first_result-$last_result</strong>
            of <strong>$total_hits</strong> for <strong>$q</strong>.
        </p>
        <p>
            Results Page:
        |;

    # calculate first and last hits pages to display / link to
    # Result numbers are 1-based, so subtract 1 before dividing; otherwise
    # the last hit of a page (or every hit, when $hits_per_page is 1) is
    # attributed to the following page.
    my $current_page = int( ( $first_result - 1 ) / $hits_per_page ) + 1;
    my $last_page    = ceil( $total_hits / $hits_per_page );
    my $first_page   = max( 1, ( $current_page - 9 ) );
    $last_page = min( $last_page, ( $current_page + 10 ) );

    # generate the "Prev" link;
    if ( $current_page > 1 ) {
        my $href
            = paging_href( $cgi, ( $current_page - 2 ) * $hits_per_page );
        $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
    }

    # generate paging links
    for my $page_num ( $first_page .. $last_page ) {
        if ( $page_num == $current_page ) {
            $num_hits_info .= qq|$page_num \n|;
        }
        else {
            my $href
                = paging_href( $cgi, ( $page_num - 1 ) * $hits_per_page );
            $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
        }
    }

    # generate the "Next" link
    # "<" rather than "!=" so that no link is produced when the current
    # page is already at or past the last page.
    if ( $current_page < $last_page ) {
        my $href = paging_href( $cgi, $current_page * $hits_per_page );
        $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
    }

    # finish paging links
    $num_hits_info .= "</p>\n";
}

# blast it all out
print "Content-type: text/html\n\n";
print <<END_HTML;
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
    <meta http-equiv="Content-type"
        content="text/html;charset=ISO-8859-1">
    <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
    <title>KinoSearch: $q</title>
</head>

<body>

    <div id="navigation">
        <form id="usconSearch" action="">
            <strong>
            Search the <a href="$base_url/index.html">US Constitution</a>:
            </strong>
            <input type="text" name="q" id="q" value="$q">
            <input type="submit" value="=&gt;">
            <input type="hidden" name="offset" value="0">
        </form>
    </div><!--navigation-->

    <div id="bodytext">

    $report

    $num_hits_info

    <p style="font-size: smaller; color: #666">
        <em>Powered by
            <a href="http://www.rectangular.com/kinosearch/">
                KinoSearch
            </a>
        </em>
    </p>
    </div><!--bodytext-->

</body>

</html>
END_HTML

# Build the href for a paging link.
#
# Takes the CGI object and the offset the link should lead to.  Sets the
# "offset" parameter on the CGI object to that value (so the object is
# modified) and returns the script's relative url plus the resulting query
# string, HTML-escaped so it can be placed inside an href attribute.
# Setting the parameter, instead of patching digits in the query string,
# also works when the request carried no offset or a malformed one.
sub paging_href {
    my ( $cgi_obj, $new_offset ) = @_;
    $cgi_obj->param( -name => 'offset', -value => $new_offset );
    my $href = $cgi_obj->url( -relative => 1 ) . "?" . $cgi_obj->query_string;
    return CGI::escapeHTML($href);
}
281

COPYRIGHT

283       Copyright 2005-2010 Marvin Humphrey
284

LICENSE, DISCLAIMER, BUGS, etc.

286       See KinoSearch1 version 1.01.
287
288
289
290perl v5.36.0                      2022-07-22    KinoSearch1::Docs::Tutorial(3)