Lucy::Docs::Tutorial::SimpleTutorial(3pm)

1Lucy::Docs::Tutorial::SimpleTutorial(3pm)  User Contributed Perl Documentation  Lucy::Docs::Tutorial::SimpleTutorial(3pm)
2
3
4

NAME

6       Lucy::Docs::Tutorial::SimpleTutorial - Bare-bones search app.
7

DESCRIPTION

9   Setup
10       Copy the text presentation of the US Constitution from the "sample"
11       directory of the Apache Lucy distribution to the base level of your web
12       server’s "htdocs" directory.
13
14           $ cp -R sample/us_constitution /usr/local/apache2/htdocs/
15
16   Indexing: indexer.pl
17       Our first task will be to create an application called "indexer.pl"
18       which builds a searchable “inverted index” from a collection of
19       documents.
20
21       After we specify some configuration variables and load all necessary
22       modules…
23
24           #!/usr/local/bin/perl
25           use strict;
26           use warnings;
27
28           # (Change configuration variables as needed.)
29           my $path_to_index = '/path/to/index';
30           my $uscon_source  = '/usr/local/apache2/htdocs/us_constitution';
31
32           use Lucy::Simple;
33           use File::Spec::Functions qw( catfile );
34
35       … we’ll start by creating a Lucy::Simple object, telling it where we’d
36       like the index to be located and the language of the source material.
37
38           my $lucy = Lucy::Simple->new(
39               path     => $path_to_index,
40               language => 'en',
41           );
42
43       Next, we’ll add a subroutine which parses our sample documents.
44
45           # Parse a file from our US Constitution collection and return a hashref with
46           # the fields title, content, and url.
47           sub parse_file {
48               my $filename = shift;
49               my $filepath = catfile( $uscon_source, $filename );
50               open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!";
51               my $text = do { local $/; <$fh> };    # slurp file content
52               $text =~ /\A(.+?)^\s+(.*)/ms
53                   or die "Can't extract title/bodytext from '$filepath'";
54               my $title    = $1;
55               my $bodytext = $2;
56               return {
57                   title    => $title,
58                   content  => $bodytext,
59                   url      => "/us_constitution/$filename",
60               };
61           }
62
63       Add some elementary directory reading code…
64
65           # Collect names of source files.
66           opendir( my $dh, $uscon_source )
67               or die "Couldn't opendir '$uscon_source': $!";
68           my @filenames = grep { $_ =~ /\.txt/ } readdir $dh;
69
70       … and now we’re ready for the meat of indexer.pl – which occupies
71       exactly one line of code.
72
73           foreach my $filename (@filenames) {
74               my $doc = parse_file($filename);
75               $lucy->add_doc($doc);  # ta-da!
76           }
77
78   Search: search.cgi
79       As with our indexing app, the bulk of the code in our search script
80       won’t be Lucy-specific.
81
82       The beginning is dedicated to CGI processing and configuration.
83
84           #!/usr/local/bin/perl -T
85           use strict;
86           use warnings;
87
88           # (Change configuration variables as needed.)
89           my $path_to_index = '/path/to/index';
90
91           use CGI;
92           use List::Util qw( max min );
93           use POSIX qw( ceil );
94           use Encode qw( decode );
95           use Lucy::Simple;
96
97           my $cgi       = CGI->new;
98           my $q         = decode( "UTF-8", $cgi->param('q') || '' );
99           my $offset    = decode( "UTF-8", $cgi->param('offset') || 0 );
100           my $page_size = 10;
101
102       Once that’s out of the way, we create our Lucy::Simple object and feed
103       it a query string.
104
105           my $lucy = Lucy::Simple->new(
106               path     => $path_to_index,
107               language => 'en',
108           );
109           my $hit_count = $lucy->search(
110               query      => $q,
111               offset     => $offset,
112               num_wanted => $page_size,
113           );
114
115       The value returned by search() is the total number of documents in the
116       collection which matched the query.  We’ll show this hit count to the
117       user, and also use it in conjunction with the parameters "offset" and
118       "num_wanted" to break up results into “pages” of manageable size.
119
120       Calling search() on our Simple object turns it into an iterator.
121       Invoking next() now returns hits one at a time as HitDoc objects,
122       starting with the most relevant.
123
124           # Create result list.
125           my $report = '';
126           while ( my $hit = $lucy->next ) {
127               my $score = sprintf( "%0.3f", $hit->get_score );
128               $report .= qq|
129                   <p>
130                     <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
131                     <em>$score</em>
132                     <br>
133                     <span class="excerptURL">$hit->{url}</span>
134                   </p>
135                   |;
136           }
137
138       The rest of the script is just text wrangling.
139
140           #---------------------------------------------------------------#
141           # No tutorial material below this point - just html generation. #
142           #---------------------------------------------------------------#
143
144           # Generate paging links and hit count, print and exit.
145           my $paging_links = generate_paging_info( $q, $hit_count );
146           blast_out_content( $q, $report, $paging_links );
147
148           # Create html fragment with links for paging through results n-at-a-time.
149           sub generate_paging_info {
150               my ( $query_string, $total_hits ) = @_;
151               my $escaped_q = CGI::escapeHTML($query_string);
152               my $paging_info;
153               if ( !length $query_string ) {
154                   # No query?  No display.
155                   $paging_info = '';
156               }
157               elsif ( $total_hits == 0 ) {
158                   # Alert the user that their search failed.
159                   $paging_info
160                       = qq|<p>No matches for <strong>$escaped_q</strong></p>|;
161               }
162               else {
163                   # Calculate the nums for the first and last hit to display.
164                   my $last_result = min( ( $offset + $page_size ), $total_hits );
165                   my $first_result = min( ( $offset + 1 ), $last_result );
166
167                   # Display the result nums, start paging info.
168                   $paging_info = qq|
169                       <p>
170                           Results <strong>$first_result-$last_result</strong>
171                           of <strong>$total_hits</strong>
172                           for <strong>$escaped_q</strong>.
173                       </p>
174                       <p>
175                           Results Page:
176                       |;
177
178                   # Calculate first and last hits pages to display / link to.
179                   my $current_page = int( $first_result / $page_size ) + 1;
180                   my $last_page    = ceil( $total_hits / $page_size );
181                   my $first_page   = max( 1, ( $current_page - 9 ) );
182                   $last_page = min( $last_page, ( $current_page + 10 ) );
183
184                   # Create a url for use in paging links.
185                   my $href = $cgi->url( -relative => 1 );
186                   $href .= "?q=" . CGI::escape($query_string);
187                   $href .= ";offset=" . CGI::escape($offset);
188
189                   # Generate the "Prev" link.
190                   if ( $current_page > 1 ) {
191                       my $new_offset = ( $current_page - 2 ) * $page_size;
192                       $href =~ s/(?<=offset=)\d+/$new_offset/;
193                       $paging_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
194                   }
195
196                   # Generate paging links.
197                   for my $page_num ( $first_page .. $last_page ) {
198                       if ( $page_num == $current_page ) {
199                           $paging_info .= qq|$page_num \n|;
200                       }
201                       else {
202                           my $new_offset = ( $page_num - 1 ) * $page_size;
203                           $href =~ s/(?<=offset=)\d+/$new_offset/;
204                           $paging_info .= qq|<a href="$href">$page_num</a>\n|;
205                       }
206                   }
207
208                   # Generate the "Next" link.
209                   if ( $current_page != $last_page ) {
210                       my $new_offset = $current_page * $page_size;
211                       $href =~ s/(?<=offset=)\d+/$new_offset/;
212                       $paging_info .= qq|<a href="$href">Next =&gt;</a>\n|;
213                   }
214
215                   # Close tag.
216                   $paging_info .= "</p>\n";
217               }
218
219               return $paging_info;
220           }
221
222           # Print content to output.
223           sub blast_out_content {
224               my ( $query_string, $hit_list, $paging_info ) = @_;
225               my $escaped_q = CGI::escapeHTML($query_string);
226               binmode( STDOUT, ":encoding(UTF-8)" );
227               print qq|Content-type: text/html; charset=UTF-8\n\n|;
228               print qq|
229           <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
230               "http://www.w3.org/TR/html4/loose.dtd">
231           <html>
232           <head>
233             <meta http-equiv="Content-type"
234               content="text/html;charset=UTF-8">
235             <link rel="stylesheet" type="text/css"
236               href="/us_constitution/uscon.css">
237             <title>Lucy: $escaped_q</title>
238           </head>
239
240           <body>
241
242             <div id="navigation">
243               <form id="usconSearch" action="">
244                 <strong>
245                   Search the
246                   <a href="/us_constitution/index.html">US Constitution</a>:
247                 </strong>
248                 <input type="text" name="q" id="q" value="$escaped_q">
249                 <input type="submit" value="=&gt;">
250               </form>
251             </div><!--navigation-->
252
253             <div id="bodytext">
254
255             $hit_list
256
257             $paging_info
258
259               <p style="font-size: smaller; color: #666">
260                 <em>
261                   Powered by <a href="http://lucy.apache.org/"
262                   >Apache Lucy<small><sup>TM</sup></small></a>
263                 </em>
264               </p>
265             </div><!--bodytext-->
266
267           </body>
268
269           </html>
270           |;
271           }
272
273   OK… now what?
274       Lucy::Simple is perfectly adequate for some tasks, but it’s not very
275       flexible.  Many people find that it doesn’t do at least one or two
276       things they can’t live without.
277
278       In our next tutorial chapter, BeyondSimpleTutorial, we’ll rewrite our
279       indexing and search scripts using the classes that Lucy::Simple hides
280       from view, opening up the possibilities for expansion; then, we’ll
281       spend the rest of the tutorial chapters exploring these possibilities.
282
283
284
285perl v5.38.0                      2023-07-20      Lucy::Docs::Tutorial::SimpleTutorial(3pm)