1Lucy::Docs::Tutorial::SimpleTutorial(3pm)  User Contributed Perl Documentation  Lucy::Docs::Tutorial::SimpleTutorial(3pm)
2
3
4
5 NAME
6 Lucy::Docs::Tutorial::SimpleTutorial - Bare-bones search app.
7
8 DESCRIPTION
9 Setup
10 Copy the text presentation of the US Constitution from the "sample"
11 directory of the Apache Lucy distribution to the base level of your web
12 server’s "htdocs" directory.
13
14 $ cp -R sample/us_constitution /usr/local/apache2/htdocs/
15
16 Indexing: indexer.pl
17 Our first task will be to create an application called "indexer.pl"
18 which builds a searchable “inverted index” from a collection of
19 documents.
20
21 After we specify some configuration variables and load all necessary
22 modules…
23
24 #!/usr/local/bin/perl
25 use strict;
26 use warnings;
27
28 # (Change configuration variables as needed.)
29 my $path_to_index = '/path/to/index';
30 my $uscon_source = '/usr/local/apache2/htdocs/us_constitution';
31
32 use Lucy::Simple;
33 use File::Spec::Functions qw( catfile );
34
35 … we’ll start by creating a Lucy::Simple object, telling it where we’d
36 like the index to be located and the language of the source material.
37
38 my $lucy = Lucy::Simple->new(
39 path => $path_to_index,
40 language => 'en',
41 );
42
43 Next, we’ll add a subroutine which parses our sample documents.
44
45 # Parse a file from our US Constitution collection and return a hashref with
46 # the fields title, body, and url.
47 sub parse_file {
48 my $filename = shift;
49 my $filepath = catfile( $uscon_source, $filename );
50 open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!";
51 my $text = do { local $/; <$fh> }; # slurp file content
52 $text =~ /\A(.+?)^\s+(.*)/ms
53 or die "Can't extract title/bodytext from '$filepath'";
54 my $title = $1;
55 my $bodytext = $2;
56 return {
57 title => $title,
58 content => $bodytext,
59 url => "/us_constitution/$filename",
60 };
61 }
62
63 Add some elementary directory reading code…
64
65 # Collect names of source files.
66 opendir( my $dh, $uscon_source )
67 or die "Couldn't opendir '$uscon_source': $!";
68 my @filenames = grep { $_ =~ /\.txt/ } readdir $dh;
69
70 … and now we’re ready for the meat of indexer.pl – which occupies
71 exactly one line of code.
72
73 foreach my $filename (@filenames) {
74 my $doc = parse_file($filename);
75 $lucy->add_doc($doc); # ta-da!
76 }
77
78 Search: search.cgi
79 As with our indexing app, the bulk of the code in our search script
80 won’t be Lucy-specific.
81
82 The beginning is dedicated to CGI processing and configuration.
83
84 #!/usr/local/bin/perl -T
85 use strict;
86 use warnings;
87
88 # (Change configuration variables as needed.)
89 my $path_to_index = '/path/to/index';
90
91 use CGI;
92 use List::Util qw( max min );
93 use POSIX qw( ceil );
94 use Encode qw( decode );
95 use Lucy::Simple;
96
97 my $cgi = CGI->new;
98 my $q = decode( "UTF-8", $cgi->param('q') || '' );
99 my $offset = decode( "UTF-8", $cgi->param('offset') || 0 );
100 my $page_size = 10;
101
102 Once that’s out of the way, we create our Lucy::Simple object and feed
103 it a query string.
104
105 my $lucy = Lucy::Simple->new(
106 path => $path_to_index,
107 language => 'en',
108 );
109 my $hit_count = $lucy->search(
110 query => $q,
111 offset => $offset,
112 num_wanted => $page_size,
113 );
114
115 The value returned by search() is the total number of documents in the
116 collection which matched the query. We’ll show this hit count to the
117 user, and also use it in conjunction with the parameters "offset" and
118 "num_wanted" to break up results into “pages” of manageable size.
119
120 Calling search() on our Simple object turns it into an iterator.
121 Invoking next() now returns hits one at a time as HitDoc objects,
122 starting with the most relevant.
123
124 # Create result list.
125 my $report = '';
126 while ( my $hit = $lucy->next ) {
127 my $score = sprintf( "%0.3f", $hit->get_score );
128 $report .= qq|
129 <p>
130 <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
131 <em>$score</em>
132 <br>
133 <span class="excerptURL">$hit->{url}</span>
134 </p>
135 |;
136 }
137
138 The rest of the script is just text wrangling.
139
140 #---------------------------------------------------------------#
141 # No tutorial material below this point - just html generation. #
142 #---------------------------------------------------------------#
143
144 # Generate paging links and hit count, print and exit.
145 my $paging_links = generate_paging_info( $q, $hit_count );
146 blast_out_content( $q, $report, $paging_links );
147
148 # Create html fragment with links for paging through results n-at-a-time.
149 sub generate_paging_info {
150 my ( $query_string, $total_hits ) = @_;
151 my $escaped_q = CGI::escapeHTML($query_string);
152 my $paging_info;
153 if ( !length $query_string ) {
154 # No query? No display.
155 $paging_info = '';
156 }
157 elsif ( $total_hits == 0 ) {
158 # Alert the user that their search failed.
159 $paging_info
160 = qq|<p>No matches for <strong>$escaped_q</strong></p>|;
161 }
162 else {
163 # Calculate the nums for the first and last hit to display.
164 my $last_result = min( ( $offset + $page_size ), $total_hits );
165 my $first_result = min( ( $offset + 1 ), $last_result );
166
167 # Display the result nums, start paging info.
168 $paging_info = qq|
169 <p>
170 Results <strong>$first_result-$last_result</strong>
171 of <strong>$total_hits</strong>
172 for <strong>$escaped_q</strong>.
173 </p>
174 <p>
175 Results Page:
176 |;
177
178 # Calculate first and last hits pages to display / link to.
179 my $current_page = int( $first_result / $page_size ) + 1;
180 my $last_page = ceil( $total_hits / $page_size );
181 my $first_page = max( 1, ( $current_page - 9 ) );
182 $last_page = min( $last_page, ( $current_page + 10 ) );
183
184 # Create a url for use in paging links.
185 my $href = $cgi->url( -relative => 1 );
186 $href .= "?q=" . CGI::escape($query_string);
187 $href .= ";offset=" . CGI::escape($offset);
188
189 # Generate the "Prev" link.
190 if ( $current_page > 1 ) {
191 my $new_offset = ( $current_page - 2 ) * $page_size;
192 $href =~ s/(?<=offset=)\d+/$new_offset/;
193 $paging_info .= qq|<a href="$href"><= Prev</a>\n|;
194 }
195
196 # Generate paging links.
197 for my $page_num ( $first_page .. $last_page ) {
198 if ( $page_num == $current_page ) {
199 $paging_info .= qq|$page_num \n|;
200 }
201 else {
202 my $new_offset = ( $page_num - 1 ) * $page_size;
203 $href =~ s/(?<=offset=)\d+/$new_offset/;
204 $paging_info .= qq|<a href="$href">$page_num</a>\n|;
205 }
206 }
207
208 # Generate the "Next" link.
209 if ( $current_page != $last_page ) {
210 my $new_offset = $current_page * $page_size;
211 $href =~ s/(?<=offset=)\d+/$new_offset/;
212 $paging_info .= qq|<a href="$href">Next =></a>\n|;
213 }
214
215 # Close tag.
216 $paging_info .= "</p>\n";
217 }
218
219 return $paging_info;
220 }
221
222 # Print content to output.
223 sub blast_out_content {
224 my ( $query_string, $hit_list, $paging_info ) = @_;
225 my $escaped_q = CGI::escapeHTML($query_string);
226 binmode( STDOUT, ":encoding(UTF-8)" );
227 print qq|Content-type: text/html; charset=UTF-8\n\n|;
228 print qq|
229 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
230 "http://www.w3.org/TR/html4/loose.dtd">
231 <html>
232 <head>
233 <meta http-equiv="Content-type"
234 content="text/html;charset=UTF-8">
235 <link rel="stylesheet" type="text/css"
236 href="/us_constitution/uscon.css">
237 <title>Lucy: $escaped_q</title>
238 </head>
239
240 <body>
241
242 <div id="navigation">
243 <form id="usconSearch" action="">
244 <strong>
245 Search the
246 <a href="/us_constitution/index.html">US Constitution</a>:
247 </strong>
248 <input type="text" name="q" id="q" value="$escaped_q">
249 <input type="submit" value="=>">
250 </form>
251 </div><!--navigation-->
252
253 <div id="bodytext">
254
255 $hit_list
256
257 $paging_info
258
259 <p style="font-size: smaller; color: #666">
260 <em>
261 Powered by <a href="http://lucy.apache.org/"
262 >Apache Lucy<small><sup>TM</sup></small></a>
263 </em>
264 </p>
265 </div><!--bodytext-->
266
267 </body>
268
269 </html>
270 |;
271 }
272
273 OK… now what?
274 Lucy::Simple is perfectly adequate for some tasks, but it’s not very
275 flexible. Many people find that it doesn’t do at least one or two
276 things they can’t live without.
277
278 In our next tutorial chapter, BeyondSimpleTutorial, we’ll rewrite our
279 indexing and search scripts using the classes that Lucy::Simple hides
280 from view, opening up the possibilities for expansion; then, we’ll
281 spend the rest of the tutorial chapters exploring these possibilities.
282
283
284
285perl v5.38.0  2023-07-20  Lucy::Docs::Tutorial::SimpleTutorial(3pm)