1Lucy::Docs::Cookbook::CustomQuery(3) User Contributed Perl Documentation Lucy::Docs::Cookbook::CustomQuery(3)
2
3
4
6 Lucy::Docs::Cookbook::CustomQuery - Sample subclass of Query
7
9 Explore Apache Lucy’s support for custom query types by creating a
10 “PrefixQuery” class to handle trailing wildcards.
11
12 my $prefix_query = PrefixQuery->new(
13 field => 'content',
14 query_string => 'foo*',
15 );
16 my $hits = $searcher->hits( query => $prefix_query );
17 ...
18
19 Query, Compiler, and Matcher
20 To add support for a new query type, we need three classes: a Query, a
21 Compiler, and a Matcher.
22
23 • PrefixQuery - a subclass of Query, and the only class that client
24 code will deal with directly.
25
26 • PrefixCompiler - a subclass of Compiler, whose primary role is to
27 compile a PrefixQuery to a PrefixMatcher.
28
29 • PrefixMatcher - a subclass of Matcher, which does the heavy
30 lifting: it applies the query to individual documents and assigns a
31 score to each match.
32
33 The PrefixQuery class on its own isn’t enough because a Query object’s
34 role is limited to expressing an abstract specification for the search.
35 A Query is basically nothing but metadata; execution is left to the
36 Query’s companion Compiler and Matcher.
37
38 Here’s a simplified sketch illustrating how a Searcher’s hits() method
39 ties together the three classes.
40
41 sub hits {    # Sketch of the pipeline: Query -> Compiler -> Matcher -> collected hits.
42 my $self = shift;
43 my $query = shift;
44 my @compiler_args = (
45 searcher => $self, boost => $query->get_boost,
46 );
47 my $compiler = $query->make_compiler(@compiler_args);
48 my @matcher_args = (
49 reader => $self->get_reader, need_score => 1,
50 );
51 my $matcher = $compiler->make_matcher(@matcher_args);
52 return [ $matcher->capture_hits ];    # array ref holding everything the matcher captured
53 }
54
55 PrefixQuery
56
57 Our PrefixQuery class will have two attributes: a query string and a
58 field name.
59
60 package PrefixQuery;
61 use base qw( Lucy::Search::Query );
62 use Carp;
63 use Scalar::Util qw( blessed );
64
65 # Inside-out member vars (one lexical hash per attribute, keyed by $$self).
66 my ( %query_string, %field );
67 # Hand-rolled read-only accessors.
68 sub get_query_string { my ($self) = @_; return $query_string{$$self}; }
69 sub get_field { my ($self) = @_; return $field{$$self}; }
70
71 PrefixQuery’s constructor collects and validates the attributes.
72
73 sub new {    # Construct a PrefixQuery from 'query_string' and 'field'; confesses on bad input.
74 my $class = shift;
75 my %params = @_;
76 my ( $query_string, $field ) = delete @params{qw( query_string field )};
77 my $self = $class->SUPER::new(%params);    # remaining params go to the parent constructor
78 if ( !defined $query_string ) {
79 confess("'query_string' param is required");
80 }
81 confess("Invalid query_string: '$query_string'")    # needs a trailing '*'
82 if $query_string !~ /\*\s*$/;
83 confess("'field' param is required") if !defined $field;
84 # Inside-out storage: each attribute hash is keyed by $$self.
85 ( $query_string{$$self}, $field{$$self} ) = ( $query_string, $field );
86 return $self;
87 }
88
89 Since this is an inside-out class, we’ll need a destructor:
90
91 sub DESTROY {    # Drop this object's inside-out entries, then run the parent destructor.
92 my ($self) = @_;
93 delete $_->{$$self} for \%query_string, \%field;
94 # Chain up only after our own entries are gone.
95 $self->SUPER::DESTROY;
96 }
97
98 The equals() method determines whether two Queries are logically
99 equivalent:
100
101 sub equals {    # Returns 1 only for another PrefixQuery with the same field and query string, else 0.
102 my ( $self, $candidate ) = @_;
103 return 0 if !blessed($candidate) || !$candidate->isa("PrefixQuery");
104 # Compare attribute by attribute, bailing out at the first mismatch.
105 return 0 if $field{$$self} ne $field{$$candidate};
106 return 0 if $query_string{$$self} ne $query_string{$$candidate};
107 return 1;
108 }
109
110 The last thing we’ll need is a make_compiler() factory method which
111 kicks out a subclass of Compiler.
112
113 sub make_compiler {    # Factory: wrap this query in a PrefixCompiler and return it.
114 my ( $self, %opts ) = @_;
115 my $is_subordinate = delete $opts{subordinate};
116 my $compiler = PrefixCompiler->new( %opts, parent => $self );
117 if ( !$is_subordinate ) { $compiler->normalize }    # normalize() is skipped for subordinate compilers
118 return $compiler;
119 }
120
121 PrefixCompiler
122
123 PrefixQuery’s make_compiler() method will be called internally at
124 search-time by objects which subclass Searcher – such as
125 IndexSearchers.
126
127 A Searcher is associated with a particular collection of documents.
128 These documents may all reside in one index, as with IndexSearcher, or
129 they may be spread out across multiple indexes on one or more machines,
130 as with LucyX::Remote::ClusterSearcher.
131
132 Searcher objects have access to certain statistical information about
133 the collections they represent; for instance, a Searcher can tell you
134 how many documents are in the collection…
135
136 my $maximum_number_of_docs_in_collection = $searcher->doc_max;
137
138 … or how many documents a specific term appears in:
139
140 my $term_appears_in_this_many_docs = $searcher->doc_freq(
141 field => 'content',
142 term => 'foo',
143 );
144
145 Such information can be used by sophisticated Compiler implementations
146 to assign more or less heft to individual queries or sub-queries.
147 However, we’re not going to bother with weighting for this demo; we’ll
148 just assign a fixed score of 1.0 to each matching document.
149
150 We don’t need to write a constructor, as it will suffice to inherit
151 new() from Lucy::Search::Compiler. The only method we need to
152 implement for PrefixCompiler is make_matcher().
153
154 package PrefixCompiler;
155 use base qw( Lucy::Search::Compiler );
156
157 sub make_matcher {    # Build a PrefixMatcher for the given reader; returns nothing when no term matches.
158 my ( $self, %args ) = @_;
159 my $seg_reader = $args{reader};
160
161 # Retrieve low-level components LexiconReader and PostingListReader.
162 my $lex_reader
163 = $seg_reader->obtain("Lucy::Index::LexiconReader");
164 my $plist_reader
165 = $seg_reader->obtain("Lucy::Index::PostingListReader");
166
167 # Acquire a Lexicon and seek it to our query string.
168 my $substring = $self->get_parent->get_query_string;
169 $substring =~ s/\*\s*$//;    # strip the trailing wildcard (and any whitespace after it) to get the bare prefix
170 my $field = $self->get_parent->get_field;
171 my $lexicon = $lex_reader->lexicon( field => $field );
172 return unless $lexicon;
173 $lexicon->seek($substring);
174
175 # Accumulate PostingLists for each matching term.
176 my @posting_lists;
177 while ( defined( my $term = $lexicon->get_term ) ) {
178 last unless $term =~ /^\Q$substring/;    # stop at the first term that no longer starts with the prefix
179 my $posting_list = $plist_reader->posting_list(
180 field => $field,
181 term => $term,
182 );
183 if ($posting_list) {
184 push @posting_lists, $posting_list;
185 }
186 last unless $lexicon->next;
187 }
188 return unless @posting_lists;
189
190 return PrefixMatcher->new( posting_lists => \@posting_lists );
191 }
192
193 PrefixCompiler gets access to a SegReader object when make_matcher()
194 gets called. From the SegReader and its sub-components LexiconReader
195 and PostingListReader, we acquire a Lexicon, scan through the Lexicon’s
196 unique terms, and acquire a PostingList for each term that matches our
197 prefix.
198
199 Each of these PostingList objects represents a set of documents which
200 match the query.
201
202 PrefixMatcher
203
204 The Matcher subclass is the most involved.
205
206 package PrefixMatcher;
207 use base qw( Lucy::Search::Matcher );
208
209 # Inside-out member vars.
210 my %doc_ids;
211 my %tick;
212
213 sub new {    # Merge the doc ids of every PostingList into one sorted, de-duplicated list.
214 my $class = shift;
215 my %params = @_;
216 my $plists = delete $params{posting_lists};
217 my $self = $class->SUPER::new(%params);
218 # Cheesy but simple union of the PostingList doc sets; hash keys discard duplicates.
219 my %seen;
220 foreach my $plist (@$plists) {
221 while ( my $id = $plist->next ) {
222 $seen{$id} = undef;
223 }
224 }
225 # Numeric sort: the doc ids must be handed out in order.
226 $doc_ids{$$self} = [ sort { $a <=> $b } keys %seen ];
227
228 # Position within the array of doc ids; starts one slot before the first entry.
229 $tick{$$self} = -1;
230
231 return $self;
232 }
233
234 sub DESTROY {    # Discard this matcher's inside-out state, then run the parent destructor.
235 my ($self) = @_;
236 delete $_->{$$self} for \%doc_ids, \%tick;
237 # Chain up only after our own entries are gone.
238 $self->SUPER::DESTROY;
239 }
240
241 The doc ids must be in order, or some will be ignored; hence the "sort"
242 above.
243
244 In addition to the constructor and destructor, there are three methods
245 that must be overridden.
246
247 next() advances the Matcher to the next valid matching doc.
248
249 sub next {    # Advance to the next doc id and return it; returns 0 once the list is used up.
250 my ($self) = @_;
251 my $position = ++$tick{$$self};
252 my $ids = $doc_ids{$$self};
253 return 0 unless $position < @$ids;
254 return $ids->[$position];
255 }
256
257 get_doc_id() returns the current document id, or 0 if the Matcher is
258 exhausted. (Document numbers start at 1, so 0 is a sentinel.)
259
260 sub get_doc_id {    # Current doc id, or 0 when there is none (iteration not started, or exhausted).
261 my $self = shift;
262 my $tick = $tick{$$self};    # -1 until next() has been called for the first time
263 my $doc_ids = $doc_ids{$$self};
264 return ( $tick >= 0 && $tick < scalar @$doc_ids ) ? $doc_ids->[$tick] : 0;    # lower bound stops index -1 wrapping to the last doc id
265 }
266
267 score() conveys the relevance score of the current match. We’ll just
268 return a fixed score of 1.0:
269
270 sub score { return 1.0; }    # every match gets the same fixed relevance
271
272 Usage
273 To get a basic feel for PrefixQuery, insert the FlatQueryParser module
274 described in CustomQueryParser (which supports PrefixQuery) into the
275 search.cgi sample app.
276
277 my $parser = FlatQueryParser->new( schema => $searcher->get_schema );
278 my $query = $parser->parse($q);
279
280 If you’re planning on using PrefixQuery in earnest, though, you may
281 want to change up analyzers to avoid stemming, because stemming –
282 another approach to prefix conflation – is not perfectly compatible
283 with prefix searches.
284
285 # Polyanalyzer with no SnowballStemmer.
286 my $analyzer = Lucy::Analysis::PolyAnalyzer->new(
287 analyzers => [
288 Lucy::Analysis::StandardTokenizer->new,
289 Lucy::Analysis::Normalizer->new,
290 ],
291 );
292
293
294
295perl v5.36.0 2023-01-20 Lucy::Docs::Cookbook::CustomQuery(3)