Lucy::Docs::Cookbook::CustomQueryParser(3pm)

1Lucy::Docs::Cookbook::CustomQueryParser(3)  User Contributed Perl Documentation  Lucy::Docs::Cookbook::CustomQueryParser(3)
2
3
4

NAME

6       Lucy::Docs::Cookbook::CustomQueryParser - Sample subclass of
7       QueryParser.
8

DESCRIPTION

10       Implement a custom search query language using a subclass of
11       QueryParser.
12
13   The language
14       At first, our query language will support only simple term queries and
15       phrases delimited by double quotes.  For simplicity's sake, it will not
16       support parenthetical groupings, boolean operators, or prepended
17       plus/minus.  The results for all subqueries will be unioned together --
18       i.e. joined using an OR -- which is usually the best approach for small-
19       to-medium-sized document collections.
20
21       Later, we'll add support for trailing wildcards.
22
23   Single-field parser
24       Our initial parser implementation will generate queries against a single
25       fixed field, "content", and it will analyze text using a fixed choice
26       of English EasyAnalyzer.  We won't subclass Lucy::Search::QueryParser
27       just yet.
28
29           package FlatQueryParser;
30           use Lucy::Search::TermQuery;
31           use Lucy::Search::PhraseQuery;
32           use Lucy::Search::ORQuery;
33           use Carp;
34
35           sub new {
36               my $analyzer = Lucy::Analysis::EasyAnalyzer->new(
37                   language => 'en',
38               );
39               return bless {
40                   field    => 'content',
41                   analyzer => $analyzer,
42               }, __PACKAGE__;
43           }
44
45       Some private helper subs for creating TermQuery and PhraseQuery objects
46       will help keep the size of our main parse() subroutine down:
47
48           sub _make_term_query {
49               my ( $self, $term ) = @_;
50               return Lucy::Search::TermQuery->new(
51                   field => $self->{field},
52                   term  => $term,
53               );
54           }
55
56           sub _make_phrase_query {
57               my ( $self, $terms ) = @_;
58               return Lucy::Search::PhraseQuery->new(
59                   field => $self->{field},
60                   terms => $terms,
61               );
62           }
63
64       Our private _tokenize() method treats double-quote delimited material
65       as a single token and splits on whitespace everywhere else.
66
67           sub _tokenize {
68               my ( $self, $query_string ) = @_;
69               my @tokens;
70               while ( length $query_string ) {
71                   if ( $query_string =~ s/^\s+// ) {
72                       next;    # skip whitespace
73                   }
74                   elsif ( $query_string =~ s/^("[^"]*(?:"|$))// ) {
75                       push @tokens, $1;    # double-quoted phrase
76                   }
77                   else {
78                       $query_string =~ s/(\S+)//;
79                       push @tokens, $1;    # single word
80                   }
81               }
82               return \@tokens;
83           }
84
85       The main parsing routine creates an array of tokens by calling
86       _tokenize(), runs the tokens through the EasyAnalyzer, creates
87       TermQuery or PhraseQuery objects according to how many tokens emerge
88       from the EasyAnalyzer's split() method, and adds each of the sub-
89       queries to the primary ORQuery.
90
91           sub parse {
92               my ( $self, $query_string ) = @_;
93               my $tokens   = $self->_tokenize($query_string);
94               my $analyzer = $self->{analyzer};
95               my $or_query = Lucy::Search::ORQuery->new;
96
97               for my $token (@$tokens) {
98                   if ( $token =~ s/^"// ) {
99                       $token =~ s/"$//;
100                       my $terms = $analyzer->split($token);
101                       my $query = $self->_make_phrase_query($terms);
102                       $or_query->add_child($query);
103                   }
104                   else {
105                       my $terms = $analyzer->split($token);
106                       if ( @$terms == 1 ) {
107                           my $query = $self->_make_term_query( $terms->[0] );
108                           $or_query->add_child($query);
109                       }
110                       elsif ( @$terms > 1 ) {
111                           my $query = $self->_make_phrase_query($terms);
112                           $or_query->add_child($query);
113                       }
114                   }
115               }
116
117               return $or_query;
118           }
119
120   Multi-field parser
121       Most often, the end user will want their search query to match not only
122       a single "content" field, but also "title" and so on.  To make that
123       happen, we have to turn queries such as this...
124
125           foo AND NOT bar
126
127       ... into the logical equivalent of this:
128
129           (title:foo OR content:foo) AND NOT (title:bar OR content:bar)
130
131       Rather than continue with our own from-scratch parser class and write
132       the routines to accomplish that expansion, we're now going to subclass
133       Lucy::Search::QueryParser and take advantage of some of its existing
134       methods.
135
136       Our first parser implementation had the "content" field name and the
137       choice of English EasyAnalyzer hard-coded for simplicity, but we don't
138       need to do that once we subclass Lucy::Search::QueryParser.
139       QueryParser's constructor -- which we will inherit, allowing us to
140       eliminate our own constructor -- requires a Schema which conveys field
141       and Analyzer information, so we can just defer to that.
142
143           package FlatQueryParser;
144           use base qw( Lucy::Search::QueryParser );
145           use Lucy::Search::TermQuery;
146           use Lucy::Search::PhraseQuery;
147           use Lucy::Search::ORQuery;
148           use PrefixQuery;
149           use Carp;
150
151           # Inherit new()
152
153       We're also going to jettison our _make_term_query() and
154       _make_phrase_query() helper subs and chop our parse() subroutine way
155       down.  Our revised parse() routine will generate
156       Lucy::Search::LeafQuery objects instead of TermQueries and
157       PhraseQueries:
158
159           sub parse {
160               my ( $self, $query_string ) = @_;
161               my $tokens = $self->_tokenize($query_string);
162               my $or_query = Lucy::Search::ORQuery->new;
163               for my $token (@$tokens) {
164                   my $leaf_query = Lucy::Search::LeafQuery->new( text => $token );
165                   $or_query->add_child($leaf_query);
166               }
167               return $self->expand($or_query);
168           }
169
170       The magic happens in QueryParser's expand() method, which walks the
171       ORQuery object we supply to it looking for LeafQuery objects, and calls
172       expand_leaf() for each one it finds.  expand_leaf() performs field-
173       specific analysis, decides whether each query should be a TermQuery or
174       a PhraseQuery, and if multiple fields are required, creates an ORQuery
175       which multiplies out e.g. "foo" into "(title:foo OR content:foo)".
176
177   Extending the query language
178       To add support for trailing wildcards to our query language, we need to
179       override expand_leaf() to accommodate PrefixQuery, while deferring to
180       the parent class implementation on TermQuery and PhraseQuery.
181
182           sub expand_leaf {
183               my ( $self, $leaf_query ) = @_;
184               my $text = $leaf_query->get_text;
185               if ( $text =~ /\*$/ ) {
186                   my $or_query = Lucy::Search::ORQuery->new;
187                   for my $field ( @{ $self->get_fields } ) {
188                       my $prefix_query = PrefixQuery->new(
189                           field        => $field,
190                           query_string => $text,
191                       );
192                       $or_query->add_child($prefix_query);
193                   }
194                   return $or_query;
195               }
196               else {
197                   return $self->SUPER::expand_leaf($leaf_query);
198               }
199           }
200
201       Ordinarily, those asterisks would have been stripped when running
202       tokens through the EasyAnalyzer -- query strings containing "foo*" would
203       produce TermQueries for the term "foo".  Our override intercepts tokens
204       with trailing asterisks and processes them as PrefixQueries before
205       "SUPER::expand_leaf" can discard them, so that a search for "foo*" can
206       match "food", "foosball", and so on.
207
208   Usage
209       Insert our custom parser into the search.cgi sample app to get a feel
210       for how it behaves:
211
212           my $parser = FlatQueryParser->new( schema => $searcher->get_schema );
213           my $query  = $parser->parse( decode( 'UTF-8', $cgi->param('q') || '' ) );
214           my $hits   = $searcher->hits(
215               query      => $query,
216               offset     => $offset,
217               num_wanted => $page_size,
218           );
219           ...
220
221
222
223perl v5.34.0                      2022-01-21                      Lucy::Docs::Cookbook::CustomQueryParser(3)