1 Lucy::Docs::Cookbook::CustomQueryParser(3) User Contributed Perl Documentation Lucy::Docs::Cookbook::CustomQueryParser(3)
2
3
4
6 Lucy::Docs::Cookbook::CustomQueryParser - Sample subclass of
7 QueryParser.
8
10 Implement a custom search query language using a subclass of
11 QueryParser.
12
13 The language
14 At first, our query language will support only simple term queries and
15 phrases delimited by double quotes. For simplicity’s sake, it will not
16 support parenthetical groupings, boolean operators, or prepended
17 plus/minus. The results for all subqueries will be unioned together –
18 i.e. joined using an OR – which is usually the best approach for small-
19 to-medium-sized document collections.
20
21 Later, we’ll add support for trailing wildcards.
22
23 Single-field parser
24 Our initial parser implementation will generate queries against a single
25 fixed field, “content”, and it will analyze text using a fixed choice
26 of English EasyAnalyzer. We won’t subclass Lucy::Search::QueryParser
27 just yet.
28
29 package FlatQueryParser;
30 use Lucy::Search::TermQuery;
31 use Lucy::Search::PhraseQuery;
32 use Lucy::Search::ORQuery;
33 use Carp;
34
# Construct a parser hard-wired to the 'content' field and an English
# EasyAnalyzer.  Takes no arguments besides the invocant.
#
# The object is blessed into the invocant's class (class name or existing
# instance) rather than a hard-coded package, so a subclass that inherits
# this constructor gets instances of its own class.
#
# Returns: a blessed hashref with the keys 'field' and 'analyzer'.
sub new {
    my $invocant = shift;

    # Fall back to this package when called as a plain function, which
    # preserves the behavior of the original hard-coded bless.
    my $class = ref($invocant) || $invocant || __PACKAGE__;

    my $analyzer = Lucy::Analysis::EasyAnalyzer->new(
        language => 'en',
    );
    return bless {
        field    => 'content',
        analyzer => $analyzer,
    }, $class;
}
44
45 Some private helper subs for creating TermQuery and PhraseQuery objects
46 will help keep the size of our main parse() subroutine down:
47
# Wrap a single term in a TermQuery aimed at this parser's field.
#
#   $term - the term text to search for.
#
# Returns whatever Lucy::Search::TermQuery->new returns.
sub _make_term_query {
    my $self = shift;
    my $term = shift;

    # Keep the named arguments in an ordered list so they reach the
    # constructor in the same order as before.
    my @ctor_args = (
        field => $self->{field},
        term  => $term,
    );
    return Lucy::Search::TermQuery->new(@ctor_args);
}
55
# Wrap a list of terms in a PhraseQuery aimed at this parser's field.
#
#   $terms - array reference holding the phrase's terms, in order.
#
# Returns whatever Lucy::Search::PhraseQuery->new returns.
sub _make_phrase_query {
    my $self  = shift;
    my $terms = shift;

    # Keep the named arguments in an ordered list so they reach the
    # constructor in the same order as before.
    my @ctor_args = (
        field => $self->{field},
        terms => $terms,
    );
    return Lucy::Search::PhraseQuery->new(@ctor_args);
}
63
64 Our private _tokenize() method treats double-quote delimited material
65 as a single token and splits on whitespace everywhere else.
66
# Break a raw query string into tokens.
#
# A token is either a double-quoted span (quotes retained; an unterminated
# quote runs to the end of the string) or a run of non-whitespace
# characters.  Whitespace between tokens is discarded.
#
#   $query_string - the raw query text.
#
# Returns an array reference of token strings in input order.
sub _tokenize {
    my ( $self, $query_string ) = @_;
    my $remaining = $query_string;
    my @found;

    while ( length $remaining ) {

        # Drop leading whitespace, then re-check the length.
        next if $remaining =~ s/^\s+//;

        # Try a double-quoted phrase first; otherwise take a single word.
        unless ( $remaining =~ s/^("[^"]*(?:"|$))// ) {
            $remaining =~ s/(\S+)//;
        }
        push @found, $1;
    }

    return \@found;
}
84
85 The main parsing routine creates an array of tokens by calling
86 _tokenize(), runs the tokens through the EasyAnalyzer, creates
87 TermQuery or PhraseQuery objects according to how many tokens emerge
88 from the EasyAnalyzer’s split() method, and adds each of the sub-
89 queries to the primary ORQuery.
90
# Turn a raw query string into an ORQuery of TermQuery/PhraseQuery children.
#
#   $query_string - the raw query text typed by the user.
#
# Each token from _tokenize() is run through the analyzer's split():
#   * a double-quoted token always becomes a PhraseQuery;
#   * an unquoted token becomes a TermQuery when analysis yields exactly
#     one term, or a PhraseQuery when it yields several;
#   * a token that yields no terms (e.g. an empty pair of quotes)
#     contributes nothing.
#
# Returns the Lucy::Search::ORQuery holding all subqueries.
sub parse {
    my ( $self, $query_string ) = @_;
    my $tokens   = $self->_tokenize($query_string);
    my $analyzer = $self->{analyzer};
    my $or_query = Lucy::Search::ORQuery->new;

    for my $token (@$tokens) {

        # Stripping the opening quote also tells us whether the token was
        # a quoted phrase; only then strip the closing quote.
        my $is_phrase = ( $token =~ s/^"// ) ? 1 : 0;
        $token =~ s/"$// if $is_phrase;

        my $terms = $analyzer->split($token);

        # Nothing survived analysis, so there is nothing to search for.
        next unless @$terms;

        # The original added the undeclared $phrase_query in the quoted
        # branch; build one $query here and add that.
        my $query
            = ( $is_phrase || @$terms > 1 )
            ? $self->_make_phrase_query($terms)
            : $self->_make_term_query( $terms->[0] );
        $or_query->add_child($query);
    }

    return $or_query;
}
119
120 Multi-field parser
121 Most often, the end user will want their search query to match not only
122 a single ‘content’ field, but also ‘title’ and so on. To make that
123 happen, we have to turn queries such as this…
124
125 foo AND NOT bar
126
127 … into the logical equivalent of this:
128
129 (title:foo OR content:foo) AND NOT (title:bar OR content:bar)
130
131 Rather than continue with our own from-scratch parser class and write
132 the routines to accomplish that expansion, we’re now going to subclass
133 Lucy::Search::QueryParser and take advantage of some of its existing
134 methods.
135
136 Our first parser implementation had the “content” field name and the
137 choice of English EasyAnalyzer hard-coded for simplicity, but we don’t
138 need to do that once we subclass Lucy::Search::QueryParser.
139 QueryParser’s constructor – which we will inherit, allowing us to
140 eliminate our own constructor – requires a Schema which conveys field
141 and Analyzer information, so we can just defer to that.
142
143 package FlatQueryParser;
144 use base qw( Lucy::Search::QueryParser );
145 use Lucy::Search::TermQuery;
146 use Lucy::Search::PhraseQuery;
147 use Lucy::Search::ORQuery;
148 use PrefixQuery;
149 use Carp;
150
151 # Inherit new()
152
153 We’re also going to jettison our _make_term_query() and
154 _make_phrase_query() helper subs and chop our parse() subroutine way
155 down. Our revised parse() routine will generate
156 Lucy::Search::LeafQuery objects instead of TermQueries and
157 PhraseQueries:
158
# Parse a raw query string into a query tree.
#
# Every token from _tokenize() is wrapped in a LeafQuery and all leaves are
# collected under one ORQuery.  That tree is then handed to expand(), and
# whatever expand() returns is returned to the caller.
#
#   $query_string - the raw query text typed by the user.
sub parse {
    my $self         = shift;
    my $query_string = shift;

    my $tokens = $self->_tokenize($query_string);
    my $union  = Lucy::Search::ORQuery->new;

    # One leaf per token, added in token order.
    $union->add_child( scalar Lucy::Search::LeafQuery->new( text => $_ ) )
        for @{$tokens};

    return $self->expand($union);
}
169
170 The magic happens in QueryParser’s expand() method, which walks the
171 ORQuery object we supply to it looking for LeafQuery objects, and calls
172 expand_leaf() for each one it finds. expand_leaf() performs field-
173 specific analysis, decides whether each query should be a TermQuery or
174 a PhraseQuery, and if multiple fields are required, creates an ORQuery
175 which multiplies out e.g. "foo" into "(title:foo OR content:foo)".
176
177 Extending the query language
178 To add support for trailing wildcards to our query language, we need to
179 override expand_leaf() to accommodate PrefixQuery, while deferring to
180 the parent class implementation on TermQuery and PhraseQuery.
181
# Expand one LeafQuery, adding support for trailing-wildcard tokens.
#
#   $leaf_query - the leaf to expand; its text is read via get_text().
#
# When the text ends in '*', returns an ORQuery holding one PrefixQuery per
# field reported by get_fields().  Any other text is passed to the parent
# class's expand_leaf() and that result is returned.
sub expand_leaf {
    my ( $self, $leaf_query ) = @_;
    my $text = $leaf_query->get_text;

    # No trailing asterisk: let the parent class handle it.
    return $self->SUPER::expand_leaf($leaf_query)
        unless $text =~ /\*$/;

    my $per_field = Lucy::Search::ORQuery->new;
    foreach my $field_name ( @{ $self->get_fields } ) {
        $per_field->add_child(
            scalar PrefixQuery->new(
                field        => $field_name,
                query_string => $text,
            )
        );
    }
    return $per_field;
}
200
201 Ordinarily, those asterisks would have been stripped when running
202 tokens through the EasyAnalyzer – query strings containing “foo*” would
203 produce TermQueries for the term “foo”. Our override intercepts tokens
204 with trailing asterisks and processes them as PrefixQueries before
205 "SUPER::expand_leaf" can discard them, so that a search for “foo*” can
206 match “food”, “foosball”, and so on.
207
208 Usage
209 Insert our custom parser into the search.cgi sample app to get a feel
210 for how it behaves:
211
212 my $parser = FlatQueryParser->new( schema => $searcher->get_schema );
213 my $query = $parser->parse( decode( 'UTF-8', $cgi->param('q') || '' ) );
214 my $hits = $searcher->hits(
215 query => $query,
216 offset => $offset,
217 num_wanted => $page_size,
218 );
219 ...
220
221
222
223 perl v5.36.0 2023-01-20 Lucy::Docs::Cookbook::CustomQueryParser(3)