| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package KinoSearch1::QueryParser::QueryParser; |
|
2
|
18
|
|
|
18
|
|
30973
|
use strict; |
|
|
18
|
|
|
|
|
39
|
|
|
|
18
|
|
|
|
|
623
|
|
|
3
|
18
|
|
|
18
|
|
96
|
use warnings; |
|
|
18
|
|
|
|
|
40
|
|
|
|
18
|
|
|
|
|
464
|
|
|
4
|
18
|
|
|
18
|
|
866
|
use KinoSearch1::Util::ToolSet; |
|
|
18
|
|
|
|
|
40
|
|
|
|
18
|
|
|
|
|
2729
|
|
|
5
|
18
|
|
|
18
|
|
106
|
use base qw( KinoSearch1::Util::Class ); |
|
|
18
|
|
|
|
|
49
|
|
|
|
18
|
|
|
|
|
2841
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
BEGIN {
    # Runs at compile time: hand the list of member names and their default
    # values to init_instance_vars (presumably supplied by the base class --
    # confirm against KinoSearch1::Util::Class).
    __PACKAGE__->init_instance_vars(

        # constructor args / members
        analyzer       => undef,
        default_boolop => 'OR',
        default_field  => undef,    # back compat
        fields         => undef,

        # members
        bool_groups   => undef,
        phrases       => undef,
        bool_group_re => undef,
        phrase_re     => undef,
        label_inc     => 0,
    );
}
|
22
|
|
|
|
|
|
|
|
|
23
|
18
|
|
|
18
|
|
16726
|
use KinoSearch1::Analysis::TokenBatch; |
|
|
18
|
|
|
|
|
54
|
|
|
|
18
|
|
|
|
|
473
|
|
|
24
|
18
|
|
|
18
|
|
6950
|
use KinoSearch1::Analysis::Tokenizer; |
|
|
18
|
|
|
|
|
52
|
|
|
|
18
|
|
|
|
|
620
|
|
|
25
|
18
|
|
|
18
|
|
10261
|
use KinoSearch1::Search::BooleanQuery; |
|
|
18
|
|
|
|
|
60
|
|
|
|
18
|
|
|
|
|
661
|
|
|
26
|
18
|
|
|
18
|
|
11126
|
use KinoSearch1::Search::PhraseQuery; |
|
|
18
|
|
|
|
|
65
|
|
|
|
18
|
|
|
|
|
592
|
|
|
27
|
18
|
|
|
18
|
|
116
|
use KinoSearch1::Search::TermQuery; |
|
|
18
|
|
|
|
|
44
|
|
|
|
18
|
|
|
|
|
405
|
|
|
28
|
18
|
|
|
18
|
|
638
|
use KinoSearch1::Index::Term; |
|
|
18
|
|
|
|
|
36
|
|
|
|
18
|
|
|
|
|
36926
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Validate the constructor arguments and set up per-instance parsing state.
#
# Side effects on $self: resets {bool_groups} and {phrases} to empty hashes,
# stores a 16-letter random {randstring}, compiles {phrase_re} and
# {bool_group_re} around that string, and normalizes {fields} to an arrayref
# (falling back to the legacy single 'default_field').
#
# Croaks when:
#   * default_boolop is not exactly 'AND' or 'OR';
#   * no usable arrayref of fields can be derived from 'fields' or
#     'default_field';
#   * 'analyzer' does not satisfy a_isa_b(..., 'KinoSearch1::Analysis::Analyzer').
sub init_instance {
    my $self = shift;
    $self->{bool_groups} = {};
    $self->{phrases}     = {};

    # Anchor with \A...\z rather than ^...$: '$' also matches before a
    # trailing newline, so "AND\n" used to pass validation and was then
    # treated as OR by the exact `eq 'AND'` comparison in parse().
    my $boolop = $self->{default_boolop};
    croak("default_boolop must be either 'AND' or 'OR'")
        unless defined $boolop and $boolop =~ /\A(?:AND|OR)\z/;

    # create a random string that presumably won't appear in a search string
    my @chars = ( 'A' .. 'Z' );
    my $randstring = join '', map { $chars[ rand @chars ] } 1 .. 16;
    $self->{randstring} = $randstring;

    # create labels which won't appear in search strings
    $self->{phrase_re}     = qr/^(_phrase$randstring\d+)/;
    $self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/;

    # verify fields param.  Only fall back to default_field when it was
    # actually supplied; wrapping an undefined default_field produced
    # [undef], which sailed through the check below and deferred the failure
    # to query construction.
    my $fields = $self->{fields};
    if ( !defined $fields and defined $self->{default_field} ) {
        $fields = [ $self->{default_field} ];
    }

    # reftype() returns undef for a non-reference, hence the "|| ''" to keep
    # the comparison warning-free before we croak.
    croak("Required parameter 'fields' not supplied as arrayref")
        unless defined $fields
            and ( reftype($fields) || '' ) eq 'ARRAY';
    $self->{fields} = $fields;

    # verify analyzer
    croak("Missing required param 'analyzer'")
        unless a_isa_b( $self->{analyzer},
        'KinoSearch1::Analysis::Analyzer' );
}
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# NOTE: every pattern below is anchored with \A / \z instead of ^ / $.
# The patterns are compiled with /m, under which ^ and $ also match next to
# embedded newlines.  Because they are applied with s/// to the unparsed
# remainder of the query, a multi-line query such as "foo\n-bar" would have
# its "-" stripped from the middle of the string and applied to "foo", and a
# quoted phrase or parenthesized group containing a newline would be cut off
# at that newline.

# regex matching a quoted string
my $quoted_re = qr/
    "            # opening quote
    (            # capture
        [^"]*?   # anything not a quote
    )
    (?:"|\z)     # closed by either a quote or end of string
    /xsm;

# regex matching a parenthetical group
my $paren_re = qr/
    \(           # opening paren
    (            # capture
        [^()]*?  # anything not a paren
    )
    (?:\)|\z)    # closed by paren or end of string
    /xsm;

# regex matching a negating boolean operator
my $neg_re = qr/\A(?:
        NOT\s+   # NOT followed by space
        |-(?=\S) # minus followed by something not-spacey
    )/xsm;

# regex matching a requiring boolean operator
my $req_re = qr/\A
    \+(?=\S)     # plus followed by something not-spacey
    /xsm;

# regex matching a field indicator
my $field_re = qr/\A
    (            # capture
        [^"(:\s]+  # non-spacey string
    )
    :            # followed by :
    /xsm;
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Turn a query string into a Query object.
#
# Arguments:
#   $qstring_orig   - the query string; undef is treated as ''.
#   $default_fields - optional arrayref of field names used for clauses that
#                     carry no explicit "field:" prefix.  Defaults to
#                     $self->{fields}; the recursive call for parenthesized
#                     groups passes the enclosing clause's fields here.
#
# Returns the single clause's query unwrapped when there is exactly one
# clause and it is not negated; otherwise a
# KinoSearch1::Search::BooleanQuery containing every clause (possibly none).
sub parse {
    my ( $self, $qstring_orig, $default_fields ) = @_;
    $qstring_orig = '' unless defined $qstring_orig;
    $default_fields ||= $self->{fields};
    my $default_boolop = $self->{default_boolop};
    my @clauses;

    # substitute contiguous labels for phrases and boolean groups
    my $qstring = $self->_extract_phrases($qstring_orig);
    $qstring = $self->_extract_boolgroups($qstring);

    # consume the string from the front, one clause per iteration
    local $_ = $qstring;
    while ( bytes::length($_) ) {

        # fast-forward past whitespace
        next if s/^\s+//;

        my $occur = $default_boolop eq 'AND' ? 'MUST' : 'SHOULD';

        if (s/^AND\s+//) {
            if (@clauses) {

                # require the previous clause (unless it's negated)
                if ( $clauses[-1]{occur} eq 'SHOULD' ) {
                    $clauses[-1]{occur} = 'MUST';
                }
            }

            # require this clause
            $occur = 'MUST';
        }
        elsif (s/^OR\s+//) {

            # an explicit OR makes both neighbours optional
            if (@clauses) {
                $clauses[-1]{occur} = 'SHOULD';
            }
            $occur = 'SHOULD';
        }

        # detect tokens which cause this clause to be required or negated
        if (s/$neg_re//) {
            $occur = 'MUST_NOT';
        }
        elsif (s/$req_re//) {
            $occur = 'MUST';
        }

        # set the field
        my $fields = s/^$field_re// ? [$1] : $default_fields;

        # if a phrase label is detected...
        if (s/$self->{phrase_re}//) {

            # retrieve the text and analyze it
            my $orig_phrase_text = delete $self->{phrases}{$1};
            my $token_texts      = $self->_analyze($orig_phrase_text);
            if (@$token_texts) {
                my $query = $self->_get_field_query( $fields, $token_texts );
                push @clauses, { query => $query, occur => $occur }
                    if defined $query;
            }
        }

        # if a label indicating a bool group is detected...
        elsif (s/$self->{bool_group_re}//) {

            # parse boolean subqueries recursively
            my $inner_text = delete $self->{bool_groups}{$1};
            my $query = $self->parse( $inner_text, $fields );
            push @clauses, { query => $query, occur => $occur };
        }

        # what's left is probably a term
        elsif (s/([^"(\s]+)//) {
            my $token_texts = $self->_analyze($1);
            @$token_texts = grep { $_ ne '' } @$token_texts;
            if (@$token_texts) {
                my $query = $self->_get_field_query( $fields, $token_texts );

                # _get_field_query returns nothing when no per-field query
                # could be built; skip the clause, as the phrase branch does,
                # rather than storing an undefined query.
                push @clauses, { occur => $occur, query => $query }
                    if defined $query;
            }
        }
    }

    if ( @clauses == 1 and $clauses[0]{occur} ne 'MUST_NOT' ) {

        # if it's just a simple query, return it unwrapped
        return $clauses[0]{query};
    }
    else {

        # otherwise, build a boolean query
        my $bool_query = KinoSearch1::Search::BooleanQuery->new;
        for my $clause (@clauses) {
            $bool_query->add_clause(
                query => $clause->{query},
                occur => $clause->{occur},
            );
        }
        return $bool_query;
    }
}
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Build the query for one clause across every requested field.
#
# Asks _gen_single_field_query for a query per field and keeps the defined
# results.  Returns nothing when there are none, the query itself when there
# is exactly one, and otherwise a BooleanQuery in which each per-field query
# is a SHOULD clause.
sub _get_field_query {
    my ( $self, $fields, $token_texts ) = @_;

    my @per_field;
    for my $field_name (@$fields) {
        push @per_field, grep { defined $_ }
            $self->_gen_single_field_query( $field_name, $token_texts );
    }

    return unless @per_field;
    return $per_field[0] if @per_field == 1;

    my $any_field_query = KinoSearch1::Search::BooleanQuery->new;
    for my $field_query (@per_field) {
        $any_field_query->add_clause(
            query => $field_query,
            occur => 'SHOULD',
        );
    }
    return $any_field_query;
}
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Create a TermQuery, a PhraseQuery, or nothing.
#
# Arguments: a field name and an arrayref of token texts.
# Returns a TermQuery for a single token, a PhraseQuery (terms added in
# order) for several tokens, and an empty list / undef for no tokens.
sub _gen_single_field_query {
    my ( $self, $field, $token_texts ) = @_;

    # Explicit bare return for the "nothing" case.  Without it the sub fell
    # off the end of the if/elsif chain, leaving the return value
    # unspecified (it could be defined-but-false) instead of a clean
    # undef/empty list that callers can filter with `defined`.
    return unless @$token_texts;

    if ( @$token_texts == 1 ) {
        my $term = KinoSearch1::Index::Term->new( $field, $token_texts->[0] );
        return KinoSearch1::Search::TermQuery->new( term => $term );
    }

    my $phrase_query = KinoSearch1::Search::PhraseQuery->new;
    for my $token_text (@$token_texts) {
        $phrase_query->add_term(
            KinoSearch1::Index::Term->new( $field, $token_text ),
        );
    }
    return $phrase_query;
}
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# Run a string through the configured analyzer and collect the token texts.
#
# The whole string is appended to a fresh TokenBatch as one token spanning
# offsets 0 .. byte length, the batch is passed to $self->{analyzer}, and
# the text of each token in the result is gathered in iteration order.
# Returns an arrayref of token texts.
sub _analyze {
    my ( $self, $string ) = @_;

    my $batch = KinoSearch1::Analysis::TokenBatch->new;
    $batch->append( $string, 0, bytes::length($string) );
    $batch = $self->{analyzer}->analyze($batch);

    my @texts;
    push @texts, $batch->get_text while $batch->next;

    return \@texts;
}
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# Swap every quoted phrase in the query string for a unique label.
#
# Each label is "_phrase" . randstring . counter, followed by one space in
# the rewritten string; the phrase's inner text is stored under that label
# in $self->{phrases}.  Returns the rewritten query string.
sub _extract_phrases {
    my ( $self, $qstring ) = @_;

    my $label_format = "_phrase$self->{randstring}%d";

    # keep going for as long as a quote remains in the string
    while ( my ($phrase_text) = $qstring =~ $quoted_re ) {
        my $label = sprintf( $label_format, $self->{label_inc}++ );

        # replace the phrase just found; extra space for safety
        $qstring =~ s/$quoted_re/$label /;

        # store the phrase text for later retrieval
        $self->{phrases}{$label} = $phrase_text;
    }

    return $qstring;
}
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Swap every parenthesized group in the query string for a unique label,
# innermost groups first (the pattern cannot span a nested paren).
#
# Each label is "_boolgroup" . randstring . counter, followed by one space
# in the rewritten string; the group's inner text is stored under that label
# in $self->{bool_groups}.  Returns the rewritten query string.
sub _extract_boolgroups {
    my ( $self, $qstring ) = @_;

    my $label_format = "_boolgroup$self->{randstring}%d";

    # keep going for as long as an opening paren remains in the string
    while ( my ($group_text) = $qstring =~ $paren_re ) {
        my $label = sprintf( $label_format, $self->{label_inc}++ );

        # replace the group just found; extra space for safety
        $qstring =~ s/$paren_re/$label /;

        # store the text for later retrieval
        $self->{bool_groups}{$label} = $group_text;
    }

    return $qstring;
}
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
1; |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
__END__ |