line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Regexp::Keywords;
|
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require Exporter;
|
4
|
|
|
|
|
|
|
@ISA = (Exporter);
|
5
|
|
|
|
|
|
|
@EXPORT_OK = qw(keywords_regexp);
|
6
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
40564
|
use warnings;
|
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
64
|
|
8
|
2
|
|
|
2
|
|
10
|
use strict;
|
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
58
|
|
9
|
2
|
|
|
2
|
|
11
|
use Carp;
|
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
16924
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Regexp::Keywords - A regexp builder to test against keywords lists
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 VERSION
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.03
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.03';
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
This module helps you to search inside a list of keywords for some of them,
|
26
|
|
|
|
|
|
|
using a simple query syntax with AND, OR and NOT operators and grouping.
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Regexp::Keywords;
|
29
|
|
|
|
|
|
|
my $kw = Regexp::Keywords->new();
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $wanted = 'comedy + ( action , romance ) - thriller';
|
32
|
|
|
|
|
|
|
$kw->prepare($wanted);
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $movie_tags = 'action,comedy,crime,fantasy,adventure';
|
35
|
|
|
|
|
|
|
print "Buy ticket!\n" if $kw->test($movie_tags);
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
I<Keywords>, also known as I<tags>, are used to classify things in a category.
|
38
|
|
|
|
|
|
|
Many tags can be assigned at the same time to an item,
|
39
|
|
|
|
|
|
|
even if they belong to different available categories.
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
In real life, keywords lists are found in:
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=over 4
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=item *
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Public databases like IMDB.
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=item *
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Metadata of HTML pages from public services, such as Picasa or Youtube
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=item *
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Metadata of Word, Excel and other documents.
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=back
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head1 CONSTRUCTOR
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head2 new ( )
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Creates a Keywords object.
|
65
|
|
|
|
|
|
|
Some attributes can be initialized from the constructor:
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=over 4
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=item *
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
L<< C|/ignore_case >> C<< => [0|1] >>
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=item *
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
L<< C|/multi_words >> C<< => [0|1|2] >>
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item *
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
L<< C|/partial_words >> C<< => [0|1] >>
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item *
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
L<< C|/texted_ops >> C<< => [0|1] >>
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=back
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
See L</ATTRIBUTES> for a description
|
88
|
|
|
|
|
|
|
of these attributes.
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Example: To create a Keywords object that will be used to test
|
91
|
|
|
|
|
|
|
strings with mixed case keywords:
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
my $kw = Regexp::Keywords->new(ignore_case => 1);
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=cut
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# Constructor. Accepts an optional list of attribute => value pairs.
# Only ignore_case, multi_words, partial_words and texted_ops may be
# given (each defaults to 0); any other name croaks with
# "Invalid parameter NAME".  The query, parsed_query, regexp and ok
# slots start out undefined.
sub new {
    my ($class, %passed_parms) = @_;

    # Attributes that may be initialised by the caller, with defaults.
    my %defaults = (
        ignore_case   => 0,
        multi_words   => 0,
        partial_words => 0,
        texted_ops    => 0,
    );

    for my $name ( keys %passed_parms ) {
        croak("Invalid parameter $name") unless exists $defaults{$name};
    }

    # Caller-supplied values override the defaults listed above.
    my $self = bless {
        query        => undef,
        parsed_query => undef,
        regexp       => undef,
        ok           => undef,
        %defaults,
        %passed_parms,
    }, $class;

    return $self;
}
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 BUILDING METHODS
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 $kw->prepare( $query )
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
Parse a query and build a regexp pattern to be used for
|
135
|
|
|
|
|
|
|
keywords strings tests.
|
136
|
|
|
|
|
|
|
Dies on malformed query expressions.
|
137
|
|
|
|
|
|
|
See L</QUERY EXPRESSIONS> later in this doc.
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=cut
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Stores $query on the object, parses it with _query_parser() (honouring
# the texted_ops attribute), builds the regexp with _regexp_builder()
# (honouring ignore_case, multi_words and partial_words) and records in
# {ok} whether a regexp was obtained.  Returns the regexp.
sub prepare {
    my ($self, $query) = @_;

    $self->{query} = $query;

    my $parsed = _query_parser($query, $self->{texted_ops});
    $self->{parsed_query} = $parsed;

    my $regexp = _regexp_builder(
        $parsed,
        $self->{ignore_case},
        $self->{multi_words},
        $self->{partial_words},
    );
    $self->{regexp} = $regexp;
    $self->{ok}     = defined $regexp;

    return $regexp;
}
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=head2 $kw->set( attribute => value [, ...] )
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
The following attributes can be changed after the object creation:
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=over 4
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item *
|
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
L<< C|/ignore_case >> C<< => [0|1] >>
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=item *
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
L<< C|/multi_words >> C<< => [0|1|2] >>
|
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=item *
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
L<< C|/parsed_query >> C<< => "internal binary format" >>
|
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item *
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
L<< C|/partial_words >> C<< => [0|1] >>
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item *
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
L<< C|/query >> C<< => "free-form text" >>
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item *
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
L<< C|/texted_ops >> C<< => [0|1] >>
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=back
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Dies on unknown attributes.
|
186
|
|
|
|
|
|
|
See L section
|
187
|
|
|
|
|
|
|
for a description of each attribute.
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
B<Note:> Some of these attributes invalidate the associated
|
190
|
|
|
|
|
|
|
regexp if it was already built,
|
191
|
|
|
|
|
|
|
so an I<automatic> C<reparse> or C<rebuild> is done
|
192
|
|
|
|
|
|
|
after changing all the specified attributes.
|
193
|
|
|
|
|
|
|
For the same reason, it is better to call C<set> with many
|
194
|
|
|
|
|
|
|
parameters instead of setting one at a time.
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
B It's not recommended to modify the attributes directly
|
197
|
|
|
|
|
|
|
from the object, or you could get unexpected results
|
198
|
|
|
|
|
|
|
if the query is not parsed or built again.
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=cut
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Changes one or more attributes (attribute => value pairs) and then
# refreshes the derived state once, after all of them were applied.
#
# Every name is validated before anything is modified, so an invalid
# name croaks ("Invalid parameter NAME") without leaving the object
# half-updated.
#
# Changing query or texted_ops triggers reparse() (if a query is set);
# changing any attribute triggers rebuild() (if a parsed query is set).
# A change to parsed_query only rebuilds: reparsing would overwrite the
# value just supplied with one derived from the stored query.  If query
# is changed in the same call, the reparse wins over parsed_query.
sub set {
    my ($self, %passed_parms) = @_;

    # 1 = a change requires a rebuild, 2 = it also requires a reparse.
    my %valid_parms = (
        ignore_case   => 1,
        multi_words   => 1,
        parsed_query  => 1,
        partial_words => 1,
        query         => 2,
        texted_ops    => 2,
    );

    for my $key ( keys %passed_parms ) {
        croak("Invalid parameter $key") unless exists $valid_parms{$key};
    }

    my $rebuild_needed = 0;
    my $reparse_needed = 0;

    while ( my ($key, $value) = each %passed_parms ) {
        # Compare with undef treated as the empty string, so that unset
        # attributes (e.g. query before prepare) do not raise warnings.
        my $old = defined $self->{$key} ? $self->{$key} : '';
        my $new = defined $value        ? $value        : '';

        if ( $old ne $new ) {
            $rebuild_needed++;
            $reparse_needed++ if $valid_parms{$key} == 2;
        }
        $self->{$key} = $value;
    }

    $self->reparse() if $reparse_needed && $self->{query};
    $self->rebuild() if $rebuild_needed && $self->{parsed_query};
}
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=head2 $kw->get( 'attribute' )
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
This method returns the current value for the specified attribute.
|
235
|
|
|
|
|
|
|
Dies on unknown attributes.
|
236
|
|
|
|
|
|
|
See L section for a list of available attributes.
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=cut
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Returns the value stored on the object under $key.
# Croaks with "Invalid parameter KEY" when the object has no such slot.
sub get {
    my ($self, $key) = @_;

    return $self->{$key} if exists $self->{$key};

    croak("Invalid parameter $key");
}
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 $kw->reparse( )
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
If any of the object's attribute changes,
|
252
|
|
|
|
|
|
|
a reparse of the source query may be required,
|
253
|
|
|
|
|
|
|
depending on the affected attribute.
|
254
|
|
|
|
|
|
|
Dies on bad queries.
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# Re-runs _query_parser() on the stored query with the current
# texted_ops setting and stores the result in {parsed_query}.
# {ok} is reset to 0: the stored regexp no longer corresponds to the
# parsed query until rebuild() is called.
sub reparse {
    my ($self) = @_;

    my $parsed = _query_parser($self->{query}, $self->{texted_ops});
    $self->{parsed_query} = $parsed;

    return $self->{ok} = 0;
}
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head2 $kw->rebuild( )
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
If any of the object's attribute changes,
|
268
|
|
|
|
|
|
|
a rebuild of the regexp may be required,
|
269
|
|
|
|
|
|
|
depending on the affected attribute.
|
270
|
|
|
|
|
|
|
Dies on bad parsed queries.
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=cut
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Regenerates {regexp} from the stored parsed query using the current
# ignore_case, multi_words and partial_words attributes, and records in
# {ok} whether a regexp was obtained.  Returns the regexp.
sub rebuild {
    my ($self) = @_;

    my $regexp = _regexp_builder(
        $self->{parsed_query},
        $self->{ignore_case},
        $self->{multi_words},
        $self->{partial_words},
    );
    $self->{regexp} = $regexp;
    $self->{ok}     = defined $regexp;

    return $regexp;
}
|
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=head1 KEYWORDS TESTING METHODS
|
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head2 $kw->test( $keyword_list )
|
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Returns I if the list matches the parsed query, otherwise returns I.
|
288
|
|
|
|
|
|
|
Dies if no query has been parsed yet.
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# Matches $list against the object's regexp and returns the result of
# that match.  Croaks with "Query not prepared for test" unless the
# object's {ok} flag is true.
sub test {
    my ($self, $list) = @_;

    if ( !$self->{ok} ) {
        croak "Query not prepared for test";
    }

    my $pattern = $self->{regexp};
    return $list =~ /$pattern/;
}
|
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=head2 $kw->grep( @list_of_kwlists )
|
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Returns an array only with the keywords lists that match the parsed query.
|
304
|
|
|
|
|
|
|
Dies if no query has been parsed yet.
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
@selected_keys = $kw->grep_keys(map {$_ => $table{$_}[$col]} keys %table);
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
@selected_indexes = $kw->grep_keys(map {$_ => $array[$_]} 0 .. $#array);
|
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut
|
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# Returns, in their original order, the arguments that match the
# object's regexp.  Croaks with "Query not prepared for grep" unless the
# object's {ok} flag is true.
sub grep {
    my ($self, @kwlists) = @_;

    croak "Query not prepared for grep" if !$self->{ok};

    my $pattern = $self->{regexp};
    my @matching;
    for my $kwlist (@kwlists) {
        push @matching, $kwlist if $kwlist =~ /$pattern/;
    }

    return @matching;
}
|
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=head2 $kw->grep_keys( %hash_of_kwlists )
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
Returns an array of keys from a hash when their corresponding values
|
323
|
|
|
|
|
|
|
satisfy the query.
|
324
|
|
|
|
|
|
|
Dies if no query has been parsed yet.
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=cut
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# Takes key => keywords-list pairs and returns the keys whose value
# matches the object's regexp (in hash iteration order).  Croaks with
# "Query not prepared for grepkeys" unless the object's {ok} flag is true.
sub grep_keys {
    my $self = shift;

    if ( !$self->{ok} ) {
        croak "Query not prepared for grepkeys";
    }

    my %kwlist_for = @_;
    my $pattern    = $self->{regexp};

    my @selected;
    for my $key ( keys %kwlist_for ) {
        push @selected, $key if $kwlist_for{$key} =~ /$pattern/;
    }

    return @selected;
}
|
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head1 EXPORTED FUNCTION
|
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
The following function can be imported and accessed directly from your program.
|
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head2 keywords_regexp( $query [, $ignore [, $multi [, $partial [, $texted ] ] ] ] )
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Returns a regular expression (C) for a query
|
344
|
|
|
|
|
|
|
to which keywords lists strings can be tested against.
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
See L section
|
347
|
|
|
|
|
|
|
for a description of the attributes for the corresponding
|
348
|
|
|
|
|
|
|
parameters and the default values if omitted.
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# Functional interface: parses $query and returns the compiled regexp.
#
# Positional arguments after the query are, in order: ignore_case,
# multi_words, partial_words and texted_ops; all are optional.
# The arguments are unpacked by position instead of splicing @_, which
# warned ("offset past end of array") whenever fewer than four options
# were passed.  A missing multi_words is passed on as 0 so the builder
# never compares an undefined value numerically.
sub keywords_regexp {
    my ($query, $ignore, $multi, $partial, $texted) = @_;

    $multi = 0 unless defined $multi;

    return _regexp_builder(
        _query_parser($query, $texted),
        $ignore,
        $multi,
        $partial,
    );
}
|
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# INTERNALS
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# Turns a query in the internal boolean format into a compiled regexp.
#
#   $parsed_query  - query in internal format (keywords, & | ! ( ) . ^)
#   $ignore_case   - true: prefix the pattern with (?i)
#   $multi_words   - 1: the ^ placeholder becomes \s+, otherwise \W+
#   $partial_words - true: do not wrap keywords in \b word boundaries
#
# Each keyword becomes a lookahead ((?=.*kw) or, when negated, (?!.*kw));
# '&' is dropped so consecutive lookaheads must all hold, '|' is kept as
# regexp alternation, and the whole expression is anchored with '^'.
#
# Croaks with "Undefined query" when the parsed query is undefined or
# empty.  The check is on definedness/length rather than truth so that
# the single keyword "0" is accepted.
sub _regexp_builder {
    my ($parsed_query, $ignore_case, $multi_words, $partial_words) = @_;

    croak('Undefined query')
        unless defined $parsed_query && length $parsed_query;

    my $bound = $partial_words ? '' : '\b';

    # An undefined multi_words is treated like 0 (no numeric warning).
    my $space = ( defined $multi_words && $multi_words == 1 ) ? '\s+' : '\W+';

    my $expr = $parsed_query;
    $expr =~ s/(!?)([\w\.\^]+)/($1?'(?!':'(?=').'.*'.$bound.$2.$bound.')'/ge;
    $expr =~ s/\&//g;           # AND is expressed by adjacent lookaheads
    $expr =~ s/\^/$space/g;     # space placeholder inside multi-word keywords
    $expr = ($ignore_case?'(?i)':'').'^('.$expr.')';

    return qr/$expr/;
}
|
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# Translates a free-form query into the internal boolean format
# (keywords joined by & | !, grouped with parentheses, ^ as the space
# placeholder of quoted multi-word keywords).
#
#   $query      - free-form query string (required)
#   $texted_ops - true: the words AND, OR and NOT are operators
#
# Croaks on an undefined query, on invalid expressions, when no keyword
# is present and on unpaired parentheses.
#
# Negated groups are resolved with De Morgan's laws by pushing the
# negation one nesting level inward at a time, always on a balanced
# group.  This keeps nested groups and mixed AND/OR groups correct,
# e.g. !(a&b|c) becomes ((!a|!b)&!c).
sub _query_parser {
    my ($query, $texted_ops) = @_;

    croak('Query required')
        unless defined $query;

    # Cleanup:
    $query =~ tr/\+\-\,\"\[\]\{\}\<\>/\&\!\|\'\(\)\(\)\(\)/; # unify operators
    if ($texted_ops) {
        $query =~ s/\bAND\b/\&/gi;
        $query =~ s/\bOR\b/\|/gi;
        $query =~ s/\bNOT\b/\!/gi;
    }
    $query =~ s/\'\s*([\w\.\s]+?)\s*\'/_uqt($1)/ge; # remove quotes
    $query =~ s/(?<=[\w\.\)])\s*(?=[(\!])/\&/g; # add implicit ANDs before ( and NOT
    $query =~ s/(?<=\))\s*(?=[\w\.\(\!])/\&/g; # add implicit ANDs after )
    $query =~ s/(?<=[\w\.])\s+(?=[\w\.])/\&/g; # add implicit ANDs between words
    $query =~ s/\s*//g; # remove spaces
    $query =~ s/\!\!//g; # NOT NOT
    $query =~ s/\!\!//g # more NOT NOT after ...
        while $query =~ s/\((\!?[\w\.\^]+)\)/$1/g; # remove extra ( )

    croak('Invalid expression in query')
        if $query =~ /[^\w\.\^\!\|\&\(\)]|^\||[\|\!]$|[\&\|\(\!][\&\|\)]|\![^\w\.\(]/;
    # not_valid_chars | op_begins | op_ends | no_consecutive_ops | negated_operator
    croak('At least one keyword expected in query')
        unless $query =~ /[\w\.]/;

    # Parentheses must pair up.  This is checked before negations are
    # resolved so that step can rely on balanced groups; resolving a
    # negation never adds, removes or reorders parentheses of the input.
    my $pairs = $query;
    1 while $pairs =~ s/\((.*?)\)/$1/; # fast way!
    croak('Unpaired '.($1 eq '('?'opening':'closing').' parenteses in query')
        if $pairs =~ /([\(\)])/;

    # NOT ( ): replace every negated group, outermost first.
    while ( ( my $start = index($query, '!(') ) >= 0 ) {
        my $open  = $start + 1;
        my $close = -1;
        my $depth = 0;
        for my $pos ( $open .. length($query) - 1 ) {
            my $char = substr($query, $pos, 1);
            if ( $char eq '(' ) {
                $depth++;
            }
            elsif ( $char eq ')' && --$depth == 0 ) {
                $close = $pos;
                last;
            }
        }
        last if $close < 0; # not reachable once the pairing check passed

        my $inner = substr($query, $open + 1, $close - $open - 1);
        substr($query, $start, $close - $start + 1, _negate_group($inner));
    }
    1 while $query =~ s/\(\(([^\(\)]+)\)\)/\($1\)/g; # extra ( )

    return $query;
}

# Returns the negation of $inner (the content of a balanced group in
# internal format).  OR-terms are negated and joined with '&'; inside a
# term, AND-factors are negated and joined with '|'.  Nested groups are
# only prefixed with '!' (or have their '!' removed); the caller resolves
# them in a later pass.
sub _negate_group {
    my ($inner) = @_;

    my @negated_terms;
    for my $term ( _split_top_level($inner, '|') ) {
        my @factors = map { substr($_, 0, 1) eq '!' ? substr($_, 1) : "!$_" }
                      _split_top_level($term, '&');
        push @negated_terms,
            @factors == 1 ? $factors[0] : '(' . join('|', @factors) . ')';
    }

    return @negated_terms == 1
        ? $negated_terms[0]
        : '(' . join('&', @negated_terms) . ')';
}

# Splits $expr on the operator character $op, ignoring operators that
# appear inside parentheses.  Empty pieces are dropped.
sub _split_top_level {
    my ($expr, $op) = @_;

    my @parts = ('');
    my $depth = 0;
    for my $char ( split //, $expr ) {
        if    ( $char eq '(' ) { $depth++ }
        elsif ( $char eq ')' ) { $depth-- }

        if ( $depth == 0 && $char eq $op ) {
            push @parts, '';
            next;
        }
        $parts[-1] .= $char;
    }

    return grep { length } @parts;
}
|
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# Negates the content of a group in internal format: every keyword gets
# its leading '!' toggled and the '&' and '|' operators are exchanged.
# The result is returned wrapped in parentheses.
sub _neg {
    my ($group) = @_;

    # Toggle the negation of each keyword.
    $group =~ s/(\!?)([\w\.\^]+)/$1?$2:"!".$2/ge;

    # Exchange AND and OR.
    $group =~ tr/\|\&/\&\|/;

    return "($group)";
}
|
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Converts the words of a quoted string into a single keyword in
# internal format: every run of whitespace becomes the '^' placeholder
# and the result is wrapped in parentheses.
sub _uqt {
    my ($words) = @_;

    # A limit of -1 keeps trailing empty fields, so whitespace at either
    # end of the string still yields a placeholder.
    my @pieces = split /\s+/, $words, -1;

    return '(' . join('^', @pieces) . ')';
}
|
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=head1 ATTRIBUTES
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
Object's attributes can control how to parse a query,
|
442
|
|
|
|
|
|
|
build a regular expression or test strings.
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Is it possible to access them using C<< $kw->{attribute} >>,
|
445
|
|
|
|
|
|
|
it's better to read them with
|
446
|
|
|
|
|
|
|
L<< $kw-Eget()|"$kw->get( 'attribute' )" >>
|
447
|
|
|
|
|
|
|
and change them with
|
448
|
|
|
|
|
|
|
L<< $kw-Eset()|"$kw->set( attribute => value [, ...] )" >>,
|
449
|
|
|
|
|
|
|
because some validations are done to keep things consistent.
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=head2 ignore_case
|
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
Defines if the L should be case (in)sensitive.
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
Defaults to case sensitive (a value of 0).
|
456
|
|
|
|
|
|
|
Set to 1 to make the L</regexp> case insensitive.
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
B Changing this parameter with
|
459
|
|
|
|
|
|
|
L<< $kw-Eset()|"$kw->set( attribute => value [, ...] )" >>
|
460
|
|
|
|
|
|
|
after regexp has been built, causes the
|
461
|
|
|
|
|
|
|
L to be rebuilt from L.
|
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=head2 multi_words
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
This attribute controls whether the keywords list may include
|
466
|
|
|
|
|
|
|
many words as a single keyword.
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
The default (0) is to treat each word as a keyword.
|
469
|
|
|
|
|
|
|
When this attribute is 1, the keywords list may include many words
|
470
|
|
|
|
|
|
|
as a single keyword.
|
471
|
|
|
|
|
|
|
When is set to 2, the delimiter between words is not a space.
|
472
|
|
|
|
|
|
|
To search for such a keyword, write the words between quotes in the
|
473
|
|
|
|
|
|
|
query string.
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
B Changing this parameter with
|
476
|
|
|
|
|
|
|
L<< $kw-Eset()|"$kw->set( attribute => value [, ...] )" >>
|
477
|
|
|
|
|
|
|
after regexp has been built, causes the
|
478
|
|
|
|
|
|
|
L to be rebuilt from L.
|
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
B When set to 0 or 2,
|
481
|
|
|
|
|
|
|
a query with strings in quotes could match a keyword list
|
482
|
|
|
|
|
|
|
if each word is present in the list, side by side in the same order.
|
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=head2 parsed_query
|
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Contains the query in the
|
487
|
|
|
|
|
|
|
L,
|
488
|
|
|
|
|
|
|
which is required to build the L.
|
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=head2 partial_words
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
By default (value of 0),
|
493
|
|
|
|
|
|
|
only words that match exactly would return I
|
494
|
|
|
|
|
|
|
when a keywords list is tested.
|
495
|
|
|
|
|
|
|
Set this attribute to 1 if you want to match lists where
|
496
|
|
|
|
|
|
|
keywords contains words from the query.
|
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
For example, "word" will match if a list contains "words",
|
499
|
|
|
|
|
|
|
but "query" won't match "queries".
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
B Changing this parameter with
|
502
|
|
|
|
|
|
|
L<< $kw-Eset()|"$kw->set( attribute => value [, ...] )" >>
|
503
|
|
|
|
|
|
|
after regexp has been built, causes the
|
504
|
|
|
|
|
|
|
L to be rebuilt from L.
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
B Setting both L
|
507
|
|
|
|
|
|
|
and L to 1
|
508
|
|
|
|
|
|
|
could return unexpected results on tests,
|
509
|
|
|
|
|
|
|
because just first and last words will be considered to be
|
510
|
|
|
|
|
|
|
partial strings only from the outside.
|
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head2 query
|
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
Contains the original query in the
|
515
|
|
|
|
|
|
|
L.
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head2 regexp
|
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
Contains the regular expression built for the object's query.
|
520
|
|
|
|
|
|
|
It's a C value!
|
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=head2 texted_ops
|
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
I, I and I operators are represented
|
525
|
|
|
|
|
|
|
by some punctuation chars.
|
526
|
|
|
|
|
|
|
In default mode (0), any use one of that words would
|
527
|
|
|
|
|
|
|
try to match it in the keywords list.
|
528
|
|
|
|
|
|
|
Set this attribute to 1 to allow words C, C and C
|
529
|
|
|
|
|
|
|
to be used as binary operators in query expressions
|
530
|
|
|
|
|
|
|
instead of keywords to match.
|
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
B Changing this attribute with
|
533
|
|
|
|
|
|
|
L<< $kw-Eset()|"$kw->set( attribute => value [, ...] )" >>
|
534
|
|
|
|
|
|
|
after a regexp has been built,
|
535
|
|
|
|
|
|
|
forces a L to be reparsed into L
|
536
|
|
|
|
|
|
|
and L to be rebuilt.
|
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=head1 KEYWORD LISTS
|
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
A Keyword is a combination of letters, underlines and numbers
|
541
|
|
|
|
|
|
|
(C\w+/> pattern).
|
542
|
|
|
|
|
|
|
Sometimes, more than one word can be used to create a keyword,
|
543
|
|
|
|
|
|
|
and a space is between them.
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Keyword lists are string values with words, usually delimited by comma
|
546
|
|
|
|
|
|
|
or any other punctuation sign.
|
547
|
|
|
|
|
|
|
Spaces may also appear surrounding them.
|
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
There is no validation for field names inside a keywords list.
|
550
|
|
|
|
|
|
|
In fact, that names are also treated as keywords by themselves
|
551
|
|
|
|
|
|
|
(see L).
|
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=head1 QUERY EXPRESSIONS
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
A I is a list of keywords
|
556
|
|
|
|
|
|
|
with some operators surrounding them
|
557
|
|
|
|
|
|
|
to provide simple boolean conditions.
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
Query expressions are in the form of:
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
term1 & term2 # AND operator
|
562
|
|
|
|
|
|
|
term1 | term2 # OR operator
|
563
|
|
|
|
|
|
|
!term1 # NOT operator
|
564
|
|
|
|
|
|
|
"term one" # multi-word keyword
|
565
|
|
|
|
|
|
|
term1 & ( term2 | term3 ) # Grouping changes precedence
|
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
All spaces are optional in query expressions,
|
568
|
|
|
|
|
|
|
except for those in multi-word keywords when quoted.
|
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=head2 Expression Terms
|
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
A C is one of the following:
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=over 4
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=item *
|
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
A single keyword, build with letters, numbers and underscore.
|
579
|
|
|
|
|
|
|
"C<.>" (dot) can be used as a single char wildcard.
|
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=item *
|
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
A sentence of multiple words as a single keyword, enclosed by quotes.
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=item *
|
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
A query expression,
|
588
|
|
|
|
|
|
|
optionally enclosed by parenteses if precedence matters.
|
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=back
|
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=head2 Operators
|
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=over 4
|
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=item * term1 AND term2
|
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
Use I operator when both terms must be present in the keyword list.
|
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
I<AND> can be written as "C<&>" (ampersand) or "C<+>" (plus), but may be omitted.
|
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
term1 & term2
|
603
|
|
|
|
|
|
|
+term1 +term2
|
604
|
|
|
|
|
|
|
term1 term2
|
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=item * term1 OR term2
|
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
Use I operator when at least one of the terms is required
|
609
|
|
|
|
|
|
|
in the keyword list.
|
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
I<OR> can be written as "C<|>" (vertical bar) or "C<,>" (comma), and cannot be omitted.
|
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
term1 | term2
|
614
|
|
|
|
|
|
|
term1, term2
|
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=item * NOT term
|
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
Use I operator when the term must not be present in the keyword list.
|
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
I<NOT> can be written as "C<!>" (exclamation mark) or "C<->" (minus).
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
! term
|
623
|
|
|
|
|
|
|
-term
|
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
=back
|
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
To allow the words "AND", "OR" and "NOT" to be treated as operators,
|
628
|
|
|
|
|
|
|
set the C parameter.
|
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
=head2 Grouping
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
Precedence is as usual: I has the highest,
|
633
|
|
|
|
|
|
|
then I, and I has the lowest.
|
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
Precedence order in a query expression can be changed
|
636
|
|
|
|
|
|
|
with the use of parenteses.
|
637
|
|
|
|
|
|
|
For example:
|
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
word1 | word2 & word3
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
is the same as:
|
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
word1 | ( word2 & word3 )
|
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
but not as:
|
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
( word1 | word2 ) & word3
|
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
where word3 is required at the same time than
|
650
|
|
|
|
|
|
|
either word1 or word2.
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
Is it possible to use I for a whole group,
|
653
|
|
|
|
|
|
|
so the following two queries mean the same:
|
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
+word1 -(word2,word3)
|
656
|
|
|
|
|
|
|
+word1 -word2 -word3
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
Expresion groups can be nested.
|
659
|
|
|
|
|
|
|
Also, "C<[...]>", "C<{...}>" and "C<< <...> >>"
|
660
|
|
|
|
|
|
|
can be used just like "C<(...)>",
|
661
|
|
|
|
|
|
|
but there is no validation for balanced parenteses by type,
|
662
|
|
|
|
|
|
|
i.e. all of them gets translated into the same before
|
663
|
|
|
|
|
|
|
the validation to detect an orphan one.
|
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
=head2 Tricks
|
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=over 4
|
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=item *
|
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
If fields names and their corresponding values
|
672
|
|
|
|
|
|
|
are specified inside a keywords list,
|
673
|
|
|
|
|
|
|
is it possible to use a single dot "C<.>" to say "C"
|
674
|
|
|
|
|
|
|
as a single term in a query expression for a better match.
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
For example, the following query expressions:
|
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
bar & read.yes # matches 2 record
|
679
|
|
|
|
|
|
|
bar & read & yes # matches 3 records
|
680
|
|
|
|
|
|
|
bar & "read yes" # matches 2 record when multi_words=2
|
681
|
|
|
|
|
|
|
# else don't match
|
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
from these keywords lists:
|
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
foo, own:yes, read:yes, rating:3
|
686
|
|
|
|
|
|
|
foo, bar, own:yes, read:yes, rating:1
|
687
|
|
|
|
|
|
|
foo, bar, baz, own:yes, read:no
|
688
|
|
|
|
|
|
|
bar, baz, own:no, read:yes, rating:0
|
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
=item *
|
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
Query with strings in quotes could match a keyword list
|
693
|
|
|
|
|
|
|
if each word is present in the list,
|
694
|
|
|
|
|
|
|
side by side in the same order,
|
695
|
|
|
|
|
|
|
when the L<< C|/"multi_words" >> is NOT set to 1.
|
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
Using the previous sample list,
|
698
|
|
|
|
|
|
|
the query expressions:
|
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
"foo bar" # matches 2 records
|
701
|
|
|
|
|
|
|
"bar foo" # don't match anything
|
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
=item *
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
Use I operator when two or more different conditions
|
706
|
|
|
|
|
|
|
satisfies the request.
|
707
|
|
|
|
|
|
|
For example, use:
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
own.yes (rating.0 | -rating)
|
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
to match 2 unrated owned books from the sample list.
|
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=item *
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
You can use this module against a whole document,
|
716
|
|
|
|
|
|
|
not only to a keywords list:
|
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
$kw->prepare('"form method post" !captcha');
|
719
|
|
|
|
|
|
|
print "Unprotected form detected\n" if $kw->test($html_page);
|
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
=back
|
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=head1 INTERNAL BOOLEAN FORMAT
|
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
Queries in the free-style format are parsed and translated into
|
726
|
|
|
|
|
|
|
an strict internal format. Note that space char is not allowed.
|
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
The elements of this format are:
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=over 4
|
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
=item * C<&> (andpersand)
|
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
I operator.
|
735
|
|
|
|
|
|
|
It can't be ommited as in free-style format.
|
736
|
|
|
|
|
|
|
Must be surrounded by (negated) keywords or
|
737
|
|
|
|
|
|
|
parenteses from the outside.
|
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
=item * C<|> (vertical bar)
|
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
I operator.
|
742
|
|
|
|
|
|
|
Must be surrounded by (negated) keywords or
|
743
|
|
|
|
|
|
|
parenteses from the outside.
|
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
=item * C (exclamation mark)
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
I operator.
|
748
|
|
|
|
|
|
|
It can appear only preceding a keyword, not a parenteses
|
749
|
|
|
|
|
|
|
or another one.
|
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=item * C<(> C<)> (parenteses)
|
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
Group delimiters.
|
754
|
|
|
|
|
|
|
Only keywords and other parenteses can touch them from inside.
|
755
|
|
|
|
|
|
|
Nested groups are allowed, empty groups are not.
|
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
=item * C
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
A word that matches C\w+/> (letters, numbers or underscore).
|
760
|
|
|
|
|
|
|
It can optionally contain wildcards or space placeholder
|
761
|
|
|
|
|
|
|
following their own rules.
|
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=item * C<.> (dot)
|
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
Single char wildcard.
|
766
|
|
|
|
|
|
|
A word can contain multiple wildcards, but starting or ending with one
|
767
|
|
|
|
|
|
|
may give unpredictable results on test.
|
768
|
|
|
|
|
|
|
Use with care.
|
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=item * C<^> (caret)
|
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
Space placeholder.
|
773
|
|
|
|
|
|
|
Used to join multiple words as a single keyword.
|
774
|
|
|
|
|
|
|
This is the internal representation of quoted strings with spaces
|
775
|
|
|
|
|
|
|
from the free-style query.
|
776
|
|
|
|
|
|
|
It's not allowed to start or finish a keyword with this space placeholder,
|
777
|
|
|
|
|
|
|
and consecutive placeholders are also invalid.
|
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=back
|
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Examples:
|
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
tom&jerry|sylvester&tweety
|
784
|
|
|
|
|
|
|
moe&(shemp|curly|joe)&larry
|
785
|
|
|
|
|
|
|
popeye&olive&(!bluto&!brutus)
|
786
|
|
|
|
|
|
|
hagar^the^horrible|popeye^the^sailor
|
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
Examples of bad queries:
|
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
tom&jerry,sylvester&tweety
|
791
|
|
|
|
|
|
|
moe(shemp|curly|joe)larry
|
792
|
|
|
|
|
|
|
popeye&olive&!(bluto|brutus)
|
793
|
|
|
|
|
|
|
^the^
|
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=head1 KNOWN LIMITATIONS
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
Currently, only ASCII chars are supported.
|
798
|
|
|
|
|
|
|
No UTF-8, no Unicode, no accented vowels, no Kanji... Sorry!
|
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=head1 AUTHOR
|
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Victor Parada, C<< >>
|
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head1 BUGS
|
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
Please report any bugs or feature requests to
|
807
|
|
|
|
|
|
|
C<bug-regexp-keywords at rt.cpan.org>,
|
808
|
|
|
|
|
|
|
or through the web interface at
|
809
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Regexp-Keywords>.
|
810
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified
|
811
|
|
|
|
|
|
|
of progress on your bug as I make changes.
|
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
=head1 SUPPORT
|
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
perldoc Regexp::Keywords
|
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
You can also look for information at:
|
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=over 4
|
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker
|
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Regexp-Keywords>
|
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
L<http://annocpan.org/dist/Regexp-Keywords>
|
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=item * CPAN Ratings
|
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/Regexp-Keywords>
|
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
=item * Search CPAN
|
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/Regexp-Keywords/>
|
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=back
|
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS
|
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
Thanks to the Monks from the Monastery at L<http://www.perlmonks.org/>.
|
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE
|
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
Copyright 2009 Victor Parada.
|
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
850
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published
|
851
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License.
|
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
See L<http://dev.perl.org/licenses/> for more information.
|
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
=cut
|
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
1; # End of Regexp::Keywords
|