| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
46
|
|
|
46
|
|
2850918
|
use strict; |
|
|
46
|
|
|
|
|
115
|
|
|
|
46
|
|
|
|
|
1754
|
|
|
2
|
46
|
|
|
46
|
|
398
|
use warnings; |
|
|
46
|
|
|
|
|
165
|
|
|
|
46
|
|
|
|
|
2229
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KinoSearch::Test::TestUtils; |
|
5
|
46
|
|
|
46
|
|
397
|
use base qw( Exporter ); |
|
|
46
|
|
|
|
|
101
|
|
|
|
46
|
|
|
|
|
7355
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
|
8
|
|
|
|
|
|
|
working_dir |
|
9
|
|
|
|
|
|
|
create_working_dir |
|
10
|
|
|
|
|
|
|
remove_working_dir |
|
11
|
|
|
|
|
|
|
create_index |
|
12
|
|
|
|
|
|
|
create_uscon_index |
|
13
|
|
|
|
|
|
|
test_index_loc |
|
14
|
|
|
|
|
|
|
persistent_test_index_loc |
|
15
|
|
|
|
|
|
|
init_test_index_loc |
|
16
|
|
|
|
|
|
|
get_uscon_docs |
|
17
|
|
|
|
|
|
|
utf8_test_strings |
|
18
|
|
|
|
|
|
|
test_analyzer |
|
19
|
|
|
|
|
|
|
doc_ids_from_td_coll |
|
20
|
|
|
|
|
|
|
modulo_set |
|
21
|
|
|
|
|
|
|
); |
|
22
|
|
|
|
|
|
|
|
|
23
|
46
|
|
|
46
|
|
26578
|
use KinoSearch; |
|
|
46
|
|
|
|
|
132
|
|
|
|
46
|
|
|
|
|
2382
|
|
|
24
|
46
|
|
|
46
|
|
23923
|
use KinoSearch::Test; |
|
|
46
|
|
|
|
|
114
|
|
|
|
46
|
|
|
|
|
1442
|
|
|
25
|
|
|
|
|
|
|
|
|
26
|
46
|
|
|
46
|
|
418
|
use lib 'sample'; |
|
|
46
|
|
|
|
|
97
|
|
|
|
46
|
|
|
|
|
473
|
|
|
27
|
46
|
|
|
46
|
|
25341
|
use KinoSearch::Test::USConSchema; |
|
|
46
|
|
|
|
|
131
|
|
|
|
46
|
|
|
|
|
2023
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
46
|
|
|
46
|
|
289
|
use File::Spec::Functions qw( catdir catfile curdir ); |
|
|
46
|
|
|
|
|
95
|
|
|
|
46
|
|
|
|
|
3510
|
|
|
30
|
46
|
|
|
46
|
|
57702
|
use Encode qw( _utf8_off ); |
|
|
46
|
|
|
|
|
646193
|
|
|
|
46
|
|
|
|
|
4765
|
|
|
31
|
46
|
|
|
46
|
|
429
|
use File::Path qw( rmtree ); |
|
|
46
|
|
|
|
|
87
|
|
|
|
46
|
|
|
|
|
2822
|
|
|
32
|
46
|
|
|
46
|
|
335
|
use Carp; |
|
|
46
|
|
|
|
|
97
|
|
|
|
46
|
|
|
|
|
90060
|
|
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $working_dir = catfile( curdir(), 'kinosearch_test' ); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Return a directory within the current working directory where we will put all |
|
37
|
|
|
|
|
|
|
# testing scratch files. |
|
38
|
3
|
|
|
3
|
0
|
84
|
sub working_dir {$working_dir} |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub create_working_dir { |
|
41
|
1
|
50
|
|
1
|
0
|
132
|
mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!"; |
|
42
|
|
|
|
|
|
|
} |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Zap the working dir if it exists (ownership is not checked). Returns true upon |
|
45
|
|
|
|
|
|
|
# success. |
|
46
|
|
|
|
|
|
|
sub remove_working_dir { |
|
47
|
2
|
100
|
|
2
|
0
|
76
|
return unless -d $working_dir; |
|
48
|
1
|
|
|
|
|
23028
|
rmtree $working_dir; |
|
49
|
1
|
|
|
|
|
10
|
return 1; |
|
50
|
|
|
|
|
|
|
} |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Return a location for a test index to be used by a single test file. If |
|
53
|
|
|
|
|
|
|
# the test file crashes it cannot clean up after itself, so we put the cleanup |
|
54
|
|
|
|
|
|
|
# routine in a single test file to be run at or near the end of the test |
|
55
|
|
|
|
|
|
|
# suite. |
|
56
|
|
|
|
|
|
|
sub test_index_loc { |
|
57
|
9
|
|
|
9
|
0
|
97
|
return catdir( $working_dir, 'test_index' ); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Return a location for a test index intended to be shared by multiple test |
|
61
|
|
|
|
|
|
|
# files. It will be cleaned as above. |
|
62
|
|
|
|
|
|
|
sub persistent_test_index_loc { |
|
63
|
3
|
|
|
3
|
0
|
195
|
return catdir( $working_dir, 'persistent_test_index' ); |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Destroy anything left over in the test_index location, then create the |
|
67
|
|
|
|
|
|
|
# directory. Finally, return the path. |
|
68
|
|
|
|
|
|
|
sub init_test_index_loc { |
|
69
|
9
|
|
|
9
|
0
|
85
|
my $dir = test_index_loc(); |
|
70
|
9
|
|
|
|
|
10376
|
rmtree $dir; |
|
71
|
9
|
50
|
|
|
|
175
|
die "Can't clean up '$dir'" if -e $dir; |
|
72
|
9
|
50
|
|
|
|
839
|
mkdir $dir or die "Can't mkdir '$dir': $!"; |
|
73
|
9
|
|
|
|
|
51
|
return $dir; |
|
74
|
|
|
|
|
|
|
} |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Build a RAM index, using the supplied array of strings as source material. |
|
77
|
|
|
|
|
|
|
# The index will have a single field: "content". |
|
78
|
|
|
|
|
|
|
sub create_index { |
|
79
|
27
|
|
|
27
|
0
|
240886
|
my $folder = KinoSearch::Store::RAMFolder->new; |
|
80
|
27
|
|
|
|
|
1157
|
my $indexer = KinoSearch::Index::Indexer->new( |
|
81
|
|
|
|
|
|
|
index => $folder, |
|
82
|
|
|
|
|
|
|
schema => KinoSearch::Test::TestSchema->new, |
|
83
|
|
|
|
|
|
|
); |
|
84
|
27
|
|
|
|
|
53959
|
$indexer->add_doc( { content => $_ } ) for @_; |
|
85
|
27
|
|
|
|
|
231606
|
$indexer->commit; |
|
86
|
27
|
|
|
|
|
2818
|
return $folder; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Slurp US Constitution docs and build hashrefs. |
|
90
|
|
|
|
|
|
|
sub get_uscon_docs { |
|
91
|
|
|
|
|
|
|
|
|
92
|
2
|
|
|
2
|
0
|
26
|
my $uscon_dir = catdir( 'sample', 'us_constitution' ); |
|
93
|
2
|
50
|
|
|
|
1282
|
opendir( my $uscon_dh, $uscon_dir ) |
|
94
|
|
|
|
|
|
|
or die "couldn't opendir '$uscon_dir': $!"; |
|
95
|
2
|
|
|
|
|
340
|
my @filenames = grep {/\.txt$/} sort readdir $uscon_dh; |
|
|
112
|
|
|
|
|
262
|
|
|
96
|
2
|
50
|
|
|
|
63
|
closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!"; |
|
97
|
|
|
|
|
|
|
|
|
98
|
2
|
|
|
|
|
6
|
my %docs; |
|
99
|
|
|
|
|
|
|
|
|
100
|
2
|
|
|
|
|
9
|
for my $filename (@filenames) { |
|
101
|
104
|
|
|
|
|
671
|
my $filepath = catfile( $uscon_dir, $filename ); |
|
102
|
104
|
50
|
|
|
|
5216
|
open( my $fh, '<', $filepath ) |
|
103
|
|
|
|
|
|
|
or die "couldn't open file '$filepath': $!"; |
|
104
|
104
|
|
|
|
|
137
|
my $content = do { local $/; <$fh> }; |
|
|
104
|
|
|
|
|
386
|
|
|
|
104
|
|
|
|
|
53727
|
|
|
105
|
104
|
50
|
|
|
|
690
|
$content =~ /(.*?)\n\n(.*)/s |
|
106
|
|
|
|
|
|
|
or die "Can't extract title/bodytext from '$filepath'"; |
|
107
|
104
|
|
|
|
|
324
|
my $title = $1; |
|
108
|
104
|
|
|
|
|
292
|
my $bodytext = $2; |
|
109
|
104
|
|
|
|
|
9502
|
$bodytext =~ s/\s+/ /sg; |
|
110
|
104
|
50
|
|
|
|
636
|
my $category |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
= $filename =~ /art/ ? 'article' |
|
112
|
|
|
|
|
|
|
: $filename =~ /amend/ ? 'amendment' |
|
113
|
|
|
|
|
|
|
: $filename =~ /preamble/ ? 'preamble' |
|
114
|
|
|
|
|
|
|
: confess "Can't derive category for $filename"; |
|
115
|
|
|
|
|
|
|
|
|
116
|
104
|
|
|
|
|
2698
|
$docs{$filename} = { |
|
117
|
|
|
|
|
|
|
title => $title, |
|
118
|
|
|
|
|
|
|
bodytext => $bodytext, |
|
119
|
|
|
|
|
|
|
url => "/us_constitution/$filename", |
|
120
|
|
|
|
|
|
|
category => $category, |
|
121
|
|
|
|
|
|
|
}; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
2
|
|
|
|
|
34
|
return \%docs; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub create_uscon_index { |
|
128
|
1
|
|
|
1
|
0
|
5
|
my $folder = KinoSearch::Store::FSFolder->new( |
|
129
|
|
|
|
|
|
|
path => persistent_test_index_loc() ); |
|
130
|
1
|
|
|
|
|
80
|
my $schema = KinoSearch::Test::USConSchema->new; |
|
131
|
1
|
|
|
|
|
13
|
my $indexer = KinoSearch::Index::Indexer->new( |
|
132
|
|
|
|
|
|
|
schema => $schema, |
|
133
|
|
|
|
|
|
|
index => $folder, |
|
134
|
|
|
|
|
|
|
truncate => 1, |
|
135
|
|
|
|
|
|
|
create => 1, |
|
136
|
|
|
|
|
|
|
); |
|
137
|
|
|
|
|
|
|
|
|
138
|
1
|
|
|
|
|
104
|
$indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 ); |
|
139
|
1
|
|
|
|
|
250320
|
$indexer->commit; |
|
140
|
1
|
|
|
|
|
4
|
undef $indexer; |
|
141
|
|
|
|
|
|
|
|
|
142
|
1
|
|
|
|
|
117
|
$indexer = KinoSearch::Index::Indexer->new( |
|
143
|
|
|
|
|
|
|
schema => $schema, |
|
144
|
|
|
|
|
|
|
index => $folder, |
|
145
|
|
|
|
|
|
|
); |
|
146
|
1
|
|
|
|
|
7
|
my $source_docs = get_uscon_docs(); |
|
147
|
|
|
|
|
|
|
$indexer->add_doc( { content => $_->{bodytext} } ) |
|
148
|
1
|
|
|
|
|
687
|
for values %$source_docs; |
|
149
|
1
|
|
|
|
|
28842
|
$indexer->commit; |
|
150
|
1
|
|
|
|
|
4
|
undef $indexer; |
|
151
|
|
|
|
|
|
|
|
|
152
|
1
|
|
|
|
|
233
|
$indexer = KinoSearch::Index::Indexer->new( |
|
153
|
|
|
|
|
|
|
schema => $schema, |
|
154
|
|
|
|
|
|
|
index => $folder, |
|
155
|
|
|
|
|
|
|
); |
|
156
|
1
|
|
|
|
|
11
|
my @chars = ( 'a' .. 'z' ); |
|
157
|
1
|
|
|
|
|
6
|
for ( 0 .. 1000 ) { |
|
158
|
1001
|
|
|
|
|
1898
|
my $content = ''; |
|
159
|
1001
|
|
|
|
|
2043
|
for my $num_words ( 1 .. int( rand(20) ) ) { |
|
160
|
9570
|
|
|
|
|
15882
|
for ( 1 .. ( int( rand(10) ) + 10 ) ) { |
|
161
|
139035
|
|
|
|
|
227414
|
$content .= @chars[ rand(@chars) ]; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
9570
|
|
|
|
|
14177
|
$content .= ' '; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
1001
|
|
|
|
|
73037
|
$indexer->add_doc( { content => $content } ); |
|
166
|
|
|
|
|
|
|
} |
|
167
|
1
|
|
|
|
|
9
|
$indexer->optimize; |
|
168
|
1
|
|
|
|
|
98
|
$indexer->commit; |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Return 3 strings useful for verifying UTF-8 integrity. |
|
172
|
|
|
|
|
|
|
sub utf8_test_strings { |
|
173
|
5
|
|
|
5
|
0
|
136802
|
my $smiley = "\x{263a}"; |
|
174
|
5
|
|
|
|
|
14
|
my $not_a_smiley = $smiley; |
|
175
|
5
|
|
|
|
|
70
|
_utf8_off($not_a_smiley); |
|
176
|
5
|
|
|
|
|
13
|
my $frowny = $not_a_smiley; |
|
177
|
5
|
|
|
|
|
25
|
utf8::upgrade($frowny); |
|
178
|
5
|
|
|
|
|
22
|
return ( $smiley, $not_a_smiley, $frowny ); |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Verify an Analyzer's transform, transform_text, and split methods. |
|
182
|
|
|
|
|
|
|
sub test_analyzer { |
|
183
|
9
|
|
|
9
|
0
|
2733
|
my ( $analyzer, $source, $expected, $message ) = @_; |
|
184
|
|
|
|
|
|
|
|
|
185
|
9
|
|
|
|
|
212
|
my $inversion = KinoSearch::Analysis::Inversion->new( text => $source ); |
|
186
|
9
|
|
|
|
|
235
|
$inversion = $analyzer->transform($inversion); |
|
187
|
9
|
|
|
|
|
48
|
my @got; |
|
188
|
9
|
|
|
|
|
168
|
while ( my $token = $inversion->next ) { |
|
189
|
13
|
|
|
|
|
114
|
push @got, $token->get_text; |
|
190
|
|
|
|
|
|
|
} |
|
191
|
9
|
|
|
|
|
76
|
Test::More::is_deeply( \@got, $expected, "analyze: $message" ); |
|
192
|
|
|
|
|
|
|
|
|
193
|
9
|
|
|
|
|
11897
|
$inversion = $analyzer->transform_text($source); |
|
194
|
9
|
|
|
|
|
127
|
@got = (); |
|
195
|
9
|
|
|
|
|
90
|
while ( my $token = $inversion->next ) { |
|
196
|
13
|
|
|
|
|
96
|
push @got, $token->get_text; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
9
|
|
|
|
|
56
|
Test::More::is_deeply( \@got, $expected, "transform_text: $message" ); |
|
199
|
|
|
|
|
|
|
|
|
200
|
9
|
|
|
|
|
11672
|
@got = @{ $analyzer->split($source) }; |
|
|
9
|
|
|
|
|
297
|
|
|
201
|
9
|
|
|
|
|
72
|
Test::More::is_deeply( \@got, $expected, "split: $message" ); |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Extract all doc nums from a SortCollector. Return two sorted array refs: |
|
205
|
|
|
|
|
|
|
# by_score and by_id. |
|
206
|
|
|
|
|
|
|
sub doc_ids_from_td_coll { |
|
207
|
1223
|
|
|
1223
|
0
|
498958
|
my $collector = shift; |
|
208
|
1223
|
|
|
|
|
2080
|
my @by_score; |
|
209
|
1223
|
|
|
|
|
29624
|
my $match_docs = $collector->pop_match_docs; |
|
210
|
17616
|
50
|
|
|
|
54919
|
my @by_score_then_id = map { $_->get_doc_id } |
|
|
19040
|
|
|
|
|
127458
|
|
|
211
|
|
|
|
|
|
|
sort { |
|
212
|
1223
|
|
|
|
|
5595
|
$b->get_score <=> $a->get_score |
|
213
|
|
|
|
|
|
|
|| $a->get_doc_id <=> $b->get_doc_id |
|
214
|
|
|
|
|
|
|
} @$match_docs; |
|
215
|
1223
|
|
|
|
|
3643
|
my @by_id = sort { $a <=> $b } @by_score_then_id; |
|
|
37343
|
|
|
|
|
41009
|
|
|
216
|
1223
|
|
|
|
|
21753
|
return ( \@by_score_then_id, \@by_id ); |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Return an array ref of the multiples of $interval that are less than $max. |
|
220
|
|
|
|
|
|
|
sub modulo_set { |
|
221
|
3229
|
|
|
3229
|
0
|
2401714
|
my ( $interval, $max ) = @_; |
|
222
|
3229
|
|
|
|
|
4835
|
my @out; |
|
223
|
3229
|
|
|
|
|
10929
|
for ( my $doc = $interval; $doc < $max; $doc += $interval ) { |
|
224
|
45031
|
|
|
|
|
106110
|
push @out, $doc; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
3229
|
|
|
|
|
11216
|
return \@out; |
|
227
|
|
|
|
|
|
|
} |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
1; |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
__END__ |