| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package KinoSearch1::Index::MultiReader; |
|
2
|
34
|
|
|
34
|
|
200
|
use strict; |
|
|
34
|
|
|
|
|
77
|
|
|
|
34
|
|
|
|
|
2688
|
|
|
3
|
34
|
|
|
34
|
|
519
|
use warnings; |
|
|
34
|
|
|
|
|
81
|
|
|
|
34
|
|
|
|
|
1511
|
|
|
4
|
34
|
|
|
34
|
|
194
|
use KinoSearch1::Util::ToolSet; |
|
|
34
|
|
|
|
|
77
|
|
|
|
34
|
|
|
|
|
8526
|
|
|
5
|
34
|
|
|
34
|
|
202
|
use base qw( KinoSearch1::Index::IndexReader ); |
|
|
34
|
|
|
|
|
72
|
|
|
|
34
|
|
|
|
|
3796
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
BEGIN {
    # Register the instance variables of this class together with their
    # default values.  Everything except max_doc starts out undefined.
    # NOTE(review): init_instance_vars is not defined in this file; it is
    # presumably inherited via KinoSearch1::Util::Class -- confirm.
    __PACKAGE__->init_instance_vars(
        invindex    => undef,
        sub_readers => undef,
        starts      => undef,
        max_doc     => 0,
        norms_cache => undef,
    );
}
|
16
|
|
|
|
|
|
|
|
|
17
|
34
|
|
|
34
|
|
235
|
use KinoSearch1::Index::FieldInfos; |
|
|
34
|
|
|
|
|
70
|
|
|
|
34
|
|
|
|
|
1689
|
|
|
18
|
34
|
|
|
34
|
|
192
|
use KinoSearch1::Index::SegReader; |
|
|
34
|
|
|
|
|
77
|
|
|
|
34
|
|
|
|
|
891
|
|
|
19
|
34
|
|
|
34
|
|
41025
|
use KinoSearch1::Index::MultiTermDocs; |
|
|
34
|
|
|
|
|
91
|
|
|
|
34
|
|
|
|
|
85425
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Alias new() directly to the constructor in KinoSearch1::Util::Class.
# Note: IndexReader's new() cannot be inherited here without running into
# recursion problems, so it is bypassed on purpose.
*new = *KinoSearch1::Util::Class::new;
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Give every container slot that is still false an empty default (array refs
# for sub_readers and starts, a hash ref for norms_cache), then hand over to
# _init_sub_readers.  The value of that call is the return value.
sub init_instance {
    my ($self) = @_;

    $self->{$_} ||= [] for qw( sub_readers starts );
    $self->{norms_cache} ||= {};

    $self->_init_sub_readers;
}
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Walk the sub-readers in order and record, for each one, the running total
# of max_doc values of all readers before it.  Those offsets are stored in
# $self->{starts}; the grand total is stored in $self->{max_doc}.
sub _init_sub_readers {
    my ($self) = @_;

    my $doc_total = 0;
    my @offsets   = map {
        my $offset = $doc_total;
        $doc_total += $_->max_doc;
        $offset;
    } @{ $self->{sub_readers} };

    $self->{starts}  = \@offsets;
    $self->{max_doc} = $doc_total;
}
|
45
|
|
|
|
|
|
|
|
|
46
|
3
|
|
|
3
|
0
|
134
|
# Accessor: the stored max_doc value (set by _init_sub_readers).
sub max_doc {
    my ($self) = @_;
    return $self->{max_doc};
}
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Return the sum of num_docs over all sub-readers (0 when there are none).
sub num_docs {
    my ($self) = @_;

    my $total = 0;
    foreach my $reader ( @{ $self->{sub_readers} } ) {
        $total += $reader->num_docs;
    }

    return $total;
}
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Build a KinoSearch1::Index::MultiTermDocs over this reader's sub-readers
# and start offsets, seek it to $term, and return it.
sub term_docs {
    my ( $self, $term ) = @_;

    my %ctor_args = (
        sub_readers => $self->{sub_readers},
        starts      => $self->{starts},
    );
    my $multi_term_docs = KinoSearch1::Index::MultiTermDocs->new(%ctor_args);
    $multi_term_docs->seek($term);

    return $multi_term_docs;
}
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Return the sum of doc_freq($term) over all sub-readers.
sub doc_freq {
    my ( $self, $term ) = @_;

    my $total = 0;
    foreach my $reader ( @{ $self->{sub_readers} } ) {
        $total += $reader->doc_freq($term);
    }

    return $total;
}
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Fetch a document by its number in the combined numbering: find the
# sub-reader that holds it, subtract that reader's start offset, and delegate
# to the sub-reader's own fetch_doc.
sub fetch_doc {
    my ( $self, $doc_num ) = @_;

    my $tick          = $self->_reader_index($doc_num);
    my $local_doc_num = $doc_num - $self->{starts}[$tick];

    return $self->{sub_readers}[$tick]->fetch_doc($local_doc_num);
}
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Forward delete_docs_by_term($term) to every sub-reader, in order.
sub delete_docs_by_term {
    my ( $self, $term ) = @_;

    foreach my $reader ( @{ $self->{sub_readers} } ) {
        $reader->delete_docs_by_term($term);
    }
}
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Forward commit_deletions to every sub-reader, in order.
sub commit_deletions {
    my ($self) = @_;

    foreach my $reader ( @{ $self->{sub_readers} } ) {
        $reader->commit_deletions;
    }
}
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Determine which sub-reader a document resides in.
#
# Binary-searches $self->{starts} for $doc_num and returns an index into that
# array.  On an exact hit the index is advanced past any following entries
# that carry the same start value, so the last of them is returned.  Without
# an exact hit the upper bound left by the search is returned, i.e. the last
# entry whose start is below $doc_num (-1 if there is none).
sub _reader_index {
    my ( $self, $doc_num ) = @_;

    my $offsets = $self->{starts};
    my $last    = $#$offsets;
    my $low     = 0;
    my $high    = $last;

    while ( $low <= $high ) {
        my $probe       = ( $low + $high ) >> 1;
        my $probe_start = $offsets->[$probe];

        if ( $doc_num == $probe_start ) {
            $probe++
                while $probe < $last
                and $offsets->[ $probe + 1 ] == $probe_start;
            return $probe;
        }

        if ( $doc_num < $probe_start ) {
            $high = $probe - 1;
        }
        else {
            $low = $probe + 1;
        }
    }

    return $high;
}
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# Return a norms reader for $field_num covering all sub-readers.
#
# A cached entry in $self->{norms_cache} is returned as-is.  Otherwise the
# norm bytes of every sub-reader that yields a norms reader for the field are
# concatenated in sub-reader order, wrapped in a new
# KinoSearch1::Index::NormsReader, cached and returned.
#
# TODO refactor and minimize copying
sub norms_reader {
    my ( $self, $field_num ) = @_;

    return $self->{norms_cache}{$field_num}
        if exists $self->{norms_cache}{$field_num};

    my $combined_bytes = '';
    foreach my $seg_reader ( @{ $self->{sub_readers} } ) {
        # NOTE(review): a sub-reader without norms for this field adds no
        # bytes at all -- confirm that this is the intended layout.
        my $seg_norms = $seg_reader->norms_reader($field_num) or next;
        $combined_bytes .= ${ $seg_norms->get_bytes };
    }

    my $merged = KinoSearch1::Index::NormsReader->new(
        bytes   => $combined_bytes,
        max_doc => $self->max_doc,
    );
    $self->{norms_cache}{$field_num} = $merged;

    return $merged;
}
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Create a new KinoSearch1::Index::FieldInfos, collect the result of
# generate_field_infos from every sub-reader, pass them all to consolidate()
# on the new object, and return it.
sub generate_field_infos {
    my ($self) = @_;

    my $combined = KinoSearch1::Index::FieldInfos->new;

    my @per_reader;
    foreach my $reader ( @{ $self->{sub_readers} } ) {
        push @per_reader, $reader->generate_field_infos;
    }
    $combined->consolidate(@per_reader);

    return $combined;
}
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Return an array ref holding the distinct field names reported by the
# sub-readers.  The names come out of a hash, so their order is unspecified.
sub get_field_names {
    my ($self) = @_;

    my %seen;
    foreach my $reader ( @{ $self->{sub_readers} } ) {
        my $names = $reader->get_field_names;
        $seen{$_} = 1 for @$names;
    }

    return [ keys %seen ];
}
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Choose the sub-readers whose segments should be merged.
#
# Arguments:
#   $all - when true, every sub-reader is returned (in stored order).
#
# Returns an empty list when there are no sub-readers or when no segment
# qualifies.  Otherwise the sub-readers are ordered by ascending num_docs and
# a running total of their doc counts is kept; position $i qualifies when the
# total up to and including it is below fibonacci( $i + 5 ).  The readers from
# the smallest up to the last qualifying position are returned.
#
# Each reader's num_docs is read exactly once and reused for both the sort
# and the running total, instead of being re-queried on every comparison.
sub segreaders_to_merge {
    my ( $self, $all ) = @_;
    return unless @{ $self->{sub_readers} };
    return @{ $self->{sub_readers} } if $all;

    # pair each reader with its doc count, then sort by ascending size
    my @sized = sort { $a->[0] <=> $b->[0] }
        map { [ $_->num_docs, $_ ] } @{ $self->{sub_readers} };
    my @sorted_sub_readers = map { $_->[1] } @sized;

    # find sparsely populated segments
    my $total_docs = 0;
    my $threshold  = -1;
    for my $i ( 0 .. $#sized ) {
        $total_docs += $sized[$i][0];
        $threshold = $i if $total_docs < fibonacci( $i + 5 );
    }

    # if any of the segments are sparse, return their readers
    return if $threshold < 0;
    return @sorted_sub_readers[ 0 .. $threshold ];
}
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# Generate fibonacci series.  Results are memoized in %fibo_cache, so every
# value is computed at most once per process.
my %fibo_cache;

# Return the $n-th Fibonacci number; any $n below 2 is returned unchanged.
sub fibonacci {
    my ($n) = @_;

    unless ( exists $fibo_cache{$n} ) {
        my $value;
        if ( $n < 2 ) {
            $value = $n;
        }
        else {
            $value = fibonacci( $n - 1 ) + fibonacci( $n - 2 );
        }
        $fibo_cache{$n} = $value;
    }

    return $fibo_cache{$n};
}
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# Close every sub-reader, but only when $self->{close_invindex} is true;
# otherwise return immediately without touching them.
sub close {
    my ($self) = @_;
    return unless $self->{close_invindex};

    foreach my $reader ( @{ $self->{sub_readers} } ) {
        $reader->close;
    }
}
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1; |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |