File Coverage

blib/lib/DBIx/TextIndex.pm

Criterion	Covered	Total	%
statement	96	1173	8.1
branch	1	454	0.2
condition	0	94	0.0
subroutine	32	101	31.6
pod	19	30	63.3
total	148	1852	7.9

line	stmt	bran	cond	sub	pod	time	code
1							package DBIx::TextIndex;
2
3	8			8		761982	use strict;
	8					25
	8					326
4	8			8		45	use warnings;
	8					17
	8					712
5
6							our $VERSION = '0.28';
7
8							require XSLoader;
9							XSLoader::load('DBIx::TextIndex', $VERSION);
10
11	8			8		8974	use Bit::Vector ();
	8					13667
	8					262
12	8			8		65	use Carp qw(carp croak);
	8					14
	8					738
13	8			8		5987	use DBIx::TextIndex::Exception qw(:all);
	8					29
	8					1264
14	8			8		5845	use DBIx::TextIndex::QueryParser;
	8					34
	8					347
15	8			8		6374	use DBIx::TextIndex::TermDocsCache;
	8					27
	8					222
16	8			8		8485	use HTML::Entities ();
	8					69472
	8					985
17
18							my $unac;
19							BEGIN {
20	8			8		19	eval { require Text::Unaccent; import Text::Unaccent qw(unac_string) };
	8					3828
	0					0
21	8	50				250	$unac = $@ ? 0 : 1;
22							}
23	8			8		46	use constant DO_UNAC => $unac;
	8					13
	8					831
24	8			8		49	use constant COLLECTION_NAME_MAX_LENGTH => 100;
	8					16
	8					367
25
26							# Version number when collection table definition last changed
27	8			8		96	use constant LAST_COLLECTION_TABLE_UPGRADE => 0.24;
	8					16
	8					536
28
29							# Largest size word to be indexed
30	8			8		38	use constant MAX_WORD_LENGTH => 20;
	8					14
	8					428
31
32							# Minimum number of alphanumeric characters in a term before a wildcard
33	8			8		68	use constant MIN_WILDCARD_LENGTH => 1;
	8					16
	8					368
34
35							# Maximum number of words a wildcard term can expand to
36	8			8		187	use constant MAX_WILDCARD_TERM_EXPANSION => 30;
	8					13
	8					402
37
38							# Used to screen stop words from the scoring process
39	8			8		37	use constant IDF_MIN_OKAPI => -1.8;
	8					12
	8					423
40
41							# What can be considered too many results, NO LONGER USED
42	8			8		65	use constant RESULT_THRESHOLD => 5000;
	8					13
	8					342
43
44							# Document score accumulator, higher numbers increase scoring accuracy
45							# but use more memory and cpu
46	8			8		37	use constant ACCUMULATOR_LIMIT => 20000;
	8					12
	8					440
47
48							# Clear out the hash key caches after this many searches
49	8			8		59	use constant SEARCH_CACHE_FLUSH_INTERVAL => 1000;
	8					115
	8					397
50
51							# Practical number of rows RDBMS can scan in acceptable amount of time
52	8			8		35	use constant PHRASE_THRESHOLD => 1000;
	8					10
	8					361
53
54							# Charset of data to be indexed
55	8			8		34	use constant CHARSET => 'iso-8859-1';
	8					12
	8					344
56
57							# SQL datatype to store document keys
58	8			8		33	use constant DOC_KEY_SQL_TYPE => 'varchar';
	8					18
	8					348
59
60							# Maximum length of above key
61	8			8		36	use constant DOC_KEY_LENGTH => '200';
	8					11
	8					1211
62
63
64							my %ERROR = (
65							empty_query => "You must be searching for something!",
66							quote_count => "Quotes must be used in matching pairs.",
67							no_results => "Your search did not produce any matching documents.",
68							no_results_stop => "Your search did not produce any matching " .
69							"documents. These common words were not included in the search:",
70
71							wildcard_length => MIN_WILDCARD_LENGTH > 1
72							?
73							"Use at least " . MIN_WILDCARD_LENGTH . " letters or " .
74							"numbers at the beginning of the word before wildcard characters."
75							:
76							"Use at least one letter or number at the beginning of the word " .
77							"before wildcard characters.",
78							wildcard_expansion => "The wildcard term you used was too broad, " .
79							"please use more characters before or after the wildcard",
80							);
81
82							my @MASK_TYPES = qw(and_mask or_mask not_mask);
83
84	8			8		39	use constant COLLECTION_TABLE => 'collection';
	8					13
	8					6934
85
86							my @COLLECTION_FIELDS = qw(
87							collection
88							version
89							max_indexed_id
90							doc_table
91							doc_id_field
92							doc_fields
93							charset
94							stoplist
95							proximity_index
96							error_empty_query
97							error_quote_count
98							error_no_results
99							error_no_results_stop
100							error_wildcard_length
101							error_wildcard_expansion
102							max_word_length
103							result_threshold
104							phrase_threshold
105							min_wildcard_length
106							max_wildcard_term_expansion
107							decode_html_entities
108							scoring_method
109							update_commit_interval
110							);
111
112							my %COLLECTION_FIELD_DEFAULT = (
113							collection => '',
114							version => $DBIx::TextIndex::VERSION,
115							max_indexed_id => '0',
116							doc_table => '',
117							doc_id_field => '',
118							doc_fields => '',
119							charset => CHARSET,
120							stoplist => '',
121							proximity_index => '1',
122							error_quote_count => $ERROR{quote_count},
123							error_empty_query => $ERROR{empty_query},
124							error_no_results => $ERROR{no_results},
125							error_no_results_stop => $ERROR{no_results_stop},
126							error_wildcard_length => $ERROR{wildcard_length},
127							error_wildcard_expansion => $ERROR{wildcard_expansion},
128							max_word_length => MAX_WORD_LENGTH,
129							result_threshold => RESULT_THRESHOLD,
130							phrase_threshold => PHRASE_THRESHOLD,
131							min_wildcard_length => MIN_WILDCARD_LENGTH,
132							max_wildcard_term_expansion => MAX_WILDCARD_TERM_EXPANSION,
133							decode_html_entities => '1',
134							scoring_method => 'okapi',
135							update_commit_interval => 20000,
136							);
137
138
139							my $PA = 0; # just a shortcut to $self->{PRINT_ACTIVITY}
140
141							sub new {
142	0			0	1		my $pkg = shift;
143	0						my $args = shift;
144
145	0		0				my $class = ref($pkg) \|\| $pkg;
146	0						my $self = bless {}, $class;
147
148	0						$self->{COLLECTION_FIELDS} = \@COLLECTION_FIELDS;
149
150	0						foreach my $arg ('collection', 'index_dbh') {
151	0	0					if ($args->{$arg}) {
152	0						$self->{uc $arg} = $args->{$arg};
153							} else {
154	0						throw_gen( error => "new $pkg needs $arg argument" );
155							}
156							}
157
158	0						my $coll = $self->{COLLECTION};
159
160	0	0					if ($args->{doc_dbh}) {
161	0						$self->{DOC_DBH} = $args->{doc_dbh};
162							}
163
164							# deprecated arguments
165	0	0					if ($args->{db}) {
166	0						throw_gen( error => "new $pkg no longer needs 'db' argument" );
167							}
168
169							# term_docs field can have character 32 at end of string,
170							# so DBI ChopBlanks must be turned off
171	0						$self->{INDEX_DBH}->{ChopBlanks} = 0;
172
173	0						$self->{PRINT_ACTIVITY} = 0;
174	0		0				$self->{PRINT_ACTIVITY} = $args->{'print_activity'} \|\| 0;
175	0						$PA = $self->{PRINT_ACTIVITY};
176
177	0						$args->{dbd} = $self->{INDEX_DBH}->{Driver}->{Name};
178	0						my $dbd_class = 'DBIx::TextIndex::DBD::' . $args->{dbd};
179	0						eval "require $dbd_class";
180	0	0					throw_gen( error => "Unsupported DBD driver: $dbd_class",
181							detail => $@ ) if $@;
182	0						$self->{DB} = $dbd_class->new({
183							index_dbh => $self->{INDEX_DBH},
184							collection_table => COLLECTION_TABLE,
185							collection_fields => $self->{COLLECTION_FIELDS},
186							});
187
188	0						$self->{DBD_TYPE} = $args->{dbd};
189
190	0	0					unless ($self->_fetch_collection_info) {
191	0						$self->{DOC_TABLE} = $args->{doc_table};
192	0						$self->{DOC_FIELDS} = $args->{doc_fields};
193	0						$self->{DOC_ID_FIELD} = $args->{doc_id_field};
194
195	0						$self->{STOPLIST} = $args->{stoplist};
196
197							# override default error messages
198	0						while (my($error, $msg) = each %{$args->{errors}}) {
	0
199	0						$ERROR{$error} = $msg;
200							}
201
202	0						foreach my $field ( qw(max_word_length
203							result_threshold
204							phrase_threshold
205							min_wildcard_length
206							max_wildcard_term_expansion
207							decode_html_entities
208							scoring_method
209							update_commit_interval
210							charset
211							proximity_index) )
212							{
213	0	0					$self->{uc($field)} = defined $args->{$field} ?
214							$args->{$field} :
215							$COLLECTION_FIELD_DEFAULT{$field};
216							}
217							}
218	0	0					$self->{CZECH_LANGUAGE} = $self->{CHARSET} eq 'iso-8859-2' ? 1 : 0;
219	0						$self->{MASK_TABLE} = $coll . '_mask';
220	0						$self->{DOCWEIGHTS_TABLE} = $coll . '_docweights';
221	0						$self->{ALL_DOCS_VECTOR_TABLE} = $coll . '_all_docs_vector';
222	0						$self->{DELETE_QUEUE_TABLE} = $coll . '_delete_queue';
223	0						$self->{DOC_KEY_TABLE} = $coll . '_doc_key';
224
225							# Field number, assign each field a number 0 .. N
226	0						my $fno = 0;
227
228	0						foreach my $field ( @{$self->{DOC_FIELDS}} ) {
	0
229	0						$self->{FIELD_NO}->{$field} = $fno;
230	0						push @{$self->{INVERTED_TABLES}},
	0
231							($coll . '_' . $field . '_inverted');
232	0						$fno++;
233							}
234
235							# Initialize stoplists
236	0	0	0				if ($self->{STOPLIST} and ref($self->{STOPLIST})) {
237	0						$self->{STOPLISTED_WORDS} = {};
238	0						foreach my $stoplist (@{$self->{STOPLIST}}) {
	0
239	0						my $stop_package = "DBIx::TextIndex::StopList::$stoplist";
240	0	0					_log("initializing stoplist: $stop_package\n") if $PA;
241	0						eval "require $stop_package";
242	8			8		48	no strict 'refs';
	8					17
	8					48937
243	0						my @words = @{$stop_package . '::words'};
	0
244	0						foreach my $word (@words) {
245	0						$self->{STOPLISTED_WORDS}->{$word} = 1;
246							}
247							}
248							}
249	0						$self->{STOPLISTED_QUERY} = [];
250
251							# Database driver object
252	0	0	0				$self->{DB}->set({
253							all_docs_vector_table => $self->{ALL_DOCS_VECTOR_TABLE},
254							delete_queue_table => $self->{DELETE_QUEUE_TABLE},
255							doc_table => $self->{DOC_TABLE},
256							doc_fields => $self->{DOC_FIELDS},
257							doc_id_field => $self->{DOC_ID_FIELD},
258							docweights_table => $self->{DOCWEIGHTS_TABLE},
259							doc_key_table => $self->{DOC_KEY_TABLE},
260							mask_table => $self->{MASK_TABLE},
261							max_word_length => $self->{MAX_WORD_LENGTH},
262							doc_key_sql_type => $args->{doc_key_sql_type} \|\| DOC_KEY_SQL_TYPE,
263							doc_key_length => exists $args->{doc_key_length} ?
264							$args->{doc_key_length} : DOC_KEY_LENGTH,
265							});
266
267							# Cache for term_doc postings
268	0						$self->{C} = DBIx::TextIndex::TermDocsCache->new({
269							db => $self->{DB},
270							index_dbh => $self->{INDEX_DBH},
271							max_indexed_id => $self->max_indexed_id,
272							inverted_tables => $self->{INVERTED_TABLES},
273							});
274
275							# Query parser object
276	0						$self->{QP} = DBIx::TextIndex::QueryParser->new({
277							charset => $self->{CHARSET},
278							stoplist => $self->{STOPLIST},
279							stoplisted_words => $self->{STOPLISTED_WORDS}
280							});
281
282							# Number of searches performed on this instance
283	0						$self->{SEARCH_COUNT} = 0;
284	0						return $self;
285							}
286
287							sub add_mask {
288	0			0	1		my $self = shift;
289	0						my $mask = shift;
290	0						my $doc_keys = shift;
291
292	0						my $ids = $self->{DB}->fetch_doc_ids($doc_keys);
293
294	0						my $max_indexed_id = $self->max_indexed_id;
295
296							# Trim ids from end instead here.
297	0	0					if ($ids->[-1] > $max_indexed_id) {
298	0						throw_gen( error => "Greatest doc_id ($ids->[-1]) in mask ($mask) is larger than greatest doc_id in index" );
299							}
300
301	0						my $vector = Bit::Vector->new($max_indexed_id + 1);
302	0						$vector->Index_List_Store(@$ids);
303
304	0	0					_log("Adding mask ($mask) to table $self->{MASK_TABLE}\n") if $PA > 1;
305	0						$self->{DB}->add_mask($mask, $vector->to_Enum);
306	0						return 1;
307							}
308
309							sub _log {
310	0			0			my @messages = @_;
311	0						print @messages;
312							}
313
314							sub delete_mask {
315	0			0	1		my $self = shift;
316	0						my $mask = shift;
317	0	0					_log("Deleting mask ($mask) from table $self->{MASK_TABLE}\n") if $PA > 1;
318	0						$self->{INDEX_DBH}->do($self->{DB}->delete_mask, undef, $mask);
319							}
320
321							# Stub method for older deprecated name
322	0			0	1		sub add_document { shift->add_doc(@_) }
323
324							sub add_doc {
325	0			0	1		my $self = shift;
326	0						my @keys = @_;
327
328	0	0					throw_gen( error => 'add_doc() needs doc_dbh to be defined' ) unless
329							defined $self->{DOC_DBH};
330
331	0						my $keys;
332	0	0					if (ref $keys[0] eq 'ARRAY') {
		0
333	0						$keys = $keys[0];
334							} elsif ($keys[0] =~ m/^\d+$/) {
335	0						$keys = \@keys;
336							}
337
338	0	0					return if $#$keys < 0;
339
340	0						my $add_count_guess = $#$keys + 1;
341	0						my $add_count = 0;
342	0	0					_log("Adding $add_count_guess docs\n") if $PA;
343
344	0						my @added_ids;
345	0						my $batch_count = 0;
346
347	0						foreach my $doc_key (@$keys) {
348	0	0					unless ($self->_ping_doc($doc_key)) {
349	0	0					_log("$doc_key skipped, no doc $doc_key found\n") if $PA;
350	0						next;
351							}
352
353	0						my $doc_id =
354							$self->_add_one($doc_key, $self->_fetch_doc_all_fields($doc_key));
355
356	0						push @added_ids, $doc_id;
357	0						$add_count++;
358	0						$batch_count++;
359	0	0	0				if ($self->{UPDATE_COMMIT_INTERVAL}
360							&& $batch_count >= $self->{UPDATE_COMMIT_INTERVAL}) {
361							# Update database
362	0						$self->_commit_docs(\@added_ids);
363	0						$batch_count = 0;
364	0						@added_ids = ();
365							}
366
367							} # end of doc indexing
368
369
370							# Update database
371	0						$self->_commit_docs(\@added_ids);
372	0						return $add_count;
373							}
374
375							sub _add_one {
376	0			0			my $self = shift;
377	0						my ($doc_key, $doc_fields) = @_;
378
379	0						my $doc_id = $self->{DB}->fetch_doc_id($doc_key);
380	0	0					if (defined $doc_id) {
381							# FIXME: need optimization if more than one doc is replaced at once
382	0	0					_log("Replacing doc $doc_key\n") if $PA;
383	0						$self->_remove($doc_id);
384							}
385	0						$doc_id = $self->{DB}->insert_doc_key($doc_key);
386
387	0						my $do_prox = $self->{PROXIMITY_INDEX};
388
389	0	0					_log("$doc_key - $doc_id") if $PA;
390
391	0						foreach my $fno ( 0 .. $#{$self->{DOC_FIELDS}} ) {
	0
392	0						my $field = $self->{DOC_FIELDS}->[$fno];
393	0	0					_log(" $field") if $PA;
394
395	0						my %positions;
396							my %frequency;
397
398	0						my @terms = $self->_terms($doc_fields->{$field});
399
400	0						my $term_count = 1;
401	0						foreach my $term (@terms) {
402	0	0					push @{$positions{$term}}, $term_count if $do_prox;
	0
403	0						$frequency{$term}++;
404	0						$term_count++;
405							}
406	0	0					_log(" $term_count") if $PA;
407	0						while (my ($term, $frequency) = each %frequency) {
408	0						$self->_docs($fno, $term, $doc_id, $frequency);
409	0	0					$self->_positions($fno, $term, $positions{$term}) if $do_prox;
410							}
411							# Doc weight
412	0	0					$self->{NEW_W_D}->[$fno]->[$doc_id] = $term_count ?
413							sprintf("%.5f", sqrt((1 + log($term_count))**2)) : 0;
414							} # end of field indexing
415	0	0					_log("\n") if $PA;
416	0						return $doc_id;
417							}
418
419							sub add {
420	0			0	1		my $self = shift;
421
422	0						my $add_count = 0;
423	0	0					unless ($self->{IN_ADD_TRANSACTION}) {
424	0						$self->{ADD_BATCH_COUNT} = 0;
425	0						$self->{ADDED_IDS} = [];
426							}
427
428	0						while (my ($doc_key, $doc_fields) = splice(@_, 0, 2)) {
429	0						my $doc_id = $self->_add_one($doc_key, $doc_fields);
430	0						push @{$self->{ADDED_IDS}}, $doc_id;
	0
431	0						$add_count++;
432	0						$self->{ADD_BATCH_COUNT}++;
433	0	0	0				if ($self->{UPDATE_COMMIT_INTERVAL}
434							&& $self->{ADD_BATCH_COUNT} >= $self->{UPDATE_COMMIT_INTERVAL}) {
435							# Update database
436	0						$self->_commit_docs();
437	0						$self->{ADD_BATCH_COUNT} = 0;
438	0						$self->{ADDED_IDS} = [];
439							}
440							}
441
442							# Update database
443	0	0					unless ($self->{IN_ADD_TRANSACTION}) {
444	0						$self->_commit_docs();
445	0						delete($self->{ADDED_IDS});
446							}
447	0						return $add_count;
448							}
449
450							sub begin_add {
451	0			0	1		my $self = shift;
452	0						$self->{IN_ADD_TRANSACTION} = 1;
453	0						$self->{ADD_BATCH_COUNT} = 0;
454	0						$self->{ADDED_IDS} = [];
455							}
456
457							sub commit_add {
458	0			0	1		my $self = shift;
459	0						$self->_commit_docs();
460	0						delete($self->{ADDED_IDS});
461	0						$self->{IN_ADD_TRANSACTION} = 0;
462							}
463
464							# Stub methods for older deprecated names
465	0			0	1		sub remove_document { shift->remove(@_) }
466	0			0	1		sub remove_doc { shift->remove(@_) }
467
468							sub remove {
469	0			0	1		my $self = shift;
470	0						my @doc_keys = @_;
471
472	0						my $doc_keys;
473	0	0					if (ref $doc_keys[0] eq 'ARRAY') {
		0
474	0						$doc_keys = $doc_keys[0];
475							} elsif ($doc_keys[0] =~ m/^\d+$/) {
476	0						$doc_keys = \@doc_keys;
477							}
478
479	0						my $doc_ids = $self->{DB}->fetch_doc_ids($doc_keys);
480	0						return $self->_remove($doc_ids);
481							}
482
483							sub _remove {
484	0			0			my $self = shift;
485	0						my @ids = @_;
486
487	0						my $ids;
488	0	0					if (ref $ids[0] eq 'ARRAY') {
		0
489	0						$ids = $ids[0];
490							} elsif ($ids[0] =~ m/^\d+$/) {
491	0						$ids = \@ids;
492							}
493
494	0	0					return if $#$ids < 0;
495
496	0						my $remove_count = $#$ids + 1;
497	0	0					_log("Removing $remove_count docs\n") if $PA;
498
499	0	0					_log("Removing docs from docweights table\n") if $PA;
500	0						$self->_docweights_remove($ids);
501
502	0						$self->_all_doc_ids_remove($ids);
503
504	0						$self->{DB}->delete_doc_key_doc_ids($ids);
505
506	0						$self->_add_to_delete_queue($ids);
507
508	0						return $remove_count; # return count of removed ids
509							}
510
511							sub _docweights_remove {
512	0			0			my $self = shift;
513	0						my $docs_ref = shift;
514
515	0						my @docs = @{$docs_ref};
	0
516	0						my $use_all_fields = 1;
517	0						$self->_fetch_docweights($use_all_fields);
518
519	0						my $sql = $self->{DB}->update_docweights;
520	0						my $sth = $self->{INDEX_DBH}->prepare($sql);
521	0						foreach my $fno ( 0 .. $#{$self->{DOC_FIELDS}} ) {
	0
522	0						my @w_d = @{$self->{W_D}->[$fno]};
	0
523	0						foreach my $doc_id (@docs) {
524	0						$w_d[$doc_id] = 0;
525							}
526	0						my $packed_w_d = pack 'f*', @w_d;
527							# FIXME: we should update the average, leave it alone for now
528	0						$self->{DB}->update_docweights_execute(
529							$sth,
530							$fno,
531							$self->{AVG_W_D}->[$fno],
532							$packed_w_d
533							);
534							}
535	0						$sth->finish;
536							}
537
538							sub stat {
539	0			0	1		my $self = shift;
540	0						my $query = shift;
541
542	0	0					if (lc($query) eq 'total_words') {
543	0						my $total_terms = 0;
544	0						foreach my $table (@{$self->{INVERTED_TABLES}}) {
	0
545	0						my $sql = $self->{DB}->total_terms($table);
546	0						$total_terms += scalar $self->{INDEX_DBH}->selectrow_array($sql);
547							}
548	0						return $total_terms;
549							}
550
551	0						return undef;
552							}
553
554							sub unscored_search {
555	0			0	1		my $self = shift;
556	0						my $query = shift;
557	0						my $args = shift;
558	0						$args->{unscored_search} = 1;
559	0						return $self->search($query, $args);
560							}
561
562							sub search {
563	0			0	1		my $self = shift;
564	0						my $query = shift;
565	0						my $args = shift;
566
567	0						$self->{SEARCH_COUNT}++;
568
569	0						$self->_flush_cache;
570
571	0						$self->{OR_TERM_COUNT} = 0;
572	0						$self->{AND_TERM_COUNT} = 0;
573
574	0	0					throw_query( error => $ERROR{empty_query}) unless $query;
575
576	0						my @query_field_nos;
577							my %term_field_nos;
578	0						while (my ($field, $query_string) = each %$query) {
579	0	0					next unless $query_string =~ m/\S+/;
580	0	0					throw_gen( error => "invalid field ($field) in search()" )
581							unless exists $self->{FIELD_NO}->{$field};
582	0						my $fno = $self->{FIELD_NO}->{$field};
583	0						$self->{QUERY}->[$fno] = $self->{QP}->parse($query_string);
584	0						$self->{STOPLISTED_QUERY} = $self->{QP}->stoplisted_query;
585	0						foreach my $fld ($self->{QP}->term_fields) {
586	0	0					if ($fld eq '__DEFAULT') {
587	0						$term_field_nos{$fno}++;
588							} else {
589	0	0					if (exists $self->{FIELD_NO}->{$fld}) {
590	0						$term_field_nos{$self->{FIELD_NO}->{$fld}}++;
591							}
592							# FIXME: should we throw a query exception here if $fld
593							# does not exist?
594							}
595							}
596	0						push @query_field_nos, $self->{FIELD_NO}->{$field};
597							}
598
599	0	0					throw_query( error => $ERROR{'empty_query'} )
600							unless $#query_field_nos >= 0;
601
602	0						@{$self->{QUERY_FIELD_NOS}} = sort { $a <=> $b } @query_field_nos;
	0
	0
603	0						@{$self->{TERM_FIELD_NOS}} = sort { $a <=> $b } keys %term_field_nos;
	0
	0
604
605	0						foreach my $mask_type (@MASK_TYPES) {
606	0	0					if ($args->{$mask_type}) {
607	0						$self->{MASK}->{$mask_type} = $args->{$mask_type};
608	0						foreach my $mask (@{$args->{$mask_type}}) {
	0
609	0	0					if (ref $mask) {
610	0						$self->{VALID_MASK} = 1;
611							} else {
612	0						push @{$self->{MASK_FETCH_LIST}}, $mask;
	0
613							}
614							}
615							}
616							}
617
618	0	0					if ($args->{or_mask_set}) {
619	0						$self->{MASK}->{or_mask_set} = $args->{or_mask_set};
620	0						foreach my $mask_set (@{$args->{or_mask_set}}) {
	0
621	0						foreach my $mask (@$mask_set) {
622	0	0					if (ref $mask) {
623	0						$self->{VALID_MASK} = 1;
624							} else {
625	0						push @{$self->{MASK_FETCH_LIST}}, $mask;
	0
626							}
627							}
628							}
629							}
630
631	0						$self->_optimize_or_search;
632	0						$self->_resolve_mask;
633	0						$self->_boolean_search;
634
635	0	0					if ($args->{unscored_search}) {
636	0						my @result_docs = $self->{RESULT_VECTOR}->Index_List_Read;
637	0	0					throw_query( error => $ERROR{'no_results'} ) if $#result_docs < 0;
638	0						return $self->{DB}->fetch_doc_keys(\@result_docs);
639							}
640
641	0		0				my $scoring_method = $args->{scoring_method} \|\| $self->{SCORING_METHOD};
642
643	0						my $results = {};
644	0	0					if ($scoring_method eq 'okapi') {
645	0						$results = $self->_search_okapi;
646							} else {
647	0						throw_gen( error => "Invalid scoring method $scoring_method, only choice is okapi");
648							}
649	0						$self->{C}->flush_term_docs;
650
651	0						return $results;
652							}
653
654
655							sub _boolean_search {
656	0			0			my $self = shift;
657	0						$self->fetch_all_docs_vector;
658
659	0						my @query_fnos = @{$self->{QUERY_FIELD_NOS}};
	0
660
661	0	0					if ($#query_fnos == 0) {
662	0						my $fno = $query_fnos[0];
663	0						$self->{RESULT_VECTOR} =
664							$self->_boolean_search_field($fno, $self->{QUERY}->[$fno]);
665							} else {
666	0						my $max_id = $self->max_indexed_id + 1;
667	0						$self->{RESULT_VECTOR} = Bit::Vector->new($max_id);
668	0						foreach my $fno (@query_fnos) {
669	0						my $field_vec =
670							$self->_boolean_search_field($fno, $self->{QUERY}->[$fno]);
671	0						$self->{RESULT_VECTOR}->Union($self->{RESULT_VECTOR}, $field_vec);
672							}
673							}
674
675	0	0					if ($self->{RESULT_MASK}) {
676	0						$self->{RESULT_VECTOR}->Intersection($self->{RESULT_VECTOR},
677							$self->{RESULT_MASK});
678							}
679
680	8			8		94	no warnings qw(uninitialized);
	8					19
	8					1952
681	0						foreach my $fno (@{$self->{TERM_FIELD_NOS}}) {
	0
682	0						my %f_t;
683	0						foreach my $term (@{$self->{TERMS}->[$fno]}) {
	0
684	0						$f_t{$term} = $self->{C}->f_t($fno, $term);
685							# query term frequency
686	0						$self->{F_QT}->[$fno]->{$term}++;
687							}
688							# Set TERMS to frequency-sorted list
689	0						my @freq_sort = sort {$f_t{$a} <=> $f_t{$b}} keys %f_t;
	0
690	0						$self->{TERMS}->[$fno] = \@freq_sort;
691							}
692							}
693
694							sub _boolean_search_field {
695
696	8			8		44	no warnings qw(uninitialized);
	8					17
	8					22125
697
698	0			0			my $self = shift;
699	0						my ($field_no, $clauses) = @_;
700
701	0						my $max_id = $self->max_indexed_id + 1;
702	0						my $field_vec = $self->{ALL_DOCS_VECTOR}->Clone;
703
704	0						my @or_vecs;
705
706	0						my $scorable_clause_count = 0; # Any clause without 'NOT' modifier
707
708	0						foreach my $clause (@$clauses) {
709	0						my $clause_vec;
710	0						my $expanded_terms = [];
711	0						my $fno = $field_no;
712	0	0					if (exists $self->{FIELD_NO}->{$clause->{FIELD}}) {
713	0						$fno = $self->{FIELD_NO}->{$clause->{FIELD}};
714							}
715	0	0	0				if ($clause->{TYPE} eq 'QUERY') {
		0
		0
		0
		0
716	0						$clause_vec =
717							$self->_boolean_search_field($fno, $clause->{QUERY});
718							} elsif ($clause->{TYPE} eq 'PLURAL') {
719	0						($clause_vec, $expanded_terms) =
720							$self->_resolve_plural($fno, $clause->{TERM});
721							} elsif ($clause->{TYPE} eq 'WILD') {
722	0						($clause_vec, $expanded_terms) =
723							$self->_resolve_wild($fno, $clause->{TERM});
724							} elsif ($clause->{TYPE} eq 'PHRASE'
725							\|\| $clause->{TYPE} eq 'IMPLICITPHRASE') {
726	0						$clause_vec = $self->_resolve_phrase($fno, $clause);
727							} elsif ($clause->{TYPE} eq 'TERM') {
728	0						$clause_vec = $self->{C}->vector($fno, $clause->{TERM});
729							} else {
730	0						next;
731							}
732
733							# AND/OR terms will be used later in scoring process
734	0	0					unless ($clause->{MODIFIER} eq 'NOT') {
735	0	0	0				if ($clause->{TYPE} eq 'PHRASE'
		0	0
736							\|\| $clause->{TYPE} eq 'IMPLICITPHRASE') {
737	0						foreach my $term_clause (@{$clause->{PHRASETERMS}}) {
	0
738	0						push @{$self->{TERMS}->[$fno]}, $term_clause->{TERM};
	0
739							}
740							} elsif ($clause->{TYPE} eq 'WILD' \|\|
741							$clause->{TYPE} eq 'PLURAL') {
742	0						push @{$self->{TERMS}->[$fno]}, @$expanded_terms;
	0
743							} else {
744	0						push @{$self->{TERMS}->[$fno]}, $clause->{TERM};
	0
745							}
746	0						$scorable_clause_count++;
747							}
748
749	0	0	0				if ($clause->{MODIFIER} eq 'NOT') {
		0
		0
750	0						my $not_vec = $clause_vec->Clone;
751	0						$not_vec->Flip;
752	0						$field_vec->Intersection($field_vec, $not_vec);
753							} elsif ($clause->{MODIFIER} eq 'AND'
754							\|\| $clause->{CONJ} eq 'AND') {
755	0						$field_vec->Intersection($field_vec, $clause_vec);
756							} elsif ($clause->{CONJ} eq 'OR') {
757	0	0					if ($#or_vecs >= 0) {
758	0						my $all_ors_vec = Bit::Vector->new($max_id);
759	0						foreach my $or_vec (@or_vecs) {
760	0						$all_ors_vec->Union($all_ors_vec, $or_vec);
761							}
762	0						$field_vec->Intersection($field_vec, $all_ors_vec);
763	0						@or_vecs = ();
764							}
765	0						$field_vec->Union($field_vec, $clause_vec);
766							} else {
767	0						push @or_vecs, $clause_vec;
768							}
769							}
770
771							# Handle edge case where we only have NOT words
772	0	0					if ($scorable_clause_count <= 0) {
773	0						$field_vec->Empty;
774	0						return $field_vec;
775							}
776
777							# Take the union of all the OR terms and intersect with result vector
778	0	0					if ($#or_vecs >= 0) {
779	0						my $all_ors_vec = Bit::Vector->new($max_id);
780	0						foreach my $or_vec (@or_vecs) {
781	0						$all_ors_vec->Union($all_ors_vec, $or_vec);
782							}
783	0						$field_vec->Intersection($field_vec, $all_ors_vec);
784							}
785
786	0						return $field_vec;
787
788							}
789
790							sub _resolve_phrase {
791	0			0			my $self = shift;
792	0						my ($fno, $clause) = @_;
793	0						my (@term_docs, @term_pos);
794	0						my $max_id = $self->max_indexed_id + 1;
795	0						my $and_vec = Bit::Vector->new($max_id);
796	0						$and_vec->Fill;
797
798	0						foreach my $term_clause (@{$clause->{PHRASETERMS}}) {
	0
799	0						$and_vec->Intersection($and_vec,
800							$self->{C}->vector($fno, $term_clause->{TERM}));
801							}
802
803	0	0					if ($self->{RESULT_MASK}) {
804	0						$and_vec->Intersection($and_vec, $self->{RESULT_MASK});
805							}
806
807	0	0					return $and_vec if $and_vec->is_empty();
808
809
810	0						foreach my $term_clause (@{$clause->{PHRASETERMS}}) {
	0
811	0						my $term = $term_clause->{TERM};
812	0						push @term_docs, $self->{C}->term_docs($fno, $term);
813	0						push @term_pos, $self->{C}->term_pos($fno, $term);
814							}
815
816	0						my $phrase_ids;
817
818	0	0					if ($self->{PROXIMITY_INDEX}) {
819	0						$phrase_ids = pos_search($and_vec, \@term_docs, \@term_pos,
820							$clause->{PROXIMITY}, $and_vec->Min, $and_vec->Max);
821							} else {
822	0						my @and_ids = $and_vec->Index_List_Read;
823	0	0					return $and_vec if $#and_ids < 0;
824	0	0					return $and_vec if $#and_ids > $self->{PHRASE_THRESHOLD};
825	0						$phrase_ids = $self->_phrase_fullscan(\@and_ids,$fno, $clause->{TERM});
826							}
827
828	0						$and_vec->Empty;
829	0						$and_vec->Index_List_Store(@$phrase_ids);
830
831	0						return $and_vec;
832							}
833
834							# perl prototype, we use pos_search from TextIndex.xs
835							sub pos_search_perl {
836	0			0	0		my ($and_vec, $term_docs, $term_pos, $proximity) = @_;
837	0		0				$proximity \|\|= 1;
838	0						my @phrase_ids;
839	0						my $term_count = $#$term_docs + 1;
840	0						my $and_vec_min = $and_vec->Min;
841	0						my $and_vec_max = $and_vec->Max;
842	0	0					return if $and_vec_min <= 0;
843
844	0						my @pos_lists;
845							my @td; # term docs
846	0						my @last_td_pos;
847	0						my @pos_idx;
848	0						foreach my $i (0 .. $#$term_docs) {
849	0						@{$pos_lists[$i]} = unpack 'w*', $term_pos->[$i];
	0
850	0						$td[$i] = term_docs_arrayref($term_docs->[$i]);
851	0						$last_td_pos[$i] = 0;
852	0						$pos_idx[$i] = 0;
853							}
854
855	0						for (my $i = 0 ; $i <= $#{$td[0]} ; $i += 2) {
	0
856	0						my $doc_id = $td[0]->[$i];
857	0						my $freq = $td[0]->[$i+1];
858	0						$pos_idx[0] += $freq;
859	0	0					next if ($doc_id < $and_vec_min);
860	0	0					next unless $and_vec->contains($doc_id);
861	0						my @pos_delta =
862	0						@{$pos_lists[0]}[$pos_idx[0] - $freq .. $pos_idx[0] - 1];
863	0						my @pos_first_term;
864	0						push @pos_first_term, $pos_delta[0];
865	0						foreach my $a (1 .. $#pos_delta) {
866	0						push @pos_first_term, $pos_delta[$a] + $pos_first_term[$a - 1];
867							}
868	0						my @next_pos;
869	0						foreach my $j (1 .. $term_count - 1) {
870	0						my $freq = 0;
871	0						for (my $k = $last_td_pos[$j] ;
	0
872							$k <= $#{$td[$j]} ;
873							$k += 2)
874							{
875	0						my $id = $td[$j]->[$k];
876	0						$freq = $td[$j]->[$k+1];
877	0						$pos_idx[$j] += $freq;
878	0						$last_td_pos[$j] = $k;
879	0	0					if ($id >= $doc_id) {
880	0						$last_td_pos[$j] += 2;
881	0						last;
882							}
883
884							}
885	0						my @pos_delta =
886	0						@{$pos_lists[$j]}[$pos_idx[$j] - $freq .. $pos_idx[$j] - 1];
887	0						push @{$next_pos[$j]}, $pos_delta[0];
	0
888	0						foreach my $a (1 .. $#pos_delta) {
889	0						push @{$next_pos[$j]}, $pos_delta[$a] + $next_pos[$j]->[$a - 1];
	0
890							}
891							}
892	0						foreach my $pos (@pos_first_term) {
893	0						my $seq_count = 1;
894	0						my $last_pos = $pos;
895	0						foreach my $j (1 .. $term_count - 1) { # FIXME: short circuit the search by remember positions already looked at
896	0						foreach my $next_pos (@{$next_pos[$j]}) {
	0
897	0	0	0				if ($next_pos > $last_pos &&
898							$next_pos <= $last_pos + $proximity) {
899	0						$seq_count++;
900	0						$last_pos = $next_pos;
901							}
902							}
903							}
904	0	0					if ($seq_count == $term_count) {
905	0						push @phrase_ids, $doc_id;
906							}
907							}
908	0	0					last if $doc_id > $and_vec_max;
909							}
910	0						return \@phrase_ids;
911							}
912
913							sub _resolve_plural {
914	8			8		63	no warnings qw(uninitialized);
	8					19
	8					37000
915	0			0			my $self = shift;
916	0						my ($fno, $term) = @_;
917	0						my $max_id = $self->max_indexed_id + 1;
918	0						my $terms_union = Bit::Vector->new($max_id);
919	0						my $count = 0;
920	0						my $sum_f_t;
921							# FIXME: cheap hack
922							my $max_t;
923	0						my $max_f_t = 0;
924	0						foreach my $t ($term, $term.'s') {
925	0						my $f_t = $self->{C}->f_t($fno, $t);
926	0	0					if ($f_t) {
927	0						$count++;
928	0						$sum_f_t += $f_t;
929							}
930	0	0					$max_t = $t, $max_f_t = $f_t if $f_t > $max_f_t;
931	0						$terms_union->Union($terms_union, $self->{C}->vector($fno, $t));
932							}
933	0	0					if ($count) {
934	0						$self->{F_T}->[$fno]->{$term} = int($sum_f_t/$count);
935							# FIXME: need to do a real merge
936							# $self->{TERM_DOCS}->[$fno]->{$term} = $self->{C}->term_docs($fno, $max_t);
937							}
938	0						return $terms_union, [$term, $term.'s'];
939							}
940
941							sub _resolve_wild {
942	0			0			my $self = shift;
943	0						my ($fno, $term) = @_;
944	0						my $max_id = $self->max_indexed_id + 1;
945	0						my $prefix = (split(/[\*\?]/, $term))[0];
946	0	0					throw_query( error => $ERROR{wildcard_length} )
947							if length($prefix) < $self->{MIN_WILDCARD_LENGTH};
948	0						my $sql = $self->{DB}->fetch_terms($self->{INVERTED_TABLES}->[$fno]);
949	0						my $terms = [];
950	0						my $sql_term = $term;
951	0						$sql_term =~ tr/\*\?/%_/;
952	0						$terms = $self->{INDEX_DBH}->selectcol_arrayref($sql, undef, $sql_term);
953							# To save resources, check to make sure wildcard search is not too broad
954	0	0					throw_query( error => $ERROR{wildcard_expansion} )
955							if $#$terms + 1 > $self->{MAX_WILDCARD_TERM_EXPANSION};
956
957	0						my $terms_union = Bit::Vector->new($max_id);
958	0						my $count = 0;
959	0						my $sum_f_t;
960							# FIXME: cheap hack
961							my $max_t;
962	0						my $max_f_t = 0;
963	0						foreach my $t (@$terms) {
964	0						my $f_t = $self->{C}->f_t($fno, $t);
965	0	0					if ($f_t) {
966	0						$count++;
967	0						$sum_f_t += $f_t;
968							}
969	0	0					$max_t = $t, $max_f_t = $f_t if $f_t > $max_f_t;
970	0						$terms_union->Union($terms_union, $self->{C}->vector($fno, $t));
971							}
972	0	0					if ($count) {
973	0						$self->{F_T}->[$fno]->{$term} = int($sum_f_t/$count);
974							# FIXME: need to do a real merge
975							# $self->{TERM_DOCS}->[$fno]->{$term} = $self->{C}->term_docs($fno, $max_t);
976							}
977							# FIXME: what should TERM_DOCS contain if count is 0?
978	0						return ($terms_union, $terms);
979							}
980
981							sub _flush_cache {
982	0			0			my $self = shift;
983
984	0						my @delete = qw(result_vector
985							result_mask
986							valid_mask
987							mask
988							mask_fetch_list
989							mask_vector
990							terms
991							f_qt
992							f_t
993							term_docs
994							term_pos);
995
996	0						delete @$self{map { uc $_ } @delete};
	0
997
998	0						$self->{STOPLISTED_QUERY} = [];
999							# check to see if documents have been added since we last called new()
1000	0						my $new_max_indexed_id = $self->fetch_max_indexed_id;
1001	0	0	0				if (($new_max_indexed_id != $self->{MAX_INDEXED_ID})
1002							\|\| ($self->{SEARCH_COUNT} > SEARCH_CACHE_FLUSH_INTERVAL)) {
1003							# flush things that stick around
1004	0						$self->max_indexed_id($new_max_indexed_id);
1005	0						$self->{C}->max_indexed_id($new_max_indexed_id);
1006	0						delete($self->{ALL_DOCS_VECTOR});
1007	0						delete($self->{W_D});
1008	0						delete($self->{AVG_W_D});
1009	0						$self->{SEARCH_COUNT} = 0;
1010							}
1011							}
1012
1013							sub highlight {
1014	0			0	0		return $_[0]->{HIGHLIGHT};
1015							}
1016
1017							sub html_highlight {
1018	0			0	0		my $self = shift;
1019	0						my $field = shift;
1020
1021	0						my $fno = $self->{FIELD_NO}->{$field};
1022
1023	0						my @terms = @{$self->{QUERY_HIGHLIGHT}->[$fno]};
	0
1024	0						push (@terms, @{$self->{QUERY_PHRASES}->[$fno]});
	0
1025
1026	0						return (\@terms, $self->{QUERY_WILDCARDS}->[$fno]);
1027							}
1028
1029							sub initialize {
1030	0			0	1		my $self = shift;
1031
1032	0						$self->{MAX_INDEXED_ID} = 0;
1033
1034	0	0					if ($self->_collection_table_exists) {
1035	0	0	0				if ($self->_collection_table_upgrade_required \|\|
1036							$self->collection_count < 1)
1037							{
1038	0						$self->upgrade_collection_table;
1039							}
1040							} else {
1041	0						$self->_create_collection_table;
1042							}
1043	0						$self->_create_tables;
1044	0						$self->_delete_collection_info;
1045	0						$self->_store_collection_info;
1046
1047	0						return $self;
1048							}
1049
1050							# FIXME: probably breaks if max_indexed_id has been removed. Test.
1051							sub last_indexed_key {
1052	0			0	1		my $self = shift;
1053	0						my $doc_keys = $self->{DB}->fetch_doc_keys([ $self->{MAX_INDEXED_ID} ]);
1054
1055	0	0					if (ref $doc_keys) {
1056	0						return $doc_keys->[0];
1057							} else {
1058	0						return undef;
1059							}
1060							}
1061
1062							sub indexed {
1063	0			0	1		my $self = shift;
1064	0						my $doc_key = shift;
1065
1066	0						my $doc_ids = $self->{DB}->fetch_doc_ids([$doc_key]);
1067
1068	0	0					if (ref $doc_ids) {
1069	0						return $doc_ids->[0];
1070							} else {
1071	0						return 0;
1072							}
1073							}
1074
1075							sub max_indexed_id {
1076	0			0	0		my $self = shift;
1077	0						my $max_indexed_id = shift;
1078
1079	0	0					if (defined $max_indexed_id) {
1080	0						$self->_update_collection_info('max_indexed_id', $max_indexed_id);
1081	0						$self->{C}->max_indexed_id($max_indexed_id);
1082	0						return $self->{MAX_INDEXED_ID};
1083							} else {
1084	0						return $self->{MAX_INDEXED_ID};
1085							}
1086							}
1087
1088							sub fetch_max_indexed_id {
1089	0			0	0		my $self = shift;
1090	0						my ($max_indexed_id) = $self->{INDEX_DBH}->selectrow_array(
1091							$self->{DB}->fetch_max_indexed_id,
1092							undef, $self->{COLLECTION} );
1093	0						return $max_indexed_id;
1094							}
1095
1096							sub delete {
1097
1098	0			0	1		my $self = shift;
1099
1100	0	0					_log("Deleting $self->{COLLECTION} from collection table\n") if $PA;
1101	0						$self->_delete_collection_info;
1102
1103	0	0					_log("Dropping mask table ($self->{MASK_TABLE})\n") if $PA;
1104	0						$self->{DB}->drop_table($self->{MASK_TABLE});
1105
1106	0	0					_log("Dropping docweights table ($self->{DOCWEIGHTS_TABLE})\n") if $PA;
1107	0						$self->{DB}->drop_table($self->{DOCWEIGHTS_TABLE});
1108
1109	0	0					_log("Dropping docs vector table ($self->{ALL_DOCS_VECTOR_TABLE})\n")
1110							if $PA;
1111	0						$self->{DB}->drop_table($self->{ALL_DOCS_VECTOR_TABLE});
1112
1113	0	0					_log("Dropping delete queue table ($self->{DELETE_QUEUE_TABLE})\n")
1114							if $PA;
1115	0						$self->{DB}->drop_table($self->{DELETE_QUEUE_TABLE});
1116
1117	0	0					_log("Dropping doc key table ($self->{DOC_KEY_TABLE})\n") if $PA;
1118	0						$self->{DB}->drop_doc_key_table();
1119
1120	0						foreach my $table ( @{$self->{INVERTED_TABLES}} ) {
	0
1121	0	0					_log("Dropping inverted table ($table)\n") if $PA;
1122	0						$self->{DB}->drop_table($table);
1123							}
1124							}
1125
1126							sub _collection_table_exists {
1127	0			0			my $self = shift;
1128	0						return $self->{DB}->table_exists(COLLECTION_TABLE);
1129							}
1130
1131							sub _create_collection_table {
1132	0			0			my $self = shift;
1133	0						my $sql = $self->{DB}->create_collection_table;
1134	0						$self->{INDEX_DBH}->do($sql);
1135	0	0					_log("Creating collection table (" . COLLECTION_TABLE . ")\n") if $PA;
1136							}
1137
1138							sub collection_count {
1139	0			0	0		my $self = shift;
1140	0						my $collection_count = $self->{INDEX_DBH}->selectrow_array(
1141							$self->{DB}->collection_count );
1142	0	0					croak $DBI::errstr if $DBI::errstr;
1143	0						return $collection_count;
1144							}
1145
1146							sub _collection_table_upgrade_required {
1147	0			0			my $self = shift;
1148	0						my $version = 0;
1149	0	0					_log("Checking if collection table upgrade required ...\n") if $PA > 1;
1150	0	0					unless ($self->collection_count) {
1151	0	0					_log("... Collection table contains no rows\n") if $PA > 1;
1152	0						return 0;
1153							}
1154	0						eval {
1155	0						$version = $self->{INDEX_DBH}->selectrow_array(
1156							$self->{DB}->fetch_collection_version );
1157	0	0					die $DBI::errstr if $DBI::errstr;
1158							};
1159	0	0					if ($@) {
1160	0	0					_log("... Problem fetching version column, must upgrade\n") if $PA > 1;
1161	0						return 1;
1162							}
1163	0	0	0				if ($version && ($version < LAST_COLLECTION_TABLE_UPGRADE)) {
1164	0	0					_log("... Collection table version too low, must upgrade\n")
1165							if $PA > 1;
1166	0						return 1;
1167							}
1168	0	0					_log("... Collection table up-to-date\n") if $PA > 1;
1169	0						return 0;
1170							}
1171
1172							sub upgrade_collection_table {
1173	0			0	1		my $self = shift;
1174	0						my $sth = $self->{INDEX_DBH}->prepare($self->{DB}->fetch_all_collection_rows);
1175	0						$sth->execute;
1176	0	0					croak $sth->errstr if $sth->errstr;
1177	0	0					if ($sth->rows < 1) {
1178	0	0					_log("No rows in collection table, dropping collection table ("
1179							. COLLECTION_TABLE . ")\n") if $PA;
1180	0						$self->{DB}->drop_table(COLLECTION_TABLE);
1181	0						$self->_create_collection_table;
1182	0						return 1;
1183							}
1184	0						my @table;
1185	0						while (my $row = $sth->fetchrow_hashref) {
1186	0						push @table, $row;
1187							}
1188
1189	0	0					_log("Upgrading collection table ...\n") if $PA;
1190	0	0					_log("... Dropping old collection table ...\n") if $PA;
1191	0						$self->{DB}->drop_table(COLLECTION_TABLE);
1192	0	0					_log("... Recreating collection table ...\n") if $PA;
1193	0						$self->_create_collection_table;
1194
1195	0						foreach my $old_row (@table) {
1196	0						my %new_row;
1197	0						foreach my $field (@COLLECTION_FIELDS) {
1198	0	0					$new_row{$field} = exists $old_row->{$field} ?
1199							$old_row->{$field} : $COLLECTION_FIELD_DEFAULT{$field};
1200	0						$new_row{version} = $COLLECTION_FIELD_DEFAULT{version};
1201							}
1202							# 'czech_language', 'language' options replaced with 'charset'
1203	0	0					if (exists $old_row->{czech_language}) {
1204	0	0					$new_row{charset} = 'iso-8859-2' if $old_row->{czech_language};
1205							}
1206	0	0					if (exists $old_row->{language}) {
1207	0	0					if ($old_row->{language} eq 'cz') {
1208	0						$new_row{charset} = 'iso-8859-2';
1209							} else {
1210	0						$new_row{charset} = $COLLECTION_FIELD_DEFAULT{charset}
1211							}
1212							}
1213	0	0					if (exists $old_row->{document_table}) {
1214	0						$new_row{doc_table} = $old_row->{document_table};
1215							}
1216	0	0					if (exists $old_row->{document_id_field}) {
1217	0						$new_row{doc_id_field} = $old_row->{document_id_field};
1218							}
1219	0	0					if (exists $old_row->{document_fields}) {
1220	0						$new_row{doc_fields} = $old_row->{document_fields};
1221							}
1222
1223	0	0					_log("... Inserting collection ($new_row{collection})\n") if $PA;
1224	0						$self->{DB}->insert_collection_table_row(\%new_row)
1225							}
1226	0						return 1;
1227							}
1228
1229							sub _update_collection_info {
1230	0			0			my $self = shift;
1231	0						my ($field, $value) = @_;
1232
1233	0						my $attribute = $field;
1234	0						$attribute =~ tr/[a-z]/[A-Z]/;
1235	0						my $sql = $self->{DB}->update_collection_info($field);
1236	0						$self->{INDEX_DBH}->do($sql, undef, $value, $self->{COLLECTION});
1237	0						$self->{$attribute} = $value;
1238							}
1239
1240							sub _delete_collection_info {
1241	0			0			my $self = shift;
1242
1243	0						my $sql = $self->{DB}->delete_collection_info;
1244	0						$self->{INDEX_DBH}->do($sql, undef, $self->{COLLECTION});
1245	0	0					_log("Deleting collection $self->{COLLECTION} from collection table\n")
1246							if $PA;
1247							}
1248
1249							sub _store_collection_info {
1250
1251	0			0			my $self = shift;
1252
1253	0	0					_log(qq(Inserting collection $self->{COLLECTION} into collection table\n))
1254							if $PA;
1255
1256	0						my $sql = $self->{DB}->store_collection_info;
1257	0						my $doc_fields = join (',', @{$self->{DOC_FIELDS}});
	0
1258	0						my $stoplists = ref $self->{STOPLIST} ?
1259	0	0					join (',', @{$self->{STOPLIST}}) : '';
1260
1261	0						my $version = $DBIx::TextIndex::VERSION;
1262
1263	0	0					if ($version =~ m/(\d+)\.(\d+)\.(\d+)/) {
1264	0						$version = "$1.$2$3" + 0;
1265							}
1266
1267	0	0					$self->{INDEX_DBH}->do($sql, undef,
1268
1269							$self->{COLLECTION},
1270							$version,
1271							$self->{MAX_INDEXED_ID},
1272							$self->{DOC_TABLE},
1273							$self->{DOC_ID_FIELD},
1274
1275							$doc_fields,
1276							$self->{CHARSET},
1277							$stoplists,
1278							$self->{PROXIMITY_INDEX},
1279
1280							$ERROR{empty_query},
1281							$ERROR{quote_count},
1282							$ERROR{no_results},
1283							$ERROR{no_results_stop},
1284							$ERROR{wildcard_length},
1285							$ERROR{wildcard_expansion},
1286
1287							$self->{MAX_WORD_LENGTH},
1288							$self->{RESULT_THRESHOLD},
1289							$self->{PHRASE_THRESHOLD},
1290							$self->{MIN_WILDCARD_LENGTH},
1291							$self->{MAX_WILDCARD_TERM_EXPANSION},
1292
1293							$self->{DECODE_HTML_ENTITIES},
1294							$self->{SCORING_METHOD},
1295							$self->{UPDATE_COMMIT_INTERVAL},
1296							) \|\| croak $DBI::errstr;
1297
1298							}
1299
1300							sub _fetch_collection_info {
1301
1302	0			0			my $self = shift;
1303
1304	0	0					return 0 unless $self->{COLLECTION};
1305
1306	0	0					return 0 unless $self->_collection_table_exists;
1307
1308	0	0					if ($self->_collection_table_upgrade_required) {
1309	0						carp __PACKAGE__ . ": Collection table must be upgraded, call \$index->upgrade_collection_table() or create a new() \$index and call \$index->initialize() to upgrade the collection table";
1310	0						return 0;
1311							}
1312
1313	0						my $sql = $self->{DB}->fetch_collection_info;
1314
1315	0						my $sth = $self->{INDEX_DBH}->prepare($sql);
1316
1317	0						$sth->execute($self->{COLLECTION});
1318
1319	0						my $doc_fields = '';
1320	0						my $stoplists = '';
1321
1322	0						my $collection;
1323	0						$sth->bind_columns(\(
1324							$collection,
1325							$self->{VERSION},
1326							$self->{MAX_INDEXED_ID},
1327							$self->{DOC_TABLE},
1328							$self->{DOC_ID_FIELD},
1329
1330							$doc_fields,
1331							$self->{CHARSET},
1332							$stoplists,
1333							$self->{PROXIMITY_INDEX},
1334
1335							$ERROR{empty_query},
1336							$ERROR{quote_count},
1337							$ERROR{no_results},
1338							$ERROR{no_results_stop},
1339							$ERROR{wildcard_length},
1340							$ERROR{wildcard_expansion},
1341
1342							$self->{MAX_WORD_LENGTH},
1343							$self->{RESULT_THRESHOLD},
1344							$self->{PHRASE_THRESHOLD},
1345							$self->{MIN_WILDCARD_LENGTH},
1346							$self->{MAX_WILDCARD_TERM_EXPANSION},
1347
1348							$self->{DECODE_HTML_ENTITIES},
1349							$self->{SCORING_METHOD},
1350							$self->{UPDATE_COMMIT_INTERVAL},
1351							));
1352
1353	0						$sth->fetch;
1354	0						$sth->finish;
1355
1356	0						my @doc_fields = split(/,/, $doc_fields);
1357	0						my @stoplists = split (/,\s*/, $stoplists);
1358
1359	0						$self->{DOC_FIELDS} = \@doc_fields;
1360	0						$self->{STOPLIST} = \@stoplists;
1361
1362	0		0				$self->{CHARSET} = $self->{CHARSET} \|\| $COLLECTION_FIELD_DEFAULT{charset};
1363	0	0					$self->{CZECH_LANGUAGE} = $self->{CHARSET} eq 'iso-8859-2' ? 1 : 0;
1364
1365	0	0					return $collection ? 1 : 0;
1366
1367							}
1368
1369							sub _phrase_fullscan {
1370	0			0			my $self = shift;
1371	0						my $docref = shift;
1372	0						my $fno = shift;
1373	0						my $phrase = shift;
1374
1375	0						my @docs = @{$docref};
	0
1376	0						my $docs = join(',', @docs);
1377	0						my @found;
1378
1379	0	0					my $sql = $self->{CZECH_LANGUAGE} ?
1380							$self->{DB}->phrase_scan_cz($docs, $fno) :
1381							$self->{DB}->phrase_scan($docs, $fno);
1382
1383	0						my $sth = $self->{DOC_DBH}->prepare($sql);
1384
1385	0	0					if ($self->{CZECH_LANGUAGE}) {
1386	0						$sth->execute;
1387							} else {
1388	0						$sth->execute("%$phrase%");
1389							}
1390
1391	0						my ($doc_id, $content);
1392	0	0					if ($self->{CZECH_LANGUAGE}) {
1393	0						$sth->bind_columns(\$doc_id, \$content);
1394							} else {
1395	0						$sth->bind_columns(\$doc_id);
1396							}
1397
1398							# FIXME: this now works on doc_keys, not ids
1399							# FIXME: come up with unit tests for indexes without proximity_index
1400
1401	0						while($sth->fetch) {
1402	0	0					if ($self->{CZECH_LANGUAGE}) {
1403	0						$content = $self->_lc_and_unac($content);
1404	0	0					push(@found, $doc_id) if (index($content, $phrase) != -1);
1405	0	0					_log("content scan for $doc_id, phrase = $phrase\n")
1406							if $PA > 1;
1407							} else {
1408	0						push(@found, $doc_id);
1409							}
1410							}
1411
1412	0						return \@found;
1413							}
1414
1415							sub _fetch_docweights {
1416	0			0			my $self = shift;
1417	0						my $all_fields = shift;
1418
1419	0						my @fnos;
1420	0	0					if ($all_fields) {
1421	0						@fnos = (0 .. $#{$self->{DOC_FIELDS}});
	0
1422							} else {
1423							# skip over if we already have hash entry
1424	0						foreach my $fno (@{$self->{TERM_FIELD_NOS}}) {
	0
1425	0	0					unless (ref $self->{W_D}->[$fno]) {
1426	0						push @fnos, $fno;
1427							}
1428							}
1429							}
1430
1431	0	0					if ($#fnos > -1) {
1432	0						my $fnos = join(',', @fnos);
1433
1434	0						my $sql = $self->{DB}->fetch_docweights($fnos);
1435
1436	0						my $sth = $self->{INDEX_DBH}->prepare($sql);
1437
1438	0	0					$sth->execute \|\| warn $DBI::errstr;
1439
1440	0						while (my $row = $sth->fetchrow_arrayref) {
1441	0						$self->{AVG_W_D}->[$row->[0]] = $row->[1];
1442							# Ugly, DBD::SQLite doesn't quote \0 when using placeholders
1443	0	0					if ($self->{DBD_TYPE} eq 'SQLite') {
1444	0						my $packed_w_d = $row->[2];
1445	0						$packed_w_d =~ s/\\0/\0/g;
1446	0						$packed_w_d =~ s/\\\\/\\/g;
1447	0						$self->{W_D}->[$row->[0]] = [ unpack('f*', $packed_w_d) ];
1448							} else {
1449	0						$self->{W_D}->[$row->[0]] = [ unpack('f*', $row->[2]) ];
1450							}
1451							}
1452							}
1453							}
1454
1455							sub _search_okapi {
1456
1457	8			8		112	no warnings qw(uninitialized);
	8					18
	8					17906
1458
1459	0			0			my $self = shift;
1460
1461	0						my %score; # accumulator to hold doc scores
1462
1463	0						my $b = 0.75; # $b, $k1, $k3 are parameters for Okapi
1464	0						my $k1 = 1.2; # BM25 algorithm
1465	0						my $k3 = 7; #
1466	0						my $f_qt; # frequency of term in query
1467							my $f_t; # Number of documents that contain term
1468	0						my $W_d; # weight of document, sqrt((1 + log(terms))**2)
1469	0						my $avg_W_d; # average document weight in collection
1470	0						my $doc_id; # document id
1471	0						my $f_dt; # frequency of term in given doc_id
1472	0						my $idf = 0;
1473	0						my $fno = 0;
1474
1475	0						my $acc_size = 0; # current number of keys in %score
1476
1477							# FIXME: use actual document count
1478	0						my $N = $self->{MAX_INDEXED_ID};
1479
1480	0						$self->_fetch_docweights;
1481
1482	0						my $result_max = $self->{RESULT_VECTOR}->Max;
1483	0						my $result_min = $self->{RESULT_VECTOR}->Min;
1484
1485	0	0					if ($result_max < 1) {
1486	0	0					if (not @{$self->{STOPLISTED_QUERY}}) {
	0
1487	0						throw_query( error => $ERROR{no_results} );
1488							}
1489							else {
1490	0						throw_query( error => $self->_format_stoplisted_error );
1491							}
1492							}
1493
1494	0						foreach my $fno ( @{$self->{TERM_FIELD_NOS}} ) {
	0
1495	0						$avg_W_d = $self->{AVG_W_D}->[$fno];
1496	0						foreach my $term (@{$self->{TERMS}->[$fno]}) {
	0
1497	0		0				$f_t = $self->{F_T}->[$fno]->{$term} \|\|
1498							$self->{C}->f_t($fno, $term);
1499	0						$idf = log(($N - $f_t + 0.5) / ($f_t + 0.5));
1500	0	0					next if $idf < IDF_MIN_OKAPI; # FIXME: do we want do warn that term was stoplisted?
1501	0						$f_qt = $self->{F_QT}->[$fno]->{$term}; # freq of term in query
1502	0						my $w_qt = (($k3 + 1) * $f_qt) / ($k3 + $f_qt); # query term weight
1503	0		0				my $term_docs = $self->{TERM_DOCS}->[$fno]->{$term} \|\|
1504							$self->{C}->term_docs($fno, $term);
1505	0						score_term_docs_okapi($term_docs, \%score, $self->{RESULT_VECTOR}, ACCUMULATOR_LIMIT, $result_min, $result_max, $idf, $f_t, $self->{W_D}->[$fno], $avg_W_d, $w_qt, $k1, $b);
1506							}
1507							}
1508
1509	0	0					unless (scalar keys %score) {
1510	0	0					if (not @{$self->{STOPLISTED_QUERY}}) {
	0
1511	0						throw_query( error => $ERROR{no_results} );
1512							}
1513							else {
1514	0						throw_query( error => $self->_format_stoplisted_error );
1515							}
1516							}
1517	0						return $self->_doc_ids_to_keys(\%score);
1518							}
1519
1520							sub _doc_ids_to_keys {
1521	0			0			my $self = shift;
1522	0						my $score = shift;
1523	0						my %copy = %$score;
1524	0						my @doc_ids = sort { $a <=> $b } keys %$score;
	0
1525	0						my $doc_keys = $self->{DB}->fetch_doc_keys(\@doc_ids);
1526	0						my %score_by_keys;
1527	0						@score_by_keys{@$doc_keys} = @$score{@doc_ids};
1528	0						return \%score_by_keys;
1529							}
1530
1531							sub _format_stoplisted_error {
1532	0			0			my $self = shift;
1533	0						my $stopped = join(', ', @{$self->{STOPLISTED_QUERY}});
	0
1534	0						return qq($ERROR{no_results_stop} $stopped.);
1535							}
1536
1537							######################################################################
1538							#
1539							# _optimize_or_search()
1540							#
1541							# If query contains large number of OR terms,
1542							# turn the rarest terms into AND terms to reduce result set size
1543							# before scoring.
1544							#
1545							# Algorithm: if there are four or less query terms turn the two
1546							# least frequent OR terms into AND terms. For five or more query
1547							# terms, make the three least frequent OR terms into AND terms.
1548							#
1549							# Does nothing if AND or NOT terms already exist
1550							#
1551
1552							sub _optimize_or_search {
1553	0			0			my $self = shift;
1554	0						foreach my $fno ( @{$self->{QUERY_FIELD_NOS}} ) {
	0
1555
1556	0						my @clauses = @{$self->{QUERY}->[$fno]};
	0
1557
1558	0						my %f_t;
1559							my @or_clauses;
1560	0						my $or_term_count = 0;
1561	0						foreach my $clause (@clauses) {
1562	0	0					return if exists $clause->{CONJ}; # user explicitly asked
1563	0	0	0				return if ($clause->{MODIFIER} eq 'NOT' # for boolean query
1564							\|\| $clause->{MODIFIER} eq 'AND');
1565	0	0	0				if ($clause->{TYPE} eq 'TERM'
		0	0
			0
1566							\|\| $clause->{TYPE} eq 'PLURAL'
1567							\|\| $clause->{TYPE} eq 'WILD') {
1568
1569	0	0					if ($clause->{MODIFIER} eq 'OR') {
1570	0						$or_term_count++;
1571	0						my $term = $clause->{TERM};
1572	0		0				$f_t{$term} = $self->{C}->f_t($fno, $term) \|\| 0;
1573	0						push @or_clauses, $clause;
1574							}
1575							} elsif ($clause->{TYPE} eq 'IMPLICITPHRASE'
1576							\|\| $clause->{TYPE} eq 'PHRASE') {
1577	0	0					if ($clause->{MODIFIER} eq 'OR') {
1578	0						$clause->{MODIFIER} = 'AND';
1579							}
1580							} else {
1581	0						return;
1582							}
1583							}
1584	0	0					return if $or_term_count < 1;
1585
1586							# sort in order of f_t
1587	0						my @f_t_sorted =
1588	0						sort { $f_t{$a->{TERM}} <=> $f_t{$b->{TERM}} } @or_clauses;
1589
1590	0	0					if ($or_term_count >= 1) {
1591	0						$f_t_sorted[0]->{MODIFIER} = 'AND';
1592							}
1593	0	0					if ($or_term_count >= 2) {
1594	0						$f_t_sorted[1]->{MODIFIER} = 'AND';
1595							}
1596	0	0					if ($or_term_count > 4) {
1597	0						$f_t_sorted[2]->{MODIFIER} = 'AND';
1598							}
1599							}
1600							}
1601
1602							sub _resolve_mask {
1603
1604	0			0			my $self = shift;
1605
1606	0	0					return unless $self->{MASK};
1607
1608	0						$self->{RESULT_MASK} = Bit::Vector->new($self->{MAX_INDEXED_ID} + 1);
1609	0						$self->{RESULT_MASK}->Fill;
1610
1611	0	0					if ($self->_fetch_mask) {
1612	0						$self->{VALID_MASK} = 1;
1613							}
1614	0	0					if ($self->{MASK}->{and_mask}) {
1615	0						foreach my $mask (@{$self->{MASK}->{and_mask}}) {
	0
1616	0	0					unless (ref $mask) {
1617	0	0					next unless ref $self->{MASK_VECTOR}->{$mask};
1618	0						$self->{RESULT_MASK}->Intersection(
1619							$self->{RESULT_MASK}, $self->{MASK_VECTOR}->{$mask});
1620							} else {
1621	0						my $vector = Bit::Vector->new($self->{MAX_INDEXED_ID} + 1);
1622	0						$vector->Index_List_Store(@$mask);
1623	0						$self->{RESULT_MASK}->Intersection(
1624							$self->{RESULT_MASK}, $vector);
1625							}
1626							}
1627							}
1628	0	0					if ($self->{MASK}->{not_mask}) {
1629	0						foreach my $mask (@{$self->{MASK}->{not_mask}}) {
	0
1630	0	0					unless (ref $mask) {
1631	0	0					next unless ref $self->{MASK_VECTOR}->{$mask};
1632	0						$self->{MASK_VECTOR}->{$mask}->Flip;
1633	0						$self->{RESULT_MASK}->Intersection(
1634							$self->{RESULT_MASK}, $self->{MASK_VECTOR}->{$mask});
1635							} else {
1636	0						my $vector = Bit::Vector->new($self->{MAX_INDEXED_ID} + 1);
1637	0						$vector->Index_List_Store(@$mask);
1638	0						$vector->Flip;
1639	0						$self->{RESULT_MASK}->Intersection(
1640							$self->{RESULT_MASK}, $vector);
1641							}
1642							}
1643							}
1644	0	0					if ($self->{MASK}->{or_mask}) {
1645	0						push @{$self->{MASK}->{or_mask_set}}, $self->{MASK}->{or_mask};
	0
1646							}
1647	0	0					if ($self->{MASK}->{or_mask_set}) {
1648	0						foreach my $mask_set (@{$self->{MASK}->{or_mask_set}}) {
	0
1649	0						my $or_mask_count = 0;
1650	0						my $union_vector = Bit::Vector->new($self->{MAX_INDEXED_ID} + 1);
1651	0						foreach my $mask (@$mask_set) {
1652	0	0					unless (ref $mask) {
1653	0	0					next unless ref $self->{MASK_VECTOR}->{$mask};
1654	0						$or_mask_count++;
1655	0						$union_vector->Union(
1656							$union_vector, $self->{MASK_VECTOR}->{$mask});
1657							} else {
1658	0						$or_mask_count++;
1659	0						my $vector = Bit::Vector->new($self->{MAX_INDEXED_ID} + 1);
1660	0						$vector->Index_List_Store(@$mask);
1661	0						$union_vector->Union(
1662							$union_vector, $self->{MASK_VECTOR}->{$mask});
1663							}
1664							}
1665	0	0					if ($or_mask_count) {
1666	0						$self->{RESULT_MASK}->Intersection(
1667							$self->{RESULT_MASK}, $union_vector);
1668							}
1669							}
1670							}
1671							}
1672
1673							sub _fetch_mask {
1674	0			0			my $self = shift;
1675
1676	0						my $sql = $self->{DB}->fetch_mask;
1677	0						my $sth = $self->{INDEX_DBH}->prepare($sql);
1678
1679	0						my $mask_count = 0;
1680	0						my $i = 0;
1681
1682	0						foreach my $mask (@{$self->{MASK_FETCH_LIST}}) {
	0
1683	0	0					if (ref ($self->{MASK_VECTOR}->{$mask})) {
1684							# We already have one, go ahead
1685	0						$mask_count++;
1686	0						next;
1687							}
1688
1689	0						$sth->execute($mask);
1690
1691	0	0					next if $sth->rows < 1;
1692	0						$mask_count += $sth->rows;
1693
1694	0						my $docs_vector;
1695	0						$sth->bind_col(1, \$docs_vector);
1696	0						$sth->fetch;
1697
1698	0						$self->{MASK_VECTOR}->{$mask} =
1699							Bit::Vector->new_Enum(($self->{MAX_INDEXED_ID} + 1), $docs_vector);
1700
1701	0						$i++;
1702
1703							}
1704	0						return $mask_count;
1705							}
1706
1707							# Set everything to lowercase and change accented characters to
1708							# unaccented equivalents
1709							sub _lc_and_unac {
1710	0			0			my $self = shift;
1711	0						my $s = shift;
1712	0						$s = unac_string($self->{CHARSET}, $s) if DO_UNAC;
1713	0						$s = lc($s);
1714	0						return $s;
1715							}
1716
1717							sub _docs {
1718	0			0			my $self = shift;
1719	0						my $fno = shift;
1720	0						my $term = shift;
1721
1722	0						local $^W = 0; # turn off uninitialized value warning
1723	0	0					if (@_) {
1724	0						$self->{TERM_DOCS_VINT}->[$fno]->{$term} .= pack 'w*', @_;
1725	0						$self->{DOCFREQ_T}->[$fno]->{$term}++;
1726							} else {
1727	0						$self->{C}->term_docs_hashref($fno, $term);
1728							}
1729							}
1730
1731							sub _positions {
1732	0			0			my $self = shift;
1733	0						my $fno = shift;
1734	0						my $term = shift;
1735	0	0					if (@_) {
1736	0						my $positions = shift;
1737	0						$self->{TERM_POS}->[$fno]->{$term} .=
1738							pack_vint_delta($positions);
1739							}
1740							}
1741
1742							sub _commit_docs {
1743	0			0			my $self = shift;
1744
1745	0		0				my $added_ids = shift \|\| $self->{ADDED_IDS};
1746
1747	0						my $id_a = $self->max_indexed_id + 1; # old max_indexed_id
1748	0						$self->max_indexed_id($added_ids->[-1]);
1749	0						$self->all_doc_ids($added_ids);
1750
1751	0						my ($sql, $sth);
1752	0						my $id_b = $self->{MAX_INDEXED_ID};
1753
1754	0	0					_log("Storing doc weights\n") if $PA;
1755
1756	0						$self->_fetch_docweights(1);
1757
1758	0						$self->{INDEX_DBH}->begin_work;
1759
1760	0						$sth = $self->{INDEX_DBH}->prepare($self->{DB}->update_docweights);
1761
1762	8			8		60	no warnings qw(uninitialized);
	8					16
	8					11172
1763	0						foreach my $fno ( 0 .. $#{$self->{DOC_FIELDS}} ) {
	0
1764	0						my @w_d;
1765	0	0					if ($#{$self->{W_D}->[$fno]} >= 0) {
	0
1766	0						@w_d = @{$self->{W_D}->[$fno]};
	0
1767	0						@w_d[$id_a .. $id_b] =
1768	0						@{$self->{NEW_W_D}->[$fno]}[$id_a .. $id_b];
1769							} else {
1770	0						@w_d = @{$self->{NEW_W_D}->[$fno]};
	0
1771							}
1772	0						my $sum;
1773	0						foreach (@w_d) {
1774	0						$sum += $_;
1775							}
1776							# FIXME: use actual doc count instead of max_indexed_id
1777	0						my $avg_w_d = $sum / $id_b;
1778	0	0					$w_d[0] = 0 unless defined $w_d[0];
1779							# FIXME: this takes too much space, use a float compression method
1780	0						my $packed_w_d = pack 'f*', @w_d;
1781	0						$self->{DB}->update_docweights_execute($sth, $fno, $avg_w_d, $packed_w_d);
1782							# Set AVG_W_D and W_D cached values to new value, in case same
1783							# instance is used for search immediately after adding to index
1784	0						$self->{AVG_W_D}->[$fno] = $avg_w_d;
1785	0						$self->{W_D}->[$fno] = \@w_d;
1786							}
1787
1788	0						$sth->finish;
1789
1790							# Delete temporary in-memory structure
1791	0						delete($self->{NEW_W_D});
1792
1793	0	0					_log("Committing inverted tables to database\n") if $PA;
1794
1795	0						foreach my $fno ( 0 .. $#{$self->{DOC_FIELDS}} ) {
	0
1796
1797	0	0					_log("field$fno ", scalar keys %{$self->{TERM_DOCS_VINT}->[$fno]},
	0
1798							" distinct terms\n") if $PA;
1799
1800	0						my $s_sth;
1801
1802							# SQLite chokes with "database table is locked" unless s_sth
1803							# is finished before i_sth->execute
1804	0	0					unless ($self->{DBD_TYPE} eq 'SQLite') {
1805	0						$s_sth = $self->{INDEX_DBH}->prepare(
1806							$self->{DB}->inverted_select(
1807							$self->{INVERTED_TABLES}->[$fno] ) );
1808							}
1809	0						my $i_sth = $self->{INDEX_DBH}->prepare(
1810							$self->{DB}->inverted_replace(
1811							$self->{INVERTED_TABLES}->[$fno] ) );
1812
1813	0						my $tc = 0;
1814	0						while (my ($term, $term_docs_vint) =
	0
1815							each %{$self->{TERM_DOCS_VINT}->[$fno]}) {
1816
1817	0	0					_log("$term\n") if $PA >= 2;
1818	0	0	0				if ($PA && $tc > 0) {
1819	0	0					_log("committed $tc terms\n") if $tc % 500 == 0;
1820							}
1821
1822	0						my $o_docfreq_t = 0;
1823	0						my $o_term_docs = '';
1824	0						my $o_term_pos = '';
1825
1826	0	0					$s_sth = $self->{INDEX_DBH}->prepare( $self->{DB}->inverted_select(
1827							$self->{INVERTED_TABLES}->[$fno]) )
1828							if $self->{DBD_TYPE} eq 'SQLite';
1829	0						$s_sth->execute($term);
1830	0						$s_sth->bind_columns(\$o_docfreq_t, \$o_term_docs, \$o_term_pos);
1831	0						$s_sth->fetch;
1832	0	0					$s_sth->finish if $self->{DBD_TYPE} eq 'SQLite';
1833	0						my $term_docs = pack_term_docs_append_vint($o_term_docs,
1834							$term_docs_vint);
1835
1836	0						my $term_pos = $o_term_pos . $self->{TERM_POS}->[$fno]->{$term};
1837
1838	0						$self->{DB}->inverted_replace_execute(
1839							$i_sth,
1840							$term,
1841							$self->{DOCFREQ_T}->[$fno]->{$term} + $o_docfreq_t,
1842							$term_docs,
1843							$term_pos,
1844							);
1845
1846	0						delete($self->{TERM_DOCS_VINT}->[$fno]->{$term});
1847	0						delete($self->{TERM_POS}->[$fno]->{$term});
1848	0						$tc++;
1849							}
1850	0	0					$i_sth->finish if $self->{DBD_TYPE} eq 'SQLite';
1851	0	0	0				_log("committed $tc terms\n") if $PA && $tc > 0;
1852							# Flush temporary hashes after data is stored
1853	0						delete($self->{TERM_DOCS_VINT}->[$fno]);
1854	0						delete($self->{TERM_POS}->[$fno]);
1855	0						delete($self->{DOCFREQ_T}->[$fno]);
1856							}
1857
1858	0						$self->{INDEX_DBH}->commit;
1859
1860							}
1861
1862							sub _add_to_delete_queue {
1863	0			0			my $self = shift;
1864	0						my @ids = @_;
1865	0	0					if (ref $ids[0] eq 'ARRAY') {
1866	0						@ids = @{$ids[0]};
	0
1867							}
1868
1869	0		0				my $delete_queue_enum = $self->{DB}->fetch_delete_queue \|\| "";
1870	0						my $delete_queue_vector = Bit::Vector->new_Enum($self->max_indexed_id + 1,
1871							$delete_queue_enum);
1872
1873	0						$delete_queue_vector->Index_List_Store(@ids);
1874
1875	0						$self->{DB}->update_delete_queue($delete_queue_vector->to_Enum);
1876
1877							}
1878
1879							sub _all_doc_ids_remove {
1880	0			0			my $self = shift;
1881	0						my @ids = @_;
1882							# doc_id bits to unset
1883	0	0					if (ref $ids[0] eq 'ARRAY') {
1884	0						@ids = @{$ids[0]};
	0
1885							}
1886
1887	0	0					unless (ref $self->{ALL_DOCS_VECTOR}) {
1888	0						$self->{ALL_DOCS_VECTOR} = Bit::Vector->new_Enum(
1889							$self->max_indexed_id + 1,
1890							$self->_fetch_all_docs_vector
1891							);
1892							}
1893
1894	0	0					if (@ids) {
1895	0						$self->{ALL_DOCS_VECTOR}->Index_List_Remove(@ids);
1896	0						$self->{INDEX_DBH}->do($self->{DB}->update_all_docs_vector, undef,
1897							$self->{ALL_DOCS_VECTOR}->to_Enum);
1898							}
1899
1900							}
1901
1902							sub all_doc_ids {
1903	0			0	0		my $self = shift;
1904	0						my @ids = @_;
1905
1906							# doc_id bits to set
1907	0	0					if (ref $ids[0] eq 'ARRAY') {
1908	0						@ids = @{$ids[0]};
	0
1909							}
1910	8			8		57	no warnings qw(uninitialized);
	8					15
	8					11161
1911	0	0					unless (ref $self->{ALL_DOCS_VECTOR}) {
1912	0						$self->{ALL_DOCS_VECTOR} = Bit::Vector->new_Enum(
1913							$self->max_indexed_id + 1,
1914							$self->_fetch_all_docs_vector
1915							);
1916							}
1917
1918	0	0					if (@ids) {
1919	0	0					if ($self->{ALL_DOCS_VECTOR}->Size() < $self->max_indexed_id + 1) {
1920	0						$self->{ALL_DOCS_VECTOR}->Resize($self->max_indexed_id + 1);
1921							}
1922	0						$self->{ALL_DOCS_VECTOR}->Index_List_Store(@ids);
1923	0						$self->{INDEX_DBH}->do($self->{DB}->update_all_docs_vector, undef,
1924							$self->{ALL_DOCS_VECTOR}->to_Enum);
1925							}
1926							else {
1927							# FIXME: this is probably unnecessary, but older versions
1928							# had this documented as a public method
1929	0						return $self->{ALL_DOCS_VECTOR}->Index_List_Read;
1930							}
1931							}
1932
1933							sub fetch_all_docs_vector {
1934	0			0	0		my $self = shift;
1935	0	0					unless (ref $self->{ALL_DOCS_VECTOR}) {
1936	0						$self->{ALL_DOCS_VECTOR} = Bit::Vector->new_Enum(
1937							$self->max_indexed_id + 1,
1938							$self->_fetch_all_docs_vector
1939							);
1940							}
1941							}
1942
1943							sub _fetch_all_docs_vector {
1944	0			0			my $self = shift;
1945	0						my $sql = $self->{DB}->fetch_all_docs_vector;
1946	0						return scalar $self->{INDEX_DBH}->selectrow_array($sql);
1947							}
1948
1949
1950							sub _fetch_doc {
1951	0			0			my $self = shift;
1952	0						my $id = shift;
1953	0						my $field = shift;
1954
1955	0						my $sql = $self->{DB}->fetch_doc($field);
1956	0						return scalar $self->{DOC_DBH}->selectrow_array($sql, undef, $id);
1957							}
1958
1959							sub _fetch_doc_all_fields {
1960	0			0			my $self = shift;
1961	0						my $id = shift;
1962	0						my $sql = $self->{DB}->fetch_doc_all_fields();
1963	0						my @fields = $self->{DOC_DBH}->selectrow_array($sql, undef, $id);
1964	0						my %fields;
1965	0						foreach my $i (0 .. $#fields) {
1966	0						$fields{$self->{DOC_FIELDS}->[$i]} = $fields[$i];
1967							}
1968	0						return \%fields;
1969							}
1970
1971							sub _terms {
1972	0			0			my $self = shift;
1973	0						my $doc = shift;
1974
1975							# kill tags
1976	0						$doc =~ s/<.*?>/ /g;
1977
1978							# Decode HTML entities
1979	0	0					if ($self->{DECODE_HTML_ENTITIES}) {
1980	0						$doc = HTML::Entities::decode($doc);
1981							}
1982
1983	0						$doc = $self->_lc_and_unac($doc);
1984
1985							# split words on any non-word character or on underscore
1986
1987	0						return grep {
1988	0						$_ = substr($_, 0, $self->{MAX_WORD_LENGTH});
1989	0	0					$_ =~ /[a-z0-9]+/ && not $self->_stoplisted($_)
1990							} split(/[^a-zA-Z0-9]+/, $doc);
1991							}
1992
1993							sub _ping_doc {
1994	0			0			my $self = shift;
1995	0						my $id = shift;
1996	0						my $found_doc = 0;
1997	0						my $sql = $self->{DB}->ping_doc;
1998	0						($found_doc) = $self->{DOC_DBH}->selectrow_array($sql, undef, $id);
1999	0						return $found_doc;
2000							}
2001
2002							sub _create_tables {
2003	0			0			my $self = shift;
2004	0						my ($sql, $sth);
2005
2006							# mask table
2007
2008	0	0					_log("Dropping mask table ($self->{MASK_TABLE})\n") if $PA;
2009	0						$self->{DB}->drop_table($self->{MASK_TABLE});
2010
2011	0						$sql = $self->{DB}->create_mask_table;
2012	0	0					_log("Creating mask table ($self->{MASK_TABLE})\n") if $PA;
2013	0						$self->{INDEX_DBH}->do($sql);
2014
2015							# docweights table
2016
2017	0	0					_log("Dropping docweights table ($self->{DOCWEIGHTS_TABLE})\n") if $PA;
2018	0						$self->{DB}->drop_table($self->{DOCWEIGHTS_TABLE});
2019
2020	0						$sql = $self->{DB}->create_docweights_table;
2021	0	0					_log("Creating docweights table ($self->{DOCWEIGHTS_TABLE})\n") if $PA;
2022	0						$self->{INDEX_DBH}->do($sql);
2023
2024							# docs vector table
2025
2026	0	0					_log("Dropping docs vector table ($self->{ALL_DOCS_VECTOR_TABLE})\n")
2027							if $PA;
2028	0						$self->{DB}->drop_table($self->{ALL_DOCS_VECTOR_TABLE});
2029
2030	0	0					_log("Creating docs vector table ($self->{ALL_DOCS_VECTOR_TABLE})\n")
2031							if $PA;
2032	0						$self->{INDEX_DBH}->do($self->{DB}->create_all_docs_vector_table);
2033
2034							# delete queue table
2035
2036	0	0					_log("Dropping delete queue table ($self->{DELETE_QUEUE_TABLE})\n")
2037							if $PA;
2038	0						$self->{DB}->drop_table($self->{DELETE_QUEUE_TABLE});
2039
2040	0	0					_log("Creating delete queue table ($self->{DELETE_QUEUE_TABLE})\n")
2041							if $PA;
2042	0						$self->{INDEX_DBH}->do($self->{DB}->create_delete_queue_table);
2043
2044							# doc key table
2045
2046	0	0					_log("Dropping doc key table ($self->{DOC_KEY_TABLE})\n") if $PA;
2047	0						$self->{DB}->drop_doc_key_table();
2048	0	0					_log("Creating doc key table ($self->{DOC_KEY_TABLE})\n") if $PA;
2049	0						$self->{INDEX_DBH}->do($self->{DB}->create_doc_key_table);
2050
2051							# inverted tables
2052
2053	0						foreach my $table ( @{$self->{INVERTED_TABLES}} ) {
	0
2054	0	0					_log("Dropping inverted table ($table)\n") if $PA;
2055	0						$self->{DB}->drop_table($table);
2056
2057	0						$sql = $self->{DB}->create_inverted_table($table);
2058	0	0					_log("Creating inverted table ($table)\n") if $PA;
2059	0						$self->{INDEX_DBH}->do($sql);
2060							}
2061							}
2062
2063							sub _stoplisted {
2064	0			0			my $self = shift;
2065	0						my $term = shift;
2066
2067	0	0	0				if ($self->{STOPLIST} and $self->{STOPLISTED_WORDS}->{$term}) {
2068	0						push(@{$self->{STOPLISTED_QUERY}}, $term);
	0
2069	0	0					_log(" stoplisting: $term\n") if $PA > 1;
2070	0						return 1;
2071							} else {
2072	0						return 0;
2073							}
2074							}
2075
2076							sub create_accessors {
2077	0			0	0		my $fields = shift;
2078	0						my $pkg = caller();
2079	8			8		63	no strict 'refs';
	8					20
	8					3558
2080	0						foreach my $field (@$fields) {
2081	0						*{"${pkg}::$field"} = sub {
2082	0			0			my $self = shift;
2083	0	0					$self->set({ $field => shift }) if @_;
2084	0						return $self->{$field};
2085							}
2086	0						}
2087							}
2088
2089							sub get {
2090	0			0	0		my $self = shift;
2091	0	0					return wantarray ? @{$self}{@_} : $self->{$_[0]};
	0
2092							}
2093
2094							sub set {
2095	0			0	0		my $self = shift;
2096
2097	0	0					throw_gen({ error => 'incorrect number of args for set()' })
2098							unless @_;
2099
2100	0	0					my ($keys, $values) = @_ == 1 ? ([keys %{$_[0]}], [values %{$_[0]}]) : @_;
	0
	0
2101
2102	0						my ($key, $old_value, $new_value, $is_dirty);
2103	0						foreach my $i (0 .. $#$keys) {
2104	0						$key = $keys->[$i];
2105	0						$new_value = $values->[$i];
2106	0						$old_value = $self->{uc $key};
2107
2108	0	0	0				if ((not defined $new_value and not defined $old_value) or
			0
			0
			0
2109							(defined $new_value and defined $old_value and
2110							$old_value eq $new_value)) {
2111	0						next;
2112							}
2113	0						$is_dirty = 1;
2114	0						$self->{uc $key} = $new_value;
2115							}
2116	0						return $self;
2117							}
2118
2119							1;
2120							__END__