| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
=head1 NAME |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
MyConText - Indexing documents with MySQL as storage |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=cut |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package MyConText; |
|
9
|
11
|
|
|
11
|
|
7740
|
use strict; |
|
|
11
|
|
|
|
|
19
|
|
|
|
11
|
|
|
|
|
341
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
11
|
|
|
11
|
|
48
|
use vars qw($errstr $VERSION); |
|
|
11
|
|
|
|
|
17
|
|
|
|
11
|
|
|
|
|
1723
|
|
|
12
|
|
|
|
|
|
|
# Error message of the most recent class-level call; open and create
# reset it on entry.
$errstr = undef;
$VERSION = '0.49';

# Parameters a new index starts with. create lets the caller override
# them, open overlays whatever is stored in the parameter table.
my %DEFAULT_PARAMS = (
    'num_of_docs'       => 0,       # statistical value, should be maintained
    'word_length'       => 30,      # max length of words we index

    'protocol'          => 40,      # we only support protocol with the same numbers

    'blob_direct_fetch' => 20,      # with the blob store, when we stop searching
                                    # and fetch everything at once
    'data_table'        => undef,   # table where the actual index is stored
    'name_length'       => 255,     # for filenames or URLs, what's the max length

    'word_id_bits'      => 16,      # num of bits for word_id (column store)
    'doc_id_bits'       => 16,      # num of bits for doc_id
    'count_bits'        => 8,       # num of bits for count value
    'position_bits'     => 32,      # num of bits for word positions

    'backend'           => 'blob',  # what database backend (way the data is
                                    # stored) we use
    'frontend'          => 'none',  # what application frontend we use (how
                                    # the index behaves externally)
    'filter'            => 'map { lc $_ }',
    'splitter'          => ' $data =~ /(\w{2,$word_length})/g',
                                    # can use the $data and $word_length
                                    # variables
    'init_env'          => 'use locale'
);

# Backend name (the 'backend' parameter) => module implementing the storage.
my %backend_types = (
    'blob'   => 'MyConText::Blob',
    'column' => 'MyConText::Column',
    'phrase' => 'MyConText::Phrase',
);

# Frontend name (the 'frontend' parameter) => class the index object is
# blessed into.
my %frontend_types = (
    'none'    => 'MyConText',
    'default' => 'MyConText',
    'file'    => 'MyConText::File',
    'string'  => 'MyConText::String',
    'url'     => 'MyConText::URL',
    'table'   => 'MyConText::Table',
);

use vars qw! %BITS_TO_PACK %BITS_TO_INT %BITS_TO_PRECISION %PRECISION_TO_BITS !;

# Lookup tables keyed by a number of bits.
%BITS_TO_PACK = qw(
    0   A0
    8   C
    16  S
    32  L
);
%BITS_TO_INT = qw(
    8   tinyint
    16  smallint
    24  mediumint
    32  int
    64  bigint
);
%BITS_TO_PRECISION = qw(
    8   4
    16  6
    24  9
    32  11
);
# the inverse of %BITS_TO_PRECISION (its values are unique)
%PRECISION_TO_BITS = reverse %BITS_TO_PRECISION;
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Open reads in the information about an existing index and creates an
# object in memory.
#
# Arguments:
#   $class - class the object is blessed into before the frontend is known
#   $dbh   - either a connected DBI handle or an arrayref of DBI->connect
#            parameters
#   $TABLE - name of the parameter table (the name of the index)
#
# Returns the index object. On failure returns nothing and leaves the
# reason in $errstr. Dies when a frontend or backend module cannot be
# loaded.
sub open {
    my ($class, $dbh, $TABLE) = @_;
    $errstr = undef;

    # the $dbh is either a real dbh or a DBI->connect parameters arrayref
    my $mydbh = 0;
    if (ref $dbh eq 'ARRAY') {
        $dbh = DBI->connect(@$dbh) or
            do { $errstr = $DBI::errstr; return; };
        $mydbh = 1;
    }

    # load the parameters to the object
    my %PARAMS = %DEFAULT_PARAMS;
    my $sth = $dbh->prepare("select * from $TABLE");
    if (defined $sth) {
        $sth->{'PrintError'} = 0;
        $sth->{'RaiseError'} = 0;
    }
    # a failed prepare is reported the same way as a failed execute
    unless (defined $sth and $sth->execute) {
        # remember the error before further queries can overwrite it
        my $failure = defined $sth ? $sth->errstr : $dbh->errstr;
        if (not grep { $TABLE eq $_ }
                MyConText->list_context_indexes($dbh)) {
            $errstr = "ConText index $TABLE doesn't exist.";
        }
        else { $errstr = $failure; }
        # no object exists yet that could take care of a handle we opened
        $dbh->disconnect if $mydbh;
        return;
    }
    while (my ($param, $value) = $sth->fetchrow_array) {
        $PARAMS{$param} = $value;
    }
    my $self = bless {
        'dbh' => $dbh,
        'table' => $TABLE,
        %PARAMS,
    }, $class;
    my $data_table = $self->{'data_table'};

    # we should disconnect if we've opened the dbh here
    if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; }

    # some basic sanity check
    defined $dbh->selectrow_array("select count(*) from $data_table")
        or do { $errstr = "Table $data_table not found in the database\n"; return; };

    # load and set the application frontend
    my $front_module = $frontend_types{$PARAMS{'frontend'}};
    if (defined $front_module) {
        # the class we were called on is already loaded
        if ($front_module ne $class) {
            eval "use $front_module";
            die $@ if $@;
        }
        bless $self, $front_module;
        $self->_open_tables;
    }
    else { $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; }

    # load and set the backend (actual database access) module
    my $back_module = $backend_types{$PARAMS{'backend'}};
    if (defined $back_module) {
        eval "use $back_module";
        die $@ if $@;
        $self->{'db_backend'} = $back_module->open($self);
    }
    else { $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; }

    # finally, return the object
    return $self;
}
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Create creates tables in the database according to the options, then
# calls open to load the object to memory.
#
# Arguments:
#   $class   - class to create the index through
#   $dbh     - either a connected DBI handle or an arrayref of
#              DBI->connect parameters
#   $TABLE   - name of the parameter table (the name of the index)
#   %OPTIONS - parameters overriding the defaults (frontend, backend, ...)
#
# Returns the opened index object. On failure returns nothing and leaves
# the reason in $errstr; tables recorded in 'created_tables' are dropped
# again. Dies when a frontend or backend module cannot be loaded.
sub create {
    my ($class, $dbh, $TABLE, %OPTIONS) = @_;
    $errstr = undef;
    my $mydbh = 0;
    if (ref $dbh eq 'ARRAY') {
        $dbh = DBI->connect(@$dbh) or
            do { $errstr = $DBI::errstr; return; };
        $mydbh = 1;
    }

    my $self = bless {
        'dbh' => $dbh,
        'table' => $TABLE,
        %DEFAULT_PARAMS,
        %OPTIONS
    }, $class;

    $self->{'data_table'} = $TABLE.'_data'
        unless defined $self->{'data_table'};

    # Common failure exit: optionally drop what was created so far and
    # close the connection if it was opened here. The caller sets $errstr.
    my $give_up = sub {
        my ($drop_tables) = @_;
        $self->clean_failed_create if $drop_tables;
        $dbh->disconnect if $mydbh;
        return;
    };

    # NOTE(review): 'blob_direct_fetch' is 17 characters, one more than
    # varchar(16) holds -- confirm how the server stores that parameter.
    my $CREATE_PARAM = <<EOF;
        create table $TABLE (
            param varchar(16) binary not null,
            value varchar(255),
            primary key (param)
        )
EOF
    $dbh->do($CREATE_PARAM)
        or do { $errstr = $dbh->errstr; return $give_up->(0); };
    push @{$self->{'created_tables'}}, $TABLE;

    # load and set the frontend database structures
    my $front_module = $frontend_types{$self->{'frontend'}};
    if (defined $front_module) {
        eval "use $front_module";
        die $@ if $@;
        bless $self, $front_module;
        $errstr = $self->_create_tables;
        if (defined $errstr) { return $give_up->(1); }
    }
    else {
        $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n";
        return $give_up->(1);
    }

    # create the backend database structures
    my $back_module = $backend_types{$self->{'backend'}};
    if (defined $back_module) {
        eval "use $back_module";
        die $@ if $@;
        $errstr = $back_module->_create_tables($self);
        if (defined $errstr) { return $give_up->(1); }
    }
    else {
        $errstr = "Specified backend type `$self->{'backend'}' is unknown\n";
        return $give_up->(1);
    }

    # store every plain (non-reference) parameter of the object
    for (grep { not ref $self->{$_} } keys %$self) {
        $dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_});
    }

    my $index = $class->open($dbh, $TABLE);
    unless ($index) {
        # open has set $errstr; only the connection is ours to release
        $dbh->disconnect if $mydbh;
        return;
    }
    # open got a live handle, so it cannot know the connection was made
    # here; mark the object the same way open does for its own handles
    $index->{'disconnect_on_destroy'} = 1 if $mydbh;
    return $index;
}
|
191
|
|
|
|
|
|
|
|
|
192
|
0
|
|
|
0
|
|
|
# Hook run by create on the frontend object; create treats a defined
# return value as an error message. The base class has nothing to build.
sub _create_tables { return; }

# Hook run by open on the frontend object. The base class has nothing to
# load.
sub _open_tables { return; }
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Drops every table recorded in $self->{'created_tables'}; create calls
# this when a later step of building the index fails.
sub clean_failed_create {
    my ($self) = @_;
    my $dbh = $self->{'dbh'};
    $dbh->do("drop table $_") for @{$self->{'created_tables'}};
}
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Drops all tables of the index: every object key named 'table' or ending
# in '_table' is taken to hold a table name. Always returns 1.
sub drop {
    my ($self) = @_;
    my $dbh = $self->{'dbh'};
    my @table_keys = grep { /(^|_)table$/ } keys %$self;
    $dbh->do("drop table $self->{$_}") for @table_keys;
    return 1;
}
|
212
|
|
|
|
|
|
|
# Returns the last error message: the object's own 'errstr' entry when
# called on an instance, the package-wide $errstr when called on the class.
sub errstr {
    my ($self) = @_;
    if (ref $self) {
        return $self->{'errstr'};
    }
    return $errstr;
}
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Lists the indexes in the database: every table from 'show tables' that
# has a row with param = 'data_table' counts as an index parameter table.
# Returns the sorted table names.
sub list_context_indexes {
    my ($class, $dbh) = @_;

    # collapse duplicates in the table listing
    my %seen;
    $seen{ $_->[0] } = 1 for @{$dbh->selectall_arrayref('show tables')};

    my @indexes = grep {
        # most tables are not parameter tables, so the probe is allowed
        # to fail quietly
        local $dbh->{'PrintError'} = 0;
        local $dbh->{'RaiseError'} = 0;
        $dbh->selectrow_array("select param, value from $_
            where param = 'data_table'");
    } keys %seen;

    return sort @indexes;
}
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Indexes one document.
#
# Arguments:
#   $id   - numeric document id; nothing happens when it is undefined
#   $data - text of the document, undef is treated as an empty string
#
# An id above the highest one seen so far is stored as 'max_doc_id' in
# the parameter table and the document is handed to the backend as a new
# one, otherwise as an update. Returns what the backend's
# parse_and_index_data returns.
sub index_document {
    my ($self, $id, $data) = @_;
    return unless defined $id;

    # The names of these lexicals are kept as they are: the init_env
    # code evaluated below can see them.
    my $dbh = $self->{'dbh'};
    my $param_table = $self->{'table'};
    my $adding_doc = 0;

    my $known_max = $self->{'max_doc_id'};
    if (not defined $known_max or $id > $known_max) {
        $self->{'max_doc_id'} = $id;
        # the statement is prepared once and cached in the object
        unless (defined $self->{'update_max_doc_id_sth'}) {
            $self->{'update_max_doc_id_sth'} =
                $dbh->prepare("replace into $param_table values (?, ?)");
        }
        $self->{'update_max_doc_id_sth'}->execute('max_doc_id', $id);
        $adding_doc = 1;
    }

    my $init_env = $self->{'init_env'};    # use packages, etc.
    eval $init_env if defined $init_env;
    print STDERR "Init_env failed with $@\n" if $@;

    $data = '' unless defined $data;
    return $self->{'db_backend'}->parse_and_index_data($adding_doc,
        $id, $data);
}
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Splits a document into words with the index's 'filter' and 'splitter'
# expressions. Those are evaluated as Perl code and refer to $data and
# $word_length by name, so these two lexicals must keep their names.
# A failing expression is reported on STDERR and yields no words.
sub _split_document_words {
    my ($self, $data) = @_;
    my $word_length = $self->{'word_length'};

    use locale;
    my $filter = $self->{'filter'} . ' ' . $self->{'splitter'};
    my @found = eval $filter;
    print STDERR "Filter/splitter failed with $@\n" if $@;
    return @found;
}

# Hands the collected words to the backend: add_document for a new
# document, update_document otherwise. Returns the backend's result list.
sub _store_document_words {
    my ($backend, $adding_doc, $id, $words) = @_;
    return $adding_doc
        ? $backend->add_document($id, $words)
        : $backend->update_document($id, $words);
}

# used for backends that need a count for each of the words
#
# Arguments: the backend object (the index is in $backend->{'ctx'}), a
# flag telling whether the document is new, the document id and its text.
# The backend receives a hash of word => number of occurrences. Returns
# the backend's result list, or its first element in scalar context.
sub parse_and_index_data_count {
    my ($backend, $adding_doc, $id, $data) = @_;
    ## note that this is run with backend object
    my $self = $backend->{'ctx'};

    my %words;
    $words{$_}++ for _split_document_words($self, $data);

    my @result = _store_document_words($backend, $adding_doc, $id, \%words);
    return @result if wantarray;
    return $result[0];
}

# used for backends where list of occurrences is needed
#
# Same arguments and return value as parse_and_index_data_count, but the
# backend receives a hash of word => list of positions, counted from 1.
sub parse_and_index_data_list {
    my ($backend, $adding_doc, $id, $data) = @_;
    ## note that this is run with backend object
    my $self = $backend->{'ctx'};

    my %words;
    my $position = 0;
    for my $word (_split_document_words($self, $data)) {
        push @{$words{$word}}, ++$position;
    }

    my @result = _store_document_words($backend, $adding_doc, $id, \%words);
    return @result if wantarray;
    return $result[0];
}
|
329
|
|
|
|
|
|
|
# Removes documents from the index; the arguments are passed on to the
# backend's delete_document and its result is returned.
sub delete_document {
    my ($self, @doc_ids) = @_;
    my $backend = $self->{'db_backend'};
    return $backend->delete_document(@doc_ids);
}
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Looks the given words up in the backend after running them through the
# index's 'filter' expression. Returns whatever the backend's
# contains_hashref returns.
sub contains_hashref {
    my $self = shift;
    # the filter is Perl code applied to what is left in @_, i.e. the words
    my $code = join ' ', $self->{'filter'}, '@_';
    my @words = eval $code;
    return $self->{'db_backend'}->contains_hashref(@words);
}
|
339
|
|
|
|
|
|
|
# Returns the documents containing the given words. When the index keeps
# counts ('count_bits' is true) the documents are ordered by descending
# count, otherwise they come in no particular order.
sub contains {
    my ($self, @words) = @_;
    my $res = $self->contains_hashref(@words);
    my @docs = keys %$res;
    return @docs unless $self->{'count_bits'};
    return sort { $res->{$b} <=> $res->{$a} } @docs;
}
|
345
|
|
|
|
|
|
|
# Extended search. Words prefixed with '+' must be present, words
# prefixed with '-' must not be, the remaining words are optional.
# Returns a hashref of document => summed value of the matching words.
sub econtains_hashref {
    my $self = shift;
    my $docs = {};

    # true as soon as any argument starts with '+'
    my $is_some_plus = grep /^\+/, @_;

    # required words: the first one seeds the result, each further one
    # intersects with it
    my $required_seen = 0;
    for my $word ( map { /^\+(.+)$/s } @_ ) {
        my $hits = $self->contains_hashref($word);
        unless ($required_seen++) {
            $docs = $hits;
            next;
        }
        for my $doc (keys %$hits) {
            $docs->{$doc} += $hits->{$doc} if defined $docs->{$doc};
        }
        for my $doc (keys %$docs) {
            delete $docs->{$doc} unless defined $hits->{$doc};
        }
    }

    # optional words: with required words around they only raise the
    # value of documents already found, otherwise they add documents too
    for my $word ( map { /^([^+-].*)$/s } @_ ) {
        my $hits = $self->contains_hashref($word);
        for my $doc (keys %$hits) {
            next if $is_some_plus and not defined $docs->{$doc};
            $docs->{$doc} = 0 unless defined $docs->{$doc};
            $docs->{$doc} += $hits->{$doc};
        }
    }

    # excluded words: remove every document that has them
    for my $word ( map { /^-(.+)$/s } @_ ) {
        my $hits = $self->contains_hashref($word);
        delete @{$docs}{ keys %$hits };
    }

    return $docs;
}
|
385
|
|
|
|
|
|
|
# Extended search returning a list of documents; see econtains_hashref
# for the '+' and '-' word prefixes. When the index keeps counts
# ('count_bits' is true) the documents are ordered by descending value,
# otherwise they come in no particular order.
sub econtains {
    my ($self, @words) = @_;
    my $res = $self->econtains_hashref(@words);
    my @docs = keys %$res;
    return @docs unless $self->{'count_bits'};
    return sort { $res->{$b} <=> $res->{$a} } @docs;
}
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
1; |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
use MyConText; |
|
397
|
|
|
|
|
|
|
use DBI; |
|
398
|
|
|
|
|
|
|
# connect to database (regular DBI) |
|
399
|
|
|
|
|
|
|
my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); |
|
400
|
|
|
|
|
|
|
# create a new index |
|
401
|
|
|
|
|
|
|
my $ctx = MyConText->create($dbh, 'ctx_web_1', |
|
402
|
|
|
|
|
|
|
'frontend' => 'string', 'backend' => 'blob'); |
|
403
|
|
|
|
|
|
|
# or open existing one |
|
404
|
|
|
|
|
|
|
# my $ctx = MyConText->open($dbh, 'ctx_web_1'); |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
# index documents |
|
407
|
|
|
|
|
|
|
$ctx->index_document('krtek', 'krtek leze pod zemi'); |
|
408
|
|
|
|
|
|
|
$ctx->index_document('jezek', 'Jezek ma ostre bodliny.'); |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# search for matches |
|
411
|
|
|
|
|
|
|
my @documents = $ctx->contains('krtek'); |
|
412
|
|
|
|
|
|
|
my @docs = $ctx->econtains('+krtek', '-Jezek'); |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
MyConText is a poor man's solution for indexing contents of documents. |
|
418
|
|
|
|
|
|
|
It uses the MySQL database to store the information about words and |
|
419
|
|
|
|
|
|
|
documents and provides Perl interface for indexing new documents, |
|
420
|
|
|
|
|
|
|
making changes and searching for matches. For MyConText, a document |
|
421
|
|
|
|
|
|
|
is nearly anything -- Perl scalar, file, Web document, database field. |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
The basic style of interface is shown above. What you need is a MySQL |
|
424
|
|
|
|
|
|
|
database and a DBI with DBD::mysql. Then you create a MyConText index |
|
425
|
|
|
|
|
|
|
-- a set of tables that maintain all necessary information. Once created |
|
426
|
|
|
|
|
|
|
it can be accessed many times, either for updating the index (adding |
|
427
|
|
|
|
|
|
|
documents) or searching. |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
MyConText uses one basic table to store parameters of the index. Second |
|
430
|
|
|
|
|
|
|
table is used to store the actual information about documents and words, |
|
431
|
|
|
|
|
|
|
and depending on the type of the index (specified during index creation) |
|
432
|
|
|
|
|
|
|
there may be more tables to store additional information (like |
|
433
|
|
|
|
|
|
|
conversion from external string names (eg. URL's) to internal numeric |
|
434
|
|
|
|
|
|
|
form). For a user, these internal thingies and internal behaviour of the |
|
435
|
|
|
|
|
|
|
index are not important. The important part is the API, the methods to |
|
436
|
|
|
|
|
|
|
index document and ask questions about words in documents. However, |
|
437
|
|
|
|
|
|
|
certain understanding of how it all works may be useful when you are |
|
438
|
|
|
|
|
|
|
deciding if this module is for you and what type of index will best |
|
439
|
|
|
|
|
|
|
suit your needs. |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=head2 Frontends |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
From the user, application point of view, the MyConText index stores |
|
444
|
|
|
|
|
|
|
documents that are named in a certain way, allows adding new documents, |
|
445
|
|
|
|
|
|
|
and provides methods to ask: "give me list of names of documents that |
|
446
|
|
|
|
|
|
|
contain this list of words". The MyConText index doesn't store the |
|
447
|
|
|
|
|
|
|
documents itself. Instead, it stores information about words in the |
|
448
|
|
|
|
|
|
|
documents in such a structured way that it makes easy and fast to look |
|
449
|
|
|
|
|
|
|
up what documents contain certain words and return names of the |
|
450
|
|
|
|
|
|
|
documents. |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
MyConText provides a couple of predefined frontend classes that specify |
|
453
|
|
|
|
|
|
|
various types of documents (and the way they relate to their names). |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=over 4 |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=item default |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
By default, user specifies the integer number of the document and the |
|
460
|
|
|
|
|
|
|
content (body) of the document. The code would for example read |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
$ctx->index_document(53, 'zastavujeme vyplaty vkladu'); |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
and MyConText will remember that the document 53 contains three words. |
|
465
|
|
|
|
|
|
|
When looking for all documents containing word (string) vklad, a call |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
my @docs = $ctx->contains('vklad%'); |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
would return numbers of all documents containing words starting with |
|
470
|
|
|
|
|
|
|
'vklad', 53 among them. |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
So here it's user's responsibility to maintain a relation between the |
|
473
|
|
|
|
|
|
|
document numbers and their content, to know that a document 53 is about |
|
474
|
|
|
|
|
|
|
vklady. Perhaps the documents are already stored somewhere and have |
|
475
|
|
|
|
|
|
|
a unique numeric id. |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=item string |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Frontend B<string> allows the user to specify the names of the documents as |
|
480
|
|
|
|
|
|
|
strings, instead of numbers. Still the user has to specify both the |
|
481
|
|
|
|
|
|
|
name of the document and the content: |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
$ctx->index_document('upozorneni', |
|
484
|
|
|
|
|
|
|
'Odstrante z dosadu deti!'); |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
After that, |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
$ctx->contains('deti') |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
will return 'upozorneni' as one of the names of documents with word |
|
491
|
|
|
|
|
|
|
'deti' in it. |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=item file |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
To index files, use the frontend B<file>. Here the content of the document |
|
496
|
|
|
|
|
|
|
is clearly the content of the file specified by the filename, so in |
|
497
|
|
|
|
|
|
|
a call to index_document, only the name is needed -- the content of the |
|
498
|
|
|
|
|
|
|
file is read by the MyConText transparently: |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
$ctx->index_document('/usr/doc/FAQ/Linux-FAQ'); |
|
501
|
|
|
|
|
|
|
my @files = $ctx->contains('penguin'); |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item url |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
Web document can be indexed by the frontend B<url>. MyConText uses LWP to |
|
506
|
|
|
|
|
|
|
get the document and then parses it normally: |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
$ctx->index_document('http://www.perl.com/'); |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=item table |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
You can have a MyConText index that indexes char or blob fields in MySQL |
|
513
|
|
|
|
|
|
|
table. Since MySQL doesn't support triggers, you have to call the |
|
514
|
|
|
|
|
|
|
index_document method of MyConText any time something changes in the |
|
515
|
|
|
|
|
|
|
table. So the sequence probably will be |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
$dbh->do('insert into the_table (id, data, other_fields) |
|
518
|
|
|
|
|
|
|
values (?, ?, ?)', {}, $name, $data, $date_or_something); |
|
519
|
|
|
|
|
|
|
$ctx->index_document($name); |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
When calling contains, the id (name) of the record will be returned. If |
|
522
|
|
|
|
|
|
|
the id in the_table is numeric, it's directly used as the internal |
|
523
|
|
|
|
|
|
|
numeric id, otherwise a string's way of converting the id to numeric |
|
524
|
|
|
|
|
|
|
form is used. |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=back |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
The structure of MyConText is very flexible and adding new frontend |
|
529
|
|
|
|
|
|
|
(what will be indexed) is very easy. |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=head2 Backends |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
While frontend specifies what is indexed and how the user sees the |
|
534
|
|
|
|
|
|
|
collection of documents, backend is about low level database way of |
|
535
|
|
|
|
|
|
|
actually storing the information in the tables. Three types are |
|
536
|
|
|
|
|
|
|
available: |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=over 4 |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item blob |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
For each word, a blob holding list of all documents containing that word |
|
543
|
|
|
|
|
|
|
is stored in the table, with the count (number of occurrences) |
|
544
|
|
|
|
|
|
|
associated with each document number. That makes it for very compact |
|
545
|
|
|
|
|
|
|
storage. Since the document names (for example URL) are internally |
|
546
|
|
|
|
|
|
|
converted to numbers, storing and fetching the data is fast. However, |
|
547
|
|
|
|
|
|
|
updating the information is very slow, since information concerning one |
|
548
|
|
|
|
|
|
|
document is spread across all table, without any direct database access. |
|
549
|
|
|
|
|
|
|
Updating a document (or merely reindexing it) requires update of all |
|
550
|
|
|
|
|
|
|
blobs, which is slow. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
The list of documents is stored sorted by document name so that |
|
553
|
|
|
|
|
|
|
fetching an information about a document for one word is relatively |
|
554
|
|
|
|
|
|
|
easy, still a need to update (or at least scan) all records in the table |
|
555
|
|
|
|
|
|
|
makes this storage unsuitable for collections of documents that often |
|
556
|
|
|
|
|
|
|
change. |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=item column |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
The B<column> backend stores a word/document pair in database fields, |
|
561
|
|
|
|
|
|
|
indexing both, thus allowing both fast retrieval and updates -- it's |
|
562
|
|
|
|
|
|
|
easy to delete all records describing one document and insert new ones. |
|
563
|
|
|
|
|
|
|
However, the database indexes that have to be maintained are large. |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Both B<blob> and B<column> backends only store a count -- number of |
|
566
|
|
|
|
|
|
|
occurrences of the word in the document (and even this can be switched |
|
567
|
|
|
|
|
|
|
off, yielding just a yes/no information about the word's presence). |
|
568
|
|
|
|
|
|
|
This allows questions like |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
all documents containing words 'voda' or 'Mattoni' |
|
571
|
|
|
|
|
|
|
but not a word 'kyselka' |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
but you cannot ask whether a document contains a phrase 'kyselka |
|
574
|
|
|
|
|
|
|
Mattoni' because such information is not maintained by these types of |
|
575
|
|
|
|
|
|
|
backends. |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=item phrase |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
To allow phrase matching, a B<phrase> backend is available. For each word |
|
580
|
|
|
|
|
|
|
and document number it stores a blob of lists of positions of the word |
|
581
|
|
|
|
|
|
|
in the document. A query |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
$ctx->contains('kyselk%', 'Mattoni'); |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
then only returns those documents (document names/numbers) where word |
|
586
|
|
|
|
|
|
|
kyselka (or kyselky, or so) is just before word Mattoni. |
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=back |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=head2 Mixing frontends and backends |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
Any frontend can be used with any backend in one MyConText index. You |
|
593
|
|
|
|
|
|
|
can index Web documents with B<url> frontend and B<phrase> backend |
|
594
|
|
|
|
|
|
|
to be able to find phrases in the documents. And you can use the |
|
595
|
|
|
|
|
|
|
default, number based document scheme with B<blob> backend to use the disk |
|
596
|
|
|
|
|
|
|
space as efficiently as possible -- this is useful for example for |
|
597
|
|
|
|
|
|
|
mailing-list archives, where we need to index huge number of documents |
|
598
|
|
|
|
|
|
|
that do not change at all. |
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
Finding optimal combination is very important and may require some |
|
601
|
|
|
|
|
|
|
analysis of the document collection and manipulation, as well as the |
|
602
|
|
|
|
|
|
|
speed and storage requirements. Benchmarking on actual target platform |
|
603
|
|
|
|
|
|
|
is very useful during the design phase. |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head1 METHODS |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
The following methods are available on the user side as MyConText API. |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=over 4 |
|
610
|
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
=item create |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
my $ctx = MyConText->create($dbh, $index_name, %opts); |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
The class method B<create> creates index of given name (the name of the |
|
616
|
|
|
|
|
|
|
index is the name of its basic parameter table) and all necessary |
|
617
|
|
|
|
|
|
|
tables, returns an object -- newly created index. The options that may |
|
618
|
|
|
|
|
|
|
be specified after the index name define the frontend and backend types, |
|
619
|
|
|
|
|
|
|
storage parameters (how many bits for what values), etc. See below for |
|
620
|
|
|
|
|
|
|
list of create options and discussion of their use. |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=item open |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
my $ctx = MyConText->open($dbh, $index_name); |
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
Opens and returns an object accessing the specified MyConText index. Since all |
|
627
|
|
|
|
|
|
|
the index parameters and information are stored in the $index_name table |
|
628
|
|
|
|
|
|
|
(including names of all other needed tables), the database handler and |
|
629
|
|
|
|
|
|
|
the name of the parameter table are the only needed arguments. |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=item index_document |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
$ctx->index_document(45, 'Sleva pri nakupu stribra.'); |
|
634
|
|
|
|
|
|
|
$ctx->index_document('http://www.mozilla.org/'); |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
For the default and B<string> frontends, two arguments are expected -- the |
|
637
|
|
|
|
|
|
|
name (number or string) of the document and its content. For B<file> and |
|
638
|
|
|
|
|
|
|
B<url> frontends only the name of the document is needed. The method |
|
639
|
|
|
|
|
|
|
returns number of words indexed (subject to wild change). |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=item delete_document |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
$ctx->delete_document('http://www.mozilla.org/'); |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
Removes information about document from the index. Note that for B<blob> |
|
646
|
|
|
|
|
|
|
backend this is very time consuming process. |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=item contains |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
my @docs = $ctx->contains('sleva', 'strib%'); |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
Returns list of names (numbers or strings, depending on the frontend) |
|
653
|
|
|
|
|
|
|
of documents that contain some of specified words. |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
=item econtains |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
my @docs = $ctx->econtains('sleva', '+strib%', '-zlato'); |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
Econtains stands for extended contains and allows words to be prefixed |
|
660
|
|
|
|
|
|
|
by plus or minus signs to specify that the word must or mustn't be |
|
661
|
|
|
|
|
|
|
present in the document for it to match. |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=item contains_hashref, econtains_hashref |
|
664
|
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
Similar to B<contains> and B<econtains>, only instead of list of document |
|
666
|
|
|
|
|
|
|
names, these methods return a hash reference to a hash where keys are |
|
667
|
|
|
|
|
|
|
the document names and values are the number of occurrences of the |
|
668
|
|
|
|
|
|
|
words. |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=item drop |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
Removes all tables associated with the index, including the base |
|
673
|
|
|
|
|
|
|
parameter table, effectively destroying the index from the database. |
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=back |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=head1 INDEX OPTIONS |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
Here we list the options that may be passed to MyConText->create call. |
|
680
|
|
|
|
|
|
|
These allow to specify the style and storage parameters in great detail. |
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=over 4 |
|
683
|
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=item backend |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
The backend type, default B<blob>, possible values blob, column and phrase |
|
687
|
|
|
|
|
|
|
(see above for explanation). |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=item frontend |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
The frontend type. The default frontend requires the user to specify |
|
692
|
|
|
|
|
|
|
numeric id of the document together with the content of the document, |
|
693
|
|
|
|
|
|
|
other possible values are string, file and url (see above for |
|
694
|
|
|
|
|
|
|
more info). |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=item word_length |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
Maximum length of words that may be indexed, default 30. |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=item data_table |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
Name of the table where the actual data about word/document relation is |
|
703
|
|
|
|
|
|
|
stored. By default, the name of the index (of the base table) with _data |
|
704
|
|
|
|
|
|
|
suffix is used. |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=item name_length |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
Any frontend that uses strings as names of documents needs to maintain |
|
709
|
|
|
|
|
|
|
a conversion table from these names to internal integer ids. This value |
|
710
|
|
|
|
|
|
|
specifies maximum length of these string names (URLs, file names, ...). |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=item blob_direct_fetch |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
Only for blob backend. When looking for information about specific |
|
715
|
|
|
|
|
|
|
document in the list stored in the blob, the blob backend uses division |
|
716
|
|
|
|
|
|
|
of interval to find the correct place in the blob. When the interval |
|
717
|
|
|
|
|
|
|
gets equal or shorter than this value, all values are fetched from the |
|
718
|
|
|
|
|
|
|
database and the final search is done in Perl code sequentially. |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=item word_id_bits |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
With column or phrase backends, MyConText maintains a numeric id for each |
|
723
|
|
|
|
|
|
|
word to optimize the space requirements. The word_id_bits parameter |
|
724
|
|
|
|
|
|
|
specifies the number of bits to reserve for this conversion and thus |
|
725
|
|
|
|
|
|
|
effectively limits number of distinct words that may be indexed. The |
|
726
|
|
|
|
|
|
|
default is 16 bits and possible values are 8, 16, 24 or 32 bits. |
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=item word_id_table |
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
Name of the table that holds conversion from words to their numeric id |
|
731
|
|
|
|
|
|
|
(for column and phrase backends). By default it is the name of the index |
|
732
|
|
|
|
|
|
|
with _words suffix. |
|
733
|
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
=item doc_id_bits |
|
735
|
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
A number of bits to hold a numeric id of the document (that is either |
|
737
|
|
|
|
|
|
|
provided by the user (with default frontend) or generated by the module |
|
738
|
|
|
|
|
|
|
to accomplish the conversion from the string name of the document). This |
|
739
|
|
|
|
|
|
|
value limits the maximum number of documents to hold. The default is 16 |
|
740
|
|
|
|
|
|
|
bits and possible values are 8, 16 and 32 bits for blob backend and 8, |
|
741
|
|
|
|
|
|
|
16, 24 and 32 bits for column and phrase backends. |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=item doc_id_table |
|
744
|
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
Name of the table that holds conversion from string names of documents |
|
746
|
|
|
|
|
|
|
to their numeric id, by default the name of the index with _docid |
|
747
|
|
|
|
|
|
|
suffix. |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=item count_bits |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
Number of bits reserved for storing number of occurrences of each word |
|
752
|
|
|
|
|
|
|
in the document. The default is 8 and possible values are the same as |
|
753
|
|
|
|
|
|
|
with doc_id_bits. |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=item position_bits |
|
756
|
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
With phrase backend, MyConText stores positions of each word of the |
|
758
|
|
|
|
|
|
|
documents. This value specifies how much space should be reserved for |
|
759
|
|
|
|
|
|
|
this purpose. The default is 32 bits and possible values are 8, 16 or 32 |
|
760
|
|
|
|
|
|
|
bits. This value limits the maximum number of words of each document |
|
761
|
|
|
|
|
|
|
that can be stored. |
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=item splitter |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
MyConText allows the user to provide any Perl code that will be used to |
|
766
|
|
|
|
|
|
|
split the content of the document to words. The code will be evalled |
|
767
|
|
|
|
|
|
|
inside of the MyConText code. The default is |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
$data =~ /(\w{2,$word_length})/g |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
and shows that the input is stored in the variable C<$data> and the code |
|
772
|
|
|
|
|
|
|
may access any other variable available in the perl_and_index_data_* |
|
773
|
|
|
|
|
|
|
methods (see source), especially C<$word_length> to get the maximum length |
|
774
|
|
|
|
|
|
|
of words and C<$backend> to get the backend object. |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
The default value also shows that by default, the minimum length of |
|
777
|
|
|
|
|
|
|
words indexed is 2. |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item filter |
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
The output words of splitter (and also any parameter of (e)contains* |
|
782
|
|
|
|
|
|
|
methods) are sent to filter that may do further processing. Filter is |
|
783
|
|
|
|
|
|
|
again a Perl code, the default is |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
map { lc $_ } |
|
786
|
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
showing that the filter operates on input list and by default does |
|
788
|
|
|
|
|
|
|
conversion to lowercase (yielding case insensitive index). |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=item init_env |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Because user defined splitter or filter may depend on other things that |
|
793
|
|
|
|
|
|
|
it is reasonable to set before the actual processing of words, you can |
|
794
|
|
|
|
|
|
|
use yet another Perl hook to set things up. The default is |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
use locale |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=item table_name |
|
799
|
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
For table frontend; this is the name of the table that will be indexed. |
|
801
|
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=item column_name |
|
803
|
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
For table frontend; this is the name of the column in the table_name |
|
805
|
|
|
|
|
|
|
that contains the documents -- data to be indexed. It can also have |
|
806
|
|
|
|
|
|
|
a form table.column that will be used if the table_name option is not |
|
807
|
|
|
|
|
|
|
specified. |
|
808
|
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=item column_id_name |
|
810
|
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
For table frontend; this is the name of the field in table_name that |
|
812
|
|
|
|
|
|
|
holds names (ids) of the records. If not specified, a field that has |
|
813
|
|
|
|
|
|
|
primary key on it is used. If this field is numeric, its values are |
|
814
|
|
|
|
|
|
|
directly used as identifiers, otherwise a conversion to numeric values |
|
815
|
|
|
|
|
|
|
is made. |
|
816
|
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=back |
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head1 ERROR HANDLING |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
The create and open methods return the MyConText object on success, upon |
|
822
|
|
|
|
|
|
|
failure they return undef and set error message in $MyConText::errstr |
|
823
|
|
|
|
|
|
|
variable. |
|
824
|
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
All other methods return reasonable (documented above) value on success, |
|
826
|
|
|
|
|
|
|
failure is signaled by an unreasonable (typically undef or null) return |
|
827
|
|
|
|
|
|
|
value; the error message may then be retrieved by $ctx->errstr method |
|
828
|
|
|
|
|
|
|
call. |
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head1 VERSION |
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
This documentation describes MyConText module version 0.49. |
|
833
|
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=head1 BUGS |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
Error handling needs more polishing. |
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
We do not check if the stored values are larger than specified by the |
|
839
|
|
|
|
|
|
|
*_bits parameters. |
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
No CGI administration tool at the moment. |
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
Econtains doesn't work with phrase backend. |
|
844
|
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
No scoring algorithm implemented. |
|
846
|
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
No support for stop words at the moment. |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head1 AUTHOR |
|
850
|
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
(c) 1999 Jan Pazdziora, adelton@fi.muni.cz, |
|
852
|
|
|
|
|
|
|
http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk |
|
853
|
|
|
|
|
|
|
University in Brno, Czech Republic |
|
854
|
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
All rights reserved. This package is free software; you can |
|
856
|
|
|
|
|
|
|
redistribute it and/or modify it under the same terms as Perl itself. |
|
857
|
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
859
|
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
DBI(3), mycontextadmin(1). |
|
861
|
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=head1 OTHER PRODUCTS and why I've written this module |
|
863
|
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
I'm aware of DBIx::TextIndex module and about UdmSearch utility, and |
|
865
|
|
|
|
|
|
|
about htdig and glimpse on the non-database side of the world. |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
To me, using a database gives reasonable maintenance benefits. With |
|
868
|
|
|
|
|
|
|
products that use their own files to store the information (even if the |
|
869
|
|
|
|
|
|
|
storage algorithms are efficient and well thought of), you always |
|
870
|
|
|
|
|
|
|
struggle with permissions on files and directories for various users, |
|
871
|
|
|
|
|
|
|
with files that somebody accidentally deleted or mangled, and making the |
|
872
|
|
|
|
|
|
|
index available remotely is not trivial. |
|
873
|
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
That's why I've wanted a module that will use a database as a storage |
|
875
|
|
|
|
|
|
|
backend. With MySQL, you get remote access and access control for free, |
|
876
|
|
|
|
|
|
|
and on many web servers MySQL is part of the standard equipment. So |
|
877
|
|
|
|
|
|
|
using it for text indexes seemed natural. |
|
878
|
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
However, existing DBIx::TextIndex and UdmSearch are too narrow-aimed to |
|
880
|
|
|
|
|
|
|
me. The first only supports indexing of data that is stored in the |
|
881
|
|
|
|
|
|
|
database, but you may not always want or need to store the documents in |
|
882
|
|
|
|
|
|
|
the database as well. The UdmSearch on the other hand is only for web |
|
883
|
|
|
|
|
|
|
documents, making it unsuitable for indexing mailing-list archives or |
|
884
|
|
|
|
|
|
|
local data. |
|
885
|
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
I believe that MyConText is reasonably flexible and still very |
|
887
|
|
|
|
|
|
|
efficient. It doesn't enforce its own idea of what is good for you -- |
|
888
|
|
|
|
|
|
|
the number of options is big and you can always extend the module with |
|
889
|
|
|
|
|
|
|
your own backend or frontend if you feel that those provided are not |
|
890
|
|
|
|
|
|
|
sufficient. Or you can extend existing by adding one or two parameters |
|
891
|
|
|
|
|
|
|
that will add new features. Of course, patches are always welcome. |
|
892
|
|
|
|
|
|
|
MyConText is a tool that can be deployed in many projects. It's not |
|
893
|
|
|
|
|
|
|
a complete environment since different people have different needs. On |
|
894
|
|
|
|
|
|
|
the other hand, the methods that it provides make it easy to build |
|
895
|
|
|
|
|
|
|
a complete solution on top of this in very short course of time. |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
I was primarily inspired by the ConText cartridge of Oracle server. Since |
|
898
|
|
|
|
|
|
|
MySQL doesn't support triggers, it showed up that Perl interface will be |
|
899
|
|
|
|
|
|
|
needed. Of course, porting this module to (for example) PostgreSQL |
|
900
|
|
|
|
|
|
|
should be easy, so different name is probably needed. On the other hand, |
|
901
|
|
|
|
|
|
|
the code is sometimes very MySQL specific to make the module work |
|
902
|
|
|
|
|
|
|
efficiently, so I didn't want a name that would suggest that it's |
|
903
|
|
|
|
|
|
|
a generic tool that will work with any SQL database. |
|
904
|
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=cut |
|
906
|
|
|
|
|
|
|
|