File Coverage

blib/lib/MyConText.pm

Criterion	Covered	Total	%
statement	15	221	6.7
branch	0	86	0.0
condition	0	3	0.0
subroutine	5	21	23.8
pod	9	14	64.2
total	29	345	8.4

line	stmt	bran	cond	sub	pod	time	code
1
2							=head1 NAME
3
4							MyConText - Indexing documents with MySQL as storage
5
6							=cut
7
8							package MyConText;
9	11			11		7740	use strict;
	11					19
	11					341
10
11	11			11		48	use vars qw($errstr $VERSION);
	11					17
	11					1723
12							$errstr = undef;
13							$VERSION = '0.49';
14
15							my %DEFAULT_PARAMS = (
16							'num_of_docs' => 0, # statistical value, should be maintained
17							'word_length' => 30, # max length of words we index
18
19							'protocol' => 40, # we only support protocol with the same numbers
20
21							'blob_direct_fetch' => 20, # with the blob store, when we stop searching
22							# and fetch everything at once
23							'data_table' => undef, # table where the actual index is stored
24							'name_length' => 255, # for filenames or URLs, what's the max length
25
26							'word_id_bits' => 16, # num of bits for word_id (column store)
27							'doc_id_bits' => 16, # num of bits for doc_id
28							'count_bits' => 8, # num of bits for count value
29							'position_bits' => 32, # num of bits for word positions
30
31							'backend' => 'blob', # what database backend (way the data is
32							# stored) we use
33							'frontend' => 'none', # what application frontend we use (how
34							# the index behaves externally)
35							'filter' => 'map { lc $_ }',
36							'splitter' => ' $data =~ /(\w{2,$word_length})/g',
37							# can use the $data and $word_length
38							# variables
39							'init_env' => 'use locale'
40							);
41							my %backend_types = (
42							'blob' => 'MyConText::Blob',
43							'column' => 'MyConText::Column',
44							'phrase' => 'MyConText::Phrase',
45							);
46							my %frontend_types = (
47							'none' => 'MyConText',
48							'default' => 'MyConText',
49							'file' => 'MyConText::File',
50							'string' => 'MyConText::String',
51							'url' => 'MyConText::URL',
52							'table' => 'MyConText::Table',
53							);
54
55	11			11		51	use vars qw! %BITS_TO_PACK %BITS_TO_INT %BITS_TO_PRECISION %PRECISION_TO_BITS !;
	11					28
	11					17422
56							%BITS_TO_PACK = qw! 0 A0 8 C 16 S 32 L !;
57							%BITS_TO_INT = qw! 8 tinyint 16 smallint 24 mediumint 32 int 64 bigint !;
58							%BITS_TO_PRECISION = qw! 8 4 16 6 24 9 32 11 !;
59							%PRECISION_TO_BITS = map { ( $BITS_TO_PRECISION{$_} => $_ ) } keys %BITS_TO_PRECISION;
60
61							# Open reads in the information about existing index, creates an object
62							# in memory
63							sub open {
64	0			0	1		my ($class, $dbh, $TABLE) = @_;
65	0						$errstr = undef;
66
67							# the $dbh is either a real dbh or a DBI->connect parameters arrayref
68	0						my $mydbh = 0;
69	0	0					if (ref $dbh eq 'ARRAY') {
70							$dbh = DBI->connect(@$dbh) or
71	0	0					do { $errstr = $DBI::errstr; return; };
	0
	0
72
73	0						$mydbh = 1;
74							}
75
76							# load the parameters to the object
77	0						my %PARAMS = %DEFAULT_PARAMS;
78	0						my $sth = $dbh->prepare("select * from $TABLE");
79	0						$sth->{'PrintError'} = 0;
80	0						$sth->{'RaiseError'} = 0;
81	0	0					$sth->execute or do {
82	0	0					if (not grep { $TABLE eq $_ }
	0
83							MyConText->list_context_indexes($dbh)) {
84	0						$errstr = "ConText index $TABLE doesn't exist.";
85							}
86	0						else { $errstr = $sth->errstr; }
87	0						return;
88							};
89	0						while (my ($param, $value) = $sth->fetchrow_array) {
90	0						$PARAMS{$param} = $value;
91							}
92	0						my $self = bless {
93							'dbh' => $dbh,
94							'table' => $TABLE,
95							%PARAMS,
96							}, $class;
97	0						my $data_table = $self->{'data_table'};
98
99							# we should disconnect if we've opened the dbh here
100	0	0					if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; }
	0
101
102							# some basic sanity check
103							defined $dbh->selectrow_array("select count(*) from $data_table")
104	0	0					or do { $errstr = "Table $data_table not found in the database\n"; return; };
	0
	0
105
106
107							# load and set the application frontend
108	0						my $front_module = $frontend_types{$PARAMS{'frontend'}};
109	0	0					if (defined $front_module) {
110	0	0					if ($front_module ne $class) {
111	0						eval "use $front_module";
112	0	0					die $@ if $@;
113							}
114	0						bless $self, $front_module;
115	0						$self->_open_tables;
116							}
117	0						else { $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; }
	0
118
119							# load and set the backend (actual database access) module
120	0						my $back_module = $backend_types{$PARAMS{'backend'}};
121	0	0					if (defined $back_module) {
122	0						eval "use $back_module";
123	0	0					die $@ if $@;
124	0						$self->{'db_backend'} = $back_module->open($self);
125							}
126	0						else { $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; }
	0
127
128							# finally, return the object
129	0						$self;
130							}
131
132							# Create creates tables in the database according to the options, then
133							# calls open to load the object to memory
134							sub create {
135	0			0	1		my ($class, $dbh, $TABLE, %OPTIONS) = @_;
136	0						$errstr = undef;
137	0						my $mydbh = 0;
138	0	0					if (ref $dbh eq 'ARRAY') {
139							$dbh = DBI->connect(@$dbh) or
140	0	0					do { $errstr = $DBI::errstr; return; };
	0
	0
141	0						$mydbh = 1;
142							}
143
144	0						my $self = bless {
145							'dbh' => $dbh,
146							'table' => $TABLE,
147							%DEFAULT_PARAMS,
148							%OPTIONS
149							}, $class;
150
151	0	0					$self->{'data_table'} = $TABLE.'_data'
152							unless defined $self->{'data_table'};
153
154	0						my $CREATE_PARAM = <<EOF;
155							create table $TABLE (
156							param varchar(16) binary not null,
157							value varchar(255),
158							primary key (param)
159							)
160							EOF
161	0	0					$dbh->do($CREATE_PARAM) or do { $errstr = $dbh->errstr; return; };
	0
	0
162	0						push @{$self->{'created_tables'}}, $TABLE;
	0
163
164							# load and set the frontend database structures
165	0						my $front_module = $frontend_types{$self->{'frontend'}};
166	0	0					if (defined $front_module) {
167	0						eval "use $front_module";
168	0	0					die $@ if $@;
169	0						bless $self, $front_module;
170	0						$errstr = $self->_create_tables;
171	0	0					if (defined $errstr) { $self->clean_failed_create; return; }
	0
	0
172							}
173	0						else { $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n"; $self->clean_failed_create; return; }
	0
	0
174
175							# create the backend database structures
176	0						my $back_module = $backend_types{$self->{'backend'}};
177	0	0					if (defined $back_module) {
178	0						eval "use $back_module";
179	0	0					die $@ if $@;
180	0						$errstr = $back_module->_create_tables($self);
181	0	0					if (defined $errstr) { $self->clean_failed_create; return; }
	0
	0
182							}
183	0						else { $errstr = "Specified backend type `$self->{'backend'}' is unknown\n"; $self->clean_failed_create; return; }
	0
	0
184
185	0						for (grep { not ref $self->{$_} } keys %$self) {
	0
186	0						$dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_});
187							}
188
189	0						return $class->open($dbh, $TABLE);
190							}
191
192	0			0			sub _create_tables {}
193	0			0			sub _open_tables {}
194
195							sub clean_failed_create {
196	0			0	0		my $self = shift;
197	0						my $dbh = $self->{'dbh'};
198	0						for my $table (@{$self->{'created_tables'}}) {
	0
199	0						$dbh->do("drop table $table");
200							}
201							}
202
203							sub drop {
204	0			0	1		my $self = shift;
205	0						my $dbh = $self->{'dbh'};
206	0						for my $tag (keys %$self) {
207	0	0					next unless $tag =~ /(^|_)table$/;
208	0						$dbh->do("drop table $self->{$tag}");
209							}
210	0						1;
211							}
212							sub errstr {
213	0			0	0		my $self = shift;
214	0	0					ref $self ? $self->{'errstr'} : $errstr;
215							}
216
217							sub list_context_indexes {
218	0			0	0		my ($class, $dbh) = @_;
219	0						my %tables = map { ( $_->[0] => 1 ) }
	0
220	0						@{$dbh->selectall_arrayref('show tables')};
221	0						my %indexes = ();
222	0						for my $table (keys %tables) {
223	0						local $dbh->{'PrintError'} = 0;
224	0						local $dbh->{'RaiseError'} = 0;
225	0	0					if ($dbh->selectrow_array("select param, value from $table
226							where param = 'data_table'")) {
227	0						$indexes{$table} = 1;
228							}
229							}
230	0						return sort keys %indexes;
231							}
232
233							sub index_document {
234	0			0	1		my ($self, $id, $data) = @_;
235	0	0					return unless defined $id;
236
237	0						my $dbh = $self->{'dbh'};
238
239	0						my $param_table = $self->{'table'};
240
241	0						my $adding_doc = 0;
242
243	0						my $adding = 0;
244	0	0	0				if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) {
245	0						$self->{'max_doc_id'} = $id;
246	0	0					my $update_max_doc_id_sth =
247							( defined $self->{'update_max_doc_id_sth'}
248							? $self->{'update_max_doc_id_sth'}
249							: $self->{'update_max_doc_id_sth'} = $dbh->prepare("replace into $param_table values (?, ?)"));
250	0						$update_max_doc_id_sth->execute('max_doc_id', $id);
251	0						$adding_doc = 1;
252							}
253
254	0						my $init_env = $self->{'init_env'}; # use packages, etc.
255	0	0					eval $init_env if defined $init_env;
256	0	0					print STDERR "Init_env failed with $@\n" if $@;
257
258	0	0					$data = '' unless defined $data;
259	0						return $self->{'db_backend'}->parse_and_index_data($adding_doc,
260							$id, $data);
261							}
262
263							# used for backends that need a count for each of the words
264							sub parse_and_index_data_count {
265	0			0	0		my ($backend, $adding_doc, $id, $data) = @_;
266							## note that this is run with backend object
267	0						my $self = $backend->{'ctx'};
268
269	0						my $word_length = $self->{'word_length'};
270							# this needs to get parametrized (lc, il2_to_ascii, parsing of
271							# HTML tags, ...)
272
273	0						my %words;
274
275	11			11		10459	use locale;
	11					2408
	11					50
276	0						my $filter = $self->{'filter'} . ' ' . $self->{'splitter'};
277	0						for my $word ( eval $filter ) {
278	0	0					$words{$word} = 0 if not defined $words{$word};
279	0						$words{$word}++;
280							}
281
282	0						my @result;
283	0	0					if ($adding_doc) {
284	0						@result = $backend->add_document($id, \%words);
285							}
286							else {
287	0						@result = $backend->update_document($id, \%words);
288							}
289
290	0	0					if (wantarray) {
291	0						return @result;
292							}
293	0						return $result[0];
294							}
295
296							# used for backends where list of occurrences is needed
297							sub parse_and_index_data_list {
298	0			0	0		my ($backend, $adding_doc, $id, $data) = @_;
299							## note that this is run with backend object
300	0						my $self = $backend->{'ctx'};
301
302	0						my $word_length = $self->{'word_length'};
303							# this needs to get parametrized (lc, il2_to_ascii, parsing of
304							# HTML tags, ...)
305
306	0						my %words;
307
308	11			11		2267	use locale;
	11					22
	11					41
309	0						my $filter = $self->{'filter'} . ' ' . $self->{'splitter'};
310
311	0						my $i = 0;
312	0						for my $word ( eval $filter ) {
313	0						push @{$words{$word}}, ++$i;
	0
314							}
315
316	0						my @result;
317	0	0					if ($adding_doc) {
318	0						@result = $backend->add_document($id, \%words);
319							}
320							else {
321	0						@result = $backend->update_document($id, \%words);
322							}
323
324	0	0					if (wantarray) {
325	0						return @result;
326							}
327	0						return $result[0];
328							}
329							sub delete_document {
330	0			0	1		my $self = shift;
331	0						$self->{'db_backend'}->delete_document(@_);
332							}
333
334							sub contains_hashref {
335	0			0	1		my $self = shift;
336	0						my $filter = $self->{'filter'};
337	0						$self->{'db_backend'}->contains_hashref(eval $filter.' @_');
338							}
339							sub contains {
340	0			0	1		my $self = shift;
341	0						my $res = $self->contains_hashref(@_);
342	0	0					if (not $self->{'count_bits'}) { return keys %$res; }
	0
343	0						return sort { $res->{$b} <=> $res->{$a} } keys %$res;
	0
344							}
345							sub econtains_hashref {
346	0			0	1		my $self = shift;
347	0						my $docs = {};
348	0						my $word_num = 0;
349
350	0						my $is_some_plus = grep /^\+/, @_;
351
352	0						for my $word ( map { /^\+(.+)$/s } @_) {
	0
353	0						$word_num++;
354	0						my $oneword = $self->contains_hashref($word);
355	0	0					if ($word_num == 1) { $docs = $oneword; next; }
	0
	0
356	0						for my $doc (keys %$oneword) {
357	0	0					$docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc};
358							}
359	0						for my $doc (keys %$docs) {
360	0	0					delete $docs->{$doc} unless defined $oneword->{$doc};
361							}
362							}
363
364	0						for my $word ( map { /^([^+-].*)$/s } @_) {
	0
365	0						my $oneword = $self->contains_hashref($word);
366	0						for my $doc (keys %$oneword) {
367	0	0					if ($is_some_plus) {
368	0	0					$docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc};
369							}
370							else {
371	0	0					$docs->{$doc} = 0 unless defined $docs->{$doc};
372	0						$docs->{$doc} += $oneword->{$doc};
373							}
374							}
375							}
376
377	0						for my $word ( map { /^-(.+)$/s } @_) {
	0
378	0						my $oneword = $self->contains_hashref($word);
379	0						for my $doc (keys %$oneword) {
380	0						delete $docs->{$doc};
381							}
382							}
383	0						$docs;
384							}
385							sub econtains {
386	0			0	1		my $self = shift;
387	0						my $res = $self->econtains_hashref(@_);
388	0	0					if (not $self->{'count_bits'}) { return keys %$res; }
	0
389	0						return sort { $res->{$b} <=> $res->{$a} } keys %$res;
	0
390							}
391
392							1;
393
394							=head1 SYNOPSIS
395
396							use MyConText;
397							use DBI;
398							# connect to database (regular DBI)
399							my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd');
400							# create a new index
401							my $ctx = MyConText->create($dbh, 'ctx_web_1',
402							'frontend' => 'string', 'backend' => 'blob');
403							# or open existing one
404							# my $ctx = MyConText->open($dbh, 'ctx_web_1');
405
406							# index documents
407							$ctx->index_document('krtek', 'krtek leze pod zemi');
408							$ctx->index_document('jezek', 'Jezek ma ostre bodliny.');
409
410							# search for matches
411							my @documents = $ctx->contains('krtek');
412							my @docs = $ctx->econtains('+krtek', '-Jezek');
413
414
415							=head1 DESCRIPTION
416
417							MyConText is a pure man's solution for indexing contents of documents.
418							It uses the MySQL database to store the information about words and
419							documents and provides Perl interface for indexing new documents,
420							making changes and searching for matches. For MyConText, a document
421							is nearly anything -- Perl scalar, file, Web document, database field.
422
423							The basic style of interface is shown above. What you need is a MySQL
424							database and a DBI with DBD::mysql. Then you create a MyConText index
425							-- a set of tables that maintain all necessary information. Once created
426							it can be accessed many times, either for updating the index (adding
427							documents) or searching.
428
429							MyConText uses one basic table to store parameters of the index. Second
430							table is used to store the actual information about documents and words,
431							and depending on the type of the index (specified during index creation)
432							there may be more tables to store additional information (like
433							conversion from external string names (eg. URL's) to internal numeric
434							form). For a user, these internal thingies and internal behaviour of the
435							index are not important. The important part is the API, the methods to
436							index document and ask questions about words in documents. However,
437							certain understanding of how it all works may be useful when you are
438							deciding if this module is for you and what type of index will best
439							suit your needs.
440
441							=head2 Frontends
442
443							From the user, application point of view, the MyConText index stores
444							documents that are named in a certain way, allows adding new documents,
445							and provides methods to ask: "give me list of names of documents that
446							contain this list of words". The MyConText index doesn't store the
447							documents themselves. Instead, it stores information about words in the
448							documents in such a structured way that it makes it easy and fast to look
449							up what documents contain certain words and return names of the
450							documents.
451
452							MyConText provides a couple of predefined frontend classes that specify
453							various types of documents (and the way they relate to their names).
454
455							=over 4
456
457							=item default
458
459							By default, user specifies the integer number of the document and the
460							content (body) of the document. The code would for example read
461
462							$ctx->index_document(53, 'zastavujeme vyplaty vkladu');
463
464							and MyConText will remember that the document 53 contains three words.
465							When looking for all documents containing word (string) vklad, a call
466
467							my @docs = $ctx->contains('vklad%');
468
469							would return numbers of all documents containing words starting with
470							'vklad', 53 among them.
471
472							So here it's user's responsibility to maintain a relation between the
473							document numbers and their content, to know that a document 53 is about
474							vklady. Perhaps the documents are already stored somewhere and have
475							unique numeric id.
476
477							=item string
478
479							Frontend B<string> allows the user to specify the names of the documents as
480							strings, instead of numbers. Still the user has to specify both the
481							name of the document and the content:
482
483							$ctx->index_document('upozorneni',
484							'Odstrante z dosadu deti!');
485
486							After that,
487
488							$ctx->contains('deti')
489
490							will return 'upozorneni' as one of the names of documents with word
491							'deti' in it.
492
493							=item file
494
495							To index files, use the frontend B<file>. Here the content of the document
496							is clearly the content of the file specified by the filename, so in
497							a call to index_document, only the name is needed -- the content of the
498							file is read by the MyConText transparently:
499
500							$ctx->index_document('/usr/doc/FAQ/Linux-FAQ');
501							my @files = $ctx->contains('penguin');
502
503							=item url
504
505							Web documents can be indexed by the frontend B<url>. MyConText uses LWP to
506							get the document and then parses it normally:
507
508							$ctx->index_document('http://www.perl.com/');
509
510							=item table
511
512							You can have a MyConText index that indexes char or blob fields in MySQL
513							table. Since MySQL doesn't support triggers, you have to call the
514							index_document method of MyConText any time something changes in the
515							table. So the sequence probably will be
516
517							$dbh->do('insert into the_table (id, data, other_fields)
518							values (?, ?, ?)', {}, $name, $data, $date_or_something);
519							$ctx->index_document($name);
520
521							When calling contains, the id (name) of the record will be returned. If
522							the id in the_table is numeric, it's directly used as the internal
523							numeric id, otherwise a string's way of converting the id to numeric
524							form is used.
525
526							=back
527
528							The structure of MyConText is very flexible and adding new frontend
529							(what will be indexed) is very easy.
530
531							=head2 Backends
532
533							While frontend specifies what is indexed and how the user sees the
534							collection of documents, backend is about low level database way of
535							actually storing the information in the tables. Three types are
536							available:
537
538							=over 4
539
540							=item blob
541
542							For each word, a blob holding list of all documents containing that word
543							is stored in the table, with the count (number of occurrences)
544							associated with each document number. That makes for very compact
545							storage. Since the document names (for example URL) are internally
546							converted to numbers, storing and fetching the data is fast. However,
547							updating the information is very slow, since information concerning one
548							document is spread across the whole table, without any direct database access.
549							Updating a document (or merely reindexing it) requires update of all
550							blobs, which is slow.
551
552							The list of documents is stored sorted by document name so that
553							fetching an information about a document for one word is relatively
554							easy, still a need to update (or at least scan) all records in the table
555							makes this storage unsuitable for collections of documents that often
556							change.
557
558							=item column
559
560							The B<column> backend stores a word/document pair in database fields,
561							indexing both, thus allowing both fast retrieval and updates -- it's
562							easy to delete all records describing one document and insert new ones.
563							However, the database indexes that have to be maintained are large.
564
565							Both B<blob> and B<column> backends only store a count -- number of
566							occurrences of the word in the document (and even this can be switched
567							off, yielding just a yes/no information about the word's presence).
568							This allows questions like
569
570							all documents containing words 'voda' or 'Mattoni'
571							but not a word 'kyselka'
572
573							but you cannot ask whether a document contains a phrase 'kyselka
574							Mattoni' because such information is not maintained by these types of
575							backends.
576
577							=item phrase
578
579							To allow phrase matching, a B<phrase> backend is available. For each word
580							and document number it stores a blob of lists of positions of the word
581							in the document. A query
582
583							$ctx->contains('kyselk%', 'Mattoni');
584
585							then only returns those documents (document names/numbers) where word
586							kyselka (or kyselky, or so) is just before word Mattoni.
587
588							=back
589
590							=head2 Mixing frontends and backends
591
592							Any frontend can be used with any backend in one MyConText index. You
593							can index Web documents with B<url> frontend and B<phrase> backend
594							to be able to find phrases in the documents. And you can use the
595							default, number based document scheme with B<blob> backend to use the disk
596							space as efficiently as possible -- this is useful for example for
597							mailing-list archives, where we need to index huge number of documents
598							that do not change at all.
599
600							Finding optimal combination is very important and may require some
601							analysis of the document collection and manipulation, as well as the
602							speed and storage requirements. Benchmarking on actual target platform
603							is very useful during the design phase.
604
605							=head1 METHODS
606
607							The following methods are available on the user side as MyConText API.
608
609							=over 4
610
611							=item create
612
613							my $ctx = MyConText->create($dbh, $index_name, %opts);
614
615							The class method B<create> creates index of given name (the name of the
616							index is the name of its basic parameter table) and all necessary
617							tables, returns an object -- newly created index. The options that may
618							be specified after the index name define the frontend and backend types,
619							storage parameters (how many bits for what values), etc. See below for
620							list of create options and discussion of their use.
621
622							=item open
623
624							my $ctx = MyConText->open($dbh, $index_name);
625
626							Opens and returns object, accessing the specified MyConText index. Since all
627							the index parameters and information are stored in the $index_name table
628							(including names of all other needed tables), the database handler and
629							the name of the parameter table are the only needed arguments.
630
631							=item index_document
632
633							$ctx->index_document(45, 'Sleva pri nakupu stribra.');
634							$ctx->index_document('http://www.mozilla.org/');
635
636							For the default and B<string> frontends, two arguments are expected -- the
637							name (number or string) of the document and its content. For B<file> and
638							B<url> frontends only the name of the document is needed. The method
639							returns number of words indexed (subject to wild change).
640
641							=item delete_document
642
643							$ctx->delete_document('http://www.mozilla.org/');
644
645							Removes information about document from the index. Note that for B<blob>
646							backend this is very time consuming process.
647
648							=item contains
649
650							my @docs = $ctx->contains('sleva', 'strib%');
651
652							Returns list of names (numbers or strings, depending on the frontend)
653							of documents that contain some of specified words.
654
655							=item econtains
656
657							my @docs = $ctx->econtains('sleva', '+strib%', '-zlato');
658
659							Econtains stands for extended contains and allows words to be prefixed
660							by plus or minus signs to specify that the word must or mustn't be
661							present in the document for it to match.
662
663							=item contains_hashref, econtains_hashref
664
665							Similar to B<contains> and B<econtains>, only instead of list of document
666							names, these methods return a hash reference to a hash where keys are
667							the document names and values are the number of occurrences of the
668							words.
669
670							=item drop
671
672							Removes all tables associated with the index, including the base
673							parameter table, effectively removing the index from the database.
674
675							=back
676
677							=head1 INDEX OPTIONS
678
679							Here we list the options that may be passed to MyConText->create call.
680							These allow to specify the style and storage parameters in great detail.
681
682							=over 4
683
684							=item backend
685
686							The backend type, default B<blob>, possible values blob, column and phrase
687							(see above for explanation).
688
689							=item frontend
690
691							The frontend type. The default frontend requires the user to specify
692							numeric id of the document together with the content of the document,
693							other possible values are string, file and url (see above for
694							more info).
695
696							=item word_length
697
698							Maximum length of words that may be indexed, default 30.
699
700							=item data_table
701
702							Name of the table where the actual data about word/document relation is
703							stored. By default, the name of the index (of the base table) with _data
704							suffix is used.
705
706							=item name_length
707
708							Any frontend that uses strings as names of documents needs to maintain
709							a conversion table from these names to internal integer ids. This value
710							specifies maximum length of these string names (URLs, file names, ...).
711
712							=item blob_direct_fetch
713
714							Only for blob backend. When looking for information about specific
715							document in the list stored in the blob, the blob backend uses division
716							of interval to find the correct place in the blob. When the interval
717							gets equal or shorter than this value, all values are fetched from the
718							database and the final search is done in Perl code sequentially.
719
720							=item word_id_bits
721
722							With column or phrase backends, MyConText maintains a numeric id for each
723							word to optimize the space requirements. The word_id_bits parameter
724							specifies the number of bits to reserve for this conversion and thus
725							effectively limits number of distinct words that may be indexed. The
726							default is 16 bits and possible values are 8, 16, 24 or 32 bits.
727
728							=item word_id_table
729
730							Name of the table that holds conversion from words to their numeric id
731							(for column and phrase backends). By default it is the name of the index
732							with _words suffix.
733
734							=item doc_id_bits
735
736							A number of bits to hold a numeric id of the document (that is either
737							provided by the user (with default frontend) or generated by the module
738							to accomplish the conversion from the string name of the document). This
739							value limits the maximum number of documents to hold. The default is 16
740							bits and possible values are 8, 16 and 32 bits for blob backend and 8,
741							16, 24 and 32 bits for column and phrase backends.
742
743							=item doc_id_table
744
745							Name of the table that holds conversion from string names of documents
746							to their numeric id, by default the name of the index with _docid
747							suffix.
748
749							=item count_bits
750
751							Number of bits reserved for storing number of occurrences of each word
752							in the document. The default is 8 and possible values are the same as
753							with doc_id_bits.
754
755							=item position_bits
756
757							With phrase backend, MyConText stores positions of each word of the
758							documents. This value specifies how much space should be reserved for
759							this purpose. The default is 32 bits and possible values are 8, 16 or 32
760							bits. This value limits the maximum number of words of each document
761							that can be stored.
762
763							=item splitter
764
765							MyConText allows the user to provide any Perl code that will be used to
766							split the content of the document to words. The code will be evalled
767							inside of the MyConText code. The default is
768
769							$data =~ /(\w{2,$word_length})/g
770
771							and shows that the input is stored in the variable C<$data> and the code
772							may access any other variable available in the parse_and_index_data_*
773							methods (see source), especially C<$word_length> to get the maximum length
774							of words and C<$backend> to get the backend object.
775
776							The default value also shows that by default, the minimum length of
777							words indexed is 2.
778
779							=item filter
780
781							The output words of splitter (and also any parameter of (e)contains*
782							methods) are sent to filter that may do further processing. Filter is
783							again a Perl code, the default is
784
785							map { lc $_ }
786
787							showing that the filter operates on input list and by default does
788							conversion to lowercase (yielding case insensitive index).
789
790							=item init_env
791
792							Because user defined splitter or filter may depend on other things that
793							it is reasonable to set before the actual processing of words, you can
794							use yet another Perl hook to set things up. The default is
795
796							use locale
797
798							=item table_name
799
800							For table frontend; this is the name of the table that will be indexed.
801
802							=item column_name
803
804							For table frontend; this is the name of the column in the table_name
805							that contains the documents -- data to be indexed. It can also have
806							a form table.column that will be used if the table_name option is not
807							specified.
808
809							=item column_id_name
810
811							For table frontend; this is the name of the field in table_name that
812							holds names (ids) of the records. If not specified, a field that has
813							primary key on it is used. If this field is numeric, its values are
814							directly used as identifiers, otherwise a conversion to numeric values
815							is made.
816
817							=back
818
819							=head1 ERROR HANDLING
820
821							The create and open methods return the MyConText object on success, upon
822							failure they return undef and set error message in $MyConText::errstr
823							variable.
824
825							All other methods return reasonable (documented above) value on success,
826							failure is signaled by unreasonable (typically undef or null) return
827							value; the error message may then be retrieved by $ctx->errstr method
828							call.
829
830							=head1 VERSION
831
832							This documentation describes MyConText module version 0.49.
833
834							=head1 BUGS
835
836							Error handling needs more polishing.
837
838							We do not check if the stored values are larger than specified by the
839							*_bits parameters.
840
841							No CGI administration tool at the moment.
842
843							Econtains doesn't work with phrase backend.
844
845							No scoring algorithm implemented.
846
847							No support for stop words at the moment.
848
849							=head1 AUTHOR
850
851							(c) 1999 Jan Pazdziora, adelton@fi.muni.cz,
852							http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk
853							University in Brno, Czech Republic
854
855							All rights reserved. This package is free software; you can
856							redistribute it and/or modify it under the same terms as Perl itself.
857
858							=head1 SEE ALSO
859
860							DBI(3), mycontextadmin(1).
861
862							=head1 OTHER PRODUCTS and why I've written this module
863
864							I'm aware of DBIx::TextIndex module and about UdmSearch utility, and
865							about htdig and glimpse on the non-database side of the world.
866
867							To me, using a database gives reasonable maintenance benefits. With
868							products that use their own files to store the information (even if the
869							storage algorithms are efficient and well thought of), you always
870							struggle with permissions on files and directories for various users,
871							with files that somebody accidentally deleted or mangled, and making the
872							index available remotely is not trivial.
873
874							That's why I've wanted a module that will use a database as a storage
875							backend. With MySQL, you get remote access and access control for free,
876							and on many web servers MySQL is part of the standard equipment. So
877							using it for text indexes seemed natural.
878
879							However, existing DBIx::TextIndex and UdmSearch are too narrowly aimed for
880							me. The first only supports indexing of data that is stored in the
881							database, but you may not always want or need to store the documents in
882							the database as well. The UdmSearch on the other hand is only for web
883							documents, making it unsuitable for indexing mailing-list archives or
884							local data.
885
886							I believe that MyConText is reasonably flexible and still very
887							efficient. It doesn't enforce its own idea of what is good for you --
888							the number of options is big and you can always extend the module with
889							your own backend or frontend if you feel that those provided are not
890							sufficient. Or you can extend existing by adding one or two parameters
891							that will add new features. Of course, patches are always welcome.
892							MyConText is a tool that can be deployed in many projects. It's not
893							a complete environment since different people have different needs. On
894							the other hand, the methods that it provides make it easy to build
895							a complete solution on top of this in very short course of time.
896
897							I was primarily inspired by the ConText cartridge of Oracle server. Since
898							MySQL doesn't support triggers, it turned out that a Perl interface will be
899							needed. Of course, porting this module to (for example) PostgreSQL
900							should be easy, so different name is probably needed. On the other hand,
901							the code is sometimes very MySQL specific to make the module work
902							efficiently, so I didn't want a name that would suggest that it's
903							a generic tool that will work with any SQL database.
904
905							=cut
906