File Coverage

blib/lib/CDB_File/BiIndex.pm

Criterion	Covered	Total	%
statement	115	156	73.7
branch	39	98	39.8
condition	8	21	38.1
subroutine	18	20	90.0
pod	7	11	63.6
total	187	306	61.1

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl -w
2
3							package CDB_File::BiIndex;
4							$REVISION=q$Revision: 1.14 $ ;
5	1			1		782	use vars qw($VERSION);
	1					2
	1					133
6
7							$VERSION = '0.030';
8
9							=head1 NAME
10
11							CDB_File::BiIndex - index two sets of values against eachother.
12
13							=head1 SYNOPSIS
14
15							use CDB_File::BiIndex;
16							#test bi-index is initiated with CDB_Generator
17							$index = new CDB_File::BiIndex "test";
18
19							$index->lookup_first("USA");
20							$index->lookup_second("Lilongwe");
21
22							=head1 DESCRIPTION
23
24							A CDB_File::BiIndex stores a set of relations from one set of strings to
25							another. It's rather similar to a translators dictionary with a list
26							of words in one language linked to words in the other language. The
27							same word can occur in each language, but it's translations would often
28							be different.
29
30							I <-> je
31							{bar, pub} <-> bar
32							{truck, lorry, heavy goods vehicle} <-> camion
33
34							In this implementation it's just two parallel cdb hashes, which you
35							have to generate in advance.
36
37							=head1 EXAMPLE
38
39
40							use CDB_File::BiIndex::Generator;
41							use CDB_File::BiIndex;
42							$gen = new CDB_File::BiIndex::Generator "test";
43
44							$gen->add_relation("John", "Jenny");
45							$gen->add_relation("Roger", "Beth");
46							$gen->add_relation("John", "Gregory");
47							$gen->add_relation("Jemima", "Jenny")
48							$gen->add_relation("John", "Gregory");
49
50							$gen->finish();
51
52							$index = new CDB_File::BiIndex::Generator "test";
53
54							$index->lookup_first("Roger");
55							["Jenny"]
56							$index->lookup_second("Jenny");
57							["John", "Jemima"]
58							$index->lookup_second("John");
59							[]
60							$index->lookup_first("John");
61							["Jenny", "Gregory"]
62
63
64							=cut
65
66	1			1		6	use warnings;
	1					3
	1					27
67	1			1		5	use strict;
	1					1
	1					28
68
69	1			1		5	use Fcntl;
	1					1
	1					330
70	1			1		888	use CDB_File 0.86; # there are serious bugs in previous versions
	1					2740
	1					53
71	1			1		7	use Carp;
	1					1
	1					96
72
73							# delete from here ...
74							BEGIN {
75	1	50		1		88	$CDB_File::VERSION==0.9 and die <
76
77							Suspicious CDB_File version string found (0.9). This was used by
78							CDB_File 0.83 and can cause confusion!!! Please verify that you have
79							CDB_File _distribution_ version equal to or better than 0.86 and then
80							delete this check from the CDB_File::BiIndex. See the
81							CDB_File::BiIndex Manual page (BUGS section) for details.
82
83							EOF
84							}
85
86							# ... delete to here
87
88							$CDB_File::BiIndex::verbose=0 unless defined $CDB_File::BiIndex::verbose;
89							#all debugging messages
90							#$CDB_File::BiIndex::verbose=0xffff unless defined $CDB_File::BiIndex::verbose;
91
92							=head1 METHODS
93
94							=cut
95
96							sub DUMB () {1};
97							sub SEEKABLE () {2};
98
99							our ($mode);
100
101							BEGIN {
102	1			1		2232	$mode=DUMB;
103							}
104
105							# =head1 _cdb_set_iterate
106
107							# _cdb_set_iterate sets of a CDB so that it will start just after the
108							# key given.
109
110							# =cut
111
112							sub _cdb_set_iterate {
113	2			2		4	my $cdb = shift;
114	2					13	my $target = shift;
115	2	50				8	print STDERR "cdb_set_iterate called for $target\n"
116							if $CDB_File::BiIndex::verbose & 32;
117							CASE: {
118	2	50				3	$mode == DUMB and do {
	2					6
119	2					10	my $key=$cdb->FIRSTKEY();
120	2		66			14	while ( defined $key and $key lt $target) {
121	11	50				41	print "key is $key\n"
122							if $CDB_File::BiIndex::verbose & 64;
123	11					70	$key=$cdb->NEXTKEY($key);
124							}
125	2	50				5	print "final key is $key\n"
126							if $CDB_File::BiIndex::verbose & 64;
127	2					5	return $key;
128							};
129	0					0	die "more efficient modes than DUMB not yet defined";
130							}
131	0					0	die "internal error: don't know how to _cdb_set_iterate";
132							}
133
134
135
136							=head2 CDB_File::BiIndex->new(,[])
137
138							new (CLASS, database_filenamebase)
139							new (CLASS, first_database_filename, second_database_filename)
140
141							New opens and sets up the databases.
142
143							=cut
144
145							#FIXME. This should be generalised so it works on any pair of hashes.
146							#which is very easy.
147
148
149							sub new ($$;$) {
150	1			1	1	36	my $class=shift;
151	1					4	my $self=bless {}, $class;
152
153							#work out what the arguments mean..
154	1					3	my $first_db_name = shift;
155	1	50				6	carp "usage new CDB_File::BiIndex (, [])"
156							unless defined $first_db_name;
157	1					2	my $second_db_name;
158	1	50				5	if (@_) {
159	1					2	$second_db_name = shift ;
160							} else {
161	0					0	$second_db_name = $first_db_name . ".2-1";
162	0					0	$first_db_name = $first_db_name . ".1-2";
163							}
164
165	1	50				71	$self->{"first_cdb"} = tie my %first_hash, "CDB_File", $first_db_name
166							or die "Couldn't tie $first_db_name" . $!;
167	1					3	$self->{"first_hash"} = \%first_hash;
168	1	50				38	$self->{"second_cdb"} = tie my %second_hash, "CDB_File", $second_db_name
169							or die "Couldn't tie $second_db_name" . $!;
170	1					3	$self->{"second_hash"} = \%second_hash;
171
172	1					3	$self->{"first_lastkey"}=undef;
173	1					2	$self->{"second_lastkey"}=undef;
174
175	1					4	return $self;
176							}
177
178
179							=head2 $bi->lookup_first() $bi->lookup_second()
180
181							returns a B to a list of values which are indexed against
182							key, direction of the relation depending on which function is used.
183
184							=cut
185
186
187							sub lookup_first ($$) {
188	0			0	1	0	my ($self, $key)=@_;
189	0	0				0	print STDERR "lookup_first has been called with key $key\n"
190							if $CDB_File::BiIndex::verbose & 32;
191	0	0				0	croak "lookup_first called with undefined key"
192							unless defined $key;
193	0					0	my $return=$self->{"first_cdb"}->multi_get($key);
194	0	0				0	return undef unless defined $return;
195	0	0				0	die "multi_get didn't return an array ref" unless
196							(ref $return) =~ m/ARRAY/;
197	0	0				0	return undef unless @$return;
198	0					0	return $return;
199							}
200
201							sub lookup_second ($$) {
202	1			1	1	15	my ($self, $key)=@_;
203	1	50				5	print STDERR "lookup_second has been called with key $key\n"
204							if $CDB_File::BiIndex::verbose & 32;
205	1	50				5	croak "lookup_second called with undefined key"
206							unless defined $key;
207	1					17	my $return=$self->{"second_cdb"}->multi_get($key);
208	1	50				4	return undef unless defined $return;
209	1	50				5	die "multi_get didn't return an array ref" unless
210							(ref $return) =~ m/ARRAY/;
211	1	50				4	return undef unless @$return;
212	1					4	return $return;
213							}
214
215							# =head1 validate
216
217							# Because the two indexes match eachother, they should make sense
218							# together. Anything which is indexed under a key in the first index
219							# should be a key in the second index with a the original key part of
220							# its value
221
222							# =cut
223
224							# sub validate {
225							# my $self=shift;
226							# if ( validate_against($self->{"first_cdb"},$self->{"second_cdb"})
227							# \|\| validate_against($self->{"second_cdb"},$self->{"first_cdb"}) ) {
228							# return 0; #the validation procedures found faults
229							# } else {
230							# return 1; #validated okay.
231							# }
232							# }
233
234							# sub validate_against{
235							# my $cdb_one = shift;
236							# die "non cdb passed as validate_against first arg"
237							# unless ref($cdb_one);
238							# my $cdb_two = shift;
239							# die "non cdb passed as validate_against second arg"
240							# unless ref($cdb_two);
241
242							# my $break_count = 0;
243
244							# #reset the iteration
245							# $cdb_one->start_iter();
246							# #loop through all of the entries in the first cdb
247							# my ($key,$value);
248							# RELATION: while (($key,$value) = $cdb_one->iterate()) {
249							# unless ($cdb_two->set_position($value)) {
250							# warn "Relation $key to $value in #1, but not $value as key in #2";
251							# $break_count++;
252							# next RELATION;
253							# }
254							# my ($rkey, $rvalue);
255							# CHECK: while (($rkey, $rvalue) = $cdb_two->iterate()) {
256							# last CHECK unless $rkey=$value;
257							# next RELATION if $rvalue=$key;
258							# }
259							# warn "Relation $key to $value in #1, but $key not in "
260							# . $value . "'s record in #2";
261							# $break_count++;
262							# }
263							# return $break_count;
264							# }
265
266							=head1 ITERATORS
267
268							The iterators iterate over the different keys in the database. They
269							skip repeated keys.
270
271							=over 4
272
273							=item first_set_iterate() second_set_iterate()
274
275							set the key of the next value that will be returned
276
277							=item first_next([]) second_next([])
278
279							return the next key in the hash. If there has never been any
280							iteration before we will return the first key from the database. If
281							there has been iteration, we will return the key imediately following
282							the key which was last returned.
283
284							If called with an argument, the key following that argument will be
285							returned in any case, but if that argument is exactly the last key
286							returned, we won't seek in the database (set_iterate would do that
287							anyway).
288
289							=cut
290
291
292							# we always have to make sure that FIRST is called once
293							# we can call nextkey all we want until we go off the end.
294							# when we go off the end, we should call FIRST again
295
296							# strictly internal functions to overcome some of CDB_Files
297							# wierdnesses and to allow us to iterate at the same time as doing
298							# other lookups.
299
300							# =item xx_first()
301
302							# return the first key
303
304							# =item xx_next()
305
306							# return the next key in the hash after a first
307
308							# =item xx_reset()
309
310							# iteration will start from the first key again. Don't normally need to call this.
311
312							# =cut
313
314							sub first_reset ($) {
315	2	50		2	0	17	print STDERR "first_reset called\n"
316							if $CDB_File::BiIndex::verbose & 32;
317	2					3	my $self=shift;
318	2					3	my $a=scalar keys %{$self->{"first_hash"}};
	2					52
319	2					8	$self->{"first_lastkey"}=undef;
320							}
321
322							sub first_first ($) {
323	1	50		1	0	4	print STDERR "first_first called\n"
324							if $CDB_File::BiIndex::verbose & 32;
325	1					3	my $self=shift;
326	1					4	$self->first_reset(); #overcomes CDB wierdness if I remember??
327	1					4	my $key = $self->{"first_cdb"}->FIRSTKEY();
328	1					2	$self->{"first_lastkey"}=$key;
329	1					4	return $key;
330							}
331
332							sub first_next ($;$) {
333	3			3	1	16	my $self=shift;
334	3					3	my $key=shift;
335
336	3					5	my $lastkey=$self->{"first_lastkey"};
337
338	3	50				8	$CDB_File::BiIndex::verbose & 32 && do {
339	0					0	print STDERR "first_next called ";
340	0	0				0	if (defined $lastkey ) {
341	0					0	print STDERR " stored key $lastkey";
342							} else {
343	0					0	print STDERR " no stored key";
344							}
345	0	0				0	if (defined $key ) {
346	0					0	print STDERR " key $key\n";
347							} else {
348	0					0	print STDERR " no key\n";
349							}
350							};
351
352							CASE: {
353
354	3	100	66			4	defined $lastkey or defined $key or do {
	3					12
355							#this is the start of iteration
356	1	50				9	print STDERR "never iterated; start with first_first\n"
357							if $CDB_File::BiIndex::verbose & 32;
358	1					5	return $self->first_first();
359							};
360
361	2	50	0			8	defined $key and not (defined $lastkey and $key eq $lastkey) and do {
			33
362	0					0	$self->first_set_iterate($key);
363	0					0	$lastkey=$key;
364							};
365
366							}
367
368	2					3	$key=$lastkey;
369
370	2					3	KEY: while (1) {
371	10					38	$key=$self->{"first_cdb"}->NEXTKEY($key);
372	10	100				21	defined $key or last KEY;
373	9	100				19	$key eq $lastkey or last KEY;
374	8	50				14	print STDERR "repeat of last key $key. skipping.\n"
375							if $CDB_File::BiIndex::verbose & 128;
376							}
377
378	2	50				6	( $CDB_File::BiIndex::verbose & 64 ) && do {
379	0	0				0	print STDERR "returning key $key\n" if defined $key ;
380	0	0				0	print STDERR "reached the end returning undefined key \n"
381							unless defined $key;
382							};
383
384							#if we run off the end then we should start at the beginning next time
385	2					3	$self->{"first_lastkey"}=$key;
386	2					5	return $key;
387							}
388
389							sub first_set_iterate ($$) {
390	0			0	1	0	my $self=shift;
391	0					0	my $key=shift;
392	0	0				0	print STDERR "first_set_iterate has been called with key $key\n"
393							if $CDB_File::BiIndex::verbose & 32;
394	0					0	$key=_cdb_set_iterate($self->{"first_cdb"}, $key);
395	0					0	$self->{"first_lastkey"}=$key;
396	0					0	return $key;
397							}
398
399							sub second_reset ($) {
400	1	50		1	0	12	print STDERR "second_reset called\n"
401							if $CDB_File::BiIndex::verbose & 32;
402	1					1	my $self=shift;
403	1					2	my $a=scalar keys %{$self->{"second_hash"}};
	1					31
404	1					10	delete $self->{"second_lastkey"};
405							}
406
407							sub second_first ($) {
408	2	50		2	0	6	print STDERR "second_first called\n"
409							if $CDB_File::BiIndex::verbose & 32;
410	2					3	my $self=shift;
411							# $self->second_reset(); #overcomes CDB wierdness if I remember??
412	2					9	my $key = $self->{"second_cdb"}->FIRSTKEY();
413	2					3	$self->{"second_lastkey"}=$key;
414	2					6	return $key;
415							}
416
417							sub second_next ($;$) {
418	19			19	1	113	my $self=shift;
419	19					20	my $key=shift;
420
421	19					30	my $lastkey=$self->{"second_lastkey"};
422
423	19	50				38	$CDB_File::BiIndex::verbose & 32 && do {
424	0					0	print STDERR "second_next called ";
425	0	0				0	if (defined $lastkey ) {
426	0					0	print STDERR " stored key $lastkey";
427							} else {
428	0					0	print STDERR " no stored key";
429							}
430	0	0				0	if (defined $key ) {
431	0					0	print STDERR " key $key\n";
432							} else {
433	0					0	print STDERR " no key\n";
434							}
435							};
436
437							CASE: {
438
439	19	100	66			18	defined $lastkey or defined $key or do {
	19					49
440							#this is the start of iteration
441	2	50				7	print STDERR "never iterated; start with second_first\n"
442							if $CDB_File::BiIndex::verbose & 32;
443	2					7	return $self->second_first();
444							};
445
446	17	50	0			38	defined $key and not (defined $lastkey and $key eq $lastkey) and do {
			33
447	0					0	$self->second_set_iterate($key);
448	0					0	$lastkey=$key;
449							};
450
451							}
452
453	17					21	$key=$lastkey;
454
455	17					15	KEY: while (1) {
456	17					52	$key=$self->{"second_cdb"}->NEXTKEY($key);
457	17	100				35	defined $key or last KEY;
458	15	50				33	$key eq $lastkey or last KEY;
459	0	0				0	print STDERR "repeat of last key $key. skipping.\n"
460							if $CDB_File::BiIndex::verbose & 128;
461							}
462
463	17	50				34	( $CDB_File::BiIndex::verbose & 64 ) && do {
464	0	0				0	print STDERR "returning key $key\n" if defined $key ;
465	0	0				0	print STDERR "reached the end returning undefined key \n"
466							unless defined $key;
467							};
468
469							#if we run off the end then we should start at the beginning next time
470	17					26	$self->{"second_lastkey"}=$key;
471	17					41	return $key;
472							}
473
474							sub second_set_iterate ($$) {
475	2			2	1	20	my $self=shift;
476	2					5	my $key=shift;
477	2	50				5	print STDERR "second_set_iterate has been called with key $key\n"
478							if $CDB_File::BiIndex::verbose & 32;
479	2					7	$key=_cdb_set_iterate($self->{"second_cdb"}, $key);
480	2					4	$self->{"second_lastkey"}=$key;
481	2					7	return $key;
482							}
483
484
485							=head1 BUGS
486
487							This module requires the version of the CDB_File perl module to be
488							better than 0.86. Unfortunately, version 0.83 was given the version
489							string "0.9" (and version 0.86 has the string '0.86'). This means
490							that normal perl version checking will not give the correct warnings.
491							There is a hardwired check that the version is not 0.9. I assume that
492							future CDB_File modules won't use that version number, but if they do,
493							then please edit inside the CDB_File::BiIndex perl module file its
494							self and delete the section between the lines
495
496							# delete from here ...
497
498							and
499
500							# ... delete to here
501
502							the module will then hopefully work properly.
503
504							N.B. please only do that B
505							version> of the distribution than 0.86.
506
507							=head1 COPYING
508
509							This module may be distributed under the same terms as perl.
510
511							=cut
512
513
514							1; #what does it prove...