File Coverage

blib/lib/CDB_File/BiIndex.pm
Criterion Covered Total %
statement 115 156 73.7
branch 39 98 39.8
condition 8 21 38.1
subroutine 18 20 90.0
pod 7 11 63.6
total 187 306 61.1


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             package CDB_File::BiIndex;
4             $REVISION=q$Revision: 1.14 $ ;
5 1     1   782 use vars qw($VERSION);
  1         2  
  1         133  
6              
7             $VERSION = '0.030';
8              
9             =head1 NAME
10              
11             CDB_File::BiIndex - index two sets of values against eachother.
12              
13             =head1 SYNOPSIS
14              
15             use CDB_File::BiIndex;
16             #test bi-index is initiated with CDB_Generator
17             $index = new CDB_File::BiIndex "test";
18              
19             $index->lookup_first("USA");
20             $index->lookup_second("Lilongwe");
21              
22             =head1 DESCRIPTION
23              
24             A CDB_File::BiIndex stores a set of relations from one set of strings to
25             another. It's rather similar to a translators dictionary with a list
26             of words in one language linked to words in the other language. The
27             same word can occur in each language, but it's translations would often
28             be different.
29              
30             I <-> je
31             {bar, pub} <-> bar
32             {truck, lorry, heavy goods vehicle} <-> camion
33              
34             In this implementation it's just two parallel cdb hashes, which you
35             have to generate in advance.
36              
37             =head1 EXAMPLE
38              
39              
40             use CDB_File::BiIndex::Generator;
41             use CDB_File::BiIndex;
42             $gen = new CDB_File::BiIndex::Generator "test";
43              
44             $gen->add_relation("John", "Jenny");
45             $gen->add_relation("Roger", "Beth");
46             $gen->add_relation("John", "Gregory");
47             $gen->add_relation("Jemima", "Jenny")
48             $gen->add_relation("John", "Gregory");
49              
50             $gen->finish();
51              
52             $index = new CDB_File::BiIndex::Generator "test";
53              
54             $index->lookup_first("Roger");
55             ["Jenny"]
56             $index->lookup_second("Jenny");
57             ["John", "Jemima"]
58             $index->lookup_second("John");
59             []
60             $index->lookup_first("John");
61             ["Jenny", "Gregory"]
62              
63              
64             =cut
65              
66 1     1   6 use warnings;
  1         3  
  1         27  
67 1     1   5 use strict;
  1         1  
  1         28  
68              
69 1     1   5 use Fcntl;
  1         1  
  1         330  
70 1     1   888 use CDB_File 0.86; # there are serious bugs in previous versions
  1         2740  
  1         53  
71 1     1   7 use Carp;
  1         1  
  1         96  
72              
73             # delete from here ...
74             BEGIN {
75 1 50   1   88 $CDB_File::VERSION==0.9 and die <
76              
77             Suspicious CDB_File version string found (0.9). This was used by
78             CDB_File 0.83 and can cause confusion!!! Please verify that you have
79             CDB_File _distribution_ version equal to or better than 0.86 and then
80             delete this check from the CDB_File::BiIndex. See the
81             CDB_File::BiIndex Manual page (BUGS section) for details.
82              
83             EOF
84             }
85              
86             # ... delete to here
87              
88             $CDB_File::BiIndex::verbose=0 unless defined $CDB_File::BiIndex::verbose;
89             #all debugging messages
90             #$CDB_File::BiIndex::verbose=0xffff unless defined $CDB_File::BiIndex::verbose;
91              
92             =head1 METHODS
93              
94             =cut
95              
96             sub DUMB () {1};
97             sub SEEKABLE () {2};
98              
99             our ($mode);
100              
101             BEGIN {
102 1     1   2232 $mode=DUMB;
103             }
104              
105             # =head1 _cdb_set_iterate
106              
107             # _cdb_set_iterate sets of a CDB so that it will start just after the
108             # key given.
109              
110             # =cut
111              
112             sub _cdb_set_iterate {
113 2     2   4 my $cdb = shift;
114 2         13 my $target = shift;
115 2 50       8 print STDERR "cdb_set_iterate called for $target\n"
116             if $CDB_File::BiIndex::verbose & 32;
117             CASE: {
118 2 50       3 $mode == DUMB and do {
  2         6  
119 2         10 my $key=$cdb->FIRSTKEY();
120 2   66     14 while ( defined $key and $key lt $target) {
121 11 50       41 print "key is $key\n"
122             if $CDB_File::BiIndex::verbose & 64;
123 11         70 $key=$cdb->NEXTKEY($key);
124             }
125 2 50       5 print "final key is $key\n"
126             if $CDB_File::BiIndex::verbose & 64;
127 2         5 return $key;
128             };
129 0         0 die "more efficient modes than DUMB not yet defined";
130             }
131 0         0 die "internal error: don't know how to _cdb_set_iterate";
132             }
133              
134              
135              
136             =head2 CDB_File::BiIndex->new(,[])
137              
138             new (CLASS, database_filenamebase)
139             new (CLASS, first_database_filename, second_database_filename)
140              
141             New opens and sets up the databases.
142              
143             =cut
144              
145             #FIXME. This should be generalised so it works on any pair of hashes.
146             #which is very easy.
147              
148              
149             sub new ($$;$) {
150 1     1 1 36 my $class=shift;
151 1         4 my $self=bless {}, $class;
152              
153             #work out what the arguments mean..
154 1         3 my $first_db_name = shift;
155 1 50       6 carp "usage new CDB_File::BiIndex (, [])"
156             unless defined $first_db_name;
157 1         2 my $second_db_name;
158 1 50       5 if (@_) {
159 1         2 $second_db_name = shift ;
160             } else {
161 0         0 $second_db_name = $first_db_name . ".2-1";
162 0         0 $first_db_name = $first_db_name . ".1-2";
163             }
164              
165 1 50       71 $self->{"first_cdb"} = tie my %first_hash, "CDB_File", $first_db_name
166             or die "Couldn't tie $first_db_name" . $!;
167 1         3 $self->{"first_hash"} = \%first_hash;
168 1 50       38 $self->{"second_cdb"} = tie my %second_hash, "CDB_File", $second_db_name
169             or die "Couldn't tie $second_db_name" . $!;
170 1         3 $self->{"second_hash"} = \%second_hash;
171              
172 1         3 $self->{"first_lastkey"}=undef;
173 1         2 $self->{"second_lastkey"}=undef;
174              
175 1         4 return $self;
176             }
177              
178              
179             =head2 $bi->lookup_first() $bi->lookup_second()
180              
181             returns a B to a list of values which are indexed against
182             key, direction of the relation depending on which function is used.
183              
184             =cut
185              
186              
187             sub lookup_first ($$) {
188 0     0 1 0 my ($self, $key)=@_;
189 0 0       0 print STDERR "lookup_first has been called with key $key\n"
190             if $CDB_File::BiIndex::verbose & 32;
191 0 0       0 croak "lookup_first called with undefined key"
192             unless defined $key;
193 0         0 my $return=$self->{"first_cdb"}->multi_get($key);
194 0 0       0 return undef unless defined $return;
195 0 0       0 die "multi_get didn't return an array ref" unless
196             (ref $return) =~ m/ARRAY/;
197 0 0       0 return undef unless @$return;
198 0         0 return $return;
199             }
200              
201             sub lookup_second ($$) {
202 1     1 1 15 my ($self, $key)=@_;
203 1 50       5 print STDERR "lookup_second has been called with key $key\n"
204             if $CDB_File::BiIndex::verbose & 32;
205 1 50       5 croak "lookup_second called with undefined key"
206             unless defined $key;
207 1         17 my $return=$self->{"second_cdb"}->multi_get($key);
208 1 50       4 return undef unless defined $return;
209 1 50       5 die "multi_get didn't return an array ref" unless
210             (ref $return) =~ m/ARRAY/;
211 1 50       4 return undef unless @$return;
212 1         4 return $return;
213             }
214              
215             # =head1 validate
216              
217             # Because the two indexes match eachother, they should make sense
218             # together. Anything which is indexed under a key in the first index
219             # should be a key in the second index with a the original key part of
220             # its value
221              
222             # =cut
223              
224             # sub validate {
225             # my $self=shift;
226             # if ( validate_against($self->{"first_cdb"},$self->{"second_cdb"})
227             # || validate_against($self->{"second_cdb"},$self->{"first_cdb"}) ) {
228             # return 0; #the validation procedures found faults
229             # } else {
230             # return 1; #validated okay.
231             # }
232             # }
233              
234             # sub validate_against{
235             # my $cdb_one = shift;
236             # die "non cdb passed as validate_against first arg"
237             # unless ref($cdb_one);
238             # my $cdb_two = shift;
239             # die "non cdb passed as validate_against second arg"
240             # unless ref($cdb_two);
241              
242             # my $break_count = 0;
243              
244             # #reset the iteration
245             # $cdb_one->start_iter();
246             # #loop through all of the entries in the first cdb
247             # my ($key,$value);
248             # RELATION: while (($key,$value) = $cdb_one->iterate()) {
249             # unless ($cdb_two->set_position($value)) {
250             # warn "Relation $key to $value in #1, but not $value as key in #2";
251             # $break_count++;
252             # next RELATION;
253             # }
254             # my ($rkey, $rvalue);
255             # CHECK: while (($rkey, $rvalue) = $cdb_two->iterate()) {
256             # last CHECK unless $rkey=$value;
257             # next RELATION if $rvalue=$key;
258             # }
259             # warn "Relation $key to $value in #1, but $key not in "
260             # . $value . "'s record in #2";
261             # $break_count++;
262             # }
263             # return $break_count;
264             # }
265              
266             =head1 ITERATORS
267              
268             The iterators iterate over the different keys in the database. They
269             skip repeated keys.
270              
271             =over 4
272              
273             =item first_set_iterate() second_set_iterate()
274              
275             set the key of the next value that will be returned
276              
277             =item first_next([]) second_next([])
278              
279             return the next key in the hash. If there has never been any
280             iteration before we will return the first key from the database. If
281             there has been iteration, we will return the key imediately following
282             the key which was last returned.
283              
284             If called with an argument, the key following that argument will be
285             returned in any case, but if that argument is exactly the last key
286             returned, we won't seek in the database (set_iterate would do that
287             anyway).
288              
289             =cut
290              
291              
292             # we always have to make sure that FIRST is called once
293             # we can call nextkey all we want until we go off the end.
294             # when we go off the end, we should call FIRST again
295              
296             # strictly internal functions to overcome some of CDB_Files
297             # wierdnesses and to allow us to iterate at the same time as doing
298             # other lookups.
299              
300             # =item xx_first()
301              
302             # return the first key
303              
304             # =item xx_next()
305              
306             # return the next key in the hash after a first
307              
308             # =item xx_reset()
309              
310             # iteration will start from the first key again. Don't normally need to call this.
311              
312             # =cut
313              
314             sub first_reset ($) {
315 2 50   2 0 17 print STDERR "first_reset called\n"
316             if $CDB_File::BiIndex::verbose & 32;
317 2         3 my $self=shift;
318 2         3 my $a=scalar keys %{$self->{"first_hash"}};
  2         52  
319 2         8 $self->{"first_lastkey"}=undef;
320             }
321              
322             sub first_first ($) {
323 1 50   1 0 4 print STDERR "first_first called\n"
324             if $CDB_File::BiIndex::verbose & 32;
325 1         3 my $self=shift;
326 1         4 $self->first_reset(); #overcomes CDB wierdness if I remember??
327 1         4 my $key = $self->{"first_cdb"}->FIRSTKEY();
328 1         2 $self->{"first_lastkey"}=$key;
329 1         4 return $key;
330             }
331              
332             sub first_next ($;$) {
333 3     3 1 16 my $self=shift;
334 3         3 my $key=shift;
335              
336 3         5 my $lastkey=$self->{"first_lastkey"};
337              
338 3 50       8 $CDB_File::BiIndex::verbose & 32 && do {
339 0         0 print STDERR "first_next called ";
340 0 0       0 if (defined $lastkey ) {
341 0         0 print STDERR " stored key $lastkey";
342             } else {
343 0         0 print STDERR " no stored key";
344             }
345 0 0       0 if (defined $key ) {
346 0         0 print STDERR " key $key\n";
347             } else {
348 0         0 print STDERR " no key\n";
349             }
350             };
351              
352             CASE: {
353              
354 3 100 66     4 defined $lastkey or defined $key or do {
  3         12  
355             #this is the start of iteration
356 1 50       9 print STDERR "never iterated; start with first_first\n"
357             if $CDB_File::BiIndex::verbose & 32;
358 1         5 return $self->first_first();
359             };
360              
361 2 50 0     8 defined $key and not (defined $lastkey and $key eq $lastkey) and do {
      33        
362 0         0 $self->first_set_iterate($key);
363 0         0 $lastkey=$key;
364             };
365              
366             }
367              
368 2         3 $key=$lastkey;
369              
370 2         3 KEY: while (1) {
371 10         38 $key=$self->{"first_cdb"}->NEXTKEY($key);
372 10 100       21 defined $key or last KEY;
373 9 100       19 $key eq $lastkey or last KEY;
374 8 50       14 print STDERR "repeat of last key $key. skipping.\n"
375             if $CDB_File::BiIndex::verbose & 128;
376             }
377              
378 2 50       6 ( $CDB_File::BiIndex::verbose & 64 ) && do {
379 0 0       0 print STDERR "returning key $key\n" if defined $key ;
380 0 0       0 print STDERR "reached the end returning undefined key \n"
381             unless defined $key;
382             };
383              
384             #if we run off the end then we should start at the beginning next time
385 2         3 $self->{"first_lastkey"}=$key;
386 2         5 return $key;
387             }
388              
389             sub first_set_iterate ($$) {
390 0     0 1 0 my $self=shift;
391 0         0 my $key=shift;
392 0 0       0 print STDERR "first_set_iterate has been called with key $key\n"
393             if $CDB_File::BiIndex::verbose & 32;
394 0         0 $key=_cdb_set_iterate($self->{"first_cdb"}, $key);
395 0         0 $self->{"first_lastkey"}=$key;
396 0         0 return $key;
397             }
398              
399             sub second_reset ($) {
400 1 50   1 0 12 print STDERR "second_reset called\n"
401             if $CDB_File::BiIndex::verbose & 32;
402 1         1 my $self=shift;
403 1         2 my $a=scalar keys %{$self->{"second_hash"}};
  1         31  
404 1         10 delete $self->{"second_lastkey"};
405             }
406              
407             sub second_first ($) {
408 2 50   2 0 6 print STDERR "second_first called\n"
409             if $CDB_File::BiIndex::verbose & 32;
410 2         3 my $self=shift;
411             # $self->second_reset(); #overcomes CDB wierdness if I remember??
412 2         9 my $key = $self->{"second_cdb"}->FIRSTKEY();
413 2         3 $self->{"second_lastkey"}=$key;
414 2         6 return $key;
415             }
416              
417             sub second_next ($;$) {
418 19     19 1 113 my $self=shift;
419 19         20 my $key=shift;
420              
421 19         30 my $lastkey=$self->{"second_lastkey"};
422              
423 19 50       38 $CDB_File::BiIndex::verbose & 32 && do {
424 0         0 print STDERR "second_next called ";
425 0 0       0 if (defined $lastkey ) {
426 0         0 print STDERR " stored key $lastkey";
427             } else {
428 0         0 print STDERR " no stored key";
429             }
430 0 0       0 if (defined $key ) {
431 0         0 print STDERR " key $key\n";
432             } else {
433 0         0 print STDERR " no key\n";
434             }
435             };
436              
437             CASE: {
438              
439 19 100 66     18 defined $lastkey or defined $key or do {
  19         49  
440             #this is the start of iteration
441 2 50       7 print STDERR "never iterated; start with second_first\n"
442             if $CDB_File::BiIndex::verbose & 32;
443 2         7 return $self->second_first();
444             };
445              
446 17 50 0     38 defined $key and not (defined $lastkey and $key eq $lastkey) and do {
      33        
447 0         0 $self->second_set_iterate($key);
448 0         0 $lastkey=$key;
449             };
450              
451             }
452              
453 17         21 $key=$lastkey;
454              
455 17         15 KEY: while (1) {
456 17         52 $key=$self->{"second_cdb"}->NEXTKEY($key);
457 17 100       35 defined $key or last KEY;
458 15 50       33 $key eq $lastkey or last KEY;
459 0 0       0 print STDERR "repeat of last key $key. skipping.\n"
460             if $CDB_File::BiIndex::verbose & 128;
461             }
462              
463 17 50       34 ( $CDB_File::BiIndex::verbose & 64 ) && do {
464 0 0       0 print STDERR "returning key $key\n" if defined $key ;
465 0 0       0 print STDERR "reached the end returning undefined key \n"
466             unless defined $key;
467             };
468              
469             #if we run off the end then we should start at the beginning next time
470 17         26 $self->{"second_lastkey"}=$key;
471 17         41 return $key;
472             }
473              
474             sub second_set_iterate ($$) {
475 2     2 1 20 my $self=shift;
476 2         5 my $key=shift;
477 2 50       5 print STDERR "second_set_iterate has been called with key $key\n"
478             if $CDB_File::BiIndex::verbose & 32;
479 2         7 $key=_cdb_set_iterate($self->{"second_cdb"}, $key);
480 2         4 $self->{"second_lastkey"}=$key;
481 2         7 return $key;
482             }
483              
484              
485             =head1 BUGS
486              
487             This module requires the version of the CDB_File perl module to be
488             better than 0.86. Unfortunately, version 0.83 was given the version
489             string "0.9" (and version 0.86 has the string '0.86'). This means
490             that normal perl version checking will not give the correct warnings.
491             There is a hardwired check that the version is not 0.9. I assume that
492             future CDB_File modules won't use that version number, but if they do,
493             then please edit inside the CDB_File::BiIndex perl module file its
494             self and delete the section between the lines
495              
496             # delete from here ...
497              
498             and
499              
500             # ... delete to here
501              
502             the module will then hopefully work properly.
503              
504             N.B. please only do that B
505             version> of the distribution than 0.86.
506              
507             =head1 COPYING
508              
509             This module may be distributed under the same terms as perl.
510              
511             =cut
512              
513              
514             1; #what does it prove...