File Coverage

blib/lib/CDB_File.pm
Criterion Covered Total %
statement 19 19 100.0
branch 1 2 50.0
condition n/a
subroutine 7 7 100.0
pod 0 1 0.0
total 27 29 93.1


line stmt bran cond sub pod time code
1             package CDB_File;
2              
3 4     4   276324 use strict;
  4         35  
  4         116  
4              
5 4     4   21 use XSLoader ();
  4         10  
  4         50  
6 4     4   16 use Exporter ();
  4         6  
  4         1073  
7              
8             our @ISA = qw(Exporter);
9             our $VERSION = '1.03';
10             our @EXPORT_OK = qw(create);
11              
12             =head1 NAME
13              
14             CDB_File - Perl extension for access to cdb databases
15              
16             =head1 SYNOPSIS
17              
18             use CDB_File;
19             $c = tie %h, 'CDB_File', 'file.cdb' or die "tie failed: $!\n";
20              
21             $fh = $c->handle;
22             sysseek $fh, $c->datapos, 0 or die ...;
23             sysread $fh, $x, $c->datalen;
24             undef $c;
25             untie %h;
26              
27             $t = CDB_File->new('t.cdb', "t.$$") or die ...;
28             $t->insert('key', 'value');
29             $t->finish;
30              
31             CDB_File::create %t, $file, "$file.$$";
32              
33             or
34              
35             use CDB_File 'create';
36             create %t, $file, "$file.$$";
37              
38             =head1 DESCRIPTION
39              
40             B<CDB_File> is a module which provides a Perl interface to Dan
41             Bernstein's B<cdb> package:
42              
43             cdb is a fast, reliable, lightweight package for creating and
44             reading constant databases.
45              
46             =head2 Reading from a cdb
47              
48             After the C<tie> shown above, accesses to C<%h> will refer
49             to the B<cdb> file C<file.cdb>, as described in L<perlfunc/tie>.
50              
51             Low level access to the database is provided by the three methods
52             C<handle>, C<datapos>, and C<datalen>. To use them, you must remember
53             the C<CDB_File> object returned by the C<tie> call: C<$c> in the
54             example above. The C<datapos> and C<datalen> methods return the
55             file offset position and length respectively of the most recently
56             visited key (for example, via C<exists>).
57              
58             Beware that if you create an extra reference to the C<CDB_File> object
59             (like C<$c> in the example above) you must destroy it (with C<undef>)
60             before calling C<untie> on the hash. This ensures that the object's
61             C<DESTROY> method is called. Note that C<perl -w> will check this for
62             you; see L<perltie> for further details.
63              
64             =head2 Creating a cdb
65              
66             A B<cdb> file is created in three steps. First call C<new CDB_File
67             ($final, $tmp)>, where C<$final> is the name of the database to be
68             created, and C<$tmp> is the name of a temporary file which can be
69             atomically renamed to C<$final>. Secondly, call the C<insert> method
70             once for each (I<key>, I<value>) pair. Finally, call the C<finish>
71             method to complete the creation and renaming of the B<cdb> file.
72              
73             Alternatively, call the C<insert()> method with multiple key/value
74             pairs. This can be significantly faster because there is less crossing
75             over the bridge from perl to C code. One simple way to do this is to pass
76             in an entire hash, as in: C<< $cdbmaker->insert(%hash); >>.
77              
78             A simpler interface to B<cdb> file creation is provided by
79             C<CDB_File::create %t, $final, $tmp>. This creates a B<cdb> file named
80             C<$final> containing the contents of C<%t>. As before, C<$tmp> must
81             name a temporary file which can be atomically renamed to C<$final>.
82             C<CDB_File::create> may be imported.
83              
84             =head1 EXAMPLES
85              
86             These are all complete programs.
87              
88             1. Convert a Berkeley DB (B-tree) database to B<cdb> format.
89              
90             use CDB_File;
91             use DB_File;
92              
93             tie %h, DB_File, $ARGV[0], O_RDONLY, undef, $DB_BTREE or
94             die "$0: can't tie to $ARGV[0]: $!\n";
95              
96             CDB_File::create %h, $ARGV[1], "$ARGV[1].$$" or
97             die "$0: can't create cdb: $!\n";
98              
99             2. Convert a flat file to B<cdb> format. In this example, the flat
100             file consists of one key per line, separated by a colon from the value.
101             Blank lines and lines beginning with B<#> are skipped.
102              
103             use CDB_File;
104              
105             $cdb = new CDB_File("data.cdb", "data.$$") or
106             die "$0: new CDB_File failed: $!\n";
107             while (<>) {
108             next if /^$/ or /^#/;
109             chop;
110             ($k, $v) = split /:/, $_, 2;
111             if (defined $v) {
112             $cdb->insert($k, $v);
113             } else {
114             warn "bogus line: $_\n";
115             }
116             }
117             $cdb->finish or die "$0: CDB_File finish failed: $!\n";
118              
119             3. Perl version of B<cdbdump>.
120              
121             use CDB_File;
122              
123             tie %data, 'CDB_File', $ARGV[0] or
124             die "$0: can't tie to $ARGV[0]: $!\n";
125             while (($k, $v) = each %data) {
126             print '+', length $k, ',', length $v, ":$k->$v\n";
127             }
128             print "\n";
129              
130             4. For really enormous data values, you can use C<handle>, C<datapos>,
131             and C<datalen>, in combination with C<sysseek> and C<sysread>, to
132             avoid reading the values into memory. Here is the script F<bun-x.pl>,
133             which can extract uncompressed files and directories from a B<bun>
134             file.
135              
136             use CDB_File;
137              
138             sub unnetstrings {
139             my($netstrings) = @_;
140             my @result;
141             while ($netstrings =~ s/^([0-9]+)://) {
142             push @result, substr($netstrings, 0, $1, '');
143             $netstrings =~ s/^,//;
144             }
145             return @result;
146             }
147              
148             my $chunk = 8192;
149              
150             sub extract {
151             my($file, $t, $b) = @_;
152             my $head = $$b{"H$file"};
153             my ($code, $type) = $head =~ m/^([0-9]+)(.)/;
154             if ($type eq "/") {
155             mkdir $file, 0777;
156             } elsif ($type eq "_") {
157             my ($total, $now, $got, $x);
158             open OUT, ">$file" or die "open for output: $!\n";
159             exists $$b{"D$code"} or die "corrupt bun file\n";
160             my $fh = $t->handle;
161             sysseek $fh, $t->datapos, 0;
162             $total = $t->datalen;
163             while ($total) {
164             $now = ($total > $chunk) ? $chunk : $total;
165             $got = sysread $fh, $x, $now;
166             if (not $got) { die "read error\n"; }
167             $total -= $got;
168             print OUT $x;
169             }
170             close OUT;
171             } else {
172             print STDERR "warning: skipping unknown file type\n";
173             }
174             }
175              
176             die "usage\n" if @ARGV != 1;
177              
178             my (%b, $t);
179             $t = tie %b, 'CDB_File', $ARGV[0] or die "tie: $!\n";
180             map { extract $_, $t, \%b } unnetstrings $b{""};
181              
182             5. Although a B<cdb> file is constant, you can simulate updating it
183             in Perl. This is an expensive operation, as you have to create a
184             new database, and copy into it everything that's unchanged from the
185             old database. (As compensation, the update does not affect database
186             readers. The old database is available for them, till the moment the
187             new one is C<finish>ed.)
188              
189             use CDB_File;
190              
191             $file = 'data.cdb';
192             $new = new CDB_File($file, "$file.$$") or
193             die "$0: new CDB_File failed: $!\n";
194              
195             # Add the new values; remember which keys we've seen.
196             while (<>) {
197             chop;
198             ($k, $v) = split;
199             $new->insert($k, $v);
200             $seen{$k} = 1;
201             }
202              
203             # Add any old values that haven't been replaced.
204             tie %old, 'CDB_File', $file or die "$0: can't tie to $file: $!\n";
205             while (($k, $v) = each %old) {
206             $new->insert($k, $v) unless $seen{$k};
207             }
208              
209             $new->finish or die "$0: CDB_File finish failed: $!\n";
210              
211             =head1 REPEATED KEYS
212              
213             Most users can ignore this section.
214              
215             A B<cdb> file can contain repeated keys. If the C<insert> method is
216             called more than once with the same key during the creation of a B<cdb>
217             file, that key will be repeated.
218              
219             Here's an example.
220              
221             $cdb = new CDB_File ("$file.cdb", "$file.$$") or die ...;
222             $cdb->insert('cat', 'gato');
223             $cdb->insert('cat', 'chat');
224             $cdb->finish;
225              
226             Normally, any attempt to access a key retrieves the first value
227             stored under that key. This code snippet always prints B<gato>.
228              
229             $catref = tie %catalogue, CDB_File, "$file.cdb" or die ...;
230             print "$catalogue{cat}";
231              
232             However, all the usual ways of iterating over a hash---C<keys>,
233             C<values>, and C<each>---do the Right Thing, even in the presence of
234             repeated keys. This code snippet prints B<cat cat gato chat>.
235              
236             print join(' ', keys %catalogue, values %catalogue);
237              
238             And these two both print B<cat:gato cat:chat>, although the second is
239             more efficient.
240              
241             foreach $key (keys %catalogue) {
242             print "$key:$catalogue{$key} ";
243             }
244              
245             while (($key, $val) = each %catalogue) {
246             print "$key:$val ";
247             }
248              
249             The C<multi_get> method retrieves all the values associated with a key.
250             It returns a reference to an array containing all the values. This code
251             prints B<gato chat>.
252              
253             print "@{$catref->multi_get('cat')}";
254              
255             C<multi_get> always returns an array reference. If the key was not
256             found in the database, it will be a reference to an empty array. To
257             test whether the key was found, you must test the array, and not the
258             reference.
259              
260             $x = $catref->multi_get($key);
261             warn "$key not found\n" unless $x; # WRONG; message never printed
262             warn "$key not found\n" unless @$x; # Correct
263              
264             The C<fetch_all> method returns a hashref of all keys with the first
265             value in the cdb. This is useful for quickly loading a cdb file where
266             there is a 1:1 key mapping. In practice it proved to be about 400%
267             faster than iterating a tied hash.
268              
269             # Slow
270             my %copy = %tied_cdb;
271              
272             # Much Faster
273             my $copy_hashref = $catref->fetch_all();
274              
275             =head1 RETURN VALUES
276              
277             The routines C<tie>, C<new>, and C<finish> return B<undef> if the
278             attempted operation failed; C<$!> contains the reason for failure.
279              
280             =head1 DIAGNOSTICS
281              
282             The following fatal errors may occur. (See L<perlfunc/eval> if
283             you want to trap them.)
284              
285             =over 4
286              
287             =item Modification of a CDB_File attempted
288              
289             You attempted to modify a hash tied to a B<CDB_File>.
290              
291             =item CDB database too large
292              
293             You attempted to create a B<cdb> file larger than 4 gigabytes.
294              
295             =item [ Write to | Read of | Seek in ] CDB_File failed: <error string>
296              
297             If B<error string> is B<Protocol error>, you tried to C<use CDB_File> to
298             access something that isn't a B<cdb> file. Otherwise a serious OS level
299             problem occurred, for example, you have run out of disk space.
300              
301             =back
302              
303             =head1 PERFORMANCE
304              
305             Sometimes you need to get the most performance possible out of a
306             library. Rumour has it that perl's tie() interface is slow. In order
307             to get around that you can use CDB_File in an object oriented
308             fashion, rather than via tie().
309              
310             my $cdb = CDB_File->TIEHASH('/path/to/cdbfile.cdb');
311              
312             if ($cdb->EXISTS('key')) {
313             print "Key is: ", $cdb->FETCH('key'), "\n";
314             }
315              
316             For more information on the methods available on tied hashes see
317             L<perltie>.
318              
319             =head1 THE ALGORITHM
320              
321             This algorithm is described at L<http://cr.yp.to/cdb/cdb.txt>. It is
322             small enough that it is included inline in the event that the
323             internet loses the page:
324              
325             =head2 A structure for constant databases
326              
327             Copyright (c) 1996 D. J. Bernstein, L<djb@pobox.com>
328              
329             A cdb is an associative array: it maps strings (``keys'') to strings
330             (``data'').
331              
332             A cdb contains 256 pointers to linearly probed open hash tables. The
333             hash tables contain pointers to (key,data) pairs. A cdb is stored in
334             a single file on disk:
335              
336             +----------------+---------+-------+-------+-----+---------+
337             | p0 p1 ... p255 | records | hash0 | hash1 | ... | hash255 |
338             +----------------+---------+-------+-------+-----+---------+
339              
340             Each of the 256 initial pointers states a position and a length. The
341             position is the starting byte position of the hash table. The length
342             is the number of slots in the hash table.
343              
344             Records are stored sequentially, without special alignment. A record
345             states a key length, a data length, the key, and the data.
346              
347             Each hash table slot states a hash value and a byte position. If the
348             byte position is 0, the slot is empty. Otherwise, the slot points to
349             a record whose key has that hash value.
350              
351             Positions, lengths, and hash values are 32-bit quantities, stored in
352             little-endian form in 4 bytes. Thus a cdb must fit into 4 gigabytes.
353              
354             A record is located as follows. Compute the hash value of the key in
355             the record. The hash value modulo 256 is the number of a hash table.
356             The hash value divided by 256, modulo the length of that table, is a
357             slot number. Probe that slot, the next higher slot, and so on, until
358             you find the record or run into an empty slot.
359              
360             The cdb hash function is C<h = ((h << 5) + h) ^ c>, with a starting
361             hash of 5381.
362              
363              
364             =head1 BUGS
365              
366             The C<create()> interface could be done with C<TIEHASH>.
367              
368             =head1 SEE ALSO
369              
370             cdb(3)
371              
372             =head1 AUTHOR
373              
374             Tim Goodwin, <tjg@star.le.ac.uk>. B<CDB_File> began on 1997-01-08.
375              
376             Work provided through 2008 by Matt Sergeant, <matt@sergeant.org>
377              
378             Now maintained by Todd Rinaldo, <toddr@cpan.org>
379              
380             =cut
381              
382             XSLoader::load( 'CDB_File', $VERSION );
383              
384             sub CLEAR {
385 2     2   14 require Carp;
386 2         288 Carp::croak("Modification of a CDB_File attempted");
387             }
388              
389             sub DELETE {
390 1     1   627 &CLEAR;
391             }
392              
393             sub STORE {
394 1     1   7283 &CLEAR;
395             }
396              
397             # Must be preloaded for the prototype.
398              
399             sub create(\%$$) {
400 4     4 0 34485 my ( $RHdata, $fn, $fntemp ) = @_;
401              
402 4 50       271 my $cdb = CDB_File->new( $fn, $fntemp ) or return undef;
403 4         17 my ( $k, $v );
404 4         84 $cdb->insert(%$RHdata);
405 4         18003 $cdb->finish;
406 4         62 return 1;
407             }
408              
409             1;