File Coverage

blib/lib/CDB_File/Generator.pm
Criterion Covered Total %
statement 56 73 76.7
branch 5 14 35.7
condition 0 3 0.0
subroutine 9 13 69.2
pod 5 7 71.4
total 75 110 68.1


line stmt bran cond sub pod time code
1             package CDB_File::Generator;
2             $REVISION=q$Revision: 1.13 $ ;
3 1     1   746 use vars qw($VERSION);
  1         2  
  1         71  
4             $VERSION='0.030';
5              
6             =head1 NAME
7              
8             CDB_File::Generator - generate massive sorted CDB files simply.
9              
10             =head1 SYNOPSIS
11              
12             use CDB_File::Generator;
13             $gen = new CDB::Generator "my.cdb";
14             $gen->("Fred", "Martha");
15             $gen->("Fred", "Olivia");
16             $gen->("Fred", "Jenny");
17             $gen->("Roger", "Joe");
18             $gen->("Roger", "Jenny");
19             $gen = undef;
20             use CDB_File;
21              
22              
23             =head1 DESCRIPTION
24              
25             This is a class which makes generating sorted large (much bigger than
26             memory, but the speed will depend on the efficiency of your sort
27             command. If you haven't got one, for example, it won't work at all.)
28             CDB files on the fly very easy
29              
30             =cut
31              
32             #this lets us have lots of unique temporary files.
33              
34 1     1   5 use Carp;
  1         2  
  1         58  
35 1     1   994 use IO::File;
  1         11625  
  1         164  
36 1     1   9 use strict;
  1         2  
  1         56  
37              
38             BEGIN {
39 1     1   1291 my $tempfile_no=0;
40 2     2 0 9 sub next_tmp_file () { $tempfile_no++ };
41             }
42              
43             =head1 METHODS
44              
45             =head2 Generator::new $cdbfile [$cdbmaketemp [{$tmpname [$sorttmpname] | $tmpdir}]]
46              
47             The new function creates a generator for a given filename, optionally
48             specifying where it sould put it's temporary files.
49              
50             =cut
51              
52             my $deftemp="/tmp/cdb_generator_tmp.$$.";
53              
54             sub new ($$@) {
55 1     1 1 42 my $class=shift;
56 1         3 my $self = bless {}, $class;
57 1         3 my $cdbfilename = shift;
58 1         7 $self->{"cdbfile"} = $cdbfilename;
59 1         7 $self->{"tmpoutfile"} = $deftemp . next_tmp_file();
60 1         4 $self->{"tmpsortfile"} = $deftemp . next_tmp_file();
61 1         2 my $tmpmake = $cdbfilename;
62 1 50       10 $tmpmake =~ s/.cdb$/.tmp/ or $tmpmake = $tmpmake . ".tmp";
63 1 50       13 Carp::croak "file $tmpmake for cdb temfile exists" if -e $tmpmake;
64 1         3 $self->{"tmpmakefile"} = $tmpmake;
65 1 50       11 my $fh = new IO::File $self->{"tmpoutfile"}, '>'
66             or die "couldn't create output file";
67 1         271 $self->{"fh"} = $fh;
68 1         4 $self->{"added"} = 0;
69 1         3 return $self;
70             }
71              
72             =head2 $gen->add($key, $value)
73              
74             Adds a value to the CDB being created
75              
76             =cut
77              
78             sub add ($$$) {
79 5     5 1 48 my ($self, $key, $value) = @_;
80 5         9 my $fh = $self->{"fh"};
81              
82             #change newlines so sort can sort everything as lines.
83             #change tabs so we can use them as a separator
84 5         7 $key =~ s,\\,\\\\,g; #now we have all \s in even numbered groups
85 5         7 $key =~ s,\n,\\n,g; #an odd \ followed by a newline is a nl
86 5         6 $key =~ s,\t,\\t,g; #an odd \ followed by a tab is a nl
87 5         6 $value =~ s,\\,\\\\,g; #....
88 5         7 $value =~ s,\n,\\n,g;
89 5         7 $value =~ s,\t,\\t,g; # ....
90              
91 5         27 print $fh $key, "\t", $value, "\n";
92              
93 5         14 $self->{"added"} ++;
94             }
95              
96             =head2 $gen->DESTROY
97              
98             This is not normally called by the user, but rather by the completion
99             of the cdbfile being writen out and that block of the program being
100             exited or by the program completing. When it us run, it calls the
101             finish method which ends the CDB creation. See below.
102              
103             =cut
104              
105             sub DESTROY ($) {
106 0     0   0 my $self=shift;
107 0 0 0     0 $self->{"added"} && $self->finish() unless $self->{"abort"};
108 0         0 foreach my $del ( "tmpoutfile", "tmpsortfile", "tmpmakefile") {
109 0         0 unlink $self->{$del};
110             }
111             }
112              
113             =head2 finish
114              
115             Finish ends of the cdb creation. First it closes the output temporary
116             file, then it sorts it to another file and finally it calls C
117             to complete the creation job.
118              
119             In the current implementation this uses C and deletes repeats of
120             the same key with the same value.
121              
122             In order to increase database portability, by default all sorting is
123             done in the 'C' locale, even if the current program is working in
124             another locale. This is "the right thing" in many cases. Where you
125             are dealing with real word keys it won't be the right thing. In this case, use the locale function to set the locale.
126              
127             =cut
128              
129             sub locale ($) {
130 0     0 0 0 my $self=shift;
131 0         0 my $locale=shift;
132 0         0 $self->{locale}=$locale;
133             }
134              
135             sub finish ($) {
136 1     1 1 9 my $self=shift;
137 1         56 close $self->{"fh"};
138              
139             {
140 1         3 local $ENV{LC_ALL};
  1         7  
141 1 50       6 if ($self->{locale}) {
142 0         0 $ENV{LC_ALL}=$self->{locale};
143             } else {
144 1         8 $ENV{LC_ALL}='C';
145             }
146 1         15392 system 'sort' , '-u', '-o' ,$self->{"tmpsortfile"} , $self->{"tmpoutfile"};
147             }
148              
149 1 50       67 my $fh = new IO::File $self->{"tmpsortfile"}
150             or die "couldn't open sorted output file";
151              
152 1         372 my $cdbmakeout = new IO::File ( '|cdbmake ' . $self->{"cdbfile"}
153             . ' ' . $self->{"tmpmakefile"} );
154              
155 1         10887 while (<$fh>) {
156 1         35 my ($key, $value) = m/^(.*)\t(.*)$/;
157              
158             #the \G s allow for multiple new lines in a row
159             #odd numbered slash with t is tab
160 1         8 $key =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\t,$1\t,g;
161             #odd numbered slash with t is tab
162 1         7 $key =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\n,$1\n,g;
163             #pairs of slashes match a single slash
164 1         2 $key =~ s,\\\\,\\,g;
165              
166             #same for value....
167 1         8 $value =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\t,$1\t,g;
168 1         7 $value =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\n,$1\n,g;
169 1         7 $value =~ s,\\\\,\\,g;
170              
171 1         216 print $cdbmakeout &gen_cdb_input($key, $value);
172             }
173 0           print $cdbmakeout "\n";
174 0           $cdbmakeout->close;
175 0           $fh->close;
176              
177             # $self->{"abort"} = 1;
178             #FIXME return codes etc..
179 0 0         unlink $self->{"tmpsortfile"} , $self->{"tmpoutfile"}
180             or warn "trouble deleting temp files "
181             . $self->{"tmpsortfile"} . $self->{"tmpoutfile"};
182 0           $self->{"added"} = 0;
183             }
184              
185             =head2 $gen->abort
186              
187             If you decide not to create the CDB file you were creating, you have
188             to call this method. Otherwise, it will be created as your program
189             exits (or possibly earlier)
190              
191             =cut
192              
193             sub abort ($) {
194 0     0 1   shift->{"abort"} = 1
195             }
196              
197             =head2 gen_cdb_input($key,$value)
198              
199             This is a little utility function which formats a cdbmake input line.
200              
201             =cut
202              
203             sub gen_cdb_input ($$) {
204 0     0 1   my $key=shift;
205 0           my $value=shift;
206              
207 0           return "+" . length($key) . "," . length($value) . ":"
208             . $key . "->" . $value . "\n";
209             }
210              
211             =head1 BUGS
212              
213             We use the external programs C and C. These almost
214             certainly improve our performance on large databases (and those are
215             all we care about), but they make portability difficult.. Possibly
216             system independent alternatives should be written and used where
217             needed.
218              
219             We should write out to the sort file with some encoding that gets rid
220             of new lines and then read back, de-coding that to feed it to cdbmake..
221              
222             =cut
223