File Coverage

blib/lib/DBIx/FileStore.pm
Criterion Covered Total %
statement 18 175 10.2
branch 0 72 0.0
condition 0 27 0.0
subroutine 6 21 28.5
pod 10 10 100.0
total 34 305 11.1


line stmt bran cond sub pod time code
1             package DBIx::FileStore;
2 3     3   46540 use strict;
  3         6  
  3         123  
3              
4             # this reads and writes files from the db.
5 3     3   4841 use DBI;
  3         49319  
  3         248  
6 3     3   34 use Digest::MD5 qw( md5_base64 );
  3         9  
  3         195  
7 3     3   2398 use File::Copy;
  3         10465  
  3         229  
8              
9 3     3   1214 use DBIx::FileStore::ConfigFile;
  3         8  
  3         129  
10              
11 3         14 use fields qw( dbh dbuser dbpasswd
12             dbhost dbname filetable blockstable blocksize
13             verbose
14             confhash
15             uselocks
16 3     3   17 );
  3         4  
17              
18             our $VERSION = '0.28'; # version also mentioned in POD below.
19              
20             sub new {
21 0     0 1   my ($self, %opts) = @_;
22 0 0         unless (ref $self) {
23 0           $self = fields::new($self);
24             }
25 0 0         if ($opts{verbose}) { $self->{verbose}=1; }
  0            
26              
27 0           my $config_reader = new DBIx::FileStore::ConfigFile();
28 0           my $conf = $self->{confhash} = $config_reader->read_config_file();
29              
30             # FOR TESTING WITH 1 BYTE BLOCKS
31             #my $block_size = 1; # 1 byte blocks (!)
32 0           my $block_size = 500 * 1024; # 512K blocks
33 0           $self->{blocksize} = $block_size;
34              
35             # with 900K (or even 600K) blocks, when inserting binary .rpm files,
36             # we get
37             #
38             # DBD::mysql::db do failed: Got a packet bigger than 'max_allowed_packet' bytes.
39             #
40             # We think there's some encoding of the binary data going
41             # that inflates binary data during transmission.
42             #
43            
44              
45             ###############################################################################
46             # By default, WE DON'T USE LOCKS ANY MORE. Like a real filesystem, you might
47             # get interspersed or truncated information if the filesystem is
48             # being changed while you're reading!
49             ###############################################################################
50 0           $self->{uselocks} = 0;
51              
52 0   0       $self->{dbuser} = $conf->{dbuser} || die "$0: no dbuser set\n";
53 0   0       $self->{dbpasswd} = $conf->{dbpasswd} || warn "$0: no dbpasswd set\n"; # this could be ok.
54 0   0       $self->{dbname} = $conf->{dbname} || die "$0: no dbname set\n";
55              
56             # dbhost defaults to 127.0.0.1
57 0   0       $self->{dbhost} = $conf->{dbhost} || "127.0.0.1";
58              
59 0           $self->{filetable} = "files";
60 0           $self->{blockstable} = "fileblocks";
61              
62 0           my $dsn = "DBI:mysql:database=$self->{dbname};host=$self->{dbhost}";
63              
64 0           my %attr = ( RaiseError => 1, PrintError => 1, AutoCommit => 1 ); # for mysql
65              
66 0           $self->{dbh} = DBI->connect_cached(
67             $dsn, $self->{dbuser}, $self->{dbpasswd}, \%attr);
68 0           $self->{dbh}->{mysql_auto_reconnect} = 1; # auto reconnect
69              
70 0           return $self;
71             }
72              
73             sub get_all_filenames {
74 0     0 1   my ($self) = @_;
75 0           my $files = $self->{dbh}->selectall_arrayref( # lasttime + 0 gives us an int back
76             "select name, c_len, c_md5, lasttime+0 from $self->{filetable}
77             where b_num=0 order by name");
78 0           return $files;
79             }
80              
81             sub get_filenames_matching_prefix {
82 0     0 1   my ($self, $name) = @_;
83 0           my $pattern = $name . "%";
84 0           my $files = $self->{dbh}->selectall_arrayref( # lasttime + 0 gives us an int back
85             "select name, c_len, c_md5, lasttime+0 from $self->{filetable}
86             where name like ? and b_num=0 order by name", {}, $pattern);
87 0           return $files;
88             }
89              
90             sub rename_file {
91 0     0 1   my ($self, $from, $to) = @_;
92 0 0         die "$0: name not ok: $from" unless name_ok($from);
93 0 0         die "$0: name not ok: $to" unless name_ok($to);
94             # renames the rows in the filetable and the blockstable
95 0           my $dbh = $self->{dbh};
96 0           $self->_lock_tables();
97              
98 0           for my $table ( ( $self->{filetable}, $self->{blockstable} ) ) {
99 0           my $sql = "select name from $table where name like ?";
100 0 0         $sql .= " order by b_num" if $table eq $self->{filetable};
101              
102 0           my $files = $dbh->selectall_arrayref( $sql, {}, $from . " %");
103 0           for my $f (@$files) {
104 0           (my $num = $f->[0]) =~ s/.* //;
105 0 0         print "$0: Moving $table:$f->[0], (num $num) to '$to $num'...\n" if $self->{verbose};
106 0           $dbh->do("update $table set name=? where name=?", {}, "$to $num", $f->[0]);
107             }
108             }
109              
110 0           $self->_unlock_tables();
111 0           return 1;
112             }
113              
114             sub delete_file {
115 0     0 1   my ($self, $name) = @_;
116 0 0         die "$0: name not ok: $name" unless name_ok($name);
117              
118 0           my $dbh = $self->{dbh};
119 0           my $filetable = $self->{filetable}; # probably "files"
120 0           my $blockstable = $self->{blockstable}; # probably "fileblocks"
121 0           for my $table ( ( $filetable, $blockstable ) ) {
122 0           my $rv = int($dbh->do( "delete from $table where name like ?", {}, "$name %" ));
123 0 0         if($rv) {
124 0 0         print "$0: $table: deleted $name ($rv blocks)\n" if $self->{verbose};
125             } else {
126 0 0         warn "$0: no blocks to delete for $table:$name\n" if $self->{verbose};
127             }
128             }
129 0           return 1;
130             }
131              
132              
133             sub copy_blocks_from_db_to_filehandle {
134 0     0 1   my ($self, $fdbname, $filehandle) = @_;
135 0 0         die "$0: name not ok: $fdbname" unless name_ok($fdbname);
136             my $print_to_filehandle_callback = sub {
137             # this is a closure, so $fh comes from the surrounding context
138 0     0     my $content = shift;
139 0           print $filehandle $content;
140 0           };
141             # read all our blocks, calling our callback for each one
142 0           my $ret = $self->_read_blocks_from_db( $print_to_filehandle_callback, $fdbname );
143 0           return $ret;
144             }
145              
146              
147             # reads the content into $pathname, returns the length of the data read.
148             sub read_from_db {
149 0     0 1   my ($self, $pathname, $fdbname) = @_;
150              
151 0 0         die "$0: name not ok: $fdbname" unless name_ok($fdbname);
152              
153 0 0         open( my $fh, ">", $pathname) || die "$0: can't open for output: $pathname\n";
154              
155             # this is a function used as a callback and called with each chunk of the data
156             # into a temporary file. $fh stay in context for the function (closure) below.
157             my $print_to_file_callback = sub {
158             # this is a closure, so $fh comes from the surrounding context
159 0     0     my $content = shift;
160 0           print $fh $content;
161 0           };
162              
163             # read all our blocks, calling our callback for each one
164 0           my $ret = $self->_read_blocks_from_db( $print_to_file_callback, $fdbname );
165              
166             # if we fetched *something* into our scoped $fh to the temp file,
167             # then copy it to the destination they asked for, and delete
168             # the temp file.
169 0 0         if (!$fh) {
170 0           warn "$0: not found in FileDB: $fdbname\n";
171             }
172             # clear our fh member
173 0 0         close($fh) if defined($fh); # this 'close' should cause the associated file to be deleted
174 0           $fh = undef;
175              
176             # return number of bytes read.
177 0           return $ret;
178             }
179              
180              
181             # my $bytes_written = $self->write_to_db( $localpathname, $filestorename );
182             sub write_to_db {
183 0     0 1   my ($self, $pathname, $fdbname) = @_;
184              
185 0 0         die "$0: name not ok: $fdbname" unless name_ok($fdbname);
186              
187 0 0         open(my $fh, "<" , $pathname)
188             || die "$0: Couldn't open: $pathname\n";
189            
190              
191 0           my $total_length = -s $pathname; # get the length
192 0 0         if ($total_length == 0) { warn "$0: warning: writing 0 bytes for $pathname\n"; }
  0            
193              
194 0           my $bytecount = $self->write_from_filehandle_to_db( $fh, $fdbname );
195              
196 0 0         die "$0: file length didn't match written data length for $pathname: $bytecount != $total_length\n"
197             if ($bytecount != $total_length);
198              
199 0 0         close ($fh) || die "$0: couldn't close: $pathname\n";
200 0           return $total_length;
201             }
202              
203             sub write_from_filehandle_to_db {
204 0     0 1   my ($self, $fh, $fdbname) = @_;
205              
206 0           my $ctx = Digest::MD5->new;
207 0           my $dbh = $self->{dbh};
208 0           my $filetable = $self->{filetable};
209 0           my $blockstable = $self->{blockstable};
210 0           my $verbose = $self->{verbose};
211 0           my $size = 0;
212              
213 0           $self->_lock_tables();
214 0           for( my ($bytes,$part,$block) = (0,0,""); # init
215             $bytes = read($fh, $block, $self->{blocksize}); # test
216             $part++, $block="" ) { # increment
217 0           $ctx->add( $block );
218 0           $size += length( $block );
219 0           my $b_md5 = md5_base64( $block );
220              
221 0 0 0       printf("saving from filehandle into '%s $part'\n", $fdbname )
222             if($verbose && $part%25==0);
223              
224 0           my $name = sprintf("%s %05d", $fdbname, $part);
225 0           $dbh->do("replace into $filetable set name=?, c_md5=?, b_md5=?, c_len=?, b_num=?", {},
226             $name, "?", $b_md5, 0, $part);
227 0           $dbh->do("replace into $blockstable set name=?, block=?", {},
228             $name, $block);
229             }
230 0 0         print "\n" if $verbose;
231 0           $dbh->do( "update $filetable set c_md5=?, c_len=? where name like ?", {}, $ctx->b64digest, $size, "$fdbname %" );
232 0           $self->_unlock_tables();
233              
234 0           return $size;
235             }
236              
237              
238             # From here below is utility code and implementation details
239             # returns the length of the data read,
240             # calls &$callback( $block ) for each block read.
241             sub _read_blocks_from_db {
242 0     0     my ($self, $callback, $fdbname) = @_;
243             # callback is called on each block, like &$callback( $block )
244 0 0         die "$0: name not ok: $fdbname" unless name_ok($fdbname);
245 0           my $dbh = $self->{dbh};
246 0           my $verbose = $self->{verbose};
247 0           my $ctx = Digest::MD5->new();
248            
249 0 0         warn "$0: Fetching rows $fdbname" if $verbose;
250 0           $self->_lock_tables();
251 0           my $cmd = "select name, b_md5, c_md5, b_num, c_len from $self->{filetable} where name like ? order by b_num";
252 0           my @params = ( $fdbname . ' %' );
253 0           my $sth = $dbh->prepare($cmd);
254 0           my $rv = $sth->execute( @params );
255 0           my $rownum = 0;
256 0           my $c_len = 0;
257 0           my $orig_c_md5;
258 0           while( my $row = $sth->fetchrow_arrayref() ) {
259 0   0       $c_len ||= $row->[5];
260 0 0 0       unless ($row && defined($row->[0])) {
261 0           warn "$0: Error: bad row returned?";
262 0           next;
263             }
264 0 0         print "$row->[0]\n" if $verbose;
265 0 0         unless ($row->[0] =~ /\s+(\d+)$/) {
266 0           warn "$0: Skipping block that doesn't match ' [0-9]+\$': $row->[0]";
267 0           next;
268             }
269 0           my $name_num = $1;
270 0   0       $orig_c_md5 ||= $row->[2];
271              
272             # check the MD5...
273 0 0         if ($row->[2] ne $orig_c_md5) { die "$0: Error: Is DB being updated? Bad content md5sum for $row->[0]\n"; }
  0            
274 0 0         die "$0: Error: our count is row num $rownum, but file says $name_num"
275             unless $rownum == $name_num;
276            
277 0           my ($block) = $dbh->selectrow_array("select block from fileblocks where name=?", {}, $row->[0]);
278 0 0         die "$0: Bad MD5 checksum for $row->[0] ($row->[1] != " . md5_base64( $block )
279             unless ($row->[1] eq md5_base64( $block ));
280 0           $ctx->add($block);
281              
282 0           &$callback( $block ); # call the callback, and pass it the block!
283              
284 0           $rownum++;
285             }
286 0           $self->_unlock_tables();
287              
288 0           my $retrieved_md5 = $ctx->b64digest();
289 0 0         die "$0: Bad MD5 checksum for $fdbname ($retrieved_md5 != $orig_c_md5)"
290             unless ($retrieved_md5 eq $orig_c_md5);
291            
292 0           return $c_len; # for your inspection
293             }
294              
295              
296             sub _lock_tables {
297 0     0     my $self = shift;
298 0 0         if ($self->{uselocks}) {
299 0           $self->{dbh}->do("lock tables $self->{filetable} write, $self->{blockstable} write");
300             }
301             }
302             sub _unlock_tables {
303 0     0     my $self = shift;
304 0 0         if ($self->{uselocks}) {
305 0           $self->{dbh}->do("unlock tables");
306             }
307             }
308              
309              
310             # warns and returns 0 if passed filename ending with like '/tmp/file 1'
311             # else returns 1, ie, that the name is OK
312             # Note that this is a FUNCTION, not a METHOD
313             sub name_ok {
314 0     0 1   my $file = shift;
315 0 0 0       if (!defined($file) || $file eq "" ) {
316 0           warn "$0: Can't use empty filename\n";
317 0           return 0;
318             }
319 0 0 0       if ($file && $file =~ /\s/) {
320 0           warn "$0: Can't use filedbname containing whitespace\n";
321 0           return 0;
322             }
323 0 0         if (length($file) > 75) {
324 0           warn "$0: Can't use filedbname longer than 75 chars\n";
325 0           return 0;
326             }
327 0           return 1;
328             }
329              
330             1;
331              
332             =pod
333              
334             =head1 NAME
335              
336             DBIx::FileStore - Module to store files in a DBI backend
337              
338             =head1 VERSION
339              
340             Version 0.28
341              
342             =head1 SYNOPSIS
343              
344             Ever wanted to store files in a database?
345              
346             This code helps you do that.
347              
348             All the fdb tools in script/ use this library to
349             get at file names and contents in the database.
350              
351             To get started, see the README file (which includes a QUICKSTART
352             guide) from the DBIx-FileStore distribution.
353              
354             This document details the DBIx::FileStore module implementation.
355              
356             =head1 FILENAME NOTES
357              
358             The name of the file in the filestore cannot contain spaces.
359              
360             The maximum length of the name of a file in the filestore
361             is 75 characters.
362              
363             You can store files under any name you wish in the filestore.
364             The name need not correspond to the original name on the filesystem.
365              
366             All filenames in the filestore are in one flat address space.
367             You can use / in filenames, but it does not represent an actual
368             directory. (Although fdbls has some support for viewing files in the
369             filestore as if they were in folders. See the docs on 'fdbls'
370             for details.)
371              
372              
373             =head1 METHODS
374              
375             =head2 new DBIx::FileStore()
376              
377             my $filestore = new DBIx::FileStore();
378              
379             returns a new DBIx::FileStore object
380              
381             =head2 get_all_filenames()
382              
383             my $fileinfo_ref = $filestore->get_all_filenames()
384              
385             Returns a list of references to data about all the files in the
386             filestore.
387              
388             Each row consist of the following columns:
389             name, c_len, c_md5, lasttime_as_int
390              
391             =head2 get_filenames_matching_prefix( $prefix )
392              
393             my $fileinfo_ref = get_filenames_matching_prefix( $prefix );
394              
395             Returns a list of references to data about the files in the
396             filestore whose name matches the prefix $prefix.
397              
398             Returns a list of references in the same format as get_all_filenames().
399              
400             =head2 read_from_db( $filesystem_name, $storage_name);
401              
402             my $bytecount = $filestore->read_from_db( "filesystemname.txt", "filestorename.txt" );
403              
404             Copies the file 'filestorename.txt' from the filestore to the file filesystemname.txt
405             on the local filesystem.
406              
407             =head2 rename_file( $from, $to );
408              
409             my $ok = $self->rename_file( $from, $to );
410              
411             Renames the file in the database from $from to $to.
412             Returns 1;
413              
414             =head2 delete_file( $fdbname );
415              
416             my $ok = $self->delete_file( $fdbname );
417              
418             Removes data named $filename from the filestore.
419              
420             =head2 copy_blocks_from_db_to_filehandle()
421              
422             my $bytecount = $filestore->copy_blocks_from_db_to_filehandle( $fdbname, $fh );
423              
424             copies blocks from the filehandle $fh into the fdb at the name $fdbname
425              
426             =head2 _read_blocks_from_db( $callback_function, $fdbname );
427              
428             my $bytecount = $filestore->_read_blocks_from_db( $callback_function, $fdbname );
429              
430             ** Intended for internal use by this module. **
431              
432             Fetches the blocks from the database for the file stored under $fdbname,
433             and calls the $callback_function on the data from each one after it is read.
434              
435             It also confirms that the base64 md5 checksum for each block and the file contents
436             as a whole are correct. Die()'s with an error if a checksum doesn't match.
437              
438             If uselocks is set, lock the relevant tables while data is extracted.
439              
440             =head2 write_to_db( $localpathname, $filestorename );
441              
442             my $bytecount = $self->write_to_db( $localpathname, $filestorename );
443              
444             Copies the file $localpathname from the filesystem to the name
445             $filestorename in the filestore.
446              
447             Locks the relevant tables while data is extracted. Locking should probably
448             be configurable by the caller.
449              
450             Returns the number of bytes written. Dies with a message if the source
451             file could not be read.
452              
453             Note that it currently reads the file twice: once to compute the md5 checksum
454             before insterting it, and a second time to insert the blocks.
455              
456             =head2 write_from_filehandle_to_db ($fh, $fdbname)
457              
458             Reads blocks of the appropriate block size from $fb and writes them
459             into the fdb under the name $fdbname.
460             Returns the number of bytes written into the filestore.
461              
462             =head1 FUNCTIONS
463              
464             =head2 name_ok( $fdbname )
465              
466             my $filename_ok = DBIx::FileStore::name_ok( $fdbname )
467              
468             Checks that the name $fdbname is acceptable for using as a name
469             in the filestore. Must not contain spaces or be over 75 chars.
470              
471             =head1 IMPLEMENTATION
472              
473             The data is stored in the database using two tables: 'files' and
474             'fileblocks'. All meta-data is stored in the 'files' table,
475             and the file contents are stored in the 'fileblocks' table.
476              
477             =head2 fileblocks table
478              
479             The fileblocks table has only three fields:
480              
481             =head3 name
482              
483             The name of the block, exactly as used in the fileblocks table.
484             Always looks like "filename.txt ",
485             for example "filestorename.txt 00000".
486              
487             =head3 block
488              
489             The contents of the named block. Each block is currently set
490             to be 512K. Care must be taken to use blocks that are
491             not larger than mysql buffers can handle (in particular,
492             max_allowed_packet).
493              
494             =head3 lasttime
495              
496             The timestamp of when this block was inserted into the DB or updated.
497              
498             =head2 files table
499              
500             The files table has several fields. There is one row in the files table
501             for each row in the fileblocks table-- not one per file (see IMPLEMENTATION
502             CAVEATS, below). The fields in the files table are:
503              
504             =head3 c_len
505              
506             Content length. The content length of the complete file (sum of length of all the file's blocks).
507              
508             =head3 b_num
509              
510             Block number. The number of the block this row represents. The b_num is repeated as a five
511             (or more) digit number at the end of the name field (see above). We denormalize
512             the data like this so we can quickly and easily find blocks by name or block number.
513              
514             =head3 b_md5
515              
516             Block md5. The md5 checksum for the block (b is for 'block') represented by this row.
517             We use base64 encoding (which uses 0-9, a-z, A-Z, and a few other characters)
518             to represent md5's because it's a little shorter than the hex
519             representation. (22 vs. 32 characters)
520              
521             =head3 c_md5
522              
523             Content md5. The base64 md5 checksum for the whole file (c is for 'content') represented by this row.
524              
525             =head3 lasttime
526              
527             The timestamp of when this row was inserted into the DB or updated.
528              
529             =head2 See the file 'table-definitions.sql' for more details about
530             the db schema used.
531              
532             =head1 IMPLEMENTATION CAVEATS
533              
534             DBIx::FileStore is what I would consider production-grade code,
535             but the overall wisdom of storing files in blobs in a mysql database
536             may be questionable (for good reason).
537              
538             That having been said, if you have a good reason to do so, as long
539             as you understand the repercussions of storing files in
540             your mysql database, then this toolkit offers a stable and
541             flexible backend for binary data storage, and it works quite nicely.
542              
543             If we were to redesign the system, in particular we might reconsider
544             having one row in the 'files' table for each block stored in the
545             'fileblocks' table. Perhaps instead, we'd have one entry in
546             the 'files' table per file.
547              
548             In concrete terms, though, the storage overhead of doing it this way
549             (which only affects files larger than the block size, which defaults
550             to 512K) is about 100 bytes per block. Assuming files larger than
551             512K, and with a conservative average block size of 256K, the extra
552             storage overhead of doing it this way is still only about 0.039%
553              
554             =head1 AUTHOR
555              
556             Josh Rabinowitz, C<< >>
557              
558             =head1 SUPPORT
559              
560             You should probably read the documentation for the various filestore command-line
561             tools:
562              
563             L, L, L, L, L, L, L, L, and L.
564              
565             =over 4
566              
567             =item * Search CPAN
568              
569             You can also read the documentation at:
570              
571             L
572              
573             =back
574              
575             =head1 LICENSE AND COPYRIGHT
576              
577             Copyright 2010-2015 Josh Rabinowitz.
578              
579             This program is free software; you can redistribute it and/or modify it
580             under the terms of either: the GNU General Public License as published
581             by the Free Software Foundation; or the Artistic License.
582              
583             See http://dev.perl.org/licenses/ for more information.
584              
585             =cut
586              
587             1; # End of DBIx::FileStore
588