File Coverage

blib/lib/Document/Repository.pm
Criterion Covered Total %
statement 24 304 7.8
branch 1 116 0.8
condition 1 82 1.2
subroutine 7 24 29.1
pod 13 15 86.6
total 46 541 8.5


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             Document::Repository
5              
6             =head1 SYNOPSIS
7              
8             my $repository = new Document::Repository;
9              
10             my $doc_id = $repository->add($filename);
11              
12             my $filename = $repository->get($doc_id, $dir);
13              
14             $repository->put($doc_id, $filename, $filename, $filename)
15             or die "couldn't put $filename";
16              
17             $repository->delete($doc_id)
18             or die "couldn't delete $doc_id";
19              
20             =head1 DESCRIPTION
21              
22             This module implements a repository of documents, providing general
23             access to add/get/delete documents. This module is not intended to be
24             used directly; for that see Document::Manager. This acts as a general
25             purpose backend.
26              
27             A document is a collection of one or more files that are checked out,
28             modified, and checked back in as a unit. Each revision of a document is
29             numbered, and documents can be reverted to older revisions if needed. A
30             document can also have an arbitrary set of metadata associated with it.
31              
32             =head1 FUNCTIONS
33              
34             =cut
35              
36             package Document::Repository;
37              
38 2     2   60936 use strict;
  2         3  
  2         75  
39 2     2   2004 use File::Copy;
  2         11835  
  2         167  
40 2     2   15 use File::Path;
  2         9  
  2         137  
41 2     2   2188 use File::Spec::Functions qw(:ALL);
  2         2044  
  2         530  
42              
43              
44 2     2   12 use vars qw(%FIELDS);
  2         4  
  2         92  
45              
46 2         15 use fields qw(
47             _repository_dir
48             _repository_permissions
49             _next_id
50             _error_msg
51             _debug
52 2     2   3111 );
  2         3554  
53              
54              
55             =head2 new($confighash)
56              
57             Establishes the repository interface object. You must pass it the
58             location of the repository, and optionally can indicate what permissions
59             to use (0600 is the default).
60              
61             If the repository already exists, indicate where Document::Repository
62             should start its numbering (e.g., you may want to store this info
63             in a config file or something between invokations...)
64              
65             =cut
66              
67             sub new {
68 1     1 1 405 my ($this, %args) = @_;
69 1   33     8 my $class = ref($this) || $this;
70 1         4 my $self = bless [\%FIELDS], $class;
71              
72 1         8 while (my ($field, $value) = each %args) {
73 1 50       8 if (exists $FIELDS{"_$field"}) {
74 1         146 $self->{"_$field"} = $value;
75 0 0 0       if ($args{debug} && $args{debug}>3 && defined $value) {
      0        
76 0           warn 'Setting Document::Repository::_'.$field." = $value\n";
77             }
78             }
79             }
80              
81             # Specify defaults
82 0   0       $self->{_repository_dir} ||= '/var/dms';
83 0   0       $self->{_repository_permissions} ||= '0700';
84 0           $self->{_next_id} = 1;
85 0   0       $self->{_debug} ||= 0;
86              
87             # If caller has requested doing initialization, do that as well
88 0 0         if ($args{create_new_repository}) {
89 0           $self->_init($self->{_repository_dir},
90             $self->{_repository_permissions});
91             }
92              
93             # Verify everything is sane...
94 0 0         if (! -d $self->{_repository_dir} ) {
95 0           $self->dbg("Repository directory '" . $self->{_repository_dir} . "' does not exist\n", 1);
96             }
97 0 0         if (! -x $self->{_repository_dir} ) {
98 0           $self->dbg("Repository directory '" . $self->{_repository_dir} . "' is not accessible\n", 1);
99             }
100              
101             # Determine what the next id is based on the maximum document id number
102 0           foreach my $doc_id ($self->documents()) {
103 0 0         last if (! $doc_id);
104 0           $self->dbg("Found document id '$doc_id'\n", 4);
105              
106 0 0         if ($doc_id >= $self->{_next_id}) {
107 0           $self->{_next_id} = $doc_id + 1;
108             }
109             }
110              
111 0 0 0       if ($self->{_debug} > 4 or 1==1) {
112 0           warn "Document::Repository settings:\n";
113 0           warn " debug = $self->{_debug}\n";
114 0           warn " repository_dir = $self->{_repository_dir}\n";
115 0           warn " repository_permissions = $self->{_repository_permissions}\n";
116 0           warn " next_id = $self->{_next_id}\n";
117             }
118              
119 0           return $self;
120             }
121              
122             # Establishes a new directory for a document repository.
123             # Basically just does a mkdir after validating the inputs.
124             sub _init {
125 0     0     my $self = shift;
126 0           my $dir = shift;
127 0           my $perms = shift;
128              
129 0 0         if (! $dir) {
130 0           $self->_set_error("Undefined repository dir '$dir' specified to _init()");
131 0           return undef;
132             }
133              
134 0 0 0       if (-d $dir && ! -x $dir) {
135 0           $self->_set_error("Repository dir '$dir' exists but is not accessible");
136 0           return undef;
137             }
138              
139 0 0 0       if (-f $dir && ! -d $dir) {
140 0           $self->_set_error("New repository '$dir' exists as a file, not as a dir");
141 0           return undef;
142             }
143              
144 0 0         if (! -d $dir) {
145 0           eval { mkpath([$dir], 0, oct($perms)) };
  0            
146 0 0         if ($@) {
147 0           $self->_set_error("Error creating repository '$dir': $@");
148 0           return undef;
149             }
150             }
151             }
152              
153             sub _set_error {
154 0     0     my $self = shift;
155 0           $self->{_error_msg} = shift;
156             }
157              
158             =head2 get_error()
159              
160             Retrieves the most recent error message
161              
162             =cut
163              
164             sub get_error {
165 0     0 1   my $self = shift;
166 0   0       return $self->{_error_msg} || '';
167             }
168              
169             sub dbg {
170 0     0 0   my $self = shift;
171 0   0       my $message = shift || return undef;
172 0   0       my $thresh = shift || 1;
173              
174 0 0         warn $message if ($self->{_debug} >= $thresh);
175             }
176              
177             =head2 repository_path($doc_id)
178              
179             Returns a path to the location of the document within the repository
180             repository.
181              
182             =cut
183              
184             sub repository_path {
185 0     0 1   my $self = shift;
186 0   0       my $doc_id = shift || return undef;
187 0           $self->_set_error('');
188              
189 0           my $repo = $self->{_repository_dir};
190              
191             # Verify the repository exists
192 0 0         if (! $repo) {
    0          
    0          
193 0           $self->_set_error("Document repository dir is not defined");
194 0           return undef;
195             } elsif (! -d $repo) {
196 0           $self->_set_error("Document repository '$repo' does not exist");
197 0           return undef;
198             } elsif (! -x $repo) {
199 0           $self->_set_error("Document repository '$repo' cannot be accessed by this user");
200 0           return undef;
201             }
202              
203             # Millions subdir
204 0 0         if ($doc_id > 999999) {
205 0           $repo = catdir($repo,
206             sprintf("M%03d", int($doc_id/1000000)));
207             }
208              
209             # Thousands subdir
210 0 0         if ($doc_id > 999) {
211 0           $repo = catdir($repo,
212             sprintf("k%03d", int($doc_id/1000)%1000));
213             }
214              
215             # Ones subdir
216 0           $repo = catdir($repo,
217             sprintf("%03d", $doc_id % 1000));
218              
219 0 0 0       if (-d $repo && ! -x $repo) {
220 0           $self->_set_error("Document directory '$repo' exists but is inaccessible\n");
221 0           return undef;
222             }
223              
224 0           return $repo;
225             }
226              
227             =head2 current_revision($doc_id, [$doc_path])
228              
229             Returns the current (latest & highest) revision number for the document,
230             or undef if there is no revisions for the document or if the document
231             does not exist.
232              
233             You must specify the $doc_id to be looked up. Optionally, the $doc_path
234             may be given (saves the lookup time if you have already calculated it).
235              
236             =cut
237              
238             sub current_revision {
239 0     0 1   my $self = shift;
240 0   0       my $doc_id = shift || return undef;
241 0   0       my $doc_path = shift || $self->repository_path($doc_id);
242 0           my $rev_number;
243              
244             # Get the current revision number by looking for highest numbered
245             # file or directory if the document already exists
246 0 0 0       if (! defined $rev_number && -d $doc_path) {
247 0 0         if (! opendir(DIR, $doc_path)) {
248 0           $self->_set_error("Could not open document directory '$doc_path' ".
249             "to find the max revision number: $!");
250 0           return undef;
251             }
252 0           my @files = sort { $a <=> $b } grep { /^\d+$/ } readdir(DIR);
  0            
  0            
253 0           $self->dbg("Revisions for '$doc_id' are: @files\n", 2);
254 0           $rev_number = pop @files;
255 0           closedir(DIR);
256             }
257 0           return $rev_number;
258             }
259              
260              
261             =head2 add(@filenames)
262              
263             Adds a new document of revision 001 to the repository by adding its
264             files. Establishes a new document ID and returns it.
265              
266             If you wish to simply register the document ID without actually
267             uploading files, @filenames can be left undefined.
268              
269             Returns undef on failure. You can retrieve the error message by
270             calling get_error().
271              
272             =cut
273              
274             sub add {
275 0     0 1   my $self = shift;
276 0           my @filenames = @_;
277              
278 0           my $revision = 1;
279 0           $self->_set_error('');
280              
281 0           my $doc_id = $self->{_next_id};
282              
283 0 0         if (! $doc_id) {
284 0           $self->_set_error("next_id not defined");
285 0           return undef;
286             }
287              
288 0           my $repo = $self->repository_path($doc_id);
289              
290 0 0         if (! $repo) {
    0          
291 0           $self->_set_error("Directory in repository could not be created\n");
292 0           return undef;
293             } elsif (-e $repo) {
294             # Problem... This document should not already exist...
295 0           $self->_set_error("Document '$doc_id' already exists in the repository");
296 0           return undef;
297             }
298              
299 0           $self->dbg("Creating path '$repo' as $self->{_repository_permissions}\n", 2);
300 0           eval { mkpath([$repo], 0, oct($self->{_repository_permissions})) };
  0            
301 0 0         if ($@) {
302 0           $self->_set_error("Error creating '$repo' for doc id '$doc_id': $@");
303 0           return undef;
304             }
305              
306 0           $self->{_next_id}++;
307              
308 0 0         if (@filenames) {
309 0 0         $self->put($doc_id, @filenames) || return undef;
310             }
311              
312 0           return $doc_id;
313             }
314              
315              
316             =head2 put($doc_id, @filenames)
317              
318             Adds a new revision to a document in the repository. All files must
319             exist.
320              
321             Returns the revision number created, or undef on failure. You can
322             retrieve the error message by calling get_error().
323              
324             =cut
325              
326             sub put {
327 0     0 1   my $self = shift;
328 0   0       my $doc_id = shift || '';
329 0           my @filenames = @_;
330              
331 0   0       my $doc_path = $self->repository_path($doc_id) || return undef;
332 0   0       my $revision = ($self->current_revision($doc_id, $doc_path) || 0) + 1;
333 0           $self->dbg("Adding revision '$revision' for doc id '$doc_id'\n");
334              
335 0           my $rev_path = catdir($doc_path,
336             sprintf("%03d", $revision));
337 0 0         if (-e $rev_path) {
338             # Problem... This revision should not already exist...
339 0           $self->_set_error("Revision '$revision' for doc id '$doc_id' already exists in the repository");
340 0           return undef;
341             }
342              
343 0           $self->dbg("Creating path '$rev_path' as $self->{_repository_permissions}\n", 2);
344 0           eval { mkpath([$rev_path], 0, oct($self->{_repository_permissions})) };
  0            
345 0 0         if ($@) {
346 0           $self->_set_error("Error making path '$rev_path' to repository: $@");
347 0           return undef;
348             }
349              
350 0           foreach my $filename (@filenames) {
351 0 0         next unless defined $filename;
352 0 0         if (! -e $filename) {
353 0           $self->_set_error("File '$filename' does not exist.");
354 0           return undef;
355             }
356 0           my ($vol,$dirs,$base_filename) = splitpath( $filename );
357              
358             # Install the file into the repository
359 0 0         if (! copy($filename, catfile($rev_path, $base_filename)) ) {
360 0           $self->_set_error("Error copying '$filename' to repository: $!");
361 0           return undef;
362             }
363             }
364              
365 0           return $revision;
366             }
367              
368             =head2 get($doc_id, $revision, $destination, [$copy_function], [$select_function])
369              
370             Retrieves a copy of the document specified by $doc_id of the given
371             $revision (or the latest, if not specified), and places it at
372             $location (or the cwd if not specified).
373              
374             See files() for a description of the optional \&select_function.
375              
376             The document is copied using the routine specified by $copy_function.
377             This permits overloading the behavior in order to perform network
378             copying, tarball dist generation, etc.
379              
380             If defined, $copy_function must be a reference to a function that
381             accepts two parameters: an array of filenames (with full path) to be
382             copied, and the $destination parameter that was passed to get(). The
383             caller is allowed to define $destination however desired - it can be a
384             filename, URI, hash reference, etc. $copy_function should return a
385             list of the filenames actually copied.
386              
387             If $copy_function is not defined, the default behavior is simply to call
388             the File::Copy routine copy($fn, $destination) iteratively on each file
389             in the document, returning the number of files
390              
391             Returns a list of files (or the return value from $copy_function), or
392             undef if get() encountered an error (such as bad parameters). The error
393             message can be retrieved via get_error().
394              
395             =cut
396              
397             sub get {
398 0     0 1   my $self = shift;
399 0   0       my $doc_id = shift || '';
400 0   0       my $revision = shift || '';
401 0   0       my $destination = shift || '';
402 0   0       my $copy_function = shift || '';
403 0           my $select_function = shift;
404              
405 0 0         if (! $destination) {
406 0           $self->_set_error("No destination specified for get()");
407 0           return undef;
408             }
409              
410 0           my @files = $self->files($doc_id, $revision, $select_function, 1);
411              
412 0           $self->dbg("Retrieving document files (@files)\n",2);
413              
414 0 0         if ($copy_function) {
415 0           return &$copy_function(\@files, $destination);
416             } else {
417 0           foreach my $filename (@files) {
418 0 0         next unless defined $filename;
419 0 0         if (! copy($filename, $destination)) {
420 0           $self->_set_error("Could not copy '$filename' for document '$doc_id': $!");
421 0           return undef;
422             }
423             }
424             }
425 0           return @files;
426             }
427              
428             =head2 content( $filename, $doc_id [, $revision] )
429              
430             Retrieves the contents of a file within the given document id.
431              
432             If the specified filename is actually a directory, returns an array of
433             the files in that directory, instead.
434              
435             Returns undef and sets an error (retrievable via get_error() if there is
436             any problem.
437              
438             =cut
439             sub content {
440 0     0 1   my $self = shift;
441 0   0       my $filename = shift || return undef;
442 0   0       my $doc_id = shift || return undef;
443 0           my $revision = shift;
444              
445 0   0       my $doc_path = $self->repository_path($doc_id) || return undef;
446              
447             # Default $revision to current revision if not specified
448 0   0       $revision ||= $self->current_revision($doc_id, $doc_path);
449              
450 0           my $file = catfile($doc_path,
451             sprintf("%03d", $revision),
452             $filename);
453 0 0         if (-d $file) {
454 0           my @files;
455 0 0         opendir(DIR, $file) or return undef;
456 0           while (defined(my $dir_content = readdir(DIR))) {
457 0           push @files, $dir_content;
458             }
459 0           return @files;
460             }
461              
462 0 0         if (! -e $file) {
463 0           return undef;
464             }
465              
466 0 0         if (! open(FILE, "< $file")) {
467 0           $self->_set_error("Could not open file '$file': $?\n");
468 0           return undef;
469             }
470              
471             # Open the file and read in the content from it
472 0           my $content = '';
473 0           while () {
474 0           $content .= $_;
475             }
476 0           close(FILE);
477              
478 0           return $content;
479             }
480              
481             =head2 update( $filename, $doc_id, $content[, $append] )
482              
483             This routine alters a file within the repository without creating a new
484             revision number to be generated. This is not intended for regular use
485             but instead for adding comments, updating metadata, etc.
486              
487             By default, update() replaces the existing file. If $append is defined,
488             however, update() will append $content onto the end of the file (such as
489             for logs). Note that no separation characters are inserted, so make sure
490             to add newlines and record delimiters if you need them.
491              
492             Returns a true value if the file was successfully updated, or undef on
493             any error. Retrieve the error via get_error();
494              
495             =cut
496             sub update {
497 0     0 1   my $self = shift;
498 0   0       my $filename = shift || return undef;
499 0   0       my $doc_id = shift || return undef;
500 0           my $content = shift;
501 0           my $append = shift;
502              
503 0 0         if (! defined $content) {
504 0           $self->_set_error("Undefined content not allowed\n");
505 0           return undef;
506             }
507              
508 0   0       my $doc_path = $self->repository_path($doc_id) || return undef;
509              
510             # Default $revision to current revision if not specified
511 0           my $revision = $self->current_revision($doc_id, $doc_path);
512              
513 0           my $file = catfile($doc_path,
514             sprintf("%03d", $revision),
515             $filename);
516              
517 0 0         my $w = ($append)? ">>" : ">";
518 0 0         if (! open(FILE, "$w $file")) {
519 0           $self->_set_error("Could not open '$file' for writing: $?\n");
520 0           return undef;
521             }
522 0           print FILE $content;
523 0           return close(FILE);
524             }
525              
526             # Recursively iterates through the document repository, running the
527             # given function '$func' against document ids it finds.
528             sub _iterate_doc_ids {
529 0     0     my $self = shift;
530 0           my $dir = shift;
531 0           my $func = shift;
532 0   0       my $prefix = shift || '';
533              
534 0 0         if (! opendir(DIR, $dir)) {
535 0           $self->_set_error("Could not open directory '$dir': $!\n");
536 0           return undef;
537             }
538 0           while (defined(my $subdir = readdir DIR)) {
539 0 0         if ($subdir =~ /^\d+$/) {
    0          
540             # This is a document subdir, so we process
541 0 0         if (! &$func("$prefix$subdir")) {
542 0           $self->_set_error("Error running function while iterating '$subdir'");
543 0           return undef;
544             }
545             } elsif ($subdir =~ /^[Mk](\d+)$/) {
546             # This is a thousands (k) or millions (M) dir, so it contains
547             # additional subdirs for documents within it. We recurse into
548             # this directory and continue processing...
549 0 0         if (! $self->_iterate_doc_ids(catdir($dir,$subdir), $func, $1)) {
550 0           $self->_set_error("Error descending into '$subdir'");
551 0           return undef;
552             }
553             }
554             }
555 0           close(DIR);
556            
557 0           return 1;
558             }
559              
560             =head2 documents()
561              
562             Returns a list of document ids in the system.
563              
564             Note that if you have a lot of documents, this list could be huge, but
565             it's assumed you know what you're doing in this case...
566              
567             =cut
568              
569             sub documents {
570 0     0 1   my $self = shift;
571              
572 0           my $repo = $self->{_repository_dir};
573 0           $self->dbg("Getting list of documents from '$repo'\n", 4);
574              
575 0           our @documents = ();
576              
577             sub get_doc_ids {
578 0     0 0   my $doc_id = shift;
579 0           warn "Got document '$doc_id'\n";
580 0           push @documents, $doc_id;
581             }
582 0 0         if (! $self->_iterate_doc_ids($repo, \&get_doc_ids)) {
583 0           warn "Error iterating doc ids\n";
584             # Error msg will already be set by _iterate_doc in this case
585 0           return undef;
586             }
587              
588 0           return @documents;
589             }
590              
591             =head2 revisions()
592              
593             Lists the revisions for the given document id
594              
595             =cut
596              
597             sub revisions {
598 0     0 1   my $self = shift;
599 0           my $doc_id = shift;
600              
601 0   0       my $repo = $self->repository_path($doc_id) || return undef;
602 0 0         if (! defined $repo) {
603 0           $self->dbg("Repository undefined: $repo->get_error()", 2);
604 0           return undef;
605             }
606 0           $self->dbg("Getting revisions from '$repo'\n", 4);
607              
608             # Retrieve all of the valid revisions of this document
609 0           my @revisions;
610 0 0         if (!opendir(DIR, $repo)) {
611 0           $self->_set_error("Could not open repository '$repo': $!");
612 0           return undef;
613             }
614 0           @revisions = grep { /^\d+$/ } readdir(DIR);
  0            
615 0           $self->dbg("Retrieved revisions: @revisions\n", 4);
616 0           closedir(DIR);
617              
618 0           return @revisions;
619             }
620              
621              
622             =head2 files($doc_id, $revision, [\&selection_function], [$with_path])
623              
624             Lists the files for the given document id and revision (or the latest
625             revision if not specified.)
626              
627             The optional \&selection_function allows customized constraints to be
628             placed on what files() returns. This function must accept a file path
629             and return true if the file should be selected for the list to return.
630              
631             The optional $with_path argument allows control over whether to return
632             files with their path prepended or not.
633              
634             =cut
635              
636             sub files {
637 0     0 1   my $self = shift;
638 0           my $doc_id = shift;
639 0           my $revision = shift;
640 0           my $select_function = shift;
641 0           my $with_path = shift;
642              
643 0   0       my $doc_path = $self->repository_path($doc_id) || return undef;
644              
645             # Default $revision to current revision if not specified
646 0   0       $revision ||= $self->current_revision($doc_id, $doc_path);
647              
648 0           my $rev_path = catdir($doc_path,
649             sprintf("%03d", $revision));
650              
651 0           $self->dbg("Getting files from '$rev_path'\n", 4);
652              
653 0 0         if (! opendir(DIR, $rev_path)) {
654 0           $self->_set_error("Could not open '$rev_path' to get files: $!");
655 0           return undef;
656             }
657              
658 0           my @files = ();
659 0           while (defined(my $filename = readdir DIR)) {
660 0           $self->dbg("Considering file '$filename'\n",3);
661 0           my $file_path = catfile($rev_path, $filename);
662 0 0         if ($filename =~ /^\./ ) {
    0          
663 0           $self->dbg("Skipping '$filename' since it is a hidden file\n",4);
664 0           next;
665             } elsif (! -f $file_path) {
666 0           $self->dbg("Skipping '$filename' since it is not a valid file\n",4);
667 0           next;
668             }
669              
670 0 0         if (defined $select_function) {
671 0           $self->dbg("Applying custom selection function\n", 4);
672 0 0         next unless (&$select_function($file_path));
673             }
674 0           $self->dbg("Selecting file '$filename' to get\n", 3);
675 0 0         if ($with_path) {
676 0           push @files, $file_path;
677             } else {
678 0           push @files, $filename;
679             }
680             }
681 0           closedir(DIR);
682              
683 0           return @files;
684             }
685              
686             =head2 stats()
687              
688             Returns a hash containing statistics about the document repository as a
689             whole, including the following:
690              
691             * num_documents
692             * num_revisions
693             * disk_space
694             * num_files
695             * next_id
696              
697             =cut
698              
699             sub stats {
700 0     0 1   my $self = shift;
701 0           my %stats;
702              
703 0           my $repo = $self->{_repository_dir};
704              
705             # Number of documents
706 0           my @doc_ids = $self->documents();
707 0           $stats{num_documents} = scalar @doc_ids;
708              
709             # Num revisions
710 0           $stats{num_revisions} = 0;
711 0           foreach my $doc_id (@doc_ids) {
712 0   0       $stats{num_revisions} += ($self->revisions($doc_id) || 0);
713             }
714              
715             # Disk space used
716 0           $stats{disk_space} = `du -s $repo`;
717              
718             # Number of files
719 0           $stats{num_files} = `find $repo -type f | wc -l`;
720              
721             # Next document ID number
722 0           $stats{next_id} = $self->{_next_id};
723              
724 0           return \%stats;
725             }
726              
727              
728             1;