File Coverage

Bio/DB/Flat.pm
Criterion Covered Total %
statement 85 241 35.2
branch 33 140 23.5
condition 0 12 0.0
subroutine 18 42 42.8
pod 6 26 23.0
total 142 461 30.8


line stmt bran cond sub pod time code
1             #
2             #
3             # BioPerl module for Bio::DB::Flat
4             #
5             # Please direct questions and support issues to
6             #
7             # Cared for by Lincoln Stein
8             #
9             # You may distribute this module under the same terms as perl itself
10              
11             # POD documentation - main docs before the code
12              
13             =head1 NAME
14              
15             Bio::DB::Flat - Interface for indexed flat files
16              
17             =head1 SYNOPSIS
18              
19             $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
20             -dbname => 'mydb',
21             -format => 'embl',
22             -index => 'bdb',
23             -write_flag => 1);
24             $db->build_index('/usr/share/embl/primate.embl',
25             '/usr/share/embl/protists.embl');
26             $seq = $db->get_Seq_by_id('HSFOS');
27             @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
28             $raw = $db->fetch_raw('HSFOS');
29              
30             =head1 DESCRIPTION
31              
32             This object provides the basic mechanism to associate positions in
33             files with primary and secondary name spaces. Unlike
34             Bio::Index::Abstract (see L), this is specialized
35             to work with the "flat index" and BerkeleyDB indexed flat file formats
36             worked out at the 2002 BioHackathon.
37              
38             This object is a general front end to the underlying databases.
39              
40             =head1 FEEDBACK
41              
42             =head2 Mailing Lists
43              
44             User feedback is an integral part of the evolution of this and other
45             Bioperl modules. Send your comments and suggestions preferably to one
46             of the Bioperl mailing lists. Your participation is much appreciated.
47              
48             bioperl-l@bioperl.org - General discussion
49             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50              
51             =head2 Support
52              
53             Please direct usage questions or support issues to the mailing list:
54              
55             I
56              
57             rather than to the module maintainer directly. Many experienced and
58             reponsive experts will be able look at the problem and quickly
59             address it. Please include a thorough description of the problem
60             with code and data examples if at all possible.
61              
62             =head2 Reporting Bugs
63              
64             Report bugs to the Bioperl bug tracking system to help us keep track
65             the bugs and their resolution. Bug reports can be submitted via the
66             web:
67              
68             https://github.com/bioperl/bioperl-live/issues
69              
70             =head1 AUTHOR - Lincoln Stein
71              
72             Email - lstein@cshl.org
73              
74             =head1 APPENDIX
75              
76             The rest of the documentation details each of the object methods. Internal
77             methods are usually preceded with an "_" (underscore).
78              
79             =cut
80              
81              
82             # Let the code begin...
83             package Bio::DB::Flat;
84              
85 1     1   1067 use File::Spec;
  1         2  
  1         26  
86              
87 1     1   3 use base qw(Bio::Root::Root Bio::DB::RandomAccessI);
  1         0  
  1         76  
88              
89 1     1   4 use constant CONFIG_FILE_NAME => 'config.dat';
  1         1  
  1         2067  
90              
91             =head2 new
92              
93             Title : new
94             Usage : my $db = Bio::DB::Flat->new(
95             -directory => $root_directory,
96             -dbname => 'mydb',
97             -write_flag => 1,
98             -index => 'bdb',
99             -verbose => 0,
100             -out => 'outputfile',
101             -format => 'genbank');
102             Function: create a new Bio::DB::Flat object
103             Returns : new Bio::DB::Flat object
104             Args : -directory Root directory containing "config.dat"
105             -write_flag If true, allows creation/updating.
106             -verbose Verbose messages
107             -out File to write to when write_seq invoked
108             -index 'bdb' or 'binarysearch'
109             Status : Public
110              
111             The required -directory argument indicates where the flat file indexes
112             will be stored. The build_index() and write_seq() methods will
113             automatically create subdirectories of this root directory. Each
114             subdirectory will contain a human-readable configuration file named
115             "config.dat" that specifies where the individual indexes are stored.
116              
117             The required -dbname argument gives a name to the database index. The
118             index files will actually be stored in a like-named subdirectory
119             underneath the root directory.
120              
121             The -write_flag enables writing new entries into the database as well
122             as the creation of the indexes. By default the indexes will be opened
123             read only.
124              
125             -index is one of "bdb" or "binarysearch" and indicates the type of
126             index to generate. "bdb" corresponds to Berkeley DB. You *must* be
127             using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
128             extension installed (DB_File will *not* work). "binarysearch"
129             corresponds to the OBDA "flat" indexed file.
130              
131             The -out argument specifies the output file for writing objects created
132             with write_seq().
133              
134             The -format argument specifies the format of the input file or files. If
135             the file suffix is one that Bioperl can already associate with a format
136             then this is optional.
137              
138             =cut
139              
140             sub new {
141 1     1 1 264 my $class = shift;
142 1 50       4 $class = ref($class) if ref($class);
143 1         9 my $self = $class->SUPER::new(@_);
144              
145             # first we initialize ourselves
146 1         6 my ($flat_directory,$dbname,$format) =
147             $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
148              
149 1 50       4 defined $flat_directory
150             or $self->throw('Please supply a -directory argument');
151 1 50       2 defined $dbname
152             or $self->throw('Please supply a -dbname argument');
153              
154             # set values from configuration file
155 1         4 $self->directory($flat_directory);
156 1         2 $self->dbname($dbname);
157              
158 1 50       10 $self->throw("Base directory $flat_directory doesn't exist")
159             unless -e $flat_directory;
160 1 50       3 $self->throw("$flat_directory isn't a directory")
161             unless -d _;
162 1         7 my $dbpath = File::Spec->catfile($flat_directory,$dbname);
163 1 50       13 unless (-d $dbpath) {
164 1         7 $self->debug("creating db directory $dbpath\n");
165 1 50       41 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
166             }
167 1         4 $self->_read_config();
168              
169             # but override with initialization values
170 1         4 $self->_initialize(@_);
171              
172 1 50       2 $self->throw('you must specify an indexing scheme')
173             unless $self->indexing_scheme;
174              
175             # now we figure out what subclass to instantiate
176 1 50       2 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
    50          
177             :$self->indexing_scheme eq 'flat/1' ? 'Binary'
178             :$self->throw("unknown indexing scheme: " .
179             $self->indexing_scheme);
180 1         3 $format = $self->file_format;
181              
182             # because Michele and Lincoln did it differently
183             # Michele's way is via a standalone concrete class
184 1 50       2 if ($index_type eq 'Binary') {
185 1         2 my $child_class = 'Bio::DB::Flat::BinarySearch';
186 1     1   433 eval "use $child_class";
  1         2  
  1         18  
  1         43  
187 1 50       4 $self->throw($@) if $@;
188 1         5 push @_, ('-format', $format);
189 1         4 return $child_class->new(@_);
190             }
191              
192             # Lincoln uses Bio::SeqIO style delegation.
193 0         0 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
194 0         0 eval "use $child_class";
195 0 0       0 $self->throw($@) if $@;
196              
197             # rebless & reinitialize with the new class
198             # (this prevents subclasses from forgetting to call our own initialization)
199 0         0 bless $self,$child_class;
200 0         0 $self->_initialize(@_);
201 0         0 $self->_set_namespaces(@_);
202              
203 0         0 $self;
204             }
205              
206             sub _initialize {
207 1     1   1 my $self = shift;
208              
209 1         5 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
210             = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
211              
212 1 50       5 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
213              
214 1 50       3 if (defined $flat_indexing) {
215             # very permissive
216 1 50       3 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
217 1 50       6 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat|binary)/;
218 1         3 $self->indexing_scheme($flat_indexing);
219             }
220              
221 1 50       3 $self->verbose($flat_verbose) if defined $flat_verbose;
222 1 50       3 $self->dbname($dbname) if defined $dbname;
223 1 50       2 $self->out_file($flat_outfile) if defined $flat_outfile;
224 1 50       4 $self->file_format($flat_format) if defined $flat_format;
225             }
226              
227             sub _set_namespaces {
228 0     0   0 my $self = shift;
229              
230             $self->primary_namespace($self->default_primary_namespace)
231 0 0       0 unless defined $self->{flat_primary_namespace};
232              
233             $self->secondary_namespaces($self->default_secondary_namespaces)
234 0 0       0 unless defined $self->{flat_secondary_namespaces};
235              
236             $self->file_format($self->default_file_format)
237 0 0       0 unless defined $self->{flat_format};
238             }
239              
240             =head2 new_from_registry
241              
242             Title : new_from_registry
243             Usage : $db = Bio::DB::Flat->new_from_registry(%config)
244             Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
245             compatible fashion
246             Returns : new Bio::DB::Flat
247             Args : provided by the registry, see below
248             Status : Public
249              
250             The following registry-configuration tags are recognized:
251              
252             location Root of the indexed flat file; corresponds to the new() method's
253             -directory argument.
254              
255             =cut
256              
257             sub new_from_registry {
258 0     0 1 0 my ($self,%config) = @_;
259 0 0       0 my $location = $config{'location'} or
260             $self->throw('location tag must be specified.');
261 0 0       0 my $dbname = $config{'dbname'} or
262             $self->throw('dbname tag must be specified.');
263              
264 0         0 my $db = $self->new(-directory => $location,
265             -dbname => $dbname,
266             );
267 0         0 $db;
268             }
269              
270             # accessors
271             sub directory {
272 2     2 0 2 my $self = shift;
273 2         4 my $d = $self->{flat_directory};
274 2 100       4 $self->{flat_directory} = shift if @_;
275 2         4 $d;
276             }
277             sub write_flag {
278 1     1 0 1 my $self = shift;
279 1         2 my $d = $self->{flat_write_flag};
280 1 50       2 $self->{flat_write_flag} = shift if @_;
281 1         1 $d;
282             }
283             sub verbose {
284 2     2 1 3 my $self = shift;
285 2         8 my $d = $self->{flat_verbose};
286 2 100       6 $self->{flat_verbose} = shift if @_;
287 2         7 $d;
288             }
289             sub out_file {
290 0     0 0 0 my $self = shift;
291 0         0 my $d = $self->{flat_outfile};
292 0 0       0 $self->{flat_outfile} = shift if @_;
293 0         0 $d;
294             }
295             sub dbname {
296 3     3 0 3 my $self = shift;
297 3         2 my $d = $self->{flat_dbname};
298 3 100       6 $self->{flat_dbname} = shift if @_;
299 3         9 $d;
300             }
301             sub primary_namespace {
302 0     0 0 0 my $self = shift;
303 0         0 my $d = $self->{flat_primary_namespace};
304 0 0       0 $self->{flat_primary_namespace} = shift if @_;
305 0         0 $d;
306             }
307              
308             # get/set secondary namespace(s)
309             # pass an array ref.
310             # get an array ref in scalar context, list in list context.
311             sub secondary_namespaces {
312 0     0 0 0 my $self = shift;
313 0         0 my $d = $self->{flat_secondary_namespaces};
314 0 0       0 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
    0          
315 0 0       0 return unless $d;
316 0 0 0     0 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
317 0 0       0 return wantarray ? @$d : $d;
318             }
319              
320             # return the file format
321             sub file_format {
322 2     2 0 1 my $self = shift;
323 2         3 my $d = $self->{flat_format};
324 2 100       4 $self->{flat_format} = shift if @_;
325 2         3 $d;
326             }
327              
328             # return the alphabet
329             sub alphabet {
330 0     0 0 0 my $self = shift;
331 0         0 my $d = $self->{flat_alphabet};
332 0 0       0 $self->{flat_alphabet} = shift if @_;
333 0         0 $d;
334             }
335              
336             sub parse_one_record {
337 0     0 0 0 my $self = shift;
338 0         0 my $fh = shift;
339             my $parser =
340 0   0     0 $self->{cached_parsers}{fileno($fh)}
341             ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format);
342 0 0       0 my $seq = $parser->next_seq or return;
343 0   0     0 $self->{flat_alphabet} ||= $seq->alphabet;
344 0         0 my $ids = $self->seq_to_ids($seq);
345 0         0 return $ids;
346             }
347              
348              
349             # return the indexing scheme
350             sub indexing_scheme {
351 4     4 0 3 my $self = shift;
352 4         4 my $d = $self->{flat_indexing};
353 4 100       7 $self->{flat_indexing} = shift if @_;
354 4         10 $d;
355             }
356              
357             sub add_flat_file {
358 0     0 0 0 my $self = shift;
359 0         0 my ($file_path,$file_length,$nf) = @_;
360              
361             # check that file_path is absolute
362 0 0       0 unless (File::Spec->file_name_is_absolute($file_path)) {
363 0         0 $file_path = File::Spec->rel2abs($file_path);
364             }
365              
366 0 0       0 -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
367              
368 0         0 my $current_size = -s _;
369 0 0       0 if (defined $file_length) {
370 0 0       0 $current_size == $file_length
371             or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
372             } else {
373 0         0 $file_length = $current_size;
374             }
375              
376 0 0       0 unless (defined $nf) {
377 0 0       0 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
378 0         0 $nf = $self->{flat_file_index}++;
379             }
380 0         0 $self->{flat_flat_file_path}{$nf} = $file_path;
381 0         0 $self->{flat_flat_file_no}{$file_path} = $nf;
382 0         0 $nf;
383             }
384              
385             sub write_config {
386 0     0 0 0 my $self = shift;
387 0 0       0 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
388 0         0 my $path = $self->_config_path;
389              
390 0 0       0 open my $F, '>', $path or $self->throw("Could not write file '$path': $!");
391              
392 0         0 my $index_type = $self->indexing_scheme;
393 0         0 print $F "index\t$index_type\n";
394              
395 0         0 my $format = $self->file_format;
396 0         0 my $alphabet = $self->alphabet;
397 0 0       0 my $alpha = $alphabet ? "/$alphabet" : '';
398 0         0 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
399              
400 0 0       0 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
401 0         0 for my $nf (@filenos) {
402 0         0 my $path = $self->{flat_flat_file_path}{$nf};
403 0         0 my $size = -s $path;
404 0         0 print $F join("\t","fileid_$nf",$path,$size),"\n";
405             }
406              
407             # write primary namespace
408 0 0       0 my $primary_ns = $self->primary_namespace
409             or $self->throw('cannot write config file because no primary namespace defined');
410              
411 0         0 print $F join("\t",'primary_namespace',$primary_ns),"\n";
412              
413             # write secondary namespaces
414 0         0 my @secondary = $self->secondary_namespaces;
415 0         0 print $F join("\t",'secondary_namespaces',@secondary),"\n";
416              
417 0 0       0 close $F or $self->throw("close error on $path: $!");
418             }
419              
420             sub files {
421 0     0 0 0 my $self = shift;
422 0 0       0 return unless $self->{flat_flat_file_no};
423 0         0 return keys %{$self->{flat_flat_file_no}};
  0         0  
424             }
425              
426             sub write_seq {
427 0     0 0 0 my $self = shift;
428 0         0 my $seq = shift;
429              
430 0 0       0 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
431              
432 0 0       0 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
433 0 0 0     0 my $seqio = $self->{flat_cached_parsers}{$file}
434             ||= Bio::SeqIO->new(-Format => $self->file_format,
435             -file => ">$file")
436             or $self->throw("couldn't create Bio::SeqIO object");
437              
438 0 0       0 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
439 0         0 my $offset = tell($fh);
440 0         0 $seqio->write_seq($seq);
441 0         0 my $length = tell($fh)-$offset;
442 0         0 my $ids = $self->seq_to_ids($seq);
443 0         0 $self->_store_index($ids,$file,$offset,$length);
444              
445 0         0 $self->{flat_outfile_dirty}++;
446             }
447              
448             sub close {
449 1     1 0 1 my $self = shift;
450 1 50       12 return unless $self->{flat_outfile_dirty};
451 0         0 $self->write_config;
452 0         0 delete $self->{flat_outfile_dirty};
453 0         0 delete $self->{flat_cached_parsers}{$self->out_file};
454             }
455              
456              
457             sub _filenos {
458 0     0   0 my $self = shift;
459 0 0       0 return unless $self->{flat_flat_file_path};
460 0         0 return keys %{$self->{flat_flat_file_path}};
  0         0  
461             }
462              
463             # read the configuration file
464             sub _read_config {
465 1     1   1 my $self = shift;
466 1         3 my $path = $self->_config_path;
467 1 50       14 return unless -e $path;
468              
469 0 0       0 open my $F, '<', $path or $self->throw("Could not read file '$path': $!");
470 0         0 my %config;
471 0         0 while (<$F>) {
472 0         0 chomp;
473 0         0 my ($tag,@values) = split "\t";
474 0         0 $config{$tag} = \@values;
475             }
476 0 0       0 CORE::close $F or $self->throw("close error on $path: $!");
477              
478 0 0       0 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
479             or $self->throw("invalid configuration file $path: no index line");
480              
481 0         0 $self->indexing_scheme($1);
482              
483 0 0       0 if ($config{format}) {
484             # handle LSID format
485 0 0       0 if ($config{format}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) {
486 0         0 $self->file_format($1);
487 0         0 $self->alphabet($2);
488             } else { # compatibility with older versions
489 0         0 $self->file_format($config{format}[0]);
490             }
491             }
492              
493             # set up primary namespace
494 0 0       0 my $primary_namespace = $config{primary_namespace}[0]
495             or $self->throw("invalid configuration file $path: no primary namespace defined");
496 0         0 $self->primary_namespace($primary_namespace);
497              
498             # set up secondary namespaces (may be empty)
499 0         0 $self->secondary_namespaces($config{secondary_namespaces});
500              
501             # get file paths and their normalization information
502 0 0       0 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
  0         0  
  0         0  
503 0         0 for my $nf (@normalized_files) {
504 0         0 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
  0         0  
505 0         0 $self->add_flat_file($file_path,$file_length,$nf);
506             }
507 0         0 1;
508             }
509              
510              
511             sub _config_path {
512 1     1   1 my $self = shift;
513 1         3 $self->_catfile($self->_config_name);
514             }
515              
516             sub _catfile {
517 1     1   1 my $self = shift;
518 1         1 my $component = shift;
519 1         3 File::Spec->catfile($self->directory,$self->dbname,$component);
520             }
521              
522 1     1   4 sub _config_name { CONFIG_FILE_NAME }
523              
524             sub _path2fileno {
525 0     0   0 my $self = shift;
526 0         0 my $path = shift;
527             return $self->add_flat_file($path)
528 0 0       0 unless exists $self->{flat_flat_file_no}{$path};
529 0         0 $self->{flat_flat_file_no}{$path};
530             }
531              
532             sub _fileno2path {
533 0     0   0 my $self = shift;
534 0         0 my $fileno = shift;
535 0         0 $self->{flat_flat_file_path}{$fileno};
536             }
537              
538             sub _files {
539 0     0   0 my $self = shift;
540 0         0 my $paths = $self->{flat_flat_file_no};
541 0         0 return keys %$paths;
542             }
543              
544             =head2 fetch
545              
546             Title : fetch
547             Usage : $index->fetch( $id )
548             Function: Returns a Bio::Seq object from the index
549             Example : $seq = $index->fetch( 'dJ67B12' )
550             Returns : Bio::Seq object
551             Args : ID
552              
553             Deprecated. Use get_Seq_by_id instead.
554              
555             =cut
556              
557 0     0 1 0 sub fetch { shift->get_Seq_by_id(@_) }
558              
559              
560             =head2 To Be Implemented in Subclasses
561              
562             The following methods MUST be implemented by subclasses.
563              
564             =cut
565              
566             # create real live Bio::Seq object
567             sub get_Seq_by_id {
568 0     0 1 0 my $self = shift;
569 0         0 my $id = shift;
570 0         0 $self->throw_not_implemented;
571             }
572              
573              
574             # fetch array of Bio::Seq objects
575             sub get_Seq_by_acc {
576 0     0 1 0 my $self = shift;
577 0 0       0 return $self->get_Seq_by_id(shift) if @_ == 1;
578 0         0 my ($ns,$key) = @_;
579              
580 0         0 $self->throw_not_implemented;
581             }
582              
583             sub fetch_raw {
584 0     0 0 0 my ($self,$id,$namespace) = @_;
585 0         0 $self->throw_not_implemented;
586             }
587              
588             sub default_file_format {
589 0     0 0 0 my $self = shift;
590 0         0 $self->throw_not_implemented;
591             }
592              
593             sub _store_index {
594 0     0   0 my $self = shift;
595 0         0 my ($ids,$file,$offset,$length) = @_;
596 0         0 $self->throw_not_implemented;
597             }
598              
599             =head2 May Be Overridden in Subclasses
600              
601             The following methods MAY be overridden by subclasses.
602              
603             =cut
604              
605             sub default_primary_namespace {
606 0     0 0 0 return "ACC";
607             }
608              
609             sub default_secondary_namespaces {
610 0     0 0 0 return;
611             }
612              
613             sub seq_to_ids {
614 0     0 0 0 my $self = shift;
615 0         0 my $seq = shift;
616 0         0 my %ids;
617 0         0 $ids{$self->primary_namespace} = $seq->accession_number;
618 0         0 \%ids;
619             }
620              
621             sub DESTROY {
622 1     1   1 my $self = shift;
623 1         4 $self->close;
624             }
625              
626              
627             1;