File Coverage

Bio/DB/IndexedBase.pm
Criterion Covered Total %
statement 275 308 89.2
branch 89 134 66.4
condition 35 57 61.4
subroutine 61 71 85.9
pod 20 20 100.0
total 480 590 81.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::IndexedBase
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7              
8             =head1 NAME
9              
10             Bio::DB::IndexedBase - Base class for modules using indexed sequence files
11              
12             =head1 SYNOPSIS
13              
14             use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
15              
16             # 1/ Bio::SeqIO-style access
17              
18             # Index some sequence files
19             my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file
20             my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
21             my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory
22              
23             # Get IDs of all the sequences in the database
24             my @ids = $db->get_all_primary_ids;
25              
26             # Get a specific sequence
27             my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
28              
29             # Loop through all sequences
30             my $stream = $db->get_PrimarySeq_stream;
31             while (my $seq = $stream->next_seq) {
32             # Do something...
33             }
34              
35              
36             # 2/ Access via filehandle
37             my $fh = Bio::DB::XXX->newFh('/path/to/file');
38             while (my $seq = <$fh>) {
39             # Do something...
40             }
41              
42              
43             # 3/ Tied-hash access
44             tie %sequences, 'Bio::DB::XXX', '/path/to/file';
45             print $sequences{'CHROMOSOME_I:1,20000'};
46              
47             =head1 DESCRIPTION
48              
49             Bio::DB::IndexedBase provides a base class for modules that want to index
50             and read sequence files and provides persistent, random access to each sequence
51             entry, without bringing the entire file into memory. This module is compliant
52             with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use
53             Bio::DB::IndexedBase.
54              
55             When you initialize the module, you point it at a single file, several files, or
56             a directory of files. The first time it is run, the module generates an index
57             of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
58             followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
59             index file to find the sequence file and offset for any requested sequence. If
60             one of the source files is updated, the module reindexes just that one file. You
61             can also force reindexing manually at any time. For improved performance, the
62             module keeps a cache of open filehandles, closing less-recently used ones when
63             the cache is full.
64              
65             Entries may have any line length up to 65,536 characters, and different line
66             lengths are allowed in the same file. However, within a sequence entry, all
67             lines must be the same length except for the last. An error will be thrown if
68             this is not the case!
69              
70             This module was developed for use with the C. elegans and human genomes, and has
71             been tested with sequence segments as large as 20 megabases. Indexing the C.
72             elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
73             minutes on my 300 MHz pentium laptop. On the same system, average access time
74             for any 200-mer within the C. elegans genome was E0.02s.
75              
76             =head1 DATABASE CREATION AND INDEXING
77              
78             The two constructors for this class are new() and newFh(). The former creates a
79             Bio::DB::IndexedBase object which is accessed via method calls. The latter
80             creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
81             objects in a stream fashion. There is also a tied hash interface.
82              
83             =over
84              
85             =item $db = Bio::DB::IndexedBase-Enew($path [,%options])
86              
87             Create a new Bio::DB::IndexedBase object from the files designated by $path
88             $path may be a single file, an arrayref of files, or a directory containing
89             such files.
90              
91             After the database is created, you can use methods like get_all_primary_ids()
92             and get_Seq_by_id() to retrieve sequence objects.
93              
94             =item $fh = Bio::DB::IndexedBase-EnewFh($path [,%options])
95              
96             Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
97             from this filehandle with EE will return a stream of sequence objects,
98             Bio::SeqIO style. The path and the options should be specified as for new().
99              
100             =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
101              
102             Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
103             path to the files. The optional @args list is the same set used by new(). If
104             successful, tie() returns the tied object, undef otherwise.
105              
106             Once tied, you can use the hash to retrieve an individual sequence by
107             its ID, like this:
108              
109             my $seq = $db{CHROMOSOME_I};
110              
111             The keys() and values() functions will return the sequence IDs and their
112             sequences, respectively. In addition, each() can be used to iterate over the
113             entire data set:
114              
115             while (my ($id,$sequence) = each %db) {
116             print "$id => $sequence\n";
117             }
118              
119              
120             When dealing with very large sequences, you can avoid bringing them into memory
121             by calling each() in a scalar context. This returns the key only. You can then
122             use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
123              
124             while (my $id = each %db) {
125             print "$id: $db{$sequence:1,100}\n";
126             print "$id: ".tied(%db)->length($id)."\n";
127             }
128              
129             In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
130             to retrieve the first and next ID in the database, respectively. This allows to
131             write the following iterative loop using just the object-oriented interface:
132              
133             my $db = Bio::DB::IndexedBase->new('/path/to/file');
134             for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
135             # do something with sequence
136             }
137              
138             =back
139              
140             =head1 INDEX CONTENT
141              
142             Several attributes of each sequence are stored in the index file. Given a
143             sequence ID, these attributes can be retrieved using the following methods:
144              
145             =over
146              
147             =item offset($id)
148              
149             Get the offset of the indicated sequence from the beginning of the file in which
150             it is located. The offset points to the beginning of the sequence, not the
151             beginning of the header line.
152              
153             =item strlen($id)
154              
155             Get the number of characters in the sequence string.
156              
157             =item length($id)
158              
159             Get the number of residues of the sequence.
160              
161             =item linelen($id)
162              
163             Get the length of the line for this sequence. If the sequence is wrapped, then
164             linelen() is likely to be much shorter than strlen().
165              
166             =item headerlen($id)
167              
168             Get the length of the header line for the indicated sequence.
169              
170             =item header_offset
171              
172             Get the offset of the header line for the indicated sequence from the beginning
173             of the file in which it is located. This attribute is not stored. It is
174             calculated from offset() and headerlen().
175              
176             =item alphabet($id)
177              
178             Get the molecular type (alphabet) of the indicated sequence. This method handles
179             residues according to the IUPAC convention.
180              
181             =item file($id)
182              
183             Get the the name of the file in which the indicated sequence can be found.
184              
185             =back
186              
187             =head1 INTERFACE COMPLIANCE NOTES
188              
189             Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
190             Bio::RandomAccessI interfaces.
191              
192             Database do not necessarily provide any meaningful internal primary ID for the
193             sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
194             the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
195             and get_Seq_by_primary_id() will return the same sequence.
196              
197             Since this database index has no notion of sequence version or namespace, the
198             get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
199              
200             =head1 BUGS
201              
202             When a sequence is deleted from one of the files, this deletion is not detected
203             by the module and removed from the index. As a result, a "ghost" entry will
204             remain in the index and will return garbage results if accessed.
205              
206             Also, if you are indexing a directory, it is wise to not add or remove files
207             from it.
208              
209             In case you have changed the files in a directory, or the sequences in a file,
210             you can to rebuild the entire index, either by deleting it manually, or by
211             passing -reindex=E1 to new() when initializing the module.
212              
213             =head1 SEE ALSO
214              
215             L
216              
217             L
218              
219             L
220              
221             =head1 AUTHOR
222              
223             Lincoln Stein Elstein@cshl.orgE.
224              
225             Copyright (c) 2001 Cold Spring Harbor Laboratory.
226              
227             Florent Angly (for the modularization)
228              
229             This library is free software; you can redistribute it and/or modify
230             it under the same terms as Perl itself. See DISCLAIMER.txt for
231             disclaimers of warranty.
232              
233             =head1 APPENDIX
234              
235             The rest of the documentation details each of the object
236             methods. Internal methods are usually preceded with a _
237              
238             =cut
239              
240              
241             package Bio::DB::IndexedBase;
242              
243             BEGIN {
244             @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
245 2 50   2   74 if(!$INC{'AnyDBM_File.pm'});
246             }
247              
248 2     2   8 use strict;
  2         2  
  2         28  
249 2     2   6 use IO::File;
  2         2  
  2         266  
250 2     2   825 use AnyDBM_File;
  2         5022  
  2         81  
251 2     2   8 use Fcntl;
  2         2  
  2         448  
252 2     2   8 use File::Spec;
  2         2  
  2         39  
253 2     2   6 use File::Basename qw(basename dirname);
  2         1  
  2         105  
254 2     2   654 use Bio::PrimarySeq;
  2         4  
  2         54  
255              
256 2     2   8 use base qw(Bio::DB::SeqI);
  2         2  
  2         511  
257              
258             # Store offset, strlen, linelen, headerlen, type and fileno
259 2     2   8 use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length
  2         2  
  2         85  
260 2     2   7 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
  2         2  
  2         67  
261              
262 2     2   6 use constant NA => 0;
  2         1  
  2         62  
263 2     2   6 use constant DNA => 1;
  2         2  
  2         59  
264 2     2   6 use constant RNA => 2;
  2         2  
  2         61  
265 2     2   6 use constant PROTEIN => 3;
  2         2  
  2         71  
266              
267 2     2   10 use constant DIE_ON_MISSMATCHED_LINES => 1;
  2         2  
  2         5423  
268             # you can avoid dying if you want but you may get incorrect results
269              
270              
271             # Compiling the below regular expressions speeds up the Pure Perl
272             # seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s
273             # over 32358 calls on Variant Effect Prediction data.
274             my $nl = qr/\n/;
275             my $cr = qr/\r/;
276              
277             # Remove carriage returns (\r) and newlines (\n) from a string. When
278             # called from subseq, this can take a signficiant portion of time, in
279             # Variant Effect Prediction. Therefore we compile the match portion.
280             sub _strip_crnl {
281             my $str = shift;
282             $str =~ s/$nl//g;
283             $str =~ s/$cr//g;
284             return $str;
285             }
286              
287             # C can do perfrom _strip_crnl much faster. But this requires the
288             # Inline::C module which we don't require people to have. So we make
289             # this optional by wrapping the C code in an eval. If the eval works,
290             # the Perl strip_crnl() function is overwritten.
291 2     2   1057 eval q{
  2         26210  
  2         9  
292             use Inline C => <<'END_OF_C_CODE';
293             /* Strip all new line (\n) and carriage return (\r) characters
294             from string str
295             */
296             char* _strip_crnl(char* str) {
297             char *s;
298             char *s2 = str;
299             for (s = str; *s; *s++) {
300             if (*s != '\n' && *s != '\r') {
301             *s2++ = *s;
302             }
303             }
304             *s2 = '\0';
305             return str;
306             }
307             END_OF_C_CODE
308             };
309              
310              
311             =head2 new
312              
313             Title : new
314             Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
315             Function: Initialize a new database object
316             Returns : A Bio::DB::IndexedBase object
317             Args : A single file, or path to dir, or arrayref of files
318             Optional arguments:
319              
320             Option Description Default
321             ----------- ----------- -------
322             -glob Glob expression to search for files in directories *
323             -makeid A code subroutine for transforming IDs None
324             -maxopen Maximum size of filehandle cache 32
325             -debug Turn on status messages 0
326             -reindex Force the index to be rebuilt 0
327             -dbmargs Additional arguments to pass to the DBM routine None
328             -index_name Name of the file that will hold the indices
329             -clean Remove the index file when finished 0
330              
331             The -dbmargs option can be used to control the format of the index. For example,
332             you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
333             retrieved alphabetically. Note that you must use the same arguments every time
334             you open the index!
335              
336             The -makeid option gives you a chance to modify sequence IDs during indexing.
337             For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
338             that GenBank Fasta files use. The original header line can be recovered later.
339             The option value for -makeid should be a code reference that takes a scalar
340             argument (the full header line) and returns a scalar or an array of scalars (the
341             ID or IDs you want to assign). For example:
342              
343             $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
344              
345             sub extract_gi {
346             # Extract GI from GenBank
347             my $header = shift;
348             my ($id) = ($header =~ /gi\|(\d+)/m);
349             return $id || '';
350             }
351              
352             extract_gi() will be called with the full header line, e.g. a Fasta line would
353             include the "E", the ID and the description:
354              
355             >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
356              
357             In the database, this sequence can now be retrieved by its GI instead of its
358             complete ID:
359              
360             my $seq = $db->get_Seq_by_id(352962132);
361              
362             The -makeid option is ignored after the index is constructed.
363              
364             =cut
365              
366             sub new {
367 20     20 1 711 my ($class, $path, %opts) = @_;
368              
369             my $self = bless {
370             debug => $opts{-debug} || 0,
371             makeid => $opts{-makeid},
372             glob => $opts{-glob} || eval '$'.$class.'::file_glob' || '*',
373             maxopen => $opts{-maxopen} || 32,
374             clean => $opts{-clean} || 0,
375             dbmargs => $opts{-dbmargs} || undef,
376             fhcache => {},
377             cacheseq => {},
378             curopen => 0,
379             openseq => 1,
380             dirname => undef,
381             offsets => undef,
382             index_name => $opts{-index_name},
383             obj_class => eval '$'.$class.'::obj_class',
384 20   50     912 offset_meth => \&{$class.'::_calculate_offsets'},
  20   50     199  
      50        
      100        
      50        
385             fileno2path => [],
386             filepath2no => {},
387             }, $class;
388              
389 20         48 my ($offsets, $dirname);
390 20   100     67 my $ref = ref $path || '';
391 20 100       32 if ( $ref eq 'ARRAY' ) {
392 1         7 $offsets = $self->index_files($path, $opts{-reindex});
393 1         7 require Cwd;
394 1         4 $dirname = Cwd::getcwd();
395             } else {
396 19   66     94 $self->{index_name} ||= $self->_default_index_name($path);
397 19 100       150 if (-d $path) {
    50          
398             # because Win32 glob() is broken with respect to long file names
399             # that contain whitespace.
400 9 50 33     44 $path = Win32::GetShortPathName($path)
401             if $^O =~ /^MSWin/i && eval 'use Win32; 1';
402 9         36 $offsets = $self->index_dir($path, $opts{-reindex});
403 8         15 $dirname = $path;
404             } elsif (-f _) {
405 10         36 $offsets = $self->index_file($path, $opts{-reindex});
406 9         216 $dirname = dirname($path);
407             } else {
408 0         0 $self->throw( "No file or directory called '$path'");
409             }
410             }
411 18         21 @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
  18         37  
412              
413 18         95 return $self;
414             }
415              
416              
417             =head2 newFh
418              
419             Title : newFh
420             Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
421             Function: Index and get a new Fh for a single file, several files or a directory
422             Returns : Filehandle object
423             Args : Same as new()
424              
425             =cut
426              
427             sub newFh {
428 1     1 1 3 my ($class, @args) = @_;
429 1         2 my $self = $class->new(@args);
430 1         7 require Symbol;
431 1         3 my $fh = Symbol::gensym;
432 1 50       12 tie $$fh, 'Bio::DB::Indexed::Stream', $self
433             or $self->throw("Could not tie filehandle: $!");
434 1         4 return $fh;
435             }
436              
437              
438             =head2 dbmargs
439              
440             Title : dbmargs
441             Usage : my @args = $db->dbmargs;
442             Function: Get stored dbm arguments
443             Returns : Array
444             Args : None
445              
446             =cut
447              
448             sub dbmargs {
449 38     38 1 33 my $self = shift;
450 38 50       101 my $args = $self->{dbmargs} or return;
451 0 0       0 return ref($args) eq 'ARRAY' ? @$args : $args;
452             }
453              
454              
455             =head2 glob
456              
457             Title : glob
458             Usage : my $glob = $db->glob;
459             Function: Get the expression used to match files in directories
460             Returns : String
461             Args : None
462              
463             =cut
464              
465             sub glob {
466 2     2 1 4 my $self = shift;
467 2         9 return $self->{glob};
468             }
469              
470              
471             =head2 index_dir
472              
473             Title : index_dir
474             Usage : $db->index_dir($dir);
475             Function: Index the files that match -glob in the given directory
476             Returns : Hashref of offsets
477             Args : Dirname
478             Boolean to force a reindexing the directory
479              
480             =cut
481              
482             sub index_dir {
483 9     9 1 16 my ($self, $dir, $force_reindex) = @_;
484 9         1636 my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
485 9 50       33 return if scalar @files == 0;
486 9   33     21 $self->{index_name} ||= $self->_default_index_name($dir);
487 9         30 my $offsets = $self->_index_files(\@files, $force_reindex);
488 8         26 return $offsets;
489             }
490              
491              
492             =head2 get_all_primary_ids
493              
494             Title : get_all_primary_ids, get_all_ids, ids
495             Usage : my @ids = $db->get_all_primary_ids;
496             Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
497             implementation. Note that in this implementation, the internal
498             database primary IDs are also the sequence IDs.
499             Returns : List of ids
500             Args : None
501              
502             =cut
503              
504             sub get_all_primary_ids {
505 6     6 1 7 return keys %{shift->{offsets}};
  6         193  
506             }
507              
508             *ids = *get_all_ids = \&get_all_primary_ids;
509              
510              
511             =head2 index_file
512              
513             Title : index_file
514             Usage : $db->index_file($filename);
515             Function: Index the given file
516             Returns : Hashref of offsets
517             Args : Filename
518             Boolean to force reindexing the file
519              
520             =cut
521              
522             sub index_file {
523 10     10 1 12 my ($self, $file, $force_reindex) = @_;
524 10   33     231 $self->{index_name} ||= $self->_default_index_name($file);
525 10         22 my $offsets = $self->_index_files([$file], $force_reindex);
526 9         19 return $offsets;
527             }
528              
529             sub _default_index_name {
530 18     18   25 my ($self,$path) = @_;
531 18 100       353 return File::Spec->catfile($path,'directory.index') if -d $path;
532 9         27 return "$path.index";
533             }
534              
535             =head2 index_files
536              
537             Title : index_files
538             Usage : $db->index_files(\@files);
539             Function: Index the given files
540             Returns : Hashref of offsets
541             Args : Arrayref of filenames
542             Boolean to force reindexing the files
543              
544             =cut
545              
546             sub index_files {
547 1     1 1 3 my ($self, $files, $force_reindex) = @_;
548 1         23 my @paths = map { File::Spec->rel2abs($_) } @$files;
  2         43  
549 1         6 require Digest::MD5;
550 1         9 my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
551 1   33     8 $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
552 1         2 my $offsets = $self->_index_files($files, $force_reindex);
553 1         2 return $offsets;
554             }
555              
556              
557             =head2 index_name
558              
559             Title : index_name
560             Usage : my $indexname = $db->index_name($path);
561             Function: Get the full name of the index file
562             Returns : String
563             Args : None
564              
565             =cut
566              
567             sub index_name {
568 38     38 1 998 return shift->{index_name};
569             }
570              
571              
572             =head2 path
573              
574             Title : path
575             Usage : my $path = $db->path($path);
576             Function: When a single file or a directory of files is indexed, this returns
577             the file directory. When indexing an arbitrary list of files, the
578             return value is the path of the current working directory.
579             Returns : String
580             Args : None
581              
582             =cut
583              
584             sub path {
585 0     0 1 0 return shift->{dirname};
586             }
587              
588              
589             =head2 get_PrimarySeq_stream
590              
591             Title : get_PrimarySeq_stream
592             Usage : my $stream = $db->get_PrimarySeq_stream();
593             Function: Get a SeqIO-like stream of sequence objects. The stream supports a
594             single method, next_seq(). Each call to next_seq() returns a new
595             PrimarySeqI compliant sequence object, until no more sequences remain.
596             This is a Bio::DB::SeqI method implementation.
597             Returns : A Bio::DB::Indexed::Stream object
598             Args : None
599              
600             =cut
601              
602             sub get_PrimarySeq_stream {
603 3     3 1 5 my $self = shift;
604 3         14 return Bio::DB::Indexed::Stream->new($self);
605             }
606              
607              
608             =head2 get_Seq_by_id
609              
610             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
611             Usage : my $seq = $db->get_Seq_by_id($id);
612             Function: Given an ID, fetch the corresponding sequence from the database.
613             This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
614             Returns : A sequence object
615             Args : ID
616              
617             =cut
618              
619             sub get_Seq_by_id {
620 22     22 1 460 my ($self, $id) = @_;
621 22 50       36 $self->throw('Need to provide a sequence ID') if not defined $id;
622 22 100       162 return if not exists $self->{offsets}{$id};
623 18         84 return $self->{obj_class}->new($self, $id);
624             }
625              
626             *get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
627              
628              
629             =head2 _calculate_offsets
630              
631             Title : _calculate_offsets
632             Usage : $db->_calculate_offsets($filename, $offsets);
633             Function: This method calculates the sequence offsets in a file based on ID and
634             should be implemented by classes that use Bio::DB::IndexedBase.
635             Returns : Hash of offsets
636             Args : File to process
637             Hashref of file offsets keyed by IDs.
638              
639             =cut
640              
641             sub _calculate_offsets {
642 0     0   0 my $self = shift;
643 0         0 $self->throw_not_implemented();
644             }
645              
646              
647             sub _index_files {
648             # Do the indexing of the given files using the index file on record
649 20     20   24 my ($self, $files, $force_reindex) = @_;
650              
651 20         42 $self->_set_pack_method( @$files );
652              
653             # Get name of index file
654 20         36 my $index = $self->index_name;
655              
656             # If caller has requested reindexing, unlink the index file.
657 20 100       39 if ($force_reindex) {
658             # Tied-hash in Strawberry Perl creates "$file.index"
659 12 50       159 unlink $index if -e $index;
660             # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
661 12 100       263 unlink "$index.dir" if -e "$index.dir";
662 12 100       220 unlink "$index.pag" if -e "$index.pag";
663             }
664              
665             # Get the modification time of the index
666 20   50     175 my $indextime = (stat $index)[9] || 0;
667              
668             # Register files and find if there has been any update
669 20         12 my $modtime = 0;
670 20         20 my @updated;
671 20         35 for my $file (@$files) {
672             # Register file
673 63         1160 $self->_path2fileno(basename($file));
674             # Any update?
675 63   50     564 my $m = (stat $file)[9] || 0;
676 63 100       104 if ($m > $modtime) {
677 20         21 $modtime = $m;
678             }
679 63 50       88 if ($m > $indextime) {
680 63         85 push @updated, $file;
681             }
682             }
683              
684             # Get termination length from first file
685 20         49 $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
686              
687             # Reindex contents of changed files if needed
688 20   66     57 my $reindex = $force_reindex || (scalar @updated > 0);
689 20 50       48 $self->{offsets} = $self->_open_index($index, $reindex) or return;
690 20 50       40 if ($reindex) {
691 20         27 $self->{indexing} = $index;
692 20         28 for my $file (@updated) {
693 62         1455 my $fileno = $self->_path2fileno(basename($file));
694 62         87 &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
  62         119  
695             }
696 18         29 delete $self->{indexing};
697             }
698              
699             # Closing and reopening might help corrupted index file problem on Windows
700 18         51 $self->_close_index($self->{offsets});
701              
702 18         29 return $self->{offsets} = $self->_open_index($index);
703             }
704              
705              
706             sub _open_index {
707             # Open index file in read-only or write mode
708 38     38   46 my ($self, $index_file, $write) = @_;
709 38         34 my %offsets;
710 38 100       51 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
711 38         63 my @dbmargs = $self->dbmargs;
712 38 50       1870 tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
713             or $self->throw( "Could not open index file $index_file: $!");
714 38         130 return \%offsets;
715             }
716              
717              
718             sub _close_index {
719             # Close index file
720 45     45   46 my ($self, $index) = @_;
721 45         341 untie %$index;
722 45         64 return 1;
723             }
724              
725             # Compiling the below regular expression speeds up _parse_compound_id
726             my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
727              
728             sub _parse_compound_id {
729             # Handle compound IDs:
730             # $db->seq($id)
731             # $db->seq($id, $start, $stop, $strand)
732             # $db->seq("$id:$start,$stop")
733             # $db->seq("$id:$start..$stop")
734             # $db->seq("$id:$start-$stop")
735             # $db->seq("$id:$start,$stop/$strand")
736             # $db->seq("$id:$start..$stop/$strand")
737             # $db->seq("$id:$start-$stop/$strand")
738             # $db->seq("$id/$strand")
739 45     45   51 my ($self, $id, $start, $stop, $strand) = @_;
740              
741 45 100 33     362 if ( (not defined $start ) &&
      66        
742             (not defined $stop ) &&
743             (not defined $strand) &&
744             ($id =~ m{$compound_id}) ) {
745             # Start, stop and strand not provided and ID looks like a compound ID
746 20         61 ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
747             }
748              
749             # Start, stop and strand defaults
750 45   100     84 $stop ||= $self->length($id) || 0; # 0 if sequence not found in database
      100        
751 45 100 100     84 $start ||= ($stop > 0) ? 1 : 0;
752 45   100     114 $strand ||= 1;
753              
754             # Convert numbers such as 1_000_000 to 1000000
755 45         62 $start =~ s/_//g;
756 45         37 $stop =~ s/_//g;
757              
758 45 100       75 if ($start > $stop) {
759             # Change the strand
760 8         11 ($start, $stop) = ($stop, $start);
761 8         12 $strand *= -1;
762             }
763              
764 45         129 return $id, $start, $stop, $strand;
765             }
766              
767              
768             sub _guess_alphabet {
769             # Determine the molecular type of the given sequence string:
770             # 'dna', 'rna', 'protein' or '' (unknown/empty)
771 2997     2997   2522 my ($self, $string) = @_;
772             # Handle IUPAC residues like PrimarySeq does
773 2997         4275 my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
774 2997 50       7095 return $alphabet eq 'dna' ? DNA
    100          
    100          
775             : $alphabet eq 'rna' ? RNA
776             : $alphabet eq 'protein' ? PROTEIN
777             : NA;
778             }
779              
780              
781             sub _makeid {
782             # Process the header line by applying any transformation given in -makeid
783 3053     3053   2952 my ($self, $header_line) = @_;
784 3053 100       9873 return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
785             }
786              
787              
788             sub _check_linelength {
789             # Check that the line length is valid. Generate an error otherwise.
790 105     105   95 my ($self, $linelength) = @_;
791 105 100       146 return if not defined $linelength;
792 96 50       176 $self->throw(
793             "Each line of the file must be less than 65,536 characters. Line ".
794             "$. is $linelength chars."
795             ) if $linelength > 65535;
796             }
797              
798              
799             sub _calc_termination_length {
800             # Try the beginning of the file to determine termination length
801             # Account for crlf-terminated Windows and Mac files
802 20     20   22 my ($self, $file) = @_;
803 20 50       89 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
804              
805             # In Windows, text files have '\r\n' as line separator, but when reading in
806             # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
807             # "length $_" will report 4 although the line is 5 bytes in length.
808             # We assume that all lines have the same line separator and only read current line.
809 20         972 my $init_pos = tell($fh);
810 20         166 my $curr_line = <$fh>;
811 20         23 my $pos_diff = tell($fh) - $init_pos;
812 20         26 my $correction = $pos_diff - length $curr_line;
813 20         124 close $fh;
814              
815 20 50       72 $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
816 20         56 return $self->{termination_length};
817             }
818              
819              
820             sub _calc_offset {
821             # Get the offset of the n-th residue of the sequence with the given ID
822             # and termination length (tl)
823 88     88   83 my ($self, $id, $n) = @_;
824 88         68 my $tl = $self->{termination_length};
825 88         68 $n--;
826 88         125 my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
  88         107  
827 88 100       177 $n = 0 if $n < 0;
828 88 100       101 $n = $seqlen-1 if $n >= $seqlen;
829 88         209 return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
830             }
831              
832              
833             sub _fh {
834             # Given a sequence ID, return the filehandle on which to find this sequence
835 49     49   44 my ($self, $id) = @_;
836 49 50       68 $self->throw('Need to provide a sequence ID') if not defined $id;
837 49 100       81 my $file = $self->file($id) or return;
838 48 0       420 return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
839             $self->throw( "Can't open file $file");
840             }
841              
842              
843             sub _fhcache {
844 48     48   51 my ($self, $path) = @_;
845 48 100       87 if (!$self->{fhcache}{$path}) {
846 9 50       22 if ($self->{curopen} >= $self->{maxopen}) {
847 0         0 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
848 0         0 keys %{$self->{fhcache}};
  0         0  
849 0         0 splice(@lru, $self->{maxopen} / 3);
850 0         0 $self->{curopen} -= @lru;
851 0         0 for (@lru) {
852 0         0 delete $self->{fhcache}{$_};
853             }
854             }
855 9   50     34 $self->{fhcache}{$path} = IO::File->new($path) || return;
856 9         463 binmode $self->{fhcache}{$path};
857 9         16 $self->{curopen}++;
858             }
859 48         58 $self->{cacheseq}{$path}++;
860 48         126 return $self->{fhcache}{$path};
861             }
862              
863              
864             #-------------------------------------------------------------
865             # Methods to store and retrieve data from indexed file
866             #
867              
868             =head2 offset
869              
870             Title : offset
871             Usage : my $offset = $db->offset($id);
872             Function: Get the offset of the indicated sequence from the beginning of the
873             file in which it is located. The offset points to the beginning of
874             the sequence, not the beginning of the header line.
875             Returns : String
876             Args : ID of sequence
877              
878             =cut
879              
880             sub offset {
881 1     1 1 3 my ($self, $id) = @_;
882 1 50       4 $self->throw('Need to provide a sequence ID') if not defined $id;
883 1 50       13 my $offset = $self->{offsets}{$id} or return;
884 1         2 return (&{$self->{unpackmeth}}($offset))[0];
  1         3  
885             }
886              
887              
888             =head2 strlen
889              
890             Title : strlen
891             Usage : my $length = $db->strlen($id);
892             Function: Get the number of characters in the sequence string.
893             Returns : Integer
894             Args : ID of sequence
895              
896             =cut
897              
898             sub strlen {
899 16     16 1 12 my ($self, $id) = @_;
900 16 50       22 $self->throw('Need to provide a sequence ID') if not defined $id;
901 16 50       85 my $offset = $self->{offsets}{$id} or return;
902 16         21 return (&{$self->{unpackmeth}}($offset))[1];
  16         21  
903             }
904              
905              
906             =head2 length
907              
908             Title : length
909             Usage : my $length = $db->length($id);
910             Function: Get the number of residues of the sequence.
911             Returns : Integer
912             Args : ID of sequence
913              
914             =cut
915              
916             sub length {
917 30     30 1 34 my ($self, $id) = @_;
918 30 50       48 $self->throw('Need to provide a sequence ID') if not defined $id;
919 30 100       200 my $offset = $self->{offsets}{$id} or return;
920 29         40 return (&{$self->{unpackmeth}}($offset))[2];
  29         43  
921             }
922              
923              
924             =head2 linelen
925              
926             Title : linelen
927             Usage : my $linelen = $db->linelen($id);
928             Function: Get the length of the line for this sequence.
929             Returns : Integer
930             Args : ID of sequence
931              
932             =cut
933              
934             sub linelen {
935 0     0 1 0 my ($self, $id) = @_;
936 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
937 0 0       0 my $offset = $self->{offsets}{$id} or return;
938 0         0 return (&{$self->{unpackmeth}}($offset))[3];
  0         0  
939             }
940              
941              
942             =head2 headerlen
943              
944             Title : headerlen
945             Usage : my $length = $db->headerlen($id);
946             Function: Get the length of the header line for the indicated sequence.
947             Returns : Integer
948             Args : ID of sequence
949              
950             =cut
951              
952             sub headerlen {
953 0     0 1 0 my ($self, $id) = @_;
954 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
955 0 0       0 my $offset = $self->{offsets}{$id} or return;
956 0         0 return (&{$self->{unpackmeth}}($offset))[4];
  0         0  
957             }
958              
959              
960             =head2 header_offset
961              
962             Title : header_offset
963             Usage : my $offset = $db->header_offset($id);
964             Function: Get the offset of the header line for the indicated sequence from
965             the beginning of the file in which it is located.
966             Returns : String
967             Args : ID of sequence
968              
969             =cut
970              
971             sub header_offset {
972 0     0 1 0 my ($self, $id) = @_;
973 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
974 0 0       0 return if not $self->{offsets}{$id};
975 0         0 return $self->offset($id) - $self->headerlen($id);
976             }
977              
978              
979             =head2 alphabet
980              
981             Title : alphabet
982             Usage : my $alphabet = $db->alphabet($id);
983             Function: Get the molecular type of the indicated sequence: dna, rna or protein
984             Returns : String
985             Args : ID of sequence
986              
987             =cut
988              
989             sub alphabet {
990 13     13 1 19 my ($self, $id) = @_;
991 13 50       23 $self->throw('Need to provide a sequence ID') if not defined $id;
992 13 50       74 my $offset = $self->{offsets}{$id} or return;
993 13         21 my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
  13         22  
994 13 100       59 return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna'
    100          
    100          
995             : $alphabet == Bio::DB::IndexedBase::RNA ? 'rna'
996             : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
997             : '';
998             }
999              
1000              
1001             =head2 file
1002              
1003             Title : file
1004             Usage : my $file = $db->file($id);
1005             Function: Get the the name of the file in which the indicated sequence can be
1006             found.
1007             Returns : String
1008             Args : ID of sequence
1009              
1010             =cut
1011              
1012             sub file {
1013 59     59 1 53 my ($self, $id) = @_;
1014 59 50       84 $self->throw('Need to provide a sequence ID') if not defined $id;
1015 59 100       362 my $offset = $self->{offsets}{$id} or return;
1016 58         77 return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
  58         85  
1017             }
1018              
1019              
1020             sub _fileno2path {
1021 58     58   57 my ($self, $fileno) = @_;
1022 58         166 return $self->{fileno2path}->[$fileno];
1023             }
1024              
1025              
1026             sub _path2fileno {
1027 125     125   142 my ($self, $path) = @_;
1028 125 100       243 if ( not exists $self->{filepath2no}->{$path} ) {
1029 63         143 my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
1030 63         85 $self->{fileno2path}->[$fileno] = $path; # Save path
1031             }
1032 125         143 return $self->{filepath2no}->{$path};
1033              
1034             }
1035              
1036              
1037             sub _packSmall {
1038 3052     3052   8245 return pack STRUCT, @_;
1039             }
1040              
1041              
1042             sub _packBig {
1043 0     0   0 return pack STRUCTBIG, @_;
1044             }
1045              
1046              
1047             sub _unpackSmall {
1048 209     209   778 return unpack STRUCT, shift;
1049             }
1050              
1051              
1052             sub _unpackBig {
1053 0     0   0 return unpack STRUCTBIG, shift;
1054             }
1055              
1056              
1057             sub _set_pack_method {
1058             # Determine whether to use 32 or 64 bit integers for the given files.
1059 20     20   20 my $self = shift;
1060             # Find the maximum file size:
1061 20         32 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
  88         70  
  63         451  
1062 20         23 my $fourGB = (2 ** 32) - 1;
1063              
1064 20 50       31 if ($maxsize > $fourGB) {
1065             # At least one file exceeds 4Gb - we will need to use 64 bit ints
1066 0         0 $self->{packmeth} = \&_packBig;
1067 0         0 $self->{unpackmeth} = \&_unpackBig;
1068             } else {
1069 20         34 $self->{packmeth} = \&_packSmall;
1070 20         34 $self->{unpackmeth} = \&_unpackSmall;
1071             }
1072 20         21 return 1;
1073             }
1074              
1075              
1076             #-------------------------------------------------------------
1077             # Tied hash logic
1078             #
1079              
1080             sub TIEHASH {
1081 2     2   26 return shift->new(@_);
1082             }
1083              
1084              
1085             sub FETCH {
1086 3     3   42 return shift->subseq(@_);
1087             }
1088              
1089              
1090             sub STORE {
1091 0     0   0 shift->throw("Read-only database");
1092             }
1093              
1094              
1095             sub DELETE {
1096 0     0   0 shift->throw("Read-only database");
1097             }
1098              
1099              
1100             sub CLEAR {
1101 0     0   0 shift->throw("Read-only database");
1102             }
1103              
1104              
1105             sub EXISTS {
1106 1     1   5 return defined shift->offset(@_);
1107             }
1108              
1109              
1110             sub FIRSTKEY {
1111 4     4   4 return tied(%{shift->{offsets}})->FIRSTKEY(@_);
  4         35  
1112             }
1113              
1114              
1115             sub NEXTKEY {
1116 9     9   8 return tied(%{shift->{offsets}})->NEXTKEY(@_);
  9         32  
1117             }
1118              
1119              
1120             sub DESTROY {
1121 27     27   1815 my $self = shift;
1122              
1123             # Close filehandles
1124 27         25 while (my ($file, $fh) = each %{ $self->{fhcache} }) {
  37         228  
1125 10 50       20 if (defined $fh) {
1126 10         35 $fh->close;
1127             }
1128             }
1129 27         327 $self->_close_index($self->{offsets});
1130              
1131 27 100 66     91 if ( $self->{clean} || $self->{indexing} ) {
1132             # Indexing aborted or cleaning requested. Delete the index file.
1133 6         9 my $index = $self->{index_name};
1134              
1135             # Tied-hash in Strawberry Perl creates "$file.index"
1136 6 50       53 unlink $index if -e $index;
1137             # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
1138 6 100       221 unlink "$index.dir" if -e "$index.dir";
1139 6 100       191 unlink "$index.pag" if -e "$index.pag";
1140             }
1141 27         185 return 1;
1142             }
1143              
1144              
1145             #-------------------------------------------------------------
1146             # stream-based access to the database
1147             #
1148              
1149             package Bio::DB::Indexed::Stream;
1150 2     2   10 use base qw(Tie::Handle Bio::DB::SeqI);
  2         2  
  2         798  
1151              
1152              
1153             sub new {
1154 4     4   5 my ($class, $db) = @_;
1155 4         13 my $key = $db->FIRSTKEY;
1156 4         23 return bless {
1157             db => $db,
1158             key => $key
1159             }, $class;
1160             }
1161              
1162             sub next_seq {
1163 10     10   10 my $self = shift;
1164 10         9 my ($key, $db) = @{$self}{'key', 'db'};
  10         15  
1165 10 100       17 return if not defined $key;
1166 9         12 my $value = $db->get_Seq_by_id($key);
1167 9         18 $self->{key} = $db->NEXTKEY($key);
1168 9         17 return $value;
1169             }
1170              
1171             sub TIEHANDLE {
1172 1     1   2 my ($class, $db) = @_;
1173 1         2 return $class->new($db);
1174             }
1175              
1176             sub READLINE {
1177 1     1   1 my $self = shift;
1178 1   50     2 return $self->next_seq || undef;
1179             }
1180              
1181              
1182             1;