File Coverage

Bio/DB/IndexedBase.pm
Criterion Covered Total %
statement 286 323 88.5
branch 90 136 66.1
condition 36 57 63.1
subroutine 64 75 85.3
pod 20 20 100.0
total 496 611 81.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::IndexedBase
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7             =head1 NAME
8              
9             Bio::DB::IndexedBase - Base class for modules using indexed sequence files
10              
11             =head1 SYNOPSIS
12              
13             use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
14              
15             # 1/ Bio::SeqIO-style access
16              
17             # Index some sequence files
18             my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file
19             my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
20             my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory
21              
22             # Get IDs of all the sequences in the database
23             my @ids = $db->get_all_primary_ids;
24              
25             # Get a specific sequence
26             my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
27              
28             # Loop through all sequences
29             my $stream = $db->get_PrimarySeq_stream;
30             while (my $seq = $stream->next_seq) {
31             # Do something...
32             }
33              
34              
35             # 2/ Access via filehandle
36             my $fh = Bio::DB::XXX->newFh('/path/to/file');
37             while (my $seq = <$fh>) {
38             # Do something...
39             }
40              
41              
42             # 3/ Tied-hash access
43             tie %sequences, 'Bio::DB::XXX', '/path/to/file';
44             print $sequences{'CHROMOSOME_I:1,20000'};
45              
46             =head1 DESCRIPTION
47              
48             Bio::DB::IndexedBase provides a base class for modules that want to index
49             and read sequence files and provides persistent, random access to each sequence
50             entry, without bringing the entire file into memory. This module is compliant
51             with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use
52             Bio::DB::IndexedBase.
53              
54             When you initialize the module, you point it at a single file, several files, or
55             a directory of files. The first time it is run, the module generates an index
56             of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
57             followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
58             index file to find the sequence file and offset for any requested sequence. If
59             one of the source files is updated, the module reindexes just that one file. You
60             can also force reindexing manually at any time. For improved performance, the
61             module keeps a cache of open filehandles, closing less-recently used ones when
62             the cache is full.
63              
64             Entries may have any line length up to 65,536 characters, and different line
65             lengths are allowed in the same file. However, within a sequence entry, all
66             lines must be the same length except for the last. An error will be thrown if
67             this is not the case!
68              
69             This module was developed for use with the C. elegans and human genomes, and has
70             been tested with sequence segments as large as 20 megabases. Indexing the C.
71             elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
72             minutes on my 300 MHz pentium laptop. On the same system, average access time
73             for any 200-mer within the C. elegans genome was E0.02s.
74              
75             =head1 DATABASE CREATION AND INDEXING
76              
77             The two constructors for this class are new() and newFh(). The former creates a
78             Bio::DB::IndexedBase object which is accessed via method calls. The latter
79             creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
80             objects in a stream fashion. There is also a tied hash interface.
81              
82             =over
83              
84             =item $db = Bio::DB::IndexedBase-Enew($path [,%options])
85              
86             Create a new Bio::DB::IndexedBase object from the files designated by $path
87             $path may be a single file, an arrayref of files, or a directory containing
88             such files.
89              
90             After the database is created, you can use methods like get_all_primary_ids()
91             and get_Seq_by_id() to retrieve sequence objects.
92              
93             =item $fh = Bio::DB::IndexedBase-EnewFh($path [,%options])
94              
95             Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
96             from this filehandle with EE will return a stream of sequence objects,
97             Bio::SeqIO style. The path and the options should be specified as for new().
98              
99             =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
100              
101             Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
102             path to the files. The optional @args list is the same set used by new(). If
103             successful, tie() returns the tied object, undef otherwise.
104              
105             Once tied, you can use the hash to retrieve an individual sequence by
106             its ID, like this:
107              
108             my $seq = $db{CHROMOSOME_I};
109              
110             The keys() and values() functions will return the sequence IDs and their
111             sequences, respectively. In addition, each() can be used to iterate over the
112             entire data set:
113              
114             while (my ($id,$sequence) = each %db) {
115             print "$id => $sequence\n";
116             }
117              
118              
119             When dealing with very large sequences, you can avoid bringing them into memory
120             by calling each() in a scalar context. This returns the key only. You can then
121             use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
122              
123             while (my $id = each %db) {
124             print "$id: $db{$sequence:1,100}\n";
125             print "$id: ".tied(%db)->length($id)."\n";
126             }
127              
128             In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
129             to retrieve the first and next ID in the database, respectively. This allows one to
130             write the following iterative loop using just the object-oriented interface:
131              
132             my $db = Bio::DB::IndexedBase->new('/path/to/file');
133             for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
134             # do something with sequence
135             }
136              
137             =back
138              
139             =head1 INDEX CONTENT
140              
141             Several attributes of each sequence are stored in the index file. Given a
142             sequence ID, these attributes can be retrieved using the following methods:
143              
144             =over
145              
146             =item offset($id)
147              
148             Get the offset of the indicated sequence from the beginning of the file in which
149             it is located. The offset points to the beginning of the sequence, not the
150             beginning of the header line.
151              
152             =item strlen($id)
153              
154             Get the number of characters in the sequence string.
155              
156             =item length($id)
157              
158             Get the number of residues of the sequence.
159              
160             =item linelen($id)
161              
162             Get the length of the line for this sequence. If the sequence is wrapped, then
163             linelen() is likely to be much shorter than strlen().
164              
165             =item headerlen($id)
166              
167             Get the length of the header line for the indicated sequence.
168              
169             =item header_offset
170              
171             Get the offset of the header line for the indicated sequence from the beginning
172             of the file in which it is located. This attribute is not stored. It is
173             calculated from offset() and headerlen().
174              
175             =item alphabet($id)
176              
177             Get the molecular type (alphabet) of the indicated sequence. This method handles
178             residues according to the IUPAC convention.
179              
180             =item file($id)
181              
182             Get the the name of the file in which the indicated sequence can be found.
183              
184             =back
185              
186             =head1 INTERFACE COMPLIANCE NOTES
187              
188             Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
189             Bio::RandomAccessI interfaces.
190              
191             Database do not necessarily provide any meaningful internal primary ID for the
192             sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
193             the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
194             and get_Seq_by_primary_id() will return the same sequence.
195              
196             Since this database index has no notion of sequence version or namespace, the
197             get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
198              
199             =head1 BUGS
200              
201             When a sequence is deleted from one of the files, this deletion is not detected
202             by the module and removed from the index. As a result, a "ghost" entry will
203             remain in the index and will return garbage results if accessed.
204              
205             Also, if you are indexing a directory, it is wise to not add or remove files
206             from it.
207              
208             In case you have changed the files in a directory, or the sequences in a file,
209             you can to rebuild the entire index, either by deleting it manually, or by
210             passing -reindex=E1 to new() when initializing the module.
211              
212             =head1 SEE ALSO
213              
214             L
215              
216             L
217              
218             L
219              
220             =head1 AUTHOR
221              
222             Lincoln Stein Elstein@cshl.orgE.
223              
224             Copyright (c) 2001 Cold Spring Harbor Laboratory.
225              
226             Florent Angly (for the modularization)
227              
228             This library is free software; you can redistribute it and/or modify
229             it under the same terms as Perl itself. See DISCLAIMER.txt for
230             disclaimers of warranty.
231              
232             =head1 APPENDIX
233              
234             The rest of the documentation details each of the object
235             methods. Internal methods are usually preceded with a _
236              
237             =cut
238              
239              
240             package Bio::DB::IndexedBase;
241              
242             BEGIN {
243             @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
244 2 50   2   83 if(!$INC{'AnyDBM_File.pm'});
245             }
246              
247 2     2   10 use strict;
  2         3  
  2         36  
248 2     2   9 use warnings;
  2         4  
  2         49  
249 2     2   7 use IO::File;
  2         3  
  2         291  
250 2     2   570 use AnyDBM_File;
  2         4373  
  2         80  
251 2     2   12 use Fcntl;
  2         3  
  2         484  
252 2     2   14 use File::Spec;
  2         3  
  2         42  
253 2     2   9 use File::Basename qw(basename dirname);
  2         2  
  2         118  
254 2     2   649 use Bio::PrimarySeq;
  2         4  
  2         59  
255              
256 2     2   11 use base qw(Bio::DB::SeqI);
  2         4  
  2         476  
257              
258             # Store offset, strlen, linelen, headerlen, type and fileno
259 2     2   11 use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length
  2         4  
  2         116  
260 2     2   15 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
  2         3  
  2         75  
261              
262 2     2   8 use constant NA => 0;
  2         4  
  2         67  
263 2     2   10 use constant DNA => 1;
  2         4  
  2         62  
264 2     2   11 use constant RNA => 2;
  2         3  
  2         64  
265 2     2   9 use constant PROTEIN => 3;
  2         4  
  2         94  
266              
267             # You can avoid dying if you want but you may get incorrect results
268 2     2   10 use constant DIE_ON_MISSMATCHED_LINES => 1;
  2         3  
  2         1334  
269              
270             # Remove carriage returns (\r) and newlines (\n) from a string. When
271             # called from subseq, this can take a signficiant portion of time, in
272             # Variant Effect Prediction. Therefore we compile the match portion.
273             sub _strip_crnl {
274 2     2   164 eval 'require Inline::C';
275 2 50       41581 if ( $INC{'Inline/C.pm'} ) {
276             # C can do _strip_crnl much faster. But this requires the
277             # Inline::C module which we don't require people to have. So we make
278             # this optional by wrapping the C code in an eval. If the eval works,
279             # the Perl strip_crnl() function is overwritten.
280 2         13 Inline->bind(
281             C => q(
282             /*
283             Strip all newlines (\n) and carriage returns (\r) from the string
284             */
285             char* _strip_crnl(char* str) {
286             char *s;
287             char *s2 = str;
288             for (s = str; *s; *s++) {
289             if (*s != '\n' && *s != '\r') {
290             *s2++ = *s;
291             }
292             }
293             *s2 = '\0';
294             return str;
295             }
296             )
297             );
298             } else {
299             # "tr" is much faster than the regex, with "s"
300             *Bio::DB::IndexedBase::_strip_crnl = sub {
301 0     0   0 my $str = shift;
302 0         0 $str =~ tr/\n\r//d;
303 0         0 return $str;
304 0         0 };
305             }
306              
307 2         1057515 return _strip_crnl(@_);
308             }
309              
310             =head2 new
311              
312             Title : new
313             Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
314             Function: Initialize a new database object
315             Returns : A Bio::DB::IndexedBase object
316             Args : A single file, or path to dir, or arrayref of files
317             Optional arguments:
318              
319             Option Description Default
320             ----------- ----------- -------
321             -glob Glob expression to search for files in directories *
322             -makeid A code subroutine for transforming IDs None
323             -maxopen Maximum size of filehandle cache 32
324             -debug Turn on status messages 0
325             -reindex Force the index to be rebuilt 0
326             -dbmargs Additional arguments to pass to the DBM routine None
327             -index_name Name of the file that will hold the indices
328             -clean Remove the index file when finished 0
329              
330             The -dbmargs option can be used to control the format of the index. For example,
331             you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
332             retrieved alphabetically. Note that you must use the same arguments every time
333             you open the index!
334              
335             The -makeid option gives you a chance to modify sequence IDs during indexing.
336             For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
337             that GenBank Fasta files use. The original header line can be recovered later.
338             The option value for -makeid should be a code reference that takes a scalar
339             argument (the full header line) and returns a scalar or an array of scalars (the
340             ID or IDs you want to assign). For example:
341              
342             $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
343              
344             sub extract_gi {
345             # Extract GI from GenBank
346             my $header = shift;
347             my ($id) = ($header =~ /gi\|(\d+)/m);
348             return $id || '';
349             }
350              
351             extract_gi() will be called with the full header line, e.g. a Fasta line would
352             include the "E", the ID and the description:
353              
354             >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
355              
356             In the database, this sequence can now be retrieved by its GI instead of its
357             complete ID:
358              
359             my $seq = $db->get_Seq_by_id(352962132);
360              
361             The -makeid option is ignored after the index is constructed.
362              
363             =cut
364              
365             sub new {
366 20     20 1 865 my ($class, $path, %opts) = @_;
367              
368             my $self = bless {
369             debug => $opts{-debug} || 0,
370             makeid => $opts{-makeid},
371             glob => $opts{-glob} || eval '$'.$class.'::file_glob' || '*',
372             maxopen => $opts{-maxopen} || 32,
373             clean => $opts{-clean} || 0,
374             dbmargs => $opts{-dbmargs} || undef,
375             fhcache => {},
376             cacheseq => {},
377             curopen => 0,
378             openseq => 1,
379             dirname => undef,
380             offsets => undef,
381             index_name => $opts{-index_name},
382             obj_class => eval '$'.$class.'::obj_class',
383 20   50     1170 offset_meth => \&{$class.'::_calculate_offsets'},
  20   50     292  
      50        
      100        
      50        
384             fileno2path => [],
385             filepath2no => {},
386             }, $class;
387              
388 20         74 my ($offsets, $dirname);
389 20   100     93 my $ref = ref $path || '';
390 20 100       51 if ( $ref eq 'ARRAY' ) {
391 1         9 $offsets = $self->index_files($path, $opts{-reindex});
392 1         7 require Cwd;
393 1         5 $dirname = Cwd::getcwd();
394             } else {
395 19   66     133 $self->{index_name} ||= $self->_default_index_name($path);
396 19 100       131 if (-d $path) {
    50          
397             # because Win32 glob() is broken with respect to long file names
398             # that contain whitespace.
399 9 50 33     58 $path = Win32::GetShortPathName($path)
400             if $^O =~ /^MSWin/i && eval 'use Win32; 1';
401 9         49 $offsets = $self->index_dir($path, $opts{-reindex});
402 8         26 $dirname = $path;
403             } elsif (-f _) {
404 10         49 $offsets = $self->index_file($path, $opts{-reindex});
405 9         355 $dirname = dirname($path);
406             } else {
407 0         0 $self->throw( "No file or directory called '$path'");
408             }
409             }
410 18         39 @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
  18         53  
411              
412 18         155 return $self;
413             }
414              
415              
416             =head2 newFh
417              
418             Title : newFh
419             Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
420             Function: Index and get a new Fh for a single file, several files or a directory
421             Returns : Filehandle object
422             Args : Same as new()
423              
424             =cut
425              
426             sub newFh {
427 1     1 1 71 my ($class, @args) = @_;
428 1         6 my $self = $class->new(@args);
429 1         6 require Symbol;
430 1         3 my $fh = Symbol::gensym;
431 1 50       59 tie $$fh, 'Bio::DB::Indexed::Stream', $self
432             or $self->throw("Could not tie filehandle: $!");
433 1         5 return $fh;
434             }
435              
436              
437             =head2 dbmargs
438              
439             Title : dbmargs
440             Usage : my @args = $db->dbmargs;
441             Function: Get stored dbm arguments
442             Returns : Array
443             Args : None
444              
445             =cut
446              
447             sub dbmargs {
448 38     38 1 51 my $self = shift;
449 38 50       127 my $args = $self->{dbmargs} or return;
450 0 0       0 return ref($args) eq 'ARRAY' ? @$args : $args;
451             }
452              
453              
454             =head2 glob
455              
456             Title : glob
457             Usage : my $glob = $db->glob;
458             Function: Get the expression used to match files in directories
459             Returns : String
460             Args : None
461              
462             =cut
463              
464             sub glob {
465 2     2 1 4 my $self = shift;
466 2         11 return $self->{glob};
467             }
468              
469              
470             =head2 index_dir
471              
472             Title : index_dir
473             Usage : $db->index_dir($dir);
474             Function: Index the files that match -glob in the given directory
475             Returns : Hashref of offsets
476             Args : Dirname
477             Boolean to force a reindexing the directory
478              
479             =cut
480              
481             sub index_dir {
482 9     9 1 31 my ($self, $dir, $force_reindex) = @_;
483 9         1789 my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
484 9 50       48 return if scalar @files == 0;
485 9   33     31 $self->{index_name} ||= $self->_default_index_name($dir);
486 9         45 my $offsets = $self->_index_files(\@files, $force_reindex);
487 8         39 return $offsets;
488             }
489              
490              
491             =head2 get_all_primary_ids
492              
493             Title : get_all_primary_ids, get_all_ids, ids
494             Usage : my @ids = $db->get_all_primary_ids;
495             Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
496             implementation. Note that in this implementation, the internal
497             database primary IDs are also the sequence IDs.
498             Returns : List of ids
499             Args : None
500              
501             =cut
502              
503             sub get_all_primary_ids {
504 6     6 1 13 return keys %{shift->{offsets}};
  6         216  
505             }
506              
507             {
508 2     2   18 no warnings 'once';
  2         3  
  2         731  
509             *ids = *get_all_ids = \&get_all_primary_ids;
510             }
511              
512              
513             =head2 index_file
514              
515             Title : index_file
516             Usage : $db->index_file($filename);
517             Function: Index the given file
518             Returns : Hashref of offsets
519             Args : Filename
520             Boolean to force reindexing the file
521              
522             =cut
523              
524             sub index_file {
525 10     10 1 30 my ($self, $file, $force_reindex) = @_;
526 10   33     31 $self->{index_name} ||= $self->_default_index_name($file);
527 10         32 my $offsets = $self->_index_files([$file], $force_reindex);
528 9         40 return $offsets;
529             }
530              
531             sub _default_index_name {
532 18     18   42 my ($self,$path) = @_;
533 18 100       393 return File::Spec->catfile($path,'directory.index') if -d $path;
534 9         41 return "$path.index";
535             }
536              
537             =head2 index_files
538              
539             Title : index_files
540             Usage : $db->index_files(\@files);
541             Function: Index the given files
542             Returns : Hashref of offsets
543             Args : Arrayref of filenames
544             Boolean to force reindexing the files
545              
546             =cut
547              
548             sub index_files {
549 1     1 1 3 my ($self, $files, $force_reindex) = @_;
550 1         5 my @paths = map { File::Spec->rel2abs($_) } @$files;
  2         66  
551 1         8 require Digest::MD5;
552 1         11 my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
553 1   33     8 $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
554 1         4 my $offsets = $self->_index_files($files, $force_reindex);
555 1         3 return $offsets;
556             }
557              
558              
559             =head2 index_name
560              
561             Title : index_name
562             Usage : my $indexname = $db->index_name($path);
563             Function: Get the full name of the index file
564             Returns : String
565             Args : None
566              
567             =cut
568              
569             sub index_name {
570 38     38 1 1244 return shift->{index_name};
571             }
572              
573              
574             =head2 path
575              
576             Title : path
577             Usage : my $path = $db->path($path);
578             Function: When a single file or a directory of files is indexed, this returns
579             the file directory. When indexing an arbitrary list of files, the
580             return value is the path of the current working directory.
581             Returns : String
582             Args : None
583              
584             =cut
585              
586             sub path {
587 0     0 1 0 return shift->{dirname};
588             }
589              
590              
591             =head2 get_PrimarySeq_stream
592              
593             Title : get_PrimarySeq_stream
594             Usage : my $stream = $db->get_PrimarySeq_stream();
595             Function: Get a SeqIO-like stream of sequence objects. The stream supports a
596             single method, next_seq(). Each call to next_seq() returns a new
597             PrimarySeqI compliant sequence object, until no more sequences remain.
598             This is a Bio::DB::SeqI method implementation.
599             Returns : A Bio::DB::Indexed::Stream object
600             Args : None
601              
602             =cut
603              
604             sub get_PrimarySeq_stream {
605 3     3 1 6 my $self = shift;
606 3         21 return Bio::DB::Indexed::Stream->new($self);
607             }
608              
609              
610             =head2 get_Seq_by_id
611              
612             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
613             Usage : my $seq = $db->get_Seq_by_id($id);
614             Function: Given an ID, fetch the corresponding sequence from the database.
615             This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
616             Returns : A sequence object
617             Args : ID
618              
619             =cut
620              
621             sub get_Seq_by_id {
622 22     22 1 640 my ($self, $id) = @_;
623 22 50       48 $self->throw('Need to provide a sequence ID') if not defined $id;
624 22 100       233 return if not exists $self->{offsets}{$id};
625 18         120 return $self->{obj_class}->new($self, $id);
626             }
627              
628             {
629 2     2   13 no warnings 'once';
  2         6  
  2         4151  
630             *get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
631             }
632              
633              
634             =head2 _calculate_offsets
635              
636             Title : _calculate_offsets
637             Usage : $db->_calculate_offsets($filename, $offsets);
638             Function: This method calculates the sequence offsets in a file based on ID and
639             should be implemented by classes that use Bio::DB::IndexedBase.
640             Returns : Hash of offsets
641             Args : File to process
642             Hashref of file offsets keyed by IDs.
643              
644             =cut
645              
646             sub _calculate_offsets {
647 0     0   0 my $self = shift;
648 0         0 $self->throw_not_implemented();
649             }
650              
651              
652             sub _index_files {
653             # Do the indexing of the given files using the index file on record
654 20     20   50 my ($self, $files, $force_reindex) = @_;
655              
656 20         77 $self->_set_pack_method( @$files );
657              
658             # Get name of index file
659 20         73 my $index = $self->index_name;
660              
661             # If caller has requested reindexing, unlink the index file.
662 20 100       50 if ($force_reindex) {
663             # Tied-hash in Strawberry Perl creates "$file.index"
664 12 50       134 unlink $index if -e $index;
665             # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
666 12 100       231 unlink "$index.dir" if -e "$index.dir";
667 12 100       162 unlink "$index.pag" if -e "$index.pag";
668             }
669              
670             # Get the modification time of the index
671 20   50     224 my $indextime = (stat $index)[9] || 0;
672              
673             # Register files and find if there has been any update
674 20         39 my $modtime = 0;
675 20         44 my @updated;
676 20         54 for my $file (@$files) {
677             # Register file
678 63         1560 $self->_path2fileno(basename($file));
679             # Any update?
680 63   50     499 my $m = (stat $file)[9] || 0;
681 63 100       142 if ($m > $modtime) {
682 20         32 $modtime = $m;
683             }
684 63 50       109 if ($m > $indextime) {
685 63         125 push @updated, $file;
686             }
687             }
688              
689             # Get termination length from first file
690 20         82 $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
691              
692             # Reindex contents of changed files if needed
693 20   66     76 my $reindex = $force_reindex || (scalar @updated > 0);
694 20 50       77 $self->{offsets} = $self->_open_index($index, $reindex) or return;
695 20 50       58 if ($reindex) {
696 20         37 $self->{indexing} = $index;
697 20         42 for my $file (@updated) {
698 62         3011 my $fileno = $self->_path2fileno(basename($file));
699 62         172 &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
  62         499  
700             }
701 18         56 delete $self->{indexing};
702             }
703              
704             # Closing and reopening might help corrupted index file problem on Windows
705 18         100 $self->_close_index($self->{offsets});
706              
707 18         57 return $self->{offsets} = $self->_open_index($index);
708             }
709              
710              
711             sub _open_index {
712             # Open index file in read-only or write mode
713 38     38   82 my ($self, $index_file, $write) = @_;
714 38         53 my %offsets;
715 38 100       74 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
716 38         111 my @dbmargs = $self->dbmargs;
717 38 50       1539 tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
718             or $self->throw( "Could not open index file $index_file: $!");
719 38         204 return \%offsets;
720             }
721              
722              
723             sub _close_index {
724             # Close index file
725 45     45   81 my ($self, $index) = @_;
726 45         367 untie %$index;
727 45         75 return 1;
728             }
729              
730             # Compiling the below regular expression speeds up _parse_compound_id
731             my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
732              
733             sub _parse_compound_id {
734             # Handle compound IDs:
735             # $db->seq($id)
736             # $db->seq($id, $start, $stop, $strand)
737             # $db->seq("$id:$start,$stop")
738             # $db->seq("$id:$start..$stop")
739             # $db->seq("$id:$start-$stop")
740             # $db->seq("$id:$start,$stop/$strand")
741             # $db->seq("$id:$start..$stop/$strand")
742             # $db->seq("$id:$start-$stop/$strand")
743             # $db->seq("$id/$strand")
744 45     45   91 my ($self, $id, $start, $stop, $strand) = @_;
745              
746 45 100 33     402 if ( (not defined $start ) &&
      66        
747             (not defined $stop ) &&
748             (not defined $strand) &&
749             ($id =~ m{$compound_id}) ) {
750             # Start, stop and strand not provided and ID looks like a compound ID
751 20         88 ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
752             }
753              
754             # Start, stop and strand defaults
755 45   100     107 $stop ||= $self->length($id) || 0; # 0 if sequence not found in database
      100        
756 45 100 100     97 $start ||= ($stop > 0) ? 1 : 0;
757 45   100     117 $strand ||= 1;
758              
759             # Convert numbers such as 1_000_000 to 1000000
760 45         88 $start =~ s/_//g;
761 45         70 $stop =~ s/_//g;
762              
763 45 100       86 if ($start > $stop) {
764             # Change the strand
765 8         22 ($start, $stop) = ($stop, $start);
766 8         16 $strand *= -1;
767             }
768              
769 45         153 return $id, $start, $stop, $strand;
770             }
771              
772              
773             sub _guess_alphabet {
774             # Determine the molecular type of the given sequence string:
775             # 'dna', 'rna', 'protein' or '' (unknown/empty)
776 2997     2997   4162 my ($self, $string) = @_;
777             # Handle IUPAC residues like PrimarySeq does
778 2997         5398 my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
779 2997 50       8303 return $alphabet eq 'dna' ? DNA
    100          
    100          
780             : $alphabet eq 'rna' ? RNA
781             : $alphabet eq 'protein' ? PROTEIN
782             : NA;
783             }
784              
785              
786             sub _makeid {
787             # Process the header line by applying any transformation given in -makeid
788 3053     3053   5120 my ($self, $header_line) = @_;
789 3053 100       10989 return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
790             }
791              
792              
793             sub _check_linelength {
794             # Check that the line length is valid. Generate an error otherwise.
795 105     105   212 my ($self, $linelength) = @_;
796 105 100       233 return if not defined $linelength;
797 96 50       284 $self->throw(
798             "Each line of the file must be less than 65,536 characters. Line ".
799             "$. is $linelength chars."
800             ) if $linelength > 65535;
801             }
802              
803              
804             sub _calc_termination_length {
805             # Try the beginning of the file to determine termination length
806             # Account for crlf-terminated Windows and Mac files
807 20     20   50 my ($self, $file) = @_;
808 20 50       134 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
809              
810             # In Windows, text files have '\r\n' as line separator, but when reading in
811             # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
812             # "length $_" will report 4 although the line is 5 bytes in length.
813             # We assume that all lines have the same line separator and only read current line.
814 20         1411 my $init_pos = tell($fh);
815 20         218 my $curr_line = <$fh>;
816 20         50 my $pos_diff = tell($fh) - $init_pos;
817 20         44 my $correction = $pos_diff - length $curr_line;
818 20         106 close $fh;
819              
820 20 50       87 $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
821 20         69 return $self->{termination_length};
822             }
823              
824              
825             sub _calc_offset {
826             # Get the offset of the n-th residue of the sequence with the given ID
827             # and termination length (tl)
828 88     88   141 my ($self, $id, $n) = @_;
829 88         102 my $tl = $self->{termination_length};
830 88         93 $n--;
831 88         171 my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
  88         112  
832 88 100       226 $n = 0 if $n < 0;
833 88 100       126 $n = $seqlen-1 if $n >= $seqlen;
834 88         254 return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
835             }
836              
837              
838             sub _fh {
839             # Given a sequence ID, return the filehandle on which to find this sequence
840 49     49   76 my ($self, $id) = @_;
841 49 50       92 $self->throw('Need to provide a sequence ID') if not defined $id;
842 49 100       100 my $file = $self->file($id) or return;
843 48 0       84 return eval {
844 48         528 $self->_fhcache( File::Spec->catfile($self->{dirname}, $file));
845             } or $self->throw( "Can't open file $file" );
846             }
847              
848              
849             sub _fhcache {
850 48     48   94 my ($self, $path) = @_;
851 48 100       102 if (!$self->{fhcache}{$path}) {
852 9 50       29 if ($self->{curopen} >= $self->{maxopen}) {
853 0         0 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
854 0         0 keys %{$self->{fhcache}};
  0         0  
855 0         0 splice(@lru, $self->{maxopen} / 3);
856 0         0 $self->{curopen} -= @lru;
857 0         0 for (@lru) {
858 0         0 delete $self->{fhcache}{$_};
859             }
860             }
861 9   50     50 $self->{fhcache}{$path} = IO::File->new($path) || return;
862 9         648 binmode $self->{fhcache}{$path};
863 9         21 $self->{curopen}++;
864             }
865 48         85 $self->{cacheseq}{$path}++;
866 48         181 return $self->{fhcache}{$path};
867             }
868              
869              
870             #-------------------------------------------------------------
871             # Methods to store and retrieve data from indexed file
872             #
873              
874             =head2 offset
875              
876             Title : offset
877             Usage : my $offset = $db->offset($id);
878             Function: Get the offset of the indicated sequence from the beginning of the
879             file in which it is located. The offset points to the beginning of
880             the sequence, not the beginning of the header line.
881             Returns : String
882             Args : ID of sequence
883              
884             =cut
885              
886             sub offset {
887 1     1 1 2 my ($self, $id) = @_;
888 1 50       5 $self->throw('Need to provide a sequence ID') if not defined $id;
889 1 50       18 my $offset = $self->{offsets}{$id} or return;
890 1         3 return (&{$self->{unpackmeth}}($offset))[0];
  1         4  
891             }
892              
893              
894             =head2 strlen
895              
896             Title : strlen
897             Usage : my $length = $db->strlen($id);
898             Function: Get the number of characters in the sequence string.
899             Returns : Integer
900             Args : ID of sequence
901              
902             =cut
903              
904             sub strlen {
905 16     16 1 19 my ($self, $id) = @_;
906 16 50       25 $self->throw('Need to provide a sequence ID') if not defined $id;
907 16 50       83 my $offset = $self->{offsets}{$id} or return;
908 16         30 return (&{$self->{unpackmeth}}($offset))[1];
  16         24  
909             }
910              
911              
912             =head2 length
913              
914             Title : length
915             Usage : my $length = $db->length($id);
916             Function: Get the number of residues of the sequence.
917             Returns : Integer
918             Args : ID of sequence
919              
920             =cut
921              
922             sub length {
923 30     30 1 65 my ($self, $id) = @_;
924 30 50       70 $self->throw('Need to provide a sequence ID') if not defined $id;
925 30 100       273 my $offset = $self->{offsets}{$id} or return;
926 29         61 return (&{$self->{unpackmeth}}($offset))[2];
  29         59  
927             }
928              
929              
930             =head2 linelen
931              
932             Title : linelen
933             Usage : my $linelen = $db->linelen($id);
934             Function: Get the length of the line for this sequence.
935             Returns : Integer
936             Args : ID of sequence
937              
938             =cut
939              
940             sub linelen {
941 0     0 1 0 my ($self, $id) = @_;
942 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
943 0 0       0 my $offset = $self->{offsets}{$id} or return;
944 0         0 return (&{$self->{unpackmeth}}($offset))[3];
  0         0  
945             }
946              
947              
948             =head2 headerlen
949              
950             Title : headerlen
951             Usage : my $length = $db->headerlen($id);
952             Function: Get the length of the header line for the indicated sequence.
953             Returns : Integer
954             Args : ID of sequence
955              
956             =cut
957              
958             sub headerlen {
959 0     0 1 0 my ($self, $id) = @_;
960 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
961 0 0       0 my $offset = $self->{offsets}{$id} or return;
962 0         0 return (&{$self->{unpackmeth}}($offset))[4];
  0         0  
963             }
964              
965              
966             =head2 header_offset
967              
968             Title : header_offset
969             Usage : my $offset = $db->header_offset($id);
970             Function: Get the offset of the header line for the indicated sequence from
971             the beginning of the file in which it is located.
972             Returns : String
973             Args : ID of sequence
974              
975             =cut
976              
977             sub header_offset {
978 0     0 1 0 my ($self, $id) = @_;
979 0 0       0 $self->throw('Need to provide a sequence ID') if not defined $id;
980 0 0       0 return if not $self->{offsets}{$id};
981 0         0 return $self->offset($id) - $self->headerlen($id);
982             }
983              
984              
985             =head2 alphabet
986              
987             Title : alphabet
988             Usage : my $alphabet = $db->alphabet($id);
989             Function: Get the molecular type of the indicated sequence: dna, rna or protein
990             Returns : String
991             Args : ID of sequence
992              
993             =cut
994              
995             sub alphabet {
996 13     13 1 30 my ($self, $id) = @_;
997 13 50       31 $self->throw('Need to provide a sequence ID') if not defined $id;
998 13 50       85 my $offset = $self->{offsets}{$id} or return;
999 13         29 my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
  13         28  
1000 13 100       89 return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna'
    100          
    100          
1001             : $alphabet == Bio::DB::IndexedBase::RNA ? 'rna'
1002             : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
1003             : '';
1004             }
1005              
1006              
1007             =head2 file
1008              
1009             Title : file
1010             Usage : my $file = $db->file($id);
1011             Function: Get the the name of the file in which the indicated sequence can be
1012             found.
1013             Returns : String
1014             Args : ID of sequence
1015              
1016             =cut
1017              
1018             sub file {
1019 59     59 1 111 my ($self, $id) = @_;
1020 59 50       123 $self->throw('Need to provide a sequence ID') if not defined $id;
1021 59 100       424 my $offset = $self->{offsets}{$id} or return;
1022 58         131 return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
  58         110  
1023             }
1024              
1025              
1026             sub _fileno2path {
1027 58     58   103 my ($self, $fileno) = @_;
1028 58         225 return $self->{fileno2path}->[$fileno];
1029             }
1030              
1031              
1032             sub _path2fileno {
1033 125     125   319 my ($self, $path) = @_;
1034 125 100       409 if ( not exists $self->{filepath2no}->{$path} ) {
1035 63         184 my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
1036 63         111 $self->{fileno2path}->[$fileno] = $path; # Save path
1037             }
1038 125         247 return $self->{filepath2no}->{$path};
1039              
1040             }
1041              
1042              
1043             sub _packSmall {
1044 3052     3052   11033 return pack STRUCT, @_;
1045             }
1046              
1047              
1048             sub _packBig {
1049 0     0   0 return pack STRUCTBIG, @_;
1050             }
1051              
1052              
1053             sub _unpackSmall {
1054 209     209   1014 return unpack STRUCT, shift;
1055             }
1056              
1057              
1058             sub _unpackBig {
1059 0     0   0 return unpack STRUCTBIG, shift;
1060             }
1061              
1062              
1063             sub _set_pack_method {
1064             # Determine whether to use 32 or 64 bit integers for the given files.
1065 20     20   35 my $self = shift;
1066             # Find the maximum file size:
1067 20         49 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
  88         106  
  63         368  
1068 20         35 my $fourGB = (2 ** 32) - 1;
1069              
1070 20 50       46 if ($maxsize > $fourGB) {
1071             # At least one file exceeds 4Gb - we will need to use 64 bit ints
1072 0         0 $self->{packmeth} = \&_packBig;
1073 0         0 $self->{unpackmeth} = \&_unpackBig;
1074             } else {
1075 20         50 $self->{packmeth} = \&_packSmall;
1076 20         50 $self->{unpackmeth} = \&_unpackSmall;
1077             }
1078 20         35 return 1;
1079             }
1080              
1081              
1082             #-------------------------------------------------------------
1083             # Tied hash logic
1084             #
1085              
1086             sub TIEHASH {
1087 2     2   42 return shift->new(@_);
1088             }
1089              
1090              
1091             sub FETCH {
1092 3     3   64 return shift->subseq(@_);
1093             }
1094              
1095              
1096             sub STORE {
1097 0     0   0 shift->throw("Read-only database");
1098             }
1099              
1100              
1101             sub DELETE {
1102 0     0   0 shift->throw("Read-only database");
1103             }
1104              
1105              
1106             sub CLEAR {
1107 0     0   0 shift->throw("Read-only database");
1108             }
1109              
1110              
1111             sub EXISTS {
1112 1     1   8 return defined shift->offset(@_);
1113             }
1114              
1115              
1116             sub FIRSTKEY {
1117 4     4   6 return tied(%{shift->{offsets}})->FIRSTKEY(@_);
  4         41  
1118             }
1119              
1120              
1121             sub NEXTKEY {
1122 9     9   11 return tied(%{shift->{offsets}})->NEXTKEY(@_);
  9         44  
1123             }
1124              
1125              
1126             sub DESTROY {
1127 27     27   2143 my $self = shift;
1128              
1129             # Close filehandles
1130 27         48 while (my ($file, $fh) = each %{ $self->{fhcache} }) {
  37         260  
1131 10 50       24 if (defined $fh) {
1132 10         62 $fh->close;
1133             }
1134             }
1135 27         98 $self->_close_index($self->{offsets});
1136              
1137 27 100 100     115 if ( $self->{clean} || $self->{indexing} ) {
1138             # Indexing aborted or cleaning requested. Delete the index file.
1139 6         10 my $index = $self->{index_name};
1140              
1141             # Tied-hash in Strawberry Perl creates "$file.index"
1142 6 50       42 unlink $index if -e $index;
1143             # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
1144 6 100       171 unlink "$index.dir" if -e "$index.dir";
1145 6 100       157 unlink "$index.pag" if -e "$index.pag";
1146             }
1147 27         262 return 1;
1148             }
1149              
1150              
1151             #-------------------------------------------------------------
1152             # stream-based access to the database
1153             #
1154              
1155             package Bio::DB::Indexed::Stream;
1156 2     2   15 use base qw(Tie::Handle Bio::DB::SeqI);
  2         4  
  2         630  
1157              
1158              
1159             sub new {
1160 4     4   7 my ($class, $db) = @_;
1161 4         15 my $key = $db->FIRSTKEY;
1162 4         26 return bless {
1163             db => $db,
1164             key => $key
1165             }, $class;
1166             }
1167              
1168             sub next_seq {
1169 10     10   17 my $self = shift;
1170 10         13 my ($key, $db) = @{$self}{'key', 'db'};
  10         21  
1171 10 100       24 return if not defined $key;
1172 9         18 my $value = $db->get_Seq_by_id($key);
1173 9         29 $self->{key} = $db->NEXTKEY($key);
1174 9         25 return $value;
1175             }
1176              
1177             sub TIEHANDLE {
1178 1     1   4 my ($class, $db) = @_;
1179 1         3 return $class->new($db);
1180             }
1181              
1182             sub READLINE {
1183 1     1   2 my $self = shift;
1184 1   50     3 return $self->next_seq || undef;
1185             }
1186              
1187              
1188             1;