File Coverage

Bio/DB/Fasta.pm
Criterion Covered Total %
statement 132 139 94.9
branch 29 40 72.5
condition 15 21 71.4
subroutine 22 24 91.6
pod 1 2 50.0
total 199 226 88.0


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Fasta
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7              
8             =head1 NAME
9              
10             Bio::DB::Fasta - Fast indexed access to fasta files
11              
12             =head1 SYNOPSIS
13              
14             use Bio::DB::Fasta;
15              
16             # Create database from a directory of Fasta files
17             my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18             my @ids = $db->get_all_primary_ids;
19              
20             # Simple access
21             my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22             my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23             my $length = $db->length('CHROMOSOME_I');
24             my $header = $db->header('CHROMOSOME_I');
25             my $alphabet = $db->alphabet('CHROMOSOME_I');
26              
27             # Access to sequence objects. See Bio::PrimarySeqI.
28             my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29             my $seqstr = $seq->seq;
30             my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31             my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32             my $length = $seq->length;
33              
34             # Loop through sequence objects
35             my $stream = $db->get_PrimarySeq_stream;
36             while (my $seq = $stream->next_seq) {
37             # Bio::PrimarySeqI stuff
38             }
39              
40             # Filehandle access
41             my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42             while (my $seq = <$fh>) {
43             # Bio::PrimarySeqI stuff
44             }
45              
46             # Tied hash access
47             tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48             print $sequences{'CHROMOSOME_I:1,20000'};
49              
50             =head1 DESCRIPTION
51              
52             Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53             or a directory of files. It provides persistent random access to each sequence
54             entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55             subsequences within each entry, allowing you to retrieve portions of very large
56             sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57             based on Bio::DB::IndexedBase. See this module's documentation for details.
58              
59             The Fasta files may contain any combination of nucleotide and protein sequences;
60             during indexing the module guesses the molecular type. Entries may have any line
61             length up to 65,536 characters, and different line lengths are allowed in the
62             same file. However, within a sequence entry, all lines must be the same length
63             except for the last. An error will be thrown if this is not the case.
64              
65             The module uses /^E(\S+)/ to extract the primary ID of each sequence
66             from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67             routine to reversibly modify this primary ID, e.g. if you wish to extract a
68             specific portion of the gi|gb|abc|xyz GenBank IDs.
69              
70             =head1 DATABASE CREATION AND INDEXING
71              
72             The object-oriented constructor is new(), the filehandle constructor is newFh()
73             and the tied hash constructor is tie(). They all allow one to index a single Fasta
74             file, several files, or a directory of files. See Bio::DB::IndexedBase.
75              
76             =head1 SEE ALSO
77              
78             L
79              
80             L
81              
82             L
83              
84             =head1 AUTHOR
85              
86             Lincoln Stein Elstein@cshl.orgE.
87              
88             Copyright (c) 2001 Cold Spring Harbor Laboratory.
89              
90             This library is free software; you can redistribute it and/or modify
91             it under the same terms as Perl itself. See DISCLAIMER.txt for
92             disclaimers of warranty.
93              
94             =head1 APPENDIX
95              
96             The rest of the documentation details each of the object
97             methods. Internal methods are usually preceded with a _
98              
99             For BioPerl-style access, the following methods are provided:
100              
101             =head2 get_Seq_by_id
102              
103             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104             Usage : my $seq = $db->get_Seq_by_id($id);
105             Function: Given an ID, fetch the corresponding sequence from the database.
106             Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107             Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108             only load the sequence string into memory when requested using seq().
109             See L for methods provided by the sequence objects
110             returned from get_Seq_by_id() and get_PrimarySeq_stream().
111             Args : ID
112              
113             =head2 get_PrimarySeq_stream
114              
115             Title : get_PrimarySeq_stream
116             Usage : my $stream = $db->get_PrimarySeq_stream();
117             Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118             single method, next_seq(). Each call to next_seq() returns a new
119             Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120             Returns : A Bio::DB::Indexed::Stream object
121             Args : None
122              
123             =head1
124              
125             For simple access, the following methods are provided:
126              
127             =cut
128              
129              
130             package Bio::DB::Fasta;
131              
132 1     1   5 use strict;
  1         1  
  1         24  
133 1     1   304 use IO::File;
  1         802  
  1         89  
134 1     1   6 use File::Spec;
  1         1  
  1         16  
135 1     1   290 use Bio::PrimarySeqI;
  1         3  
  1         32  
136              
137 1     1   6 use base qw(Bio::DB::IndexedBase);
  1         1  
  1         547  
138              
139             our $obj_class = 'Bio::PrimarySeq::Fasta';
140             our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141              
142              
143             =head2 new
144              
145             Title : new
146             Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147             Function: Initialize a new database object. When indexing a directory, files
148             ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149             Returns : A new Bio::DB::Fasta object.
150             Args : A single file, or path to dir, or arrayref of files
151             Optional arguments: see Bio::DB::IndexedBase
152              
153             =cut
154              
155              
156             sub _calculate_offsets {
157             # Bio::DB::IndexedBase calls this to calculate offsets
158 53     53   201 my ($self, $fileno, $file, $offsets) = @_;
159              
160 53 50       587 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161 53         5562 binmode $fh;
162 53 50       183 warn "Indexing $file\n" if $self->{debug};
163 53         119 my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164             $last_line, %offsets);
165 53         113 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166              
167 53         104 my $termination_length = $self->{termination_length};
168 53         849 while (my $line = <$fh>) {
169             # Account for crlf-terminated Windows files
170 36648 100       81405 if (index($line, '>') == 0) {
    100          
171 3009 100       7908 if ($line =~ /^>(\S+)/) {
172             print STDERR "Indexed $count sequences...\n"
173 3008 50 33     5753 if $self->{debug} && (++$count%1000) == 0;
174            
175             # please, do not enforce arbitrary line length requirements.
176             # It's good practice but not enforced.
177            
178             #$self->_check_linelength($linelen);
179 3008         4160 my $pos = tell($fh);
180 3008 100       4337 if (@ids) {
181 2956         3537 my $strlen = $pos - $offset - length($line);
182 2956         3178 $strlen -= $termination_length * $seq_lines;
183 2956         2731 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
  2956         5689  
184             $linelen, $headerlen, $alphabet, $fileno);
185 2956         3796 $alphabet = Bio::DB::IndexedBase::NA;
186 2956         3703 for my $id (@ids) {
187 2962         39350 $offsets->{$id} = $ppos;
188             }
189             }
190 3008         7180 @ids = $self->_makeid($line);
191 3008         4689 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192 3008         4045 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193             } else {
194             # Catch bad header lines, bug 3172
195 1         17 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
196             }
197             } elsif ($line !~ /\S/) {
198             # Skip blank line
199 89         86 $blank_lines++;
200 89         268 next;
201             } else {
202             # Need to check every line :(
203 33550         34650 $l3_len = $l2_len;
204 33550         28514 $l2_len = $l_len;
205 33550         27020 $l_len = length $line;
206 33550         27113 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207 33550 50 66     87276 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
      66        
208 0         0 my $fap = substr($line, 0, 20)."..";
209 0         0 $self->throw("Each line of the fasta entry must be the same ".
210             "length except the last. Line above #$. '$fap' is $l2_len".
211             " != $l3_len chars.");
212             }
213 33550 100       40878 if ($blank_lines) {
214             # Blank lines not allowed in entry
215 1         7 $self->throw("Blank lines can only precede header lines, ".
216             "found preceding line #$.");
217             }
218             }
219 33549   66     42208 $linelen ||= length $line;
220 33549   66     41379 $alphabet ||= $self->_guess_alphabet($line);
221 33549         28152 $seq_lines++;
222             }
223 36557         74805 $last_line = $line;
224             }
225              
226             # Process last entry
227 51         322 $self->_check_linelength($linelen);
228 51         142 my $pos = tell $fh;
229 51 50       149 if (@ids) {
230 51         112 my $strlen = $pos - $offset;
231 51 100       120 if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232 11         26 $strlen = 0;
233             } else {
234 40 50       242 if ($last_line !~ /\s$/) {
235 0         0 $seq_lines--;
236             }
237 40         93 $strlen -= $termination_length * $seq_lines;
238             }
239 51         116 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
  51         174  
240             $headerlen, $alphabet, $fileno);
241 51         133 for my $id (@ids) {
242 52         859 $offsets->{$id} = $ppos;
243             }
244             }
245              
246 51         1534 return \%offsets;
247             }
248              
249             =head2 seq
250              
251             Title : seq, sequence, subseq
252             Usage : # Entire sequence string
253             my $seqstr = $db->seq($id);
254             # Subsequence
255             my $subseqstr = $db->seq($id, $start, $stop, $strand);
256             # or...
257             my $subseqstr = $db->seq($compound_id);
258             Function: Get a subseq of a sequence from the database. For your convenience,
259             the sequence to extract can be specified with any of the following
260             compound IDs:
261             $db->seq("$id:$start,$stop")
262             $db->seq("$id:$start..$stop")
263             $db->seq("$id:$start-$stop")
264             $db->seq("$id:$start,$stop/$strand")
265             $db->seq("$id:$start..$stop/$strand")
266             $db->seq("$id:$start-$stop/$strand")
267             $db->seq("$id/$strand")
268             In the case of DNA or RNA sequence, if $stop is less than $start,
269             then the reverse complement of the sequence is returned. Avoid using
270             it if possible since this goes against Bio::Seq conventions.
271             Returns : A string
272             Args : ID of sequence to retrieve
273             or
274             Compound ID of subsequence to fetch
275             or
276             ID, optional start (defaults to 1), optional end (defaults to length
277             of sequence) and optional strand (defaults to 1).
278              
279             =cut
280              
281             sub subseq {
282 29     29 0 402 my ($self, $id, $start, $stop, $strand) = @_;
283 29 50       64 $self->throw('Need to provide a sequence ID') if not defined $id;
284 29         91 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285              
286 29         44 my $data;
287              
288 29 100       82 my $fh = $self->_fh($id) or return;
289 28         78 my $filestart = $self->_calc_offset($id, $start);
290 28         153 my $filestop = $self->_calc_offset($id, $stop );
291              
292 28         134 seek($fh, $filestart,0);
293 28         245 read($fh, $data, $filestop-$filestart+1);
294              
295 28         105 $data = Bio::DB::IndexedBase::_strip_crnl($data);
296              
297 28 100       76 if ($strand == -1) {
298             # Reverse-complement the sequence
299 7         32 $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300             }
301 28         173 return $data;
302             }
303              
304             *seq = *sequence = \&subseq;
305              
306              
307             =head2 length
308              
309             Title : length
310             Usage : my $length = $qualdb->length($id);
311             Function: Get the number of residues in the indicated sequence.
312             Returns : Number
313             Args : ID of entry
314              
315             =head2 header
316              
317             Title : header
318             Usage : my $header = $db->header($id);
319             Function: Get the header line (ID and description fields) of the specified
320             sequence.
321             Returns : String
322             Args : ID of sequence
323              
324             =cut
325              
326             sub header {
327 2     2 1 5 my ($self, $id) = @_;
328 2 50       9 $self->throw('Need to provide a sequence ID') if not defined $id;
329 2         8 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
  2         8  
330 2         7 $offset -= $headerlen;
331 2         7 my $data;
332 2 50       7 my $fh = $self->_fh($id) or return;
333 2         11 seek($fh, $offset, 0);
334 2         16 read($fh, $data, $headerlen);
335             # On Windows chomp remove '\n' but leaves '\r'
336             # when reading '\r\n' in binary mode
337 2         9 $data = Bio::DB::IndexedBase::_strip_crnl($data);
338 2         8 substr($data, 0, 1) = '';
339 2         7 return $data;
340             }
341              
342              
343             =head2 alphabet
344              
345             Title : alphabet
346             Usage : my $alphabet = $db->alphabet($id);
347             Function: Get the molecular type of the indicated sequence: dna, rna or protein
348             Returns : String
349             Args : ID of sequence
350              
351             =cut
352              
353              
354             #-------------------------------------------------------------
355             # Bio::PrimarySeqI compatibility
356             #
357             package Bio::PrimarySeq::Fasta;
358 1     1   7 use overload '""' => 'display_id';
  1         5  
  1         9  
359              
360 1     1   75 use base qw(Bio::Root::Root Bio::PrimarySeqI);
  1         55  
  1         508  
361              
362             sub new {
363 18     18   46 my ($class, @args) = @_;
364 18         77 my $self = $class->SUPER::new(@args);
365 18         85 my ($db, $id, $start, $stop) = $self->_rearrange(
366             [qw(DATABASE ID START STOP)],
367             @args);
368 18         42 $self->{db} = $db;
369 18         32 $self->{id} = $id;
370 18   100     61 $self->{stop} = $stop || $db->length($id);
371 18   100     93 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372 18         74 return $self;
373             }
374              
375             sub fetch_sequence {
376 0     0   0 return shift->seq(@_);
377             }
378              
379             sub seq {
380 4     4   9 my $self = shift;
381 4         13 return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382             }
383              
384             sub subseq {
385 1     1   2 my $self = shift;
386 1         4 return $self->trunc(@_)->seq();
387             }
388              
389             sub trunc {
390             # Override Bio::PrimarySeqI trunc() method. This way, we create an object
391             # that does not store the sequence in memory.
392 2     2   3 my ($self, $start, $stop) = @_;
393 2 50       7 $self->throw("Stop cannot be smaller than start") if $stop < $start;
394 2 50       15 if ($self->{start} <= $self->{stop}) {
395 2         5 $start = $self->{start}+$start-1;
396 2         3 $stop = $self->{start}+$stop-1;
397             } else {
398 0         0 $start = $self->{start}-($start-1);
399 0         0 $stop = $self->{start}-($stop-1);
400             }
401 2         7 return $self->new( $self->{db}, $self->{id}, $start, $stop );
402             }
403              
404             sub is_circular {
405 1     1   3 my $self = shift;
406 1         5 return $self->{is_circular};
407             }
408              
409             sub display_id {
410 9     9   854 my $self = shift;
411 9         26 return $self->{id};
412             }
413              
414             sub accession_number {
415 1     1   2 my $self = shift;
416 1         8 return 'unknown';
417             }
418              
419             sub primary_id {
420             # Following Bio::PrimarySeqI, since this sequence has no accession number,
421             # its primary_id should be a stringified memory location.
422 1     1   4 my $self = shift;
423 1         14 return overload::StrVal($self);
424             }
425              
426             sub can_call_new {
427 0     0   0 return 0;
428             }
429              
430             sub alphabet {
431 1     1   628 my $self = shift;
432 1         8 return $self->{db}->alphabet($self->{id});
433             }
434              
435             sub revcom {
436             # Override Bio::PrimarySeqI revcom() with optimized method.
437 1     1   3 my $self = shift;
438 1         2 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
  1         5  
439             }
440              
441             sub length {
442             # Get length from sequence location, not the sequence string (too expensive)
443 2     2   4 my $self = shift;
444             return $self->{start} < $self->{stop} ?
445             $self->{stop} - $self->{start} + 1 :
446 2 100       14 $self->{start} - $self->{stop} + 1 ;
447             }
448              
449             sub description {
450 1     1   2 my $self = shift;
451 1         5 my $header = $self->{'db'}->header($self->{id});
452             # Remove the ID from the header
453 1         14 return (split(/\s+/, $header, 2))[1];
454             }
455             *desc = \&description;
456              
457             1;