File Coverage

Bio/DB/Fasta.pm
Criterion Covered Total %
statement 132 139 94.9
branch 29 40 72.5
condition 14 21 66.6
subroutine 22 24 91.6
pod 1 2 50.0
total 198 226 87.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Fasta
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7              
8             =head1 NAME
9              
10             Bio::DB::Fasta - Fast indexed access to fasta files
11              
12             =head1 SYNOPSIS
13              
14             use Bio::DB::Fasta;
15              
16             # Create database from a directory of Fasta files
17             my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18             my @ids = $db->get_all_primary_ids;
19              
20             # Simple access
21             my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22             my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23             my $length = $db->length('CHROMOSOME_I');
24             my $header = $db->header('CHROMOSOME_I');
25             my $alphabet = $db->alphabet('CHROMOSOME_I');
26              
27             # Access to sequence objects. See Bio::PrimarySeqI.
28             my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29             my $seqstr = $seq->seq;
30             my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31             my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32             my $length = $seq->length;
33              
34             # Loop through sequence objects
35             my $stream = $db->get_PrimarySeq_stream;
36             while (my $seq = $stream->next_seq) {
37             # Bio::PrimarySeqI stuff
38             }
39              
40             # Filehandle access
41             my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42             while (my $seq = <$fh>) {
43             # Bio::PrimarySeqI stuff
44             }
45              
46             # Tied hash access
47             tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48             print $sequences{'CHROMOSOME_I:1,20000'};
49              
50             =head1 DESCRIPTION
51              
52             Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53             or a directory of files. It provides persistent random access to each sequence
54             entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55             subsequences within each entry, allowing you to retrieve portions of very large
56             sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57             based on Bio::DB::IndexedBase. See this module's documentation for details.
58              
59             The Fasta files may contain any combination of nucleotide and protein sequences;
60             during indexing the module guesses the molecular type. Entries may have any line
61             length up to 65,536 characters, and different line lengths are allowed in the
62             same file. However, within a sequence entry, all lines must be the same length
63             except for the last. An error will be thrown if this is not the case.
64              
65             The module uses /^E(\S+)/ to extract the primary ID of each sequence
66             from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67             routine to reversibly modify this primary ID, e.g. if you wish to extract a
68             specific portion of the gi|gb|abc|xyz GenBank IDs.
69              
70             =head1 DATABASE CREATION AND INDEXING
71              
72             The object-oriented constructor is new(), the filehandle constructor is newFh()
73             and the tied hash constructor is tie(). They all allow to index a single Fasta
74             file, several files, or a directory of files. See Bio::DB::IndexedBase.
75              
76             =head1 SEE ALSO
77              
78             L
79              
80             L
81              
82             L
83              
84             =head1 AUTHOR
85              
86             Lincoln Stein Elstein@cshl.orgE.
87              
88             Copyright (c) 2001 Cold Spring Harbor Laboratory.
89              
90             This library is free software; you can redistribute it and/or modify
91             it under the same terms as Perl itself. See DISCLAIMER.txt for
92             disclaimers of warranty.
93              
94             =head1 APPENDIX
95              
96             The rest of the documentation details each of the object
97             methods. Internal methods are usually preceded with a _
98              
99             For BioPerl-style access, the following methods are provided:
100              
101             =head2 get_Seq_by_id
102              
103             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104             Usage : my $seq = $db->get_Seq_by_id($id);
105             Function: Given an ID, fetch the corresponding sequence from the database.
106             Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107             Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108             only load the sequence string into memory when requested using seq().
109             See L for methods provided by the sequence objects
110             returned from get_Seq_by_id() and get_PrimarySeq_stream().
111             Args : ID
112              
113             =head2 get_PrimarySeq_stream
114              
115             Title : get_PrimarySeq_stream
116             Usage : my $stream = $db->get_PrimarySeq_stream();
117             Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118             single method, next_seq(). Each call to next_seq() returns a new
119             Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120             Returns : A Bio::DB::Indexed::Stream object
121             Args : None
122              
123             =head1
124              
125             For simple access, the following methods are provided:
126              
127             =cut
128              
129              
130             package Bio::DB::Fasta;
131              
132 1     1   3 use strict;
  1         1  
  1         23  
133 1     1   394 use IO::File;
  1         767  
  1         88  
134 1     1   5 use File::Spec;
  1         1  
  1         15  
135 1     1   331 use Bio::PrimarySeqI;
  1         2  
  1         29  
136              
137 1     1   5 use base qw(Bio::DB::IndexedBase);
  1         2  
  1         663  
138              
139             our $obj_class = 'Bio::PrimarySeq::Fasta';
140             our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141              
142              
143             =head2 new
144              
145             Title : new
146             Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147             Function: Initialize a new database object. When indexing a directory, files
148             ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149             Returns : A new Bio::DB::Fasta object.
150             Args : A single file, or path to dir, or arrayref of files
151             Optional arguments: see Bio::DB::IndexedBase
152              
153             =cut
154              
155              
156             sub _calculate_offsets {
157             # Bio::DB::IndexedBase calls this to calculate offsets
158 53     53   125 my ($self, $fileno, $file, $offsets) = @_;
159              
160 53 50       541 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161 53         5620 binmode $fh;
162 53 50       203 warn "Indexing $file\n" if $self->{debug};
163 53         70 my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164             $last_line, %offsets);
165 53         190 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166              
167 53         136 my $termination_length = $self->{termination_length};
168 53         895 while (my $line = <$fh>) {
169             # Account for crlf-terminated Windows files
170 36648 100       88521 if (index($line, '>') == 0) {
    100          
171 3009 100       8618 if ($line =~ /^>(\S+)/) {
172             print STDERR "Indexed $count sequences...\n"
173 3008 50 33     6240 if $self->{debug} && (++$count%1000) == 0;
174            
175             # please, do not enforce arbitrary line length requirements.
176             # It's good practice but not enforced.
177            
178             #$self->_check_linelength($linelen);
179 3008         3851 my $pos = tell($fh);
180 3008 100       5369 if (@ids) {
181 2956         3243 my $strlen = $pos - $offset - length($line);
182 2956         2687 $strlen -= $termination_length * $seq_lines;
183 2956         2336 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
  2956         7710  
184             $linelen, $headerlen, $alphabet, $fileno);
185 2956         3162 $alphabet = Bio::DB::IndexedBase::NA;
186 2956         3695 for my $id (@ids) {
187 2962         50274 $offsets->{$id} = $ppos;
188             }
189             }
190 3008         7141 @ids = $self->_makeid($line);
191 3008         3907 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192 3008         3563 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193             } else {
194             # Catch bad header lines, bug 3172
195 1         28 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
196             }
197             } elsif ($line !~ /\S/) {
198             # Skip blank line
199 89         74 $blank_lines++;
200 89         333 next;
201             } else {
202             # Need to check every line :(
203 33550         24154 $l3_len = $l2_len;
204 33550         20584 $l2_len = $l_len;
205 33550         20312 $l_len = length $line;
206 33550         21671 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207 33550 50 66     118855 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
      66        
208 0         0 my $fap = substr($line, 0, 20)."..";
209 0         0 $self->throw("Each line of the fasta entry must be the same ".
210             "length except the last. Line above #$. '$fap' is $l2_len".
211             " != $l3_len chars.");
212             }
213 33550 100       40920 if ($blank_lines) {
214             # Blank lines not allowed in entry
215 1         9 $self->throw("Blank lines can only precede header lines, ".
216             "found preceding line #$.");
217             }
218             }
219 33549   66     44425 $linelen ||= length $line;
220 33549   66     39936 $alphabet ||= $self->_guess_alphabet($line);
221 33549         22163 $seq_lines++;
222             }
223 36557         84267 $last_line = $line;
224             }
225              
226             # Process last entry
227 51         290 $self->_check_linelength($linelen);
228 51         106 my $pos = tell $fh;
229 51 50       138 if (@ids) {
230 51         82 my $strlen = $pos - $offset;
231 51 100       114 if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232 11         20 $strlen = 0;
233             } else {
234 40 50       240 if ($last_line !~ /\s$/) {
235 0         0 $seq_lines--;
236             }
237 40         82 $strlen -= $termination_length * $seq_lines;
238             }
239 51         70 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
  51         225  
240             $headerlen, $alphabet, $fileno);
241 51         130 for my $id (@ids) {
242 52         1020 $offsets->{$id} = $ppos;
243             }
244             }
245              
246 51         2395 return \%offsets;
247             }
248              
249             =head2 seq
250              
251             Title : seq, sequence, subseq
252             Usage : # Entire sequence string
253             my $seqstr = $db->seq($id);
254             # Subsequence
255             my $subseqstr = $db->seq($id, $start, $stop, $strand);
256             # or...
257             my $subseqstr = $db->seq($compound_id);
258             Function: Get a subseq of a sequence from the database. For your convenience,
259             the sequence to extract can be specified with any of the following
260             compound IDs:
261             $db->seq("$id:$start,$stop")
262             $db->seq("$id:$start..$stop")
263             $db->seq("$id:$start-$stop")
264             $db->seq("$id:$start,$stop/$strand")
265             $db->seq("$id:$start..$stop/$strand")
266             $db->seq("$id:$start-$stop/$strand")
267             $db->seq("$id/$strand")
268             In the case of DNA or RNA sequence, if $stop is less than $start,
269             then the reverse complement of the sequence is returned. Avoid using
270             it if possible since this goes against Bio::Seq conventions.
271             Returns : A string
272             Args : ID of sequence to retrieve
273             or
274             Compound ID of subsequence to fetch
275             or
276             ID, optional start (defaults to 1), optional end (defaults to length
277             of sequence) and optional strand (defaults to 1).
278              
279             =cut
280              
281             sub subseq {
282 29     29 0 841 my ($self, $id, $start, $stop, $strand) = @_;
283 29 50       69 $self->throw('Need to provide a sequence ID') if not defined $id;
284 29         85 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285              
286 29         44 my $data;
287              
288 29 100       81 my $fh = $self->_fh($id) or return;
289 28         99 my $filestart = $self->_calc_offset($id, $start);
290 28         63 my $filestop = $self->_calc_offset($id, $stop );
291              
292 28         106 seek($fh, $filestart,0);
293 28         280 read($fh, $data, $filestop-$filestart+1);
294              
295 28         86 $data = Bio::DB::IndexedBase::_strip_crnl($data);
296              
297 28 100       63 if ($strand == -1) {
298             # Reverse-complement the sequence
299 7         24 $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300             }
301 28         130 return $data;
302             }
303              
304             *seq = *sequence = \&subseq;
305              
306              
307             =head2 length
308              
309             Title : length
310             Usage : my $length = $qualdb->length($id);
311             Function: Get the number of residues in the indicated sequence.
312             Returns : Number
313             Args : ID of entry
314              
315             =head2 header
316              
317             Title : header
318             Usage : my $header = $db->header($id);
319             Function: Get the header line (ID and description fields) of the specified
320             sequence.
321             Returns : String
322             Args : ID of sequence
323              
324             =cut
325              
326             sub header {
327 2     2 1 6 my ($self, $id) = @_;
328 2 50       8 $self->throw('Need to provide a sequence ID') if not defined $id;
329 2         10 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
  2         8  
330 2         6 $offset -= $headerlen;
331 2         2 my $data;
332 2 50       8 my $fh = $self->_fh($id) or return;
333 2         9 seek($fh, $offset, 0);
334 2         16 read($fh, $data, $headerlen);
335             # On Windows chomp remove '\n' but leaves '\r'
336             # when reading '\r\n' in binary mode
337 2         6 $data = Bio::DB::IndexedBase::_strip_crnl($data);
338 2         7 substr($data, 0, 1) = '';
339 2         8 return $data;
340             }
341              
342              
343             =head2 alphabet
344              
345             Title : alphabet
346             Usage : my $alphabet = $db->alphabet($id);
347             Function: Get the molecular type of the indicated sequence: dna, rna or protein
348             Returns : String
349             Args : ID of sequence
350              
351             =cut
352              
353              
354             #-------------------------------------------------------------
355             # Bio::PrimarySeqI compatibility
356             #
357             package Bio::PrimarySeq::Fasta;
358 1     1   16 use overload '""' => 'display_id';
  1         2  
  1         19  
359              
360 1     1   120 use base qw(Bio::Root::Root Bio::PrimarySeqI);
  1         2  
  1         1016  
361              
362             sub new {
363 18     18   38 my ($class, @args) = @_;
364 18         64 my $self = $class->SUPER::new(@args);
365 18         97 my ($db, $id, $start, $stop) = $self->_rearrange(
366             [qw(DATABASE ID START STOP)],
367             @args);
368 18         36 $self->{db} = $db;
369 18         27 $self->{id} = $id;
370 18   100     56 $self->{stop} = $stop || $db->length($id);
371 18   66     87 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372 18         72 return $self;
373             }
374              
375             sub fetch_sequence {
376 0     0   0 return shift->seq(@_);
377             }
378              
379             sub seq {
380 4     4   7 my $self = shift;
381 4         20 return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382             }
383              
384             sub subseq {
385 1     1   3 my $self = shift;
386 1         5 return $self->trunc(@_)->seq();
387             }
388              
389             sub trunc {
390             # Override Bio::PrimarySeqI trunc() method. This way, we create an object
391             # that does not store the sequence in memory.
392 2     2   5 my ($self, $start, $stop) = @_;
393 2 50       7 $self->throw("Stop cannot be smaller than start") if $stop < $start;
394 2 50       13 if ($self->{start} <= $self->{stop}) {
395 2         5 $start = $self->{start}+$start-1;
396 2         4 $stop = $self->{start}+$stop-1;
397             } else {
398 0         0 $start = $self->{start}-($start-1);
399 0         0 $stop = $self->{start}-($stop-1);
400             }
401 2         8 return $self->new( $self->{db}, $self->{id}, $start, $stop );
402             }
403              
404             sub is_circular {
405 1     1   3 my $self = shift;
406 1         3 return $self->{is_circular};
407             }
408              
409             sub display_id {
410 9     9   798 my $self = shift;
411 9         30 return $self->{id};
412             }
413              
414             sub accession_number {
415 1     1   3 my $self = shift;
416 1         5 return 'unknown';
417             }
418              
419             sub primary_id {
420             # Following Bio::PrimarySeqI, since this sequence has no accession number,
421             # its primary_id should be a stringified memory location.
422 1     1   3 my $self = shift;
423 1         11 return overload::StrVal($self);
424             }
425              
426             sub can_call_new {
427 0     0   0 return 0;
428             }
429              
430             sub alphabet {
431 1     1   577 my $self = shift;
432 1         7 return $self->{db}->alphabet($self->{id});
433             }
434              
435             sub revcom {
436             # Override Bio::PrimarySeqI revcom() with optimized method.
437 1     1   3 my $self = shift;
438 1         2 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
  1         3  
439             }
440              
441             sub length {
442             # Get length from sequence location, not the sequence string (too expensive)
443 2     2   3 my $self = shift;
444             return $self->{start} < $self->{stop} ?
445             $self->{stop} - $self->{start} + 1 :
446 2 100       15 $self->{start} - $self->{stop} + 1 ;
447             }
448              
449             sub description {
450 1     1   3 my $self = shift;
451 1         8 my $header = $self->{'db'}->header($self->{id});
452             # Remove the ID from the header
453 1         14 return (split(/\s+/, $header, 2))[1];
454             }
455             *desc = \&description;
456              
457             1;