File Coverage

Bio/DB/Fasta.pm
Criterion Covered Total %
statement 132 139 94.9
branch 29 40 72.5
condition 14 21 66.6
subroutine 22 24 91.6
pod 1 2 50.0
total 198 226 87.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Fasta
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7              
8             =head1 NAME
9              
10             Bio::DB::Fasta - Fast indexed access to fasta files
11              
12             =head1 SYNOPSIS
13              
14             use Bio::DB::Fasta;
15              
16             # Create database from a directory of Fasta files
17             my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18             my @ids = $db->get_all_primary_ids;
19              
20             # Simple access
21             my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22             my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23             my $length = $db->length('CHROMOSOME_I');
24             my $header = $db->header('CHROMOSOME_I');
25             my $alphabet = $db->alphabet('CHROMOSOME_I');
26              
27             # Access to sequence objects. See Bio::PrimarySeqI.
28             my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29             my $seqstr = $seq->seq;
30             my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31             my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32             my $length = $seq->length;
33              
34             # Loop through sequence objects
35             my $stream = $db->get_PrimarySeq_stream;
36             while (my $seq = $stream->next_seq) {
37             # Bio::PrimarySeqI stuff
38             }
39              
40             # Filehandle access
41             my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42             while (my $seq = <$fh>) {
43             # Bio::PrimarySeqI stuff
44             }
45              
46             # Tied hash access
47             tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48             print $sequences{'CHROMOSOME_I:1,20000'};
49              
50             =head1 DESCRIPTION
51              
52             Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53             or a directory of files. It provides persistent random access to each sequence
54             entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55             subsequences within each entry, allowing you to retrieve portions of very large
56             sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57             based on Bio::DB::IndexedBase. See this module's documentation for details.
58              
59             The Fasta files may contain any combination of nucleotide and protein sequences;
60             during indexing the module guesses the molecular type. Entries may have any line
61             length up to 65,536 characters, and different line lengths are allowed in the
62             same file. However, within a sequence entry, all lines must be the same length
63             except for the last. An error will be thrown if this is not the case.
64              
65             The module uses /^E(\S+)/ to extract the primary ID of each sequence
66             from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67             routine to reversibly modify this primary ID, e.g. if you wish to extract a
68             specific portion of the gi|gb|abc|xyz GenBank IDs.
69              
70             =head1 DATABASE CREATION AND INDEXING
71              
72             The object-oriented constructor is new(), the filehandle constructor is newFh()
73             and the tied hash constructor is tie(). They all allow to index a single Fasta
74             file, several files, or a directory of files. See Bio::DB::IndexedBase.
75              
76             =head1 SEE ALSO
77              
78             L
79              
80             L
81              
82             L
83              
84             =head1 AUTHOR
85              
86             Lincoln Stein Elstein@cshl.orgE.
87              
88             Copyright (c) 2001 Cold Spring Harbor Laboratory.
89              
90             This library is free software; you can redistribute it and/or modify
91             it under the same terms as Perl itself. See DISCLAIMER.txt for
92             disclaimers of warranty.
93              
94             =head1 APPENDIX
95              
96             The rest of the documentation details each of the object
97             methods. Internal methods are usually preceded with a _
98              
99             For BioPerl-style access, the following methods are provided:
100              
101             =head2 get_Seq_by_id
102              
103             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104             Usage : my $seq = $db->get_Seq_by_id($id);
105             Function: Given an ID, fetch the corresponding sequence from the database.
106             Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107             Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108             only load the sequence string into memory when requested using seq().
109             See L for methods provided by the sequence objects
110             returned from get_Seq_by_id() and get_PrimarySeq_stream().
111             Args : ID
112              
113             =head2 get_PrimarySeq_stream
114              
115             Title : get_PrimarySeq_stream
116             Usage : my $stream = $db->get_PrimarySeq_stream();
117             Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118             single method, next_seq(). Each call to next_seq() returns a new
119             Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120             Returns : A Bio::DB::Indexed::Stream object
121             Args : None
122              
123             =head1
124              
125             For simple access, the following methods are provided:
126              
127             =cut
128              
129              
130             package Bio::DB::Fasta;
131              
132 1     1   3 use strict;
  1         1  
  1         24  
133 1     1   384 use IO::File;
  1         675  
  1         85  
134 1     1   5 use File::Spec;
  1         1  
  1         13  
135 1     1   318 use Bio::PrimarySeqI;
  1         2  
  1         25  
136              
137 1     1   5 use base qw(Bio::DB::IndexedBase);
  1         2  
  1         417  
138              
139             our $obj_class = 'Bio::PrimarySeq::Fasta';
140             our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141              
142              
143             =head2 new
144              
145             Title : new
146             Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147             Function: Initialize a new database object. When indexing a directory, files
148             ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149             Returns : A new Bio::DB::Fasta object.
150             Args : A single file, or path to dir, or arrayref of files
151             Optional arguments: see Bio::DB::IndexedBase
152              
153             =cut
154              
155              
156             sub _calculate_offsets {
157             # Bio::DB::IndexedBase calls this to calculate offsets
158 53     53   66 my ($self, $fileno, $file, $offsets) = @_;
159              
160 53 50       219 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161 53         2565 binmode $fh;
162 53 50       112 warn "Indexing $file\n" if $self->{debug};
163 53         41 my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164             $last_line, %offsets);
165 53         59 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166              
167 53         55 my $termination_length = $self->{termination_length};
168 53         576 while (my $line = <$fh>) {
169             # Account for crlf-terminated Windows files
170 36648 100       67818 if (index($line, '>') == 0) {
    100          
171 3009 100       6825 if ($line =~ /^>(\S+)/) {
172             print STDERR "Indexed $count sequences...\n"
173 3008 50 33     4998 if $self->{debug} && (++$count%1000) == 0;
174            
175             # please, do not enforce arbitrary line length requirements.
176             # It's good practice but not enforced.
177            
178             #$self->_check_linelength($linelen);
179 3008         2950 my $pos = tell($fh);
180 3008 100       3914 if (@ids) {
181 2956         2736 my $strlen = $pos - $offset - length($line);
182 2956         2425 $strlen -= $termination_length * $seq_lines;
183 2956         2302 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
  2956         5212  
184             $linelen, $headerlen, $alphabet, $fileno);
185 2956         2530 $alphabet = Bio::DB::IndexedBase::NA;
186 2956         3023 for my $id (@ids) {
187 2962         34889 $offsets->{$id} = $ppos;
188             }
189             }
190 3008         4992 @ids = $self->_makeid($line);
191 3008         3382 ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192 3008         3277 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193             } else {
194             # Catch bad header lines, bug 3172
195 1         15 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
196             }
197             } elsif ($line !~ /\S/) {
198             # Skip blank line
199 89         64 $blank_lines++;
200 89         225 next;
201             } else {
202             # Need to check every line :(
203 33550         20669 $l3_len = $l2_len;
204 33550         18499 $l2_len = $l_len;
205 33550         18733 $l_len = length $line;
206 33550         17768 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207 33550 50 66     105757 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
      66        
208 0         0 my $fap = substr($line, 0, 20)."..";
209 0         0 $self->throw("Each line of the fasta entry must be the same ".
210             "length except the last. Line above #$. '$fap' is $l2_len".
211             " != $l3_len chars.");
212             }
213 33550 100       36635 if ($blank_lines) {
214             # Blank lines not allowed in entry
215 1         6 $self->throw("Blank lines can only precede header lines, ".
216             "found preceding line #$.");
217             }
218             }
219 33549   66     42823 $linelen ||= length $line;
220 33549   66     36615 $alphabet ||= $self->_guess_alphabet($line);
221 33549         20213 $seq_lines++;
222             }
223 36557         62412 $last_line = $line;
224             }
225              
226             # Process last entry
227 51         107 $self->_check_linelength($linelen);
228 51         51 my $pos = tell $fh;
229 51 50       69 if (@ids) {
230 51         47 my $strlen = $pos - $offset;
231 51 100       56 if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232 11         14 $strlen = 0;
233             } else {
234 40 50       124 if ($last_line !~ /\s$/) {
235 0         0 $seq_lines--;
236             }
237 40         43 $strlen -= $termination_length * $seq_lines;
238             }
239 51         47 my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
  51         92  
240             $headerlen, $alphabet, $fileno);
241 51         72 for my $id (@ids) {
242 52         552 $offsets->{$id} = $ppos;
243             }
244             }
245              
246 51         916 return \%offsets;
247             }
248              
249             =head2 seq
250              
251             Title : seq, sequence, subseq
252             Usage : # Entire sequence string
253             my $seqstr = $db->seq($id);
254             # Subsequence
255             my $subseqstr = $db->seq($id, $start, $stop, $strand);
256             # or...
257             my $subseqstr = $db->seq($compound_id);
258             Function: Get a subseq of a sequence from the database. For your convenience,
259             the sequence to extract can be specified with any of the following
260             compound IDs:
261             $db->seq("$id:$start,$stop")
262             $db->seq("$id:$start..$stop")
263             $db->seq("$id:$start-$stop")
264             $db->seq("$id:$start,$stop/$strand")
265             $db->seq("$id:$start..$stop/$strand")
266             $db->seq("$id:$start-$stop/$strand")
267             $db->seq("$id/$strand")
268             In the case of DNA or RNA sequence, if $stop is less than $start,
269             then the reverse complement of the sequence is returned. Avoid using
270             it if possible since this goes against Bio::Seq conventions.
271             Returns : A string
272             Args : ID of sequence to retrieve
273             or
274             Compound ID of subsequence to fetch
275             or
276             ID, optional start (defaults to 1), optional end (defaults to length
277             of sequence) and optional strand (defaults to 1).
278              
279             =cut
280              
281             sub subseq {
282 29     29 0 328 my ($self, $id, $start, $stop, $strand) = @_;
283 29 50       61 $self->throw('Need to provide a sequence ID') if not defined $id;
284 29         66 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285              
286 29         40 my $data;
287              
288 29 100       53 my $fh = $self->_fh($id) or return;
289 28         68 my $filestart = $self->_calc_offset($id, $start);
290 28         40 my $filestop = $self->_calc_offset($id, $stop );
291              
292 28         78 seek($fh, $filestart,0);
293 28         199 read($fh, $data, $filestop-$filestart+1);
294              
295 28         69 $data = Bio::DB::IndexedBase::_strip_crnl($data);
296              
297 28 100       47 if ($strand == -1) {
298             # Reverse-complement the sequence
299 7         20 $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300             }
301 28         98 return $data;
302             }
303              
304             *seq = *sequence = \&subseq;
305              
306              
307             =head2 length
308              
309             Title : length
310             Usage : my $length = $qualdb->length($id);
311             Function: Get the number of residues in the indicated sequence.
312             Returns : Number
313             Args : ID of entry
314              
315             =head2 header
316              
317             Title : header
318             Usage : my $header = $db->header($id);
319             Function: Get the header line (ID and description fields) of the specified
320             sequence.
321             Returns : String
322             Args : ID of sequence
323              
324             =cut
325              
326             sub header {
327 2     2 1 2 my ($self, $id) = @_;
328 2 50       6 $self->throw('Need to provide a sequence ID') if not defined $id;
329 2         6 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
  2         5  
330 2         3 $offset -= $headerlen;
331 2         2 my $data;
332 2 50       4 my $fh = $self->_fh($id) or return;
333 2         8 seek($fh, $offset, 0);
334 2         12 read($fh, $data, $headerlen);
335             # On Windows chomp remove '\n' but leaves '\r'
336             # when reading '\r\n' in binary mode
337 2         6 $data = Bio::DB::IndexedBase::_strip_crnl($data);
338 2         4 substr($data, 0, 1) = '';
339 2         5 return $data;
340             }
341              
342              
343             =head2 alphabet
344              
345             Title : alphabet
346             Usage : my $alphabet = $db->alphabet($id);
347             Function: Get the molecular type of the indicated sequence: dna, rna or protein
348             Returns : String
349             Args : ID of sequence
350              
351             =cut
352              
353              
354             #-------------------------------------------------------------
355             # Bio::PrimarySeqI compatibility
356             #
357             package Bio::PrimarySeq::Fasta;
358 1     1   9 use overload '""' => 'display_id';
  1         2  
  1         11  
359              
360 1     1   61 use base qw(Bio::Root::Root Bio::PrimarySeqI);
  1         1  
  1         477  
361              
362             sub new {
363 18     18   28 my ($class, @args) = @_;
364 18         50 my $self = $class->SUPER::new(@args);
365 18         55 my ($db, $id, $start, $stop) = $self->_rearrange(
366             [qw(DATABASE ID START STOP)],
367             @args);
368 18         31 $self->{db} = $db;
369 18         19 $self->{id} = $id;
370 18   100     46 $self->{stop} = $stop || $db->length($id);
371 18   66     60 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372 18         49 return $self;
373             }
374              
375             sub fetch_sequence {
376 0     0   0 return shift->seq(@_);
377             }
378              
379             sub seq {
380 4     4   6 my $self = shift;
381 4         13 return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382             }
383              
384             sub subseq {
385 1     1   2 my $self = shift;
386 1         4 return $self->trunc(@_)->seq();
387             }
388              
389             sub trunc {
390             # Override Bio::PrimarySeqI trunc() method. This way, we create an object
391             # that does not store the sequence in memory.
392 2     2   3 my ($self, $start, $stop) = @_;
393 2 50       5 $self->throw("Stop cannot be smaller than start") if $stop < $start;
394 2 50       8 if ($self->{start} <= $self->{stop}) {
395 2         4 $start = $self->{start}+$start-1;
396 2         3 $stop = $self->{start}+$stop-1;
397             } else {
398 0         0 $start = $self->{start}-($start-1);
399 0         0 $stop = $self->{start}-($stop-1);
400             }
401 2         5 return $self->new( $self->{db}, $self->{id}, $start, $stop );
402             }
403              
404             sub is_circular {
405 1     1   2 my $self = shift;
406 1         4 return $self->{is_circular};
407             }
408              
409             sub display_id {
410 9     9   639 my $self = shift;
411 9         23 return $self->{id};
412             }
413              
414             sub accession_number {
415 1     1   3 my $self = shift;
416 1         3 return 'unknown';
417             }
418              
419             sub primary_id {
420             # Following Bio::PrimarySeqI, since this sequence has no accession number,
421             # its primary_id should be a stringified memory location.
422 1     1   2 my $self = shift;
423 1         6 return overload::StrVal($self);
424             }
425              
426             sub can_call_new {
427 0     0   0 return 0;
428             }
429              
430             sub alphabet {
431 1     1   445 my $self = shift;
432 1         5 return $self->{db}->alphabet($self->{id});
433             }
434              
435             sub revcom {
436             # Override Bio::PrimarySeqI revcom() with optimized method.
437 1     1   2 my $self = shift;
438 1         2 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
  1         4  
439             }
440              
441             sub length {
442             # Get length from sequence location, not the sequence string (too expensive)
443 2     2   4 my $self = shift;
444             return $self->{start} < $self->{stop} ?
445             $self->{stop} - $self->{start} + 1 :
446 2 100       13 $self->{start} - $self->{stop} + 1 ;
447             }
448              
449             sub description {
450 1     1   1 my $self = shift;
451 1         4 my $header = $self->{'db'}->header($self->{id});
452             # Remove the ID from the header
453 1         9 return (split(/\s+/, $header, 2))[1];
454             }
455             *desc = \&description;
456              
457             1;