File Coverage

Bio/DB/Fasta.pm

Criterion	Covered	Total	%
statement	132	139	94.9
branch	29	40	72.5
condition	15	21	71.4
subroutine	22	24	91.6
pod	1	2	50.0
total	199	226	88.0

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Fasta
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7
8							=head1 NAME
9
10							Bio::DB::Fasta - Fast indexed access to fasta files
11
12							=head1 SYNOPSIS
13
14							use Bio::DB::Fasta;
15
16							# Create database from a directory of Fasta files
17							my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18							my @ids = $db->get_all_primary_ids;
19
20							# Simple access
21							my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22							my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23							my $length = $db->length('CHROMOSOME_I');
24							my $header = $db->header('CHROMOSOME_I');
25							my $alphabet = $db->alphabet('CHROMOSOME_I');
26
27							# Access to sequence objects. See Bio::PrimarySeqI.
28							my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29							my $seqstr = $seq->seq;
30							my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31							my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32							my $length = $seq->length;
33
34							# Loop through sequence objects
35							my $stream = $db->get_PrimarySeq_stream;
36							while (my $seq = $stream->next_seq) {
37							# Bio::PrimarySeqI stuff
38							}
39
40							# Filehandle access
41							my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42							while (my $seq = <$fh>) {
43							# Bio::PrimarySeqI stuff
44							}
45
46							# Tied hash access
47							tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48							print $sequences{'CHROMOSOME_I:1,20000'};
49
50							=head1 DESCRIPTION
51
52							Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53							or a directory of files. It provides persistent random access to each sequence
54							entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55							subsequences within each entry, allowing you to retrieve portions of very large
56							sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57							based on Bio::DB::IndexedBase. See this module's documentation for details.
58
59							The Fasta files may contain any combination of nucleotide and protein sequences;
60							during indexing the module guesses the molecular type. Entries may have any line
61							length up to 65,536 characters, and different line lengths are allowed in the
62							same file. However, within a sequence entry, all lines must be the same length
63							except for the last. An error will be thrown if this is not the case.
64
65							The module uses /^E(\S+)/ to extract the primary ID of each sequence
66							from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67							routine to reversibly modify this primary ID, e.g. if you wish to extract a
68							specific portion of the gi\|gb\|abc\|xyz GenBank IDs.
69
70							=head1 DATABASE CREATION AND INDEXING
71
72							The object-oriented constructor is new(), the filehandle constructor is newFh()
73							and the tied hash constructor is tie(). They all allow one to index a single Fasta
74							file, several files, or a directory of files. See Bio::DB::IndexedBase.
75
76							=head1 SEE ALSO
77
78							L
79
80							L
81
82							L
83
84							=head1 AUTHOR
85
86							Lincoln Stein Elstein@cshl.orgE.
87
88							Copyright (c) 2001 Cold Spring Harbor Laboratory.
89
90							This library is free software; you can redistribute it and/or modify
91							it under the same terms as Perl itself. See DISCLAIMER.txt for
92							disclaimers of warranty.
93
94							=head1 APPENDIX
95
96							The rest of the documentation details each of the object
97							methods. Internal methods are usually preceded with a _
98
99							For BioPerl-style access, the following methods are provided:
100
101							=head2 get_Seq_by_id
102
103							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104							Usage : my $seq = $db->get_Seq_by_id($id);
105							Function: Given an ID, fetch the corresponding sequence from the database.
106							Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107							Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108							only load the sequence string into memory when requested using seq().
109							See L for methods provided by the sequence objects
110							returned from get_Seq_by_id() and get_PrimarySeq_stream().
111							Args : ID
112
113							=head2 get_PrimarySeq_stream
114
115							Title : get_PrimarySeq_stream
116							Usage : my $stream = $db->get_PrimarySeq_stream();
117							Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118							single method, next_seq(). Each call to next_seq() returns a new
119							Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120							Returns : A Bio::DB::Indexed::Stream object
121							Args : None
122
123							=head1
124
125							For simple access, the following methods are provided:
126
127							=cut
128
129
130							package Bio::DB::Fasta;
131
132	1			1		5	use strict;
	1					1
	1					24
133	1			1		304	use IO::File;
	1					802
	1					89
134	1			1		6	use File::Spec;
	1					1
	1					16
135	1			1		290	use Bio::PrimarySeqI;
	1					3
	1					32
136
137	1			1		6	use base qw(Bio::DB::IndexedBase);
	1					1
	1					547
138
139							our $obj_class = 'Bio::PrimarySeq::Fasta';
140							our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141
142
143							=head2 new
144
145							Title : new
146							Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147							Function: Initialize a new database object. When indexing a directory, files
148							ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149							Returns : A new Bio::DB::Fasta object.
150							Args : A single file, or path to dir, or arrayref of files
151							Optional arguments: see Bio::DB::IndexedBase
152
153							=cut
154
155
156							sub _calculate_offsets {
157							# Bio::DB::IndexedBase calls this to calculate offsets
158	53			53		201	my ($self, $fileno, $file, $offsets) = @_;
159
160	53	50				587	my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161	53					5562	binmode $fh;
162	53	50				183	warn "Indexing $file\n" if $self->{debug};
163	53					119	my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164							$last_line, %offsets);
165	53					113	my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166
167	53					104	my $termination_length = $self->{termination_length};
168	53					849	while (my $line = <$fh>) {
169							# Account for crlf-terminated Windows files
170	36648	100				81405	if (index($line, '>') == 0) {
		100
171	3009	100				7908	if ($line =~ /^>(\S+)/) {
172							print STDERR "Indexed $count sequences...\n"
173	3008	50	33			5753	if $self->{debug} && (++$count%1000) == 0;
174
175							# please, do not enforce arbitrary line length requirements.
176							# It's good practice but not enforced.
177
178							#$self->_check_linelength($linelen);
179	3008					4160	my $pos = tell($fh);
180	3008	100				4337	if (@ids) {
181	2956					3537	my $strlen = $pos - $offset - length($line);
182	2956					3178	$strlen -= $termination_length * $seq_lines;
183	2956					2731	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
	2956					5689
184							$linelen, $headerlen, $alphabet, $fileno);
185	2956					3796	$alphabet = Bio::DB::IndexedBase::NA;
186	2956					3703	for my $id (@ids) {
187	2962					39350	$offsets->{$id} = $ppos;
188							}
189							}
190	3008					7180	@ids = $self->_makeid($line);
191	3008					4689	($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192	3008					4045	($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193							} else {
194							# Catch bad header lines, bug 3172
195	1					17	$self->throw("FASTA header doesn't match '>(\\S+)': $line");
196							}
197							} elsif ($line !~ /\S/) {
198							# Skip blank line
199	89					86	$blank_lines++;
200	89					268	next;
201							} else {
202							# Need to check every line :(
203	33550					34650	$l3_len = $l2_len;
204	33550					28514	$l2_len = $l_len;
205	33550					27020	$l_len = length $line;
206	33550					27113	if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207	33550	50	66			87276	if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
			66
208	0					0	my $fap = substr($line, 0, 20)."..";
209	0					0	$self->throw("Each line of the fasta entry must be the same ".
210							"length except the last. Line above #$. '$fap' is $l2_len".
211							" != $l3_len chars.");
212							}
213	33550	100				40878	if ($blank_lines) {
214							# Blank lines not allowed in entry
215	1					7	$self->throw("Blank lines can only precede header lines, ".
216							"found preceding line #$.");
217							}
218							}
219	33549		66			42208	$linelen \|\|= length $line;
220	33549		66			41379	$alphabet \|\|= $self->_guess_alphabet($line);
221	33549					28152	$seq_lines++;
222							}
223	36557					74805	$last_line = $line;
224							}
225
226							# Process last entry
227	51					322	$self->_check_linelength($linelen);
228	51					142	my $pos = tell $fh;
229	51	50				149	if (@ids) {
230	51					112	my $strlen = $pos - $offset;
231	51	100				120	if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232	11					26	$strlen = 0;
233							} else {
234	40	50				242	if ($last_line !~ /\s$/) {
235	0					0	$seq_lines--;
236							}
237	40					93	$strlen -= $termination_length * $seq_lines;
238							}
239	51					116	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
	51					174
240							$headerlen, $alphabet, $fileno);
241	51					133	for my $id (@ids) {
242	52					859	$offsets->{$id} = $ppos;
243							}
244							}
245
246	51					1534	return \%offsets;
247							}
248
249							=head2 seq
250
251							Title : seq, sequence, subseq
252							Usage : # Entire sequence string
253							my $seqstr = $db->seq($id);
254							# Subsequence
255							my $subseqstr = $db->seq($id, $start, $stop, $strand);
256							# or...
257							my $subseqstr = $db->seq($compound_id);
258							Function: Get a subseq of a sequence from the database. For your convenience,
259							the sequence to extract can be specified with any of the following
260							compound IDs:
261							$db->seq("$id:$start,$stop")
262							$db->seq("$id:$start..$stop")
263							$db->seq("$id:$start-$stop")
264							$db->seq("$id:$start,$stop/$strand")
265							$db->seq("$id:$start..$stop/$strand")
266							$db->seq("$id:$start-$stop/$strand")
267							$db->seq("$id/$strand")
268							In the case of DNA or RNA sequence, if $stop is less than $start,
269							then the reverse complement of the sequence is returned. Avoid using
270							it if possible since this goes against Bio::Seq conventions.
271							Returns : A string
272							Args : ID of sequence to retrieve
273							or
274							Compound ID of subsequence to fetch
275							or
276							ID, optional start (defaults to 1), optional end (defaults to length
277							of sequence) and optional strand (defaults to 1).
278
279							=cut
280
281							sub subseq {
282	29			29	0	402	my ($self, $id, $start, $stop, $strand) = @_;
283	29	50				64	$self->throw('Need to provide a sequence ID') if not defined $id;
284	29					91	($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285
286	29					44	my $data;
287
288	29	100				82	my $fh = $self->_fh($id) or return;
289	28					78	my $filestart = $self->_calc_offset($id, $start);
290	28					153	my $filestop = $self->_calc_offset($id, $stop );
291
292	28					134	seek($fh, $filestart,0);
293	28					245	read($fh, $data, $filestop-$filestart+1);
294
295	28					105	$data = Bio::DB::IndexedBase::_strip_crnl($data);
296
297	28	100				76	if ($strand == -1) {
298							# Reverse-complement the sequence
299	7					32	$data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300							}
301	28					173	return $data;
302							}
303
304							seq = sequence = \&subseq;
305
306
307							=head2 length
308
309							Title : length
310							Usage : my $length = $qualdb->length($id);
311							Function: Get the number of residues in the indicated sequence.
312							Returns : Number
313							Args : ID of entry
314
315							=head2 header
316
317							Title : header
318							Usage : my $header = $db->header($id);
319							Function: Get the header line (ID and description fields) of the specified
320							sequence.
321							Returns : String
322							Args : ID of sequence
323
324							=cut
325
326							sub header {
327	2			2	1	5	my ($self, $id) = @_;
328	2	50				9	$self->throw('Need to provide a sequence ID') if not defined $id;
329	2					8	my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
	2					8
330	2					7	$offset -= $headerlen;
331	2					7	my $data;
332	2	50				7	my $fh = $self->_fh($id) or return;
333	2					11	seek($fh, $offset, 0);
334	2					16	read($fh, $data, $headerlen);
335							# On Windows chomp remove '\n' but leaves '\r'
336							# when reading '\r\n' in binary mode
337	2					9	$data = Bio::DB::IndexedBase::_strip_crnl($data);
338	2					8	substr($data, 0, 1) = '';
339	2					7	return $data;
340							}
341
342
343							=head2 alphabet
344
345							Title : alphabet
346							Usage : my $alphabet = $db->alphabet($id);
347							Function: Get the molecular type of the indicated sequence: dna, rna or protein
348							Returns : String
349							Args : ID of sequence
350
351							=cut
352
353
354							#-------------------------------------------------------------
355							# Bio::PrimarySeqI compatibility
356							#
357							package Bio::PrimarySeq::Fasta;
358	1			1		7	use overload '""' => 'display_id';
	1					5
	1					9
359
360	1			1		75	use base qw(Bio::Root::Root Bio::PrimarySeqI);
	1					55
	1					508
361
362							sub new {
363	18			18		46	my ($class, @args) = @_;
364	18					77	my $self = $class->SUPER::new(@args);
365	18					85	my ($db, $id, $start, $stop) = $self->_rearrange(
366							[qw(DATABASE ID START STOP)],
367							@args);
368	18					42	$self->{db} = $db;
369	18					32	$self->{id} = $id;
370	18		100			61	$self->{stop} = $stop \|\| $db->length($id);
371	18		100			93	$self->{start} = $start \|\| ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372	18					74	return $self;
373							}
374
375							sub fetch_sequence {
376	0			0		0	return shift->seq(@_);
377							}
378
379							sub seq {
380	4			4		9	my $self = shift;
381	4					13	return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382							}
383
384							sub subseq {
385	1			1		2	my $self = shift;
386	1					4	return $self->trunc(@_)->seq();
387							}
388
389							sub trunc {
390							# Override Bio::PrimarySeqI trunc() method. This way, we create an object
391							# that does not store the sequence in memory.
392	2			2		3	my ($self, $start, $stop) = @_;
393	2	50				7	$self->throw("Stop cannot be smaller than start") if $stop < $start;
394	2	50				15	if ($self->{start} <= $self->{stop}) {
395	2					5	$start = $self->{start}+$start-1;
396	2					3	$stop = $self->{start}+$stop-1;
397							} else {
398	0					0	$start = $self->{start}-($start-1);
399	0					0	$stop = $self->{start}-($stop-1);
400							}
401	2					7	return $self->new( $self->{db}, $self->{id}, $start, $stop );
402							}
403
404							sub is_circular {
405	1			1		3	my $self = shift;
406	1					5	return $self->{is_circular};
407							}
408
409							sub display_id {
410	9			9		854	my $self = shift;
411	9					26	return $self->{id};
412							}
413
414							sub accession_number {
415	1			1		2	my $self = shift;
416	1					8	return 'unknown';
417							}
418
419							sub primary_id {
420							# Following Bio::PrimarySeqI, since this sequence has no accession number,
421							# its primary_id should be a stringified memory location.
422	1			1		4	my $self = shift;
423	1					14	return overload::StrVal($self);
424							}
425
426							sub can_call_new {
427	0			0		0	return 0;
428							}
429
430							sub alphabet {
431	1			1		628	my $self = shift;
432	1					8	return $self->{db}->alphabet($self->{id});
433							}
434
435							sub revcom {
436							# Override Bio::PrimarySeqI revcom() with optimized method.
437	1			1		3	my $self = shift;
438	1					2	return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
	1					5
439							}
440
441							sub length {
442							# Get length from sequence location, not the sequence string (too expensive)
443	2			2		4	my $self = shift;
444							return $self->{start} < $self->{stop} ?
445							$self->{stop} - $self->{start} + 1 :
446	2	100				14	$self->{start} - $self->{stop} + 1 ;
447							}
448
449							sub description {
450	1			1		2	my $self = shift;
451	1					5	my $header = $self->{'db'}->header($self->{id});
452							# Remove the ID from the header
453	1					14	return (split(/\s+/, $header, 2))[1];
454							}
455							*desc = \&description;
456
457							1;