File Coverage

Bio/DB/Fasta.pm

Criterion	Covered	Total	%
statement	132	139	94.9
branch	29	40	72.5
condition	14	21	66.6
subroutine	22	24	91.6
pod	1	2	50.0
total	198	226	87.6

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Fasta
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7
8							=head1 NAME
9
10							Bio::DB::Fasta - Fast indexed access to fasta files
11
12							=head1 SYNOPSIS
13
14							use Bio::DB::Fasta;
15
16							# Create database from a directory of Fasta files
17							my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18							my @ids = $db->get_all_primary_ids;
19
20							# Simple access
21							my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22							my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23							my $length = $db->length('CHROMOSOME_I');
24							my $header = $db->header('CHROMOSOME_I');
25							my $alphabet = $db->alphabet('CHROMOSOME_I');
26
27							# Access to sequence objects. See Bio::PrimarySeqI.
28							my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29							my $seqstr = $seq->seq;
30							my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31							my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32							my $length = $seq->length;
33
34							# Loop through sequence objects
35							my $stream = $db->get_PrimarySeq_stream;
36							while (my $seq = $stream->next_seq) {
37							# Bio::PrimarySeqI stuff
38							}
39
40							# Filehandle access
41							my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42							while (my $seq = <$fh>) {
43							# Bio::PrimarySeqI stuff
44							}
45
46							# Tied hash access
47							tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48							print $sequences{'CHROMOSOME_I:1,20000'};
49
50							=head1 DESCRIPTION
51
52							Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53							or a directory of files. It provides persistent random access to each sequence
54							entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55							subsequences within each entry, allowing you to retrieve portions of very large
56							sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57							based on Bio::DB::IndexedBase. See this module's documentation for details.
58
59							The Fasta files may contain any combination of nucleotide and protein sequences;
60							during indexing the module guesses the molecular type. Entries may have any line
61							length up to 65,536 characters, and different line lengths are allowed in the
62							same file. However, within a sequence entry, all lines must be the same length
63							except for the last. An error will be thrown if this is not the case.
64
65							The module uses /^E(\S+)/ to extract the primary ID of each sequence
66							from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67							routine to reversibly modify this primary ID, e.g. if you wish to extract a
68							specific portion of the gi\|gb\|abc\|xyz GenBank IDs.
69
70							=head1 DATABASE CREATION AND INDEXING
71
72							The object-oriented constructor is new(), the filehandle constructor is newFh()
73							and the tied hash constructor is tie(). They all allow to index a single Fasta
74							file, several files, or a directory of files. See Bio::DB::IndexedBase.
75
76							=head1 SEE ALSO
77
78							L
79
80							L
81
82							L
83
84							=head1 AUTHOR
85
86							Lincoln Stein Elstein@cshl.orgE.
87
88							Copyright (c) 2001 Cold Spring Harbor Laboratory.
89
90							This library is free software; you can redistribute it and/or modify
91							it under the same terms as Perl itself. See DISCLAIMER.txt for
92							disclaimers of warranty.
93
94							=head1 APPENDIX
95
96							The rest of the documentation details each of the object
97							methods. Internal methods are usually preceded with a _
98
99							For BioPerl-style access, the following methods are provided:
100
101							=head2 get_Seq_by_id
102
103							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104							Usage : my $seq = $db->get_Seq_by_id($id);
105							Function: Given an ID, fetch the corresponding sequence from the database.
106							Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107							Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108							only load the sequence string into memory when requested using seq().
109							See L for methods provided by the sequence objects
110							returned from get_Seq_by_id() and get_PrimarySeq_stream().
111							Args : ID
112
113							=head2 get_PrimarySeq_stream
114
115							Title : get_PrimarySeq_stream
116							Usage : my $stream = $db->get_PrimarySeq_stream();
117							Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118							single method, next_seq(). Each call to next_seq() returns a new
119							Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120							Returns : A Bio::DB::Indexed::Stream object
121							Args : None
122
123							=head1
124
125							For simple access, the following methods are provided:
126
127							=cut
128
129
130							package Bio::DB::Fasta;
131
132	1			1		3	use strict;
	1					1
	1					24
133	1			1		384	use IO::File;
	1					675
	1					85
134	1			1		5	use File::Spec;
	1					1
	1					13
135	1			1		318	use Bio::PrimarySeqI;
	1					2
	1					25
136
137	1			1		5	use base qw(Bio::DB::IndexedBase);
	1					2
	1					417
138
139							our $obj_class = 'Bio::PrimarySeq::Fasta';
140							our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141
142
143							=head2 new
144
145							Title : new
146							Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147							Function: Initialize a new database object. When indexing a directory, files
148							ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149							Returns : A new Bio::DB::Fasta object.
150							Args : A single file, or path to dir, or arrayref of files
151							Optional arguments: see Bio::DB::IndexedBase
152
153							=cut
154
155
156							sub _calculate_offsets {
157							# Bio::DB::IndexedBase calls this to calculate offsets
158	53			53		66	my ($self, $fileno, $file, $offsets) = @_;
159
160	53	50				219	my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161	53					2565	binmode $fh;
162	53	50				112	warn "Indexing $file\n" if $self->{debug};
163	53					41	my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164							$last_line, %offsets);
165	53					59	my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166
167	53					55	my $termination_length = $self->{termination_length};
168	53					576	while (my $line = <$fh>) {
169							# Account for crlf-terminated Windows files
170	36648	100				67818	if (index($line, '>') == 0) {
		100
171	3009	100				6825	if ($line =~ /^>(\S+)/) {
172							print STDERR "Indexed $count sequences...\n"
173	3008	50	33			4998	if $self->{debug} && (++$count%1000) == 0;
174
175							# please, do not enforce arbitrary line length requirements.
176							# It's good practice but not enforced.
177
178							#$self->_check_linelength($linelen);
179	3008					2950	my $pos = tell($fh);
180	3008	100				3914	if (@ids) {
181	2956					2736	my $strlen = $pos - $offset - length($line);
182	2956					2425	$strlen -= $termination_length * $seq_lines;
183	2956					2302	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
	2956					5212
184							$linelen, $headerlen, $alphabet, $fileno);
185	2956					2530	$alphabet = Bio::DB::IndexedBase::NA;
186	2956					3023	for my $id (@ids) {
187	2962					34889	$offsets->{$id} = $ppos;
188							}
189							}
190	3008					4992	@ids = $self->_makeid($line);
191	3008					3382	($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192	3008					3277	($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193							} else {
194							# Catch bad header lines, bug 3172
195	1					15	$self->throw("FASTA header doesn't match '>(\\S+)': $line");
196							}
197							} elsif ($line !~ /\S/) {
198							# Skip blank line
199	89					64	$blank_lines++;
200	89					225	next;
201							} else {
202							# Need to check every line :(
203	33550					20669	$l3_len = $l2_len;
204	33550					18499	$l2_len = $l_len;
205	33550					18733	$l_len = length $line;
206	33550					17768	if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207	33550	50	66			105757	if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
			66
208	0					0	my $fap = substr($line, 0, 20)."..";
209	0					0	$self->throw("Each line of the fasta entry must be the same ".
210							"length except the last. Line above #$. '$fap' is $l2_len".
211							" != $l3_len chars.");
212							}
213	33550	100				36635	if ($blank_lines) {
214							# Blank lines not allowed in entry
215	1					6	$self->throw("Blank lines can only precede header lines, ".
216							"found preceding line #$.");
217							}
218							}
219	33549		66			42823	$linelen \|\|= length $line;
220	33549		66			36615	$alphabet \|\|= $self->_guess_alphabet($line);
221	33549					20213	$seq_lines++;
222							}
223	36557					62412	$last_line = $line;
224							}
225
226							# Process last entry
227	51					107	$self->_check_linelength($linelen);
228	51					51	my $pos = tell $fh;
229	51	50				69	if (@ids) {
230	51					47	my $strlen = $pos - $offset;
231	51	100				56	if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232	11					14	$strlen = 0;
233							} else {
234	40	50				124	if ($last_line !~ /\s$/) {
235	0					0	$seq_lines--;
236							}
237	40					43	$strlen -= $termination_length * $seq_lines;
238							}
239	51					47	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
	51					92
240							$headerlen, $alphabet, $fileno);
241	51					72	for my $id (@ids) {
242	52					552	$offsets->{$id} = $ppos;
243							}
244							}
245
246	51					916	return \%offsets;
247							}
248
249							=head2 seq
250
251							Title : seq, sequence, subseq
252							Usage : # Entire sequence string
253							my $seqstr = $db->seq($id);
254							# Subsequence
255							my $subseqstr = $db->seq($id, $start, $stop, $strand);
256							# or...
257							my $subseqstr = $db->seq($compound_id);
258							Function: Get a subseq of a sequence from the database. For your convenience,
259							the sequence to extract can be specified with any of the following
260							compound IDs:
261							$db->seq("$id:$start,$stop")
262							$db->seq("$id:$start..$stop")
263							$db->seq("$id:$start-$stop")
264							$db->seq("$id:$start,$stop/$strand")
265							$db->seq("$id:$start..$stop/$strand")
266							$db->seq("$id:$start-$stop/$strand")
267							$db->seq("$id/$strand")
268							In the case of DNA or RNA sequence, if $stop is less than $start,
269							then the reverse complement of the sequence is returned. Avoid using
270							it if possible since this goes against Bio::Seq conventions.
271							Returns : A string
272							Args : ID of sequence to retrieve
273							or
274							Compound ID of subsequence to fetch
275							or
276							ID, optional start (defaults to 1), optional end (defaults to length
277							of sequence) and optional strand (defaults to 1).
278
279							=cut
280
281							sub subseq {
282	29			29	0	328	my ($self, $id, $start, $stop, $strand) = @_;
283	29	50				61	$self->throw('Need to provide a sequence ID') if not defined $id;
284	29					66	($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285
286	29					40	my $data;
287
288	29	100				53	my $fh = $self->_fh($id) or return;
289	28					68	my $filestart = $self->_calc_offset($id, $start);
290	28					40	my $filestop = $self->_calc_offset($id, $stop );
291
292	28					78	seek($fh, $filestart,0);
293	28					199	read($fh, $data, $filestop-$filestart+1);
294
295	28					69	$data = Bio::DB::IndexedBase::_strip_crnl($data);
296
297	28	100				47	if ($strand == -1) {
298							# Reverse-complement the sequence
299	7					20	$data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300							}
301	28					98	return $data;
302							}
303
304							seq = sequence = \&subseq;
305
306
307							=head2 length
308
309							Title : length
310							Usage : my $length = $qualdb->length($id);
311							Function: Get the number of residues in the indicated sequence.
312							Returns : Number
313							Args : ID of entry
314
315							=head2 header
316
317							Title : header
318							Usage : my $header = $db->header($id);
319							Function: Get the header line (ID and description fields) of the specified
320							sequence.
321							Returns : String
322							Args : ID of sequence
323
324							=cut
325
326							sub header {
327	2			2	1	2	my ($self, $id) = @_;
328	2	50				6	$self->throw('Need to provide a sequence ID') if not defined $id;
329	2					6	my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
	2					5
330	2					3	$offset -= $headerlen;
331	2					2	my $data;
332	2	50				4	my $fh = $self->_fh($id) or return;
333	2					8	seek($fh, $offset, 0);
334	2					12	read($fh, $data, $headerlen);
335							# On Windows chomp remove '\n' but leaves '\r'
336							# when reading '\r\n' in binary mode
337	2					6	$data = Bio::DB::IndexedBase::_strip_crnl($data);
338	2					4	substr($data, 0, 1) = '';
339	2					5	return $data;
340							}
341
342
343							=head2 alphabet
344
345							Title : alphabet
346							Usage : my $alphabet = $db->alphabet($id);
347							Function: Get the molecular type of the indicated sequence: dna, rna or protein
348							Returns : String
349							Args : ID of sequence
350
351							=cut
352
353
354							#-------------------------------------------------------------
355							# Bio::PrimarySeqI compatibility
356							#
357							package Bio::PrimarySeq::Fasta;
358	1			1		9	use overload '""' => 'display_id';
	1					2
	1					11
359
360	1			1		61	use base qw(Bio::Root::Root Bio::PrimarySeqI);
	1					1
	1					477
361
362							sub new {
363	18			18		28	my ($class, @args) = @_;
364	18					50	my $self = $class->SUPER::new(@args);
365	18					55	my ($db, $id, $start, $stop) = $self->_rearrange(
366							[qw(DATABASE ID START STOP)],
367							@args);
368	18					31	$self->{db} = $db;
369	18					19	$self->{id} = $id;
370	18		100			46	$self->{stop} = $stop \|\| $db->length($id);
371	18		66			60	$self->{start} = $start \|\| ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372	18					49	return $self;
373							}
374
375							sub fetch_sequence {
376	0			0		0	return shift->seq(@_);
377							}
378
379							sub seq {
380	4			4		6	my $self = shift;
381	4					13	return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382							}
383
384							sub subseq {
385	1			1		2	my $self = shift;
386	1					4	return $self->trunc(@_)->seq();
387							}
388
389							sub trunc {
390							# Override Bio::PrimarySeqI trunc() method. This way, we create an object
391							# that does not store the sequence in memory.
392	2			2		3	my ($self, $start, $stop) = @_;
393	2	50				5	$self->throw("Stop cannot be smaller than start") if $stop < $start;
394	2	50				8	if ($self->{start} <= $self->{stop}) {
395	2					4	$start = $self->{start}+$start-1;
396	2					3	$stop = $self->{start}+$stop-1;
397							} else {
398	0					0	$start = $self->{start}-($start-1);
399	0					0	$stop = $self->{start}-($stop-1);
400							}
401	2					5	return $self->new( $self->{db}, $self->{id}, $start, $stop );
402							}
403
404							sub is_circular {
405	1			1		2	my $self = shift;
406	1					4	return $self->{is_circular};
407							}
408
409							sub display_id {
410	9			9		639	my $self = shift;
411	9					23	return $self->{id};
412							}
413
414							sub accession_number {
415	1			1		3	my $self = shift;
416	1					3	return 'unknown';
417							}
418
419							sub primary_id {
420							# Following Bio::PrimarySeqI, since this sequence has no accession number,
421							# its primary_id should be a stringified memory location.
422	1			1		2	my $self = shift;
423	1					6	return overload::StrVal($self);
424							}
425
426							sub can_call_new {
427	0			0		0	return 0;
428							}
429
430							sub alphabet {
431	1			1		445	my $self = shift;
432	1					5	return $self->{db}->alphabet($self->{id});
433							}
434
435							sub revcom {
436							# Override Bio::PrimarySeqI revcom() with optimized method.
437	1			1		2	my $self = shift;
438	1					2	return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
	1					4
439							}
440
441							sub length {
442							# Get length from sequence location, not the sequence string (too expensive)
443	2			2		4	my $self = shift;
444							return $self->{start} < $self->{stop} ?
445							$self->{stop} - $self->{start} + 1 :
446	2	100				13	$self->{start} - $self->{stop} + 1 ;
447							}
448
449							sub description {
450	1			1		1	my $self = shift;
451	1					4	my $header = $self->{'db'}->header($self->{id});
452							# Remove the ID from the header
453	1					9	return (split(/\s+/, $header, 2))[1];
454							}
455							*desc = \&description;
456
457							1;