File Coverage

Bio/DB/Qual.pm

Criterion	Covered	Total	%
statement	139	150	92.6
branch	29	50	58.0
condition	10	24	41.6
subroutine	21	21	100.0
pod	1	4	25.0
total	200	249	80.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Qual
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7							=head1 NAME
8
9							Bio::DB::Qual - Fast indexed access to quality files
10
11							=head1 SYNOPSIS
12
13							use Bio::DB::Qual;
14
15							# create database from directory of qual files
16							my $db = Bio::DB::Qual->new('/path/to/qual/files/');
17							my @ids = $db->get_all_primary_ids;
18
19							# Simple access
20							my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
21							my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
22							my $length = $db->length('CHROMOSOME_I');
23							my $header = $db->header('CHROMOSOME_I');
24
25							# Access to sequence objects. See Bio::PrimarySeqI.
26							my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
27							my @qual = @{$obj->qual};
28							my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
29							my $length = $obj->length;
30
31							# Loop through sequence objects
32							my $stream = $db->get_PrimarySeq_stream;
33							while (my $qual = $stream->next_seq) {
34							# Bio::Seq::PrimaryQual operations
35							}
36
37							# Filehandle access
38							my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
39							while (my $qual = <$fh>) {
40							# Bio::Seq::PrimaryQual operations
41							}
42
43							# Tied hash access
44							tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
45							print $qualities{'CHROMOSOME_I:1,20000'};
46
47							=head1 DESCRIPTION
48
49							Bio::DB::Qual provides indexed access to a single qual file, several files,
50							or a directory of files. It provides random access to each quality score entry
51							without having to read the file from the beginning. Access to subqualities
52							(portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
53							the full quality score has to be brought in memory. Bio::DB::Qual is based on
54							Bio::DB::IndexedBase. See this module's documentation for details.
55
56							The qual files should contain decimal quality scores. Entries may have any line
57							length up to 65,536 characters, and different line lengths are allowed in the
58							same file. However, within a quality score entry, all lines must be the same
59							length except for the last. An error will be thrown if this is not the case.
60
61							The module uses /^E<gt>(\S+)/ to extract the primary ID of each quality score
62							from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
63							routine to reversibly modify this primary ID, e.g. if you wish to extract a
64							specific portion of the gi|gb|abc|xyz GenBank IDs.
65
66							=head1 DATABASE CREATION AND INDEXING
67
68							The object-oriented constructor is new(), the filehandle constructor is newFh()
69							and the tied hash constructor is tie(). They all allow one to index a single qual
70							file, several files, or a directory of files. See Bio::DB::IndexedBase.
71
72							=head1 SEE ALSO
73
74							L<Bio::DB::IndexedBase>
75
76							L<Bio::DB::Fasta>
77
78							L<Bio::Seq::PrimaryQual>
79
80							=head1 LIMITATIONS
81
82							When a quality score is deleted from one of the qual files, this deletion is not
83							detected by the module and removed from the index. As a result, a "ghost" entry
84							will remain in the index and will return garbage results if accessed. Currently,
85							the only way to accommodate deletions is to rebuild the entire index, either by
86							deleting it manually, or by passing -reindex=E<gt>1 to new() when
87							initializing the module.
88
89							All quality score lines for a given quality score must have the same length
90							except for the last (not sure why there is this limitation). This is not
91							problematic for sequences but could be annoying for quality scores. A workaround
92							is to make sure that your quality scores fit on no more than 2 lines. Another
93							solution could be to pad them with blank spaces so that each line has the same
94							number of characters (maybe this padding should be implemented in
95							Bio::SeqIO::qual?).
96
97							=head1 AUTHOR
98
99							Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
100
101							Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
102
103							Copyright (c) 2007 Florent E Angly.
104
105							This library is free software; you can redistribute it and/or modify
106							it under the same terms as Perl itself.
107
108							=head1 APPENDIX
109
110							The rest of the documentation details each of the object
111							methods. Internal methods are usually preceded with a _
112
113							For BioPerl-style access, the following methods are provided:
114
115							=head2 get_Seq_by_id
116
117							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
118							get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
119							Usage : my $seq = $db->get_Seq_by_id($id);
120							Function: Given an ID, fetch the corresponding sequence from the database.
121							Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
122							Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
123							only load the sequence string into memory when requested using seq().
124							See L<Bio::PrimarySeqI> for methods provided by the sequence objects
125							returned from get_Seq_by_id() and get_PrimarySeq_stream().
126							Args : ID
127
128							=head2 get_PrimarySeq_stream
129
130							Title : get_Seq_stream, get_PrimarySeq_stream
131							Usage : my $stream = $db->get_Seq_stream();
132							Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
133							single method, next_seq(). Each call to next_seq() returns a new
134							Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
135							Returns : A Bio::DB::Indexed::Stream object
136							Args : None
137
138							=head1
139
140							For simple access, the following methods are provided:
141
142							=cut
143
144
145							package Bio::DB::Qual;
146
147	1			1		5	use strict;
	1					2
	1					23
148	1			1		234	use IO::File;
	1					674
	1					84
149	1			1		6	use File::Spec;
	1					1
	1					17
150
151	1			1		4	use base qw(Bio::DB::IndexedBase);
	1					1
	1					343
152
153							our $obj_class = 'Bio::Seq::PrimaryQual::Qual'; # name of the entry-object package defined at the bottom of this file; presumably read by Bio::DB::IndexedBase - TODO confirm
154							our $file_glob = '*.{qual,QUAL,qa,QA}'; # filename pattern for qual files; presumably applied when a directory is indexed - TODO confirm
155
156
157							=head2 new
158
159							Title : new
160							Usage : my $db = Bio::DB::Qual->new( $path, %options);
161							Function: Initialize a new database object. When indexing a directory, files
162							ending in .qual, .QUAL, .qa or .QA are indexed by default.
163							Returns : A new Bio::DB::Qual object
164							Args : A single file, or path to dir, or arrayref of files
165							Optional arguments: see Bio::DB::IndexedBase
166
167							=cut
168
169
sub _calculate_offsets {
    # Bio::DB::IndexedBase calls this to calculate offsets.
    #
    # Scans one qual file line by line and, for every entry, stores a packed
    # index record in the caller-supplied $offsets hash under each ID that
    # _makeid() returns for the entry's header line.
    #
    # Args    : $fileno  - file number, packed unchanged into each record
    #           $file    - path of the qual file to index
    #           $offsets - hash reference that receives ID => packed record
    # Returns : reference to a local hash that is never filled; the results
    #           are delivered through $offsets. The return value is kept as
    #           it was so that existing callers see no difference.
    # Throws  : when the file cannot be opened, when a line starting with '>'
    #           does not match /^>(\S+)/ and, if
    #           Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES is true, on
    #           uneven line lengths or blank lines inside an entry.
    my ($self, $fileno, $file, $offsets) = @_;

    my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
    binmode $fh;
    warn "Indexing $file\n" if $self->{debug};
    my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
        $numres, %offsets);
    my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);

    # Subtracted once per data line below, so that the stored string length
    # does not include the line terminators
    my $termination_length = $self->{termination_length};
    while (my $line = <$fh>) {
        if (index($line, '>') == 0) {
            if ($line =~ /^>(\S+)/) {
                print STDERR "Indexed $count quality scores...\n"
                    if $self->{debug} && (++$count%1000) == 0;
                $self->_check_linelength($linelen);
                my $pos = tell($fh);
                if (@ids) {
                    # A new header closes the previous entry: its data runs
                    # from $offset up to the start of this header line
                    my $strlen = $pos - $offset - length($line);
                    $strlen -= $termination_length * $qual_lines;
                    my $ppos = $self->{packmeth}->($offset, $strlen, $numres,
                        $linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno);
                    for my $id (@ids) {
                        $offsets->{$id} = $ppos;
                    }
                    $numres = 0;
                }
                @ids = $self->_makeid($line);
                ($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
                ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
            } else {
                # Catch bad header lines, bug 3172
                $self->throw("FASTA header doesn't match '>(\\S+)': $line");
            }
        } elsif ($line !~ /\S/) {
            # Skip blank line
            $blank_lines++;
            next;
        } else {
            # Need to check every line :(
            $l3_len = $l2_len;
            $l2_len = $l_len;
            $l_len  = length $line;
            if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
                if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
                    my $fap = substr($line, 0, 20)."..";
                    $self->throw("Each line of the qual entry must be the same ".
                        "length except the last. Line above #$. '$fap' is $l2_len".
                        " != $l3_len chars.");
                }
                if ($blank_lines) {
                    # Blank lines not allowed in entry
                    $self->throw("Blank lines can only precede header lines, ".
                        "found preceding line #$.");
                }
            }
            $linelen ||= length $line;
            $qual_lines++;
            # split ' ' ignores leading whitespace, so a line that starts with
            # a blank does not add a spurious empty field to the score count
            my @scores = split ' ', $line;
            $numres += @scores;
        }
        $last_line = $line;
    }

    # Process last entry
    $self->_check_linelength($linelen);
    my $pos = tell($fh);
    if (@ids) {
        my $strlen = $pos - $offset;
        if ($linelen == 0) {
            $strlen = 0;
        } else {
            # A final line that does not end in whitespace carries no
            # terminator, so it must not be counted in the subtraction
            if ($last_line !~ /\s$/) {
                $qual_lines--;
            }
            $strlen -= $termination_length * $qual_lines;
        }
        my $ppos = $self->{packmeth}->($offset, $strlen, $numres, $linelen,
            $headerlen, Bio::DB::IndexedBase::NA, $fileno);
        for my $id (@ids) {
            $offsets->{$id} = $ppos;
        }
    }

    return \%offsets;
}
258
259
260							# for backward compatibility
sub get_PrimaryQual_stream {
    # Older name for get_PrimarySeq_stream(); simply forwards to it and
    # hands back whatever it returns.
    my ($self) = @_;
    return $self->get_PrimarySeq_stream();
}
265
266
267							# for backward compatibility
sub get_Qual_by_id {
    # Backward-compatible name for get_Seq_by_id().
    # Args    : $id - ID of the entry to fetch
    # Returns : whatever get_Seq_by_id($id) returns
    my ($self, $id) = @_;
    return $self->get_Seq_by_id($id);
}

# Further aliases; each name needs its own typeglob for the assignment to
# install the subroutine under that name.
*get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
274
275
276							=head2 qual
277
278							Title : qual, quality, subqual
279							Usage : # All quality scores
280							my @qualarr = @{$qualdb->subqual($id)};
281							# Subset of the quality scores
282							my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
283							# or...
284							my @subqualarr = @{$qualdb->subqual($compound_id)};
285							Function: Get a subqual of an entry in the database. For your convenience,
286							the sequence to extract can be specified with any of the following
287							compound IDs:
288							$db->qual("$id:$start,$stop")
289							$db->qual("$id:$start..$stop")
290							$db->qual("$id:$start-$stop")
291							$db->qual("$id:$start,$stop/$strand")
292							$db->qual("$id:$start..$stop/$strand")
293							$db->qual("$id:$start-$stop/$strand")
294							$db->qual("$id/$strand")
295							If $stop is less than $start, then the quality scores are returned
296							in reverse order. Avoid using it if possible since this goes
297							against Bio::Seq conventions.
298							Returns : Reference to an array of quality scores
299							Args : Compound ID of entry to retrieve
300							or
301							ID, optional start (defaults to 1), optional end (defaults to the
302							number of quality scores for this sequence), and strand (defaults to
303							1).
304
305							=cut
306
sub subqual {
    # Return a reference to an array holding the quality scores of an entry,
    # or a portion of them.
    #
    # Args    : $id     - entry ID or compound ID (resolved by
    #                     _parse_compound_id)
    #           $start, $stop, $strand - optional range and orientation
    # Returns : array reference of quality scores; nothing if no filehandle
    #           is available for $id
    # Throws  : if $id is not defined
    my ($self, $id, $start, $stop, $strand) = @_;

    # Quality values in a quality score can have 1 or 2 digits and are separated
    # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
    # is no easy way to match the position of a quality value to its position
    # in the quality string.
    # As a consequence, if a subqual of the quality is requested, we still need
    # to grab the full quality string first - performance penalty for big
    # quality scores :(
    # I think there is no way around starting at the beginning of the quality
    # score but maybe there is a resource-efficient way of starting at the
    # beginning of the quality score and stopping when the position of the
    # last quality value requested is reached??

    $self->throw('Need to provide a sequence ID') if not defined $id;
    ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);

    # Position in quality string
    my $string_start = 1;
    my $string_stop  = $self->strlen($id);

    # Fetch full quality string
    my $fh = $self->_fh($id) or return;
    my $filestart = $self->_calc_offset($id, $string_start);
    my $filestop  = $self->_calc_offset($id, $string_stop );
    seek($fh, $filestart, 0);
    my $data;
    read($fh, $data, $filestop-$filestart+1);

    # Process quality score. Keep the value returned by _strip_crnl, the same
    # way header() does, instead of relying on in-place modification.
    $data = Bio::DB::IndexedBase::_strip_crnl($data);
    my $subqual = 0;
    $subqual = 1 if ( $start || $stop );
    my @data;
    if ( $subqual || ($strand == -1) ) {
        # Splitting with a limit of $stop+1 leaves everything after the last
        # requested score in one trailing field, which is popped below
        @data = split / /, $data, $stop+1;
        my $length = scalar(@data);
        $start = 1 if $start < 1;
        $stop = $length if $stop > $length;
        pop @data if ($stop != $length);
        splice @data, 0, $start-1;
        @data = reverse(@data) if $strand == -1;
    } else {
        @data = split / /, $data;
    }

    return \@data;
}

# Alternative names for subqual(); each name needs its own typeglob.
*qual = *quality = \&subqual;
359
360
361							=head2 header
362
363							Title : header
364							Usage : my $header = $db->header($id);
365							Function: Get the header line (ID and description fields) of the specified entry.
366							Returns : String
367							Args : ID of entry
368
369							=cut
370
sub header {
    # Read the header line of an entry straight from its file and return it
    # without the leading character and without line terminators.
    # Args    : $id - ID of the entry
    # Returns : header string; nothing if no filehandle is available for $id
    # Throws  : if $id is not defined
    my ($self, $id) = @_;
    defined $id or $self->throw('Need to provide a sequence ID');

    # Fields 0 and 4 of the unpacked index record: data offset, header length
    my @record = $self->{unpackmeth}->($self->{offsets}{$id});
    my ($data_offset, $header_length) = @record[0, 4];

    # The header sits immediately before the entry's data
    my $header_offset = $data_offset - $header_length;

    my $fh = $self->_fh($id) or return;
    seek($fh, $header_offset, 0);
    my $line;
    read($fh, $line, $header_length);

    # On Windows chomp remove '\n' but leaves '\r'
    # when reading '\r\n' in binary mode,
    # _strip_crnl removes both
    $line = Bio::DB::IndexedBase::_strip_crnl($line);

    # Drop the first character of the header line
    substr($line, 0, 1, '');
    return $line;
}
387
388
389							#-------------------------------------------------------------
390							# Tied hash overrides
391							#
392
sub FETCH {
    # Tied-hash read: forward the key (and any further arguments) to subqual().
    my ($self, @args) = @_;
    return $self->subqual(@args);
}
396
397
398							#-------------------------------------------------------------
399							# Bio::Seq::PrimaryQual compatibility
400							#
401							# Usage is the same as in Bio::Seq::PrimaryQual
402
403							package Bio::Seq::PrimaryQual::Qual;
404	1			1		7	use overload '""' => 'display_id';
	1					2
	1					6
405
406	1			1		63	use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
	1					2
	1					367
407
sub new {
    # Build an entry object that remembers its database, ID and range but does
    # not load any quality scores.
    # Args    : -database, -id, and optional -start / -stop
    # Returns : the new object
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);
    my ($db, $id, $start, $stop) = $self->_rearrange(
        [qw(DATABASE ID START STOP)],
        @args);
    $self->{db} = $db;
    $self->{id} = $id;
    # Without an explicit stop, use the length the database reports for $id
    $self->{stop}  = $stop  || $db->length($id);
    $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
    return $self;
}
420
421
sub qual {
    # Ask the database for the scores of this object's ID, limited to the
    # object's own start and stop.
    my ($self) = @_;
    my ($id, $start, $stop) = @{$self}{qw(id start stop)};
    return scalar $self->{db}->qual($id, $start, $stop);
}
427
428
sub subqual {
    # Scores for positions $start..$stop of this object: truncate first, then
    # ask the truncated object for its scores.
    my $self = shift;
    my ($start, $stop) = @_;
    my $piece = $self->trunc($start, $stop);
    return $piece->qual;
}
433
434
sub trunc {
    # Override Bio::Seq::QualI trunc() method. This way, we create an object
    # that does not store the quality array in memory.
    # Args    : $start, $stop - range relative to this object, $start <= $stop
    # Returns : a new object on the same database and ID, restricted to the range
    # Throws  : if $start is greater than $stop
    my ($self, $start, $stop) = @_;
    $self->throw(
        "$stop is smaller than $start. If you want to truncate and reverse ".
        "complement, you must call trunc followed by revcom."
    ) if $start > $stop;
    # Translate the relative range into coordinates of the underlying entry;
    # the arithmetic runs backwards when this object is itself reversed
    if ($self->{start} <= $self->{stop}) {
        $start = $self->{start}+$start-1;
        $stop  = $self->{start}+$stop-1;
    } else {
        $start = $self->{start}-($start-1);
        $stop  = $self->{start}-($stop-1);
    }
    my $obj = $self->new( -database => $self->{db},
                          -id       => $self->{id},
                          -start    => $start,
                          -stop     => $stop
                        );
    return $obj;
}
457
458
sub display_id {
    # The ID this object was created with.
    my ($self) = @_;
    return $self->{id};
}
463
464
sub primary_id {
    # Plain reference string of the object, obtained through overload::StrVal
    # so that the '""' overload is not applied.
    my ($self) = @_;
    my $plain = overload::StrVal($self);
    return $plain;
}
469
sub revcom {
    # Override Bio::QualI revcom() with optimized method: build a new object
    # on the same database and ID with start and stop exchanged.
    my ($self) = @_;
    my ($db, $id, $start, $stop) = @{$self}{qw(db id start stop)};
    return $self->new($db, $id, $stop, $start);
}
475
sub length {
    # Get length from quality location, not the quality array (too expensive).
    # The range is inclusive and may run in either direction.
    my ($self) = @_;
    my ($start, $stop) = @{$self}{qw(start stop)};
    my $span = $start < $stop ? $stop - $start
                              : $start - $stop;
    return $span + 1;
}
483
484
sub description {
    # Description part of the entry's header: everything after the first run
    # of whitespace.
    # Returns : description string, or undef when the header has no
    #           whitespace-separated second part
    my $self = shift;
    my $header = $self->{'db'}->header($self->{id});
    # remove the id from the header; with a split limit of 2 the remainder is
    # the second field, i.e. index 1
    $header = (split(/\s+/, $header, 2))[1];
    return $header;
}
*desc = \&description;
493
494
495							1;