File Coverage

Bio/DB/Qual.pm
Criterion Covered Total %
statement 139 150 92.6
branch 29 50 58.0
condition 10 24 41.6
subroutine 21 21 100.0
pod 1 4 25.0
total 200 249 80.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Qual
3             #
4             # You may distribute this module under the same terms as perl itself
5             #
6              
7             =head1 NAME
8              
9             Bio::DB::Qual - Fast indexed access to quality files
10              
11             =head1 SYNOPSIS
12              
13             use Bio::DB::Qual;
14              
15             # create database from directory of qual files
16             my $db = Bio::DB::Qual->new('/path/to/qual/files/');
17             my @ids = $db->get_all_primary_ids;
18              
19             # Simple access
20             my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
21             my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
22             my $length = $db->length('CHROMOSOME_I');
23             my $header = $db->header('CHROMOSOME_I');
24              
25             # Access to sequence objects. See Bio::PrimarySeqI.
26             my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
27             my @qual = @{$obj->qual};
28             my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
29             my $length = $obj->length;
30              
31             # Loop through sequence objects
32             my $stream = $db->get_PrimarySeq_stream;
33             while (my $qual = $stream->next_seq) {
34             # Bio::Seq::PrimaryQual operations
35             }
36              
37             # Filehandle access
38             my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
39             while (my $qual = <$fh>) {
40             # Bio::Seq::PrimaryQual operations
41             }
42              
43             # Tied hash access
44             tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
45             print $qualities{'CHROMOSOME_I:1,20000'};
46              
47             =head1 DESCRIPTION
48              
49             Bio::DB::Qual provides indexed access to a single qual file, several files,
50             or a directory of files. It provides random access to each quality score entry
51             without having to read the file from the beginning. Access to subqualities
52             (portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
53             the full quality score has to be brought in memory. Bio::DB::Qual is based on
54             Bio::DB::IndexedBase. See this module's documentation for details.
55              
56             The qual files should contain decimal quality scores. Entries may have any line
57             length up to 65,536 characters, and different line lengths are allowed in the
58             same file. However, within a quality score entry, all lines must be the same
59             length except for the last. An error will be thrown if this is not the case.
60              
61             The module uses /^>(\S+)/ to extract the primary ID of each quality score
62             from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
63             routine to reversibly modify this primary ID, e.g. if you wish to extract a
64             specific portion of the gi|gb|abc|xyz GenBank IDs.
65              
66             =head1 DATABASE CREATION AND INDEXING
67              
68             The object-oriented constructor is new(), the filehandle constructor is newFh()
69             and the tied hash constructor is tie(). They all allow one to index a single qual
70             file, several files, or a directory of files. See Bio::DB::IndexedBase.
71              
72             =head1 SEE ALSO
73              
74             L<Bio::DB::IndexedBase>
75              
76             L<Bio::DB::Fasta>
77              
78             L<Bio::Seq::PrimaryQual>
79              
80             =head1 LIMITATIONS
81              
82             When a quality score is deleted from one of the qual files, this deletion is not
83             detected by the module and removed from the index. As a result, a "ghost" entry
84             will remain in the index and will return garbage results if accessed. Currently,
85             the only way to accommodate deletions is to rebuild the entire index, either by
86             deleting it manually, or by passing -reindex=>1 to new() when
87             initializing the module.
88              
89             All quality score lines for a given quality score must have the same length
90             except for the last (not sure why there is this limitation). This is not
91             problematic for sequences but could be annoying for quality scores. A workaround
92             is to make sure that your quality scores fit on no more than 2 lines. Another
93             solution could be to pad them with blank spaces so that each line has the same
94             number of characters (maybe this padding should be implemented in
95             Bio::SeqIO::qual?).
96              
97             =head1 AUTHOR
98              
99             Florent E Angly <florent . angly @ gmail-dot-com>.
100              
101             Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
102              
103             Copyright (c) 2007 Florent E Angly.
104              
105             This library is free software; you can redistribute it and/or modify
106             it under the same terms as Perl itself.
107              
108             =head1 APPENDIX
109              
110             The rest of the documentation details each of the object
111             methods. Internal methods are usually preceded with a _
112              
113             For BioPerl-style access, the following methods are provided:
114              
115             =head2 get_Seq_by_id
116              
117             Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
118             get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
119             Usage : my $seq = $db->get_Seq_by_id($id);
120             Function: Given an ID, fetch the corresponding sequence from the database.
121             Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
122             Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
123             only load the sequence string into memory when requested using seq().
124             See L<Bio::PrimarySeqI> for methods provided by the sequence objects
125             returned from get_Seq_by_id() and get_PrimarySeq_stream().
126             Args : ID
127              
128             =head2 get_PrimarySeq_stream
129              
130             Title : get_Seq_stream, get_PrimarySeq_stream
131             Usage : my $stream = $db->get_Seq_stream();
132             Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
133             single method, next_seq(). Each call to next_seq() returns a new
134             Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
135             Returns : A Bio::DB::Indexed::Stream object
136             Args : None
137              
138             =head1
139              
140             For simple access, the following methods are provided:
141              
142             =cut
143              
144              
145             package Bio::DB::Qual;
146              
147 1     1   5 use strict;
  1         2  
  1         23  
148 1     1   234 use IO::File;
  1         674  
  1         84  
149 1     1   6 use File::Spec;
  1         1  
  1         17  
150              
151 1     1   4 use base qw(Bio::DB::IndexedBase);
  1         1  
  1         343  
152              
153             our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
154             our $file_glob = '*.{qual,QUAL,qa,QA}';
155              
156              
157             =head2 new
158              
159             Title : new
160             Usage : my $db = Bio::DB::Qual->new( $path, %options);
161             Function: Initialize a new database object. When indexing a directory, files
162             ending in .qual, .QUAL, .qa or .QA are indexed by default.
163             Returns : A new Bio::DB::Qual object
164             Args : A single file, or path to dir, or arrayref of files
165             Optional arguments: see Bio::DB::IndexedBase
166              
167             =cut
168              
169              
170             sub _calculate_offsets {
171             # Bio::DB::IndexedBase calls this to calculate offsets
172 9     9   16 my ($self, $fileno, $file, $offsets) = @_;
173              
174 9 50       29 my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
175 9         459 binmode $fh;
176 9 50       20 warn "Indexing $file\n" if $self->{debug};
177 9         12 my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
178             $numres, %offsets);
179 9         13 my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
180              
181 9         11 my $termination_length = $self->{termination_length};
182 9         65 while (my $line = <$fh>) {
183             # Account for crlf-terminated Windows files
184 90 100       229 if (index($line, '>') == 0) {
    50          
185 45 50       109 if ($line =~ /^>(\S+)/) {
186             print STDERR "Indexed $count quality scores...\n"
187 45 50 33     70 if $self->{debug} && (++$count%1000) == 0;
188 45         87 $self->_check_linelength($linelen);
189 45         51 my $pos = tell($fh);
190 45 100       62 if (@ids) {
191 36         39 my $strlen = $pos - $offset - length($line);
192 36         36 $strlen -= $termination_length * $qual_lines;
193 36         32 my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres,
  36         59  
194             $linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno);
195 36         47 for my $id (@ids) {
196 36         385 $offsets->{$id} = $ppos;
197             }
198 36         61 $numres = 0;
199             }
200 45         82 @ids = $self->_makeid($line);
201 45         63 ($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
202 45         60 ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
203             } else {
204             # Catch bad header lines, bug 3172
205 0         0 $self->throw("FASTA header doesn't match '>(\\S+)': $line");
206             }
207             } elsif ($line !~ /\S/) {
208             # Skip blank line
209 0         0 $blank_lines++;
210 0         0 next;
211             } else {
212             # Need to check every line :(
213 45         41 $l3_len = $l2_len;
214 45         38 $l2_len = $l_len;
215 45         43 $l_len = length $line;
216 45         38 if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
217 45 0 33     66 if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
      33        
218 0         0 my $fap = substr($line, 0, 20)."..";
219 0         0 $self->throw("Each line of the qual entry must be the same ".
220             "length except the last. Line above #$. '$fap' is $l2_len".
221             " != $l3_len chars.");
222             }
223 45 50       61 if ($blank_lines) {
224             # Blank lines not allowed in entry
225 0         0 $self->throw("Blank lines can only precede header lines, ".
226             "found preceding line #$.");
227             }
228             }
229 45   33     106 $linelen ||= length $line;
230 45         38 $qual_lines++;
231 45         636 $numres += scalar(split /\s+/, $line);
232             }
233 90         265 $last_line = $line;
234             }
235              
236             # Process last entry
237 9         19 $self->_check_linelength($linelen);
238 9         12 my $pos = tell($fh);
239 9 50       15 if (@ids) {
240 9         10 my $strlen = $pos - $offset;
241 9 50       14 if ($linelen == 0) {
242 0         0 $strlen = 0;
243             } else {
244 9 50       24 if ($last_line !~ /\s$/) {
245 0         0 $qual_lines--;
246             }
247 9         10 $strlen -= $termination_length * $qual_lines;
248             }
249 9         10 my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen,
  9         17  
250             $headerlen, Bio::DB::IndexedBase::NA, $fileno);
251 9         14 for my $id (@ids) {
252 9         80 $offsets->{$id} = $ppos;
253             }
254             }
255              
256 9         70 return \%offsets;
257             }
258              
259              
260             # for backward compatibility
261             sub get_PrimaryQual_stream {
262 1     1 0 6 my $self = shift;
263 1         6 return $self->get_PrimarySeq_stream;
264             }
265              
266              
267             # for backward compatibility
268             sub get_Qual_by_id {
269 2     2 0 5 my ($self, $id) = @_;
270 2         10 return $self->get_Seq_by_id($id);
271             }
272              
273             *get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
274              
275              
276             =head2 qual
277              
278             Title : qual, quality, subqual
279             Usage : # All quality scores
280             my @qualarr = @{$qualdb->subqual($id)};
281             # Subset of the quality scores
282             my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
283             # or...
284             my @subqualarr = @{$qualdb->subqual($compound_id)};
285             Function: Get a subqual of an entry in the database. For your convenience,
286             the sequence to extract can be specified with any of the following
287             compound IDs:
288             $db->qual("$id:$start,$stop")
289             $db->qual("$id:$start..$stop")
290             $db->qual("$id:$start-$stop")
291             $db->qual("$id:$start,$stop/$strand")
292             $db->qual("$id:$start..$stop/$strand")
293             $db->qual("$id:$start-$stop/$strand")
294             $db->qual("$id/$strand")
295             If $stop is less than $start, then the quality scores are returned
296             in reverse order. Avoid using it if possible since this goes
297             against Bio::Seq conventions.
298             Returns : Reference to an array of quality scores
299             Args : Compound ID of entry to retrieve
300             or
301             ID, optional start (defaults to 1), optional end (defaults to the
302             number of quality scores for this sequence), and strand (defaults to
303             1).
304              
305             =cut
306              
307             sub subqual {
308 16     16 0 34 my ($self, $id, $start, $stop, $strand) = @_;
309              
310             # Quality values in a quality score can have 1 or 2 digits and are separated
311             # by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
312             # is no easy way to match the position of a quality value to its position in
313             # the quality string.
314             # As a consequence, if a subqual of the quality is requested, we still need
315             # to grab the full quality string first - performance penalty for big
316             # quality scores :(
317             # I think there is no way around starting at the beginning of the quality
318             # score but maybe there is a resource-efficient way of starting at the
319             # beginning of the quality score and stopping when the position of the
320             # last quality value requested is reached??
321              
322 16 50       30 $self->throw('Need to provide a sequence ID') if not defined $id;
323 16         40 ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
324              
325             # Position in quality string
326 16         21 my $string_start = 1;
327 16         36 my $string_stop = $self->strlen($id);
328              
329             # Fetch full quality string
330 16 50       39 my $fh = $self->_fh($id) or return;
331 16         36 my $filestart = $self->_calc_offset($id, $string_start);
332 16         27 my $filestop = $self->_calc_offset($id, $string_stop );
333 16         68 seek($fh, $filestart,0);
334 16         16 my $data;
335 16         79 read($fh, $data, $filestop-$filestart+1);
336              
337             # Process quality score
338 16         44 Bio::DB::IndexedBase::_strip_crnl($data);
339 16         21 my $subqual = 0;
340 16 50 33     35 $subqual = 1 if ( $start || $stop );
341 16         18 my @data;
342 16 50 33     33 if ( $subqual || ($strand == -1) ) {
343 16         78 @data = split / /, $data, $stop+1;
344 16         23 my $length = scalar(@data);
345 16 50       23 $start = 1 if $start < 1;
346 16 50       21 $stop = $length if $stop > $length;
347 16 100       27 pop @data if ($stop != $length);
348 16         23 splice @data, 0, $start-1;
349 16 100       21 @data = reverse(@data) if $strand == -1;
350 16         38 $data = join ' ', @data;
351             } else {
352 0         0 @data = split / /, $data;
353             }
354              
355 16         60 return \@data;
356             }
357              
358             *qual = *quality = \&subqual;
359              
360              
361             =head2 header
362              
363             Title : header
364             Usage : my $header = $db->header($id);
365             Function: Get the header line (ID and description fields) of the specified entry.
366             Returns : String
367             Args : ID of entry
368              
369             =cut
370              
371             sub header {
372 2     2 1 4 my ($self, $id) = @_;
373 2 50       5 $self->throw('Need to provide a sequence ID') if not defined $id;
374 2         7 my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
  2         4  
375 2         5 $offset -= $headerlen;
376 2         2 my $data;
377 2 50       5 my $fh = $self->_fh($id) or return;
378 2         10 seek($fh, $offset, 0);
379 2         11 read($fh, $data, $headerlen);
380             # On Windows chomp remove '\n' but leaves '\r'
381             # when reading '\r\n' in binary mode,
382             # _strip_crnl removes both
383 2         6 $data = Bio::DB::IndexedBase::_strip_crnl($data);
384 2         5 substr($data, 0, 1) = '';
385 2         5 return $data;
386             }
387              
388              
389             #-------------------------------------------------------------
390             # Tied hash overrides
391             #
392              
393             sub FETCH {
394 3     3   13 return shift->subqual(@_);
395             }
396              
397              
398             #-------------------------------------------------------------
399             # Bio::Seq::PrimaryQual compatibility
400             #
401             # Usage is the same as in Bio::Seq::PrimaryQual
402              
403             package Bio::Seq::PrimaryQual::Qual;
404 1     1   7 use overload '""' => 'display_id';
  1         2  
  1         6  
405              
406 1     1   63 use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
  1         2  
  1         367  
407              
408             sub new {
409 7     7   17 my ($class, @args) = @_;
410 7         21 my $self = $class->SUPER::new(@args);
411 7         26 my ($db, $id, $start, $stop) = $self->_rearrange(
412             [qw(DATABASE ID START STOP)],
413             @args);
414 7         15 $self->{db} = $db;
415 7         10 $self->{id} = $id;
416 7   66     16 $self->{stop} = $stop || $db->length($id);
417 7   66     19 $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
418 7         18 return $self;
419             }
420              
421              
422             sub qual {
423 6     6   460 my $self = shift;
424 6         17 my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop});
425 6         33 return $qual;
426             }
427              
428              
429             sub subqual {
430 2     2   4 my ($self, $start, $stop) = @_;
431 2         5 return $self->trunc($start, $stop)->qual;
432             }
433              
434              
435             sub trunc {
436             # Override Bio::Seq::QualI trunc() method. This way, we create an object
437             # that does not store the quality array in memory.
438 3     3   5 my ($self, $start, $stop) = @_;
439 3 50       8 $self->throw(
440             "$stop is smaller than $stop. If you want to truncate and reverse ".
441             "complement, you must call trunc followed by revcom."
442             ) if $start > $stop;
443 3 50       7 if ($self->{start} <= $self->{stop}) {
444 3         5 $start = $self->{start}+$start-1;
445 3         4 $stop = $self->{start}+$stop-1;
446             } else {
447 0         0 $start = $self->{start}-($start-1);
448 0         0 $stop = $self->{start}-($stop-1);
449             }
450             my $obj = $self->new( -database => $self->{db},
451             -id => $self->{id},
452 3         8 -start => $start,
453             -stop => $stop
454             );
455 3         9 return $obj;
456             }
457              
458              
459             sub display_id {
460 8     8   303 my $self = shift;
461 8         20 return $self->{id};
462             }
463              
464              
465             sub primary_id {
466 1     1   2 my $self = shift;
467 1         5 return overload::StrVal($self);
468             }
469              
470             sub revcom {
471             # Override Bio::QualI revcom() with optimized method.
472 1     1   2 my $self = shift;
473 1         2 return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
  1         3  
474             }
475              
476             sub length {
477             # Get length from quality location, not the quality array (too expensive)
478 3     3   5 my $self = shift;
479             return $self->{start} < $self->{stop} ?
480             $self->{stop} - $self->{start} + 1 :
481 3 100       16 $self->{start} - $self->{stop} + 1 ;
482             }
483              
484              
485             sub description {
486 1     1   3 my $self = shift;
487 1         4 my $header = $self->{'db'}->header($self->{id});
488             # remove the id from the header
489 1         4 $header = (split(/\s+/, $header, 2))[2];
490 1         16 return $header;
491             }
492             *desc = \&description;
493              
494              
495             1;