File Coverage

blib/lib/Bio/CUA/Summarizer.pm
Criterion Covered Total %
statement 94 127 74.0
branch 28 74 37.8
condition 1 7 14.2
subroutine 19 23 82.6
pod 9 11 81.8
total 151 242 62.4


line stmt bran cond sub pod time code
1             package Bio::CUA::Summarizer;
2              
3 4     4   29792 use 5.006;
  4         10  
4 4     4   22 use strict;
  4         4  
  4         93  
5 4     4   18 use warnings;
  4         5  
  4         142  
6 4     4   1044 use parent qw/Bio::CUA/;
  4         608  
  4         31  
7 4     4   2138 use Bio::CUA::CodonTable;
  4         10  
  4         448  
8              
9             my $codonPkg = 'Bio::CUA::CodonTable';
10             my $pkg = __PACKAGE__;
11             my @bases = qw/A T C G/;
12              
13             # determine which class is used for sequence processing
14             my $seq_io_pkg;
15             BEGIN{
16             # set version which might be checked during compilation
17             #our $VERSION = 1.01;
18              
19             # determine sequence processing module
20 4     4   8 eval { require Bio::SeqIO; };
  4         1016  
21 4 50       24 if($@) # Bio::SeqIO is not available
22             {
23 4         6 $seq_io_pkg = 'Bio::CUA::SeqIO';
24 4         2674 require Bio::CUA::SeqIO;
25             }else # otherwise use Bio::SeqIO
26             {
27 0         0 $seq_io_pkg = 'Bio::SeqIO';
28             }
29             }
30              
31             =pod
32              
33             =head1 NAME
34              
35             Bio::CUA::Summarizer - a class to summarize features of sequences.
36              
37             =head1 SYNOPSIS
38              
39             This class provides convenience for its child classes with methods
40             summarizing sequence features, such
41             as counting and listing amino acids and codons, retrieving amino acids
42             with certain degree degeneracy in a genetic table. Refer to the
43             L section for more details.
44              
45             use Bio::CUA::Summarizer;
46              
47             my $summarizer = Bio::CUA::Summarizer->new(
48             codon_table => 1 ); # using stardard genetic code
49            
50             # get codons in a sequence file
51             my $codonList = $summarizer->tabulate_codons('seqs.fa');
52             # get the codon table object of this summarizer
53             my $table = $summarizer->codon_table;
54             # get all sense codons in the genetic codon table
55             my @senseCodons = $summarizer->all_sense_codons;
56             # get codons encoding an amino acid
57             my @codons = $summarizer->codons_of_AA('Ser');
58              
59             =cut
60              
61              
62             =head2 new
63              
64             Title : new
65             Usage : $obj=Bio::CUA::Summarizer->new(%args);
66             Function: create an object which can be used to summarizing sequence
67             features.
68             Returns : an object of this or child class
69             Args : a hash with a key 'codon_table', acceptable values are
70             codon_table => id of genetic codon table # 1
71             codon_table => Bio::CUA::CodonTable object # 2
72             codon_table => 'map-file' # 3
73              
74             =over 3
75              
76             =item 1
77              
78             id of genetic codon table can be found from L
79             codes|http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t>.
80             A valid id is an integer.
81              
82             =item 2
83              
84             an object of L. One can directly provide an
85             object to the method.
86              
87             =item 3
88              
89             If genetic code in analyzed sequences is not included in NCBI, one can
90             also provide its own genetic code in a map-file, in the format of
91             codon1AA1
92             codon2AA2,
93             ... ... ....
94              
95             =back
96              
97             Note all the analyzed sequences will use this provided genetic codon
98             table to map between amino acids and codons.
99              
100             =cut
101              
102             sub new
103             {
104 3     3 1 772 my ($caller, @args) = @_;
105 3         26 my $self = $caller->SUPER::new(@args);
106              
107 3         13 my $hashRef = $self->_array_to_hash(\@args);
108              
109             # only process its own argument
110 3         4 my $codonTable;
111 3         23 while(my ($tag, $val) = each %$hashRef)
112             {
113 3 50       11 next unless($tag eq 'codon_table');
114 3 50       25 if(ref($val))
    50          
115             {
116 0 0       0 $self->throw("$val is not an object of $codonPkg")
117             unless($val->isa($codonPkg));
118 0         0 $codonTable = $val;
119             }elsif($val =~ /^\d+$/) # genetic code id
120             {
121 3 50       28 $codonTable = $codonPkg->new(-id => $val) or
122             $self->throw("Invalid genetic code ID '$val'");
123             }else # a map file
124             {
125 0 0       0 $codonTable = $codonPkg->new(-map_file => $val) or
126             $self->throw("Can not construct codon table with file '$val'");
127             }
128 3         9 last;
129             }
130 3 50 0     13 $self->warn("option 'codon_table' is missing in the method",
131             "'new' of $pkg") and return undef unless($codonTable);
132             # store the result
133 3         26 $self->{'_codon_table'} = $codonTable;
134              
135 3         17 return $self;
136             }
137              
138             =head2 codon_table
139              
140             Title : codon_table
141             Usage : $table = $self->codon_table;
142             Function: get associated codon table of this object
143             Returns : an object of L
144             Args : None
145              
146             =cut
147              
148             sub codon_table
149             {
150 249 50   249 1 1135 my $table = $_[0]->{'_codon_table'} or
151             $_[0]->warn("No codon table associated with this object $_[0]");
152 249         499 return $table;
153             }
154              
155             =head2 bases
156              
157             get the 4 nucleotides A,T,C,G always in this order, to keep
158             consistency among different classes
159              
160             =cut
161              
162             # get all the nucleotide bases in a certain order
163             sub bases
164             {
165 1 50   1 1 481 return wantarray? @bases : \@bases;
166             }
167              
168             # sequence-level functions
169             =head2 get_codon_list
170              
171             Title : get_codon_list
172             Usage : $codonList = $self->get_codon_list($input)
173             Function: get codons and their counts in input
174             Returns : reference to a hash containing codons as keys and counts
175             as values.
176             Args : seq string, seq object, seq file, or another codon list
177              
178             =cut
179              
180             # the main interface to preprocess input to most methods
181             # return a codon list with its count
182             # acceptable parameters: seq string, seq object, seq file, codon list
183             sub get_codon_list
184             {
185 57     57 0 67 my ($self, $input) = @_;
186              
187 57         83 my $ref = ref($input);
188 57 100       100 unless($ref) # a scalar variable
189             {
190             # a sequence string
191 5 50 33     31 if($input =~ /^[ATGCUN]+$/ and (! -f $input))
192             {
193 0         0 return $self->_catalog_codons($input);
194             }else # a sequence file
195             {
196 5         20 return $self->tabulate_codons($input);
197             }
198             }
199              
200 52 50       88 if($ref eq 'HASH') # codon list
201             {
202 0         0 return $input;
203             }else # an seq object
204             {
205 52         89 return $self->_catalog_codons($input);
206             }
207             }
208              
209             =head2 tabulate_codons
210              
211             Title : tabulate_codons
212             Usage : $codonList = $self->tabulate_codons($input,[$each]);
213             Function: count codons in the input sequences
214             Returns : reference to a hash in which codon is the key and counts as
215             values. If $each is true, then each sequence is separately processed
216             and stored in a larger hash. The count of a codon in a sequence can
217             be retrieved like this: $codonList->{'seqId'}->{'codon'}.
218             Args : accepted arguments are as follows:
219             I = name of a file containing fasta sequences
220             I = optional, if TRUE (i.e., non-zero values), each sequence is
221             separately processed.
222              
223             This is a companionate method of L for situations
224             when one want to get codon counts for each sequence separately.
225              
226             =cut
227              
228             sub tabulate_codons
229             {
230 6     6 1 13 my ($self, $input, $each) = @_;
231              
232 6 50       33 my $seqIO = $self->_get_seq_io($input) or return;
233 6         11 my %list;
234              
235 6 50       16 if($each)
236             {
237 0         0 while(my $seq = $seqIO->next_seq())
238             {
239 0         0 my $codons = $self->_catalog_codons($seq->seq);
240 0         0 $list{$seq->id} = $codons;
241             }
242              
243             }else # otherwise process together
244             {
245 6         31 while(my $seq = $seqIO->next_seq())
246             {
247 78         188 my $codons = $self->_catalog_codons($seq->seq);
248             # merge all codons together
249 78         413 while(my ($c, $v) = each %$codons)
250             {
251 4758         9979 $list{$c} += $v;
252             }
253             }
254             }
255            
256 6 50       33 return undef unless(keys %list);
257 6         41 return \%list;
258             }
259              
260             =head2 tabulate_AAs
261              
262             Title : tabulate_AAs
263             Usage : $AAList = $self->tabulate_AAs($input,[$each]);
264             Function: similar to L, but for counting amino acids
265             Returns : the same as L, but for amino acids
266             Args : refer to L.
267              
268             =cut
269              
270             sub tabulate_AAs
271             {
272 0     0 1 0 my ($self, $input, $each) = @_;
273              
274 0 0       0 my $codonList = $self->tabulate_codons($input) or return;
275              
276 0         0 my %AAs;
277 0 0       0 if($each)
278             {
279 0         0 while(my ($id, $hashRef) = each %$codonList)
280             {
281 0         0 while(my ($codon, $count) = each %$codonList)
282             {
283 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
284 0         0 $AAs{$id}->{$AA} += $count;
285             }
286             }
287             }else
288             {
289 0         0 while(my ($codon, $count) = each %$codonList)
290             {
291 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
292 0         0 $AAs{$AA} += $count;
293             }
294             }
295              
296 0         0 return \%AAs;
297             }
298              
299             # get the sequence IO and return it
300             sub _get_seq_io
301             {
302 6     6   12 my ($self, $input) = @_;
303              
304 6 50 0     19 $self->warn("input fasta file is needed to obtain seq IO") and return
305             unless($input);
306             # at present, use Bio::SeqIO
307 6         79 my $io = $seq_io_pkg->new(-file => $input, -format => 'fasta');
308 6         29 return $io;
309             }
310              
311              
312             # codon table functions
313             =head2 all_sense_codons
314              
315             get all sense codons in the genetic codon table of this object
316              
317             =cut
318              
319             sub all_sense_codons
320             {
321 2     2 0 5 my ($self) = @_;
322              
323 2 50       3 my $codonTable = $self->codon_table() or return;
324              
325 2         9 return $codonTable->all_sense_codons;
326             }
327              
328             =head2 all_AAs_in_table
329              
330             get all the amino acids coded in the genetic code table of this object
331              
332             =cut
333              
334             sub all_AAs_in_table
335             {
336 8     8 1 13 my ($self) = @_;
337              
338 8 50       27 my $codonTable = $self->codon_table() or return;
339              
340 8         35 $codonTable->all_amino_acids();
341             }
342              
343             =head2 codons_of_AA
344              
345             get codons coding the given amino acid, I,
346              
347             my @codons = $self->codons_of_AA('Ser');
348              
349             =cut
350              
351             sub codons_of_AA
352             {
353 160     160 1 149 my ($self, $AA) = @_;
354              
355 160 50       197 my $codonTable = $self->codon_table() or return;
356              
357 160         292 return $codonTable->codons_of_AA($AA);
358             }
359              
360             =head2 aa_degeneracy_classes
361              
362             Title : aa_degeneracy_classes
363             Usage : $hashRef = $self->aa_degeneracy_classes;
364             Function: get amino acid degeneracy classes according to the
365             associated genetic code table
366             Returns : a hash reference in which first level key is degeneracy
367             degrees such as 1,2,3,4,6, second level is amino acid, the value is
368             reference to the corresponding codon array. For example:
369            
370             { 2 => { D => [GAU, GAC],
371             C => [UGU, UGC],
372             ... ...
373             },
374             4 => { A => [GCU, GCC, GCA, GCG],
375             ... ...
376             },
377             ... ... ...
378             }
379              
380             Args : None
381              
382             =cut
383              
384             sub aa_degeneracy_classes
385             {
386 52 50   52 1 97 my $codonTable = $_[0]->codon_table or return;
387 52         178 return $codonTable->codon_degeneracy;
388             }
389              
390             =head2 codons_by_degeneracy
391              
392             Title : codons_by_degeneracy
393             Usage : @codons = $self->codons_by_degeneracy(2);
394             Function: get all the codons of AAs which have the specified degree
395             of degeneracy, for example, codons of amino acids with degenracy
396             degree 2.
397             Returns : an array of codons, or its reference in scalar context
398             Args : an integer for degeneracy degree
399              
400             =cut
401              
402             sub codons_by_degeneracy
403             {
404 26     26 1 42 my ($self, $deg) = @_;
405 26 50       49 my $degHash = $self->aa_degeneracy_classes or return;
406 26 50       72 my $aaClass = $degHash->{$deg} or return;
407 26         31 my @codons;
408 26         84 while(my ($aa, $codonRef) = each %$aaClass)
409             {
410 52         147 push @codons, @$codonRef;
411             }
412 26 50       127 return wantarray? @codons : \@codons;
413             }
414              
415             # other misc functions
416             #############################################
417             # Other methods used internally
418             #############################################
419              
420             # check whether a codon is valid and also not stop codon
421             sub _is_sense_codon
422             {
423 0     0   0 my ($self, $codon) = @_;
424              
425 0 0       0 my $codonTable = $self->codon_table() or return;
426 0 0       0 return 0 unless($codonTable->is_valid_codon($codon));
427 0 0       0 return 0 if($codonTable->is_stop_codon($codon));
428              
429 0         0 return 1;
430             }
431              
432             # check whether a codon is a stop codon
433             sub _is_stop_codon
434             {
435 0     0   0 my ($self, $codon) = @_;
436 0 0       0 my $codonTable = $self->codon_table() or return;
437 0         0 $codonTable->is_stop_codon($codon);
438             }
439              
440             # get the corresponding AA of a codon
441             sub _codon_to_aa
442             {
443 0     0   0 my ($self, $codon) = @_;
444              
445 0 0       0 my $codonTable = $self->codon_table() or return;
446 0         0 $codonTable->translate($codon);
447             }
448              
449             # get all the codons in the sequence, that is, split into nucleotide
450             # triplet
451             sub _catalog_codons
452             {
453 130     130   228 my ($self, $seq) = @_;
454              
455 130         271 $seq = $self->_get_seq_str($seq);
456 130         166 my %codons;
457 130         136 my $accuLen = 0;
458 130         198 my $seqLen = length($seq);
459 130 50       369 $self->warn("sequence [$seq] is not multiple of 3 long") unless($seqLen %
460             3 == 0);
461 130         125 my $codon;
462 130         293 while($accuLen + 3 <= $seqLen)
463             {
464 88570         114341 $codon = substr($seq,$accuLen,3);
465 88570         88053 $codons{$codon}++;
466 88570         165545 $accuLen += 3;
467             }
468              
469 130 50       284 return undef unless($accuLen);
470 130         694 return \%codons;
471             }
472              
473             # get and preprocess sequence
474             # get a seq string or seq object and return sequence string
475             sub _get_seq_str
476             {
477 130     130   186 my ($self, $seq) = @_;
478 130 100       315 $seq = ref($seq)? $seq->seq : $seq;
479 130         979 $seq = uc($seq);
480 130         531 $seq =~ tr/U/T/;
481 130         1373 $seq =~ s/[^A-Z]+//g; # remove all non-nucleotide characters
482              
483 130         239 my $len = length($seq);
484 130 50       311 if($len > 0)
485             {
486 130         397 return $seq;
487             }else
488             {
489 0           return undef;
490             }
491             }
492              
493             =head1 AUTHOR
494              
495             Zhenguo Zhang, C<< >>
496              
497             =head1 BUGS
498              
499             Please report any bugs or feature requests to C, or through
500             the web interface at L. I will be notified, and then you'll
501             automatically be notified of progress on your bug as I make changes.
502              
503             =head1 SUPPORT
504              
505             You can find documentation for this module with the perldoc command.
506              
507             perldoc Bio::CUA::Summarizer
508              
509              
510             You can also look for information at:
511              
512             =over 4
513              
514             =item * RT: CPAN's request tracker (report bugs here)
515              
516             L
517              
518             =item * AnnoCPAN: Annotated CPAN documentation
519              
520             L
521              
522             =item * CPAN Ratings
523              
524             L
525              
526             =item * Search CPAN
527              
528             L
529              
530             =back
531              
532              
533             =head1 ACKNOWLEDGEMENTS
534              
535              
536             =head1 LICENSE AND COPYRIGHT
537              
538             Copyright 2015 Zhenguo Zhang.
539              
540             This program is free software: you can redistribute it and/or modify
541             it under the terms of the GNU General Public License as published by
542             the Free Software Foundation, either version 3 of the License, or
543             (at your option) any later version.
544              
545             This program is distributed in the hope that it will be useful,
546             but WITHOUT ANY WARRANTY; without even the implied warranty of
547             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
548             GNU General Public License for more details.
549              
550             You should have received a copy of the GNU General Public License
551             along with this program. If not, see L.
552              
553              
554             =cut
555              
556             1; # End of Bio::CUA::Summarizer
557