File Coverage

blib/lib/Bio/CUA/Summarizer.pm
Criterion Covered Total %
statement 94 127 74.0
branch 27 72 37.5
condition 1 5 20.0
subroutine 19 23 82.6
pod 9 11 81.8
total 150 238 63.0


line stmt bran cond sub pod time code
1             package Bio::CUA::Summarizer;
2              
3 4     4   29029 use 5.006;
  4         14  
  4         158  
4 4     4   28 use strict;
  4         5  
  4         121  
5 4     4   18 use warnings;
  4         6  
  4         119  
6 4     4   1103 use parent qw/Bio::CUA/;
  4         634  
  4         31  
7 4     4   2176 use Bio::CUA::CodonTable;
  4         9  
  4         334  
8              
9             my $codonPkg = 'Bio::CUA::CodonTable';
10             my $pkg = __PACKAGE__;
11             my @bases = qw/A T C G/;
12              
13             # determine which class is used for sequence processing
14             my $seq_io_pkg;
15             BEGIN{
16             # set version which might be checked during compilation
17             #our $VERSION = 1.01;
18              
19             # determine sequence processing module
20 4     4   6 eval { require Bio::SeqIO; };
  4         859  
21 4 50       24 if($@) # Bio::SeqIO is not available
22             {
23 4         7 $seq_io_pkg = 'Bio::CUA::SeqIO';
24 4         2495 require Bio::CUA::SeqIO;
25             }else # otherwise use Bio::SeqIO
26             {
27 0         0 $seq_io_pkg = 'Bio::SeqIO';
28             }
29             }
30              
31             =pod
32              
33             =head1 NAME
34              
35             Bio::CUA::Summarizer - a class to summarize features of sequences.
36              
37             =head1 SYNOPSIS
38              
39             This class provides convenience for its child classes with methods
40             summarizing sequence features, such
41             as counting and listing amino acids and codons, retrieving amino acids
42             with certain degree degeneracy in a genetic table. Refer to the
43             L section for more details.
44              
45             use Bio::CUA::Summarizer;
46              
47             my $summarizer = Bio::CUA::Summarizer->new(
48             codon_table => 1 ); # using stardard genetic code
49            
50             # get codons in a sequence file
51             my $codonList = $summarizer->tabulate_codons('seqs.fa');
52             # get the codon table object of this summarizer
53             my $table = $summarizer->codon_table;
54             # get all sense codons in the genetic codon table
55             my @senseCodons = $summarizer->all_sense_codons;
56             # get codons encoding an amino acid
57             my @codons = $summarizer->codons_of_AA('Ser');
58              
59             =cut
60              
61              
62             =head2 new
63              
64             Title : new
65             Usage : $obj=Bio::CUA::Summarizer->new(%args);
66             Function: create an object which can be used to summarizing sequence
67             features.
68             Returns : an object of this or child class
69             Args : a hash with a key 'codon_table', acceptable values are
70             codon_table => id of genetic codon table # 1
71             codon_table => Bio::CUA::CodonTable object # 2
72             codon_table => 'map-file' # 3
73              
74             =over 3
75              
76             =item 1
77              
78             id of genetic codon table can be found from L
79             codes|http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t>.
80             A valid id is an integer.
81              
82             =item 2
83              
84             an object of L. One can directly provide an
85             object to the method.
86              
87             =item 3
88              
89             If genetic code in analyzed sequences is not included in NCBI, one can
90             also provide its own genetic code in a map-file, in the format of
91             codon1AA1
92             codon2AA2,
93             ... ... ....
94              
95             =back
96              
97             Note all the analyzed sequences will use this provided genetic codon
98             table to map between amino acids and codons.
99              
100             =cut
101              
102             sub new
103             {
104 3     3 1 930 my ($caller, @args) = @_;
105 3         25 my $self = $caller->SUPER::new(@args);
106              
107 3         12 my $hashRef = $self->_array_to_hash(\@args);
108              
109             # only process its own argument
110 3         5 my $codonTable;
111 3         16 while(my ($tag, $val) = each %$hashRef)
112             {
113 3 50       11 next unless($tag eq 'codon_table');
114 3 50       23 if(ref($val))
    50          
115             {
116 0 0       0 $self->throw("$val is not an object of $codonPkg")
117             unless($val->isa($codonPkg));
118 0         0 $codonTable = $val;
119             }elsif($val =~ /^\d+$/) # genetic code id
120             {
121 3 50       27 $codonTable = $codonPkg->new(-id => $val) or
122             $self->throw("Invalid genetic code ID '$val'");
123             }else # a map file
124             {
125 0 0       0 $codonTable = $codonPkg->new(-map_file => $val) or
126             $self->throw("Can not construct codon table with file '$val'");
127             }
128 3         11 last;
129             }
130             # store the result
131 3         33 $self->{'_codon_table'} = $codonTable;
132              
133 3         22 return $self;
134             }
135              
136             =head2 codon_table
137              
138             Title : codon_table
139             Usage : $table = $self->codon_table;
140             Function: get associated codon table of this object
141             Returns : an object of L
142             Args : None
143              
144             =cut
145              
146             sub codon_table
147             {
148 249 50   249 1 1359 my $table = $_[0]->{'_codon_table'} or
149             $_[0]->warn("No codon table associated with this object $_[0]");
150 249         736 return $table;
151             }
152              
153             =head2 bases
154              
155             get the 4 nucleotides A,T,C,G always in this order, to keep
156             consistency among different classes
157              
158             =cut
159              
160             # get all the nucleotide bases in a certain order
161             sub bases
162             {
163 1 50   1 1 628 return wantarray? @bases : \@bases;
164             }
165              
166             # sequence-level functions
167             =head2 get_codon_list
168              
169             Title : get_codon_list
170             Usage : $codonList = $self->get_codon_list($input)
171             Function: get codons and their counts in input
172             Returns : reference to a hash containing codons as keys and counts
173             as values.
174             Args : seq string, seq object, seq file, or another codon list
175              
176             =cut
177              
178             # the main interface to preprocess input to most methods
179             # return a codon list with its count
180             # acceptable parameters: seq string, seq object, seq file, codon list
181             sub get_codon_list
182             {
183 57     57 0 101 my ($self, $input) = @_;
184              
185 57         144 my $ref = ref($input);
186 57 100       177 unless($ref) # a scalar variable
187             {
188             # a sequence string
189 5 50 33     82 if($input =~ /^[ATGCUN]+$/ and (! -f $input))
190             {
191 0         0 return $self->_catalog_codons($input);
192             }else # a sequence file
193             {
194 5         23 return $self->tabulate_codons($input);
195             }
196             }
197              
198 52 50       156 if($ref eq 'HASH') # codon list
199             {
200 0         0 return $input;
201             }else # an seq object
202             {
203 52         186 return $self->_catalog_codons($input);
204             }
205             }
206              
207             =head2 tabulate_codons
208              
209             Title : tabulate_codons
210             Usage : $codonList = $self->tabulate_codons($input,[$each]);
211             Function: count codons in the input sequences
212             Returns : reference to a hash in which codon is the key and counts as
213             values. If $each is true, then each sequence is separately processed
214             and stored in a larger hash with the sequence names as hash keys
215             Args : accepted arguments are as follows:
216             I = name of a file containing fasta sequences
217             I = optional, if TRUE (i.e., non-zero values), each sequence is
218             separately processed.
219              
220             This is a companionate method of L for situations
221             when one want to get codon counts for each sequence separately.
222              
223             =cut
224              
225             sub tabulate_codons
226             {
227 6     6 1 12 my ($self, $input, $each) = @_;
228              
229 6 50       31 my $seqIO = $self->_get_seq_io($input) or return;
230 6         11 my %list;
231              
232 6 50       27 if($each)
233             {
234 0         0 while(my $seq = $seqIO->next_seq())
235             {
236 0         0 my $codons = $self->_catalog_codons($seq->seq);
237 0         0 $list{$seq->id} = $codons;
238             }
239              
240             }else # otherwise process together
241             {
242 6         39 while(my $seq = $seqIO->next_seq())
243             {
244 78         237 my $codons = $self->_catalog_codons($seq->seq);
245             # merge all codons together
246 78         571 while(my ($c, $v) = each %$codons)
247             {
248 4758         11807 $list{$c} += $v;
249             }
250             }
251             }
252            
253 6 50       37 return undef unless(keys %list);
254 6         50 return \%list;
255             }
256              
257             =head2 tabulate_AAs
258              
259             Title : tabulate_AAs
260             Usage : $AAList = $self->tabulate_AAs($input,[$each]);
261             Function: similar to L, but for counting amino acids
262             Returns : the same as L, but for amino acids
263             Args : refer to L.
264              
265             =cut
266              
267             sub tabulate_AAs
268             {
269 0     0 1 0 my ($self, $input, $each) = @_;
270              
271 0 0       0 my $codonList = $self->tabulate_codons($input) or return;
272              
273 0         0 my %AAs;
274 0 0       0 if($each)
275             {
276 0         0 while(my ($id, $hashRef) = each %$codonList)
277             {
278 0         0 while(my ($codon, $count) = each %$codonList)
279             {
280 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
281 0         0 $AAs{$id}->{$AA} += $count;
282             }
283             }
284             }else
285             {
286 0         0 while(my ($codon, $count) = each %$codonList)
287             {
288 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
289 0         0 $AAs{$AA} += $count;
290             }
291             }
292              
293 0         0 return \%AAs;
294             }
295              
296             # get the sequence IO and return it
297             sub _get_seq_io
298             {
299 6     6   15 my ($self, $input) = @_;
300              
301 6 50 0     42 $self->warn("input fasta file is needed to obtain seq IO") and return
302             unless($input);
303             # at present, use Bio::SeqIO
304 6         88 my $io = $seq_io_pkg->new(-file => $input, -format => 'fasta');
305 6         31 return $io;
306             }
307              
308              
309             # codon table functions
310             =head2 all_sense_codons
311              
312             get all sense codons in the genetic codon table this object
313              
314             =cut
315              
316             sub all_sense_codons
317             {
318 2     2 0 4 my ($self) = @_;
319              
320 2 50       5 my $codonTable = $self->codon_table() or return;
321              
322 2         9 return $codonTable->all_sense_codons;
323             }
324              
325             =head2 all_AAs_in_table
326              
327             get all the amino acids coded in the genetic code table of this object
328              
329             =cut
330              
331             sub all_AAs_in_table
332             {
333 8     8 1 17 my ($self) = @_;
334              
335 8 50       28 my $codonTable = $self->codon_table() or return;
336              
337 8         43 $codonTable->all_amino_acids();
338             }
339              
340             =head2 codons_of_AA
341              
342             get codons coding the given amino acid, e.g.,
343              
344             my @codons = $self->codons_of_AA('Ser');
345              
346             =cut
347              
348             sub codons_of_AA
349             {
350 160     160 1 202 my ($self, $AA) = @_;
351              
352 160 50       241 my $codonTable = $self->codon_table() or return;
353              
354 160         394 return $codonTable->codons_of_AA($AA);
355             }
356              
357             =head2 aa_degeneracy_classes
358              
359             Title : aa_degeneracy_classes
360             Usage : $hashRef = $self->aa_degeneracy_classes;
361             Function: get amino acid degeneracy classes according to the
362             associated genetic code table
363             Returns : a hash reference in which first level key is degeneracy
364             degrees such as 1,2,3,4,6, second level is amino acid, the value is
365             reference to the corresponding codon array. For example:
366            
367             { 2 => { D => [GAU, GAC],
368             C => [UGU, UGC],
369             ... ...
370             },
371             4 => { A => [GCU, GCC, GCA, GCG],
372             ... ...
373             },
374             ... ... ...
375             }
376              
377             Args : None
378              
379             =cut
380              
381             sub aa_degeneracy_classes
382             {
383 52 50   52 1 266 my $codonTable = $_[0]->codon_table or return;
384 52         275 return $codonTable->codon_degeneracy;
385             }
386              
387             =head2 codons_by_degeneracy
388              
389             Title : codons_by_degeneracy
390             Usage : @codons = $self->codons_by_degeneracy(2);
391             Function: get all the codons of AAs which have the specified degree
392             of degeneracy, for example, codons of amino acids with degenracy
393             degree 2.
394             Returns : an array of codons, or its reference in scalar context
395             Args : an integer for degeneracy degree
396              
397             =cut
398              
399             sub codons_by_degeneracy
400             {
401 26     26 1 54 my ($self, $deg) = @_;
402 26 50       124 my $degHash = $self->aa_degeneracy_classes or return;
403 26 50       171 my $aaClass = $degHash->{$deg} or return;
404 26         52 my @codons;
405 26         160 while(my ($aa, $codonRef) = each %$aaClass)
406             {
407 52         242 push @codons, @$codonRef;
408             }
409 26 50       187 return wantarray? @codons : \@codons;
410             }
411              
412             # other misc functions
413             #############################################
414             # Other methods used internally
415             #############################################
416              
417             # check whether a codon is valid and also not stop codon
418             sub _is_sense_codon
419             {
420 0     0   0 my ($self, $codon) = @_;
421              
422 0 0       0 my $codonTable = $self->codon_table() or return;
423 0 0       0 return 0 unless($codonTable->is_valid_codon($codon));
424 0 0       0 return 0 if($codonTable->is_stop_codon($codon));
425              
426 0         0 return 1;
427             }
428              
429             # check whether a codon is a stop codon
430             sub _is_stop_codon
431             {
432 0     0   0 my ($self, $codon) = @_;
433 0 0       0 my $codonTable = $self->codon_table() or return;
434 0         0 $codonTable->is_stop_codon($codon);
435             }
436              
437             # get the corresponding AA of a codon
438             sub _codon_to_aa
439             {
440 0     0   0 my ($self, $codon) = @_;
441              
442 0 0       0 my $codonTable = $self->codon_table() or return;
443 0         0 $codonTable->translate($codon);
444             }
445              
446             # get all the codons in the sequence, that is, split into nucleotide
447             # triplet
448             sub _catalog_codons
449             {
450 130     130   362 my ($self, $seq) = @_;
451              
452 130         455 $seq = $self->_get_seq_str($seq);
453 130         555 my %codons;
454 130         231 my $accuLen = 0;
455 130         292 my $seqLen = length($seq);
456 130 50       599 $self->warn("sequence [$seq] is not multiple of 3 long") unless($seqLen %
457             3 == 0);
458 130         210 my $codon;
459 130         419 while($accuLen + 3 <= $seqLen)
460             {
461 88570         140845 $codon = substr($seq,$accuLen,3);
462 88570         112460 $codons{$codon}++;
463 88570         205005 $accuLen += 3;
464             }
465              
466 130 50       427 return undef unless($accuLen);
467 130         1239 return \%codons;
468             }
469              
470             # get and preprocess sequence
471             # get a seq string or seq object and return sequence string
472             sub _get_seq_str
473             {
474 130     130   243 my ($self, $seq) = @_;
475 130 100       523 $seq = ref($seq)? $seq->seq : $seq;
476 130         1346 $seq = uc($seq);
477 130         737 $seq =~ tr/U/T/;
478 130         4740 $seq =~ s/[^A-Z]+//g; # remove all non-nucleotide characters
479              
480 130         388 my $len = length($seq);
481 130 50       520 if($len > 0)
482             {
483 130         601 return $seq;
484             }else
485             {
486 0           return undef;
487             }
488             }
489              
490             =head1 AUTHOR
491              
492             Zhenguo Zhang, C<< >>
493              
494             =head1 BUGS
495              
496             Please report any bugs or feature requests to C, or through
497             the web interface at L. I will be notified, and then you'll
498             automatically be notified of progress on your bug as I make changes.
499              
500             =head1 SUPPORT
501              
502             You can find documentation for this module with the perldoc command.
503              
504             perldoc Bio::CUA::Summarizer
505              
506              
507             You can also look for information at:
508              
509             =over 4
510              
511             =item * RT: CPAN's request tracker (report bugs here)
512              
513             L
514              
515             =item * AnnoCPAN: Annotated CPAN documentation
516              
517             L
518              
519             =item * CPAN Ratings
520              
521             L
522              
523             =item * Search CPAN
524              
525             L
526              
527             =back
528              
529              
530             =head1 ACKNOWLEDGEMENTS
531              
532              
533             =head1 LICENSE AND COPYRIGHT
534              
535             Copyright 2015 Zhenguo Zhang.
536              
537             This program is free software: you can redistribute it and/or modify
538             it under the terms of the GNU General Public License as published by
539             the Free Software Foundation, either version 3 of the License, or
540             (at your option) any later version.
541              
542             This program is distributed in the hope that it will be useful,
543             but WITHOUT ANY WARRANTY; without even the implied warranty of
544             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
545             GNU General Public License for more details.
546              
547             You should have received a copy of the GNU General Public License
548             along with this program. If not, see L.
549              
550              
551             =cut
552              
553             1; # End of Bio::CUA::Summarizer
554