File Coverage

blib/lib/Bio/CUA/Summarizer.pm
Criterion Covered Total %
statement 95 128 74.2
branch 29 74 39.1
condition 1 7 14.2
subroutine 19 23 82.6
pod 9 11 81.8
total 153 243 62.9


line stmt bran cond sub pod time code
1             package Bio::CUA::Summarizer;
2              
3 4     4   20553 use 5.006;
  4         10  
  4         123  
4 4     4   23 use strict;
  4         5  
  4         89  
5 4     4   15 use warnings;
  4         3  
  4         89  
6 4     4   819 use parent qw/Bio::CUA/;
  4         504  
  4         26  
7 4     4   2440 use Bio::CUA::CodonTable;
  4         8  
  4         303  
8              
9             my $codonPkg = 'Bio::CUA::CodonTable';
10             my $pkg = __PACKAGE__;
11             my @bases = qw/A T C G/;
12              
13             # determine which class is used for sequence processing
14             my $seq_io_pkg;
15             BEGIN{
16             # set version which might be checked during compilation
17             #our $VERSION = 1.01;
18              
19             # determine sequence processing module
20 4     4   6 eval { require Bio::SeqIO; };
  4         817  
21 4 50       21 if($@) # Bio::SeqIO is not available
22             {
23 4         6 $seq_io_pkg = 'Bio::CUA::SeqIO';
24 4         2578 require Bio::CUA::SeqIO;
25             }else # otherwise use Bio::SeqIO
26             {
27 0         0 $seq_io_pkg = 'Bio::SeqIO';
28             }
29             }
30              
31             =pod
32              
33             =head1 NAME
34              
35             Bio::CUA::Summarizer - a class to summarize features of sequences.
36              
37             =head1 SYNOPSIS
38              
39             This class provides convenience for its child classes with methods
40             summarizing sequence features, such
41             as counting and listing amino acids and codons, retrieving amino acids
42             with certain degree degeneracy in a genetic table. Refer to the
43             L section for more details.
44              
45             use Bio::CUA::Summarizer;
46              
47             my $summarizer = Bio::CUA::Summarizer->new(
48             codon_table => 1 ); # using stardard genetic code
49            
50             # get codons in a sequence file
51             my $codonList = $summarizer->tabulate_codons('seqs.fa');
52             # get the codon table object of this summarizer
53             my $table = $summarizer->codon_table;
54             # get all sense codons in the genetic codon table
55             my @senseCodons = $summarizer->all_sense_codons;
56             # get codons encoding an amino acid
57             my @codons = $summarizer->codons_of_AA('Ser');
58              
59             =cut
60              
61              
62             =head2 new
63              
64             Title : new
65             Usage : $obj=Bio::CUA::Summarizer->new(%args);
66             Function: create an object which can be used to summarizing sequence
67             features.
68             Returns : an object of this or child class
69             Args : a hash with a key 'codon_table', acceptable values are
70             codon_table => id of genetic codon table # 1
71             codon_table => Bio::CUA::CodonTable object # 2
72             codon_table => 'map-file' # 3
73              
74             =over 3
75              
76             =item 1
77              
78             id of genetic codon table can be found from L
79             codes|http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t>.
80             A valid id is an integer.
81              
82             =item 2
83              
84             an object of L. One can directly provide an
85             object to the method.
86              
87             =item 3
88              
89             If genetic code in analyzed sequences is not included in NCBI, one can
90             also provide its own genetic code in a map-file, in the format of
91             codon1AA1
92             codon2AA2,
93             ... ... ....
94              
95             =back
96              
97             Note all the analyzed sequences will use this provided genetic codon
98             table to map between amino acids and codons.
99              
100             =cut
101              
102             sub new
103             {
104 3     3 1 636 my ($caller, @args) = @_;
105 3         31 my $self = $caller->SUPER::new(@args);
106              
107 3         17 my $hashRef = $self->_array_to_hash(\@args);
108              
109             # only process its own argument
110 3         6 my $codonTable;
111 3         26 while(my ($tag, $val) = each %$hashRef)
112             {
113 4 100       20 next unless($tag eq 'codon_table');
114 3 50       29 if(ref($val))
    50          
115             {
116 0 0       0 $self->throw("$val is not an object of $codonPkg")
117             unless($val->isa($codonPkg));
118 0         0 $codonTable = $val;
119             }elsif($val =~ /^\d+$/) # genetic code id
120             {
121 3 50       32 $codonTable = $codonPkg->new(-id => $val) or
122             $self->throw("Invalid genetic code ID '$val'");
123             }else # a map file
124             {
125 0 0       0 $codonTable = $codonPkg->new(-map_file => $val) or
126             $self->throw("Can not construct codon table with file '$val'");
127             }
128 3         14 last;
129             }
130 3 50 0     10 $self->warn("option 'codon_table' is missing in the method",
131             "'new' of $pkg") and return undef unless($codonTable);
132             # store the result
133 3         39 $self->{'_codon_table'} = $codonTable;
134              
135 3         31 return $self;
136             }
137              
138             =head2 codon_table
139              
140             Title : codon_table
141             Usage : $table = $self->codon_table;
142             Function: get associated codon table of this object
143             Returns : an object of L
144             Args : None
145              
146             =cut
147              
148             sub codon_table
149             {
150 249 50   249 1 1355 my $table = $_[0]->{'_codon_table'} or
151             $_[0]->warn("No codon table associated with this object $_[0]");
152 249         935 return $table;
153             }
154              
155             =head2 bases
156              
157             get the 4 nucleotides A,T,C,G always in this order, to keep
158             consistency among different classes
159              
160             =cut
161              
162             # get all the nucleotide bases in a certain order
163             sub bases
164             {
165 1 50   1 1 559 return wantarray? @bases : \@bases;
166             }
167              
168             # sequence-level functions
169             =head2 get_codon_list
170              
171             Title : get_codon_list
172             Usage : $codonList = $self->get_codon_list($input)
173             Function: get codons and their counts in input
174             Returns : reference to a hash containing codons as keys and counts
175             as values.
176             Args : seq string, seq object, seq file, or another codon list
177              
178             =cut
179              
180             # the main interface to preprocess input to most methods
181             # return a codon list with its count
182             # acceptable parameters: seq string, seq object, seq file, codon list
183             sub get_codon_list
184             {
185 57     57 0 145 my ($self, $input) = @_;
186              
187 57         159 my $ref = ref($input);
188 57 100       165 unless($ref) # a scalar variable
189             {
190             # a sequence string
191 5 50 33     38 if($input =~ /^[ATGCUN]+$/ and (! -f $input))
192             {
193 0         0 return $self->_catalog_codons($input);
194             }else # a sequence file
195             {
196 5         26 return $self->tabulate_codons($input);
197             }
198             }
199              
200 52 50       907 if($ref eq 'HASH') # codon list
201             {
202 0         0 return $input;
203             }else # an seq object
204             {
205 52         348 return $self->_catalog_codons($input);
206             }
207             }
208              
209             =head2 tabulate_codons
210              
211             Title : tabulate_codons
212             Usage : $codonList = $self->tabulate_codons($input,[$each]);
213             Function: count codons in the input sequences
214             Returns : reference to a hash in which codon is the key and counts as
215             values. If $each is true, then each sequence is separately processed
216             and stored in a larger hash with the sequence names as hash keys
217             Args : accepted arguments are as follows:
218             I = name of a file containing fasta sequences
219             I = optional, if TRUE (i.e., non-zero values), each sequence is
220             separately processed.
221              
222             This is a companionate method of L for situations
223             when one want to get codon counts for each sequence separately.
224              
225             =cut
226              
227             sub tabulate_codons
228             {
229 6     6 1 10 my ($self, $input, $each) = @_;
230              
231 6 50       35 my $seqIO = $self->_get_seq_io($input) or return;
232 6         11 my %list;
233              
234 6 50       21 if($each)
235             {
236 0         0 while(my $seq = $seqIO->next_seq())
237             {
238 0         0 my $codons = $self->_catalog_codons($seq->seq);
239 0         0 $list{$seq->id} = $codons;
240             }
241              
242             }else # otherwise process together
243             {
244 6         44 while(my $seq = $seqIO->next_seq())
245             {
246 78         272 my $codons = $self->_catalog_codons($seq->seq);
247             # merge all codons together
248 78         581 while(my ($c, $v) = each %$codons)
249             {
250 4758         12722 $list{$c} += $v;
251             }
252             }
253             }
254            
255 6 50       45 return undef unless(keys %list);
256 6         52 return \%list;
257             }
258              
259             =head2 tabulate_AAs
260              
261             Title : tabulate_AAs
262             Usage : $AAList = $self->tabulate_AAs($input,[$each]);
263             Function: similar to L, but for counting amino acids
264             Returns : the same as L, but for amino acids
265             Args : refer to L.
266              
267             =cut
268              
269             sub tabulate_AAs
270             {
271 0     0 1 0 my ($self, $input, $each) = @_;
272              
273 0 0       0 my $codonList = $self->tabulate_codons($input) or return;
274              
275 0         0 my %AAs;
276 0 0       0 if($each)
277             {
278 0         0 while(my ($id, $hashRef) = each %$codonList)
279             {
280 0         0 while(my ($codon, $count) = each %$codonList)
281             {
282 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
283 0         0 $AAs{$id}->{$AA} += $count;
284             }
285             }
286             }else
287             {
288 0         0 while(my ($codon, $count) = each %$codonList)
289             {
290 0 0       0 my $AA = $self->_codon_to_aa($codon) or next;
291 0         0 $AAs{$AA} += $count;
292             }
293             }
294              
295 0         0 return \%AAs;
296             }
297              
298             # get the sequence IO and return it
299             sub _get_seq_io
300             {
301 6     6   11 my ($self, $input) = @_;
302              
303 6 50 0     19 $self->warn("input fasta file is needed to obtain seq IO") and return
304             unless($input);
305             # at present, use Bio::SeqIO
306 6         71 my $io = $seq_io_pkg->new(-file => $input, -format => 'fasta');
307 6         28 return $io;
308             }
309              
310              
311             # codon table functions
312             =head2 all_sense_codons
313              
314             get all sense codons in the genetic codon table this object
315              
316             =cut
317              
318             sub all_sense_codons
319             {
320 2     2 0 5 my ($self) = @_;
321              
322 2 50       5 my $codonTable = $self->codon_table() or return;
323              
324 2         9 return $codonTable->all_sense_codons;
325             }
326              
327             =head2 all_AAs_in_table
328              
329             get all the amino acids coded in the genetic code table of this object
330              
331             =cut
332              
333             sub all_AAs_in_table
334             {
335 8     8 1 18 my ($self) = @_;
336              
337 8 50       27 my $codonTable = $self->codon_table() or return;
338              
339 8         41 $codonTable->all_amino_acids();
340             }
341              
342             =head2 codons_of_AA
343              
344             get codons coding the given amino acid, e.g.,
345              
346             my @codons = $self->codons_of_AA('Ser');
347              
348             =cut
349              
350             sub codons_of_AA
351             {
352 160     160 1 238 my ($self, $AA) = @_;
353              
354 160 50       278 my $codonTable = $self->codon_table() or return;
355              
356 160         425 return $codonTable->codons_of_AA($AA);
357             }
358              
359             =head2 aa_degeneracy_classes
360              
361             Title : aa_degeneracy_classes
362             Usage : $hashRef = $self->aa_degeneracy_classes;
363             Function: get amino acid degeneracy classes according to the
364             associated genetic code table
365             Returns : a hash reference in which first level key is degeneracy
366             degrees such as 1,2,3,4,6, second level is amino acid, the value is
367             reference to the corresponding codon array. For example:
368            
369             { 2 => { D => [GAU, GAC],
370             C => [UGU, UGC],
371             ... ...
372             },
373             4 => { A => [GCU, GCC, GCA, GCG],
374             ... ...
375             },
376             ... ... ...
377             }
378              
379             Args : None
380              
381             =cut
382              
383             sub aa_degeneracy_classes
384             {
385 52 50   52 1 448 my $codonTable = $_[0]->codon_table or return;
386 52         393 return $codonTable->codon_degeneracy;
387             }
388              
389             =head2 codons_by_degeneracy
390              
391             Title : codons_by_degeneracy
392             Usage : @codons = $self->codons_by_degeneracy(2);
393             Function: get all the codons of AAs which have the specified degree
394             of degeneracy, for example, codons of amino acids with degenracy
395             degree 2.
396             Returns : an array of codons, or its reference in scalar context
397             Args : an integer for degeneracy degree
398              
399             =cut
400              
401             sub codons_by_degeneracy
402             {
403 26     26 1 177 my ($self, $deg) = @_;
404 26 50       186 my $degHash = $self->aa_degeneracy_classes or return;
405 26 50       142 my $aaClass = $degHash->{$deg} or return;
406 26         52 my @codons;
407 26         203 while(my ($aa, $codonRef) = each %$aaClass)
408             {
409 52         301 push @codons, @$codonRef;
410             }
411 26 50       224 return wantarray? @codons : \@codons;
412             }
413              
414             # other misc functions
415             #############################################
416             # Other methods used internally
417             #############################################
418              
419             # check whether a codon is valid and also not stop codon
420             sub _is_sense_codon
421             {
422 0     0   0 my ($self, $codon) = @_;
423              
424 0 0       0 my $codonTable = $self->codon_table() or return;
425 0 0       0 return 0 unless($codonTable->is_valid_codon($codon));
426 0 0       0 return 0 if($codonTable->is_stop_codon($codon));
427              
428 0         0 return 1;
429             }
430              
431             # check whether a codon is a stop codon
432             sub _is_stop_codon
433             {
434 0     0   0 my ($self, $codon) = @_;
435 0 0       0 my $codonTable = $self->codon_table() or return;
436 0         0 $codonTable->is_stop_codon($codon);
437             }
438              
439             # get the corresponding AA of a codon
440             sub _codon_to_aa
441             {
442 0     0   0 my ($self, $codon) = @_;
443              
444 0 0       0 my $codonTable = $self->codon_table() or return;
445 0         0 $codonTable->translate($codon);
446             }
447              
448             # get all the codons in the sequence, that is, split into nucleotide
449             # triplet
450             sub _catalog_codons
451             {
452 130     130   366 my ($self, $seq) = @_;
453              
454 130         496 $seq = $self->_get_seq_str($seq);
455 130         281 my %codons;
456 130         189 my $accuLen = 0;
457 130         326 my $seqLen = length($seq);
458 130 50       667 $self->warn("sequence [$seq] is not multiple of 3 long") unless($seqLen %
459             3 == 0);
460 130         158 my $codon;
461 130         494 while($accuLen + 3 <= $seqLen)
462             {
463 88570         161547 $codon = substr($seq,$accuLen,3);
464 88570         129855 $codons{$codon}++;
465 88570         223645 $accuLen += 3;
466             }
467              
468 130 50       506 return undef unless($accuLen);
469 130         2225 return \%codons;
470             }
471              
472             # get and preprocess sequence
473             # get a seq string or seq object and return sequence string
474             sub _get_seq_str
475             {
476 130     130   296 my ($self, $seq) = @_;
477 130 100       609 $seq = ref($seq)? $seq->seq : $seq;
478 130         1627 $seq = uc($seq);
479 130         960 $seq =~ tr/U/T/;
480 130         5687 $seq =~ s/[^A-Z]+//g; # remove all non-nucleotide characters
481              
482 130         490 my $len = length($seq);
483 130 50       684 if($len > 0)
484             {
485 130         858 return $seq;
486             }else
487             {
488 0           return undef;
489             }
490             }
491              
492             =head1 AUTHOR
493              
494             Zhenguo Zhang, C<< >>
495              
496             =head1 BUGS
497              
498             Please report any bugs or feature requests to C, or through
499             the web interface at L. I will be notified, and then you'll
500             automatically be notified of progress on your bug as I make changes.
501              
502             =head1 SUPPORT
503              
504             You can find documentation for this module with the perldoc command.
505              
506             perldoc Bio::CUA::Summarizer
507              
508              
509             You can also look for information at:
510              
511             =over 4
512              
513             =item * RT: CPAN's request tracker (report bugs here)
514              
515             L
516              
517             =item * AnnoCPAN: Annotated CPAN documentation
518              
519             L
520              
521             =item * CPAN Ratings
522              
523             L
524              
525             =item * Search CPAN
526              
527             L
528              
529             =back
530              
531              
532             =head1 ACKNOWLEDGEMENTS
533              
534              
535             =head1 LICENSE AND COPYRIGHT
536              
537             Copyright 2015 Zhenguo Zhang.
538              
539             This program is free software: you can redistribute it and/or modify
540             it under the terms of the GNU General Public License as published by
541             the Free Software Foundation, either version 3 of the License, or
542             (at your option) any later version.
543              
544             This program is distributed in the hope that it will be useful,
545             but WITHOUT ANY WARRANTY; without even the implied warranty of
546             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
547             GNU General Public License for more details.
548              
549             You should have received a copy of the GNU General Public License
550             along with this program. If not, see L.
551              
552              
553             =cut
554              
555             1; # End of Bio::CUA::Summarizer
556