File Coverage

blib/lib/Peptide/Kmers.pm
Criterion Covered Total %
statement 51 54 94.4
branch 16 28 57.1
condition 8 25 32.0
subroutine 6 6 100.0
pod 3 3 100.0
total 84 116 72.4


line stmt bran cond sub pod time code
1             package Peptide::Kmers;
2             # Peptide::Kmers - provides log_prop_kmers and kmers methods.
3              
4 2     2   24136 use Carp;
  2         5  
  2         188  
5 2     2   12 use warnings;
  2         4  
  2         64  
6 2     2   11 use strict;
  2         5  
  2         1521  
7              
8             =head2 new
9              
10             Args : none
11             Example : my $pk = Peptide::Kmers->new(verbose => 2);
12             Description : bare constructor
13             Returns : TRUE if successful, FALSE otherwise.
14              
15             =cut
16              
17             sub new {
18 1     1 1 17 my $class = shift;
19 1   33     14 my $self = bless { verbose => 1, @_ }, ref($class) || $class;
20 1         5 return $self;
21             }
22              
23             =head2 log_prop_kmers
24              
25             Args : %named_parameters:
26             k : length of the k-mer. Allowed values: positive integer. Mandatory.
27             in_fh : filehandle for the input file. If missing, STDIN is used.
28             min : min number of occurrences. Allowed values: non-negative real number. Mandatory. Suggested values: 1 (equivalent to assuming that the k-mer occurred once even if it actually did not occur) or 0.5.
29             maxtotal : max total number of occurrences. Allowed values: positive integer. Optional.
30             Example : my %log_prop = $pk->log_prop_kmers(k => 3, min => 1, text => [ qw(aecaecaecd aecd xyz123) ]);
31             Description : For all overlapping k-mers (words of length k) in the input, computes the frequency of occurrence within the input text. Only allowed k-mers (those returned by kmers()) are counted. If the k-mer does not occur, min number of occurrences (eg, 0.5 or 1) is used as default threshold. Computes log10(proportion(k-mer)), where proportion(k-mer) = frequency(k-mer) / sum( all frequencies). For example, if input has 2 words: 'fooo' and 'bar', and k=2, the k-mers are: 'fo', 'oo', 'oo', 'ba', 'ar'. The frequencies are: 2 for 'oo', and 1 for the rest of the k-mers. Prints sum( all frequencies) into STDERR. If maxtotal is defined, input processing is stopped if sum( all frequencies) >= maxtotal. If this prevents any input from being processed, a warning is printed into STDERR. All warning can be suppressed by setting verbose arg to 1 or below). For comparing log10(proportion(k-mer)) from 2 different text sources, for example sequences and non-sequences, it is best to compute using the same sum( all frequencies). Thus, execute the code twice. First time, execute using undefined maxtotal, just to compute the sum( all frequencies) for the entire text from each of the sources. Second time, execute using the same maxtotal = min sum( all frequencies) for each source.
32             Returns : hash with keys = k-mers and values = log10(proportion(k-mer)) if successful, FALSE otherwise.
33              
34             =cut
35              
36             sub log_prop_kmers {
37 3     3 1 31067 my ($self, %args) = @_;
38 3 50 0     17 $args{k} > 0 or carp "not ok: got k = $args{k}, expected > 0" and return;
39 3 50 0     15 $args{min} >= 0 or carp "not ok: got min = $args{min}, expected >= 0" and return;
40 3 50 0     24 not defined $args{maxtotal} or $args{maxtotal} > 0 or
      66        
41             carp "not ok: got maxtotal = $args{maxtotal}, expected undefined or > 0" and return;
42 3   33     13 my $in_fh = $args{in_fh} || *STDIN;
43 3         54 my $re = qr/(?=(.{$args{k}}))/;
44 3         22 my %freq = map { $_ => 0 } $self->kmers(%args);
  52728         98716  
45             # min $freq_total, if all k-mers did not occur:
46 3         9658 my $num_kmers = keys %freq;
47 3         18 my $freq_total = $args{min} * $num_kmers;
48 3 50       31 if ($self->{verbose} > 1) {
49 0 0 0     0 carp "WARNING: min * num_kmers >= maxtotal ",
50             "($args{min} * $num_kmers >= $args{maxtotal}), ",
51             "no input is processed"
52             if defined $args{maxtotal} and $freq_total >= $args{maxtotal};
53             }
54 3         6 my $i = 0;
55 3         1835 INPUT: while (<$in_fh>) {
56 4         12 chomp;
57 4 50       16 if ($self->{verbose} > 1) {
58 0 0       0 if (not ++$i % 10_000) {
59 0         0 print STDERR "log_prop_kmers: processing input line $i\n";
60             }
61             }
62 4         127 foreach (/$re/g) {
63 23 100 100     79 last INPUT if defined $args{maxtotal} and $freq_total >= $args{maxtotal};
64             # count only allowed k-mers:
65 21 100       61 next unless defined $freq{$_};
66             # For the first occurrence of the k-mer, $freq{$_} is changed from $args{min}
67             # to 1. Thus, subtract $args{min} from $freq_total.
68             # $freq{$_} is assigned to $args{min} separately, below,
69             # because testing if $freq{$_} is not ambiguous, and $freq{$_} == $args{min}
70             # is ambiguous for $args{min} = 1 and for k-mer that actually occurred once.
71             # Note that k-mers that occur only once do not increase $freq_total
72             # if $args{min} = 1
73 15 100       37 $freq_total -= $args{min} unless $freq{$_};
74 15         14 $freq_total++;
75 15         29 $freq{$_}++;
76             }
77             }
78 3 50       16 print STDERR "freq_total=$freq_total" if $self->{verbose} > 1;
79 3         11848 foreach (keys %freq) {
80 52728 100       132604 $freq{$_} = $args{min} if $freq{$_} < $args{min};
81             }
82             #warn 'log_prop_kmers: all: ', join "; ", map { "$_ => $freq{$_}" } sort keys %freq;
83             #warn 'log_prop_kmers: freq: ', join "; ", map { "$_ => $freq{$_}" } qw(acc ppp 123);
84 3   50     7247 $freq_total ||= 1; # prevent division by 0
85             #warn "freq_total=$freq_total";
86 3 50       18 return unless keys %freq;
87 3         9390 my %log_prop = map { ( $_ =>
  52728         245743  
88             sprintf("%.2f",
89             ( log($freq{$_} / $freq_total) / log(10) )
90             )
91             )
92             } keys %freq;
93             #warn 'log_prop_kmers: log_prop: ', join "; ", map { "$_ => $log_prop{$_}" } qw(acc ppp 123);
94 3         55630 return %log_prop;
95             }
96              
97             =head2 kmers
98              
99             Args : %named_parameters:
100             mandatory:
101             k : length of the k-mer. Allowed values: positive integer.
102             Example : $pk->kmers(k => 3))[0,1,-1] # returns qw(aaa aab zzz)
103             Description : Creates a list of different allowed k-mers (words of length k, using lowercase chars a-z).
104             Returns : resulting list
105              
106             =cut
107              
108             sub kmers {
109 4     4 1 956 my ($self, %args) = @_;
110 4 50 0     18 $args{k} > 0 or carp "not ok: got k = $args{k}, expected > 0" and return;
111 4         12 my @kmers = ('');
112             # grow @kmers by adding to each existing el 1 char from the list of allowed chars
113 4         54 for my $i (1..$args{k}) {
114 12         16 my @kmers_new;
115 12         22 foreach my $kmer (@kmers) {
116             # keep allowed chars in kmers(): 'a'..'z' in sync with
117             # WordPropProtein() args : lc of tr/A-Za-z//cd;
118 2812         5483 foreach my $char ( 'a'..'z' ) {
119 73112         109200 push @kmers_new, "$kmer$char";
120             }
121             }
122 12         18304 @kmers = @kmers_new;
123             }
124             #warn "@kmers";
125 4         19828 return @kmers;
126             }
127              
128             1;