File Coverage

blib/lib/Bio/ViennaNGS/Expression.pm
Criterion Covered Total %
statement 24 108 22.2
branch 0 16 0.0
condition n/a
subroutine 8 11 72.7
pod 3 3 100.0
total 35 138 25.3


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2015-02-06 16:29:16 mtw>
3              
4             package Bio::ViennaNGS::Expression;
5              
6 1     1   1408 use version; our $VERSION = qv('0.12_15');
  1         2  
  1         6  
7 1     1   77 use Moose;
  1         2  
  1         7  
8 1     1   4834 use Carp;
  1         2  
  1         61  
9 1     1   4 use Data::Dumper;
  1         2  
  1         39  
10 1     1   3 use Path::Class;
  1         2  
  1         43  
11 1     1   4 use Bio::ViennaNGS::Bed;
  1         1  
  1         20  
12 1     1   5 use Bio::ViennaNGS::Util qw(sortbed);
  1         5  
  1         37  
13              
14 1     1   5 use namespace::autoclean;
  1         2  
  1         9  
15              
16              
17             has 'readcountfile' => (
18             is => 'rw',
19             predicate => 'has_readcountfile',
20             );
21              
22             has 'data' => (
23             is => 'rw',
24             isa => 'ArrayRef',
25             default => sub { [] },
26             );
27              
28             has 'conds' => (
29             is => 'rw',
30             isa => 'Int',
31             predicate => 'has_conds',
32             );
33              
34             has 'nr_features' => (
35             is => 'rw',
36             isa => 'Int',
37             predicate => 'has_features',
38             );
39              
40              
41             sub parse_readcounts_bed12 {
42 0     0 1   my ($self,$file) = @_;
43 0           my @mcData = ();
44 0           my ($i,$n) = 0x2;
45 0           my $this_function = (caller(0))[3];
46              
47 0 0         croak "ERROR [$this_function] readcount / multicov file $self->readcountfile not available\n"
48             unless (-e $file);
49 0           $self->readcountfile($file);
50 0 0         open (RC_IN, "< $file") or croak $!;
51              
52 0           while (<RC_IN>){
53 0           $n++;
54 0           chomp;
55             # 0:chr|1:start|2:end|3:name|4:score|5:strand
56             # 6:thickStart|7:thickEnd|8:itemRgb|9:blockCount|
57             # 10:blockSizes|11:blockStarts
58 0           @mcData = split(/\t/);
59 0           my $conditions = (scalar @mcData)-12; # multicov extends BED12
60 0           $self->conds($conditions);
61              
62             # NOTE: Better keep BED12 entries in a hash, generating UUIDs as
63             # keys instead of storing the same BED12 entry n times (ie for
64             # each sample) in $self->data
65              
66 0           my $bedobj = Bio::ViennaNGS::Bed->new(chromosome => $mcData[0],
67             start => $mcData[1],
68             end => $mcData[2],
69             name => $mcData[3],
70             score => $mcData[4],
71             strand => $mcData[5],
72             thickStart => $mcData[6],
73             thickEnd => $mcData[7],
74             itemRgb => $mcData[8],
75             blockCount => $mcData[9],
76             blockSizes => $mcData[10],
77             blockStarts => $mcData[11],
78             );
79 0           my $len = $bedobj->length;
80 0           my $id = sprintf("%s:%d-%d_%s_%s", $mcData[0],$mcData[1],
81             $mcData[2],$mcData[3],$mcData[5]);
82             #print "\$id: $id\n";
83 0           for ($i=0;$i<$self->conds;$i++){
84 0           ${$self->data}[$i]{$id} = {
  0            
85             bed_entry => $bedobj,
86             length => $len,
87             count => $mcData[eval(12+$i)],
88             };
89             # print Dumper(${$self->data}[$i]);
90             }
91             }
92 0           $self->nr_features($n);
93 0           close(RC_IN);
94             }
95              
96             sub write_expression_bed12 {
97 0     0 1   my ($self,$item,$dest,$base_name) = @_;
98 0           my ($bedname,$bedname_u,$outfile,$feat,$i);
99 0           my $this_function = (caller(0))[3];
100              
101 0 0         croak "ERROR [$this_function]: $dest does not exist\n"
102             unless (-d $dest);
103              
104 0           $bedname = $base_name.".".$item.".multicov.bed12";
105 0           $bedname_u = $base_name.".".$item.".multicov.u.bed12";
106              
107 0           $outfile = file($dest,$bedname_u);
108              
109 0 0         croak "ERROR [$this_function]: $self->conds not available\n"
110             unless ($self->has_conds);
111              
112 0 0         croak "ERROR [$this_function]: $self->nr_features not available\n"
113             unless ($self->has_features);
114              
115             #print "=====> write_multicov: writing multicov file $bedfile with ".
116             # eval($self->nr_features)." lines and ".eval($self->conds)." conditions\n";
117              
118             # check whether each element in @{$self->data} has the same amount of entries
119 0           for($i=0;$i<$self->conds;$i++){
120 0           my $fc = scalar keys %{$self->data->[$i]}; # of keys in %{$featCount}
  0            
121             #print "condition $i => $fc keys\n";
122 0 0         croak "ERROR [$this_function]: unequal element count in @{$self->data}"
  0            
123             unless($self->nr_features == $fc)
124             }
125              
126 0 0         open (MULTICOV_OUT, "> $bedname_u") or croak $!;
127              
128             # use BED12 data stored with condition 0 here, assuming its the same for all conditions
129 0           foreach $feat (keys %{${$self->data}[0]} ){
  0            
  0            
130 0           my @mcLine = ();
131             # retrieve BED12 line first
132 0           my $bedo = ${${$self->data}[0]}{$feat}{bed_entry};
  0            
  0            
133 0           my $bedline = $bedo->as_bed_line(12);
134 0           push @mcLine, $bedline;
135              
136             # process multicov values for all samples
137 0           for($i=0;$i<$self->conds;$i++){
138             # print "------------> "; print "processing $i th condition "; print "<-----------\n";
139 0           croak "Could not find item $feat in mcSample $i\n"
140 0 0         unless ( defined ${${$self->data}[$i]}{$feat} );
  0            
141 0           push @mcLine, ${${$self->data}[$i]}{$feat}{$item};
  0            
  0            
142              
143             }
144 0           print MULTICOV_OUT join("\t",@mcLine)."\n";
145             }
146 0           close(MULTICOV_OUT);
147              
148 0           sortbed($bedname_u,"./",$bedname,1,undef); # sort bed file
149             }
150              
151              
152             sub computeTPM {
153 0     0 1   my ($self,$sample,$rl) = @_;
154 0           my ($TPM,$T,$totalTPM) = (0)x3;
155 0           my ($i,$meanTPM);
156              
157             # iterate through $self->data[$i] twice:
158             # 1. for computing T (denominator in TPM formula)
159 0           foreach $i (keys %{${$self->data}[$sample]}){
  0            
  0            
160 0           my $count = ${${$self->data}[$sample]}{$i}{count};
  0            
  0            
161 0           my $length = ${${$self->data}[$sample]}{$i}{length};
  0            
  0            
162             #print "count: $count\nlength: $length\n";
163 0           $T += $count * $rl / $length;
164             }
165              
166             # 2. for computng actual TPM values
167 0           foreach $i (keys %{${$self->data}[$sample]}){
  0            
  0            
168 0           my $count = ${${$self->data}[$sample]}{$i}{count};
  0            
  0            
169 0           my $length = ${${$self->data}[$sample]}{$i}{length};
  0            
  0            
170 0           $TPM = 1000000 * $count * $rl/($length * $T);
171 0           ${${$self->data}[$sample]}{$i}{TPM} = $TPM;
  0            
  0            
172 0           $totalTPM += $TPM;
173             }
174              
175 0           $meanTPM = $totalTPM/$self->nr_features;
176              
177             # print Dumper(${$self->data}[$sample]);
178             # print "totalTPM=$totalTPM | meanTPM=$meanTPM\n";
179              
180              
181 0           return $meanTPM;
182             }
183              
184              
185             __PACKAGE__->meta->make_immutable;
186              
187             1;
188             __END__
189              
190              
191             =head1 NAME
192              
193             Bio::ViennaNGS::Expression - An object oriented interface for
194             read-count based gene expression
195              
196             =head1 SYNOPSIS
197              
198             use Bio::ViennaNGS::Expression;
199              
200             my $expression = Bio::ViennaNGS::Expression->new();
201              
202             # parse read counts from an extended BED12 file
203             $expression->parse_readcounts_bed12("$bed12");
204              
205             # compute normalized expression of ith sample in Transcript per Million (TPM)
206             $expression->computeTPM($i, $readlength);
207              
208             # write extended BED12 file with TPM for each condition past
209             # the 12th column
210             $expression->write_expression_bed12("TPM", $dest, $basename);
211              
212             =head1 DESCRIPTION
213              
214             This module provides a L<Moose> interface for computation of gene /
215             transcript expression from read counts.
216              
217             =head1 METHODS
218              
219             =over
220              
221             =item parse_readcounts_bed12
222              
223             Title : parse_readcounts_bed12
224              
225             Usage : C<$obj-E<gt>parse_readcounts_bed12($file);>
226              
227             Function : Parses a bedtools multicov (multiBamCov) file, i.e. an
228             extended BED12 file, into an Array of Hash of Hashes data
229             structure (C<@{$self-E<gt>data}>).
230              
231             Args : C<$file> is the input file, i.e. and extended BED12 file where
232             each column past the 12th lists read counts for this bedline's
233             feature(s) for a specific sample/condition.
234              
235             Returns :
236              
237             Notes : This method evaluates the number of samples/conditions present
238             in the input, i.e. the number of columns extending the
239             canonical BED12 columns in the input multicov file and
240             populates C<$self-E<gt>conds>. Also populates
241             C<$self-E<gt>nr_features> with the number of genes/features
242             present in the input (evidently, this should be the same for
243             each sample/condition in the input).
244              
245             =item computeTPM
246              
247             Title : computeTPM
248              
249             Usage : C<$obj-E<gt>computeTPM($sample, $readlength);>
250              
251             Function : Computes expression of each gene/feature present in
252             C<$self-E<gt>data> in Transcript per Million (TPM) [Wagner
253             et.al. Theory Biosci. (2012)]. is a reference to a Hash of
254             Hashes data straucture where keys are feature names and
255             values hold a hash that must at least contain length and
256             raw read counts. Practically, C<$featCount_sample> is
257             represented by _one_ element of C<@featCount>, which is
258             populated from a multicov file by C<parse_multicov()>.
259              
260             Args : C<$sample> is the sample index of C<@{$self-E<gt>data}>. This is
261             especially handy if one is only interested in computing
262             normalized expression values for a specific sample, rather
263             than all samples in multicov BED12 file. C<$readlength> is the
264             read length of the RNA-seq sequencing experiment.
265              
266             Returns : Returns the mean TPM of the processed sample, which is
267             invariant among samples. (TPM models relative molar
268             concentration and thus fulfills the invariant average
269             criterion.)
270              
271             =item write_expression_bed12
272              
273             Title : write_expression_bed12
274              
275             Usage : C<$obj-E<gt>write_expression_bed12($measure,
276             $dest,$basename);>
277              
278             Function : Writes normalized expression data to a bedtools multicov
279             (multiBamCov)-type BED12 file.
280              
281             Args : C<$measure> specifies the type in which normalized expression
282             data from C<@{$self-E<gt>data}> is dumped. Currently supports
283             'TPM', however 'RPKM' support will be added in a future
284             release. These values must have been computed and inserted into
285             C<@{self-E<gt>data}> beforehand by
286             e.g. C<$self-E<gt>computeTPM()>. C<$dest> and C<$base_name>
287             give path and base name of the output file, respectively.
288              
289             Returns : None. The output is position-sorted extended BED12 file.
290              
291             =back
292              
293             =head1 DEPENDENCIES
294              
295             =over
296              
297             =item L<Moose>
298              
299             =item L<Carp>
300              
301             =item L<Path::Class>
302              
303             =item L<namespace::autoclean>
304              
305             =back
306              
307             =head1 SEE ALSO
308              
309             =over
310              
311             =item L<Bio::ViennaNGS>
312              
313             =item L<Bio::ViennaNGS::Bed>
314              
315             =item L<Bio::ViennaNGS::Util>
316              
317             =back
318              
319             =head1 AUTHOR
320              
321             Michael T. Wolfinger, E<lt>michael@wolfinger.euE<gt>
322              
323             =head1 COPYRIGHT AND LICENSE
324              
325             Copyright (C) 2015 by Michael T. Wolfinger
326              
327             This library is free software; you can redistribute it and/or modify
328             it under the same terms as Perl itself, either Perl version 5.10.0 or,
329             at your option, any later version of Perl 5 you may have available.
330              
331             This software is distributed in the hope that it will be useful, but
332             WITHOUT ANY WARRANTY; without even the implied warranty of
333             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
334              
335             =cut