File Coverage

blib/lib/Bio/FdrFet.pm
Criterion Covered Total %
statement 270 297 90.9
branch 69 102 67.6
condition 19 33 57.5
subroutine 30 30 100.0
pod 13 15 86.6
total 401 477 84.0


line stmt bran cond sub pod time code
1             package Bio::FdrFet;
2              
3 1     1   16296 use 5.008;
  1         2  
  1         29  
4 1     1   4 use strict;
  1         1  
  1         32  
5 1     1   3 use warnings;
  1         4  
  1         33  
6 1     1   4 use Carp;
  1         1  
  1         2866  
7              
8             require Exporter;
9              
10             our @ISA = qw(Exporter);
11              
12             # Items to export into callers namespace by default. Note: do not export
13             # names by default without a very good reason. Use EXPORT_OK instead.
14             # Do not simply export all your public functions/methods/constants.
15              
16             # This allows declaration use Bio::FdrFet ':all';
17             # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
18             # will save memory.
19             our %EXPORT_TAGS = ( 'all' => [ qw(
20            
21             ) ] );
22              
23             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
24              
25             our @EXPORT = qw(
26            
27             );
28              
29             our $VERSION = '0.05';
30              
31             1;
32            
33             =head1 NAME
34              
35             Bio::FdrFet - Perl extension for False Discovery Rate and Fisher Exact Test applied to pathway analysis.
36              
37             =head1 SYNOPSIS
38              
39             use Bio::FdrFet;
40             my $obj = new Bio::FdrFet($fdrcutoff);
41              
42             open (IN, $pathwayfile) ||
43             die "can not open pathway annotation file $pathwayfile: $!\n";
44             while () {
45             chomp;
46             my ($gene, $dbacc, $desc, $rest) = split (/\t/, $_, 4);
47             $obj->add_to_pathway("gene" => $gene,
48             "dbacc" => $dbacc,
49             "desc" => $desc);
50             }
51             close IN;
52              
53             #read in genes and associated p values
54             my (%genes, @fdrs);
55             open (IN, $genefile) || die "can not open gene file $genefile: $!\n";
56             my $ref_size = 0;
57             while () {
58             my ($gene, $pval) = split (/\t/, $_);
59             $obj->add_to_genes("gene" => $gene,
60             "pval" => $pval);
61             $ref_size++;
62             }
63             close IN;
64             $obj->gene_count($ref_size);
65             $obj->calculate;
66             foreach my $pathway ($obj->pathways('sorted')) {
67             my $logpval = $obj->pathway_result($pathway, 'LOGPVAL');
68             printf "Pathway $pathway %s has - log(pval) = %6.4f",
69             $obj->pathway_desc($pathway),
70             $logpval;
71             }
72              
73             =head2 Constructor
74              
75             $obj = new Bio::FdrFet($fdrcutoff);
76              
77             # You can also use $obj = new Bio::FdrFet->new($fdrcutoff);
78              
79             =head2 Object Methods
80              
81             =head3 Input Methods
82              
83             $obj->fdr_cutoff($new_cutoff);
84             $obj->universe($universe_option);
85             $obj->verbose($new_verbose);
86             $obj->add_to_pathway("gene" => ,
87             "dbacc" => ,
88             "desc" => );
89             $obj->add_to_genes("gene" => ,
90             "pval" => );
91              
92             =head3 Output Methods
93              
94             $obj->genes;
95             $obj->pathways[($order)];
96             $obj->pathway_result($pathway, $data_name[, $all_flag]);
97             $obj->pathway_desc($pathway);
98             $obj->pathway_genes($pathway);
99             $obj->fdr_position($fdr);
100              
101             =head3 Other Methods
102              
103             $obj->gene_count[($fet_gene_count)];
104             $obj->calculate;
105              
106             =head1 DESCRIPTION
107              
108             Bio::FdrFet implements the False Discovery Rate Fisher Exact Test of gene
109             expression analysis applied to pathways described in the paper by
110             Ruiru Ji, Karl-Heinz Ott, Roumyana Yordanova, and Robert E
111             Bruccoleri. A copy of the paper is included with the distribution in
112             the file, Fdr-Fet-Manuscript.pdf.
113              
114             The module is implemented using a simple object oriented paradigm
115             where the object stores all the information needed for a calculation
116             along with a state variable, C. The state variable has two
117             possible values, C<'setup'> and C<'calculated'>. The value of
118             C<'setup'> indicates that the object is being setup with data, and any
119             results in the object are inconsistent with the data. The value of
120             C<'calculated'> indicates that the object's computational results are
121             consistent with the data, and may be returned to a calling program.
122              
123             The C<'calculate'> method is used to update all the calculated values
124             from the input data. It checks the state variable first, and only does
125             the calculation if the state is C<'setup'>. Once the calculations are
126             complete, then the state variable is set to C<'calculated'>. Thus, the C method
127             can be called whenever a calculated value is needed, and there is no
128             performance penalty.
129              
130             The module initializes the C object with a state of
131             C<'setup'>. Any data input sets the state to C<'setup'>. Any requests
132             for calculated data, calls C<'calculate'>, which updates the state
133             variable so futures requests for calculated data return quickly.
134              
135             =head1 METHODS
136              
137             The following methods are provided:
138              
139             =over 4
140              
141             =cut
142            
143              
144             =item C
145              
146             Creates a new Bio::FdrFet object. The optional parameter is the False
147             Discovery Rate cutoff in units of percent. See the C
148             method below for more details.
149              
150             =cut
151              
152             sub new {
153 4     4 1 41569 my $pkg;
154 4         17 my $class = shift;
155 4         12 eval {($pkg) = caller(0);};
  4         54  
156 4 50       24 if ($class ne $pkg) {
157 0         0 unshift @_, $class;
158             }
159 4         10 my $self = {};
160 4         10 bless $self;
161 4         9 my $cutoff = shift;
162 4 50       17 $cutoff = 35 if not defined $cutoff;
163 4         21 $self->{STATE} = "setup";
164 4         21 $self->{FDR_CUTOFF} = _check_fdr_cutoff($cutoff);
165 4         11 $self->{PATHWAYS} = {};
166 4         12 $self->{GENES} = {};
167 4         14 $self->{INPUT_GENE_COUNT} = 0;
168 4         9 $self->{GENE_COUNT} = undef;
169 4         9 $self->{VERBOSE} = 1;
170 4         13 $self->{UNIVERSE} = "genes";
171 4         17 return $self;
172             }
173              
174             =item C
175              
176             Retrieves the current setting for the False Discovery Rate threshold,
177             and optionally sets it. This threshold must be an integer greater than
178             0 and less than or equal to 100, and is divided by 100 for the value
179             used by the computation.
180              
181             =cut
182              
183             sub fdr_cutoff {
184              
185 8     8 1 13 my $self = shift;
186 8         12 my $new_cutoff = shift;
187 8 100       22 if (defined($new_cutoff)) {
188 4         11 $self->{FDR_CUTOFF} = _check_fdr_cutoff($new_cutoff);
189 4         37 $self->{STATE} = 'setup';
190             }
191 8         30 return $self->{FDR_CUTOFF};
192             }
193              
194             =item C
195              
196             Retrieves the current setting for the verbose parameter
197             and optionally sets it. It can be either 0, no verbosity, or 1, lots
198             of messages sent to STDERR.
199              
200             =cut
201              
202             sub verbose {
203              
204 4     4 1 10 my $self = shift;
205 4         6 my $new_verbose = shift;
206 4 50       15 if (defined($new_verbose)) {
207 4         10 $self->{VERBOSE} = $new_verbose;
208             }
209 4         16 return $self->{VERBOSE};
210             }
211              
212             =item C
213              
214             Retrieves the current setting for the B option
215             and optionally sets it. The B option specifies how the number of
216             genes in our statistical universe is calculated. There are four possible
217             settings to this option:
218              
219             =over 2
220              
221             =item union
222              
223             The universe is the union of all gene names specified
224             individually and in pathways. Genes which have no P value are counted in the universe,
225             but they are not counted as regulated in the FET or FDR calculations.
226              
227             =item genes
228              
229             Only genes specified by the add_to_genes method count.
230              
231             =item intersection
232              
233             Only genes in the intersection of the gene list and pathways are used for the universe
234             calculation
235              
236             =item user
237              
238             The user specifies the universe size by calling the C method with an argument.
239              
240             =back
241              
242             =cut
243              
244             sub universe {
245              
246 4     4 1 10211 my $self = shift;
247 4         13 my $new_universe = shift;
248 4 50       18 if (defined($new_universe)) {
249 4         13 $new_universe = lc($new_universe);
250 4 50       32 if ($new_universe =~ m/^(user|union|genes|intersection)$/) {
251 4         11 $self->{UNIVERSE} = $new_universe;
252             }
253             else {
254 0         0 confess "Bad value ($new_universe) for setting universe option\n";
255             }
256             }
257 4         23 return $self->{UNIVERSE};
258             }
259              
260             =item C<< add_to_pathway( >>
261              
262             "gene" => ,
263             "dbacc" => ,
264             "desc" => )
265              
266             Adds a gene to a pathway and also defines a pathway. The arguments are
267             specified as a pseudo hash, with each argument being preceded by its
268             name.
269              
270             Pathways are defined by an accession key (C parameter), a
271             description (C parameter, and a set of genes (specified by the
272             Cparameter). To use this function to specify a pathway with
273             multiple genes, you call this method multiple times with the same
274             accession key and description, and vary the gene name. The gene names
275             are just arbitrary strings, but they must match the values used for
276             specifying Probability Values (pvalues) used by the C
277             method.
278              
279             =cut
280              
281             sub add_to_pathway {
282            
283 5380     5380 1 33404 my $self = shift;
284 5380         9878 my %arg = &_check_args(\@_, "gene", "dbacc", "desc");
285 5380 100 66     27672 if (exists($self->{PATHWAYS}->{$arg{dbacc}}) and
286             exists($self->{PATHWAYS}->{$arg{dbacc}}->{DESC})) {
287 5352 50       13557 if ($arg{desc} ne $self->{PATHWAYS}->{$arg{dbacc}}->{DESC}) {
288 0         0 confess(sprintf("Gene %s has a dbacc = %s with descriptor of %s does not match previous entry descriptor = %s\n",
289             $arg{gene},
290             $arg{dbacc},
291             $arg{desc},
292             $self->{PATHWAYS}->{$arg{dbacc}}));
293             }
294             }
295             else {
296 28         117 $self->{PATHWAYS}->{$arg{dbacc}}->{DESC} = $arg{desc};
297             }
298 5380         4993 push (@{$self->{PATHWAYS}->{$arg{dbacc}}->{GENES}}, $arg{gene});
  5380         12455  
299 5380         4734 push(@{$self->{GENES}->{$arg{gene}}->{PATHWAYS}}, $arg{dbacc});
  5380         18227  
300 5380         22370 $self->{STATE} = 'setup';
301             }
302              
303             =item C<< add_to_genes( >>
304              
305             "gene" => ,
306             "pval" => )
307              
308              
309             Adds a probability values for a gene in the calculation. The arguments
310             are specified using a pseudo hash with the nameof parameter preceding
311             its value. The gene names must match those used in the pathways. The
312             probability values are estimates of non-randomness and should range
313             from 0 to 1.
314              
315             =cut
316              
317             sub add_to_genes {
318 45396     45396 1 48285 my $self = shift;
319 45396         75713 my %arg = &_check_args(\@_, "gene", "pval");
320 45396 50 33     208955 if (exists($self->{GENES}->{$arg{gene}}->{PVAL}) and
321             $self->{GENES}->{$arg{gene}}->{PVAL} != $arg{pval}) {
322 0         0 confess sprintf("Gene %s has a pre-existing pval (%g) different than the current argument = %g\n",
323             $arg{gene},
324             $self->{GENES}->{$arg{gene}}->{PVAL},
325             $arg{pval});
326             }
327 45396         90524 $self->{GENES}->{$arg{gene}}->{PVAL} = $arg{pval};
328 45396         54900 $self->{STATE} = 'setup';
329 45396         122035 $self->{INPUT_GENE_COUNT} += 1;
330             }
331              
332             =item C
333              
334             Returns the list of gene names in the system.
335              
336             =cut
337              
338             sub genes {
339 4     4 1 17 my $self = shift;
340 4         7 return keys %{$self->{GENES}};
  4         30  
341             }
342              
343             =item C
344              
345             Returns the list of pathways. If the optional argument, C<$order>, is
346             specified and contains the word, C<"sorted">, (comparison is case
347             insensitive), then the object will return the pathways in order of
348             most significant to least. If sorting is done, then the object will
349             update the calculation of probability values, whereas if no sorting is
350             done, then the object does no calculation.
351              
352             =cut
353              
354             sub pathways {
355 9     9 0 1570 my $self = shift;
356 9         21 my $order = shift;
357 9 100       515 if (lc($order) ne 'sorted') {
358 5         12 return keys %{$self->{PATHWAYS}};
  5         64  
359             }
360             else {
361 4         16 $self->calculate;
362 51         136 return sort { $self->{PATHWAYS}->{$b}->{BEST_RESULTS}->{LOGPVAL} <=>
  4         39  
363 4         10 $self->{PATHWAYS}->{$a}->{BEST_RESULTS}->{LOGPVAL} } keys %{$self->{PATHWAYS}};
364             }
365             }
366              
367             =item C
368              
369             Returns a calculated result for a pathway. The following values may be
370             used for C<$data_name>. Case of C<$data_name> does not matter.
371              
372             LOGPVAL -log10(probability value for pathway).
373             PVAL probability value for pathway
374             ODDS Odds ratio.
375             Q Number of genes in the pathway passing the FDR cutoff
376             M Number of genes overall passing the FDR cutoff
377             N Number of genes in the system minus C above.
378             K Number of genes in the pathway.
379             FDR FDR cutoff in percent giving the best pvalue.
380             LOCI Reference to an array of gene names in the pathway
381             that satisfy FDR cutoff.
382              
383             If C<$all_flag> is specified and has the value, "all", then this
384             returns an array of values for all the attempted FDR cutoffs, except
385             for the c cutoff.
386              
387             =cut
388              
389             sub pathway_result {
390 7736     7736 1 1539972 my $self = shift;
391 7736         15293 my $pathway = $self->_check_pathway_arg(shift(@_));
392 7736         11911 my $name = uc(shift(@_));
393 7736         8985 my $all_option = shift(@_);
394 7736 100       15831 $all_option = "" if not defined $all_option;
395 7736         7579 $all_option = uc($all_option);
396            
397 7736         14172 $self->calculate;
398 7736 50       16737 if (not exists($self->{PATHWAYS}->{$pathway})) {
399 0         0 confess "Pathway ($pathway) not found.\n";
400             }
401 7736 100       14474 if ($all_option eq 'ALL') {
402 665 50       1307 if ($name eq 'FDR') {
403 0         0 confess "All option not valid for FDR result.\n";
404             }
405 665 50       2379 if (not exists($self->{PATHWAYS}->{$pathway}->{ALL_RESULTS}->{$name})) {
406 0         0 confess "Pathway all_results ($name) not found.\n";
407             }
408 665         673 return @{$self->{PATHWAYS}->{$pathway}->{ALL_RESULTS}->{$name}};
  665         10545  
409             }
410             else {
411 7071 50       19854 if (not exists($self->{PATHWAYS}->{$pathway}->{BEST_RESULTS}->{$name})) {
412 0         0 confess "Pathway results ($name) not found.\n";
413             }
414 7071         39370 return $self->{PATHWAYS}->{$pathway}->{BEST_RESULTS}->{$name};
415             }
416             }
417              
418             =item C
419              
420             Returns the description field of the specified pathway.
421              
422             =cut
423              
424             sub pathway_desc {
425 999     999 1 91517 my $self = shift;
426 999         2445 my $pathway = $self->_check_pathway_arg(shift(@_));
427 999         6583 return $self->{PATHWAYS}->{$pathway}->{DESC};
428             }
429              
430             =item C
431              
432             Returns an array containing the genes of the specified pathway.
433              
434             =cut
435              
436             sub pathway_genes {
437 28     28 1 7605 my $self = shift;
438 28         72 my $pathway = $self->_check_pathway_arg(shift(@_));
439 28         33 return @{$self->{PATHWAYS}->{$pathway}->{GENES}};
  28         210  
440             }
441              
442              
443             =item C
444              
445             Returns the position in the gene list for a specific FDR value. The
446             C<$fdr> variable must be an integer between 1 and the FDR cutoff.
447              
448             =cut
449              
450             sub fdr_position {
451 4     4 1 9 my $self = shift;
452 4         8 my $fdr = shift;
453 4 50 33     59 if ($fdr < 1 or
      33        
454             $fdr > $self->{FDR_CUTOFF} or
455             int($fdr) != $fdr) {
456 0         0 confess "Invalid FDR value = $fdr passed to fdr_position.\n";
457             }
458 4         45 return $self->{FDRS}->[$fdr-1];
459             }
460              
461              
462             =item C
463              
464             Returns the count of genes in the system which the size of the union
465             of the gene names seen from both the C and
466             C methods. This value is used in the Fisher Exact Test
467             calculation. You can change the total gene count value used in the
468             calculation by specifying a parameter to this method.
469              
470             =cut
471              
472             sub gene_count {
473 1     1 1 4 my $self = shift;
474 1 50       20 if (defined($_[0])) {
475 1 50       7 if ($self->{UNIVERSE} eq 'user') {
476 1         3 $self->{GENE_COUNT} = $_[0];
477 1         4 $self->{STATE} = 'setup';
478             }
479             else {
480 0         0 confess "Gene count can be updated only if the universe option is set to 'user'";
481             }
482             }
483 1         4 return $self->{GENE_COUNT};
484             }
485              
486             =item C
487              
488             Run the FDR FET calculation.
489              
490             =cut
491              
492             sub calculate {
493 7744     7744 1 7326 my $self = shift;
494 7744 100       20567 return if $self->{STATE} eq 'calculated';
495 4 50       13 print STDERR "New calculation initiated.\n" if $self->{VERBOSE};
496 4         19 $self->_sort_by_pvals;
497 4         4049 $self->_clean_genes;
498 4         23 $self->_calc_fdrs;
499 4         38 $self->_calculate_fets;
500 4         36 $self->{STATE} = 'calculated';
501             }
502            
503             # Internal procedures.
504              
505             sub _check_fdr_cutoff {
506 8     8   10 my $cutoff = shift;
507 8 50 33     96 if ($cutoff > 0 and
      33        
508             $cutoff <= 100 and
509             $cutoff == int($cutoff)) {
510 8         27 return $cutoff;
511             }
512             else {
513 0         0 confess "New fdr_cutoff ($cutoff) is outside the range (0, 100] or is not an integer.\n";
514 0         0 return undef; # We shouldn't get here, but if so, return something that will cause problems.
515             }
516             }
517              
518             sub _check_args {
519             # Check validity of argument list.
520             # First argument is reference to argument list.
521             # Remaining arguments are required arguments.
522 50776     50776   48921 my $arg_ref = shift;
523 50776 50       38403 if (scalar(@{$arg_ref}) % 2 == 1) {
  50776         108764  
524 0         0 confess "Argument to caller of _check_args has an odd number of elements and is not interpretable as a hash.\n";
525             }
526 50776         43529 my %args = @{$arg_ref};
  50776         123608  
527 50776         63070 my @missing = ();
528 50776         66325 foreach my $arg (@_) {
529 106932 50       226244 if (not exists($args{$arg})) {
530 0         0 push (@missing, $arg);
531             }
532             }
533 50776 50       92800 if (scalar(@missing)) {
534 0         0 confess sprintf("Argument(s) %s missing to caller of _check_args.\n",
535             join(", ", @missing));
536             }
537 50776         198045 return %args;
538             }
539              
540             sub _check_pathway_arg {
541 8763     8763   8538 my $self = shift;
542 8763         11205 my $pathway = shift;
543 8763 50       24478 if (not exists($self->{PATHWAYS}->{$pathway})) {
544 0         0 confess "Pathway ($pathway) not found.\n";
545             }
546 8763         14759 return $pathway;
547             }
548              
549             sub _sort_by_pvals {
550 5     5   11 my $self = shift;
551 567591 100 100     2757621 $self->{SORTED_GENES} = [ sort
    100          
    100          
552 5         20955 { ( defined ($self->{GENES}->{$a}->{PVAL}) and
553             defined ($self->{GENES}->{$b}->{PVAL})) ?
554             $self->{GENES}->{$a}->{PVAL} <=> $self->{GENES}->{$b}->{PVAL} :
555             ( defined ($self->{GENES}->{$a}->{PVAL}) ? -1 :
556             defined ($self->{GENES}->{$b}->{PVAL}) ? 1 :
557             $a cmp $b ) }
558 5         10 keys %{$self->{GENES}} ];
559 5         6733 $self->{SORTED_PVALS} = [ map {$self->{GENES}->{$_}->{PVAL}} @{$self->{SORTED_GENES}} ];
  46455         115776  
  5         194  
560             }
561              
562             sub _calc_fdrs {
563 4     4   7 my $self = shift;
564 4         17 $self->{FDRS} = [];
565             #calculate FDRs based p values
566 4         30 for (my $i = 1; $i <= $self->{FDR_CUTOFF}; $i++) {
567 140         325 my $cutoff = $i / 100;
568 140         914 my $result = $self->_fdr($cutoff, $self->{SORTED_PVALS});
569 140         746 $self->{FDRS}->[$i-1] = $result;
570 140 50       1341 printf STDERR "FDR count at %d%% is %d\n", $i, $result if $self->{VERBOSE};
571             }
572             }
573              
574             sub _fdr {
575 140     140   327 my $self = shift;
576 140         373 my ($qlevel, $pvals) = @_;
577 140         257 my $i=1;
578 140         465 my $count=0;
579 140 100 100     5689 map { $count = ($i - 1) if (defined $_ and
  1225735         4939793  
580             $_ <= $qlevel * $i++ / $self->{GENE_COUNT}) } @$pvals; # / ) }; Fix Emacs cperl confusion.
581 140         809 return $count;
582             }
583              
584             sub _clean_genes {
585 4     4   13 my $self = shift;
586 4 100       28 if (not defined $self->{GENE_COUNT}) {
587 3         6 $self->{GENE_COUNT} = scalar(keys %{$self->{GENES}});
  3         17  
588             }
589 4 100       50 if ($self->{UNIVERSE} eq 'genes') {
    100          
    100          
    50          
590 1         8 $self->_clean_unused_genes;
591 1         15 $self->{GENE_COUNT} = $self->{INPUT_GENE_COUNT};
592             }
593             elsif ($self->{UNIVERSE} eq 'user') {
594 1         3 my $sum = 0;
595 1         8 foreach my $p ($self->pathways) {
596 7         8 $sum += scalar(@{$self->{PATHWAYS}->{$p}->{GENES}});
  7         21  
597             }
598 1 50 33     4 if (not ($self->{GENE_COUNT} >= scalar(keys %{$self->{GENES}}) and
  1         16  
599             $self->{GENE_COUNT} >= $sum)) {
600 0         0 confess sprintf("Gene count setting %d is too small. Gene count is %d and pathway gene count is %d\n",
601             $self->{GENE_COUNT},
602 0         0 scalar(keys %{$self->{GENES}}),
603             $sum);
604             }
605             }
606             elsif ($self->{UNIVERSE} eq 'union') {
607             # No action required.
608             }
609             elsif ($self->{UNIVERSE} eq 'intersection') {
610 1         2 foreach my $gene (keys %{$self->{GENES}}) {
  1         2765  
611 11434         18242 my $exists_pathway = exists($self->{GENES}->{$gene}->{PATHWAYS});
612 11434   66     30959 my $exists_pval =
613             (exists($self->{GENES}->{$gene}->{PVAL}) and
614             defined($self->{GENES}->{$gene}->{PVAL}));
615 11434 100 100     17852 if (not ($exists_pathway and $exists_pval)) {
616 10715 50       13738 printf STDERR "Gene $gene eliminated. exists_pathway = $exists_pathway " .
617             "exists_pval = $exists_pval\n" if $self->{VERBOSE};
618 10715         17074 delete $self->{GENES}->{$gene};
619             }
620             }
621 1         1578 $self->{GENE_COUNT} = scalar(keys %{$self->{GENES}});
  1         10  
622 1         9 $self->_sort_by_pvals;
623 1         814 $self->_clean_unused_genes;
624             }
625 4 50       27 printf STDERR "Gene count set to %d\n", $self->{GENE_COUNT} if $self->{VERBOSE};
626             }
627              
628             sub _clean_unused_genes {
629 2     2   13 my $self = shift;
630 2         53 my %seen;
631 2         5 foreach my $gene (@{$self->{SORTED_GENES}}) {
  2         11  
632 12153 100       33141 if (defined ($self->{GENES}->{$gene}->{PVAL})) {
633 12068         22653 $seen{$gene} = 1;
634             }
635             }
636 2         7 foreach my $pathway (keys %{$self->{PATHWAYS}}) {
  2         19  
637 14         22 my @new_gene_list = ();
638 14         11 my $old_size = scalar(@{$self->{PATHWAYS}->{$pathway}->{GENES}});
  14         37  
639 14         15 foreach my $pgene (@{$self->{PATHWAYS}->{$pathway}->{GENES}}) {
  14         32  
640 2690 100       4740 if (exists($seen{$pgene})) {
641 2410         3024 push (@new_gene_list, $pgene);
642             }
643             }
644 14         693 $self->{PATHWAYS}->{$pathway}->{GENES} = [ @new_gene_list ];
645 14         234 my $new_size = scalar(@new_gene_list);
646 14 50       3278 if ($self->{VERBOSE}) {
647 0 0       0 if ($old_size != $new_size) {
648 0         0 printf STDERR "Pathway %s size reduced from %d to %d\n", $pathway, $old_size, $new_size;
649             }
650             else {
651 0         0 printf STDERR "Pathway %s size left at %d\n", $pathway, $old_size;
652             }
653             }
654             }
655             }
656              
657             sub _calculate_fets {
658 4     4   15 my $self = shift;
659 4         44 foreach my $p (keys %{$self->{PATHWAYS}}) {
  4         49  
660 28         58 my @pids = @{$self->{PATHWAYS}->{$p}->{GENES}};
  28         2176  
661 28         71 my ($fdr, $fdr_gene_cutoff, @s);
662 0         0 my ($pval, $logpval, $q, $m, $n, $k, $bestpval, $oddratio, $bestratio);
663 0         0 my ($bestm, $bestn, $bestq, $bestfdr, $bestloci, $bestlogpval);
664 0         0 my (@q);
665 28         74 $k = scalar(@pids);
666 28         53 $bestpval = 1.1;
667 28         48 $bestlogpval = 0;
668 28         61 $bestratio = 0;
669 28         52 $bestm = 0;
670 28         72 $bestn = 0;
671 28         50 $bestq = 0;
672 28         56 $bestfdr = 100;
673 28         83 my $path_ref = $self->{PATHWAYS}->{$p};
674 28         149 for (my $fdr = $self->{FDR_CUTOFF} - 1; $fdr >= 0; $fdr--) {
675 980         2318 $fdr_gene_cutoff = $self->{FDRS}->[$fdr];
676 980 50       2109 last if ($fdr_gene_cutoff == 0);
677 980         997 $fdr_gene_cutoff--;
678 980         32932 @s = @{$self->{SORTED_GENES}}[0..$fdr_gene_cutoff];
  980         230923  
679            
680             #find intersection of pathway and regulated genes
681 980         47010 my %seen = ();
682 980         4358 undef @q;
683 980         2211 foreach my $g (@pids) {
684 178500         210504 $seen{$g} = 1;
685             }
686 980         2057 foreach my $g (@s) {
687 767655 100       1209119 push @q , $g if (defined $seen{$g});
688             }
689            
690             #calculate values in 2 by 2 contigency table
691 980         2630 $m = $fdr_gene_cutoff + 1;
692 980         2734 $n = $self->{GENE_COUNT} - $m;
693 980         1677 $q = scalar(@q);
694            
695             #FET calculation
696 980         9721 ($pval, $oddratio) = _fet($q, $m, $n, $k);
697 980 50       3345 if ($self->{VERBOSE}) {
698 0         0 printf STDERR "FET calculation for $p FDR = %d: P = %.3g Odds = %.2f q = $q m = $m n = $n k = $k\n",
699             $fdr+1, $pval, $oddratio;
700             }
701 980 50       2981 if ($pval == 0) {
702 0         0 $logpval = 1000;
703             }
704             else {
705 980         2619 $logpval = 0 - log10($pval); # The unary operator "-" does not work correctly
706             # if log10($pval) is 0. You get -0 for $logpval.
707             }
708 980         3250 $path_ref->{ALL_RESULTS}->{PVAL}->[$fdr] = $pval;
709 980         2113 $path_ref->{ALL_RESULTS}->{LOGPVAL}->[$fdr] = $logpval;
710 980         1870 $path_ref->{ALL_RESULTS}->{ODDS}->[$fdr] = $oddratio;
711 980         1920 $path_ref->{ALL_RESULTS}->{Q}->[$fdr] = $q;
712 980         2063 $path_ref->{ALL_RESULTS}->{M}->[$fdr] = $m;
713 980         1886 $path_ref->{ALL_RESULTS}->{N}->[$fdr] = $n;
714 980         1663 $path_ref->{ALL_RESULTS}->{K}->[$fdr] = $k;
715 980         8375 $path_ref->{ALL_RESULTS}->{LOCI}->[$fdr] = [ @q ];
716 980 100       18836 if ($pval < $bestpval) {
717 302         559 $bestpval = $pval;
718 302         488 $bestlogpval = $logpval;
719 302         401 $bestratio = $oddratio;
720 302         449 $bestm = $m;
721 302         332 $bestn = $n;
722 302         544 $bestq = $q;
723 302         448 $bestfdr = $fdr + 1;
724 302         9523 $bestloci = [ @q ];
725             }
726             }
727 28         135 &_complete_all_results_array($path_ref, 'PVAL', 1.0);
728 28         98 &_complete_all_results_array($path_ref, 'LOGPVAL', 0.0);
729 28         108 &_complete_all_results_array($path_ref, 'ODDS', 0.0);
730 28         94 &_complete_all_results_array($path_ref, 'Q', -1);
731 28         76 &_complete_all_results_array($path_ref, 'M', -1);
732 28         73 &_complete_all_results_array($path_ref, 'N', -1);
733 28         78 &_complete_all_results_array($path_ref, 'K', -1);
734 28         90 &_complete_all_results_array($path_ref, 'LOCI', [ ]);
735 28 50       155 if ($self->{VERBOSE}) {
736 0         0 print STDERR "FET calculation for $p Bestfdr = $bestfdr\n",
737             }
738 28         913 $path_ref->{BEST_RESULTS} = { PVAL => $bestpval,
739             LOGPVAL => $bestlogpval,
740             ODDS => $bestratio,
741             Q => $bestq,
742             M => $bestm,
743             N => $bestn,
744             K => $k,
745             FDR => $bestfdr,
746             LOCI => $bestloci };
747             }
748             }
749              
750             sub log10 {
751 980     980 0 1754 my $number = shift;
752 980         3714 return log($number)/log(10);
753             }
754              
755             sub _complete_all_results_array {
756 224     224   246 my $path_ref = shift;
757 224         270 my $datum = shift;
758 224         232 my $default = shift;
759 224         365 my $array_ref = $path_ref->{ALL_RESULTS}->{$datum};
760            
761 224         289 for (my $i = 0; $i < scalar(@{$array_ref}); $i++) {
  8064         12061  
762 7840 50       12822 $array_ref->[$i] = $default if not defined $array_ref->[$i];
763             }
764             }
765              
766 1     1   759 use Inline C => <<'END_OF_C_CODE', NAME => 'Bio::FdrFet::FastFet';
  1         16087  
  1         9  
767            
768             #include
769             #include
770             #include
771             #include
772             #include
773            
774             #define MAXIT 100
775             #define MAX_ITER 2000
776             #define TOL 1.0e-7
777             #define EPS 1.0e-12
778             #define EPS1 1.0e-13
779             #define DBL_EPSILON 2.220446049250313e-16
780             #define EPSKS1 0.001
781             #define EPSKS2 1.0e-8
782             #define FPMIN 1.0e-30
783             #define TINY 1.0e-20
784             #define MD fmod(1,0)
785             #define PI 3.141593
786             #define PIX2 6.283185307179586476925286766559
787            
788             #define SQR(a) ((a)*(a))
789             #define MAX(a,b) ((a) > (b) ? (a) : (b))
790             #define MIN(a,b) ((a) > (b) ? (b) : (a))
791             #define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;
792             #define SIGN(a, b) ((b) < 0 ? -fabs(a) : fabs(a))
793            
794             void _fet (double q, double m, double n, double k);
795             double dhyper (double q, double m, double n, double k);
796             double dbinom_raw (double q, double k, double p, double pq);
797             double stirlerr (double n);
798             double pdhyper (double q, double m, double n, double k);
799             double bd0 (double x, double np);
800            
801             void
802             _fet (double q,
803             double m,
804             double n,
805             double k)
806             {
807            
808             double oldn;
809             double d;
810             double pd;
811             double pval;
812             double oddratio;
813             I32 flag;
814             Inline_Stack_Vars;
815            
816             if ((m-q) == 0 || (k-q) == 0) {
817             oddratio = 999999;
818             } else {
819             oddratio = (q*(n-k+q))/((m-q)*(k-q));
820             }
821            
822             if ((q*(m+n)) > (k*m)) {
823             oldn = n;
824             n = m;
825             m = oldn;
826             q = k-q;
827             flag = 1;
828             } else {
829             flag = 0;
830             }
831            
832             if (flag == 1) {
833             d = dhyper(q, m, n, k);
834             pd = pdhyper(q, m, n, k);
835             pval = d * pd;
836             } else if (flag == 0) {
837             d = dhyper(q-1, m, n, k);
838             pd = pdhyper(q-1, m, n, k);
839             pval = 0.5 - d * pd + 0.5;
840             }
841            
842             Inline_Stack_Reset;
843             Inline_Stack_Push(sv_2mortal(newSVnv(pval)));
844             Inline_Stack_Push(sv_2mortal(newSVnv(oddratio)));
845             Inline_Stack_Done;
846            
847             }
848            
849             double
850             dhyper (double q,
851             double m,
852             double n,
853             double k) {
854            
855             double p, pq, p1, p2, p3;
856            
857             p = ((double)k)/((double)(m+n));
858             pq = ((double)(m+n-k))/((double)(m+n));
859            
860             p1 = dbinom_raw(q, m, p,pq);
861             p2 = dbinom_raw(k-q,n, p,pq);
862             p3 = dbinom_raw(k,m+n, p,pq);
863            
864             return(p1*p2/p3);
865             }
866            
867             double
868             dbinom_raw (double q,
869             double k,
870             double p,
871             double pq) {
872            
873             double lf, lc;
874            
875             if (p == 0) return ((q == 0) ? 1 : 0);
876             if (pq == 0) return ((q == k) ? 1 : 0);
877            
878             if (q == 0) {
879             if (k == 0) return 1;
880             lc = (p < 0.1) ? -bd0(k,k*pq) - k*p : k*log(pq);
881             return(exp(lc));
882             }
883             if (q == k) {
884             lc = (pq < 0.1) ? -bd0(k,k*p) - k*pq : k*log(p);
885             return(exp(lc));
886             }
887             if (q < 0 || q > k) return(0);
888            
889             lc = stirlerr(k) - stirlerr(q) - stirlerr(k-q) - bd0(q,k*p) - bd0(k-q,k*pq);
890            
891             lf = log(PIX2) + log(q) + log(k-q) - log(k);
892            
893             return exp(lc - 0.5*lf);
894             }
895            
896            
897             double
898             stirlerr (double n) {
899            
900             double nn;
901            
902             #define S0 0.083333333333333333333 /* 1/12 */
903             #define S1 0.00277777777777777777778 /* 1/360 */
904             #define S2 0.00079365079365079365079365 /* 1/1260 */
905             #define S3 0.000595238095238095238095238 /* 1/1680 */
906             #define S4 0.0008417508417508417508417508/* 1/1188 */
907            
908             const double sferr_halves[31] = {
909             0.0, /* n=0 - wrong, place holder only */
910             0.1534264097200273452913848, /* 0.5 */
911             0.0810614667953272582196702, /* 1.0 */
912             0.0548141210519176538961390, /* 1.5 */
913             0.0413406959554092940938221, /* 2.0 */
914             0.03316287351993628748511048, /* 2.5 */
915             0.02767792568499833914878929, /* 3.0 */
916             0.02374616365629749597132920, /* 3.5 */
917             0.02079067210376509311152277, /* 4.0 */
918             0.01848845053267318523077934, /* 4.5 */
919             0.01664469118982119216319487, /* 5.0 */
920             0.01513497322191737887351255, /* 5.5 */
921             0.01387612882307074799874573, /* 6.0 */
922             0.01281046524292022692424986, /* 6.5 */
923             0.01189670994589177009505572, /* 7.0 */
924             0.01110455975820691732662991, /* 7.5 */
925             0.010411265261972096497478567, /* 8.0 */
926             0.009799416126158803298389475, /* 8.5 */
927             0.009255462182712732917728637, /* 9.0 */
928             0.008768700134139385462952823, /* 9.5 */
929             0.008330563433362871256469318, /* 10.0 */
930             0.007934114564314020547248100, /* 10.5 */
931             0.007573675487951840794972024, /* 11.0 */
932             0.007244554301320383179543912, /* 11.5 */
933             0.006942840107209529865664152, /* 12.0 */
934             0.006665247032707682442354394, /* 12.5 */
935             0.006408994188004207068439631, /* 13.0 */
936             0.006171712263039457647532867, /* 13.5 */
937             0.005951370112758847735624416, /* 14.0 */
938             0.005746216513010115682023589, /* 14.5 */
939             0.005554733551962801371038690 /* 15.0 */
940             };
941            
942            
943             if (n <= 15.0) {
944             nn = n + n;
945             if (nn == (int)nn) {
946             return (sferr_halves[(int)nn]);
947             } else {
948             fprintf(stderr, "%f not integer\n", nn);
949             exit(1);
950             }
951             } else {
952             nn = n*n;
953             if (n>500) return ((S0-S1/nn)/n);
954             if (n> 80) return ((S0-(S1-S2/nn)/nn)/n);
955             if (n> 35) return ((S0-(S1-(S2-S3/nn)/nn)/nn)/n);
956             /* 15 < n <= 35 : */
957             return ((S0-(S1-(S2-(S3-S4/nn)/nn)/nn)/nn)/n);
958             }
959            
960             }
961            
962            
963             double
964             bd0 (double x,
965             double np) {
966            
967             double ej, s, s1, v;
968             int j;
969            
970             if ((abs(x-np)) < (0.1*(x+np))) {
971             v = (x-np)/(x+np);
972             s = (x-np)*v; /* s using v -- change by MM */
973             ej = 2*x*v;
974             v = v*v;
975             for (j=1; ; j++) { /* Taylor series */
976             ej *= v;
977             s1 = s+ej/((j<<1)+1);
978             if (s1==s) { /* last term was effectively 0 */
979             return (s1);
980             }
981             s = s1;
982             }
983             } else {
984             // | x - np | is not too small */
985             return (x*log(x/np)+np-x);
986             }
987            
988             }
989            
990            
991             double
992             pdhyper (double q,
993             double m,
994             double n,
995             double k) {
996            
997             double sum = 0;
998             double term = 1;
999            
1000             while (q > 0 && term >= DBL_EPSILON * sum) {
1001             term *= q * (n - k + q) / (k + 1 - q) / (m + 1 - q);
1002             sum += term;
1003             q--;
1004             }
1005            
1006             return 1 + sum;
1007            
1008             }
1009            
1010             END_OF_C_CODE
1011            
1012              
1013             __END__