File Coverage

blib/lib/Bio/FdrFet.pm
Criterion Covered Total %
statement 182 297 61.2
branch 43 102 42.1
condition 15 33 45.4
subroutine 22 30 73.3
pod n/a
total 262 462 56.7


line stmt bran cond sub pod time code
1             package Bio::FdrFet;
2              
3 1     1   43750 use 5.008;
  1         3  
  1         38  
4 1     1   6 use strict;
  1         2  
  1         36  
5 1     1   4 use warnings;
  1         7  
  1         42  
6 1     1   6 use Carp;
  1         1  
  1         4343  
7              
8             require Exporter;
9              
10             our @ISA = qw(Exporter);
11              
12             # Items to export into callers namespace by default. Note: do not export
13             # names by default without a very good reason. Use EXPORT_OK instead.
14             # Do not simply export all your public functions/methods/constants.
15              
16             # This allows declaration use Bio::FdrFet ':all';
17             # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
18             # will save memory.
19             our %EXPORT_TAGS = ( 'all' => [ qw(
20            
21             ) ] );
22              
23             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
24              
25             our @EXPORT = qw(
26            
27             );
28              
29             our $VERSION = '0.04';
30              
31             1;
32            
33             =head1 NAME
34              
35             Bio::FdrFet - Perl extension for False Discovery Rate and Fisher Exact Test applied to pathway analysis.
36              
37             =head1 SYNOPSIS
38              
39             use Bio::FdrFet;
40             my $obj = new Bio::FdrFet($fdrcutoff);
41              
42             open (IN, $pathwayfile) ||
43             die "can not open pathway annotation file $pathwayfile: $!\n";
44             while () {
45             chomp;
46             my ($gene, $dbacc, $desc, $rest) = split (/\t/, $_, 4);
47             $obj->add_to_pathway("gene" => $gene,
48             "dbacc" => $dbacc,
49             "desc" => $desc);
50             }
51             close IN;
52              
53             #read in genes and associated p values
54             my (%genes, @fdrs);
55             open (IN, $genefile) || die "can not open gene file $genefile: $!\n";
56             my $ref_size = 0;
57             while () {
58             my ($gene, $pval) = split (/\t/, $_);
59             $obj->add_to_genes("gene" => $gene,
60             "pval" => $pval);
61             $ref_size++;
62             }
63             close IN;
64             $obj->gene_count($ref_size);
65             $obj->calculate;
66             foreach my $pathway ($obj->pathways('sorted')) {
67             my $logpval = $obj->pathway_result($pathway, 'LOGPVAL');
68             printf "Pathway $pathway %s has - log(pval) = %6.4f",
69             $obj->pathway_desc($pathway),
70             $logpval;
71             }
72              
73             =head2 Constructor
74              
75             $obj = new Bio::FdrFet($fdrcutoff);
76              
77             # You can also use $obj = new Bio::FdrFet->new($fdrcutoff);
78              
79             =head2 Object Methods
80              
81             =head3 Input Methods
82              
83             $obj->fdr_cutoff($new_cutoff);
84             $obj->universe($universe_option);
85             $obj->verbose($new_verbose);
86             $obj->add_to_pathway("gene" => ,
87             "dbacc" => ,
88             "desc" => );
89             $obj->add_to_genes("gene" => ,
90             "pval" => );
91              
92             =head3 Output Methods
93              
94             $obj->genes;
95             $obj->pathways[($order)];
96             $obj->pathway_result($pathway, $data_name[, $all_flag]);
97             $obj->pathway_desc($pathway);
98             $obj->pathway_genes($pathway);
99             $obj->fdr_position($fdr);
100              
101             =head3 Other Methods
102              
103             $obj->gene_count[($fet_gene_count)];
104             $obj->calculate;
105              
106             =head1 DESCRIPTION
107              
108             Bio::FdrFet implements the False Discovery Rate Fisher Exact Test of gene
109             expression analysis applied to pathways described in the paper by
110             Ruiru Ji, Karl-Heinz Ott, Roumyana Yordanova, and Robert E
111             Bruccoleri. A copy of the paper is included with the distribution in
112             the file, Fdr-Fet-Manuscript.pdf.
113              
114             The module is implemented using a simple object oriented paradigm
115             where the object stores all the information needed for a calculation
116             along with a state variable, C. The state variable has two
117             possible values, C<'setup'> and C<'calculated'>. The value of
118             C<'setup'> indicates that the object is being setup with data, and any
119             results in the object are inconsistent with the data. The value of
120             C<'calculated'> indicates that the object's computational results are
121             consistent with the data, and may be returned to a calling program.
122              
123             The C<'calculate'> method is used to update all the calculated values
124             from the input data. It checks the state variable first, and only does
125             the calculation if the state is C<'setup'>. Once the calculations are
126             complete, then the state variable is set to C<'calculated'>. Thus, the C method
127             can be called whenever a calculated value is needed, and there is no
128             performance penalty.
129              
130             The module initializes the C object with a state of
131             C<'setup'>. Any data input sets the state to C<'setup'>. Any requests
132             for calculated data, calls C<'calculate'>, which updates the state
133             variable so futures requests for calculated data return quickly.
134              
135             =head1 METHODS
136              
137             The following methods are provided:
138              
139             =over 4
140              
141             =cut
142            
143              
144             =item C
145              
146             Creates a new Bio::FdrFet object. The optional parameter is the False
147             Discovery Rate cutoff in units of percent. See the C
148             method below for more details.
149              
150             =cut
151              
152             sub new {
153 1     1   44 my $pkg;
154 1         3 my $class = shift;
155 1         3 eval {($pkg) = caller(0);};
  1         11  
156 1 50       6 if ($class ne $pkg) {
157 0         0 unshift @_, $class;
158             }
159 1         3 my $self = {};
160 1         3 bless $self;
161 1         2 my $cutoff = shift;
162 1 50       4 $cutoff = 35 if not defined $cutoff;
163 1         9 $self->{STATE} = "setup";
164 1         8 $self->{FDR_CUTOFF} = _check_fdr_cutoff($cutoff);
165 1         4 $self->{PATHWAYS} = {};
166 1         3 $self->{GENES} = {};
167 1         3 $self->{INPUT_GENE_COUNT} = 0;
168 1         4 $self->{GENE_COUNT} = undef;
169 1         2 $self->{VERBOSE} = 1;
170 1         4 $self->{UNIVERSE} = "genes";
171 1         4 return $self;
172             }
173              
174             =item C
175              
176             Retrieves the current setting for the False Discovery Rate threshold,
177             and optionally sets it. This threshold must be an integer greater than
178             0 and less than or equal to 100, and is divided by 100 for the value
179             used by the computation.
180              
181             =cut
182              
183             sub fdr_cutoff {
184              
185 2     2   5 my $self = shift;
186 2         3 my $new_cutoff = shift;
187 2 100       7 if (defined($new_cutoff)) {
188 1         4 $self->{FDR_CUTOFF} = _check_fdr_cutoff($new_cutoff);
189 1         3 $self->{STATE} = 'setup';
190             }
191 2         9 return $self->{FDR_CUTOFF};
192             }
193              
194             =item C
195              
196             Retrieves the current setting for the verbose parameter
197             and optionally sets it. It can be either 0, no verbosity, or 1, lots
198             of messages sent to STDERR.
199              
200             =cut
201              
202             sub verbose {
203              
204 1     1   3 my $self = shift;
205 1         2 my $new_verbose = shift;
206 1 50       5 if (defined($new_verbose)) {
207 1         3 $self->{VERBOSE} = $new_verbose;
208             }
209 1         6 return $self->{VERBOSE};
210             }
211              
212             =item C
213              
214             Retrieves the current setting for the B option
215             and optionally sets it. The B option specifies how the number of
216             genes in our statistical universe is calculated. There are four possible
217             settings to this option:
218              
219             =over 2
220              
221             =item union
222              
223             The universe is the union of all gene names specified
224             individually and in pathways. Genes which have no P value are counted in the universe,
225             but they are not counted as regulated in the FET or FDR calculations.
226              
227             =item genes
228              
229             Only genes specified by the add_to_genes method count.
230              
231             =item intersection
232              
233             Only genes in the intersection of the gene list and pathways are used for the universe
234             calculation
235              
236             =item user
237              
238             The user specifies the universe size by calling the C method with an argument.
239              
240             =back
241              
242             =cut
243              
244             sub universe {
245              
246 1     1   433 my $self = shift;
247 1         3 my $new_universe = shift;
248 1 50       5 if (defined($new_universe)) {
249 1         3 $new_universe = lc($new_universe);
250 1 50       8 if ($new_universe =~ m/^(user|union|genes|intersection)$/) {
251 1         3 $self->{UNIVERSE} = $new_universe;
252             }
253             else {
254 0         0 confess "Bad value ($new_universe) for setting universe option\n";
255             }
256             }
257 1         7 return $self->{UNIVERSE};
258             }
259              
260             =item C<< add_to_pathway( >>
261              
262             "gene" => ,
263             "dbacc" => ,
264             "desc" => )
265              
266             Adds a gene to a pathway and also defines a pathway. The arguments are
267             specified as a pseudo hash, with each argument being preceded by its
268             name.
269              
270             Pathways are defined by an accession key (C parameter), a
271             description (C parameter, and a set of genes (specified by the
272             Cparameter). To use this function to specify a pathway with
273             multiple genes, you call this method multiple times with the same
274             accession key and description, and vary the gene name. The gene names
275             are just arbitrary strings, but they must match the values used for
276             specifying Probability Values (pvalues) used by the C
277             method.
278              
279             =cut
280              
281             sub add_to_pathway {
282            
283 1345     1345   12170 my $self = shift;
284 1345         4197 my %arg = &_check_args(\@_, "gene", "dbacc", "desc");
285 1345 100 66     9576 if (exists($self->{PATHWAYS}->{$arg{dbacc}}) and
286             exists($self->{PATHWAYS}->{$arg{dbacc}}->{DESC})) {
287 1338 50       5112 if ($arg{desc} ne $self->{PATHWAYS}->{$arg{dbacc}}->{DESC}) {
288 0         0 confess(sprintf("Gene %s has a dbacc = %s with descriptor of %s does not match previous entry descriptor = %s\n",
289             $arg{gene},
290             $arg{dbacc},
291             $arg{desc},
292             $self->{PATHWAYS}->{$arg{dbacc}}));
293             }
294             }
295             else {
296 7         32 $self->{PATHWAYS}->{$arg{dbacc}}->{DESC} = $arg{desc};
297             }
298 1345         1323 push (@{$self->{PATHWAYS}->{$arg{dbacc}}->{GENES}}, $arg{gene});
  1345         4688  
299 1345         1507 push(@{$self->{GENES}->{$arg{gene}}->{PATHWAYS}}, $arg{dbacc});
  1345         5644  
300 1345         6743 $self->{STATE} = 'setup';
301             }
302              
303             =item C<< add_to_genes( >>
304              
305             "gene" => ,
306             "pval" => )
307              
308              
309             Adds a probability values for a gene in the calculation. The arguments
310             are specified using a pseudo hash with the nameof parameter preceding
311             its value. The gene names must match those used in the pathways. The
312             probability values are estimates of non-randomness and should range
313             from 0 to 1.
314              
315             =cut
316              
317             sub add_to_genes {
318 11349     11349   13526 my $self = shift;
319 11349         25417 my %arg = &_check_args(\@_, "gene", "pval");
320 11349 50 33     87623 if (exists($self->{GENES}->{$arg{gene}}->{PVAL}) and
321             $self->{GENES}->{$arg{gene}}->{PVAL} != $arg{pval}) {
322 0         0 confess sprintf("Gene %s has a pre-existing pval (%g) different than the current argument = %g\n",
323             $arg{gene},
324             $self->{GENES}->{$arg{gene}}->{PVAL},
325             $arg{pval});
326             }
327 11349         32501 $self->{GENES}->{$arg{gene}}->{PVAL} = $arg{pval};
328 11349         20500 $self->{STATE} = 'setup';
329 11349         37429 $self->{INPUT_GENE_COUNT} += 1;
330             }
331              
332             =item C
333              
334             Returns the list of gene names in the system.
335              
336             =cut
337              
338             sub genes {
339 1     1   3 my $self = shift;
340 1         2 return keys %{$self->{GENES}};
  1         11  
341             }
342              
343             =item C
344              
345             Returns the list of pathways. If the optional argument, C<$order>, is
346             specified and contains the word, C<"sorted">, (comparison is case
347             insensitive), then the object will return the pathways in order of
348             most significant to least. If sorting is done, then the object will
349             update the calculation of probability values, whereas if no sorting is
350             done, then the object does no calculation.
351              
352             =cut
353              
354             sub pathways {
355 1     1   31 my $self = shift;
356 1         2 my $order = shift;
357 1 50       629 if (lc($order) ne 'sorted') {
358 1         2 return keys %{$self->{PATHWAYS}};
  1         16  
359             }
360             else {
361 0         0 $self->calculate;
362 0         0 return sort { $self->{PATHWAYS}->{$b}->{BEST_RESULTS}->{LOGPVAL} <=>
  0         0  
363 0         0 $self->{PATHWAYS}->{$a}->{BEST_RESULTS}->{LOGPVAL} } keys %{$self->{PATHWAYS}};
364             }
365             }
366              
367             =item C
368              
369             Returns a calculated result for a pathway. The following values may be
370             used for C<$data_name>. Case of C<$data_name> does not matter.
371              
372             LOGPVAL -log10(probability value for pathway).
373             PVAL probability value for pathway
374             ODDS Odds ratio.
375             Q Number of genes in the pathway passing the FDR cutoff
376             M Number of genes overall passing the FDR cutoff
377             N Number of genes in the system minus C above.
378             K Number of genes in the pathway.
379             FDR FDR cutoff in percent giving the best pvalue.
380             LOCI Reference to an array of gene names in the pathway
381             that satisfy FDR cutoff.
382              
383             If C<$all_flag> is specified and has the value, "all", then this
384             returns an array of values for all the attempted FDR cutoffs, except
385             for the c cutoff.
386              
387             =cut
388              
389             sub pathway_result {
390 0     0   0 my $self = shift;
391 0         0 my $pathway = $self->_check_pathway_arg(shift(@_));
392 0         0 my $name = uc(shift(@_));
393 0         0 my $all_option = shift(@_);
394 0 0       0 $all_option = "" if not defined $all_option;
395 0         0 $all_option = uc($all_option);
396            
397 0         0 $self->calculate;
398 0 0       0 if (not exists($self->{PATHWAYS}->{$pathway})) {
399 0         0 confess "Pathway ($pathway) not found.\n";
400             }
401 0 0       0 if ($all_option eq 'ALL') {
402 0 0       0 if ($name eq 'FDR') {
403 0         0 confess "All option not valid for FDR result.\n";
404             }
405 0 0       0 if (not exists($self->{PATHWAYS}->{$pathway}->{ALL_RESULTS}->{$name})) {
406 0         0 confess "Pathway all_results ($name) not found.\n";
407             }
408 0         0 return @{$self->{PATHWAYS}->{$pathway}->{ALL_RESULTS}->{$name}};
  0         0  
409             }
410             else {
411 0 0       0 if (not exists($self->{PATHWAYS}->{$pathway}->{BEST_RESULTS}->{$name})) {
412 0         0 confess "Pathway results ($name) not found.\n";
413             }
414 0         0 return $self->{PATHWAYS}->{$pathway}->{BEST_RESULTS}->{$name};
415             }
416             }
417              
418             =item C
419              
420             Returns the description field of the specified pathway.
421              
422             =cut
423              
424             sub pathway_desc {
425 0     0   0 my $self = shift;
426 0         0 my $pathway = $self->_check_pathway_arg(shift(@_));
427 0         0 return $self->{PATHWAYS}->{$pathway}->{DESC};
428             }
429              
430             =item C
431              
432             Returns an array containing the genes of the specified pathway.
433              
434             =cut
435              
436             sub pathway_genes {
437 0     0   0 my $self = shift;
438 0         0 my $pathway = $self->_check_pathway_arg(shift(@_));
439 0         0 return @{$self->{PATHWAYS}->{$pathway}->{GENES}};
  0         0  
440             }
441              
442              
443             =item C
444              
445             Returns the position in the gene list for a specific FDR value. The
446             C<$fdr> variable must be an integer between 1 and the FDR cutoff.
447              
448             =cut
449              
450             sub fdr_position {
451 0     0   0 my $self = shift;
452 0         0 my $fdr = shift;
453 0 0 0     0 if ($fdr < 1 or
      0        
454             $fdr > $self->{FDR_CUTOFF} or
455             int($fdr) != $fdr) {
456 0         0 confess "Invalid FDR value = $fdr passed to fdr_position.\n";
457             }
458 0         0 return $self->{FDRS}->[$fdr-1];
459             }
460              
461              
462             =item C
463              
464             Returns the count of genes in the system which the size of the union
465             of the gene names seen from both the C and
466             C methods. This value is used in the Fisher Exact Test
467             calculation. You can change the total gene count value used in the
468             calculation by specifying a parameter to this method.
469              
470             =cut
471              
472             sub gene_count {
473 0     0   0 my $self = shift;
474 0 0       0 if (defined($_[0])) {
475 0 0       0 if ($self->{UNIVERSE} eq 'user') {
476 0         0 $self->{GENE_COUNT} = $_[0];
477 0         0 $self->{STATE} = 'setup';
478             }
479             else {
480 0         0 confess "Gene count can be updated only if the universe option is set to 'user'";
481             }
482             }
483 0         0 return $self->{GENE_COUNT};
484             }
485              
486             =item C
487              
488             Run the FDR FET calculation.
489              
490             =cut
491              
492             sub calculate {
493 1     1   3 my $self = shift;
494 1 50       7 return if $self->{STATE} eq 'calculated';
495 1 50       6 print STDERR "New calculation initiated.\n" if $self->{VERBOSE};
496 1         5 $self->_sort_by_pvals;
497 1         552 $self->_clean_genes;
498 1         6 $self->_calc_fdrs;
499 1         11 $self->_calculate_fets;
500 0         0 $self->{STATE} = 'calculated';
501             }
502            
503             # Internal procedures.
504              
505             sub _check_fdr_cutoff {
506 2     2   3 my $cutoff = shift;
507 2 50 33     23 if ($cutoff > 0 and
      33        
508             $cutoff <= 100 and
509             $cutoff == int($cutoff)) {
510 2         7 return $cutoff;
511             }
512             else {
513 0         0 confess "New fdr_cutoff ($cutoff) is outside the range (0, 100] or is not an integer.\n";
514 0         0 return undef; # We shouldn't get here, but if so, return something that will cause problems.
515             }
516             }
517              
518             sub _check_args {
519             # Check validity of argument list.
520             # First argument is reference to argument list.
521             # Remaining arguments are required arguments.
522 12694     12694   18309 my $arg_ref = shift;
523 12694 50       13601 if (scalar(@{$arg_ref}) % 2 == 1) {
  12694         36680  
524 0         0 confess "Argument to caller of _check_args has an odd number of elements and is not interpretable as a hash.\n";
525             }
526 12694         22741 my %args = @{$arg_ref};
  12694         43647  
527 12694         18079 my @missing = ();
528 12694         20217 foreach my $arg (@_) {
529 26733 50       67555 if (not exists($args{$arg})) {
530 0         0 push (@missing, $arg);
531             }
532             }
533 12694 50       34274 if (scalar(@missing)) {
534 0         0 confess sprintf("Argument(s) %s missing to caller of _check_args.\n",
535             join(", ", @missing));
536             }
537 12694         63719 return %args;
538             }
539              
540             sub _check_pathway_arg {
541 0     0   0 my $self = shift;
542 0         0 my $pathway = shift;
543 0 0       0 if (not exists($self->{PATHWAYS}->{$pathway})) {
544 0         0 confess "Pathway ($pathway) not found.\n";
545             }
546 0         0 return $pathway;
547             }
548              
549             sub _sort_by_pvals {
550 2     2   6 my $self = shift;
551 146339 100 100     872193 $self->{SORTED_GENES} = [ sort
    100          
    100          
552 2         11639 { ( defined ($self->{GENES}->{$a}->{PVAL}) and
553             defined ($self->{GENES}->{$b}->{PVAL})) ?
554             $self->{GENES}->{$a}->{PVAL} <=> $self->{GENES}->{$b}->{PVAL} :
555             ( defined ($self->{GENES}->{$a}->{PVAL}) ? -1 :
556             defined ($self->{GENES}->{$b}->{PVAL}) ? 1 :
557             $a cmp $b ) }
558 2         4 keys %{$self->{GENES}} ];
559 2         2975 $self->{SORTED_PVALS} = [ map {$self->{GENES}->{$_}->{PVAL}} @{$self->{SORTED_GENES}} ];
  12153         36337  
  2         47  
560             }
561              
562             sub _calc_fdrs {
563 1     1   3 my $self = shift;
564 1         2 $self->{FDRS} = [];
565             #calculate FDRs based p values
566 1         6 for (my $i = 1; $i <= $self->{FDR_CUTOFF}; $i++) {
567 35         49 my $cutoff = $i / 100;
568 35         110 my $result = $self->_fdr($cutoff, $self->{SORTED_PVALS});
569 35         115 $self->{FDRS}->[$i-1] = $result;
570 35 50       152 printf STDERR "FDR count at %d%% is %d\n", $i, $result if $self->{VERBOSE};
571             }
572             }
573              
574             sub _fdr {
575 35     35   59 my $self = shift;
576 35         51 my ($qlevel, $pvals) = @_;
577 35         42 my $i=1;
578 35         37 my $count=0;
579 35 100 66     104 map { $count = ($i - 1) if (defined $_ and
  25165         116110  
580             $_ <= $qlevel * $i++ / $self->{GENE_COUNT}) } @$pvals; # / ) }; Fix Emacs cperl confusion.
581 35         88 return $count;
582             }
583              
584             sub _clean_genes {
585 1     1   5 my $self = shift;
586 1 50       11 if (not defined $self->{GENE_COUNT}) {
587 1         2 $self->{GENE_COUNT} = scalar(keys %{$self->{GENES}});
  1         8  
588             }
589 1 50       23 if ($self->{UNIVERSE} eq 'genes') {
    50          
    50          
    50          
590 0         0 $self->_clean_unused_genes;
591 0         0 $self->{GENE_COUNT} = $self->{INPUT_GENE_COUNT};
592             }
593             elsif ($self->{UNIVERSE} eq 'user') {
594 0         0 my $sum = 0;
595 0         0 foreach my $p ($self->pathways) {
596 0         0 $sum += scalar(@{$self->{PATHWAYS}->{$p}->{GENES}});
  0         0  
597             }
598 0 0 0     0 if (not ($self->{GENE_COUNT} >= scalar(keys %{$self->{GENES}}) and
  0         0  
599             $self->{GENE_COUNT} >= $sum)) {
600 0         0 confess sprintf("Gene count setting %d is too small. Gene count is %d and pathway gene count is %d\n",
601             $self->{GENE_COUNT},
602 0         0 scalar(keys %{$self->{GENES}}),
603             $sum);
604             }
605             }
606             elsif ($self->{UNIVERSE} eq 'union') {
607             # No action required.
608             }
609             elsif ($self->{UNIVERSE} eq 'intersection') {
610 1         1 foreach my $gene (keys %{$self->{GENES}}) {
  1         4093  
611 11434         24148 my $exists_pathway = exists($self->{GENES}->{$gene}->{PATHWAYS});
612 11434   66     41913 my $exists_pval =
613             (exists($self->{GENES}->{$gene}->{PVAL}) and
614             defined($self->{GENES}->{$gene}->{PVAL}));
615 11434 100 100     22117 if (not ($exists_pathway and $exists_pval)) {
616 10715 50       17717 printf STDERR "Gene $gene eliminated. exists_pathway = $exists_pathway " .
617             "exists_pval = $exists_pval\n" if $self->{VERBOSE};
618 10715         22190 delete $self->{GENES}->{$gene};
619             }
620             }
621 1         1975 $self->{GENE_COUNT} = scalar(keys %{$self->{GENES}});
  1         7  
622 1         10 $self->_sort_by_pvals;
623 1         1006 $self->_clean_unused_genes;
624             }
625 1 50       9 printf STDERR "Gene count set to %d\n", $self->{GENE_COUNT} if $self->{VERBOSE};
626             }
627              
628             sub _clean_unused_genes {
629 1     1   14 my $self = shift;
630 1         60 my %seen;
631 1         3 foreach my $gene (@{$self->{SORTED_GENES}}) {
  1         7  
632 719 50       1539 if (defined ($self->{GENES}->{$gene}->{PVAL})) {
633 719         1063 $seen{$gene} = 1;
634             }
635             }
636 1         5 foreach my $pathway (keys %{$self->{PATHWAYS}}) {
  1         9  
637 7         13 my @new_gene_list = ();
638 7         9 my $old_size = scalar(@{$self->{PATHWAYS}->{$pathway}->{GENES}});
  7         26  
639 7         8 foreach my $pgene (@{$self->{PATHWAYS}->{$pathway}->{GENES}}) {
  7         36  
640 1345 100       2509 if (exists($seen{$pgene})) {
641 1205         1822 push (@new_gene_list, $pgene);
642             }
643             }
644 7         348 $self->{PATHWAYS}->{$pathway}->{GENES} = [ @new_gene_list ];
645 7         114 my $new_size = scalar(@new_gene_list);
646 7 50       203 if ($self->{VERBOSE}) {
647 0 0       0 if ($old_size != $new_size) {
648 0         0 printf STDERR "Pathway %s size reduced from %d to %d\n", $pathway, $old_size, $new_size;
649             }
650             else {
651 0         0 printf STDERR "Pathway %s size left at %d\n", $pathway, $old_size;
652             }
653             }
654             }
655             }
656              
657             sub _calculate_fets {
658 1     1   4 my $self = shift;
659 1         2 foreach my $p (keys %{$self->{PATHWAYS}}) {
  1         11  
660 1         3 my @pids = @{$self->{PATHWAYS}->{$p}->{GENES}};
  1         25  
661 1         4 my ($fdr, $fdr_gene_cutoff, @s);
662 0         0 my ($pval, $logpval, $q, $m, $n, $k, $bestpval, $oddratio, $bestratio);
663 0         0 my ($bestm, $bestn, $bestq, $bestfdr, $bestloci, $bestlogpval);
664 0         0 my (@q);
665 1         3 $k = scalar(@pids);
666 1         4 $bestpval = 1.1;
667 1         2 $bestlogpval = 0;
668 1         2 $bestratio = 0;
669 1         3 $bestm = 0;
670 1         3 $bestn = 0;
671 1         2 $bestq = 0;
672 1         3 $bestfdr = 100;
673 1         5 my $path_ref = $self->{PATHWAYS}->{$p};
674 1         8 for (my $fdr = $self->{FDR_CUTOFF} - 1; $fdr >= 0; $fdr--) {
675 1         4 $fdr_gene_cutoff = $self->{FDRS}->[$fdr];
676 1 50       18 last if ($fdr_gene_cutoff == 0);
677 1         6 $fdr_gene_cutoff--;
678 1         43 @s = @{$self->{SORTED_GENES}}[0..$fdr_gene_cutoff];
  1         175  
679            
680             #find intersection of pathway and regulated genes
681 1         17 my %seen = ();
682 1         3 undef @q;
683 1         4 foreach my $g (@pids) {
684 33         65 $seen{$g} = 1;
685             }
686 1         3 foreach my $g (@s) {
687 360 100       658 push @q , $g if (defined $seen{$g});
688             }
689            
690             #calculate values in 2 by 2 contigency table
691 1         4 $m = $fdr_gene_cutoff + 1;
692 1         49 $n = $self->{GENE_COUNT} - $m;
693 1         5 $q = scalar(@q);
694            
695             #FET calculation
696 1         3550 ($pval, $oddratio) = _fet($q, $m, $n, $k);
697 0 0         if ($self->{VERBOSE}) {
698 0           printf STDERR "FET calculation for $p FDR = %d: P = %.3g Odds = %.2f q = $q m = $m n = $n k = $k\n",
699             $fdr+1, $pval, $oddratio;
700             }
701 0 0         if ($pval == 0) {
702 0           $logpval = 1000;
703             }
704             else {
705 0           $logpval = 0 - log10($pval); # The unary operator "-" does not work correctly
706             # if log10($pval) is 0. You get -0 for $logpval.
707             }
708 0           $path_ref->{ALL_RESULTS}->{PVAL}->[$fdr] = $pval;
709 0           $path_ref->{ALL_RESULTS}->{LOGPVAL}->[$fdr] = $logpval;
710 0           $path_ref->{ALL_RESULTS}->{ODDS}->[$fdr] = $oddratio;
711 0           $path_ref->{ALL_RESULTS}->{Q}->[$fdr] = $q;
712 0           $path_ref->{ALL_RESULTS}->{M}->[$fdr] = $m;
713 0           $path_ref->{ALL_RESULTS}->{N}->[$fdr] = $n;
714 0           $path_ref->{ALL_RESULTS}->{K}->[$fdr] = $k;
715 0           $path_ref->{ALL_RESULTS}->{LOCI}->[$fdr] = [ @q ];
716 0 0         if ($pval < $bestpval) {
717 0           $bestpval = $pval;
718 0           $bestlogpval = $logpval;
719 0           $bestratio = $oddratio;
720 0           $bestm = $m;
721 0           $bestn = $n;
722 0           $bestq = $q;
723 0           $bestfdr = $fdr + 1;
724 0           $bestloci = [ @q ];
725             }
726             }
727 0           &_complete_all_results_array($path_ref, 'PVAL', 1.0);
728 0           &_complete_all_results_array($path_ref, 'LOGPVAL', 0.0);
729 0           &_complete_all_results_array($path_ref, 'ODDS', 0.0);
730 0           &_complete_all_results_array($path_ref, 'Q', -1);
731 0           &_complete_all_results_array($path_ref, 'M', -1);
732 0           &_complete_all_results_array($path_ref, 'N', -1);
733 0           &_complete_all_results_array($path_ref, 'K', -1);
734 0           &_complete_all_results_array($path_ref, 'LOCI', [ ]);
735 0 0         if ($self->{VERBOSE}) {
736 0           print STDERR "FET calculation for $p Bestfdr = $bestfdr\n",
737             }
738 0           $path_ref->{BEST_RESULTS} = { PVAL => $bestpval,
739             LOGPVAL => $bestlogpval,
740             ODDS => $bestratio,
741             Q => $bestq,
742             M => $bestm,
743             N => $bestn,
744             K => $k,
745             FDR => $bestfdr,
746             LOCI => $bestloci };
747             }
748             }
749              
750             sub log10 {
751 0     0     my $number = shift;
752 0           return log($number)/log(10);
753             }
754              
755             sub _complete_all_results_array {
756 0     0     my $path_ref = shift;
757 0           my $datum = shift;
758 0           my $default = shift;
759 0           my $array_ref = $path_ref->{ALL_RESULTS}->{$datum};
760            
761 0           for (my $i = 0; $i < scalar(@{$array_ref}); $i++) {
  0            
762 0 0         $array_ref->[$i] = $default if not defined $array_ref->[$i];
763             }
764             }
765              
766 1     1   1361 use Inline C => <<'END_OF_C_CODE', NAME => 'Bio::FdrFet::FastFet';
  1         24457  
  1         8  
767            
768             #include
769             #include
770             #include
771             #include
772             #include
773            
774             #define MAXIT 100
775             #define MAX_ITER 2000
776             #define TOL 1.0e-7
777             #define EPS 1.0e-12
778             #define EPS1 1.0e-13
779             #define DBL_EPSILON 2.220446049250313e-16
780             #define EPSKS1 0.001
781             #define EPSKS2 1.0e-8
782             #define FPMIN 1.0e-30
783             #define TINY 1.0e-20
784             #define MD fmod(1,0)
785             #define PI 3.141593
786             #define PIX2 6.283185307179586476925286766559
787            
788             #define SQR(a) ((a)*(a))
789             #define MAX(a,b) ((a) > (b) ? (a) : (b))
790             #define MIN(a,b) ((a) > (b) ? (b) : (a))
791             #define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;
792             #define SIGN(a, b) ((b) < 0 ? -fabs(a) : fabs(a))
793            
794             void _fet (double q, double m, double n, double k);
795             double dhyper (double q, double m, double n, double k);
796             double dbinom_raw (double q, double k, double p, double pq);
797             double stirlerr (double n);
798             double pdhyper (double q, double m, double n, double k);
799             double bd0 (double x, double np);
800            
801             void
802             _fet (double q,
803             double m,
804             double n,
805             double k)
806             {
807            
808             double oldn;
809             double d;
810             double pd;
811             double pval;
812             double oddratio;
813             I32 flag;
814             Inline_Stack_Vars;
815            
816             if ((m-q) == 0 || (k-q) == 0) {
817             oddratio = 999999;
818             } else {
819             oddratio = (q*(n-k+q))/((m-q)*(k-q));
820             }
821            
822             if ((q*(m+n)) > (k*m)) {
823             oldn = n;
824             n = m;
825             m = oldn;
826             q = k-q;
827             flag = 1;
828             } else {
829             flag = 0;
830             }
831            
832             if (flag == 1) {
833             d = dhyper(q, m, n, k);
834             pd = pdhyper(q, m, n, k);
835             pval = d * pd;
836             } else if (flag == 0) {
837             d = dhyper(q-1, m, n, k);
838             pd = pdhyper(q-1, m, n, k);
839             pval = 0.5 - d * pd + 0.5;
840             }
841            
842             Inline_Stack_Reset;
843             Inline_Stack_Push(sv_2mortal(newSVnv(pval)));
844             Inline_Stack_Push(sv_2mortal(newSVnv(oddratio)));
845             Inline_Stack_Done;
846            
847             }
848            
849             double
850             dhyper (double q,
851             double m,
852             double n,
853             double k) {
854            
855             double p, pq, p1, p2, p3;
856            
857             p = ((double)k)/((double)(m+n));
858             pq = ((double)(m+n-k))/((double)(m+n));
859            
860             p1 = dbinom_raw(q, m, p,pq);
861             p2 = dbinom_raw(k-q,n, p,pq);
862             p3 = dbinom_raw(k,m+n, p,pq);
863            
864             return(p1*p2/p3);
865             }
866            
867             double
868             dbinom_raw (double q,
869             double k,
870             double p,
871             double pq) {
872            
873             double lf, lc;
874            
875             if (p == 0) return ((q == 0) ? 1 : 0);
876             if (pq == 0) return ((q == k) ? 1 : 0);
877            
878             if (q == 0) {
879             if (k == 0) return 1;
880             lc = (p < 0.1) ? -bd0(k,k*pq) - k*p : k*log(pq);
881             return(exp(lc));
882             }
883             if (q == k) {
884             lc = (pq < 0.1) ? -bd0(k,k*p) - k*pq : k*log(p);
885             return(exp(lc));
886             }
887             if (q < 0 || q > k) return(0);
888            
889             lc = stirlerr(k) - stirlerr(q) - stirlerr(k-q) - bd0(q,k*p) - bd0(k-q,k*pq);
890            
891             lf = log(PIX2) + log(q) + log(k-q) - log(k);
892            
893             return exp(lc - 0.5*lf);
894             }
895            
896            
897             double
898             stirlerr (double n) {
899            
900             double nn;
901            
902             #define S0 0.083333333333333333333 /* 1/12 */
903             #define S1 0.00277777777777777777778 /* 1/360 */
904             #define S2 0.00079365079365079365079365 /* 1/1260 */
905             #define S3 0.000595238095238095238095238 /* 1/1680 */
906             #define S4 0.0008417508417508417508417508/* 1/1188 */
907            
908             const double sferr_halves[31] = {
909             0.0, /* n=0 - wrong, place holder only */
910             0.1534264097200273452913848, /* 0.5 */
911             0.0810614667953272582196702, /* 1.0 */
912             0.0548141210519176538961390, /* 1.5 */
913             0.0413406959554092940938221, /* 2.0 */
914             0.03316287351993628748511048, /* 2.5 */
915             0.02767792568499833914878929, /* 3.0 */
916             0.02374616365629749597132920, /* 3.5 */
917             0.02079067210376509311152277, /* 4.0 */
918             0.01848845053267318523077934, /* 4.5 */
919             0.01664469118982119216319487, /* 5.0 */
920             0.01513497322191737887351255, /* 5.5 */
921             0.01387612882307074799874573, /* 6.0 */
922             0.01281046524292022692424986, /* 6.5 */
923             0.01189670994589177009505572, /* 7.0 */
924             0.01110455975820691732662991, /* 7.5 */
925             0.010411265261972096497478567, /* 8.0 */
926             0.009799416126158803298389475, /* 8.5 */
927             0.009255462182712732917728637, /* 9.0 */
928             0.008768700134139385462952823, /* 9.5 */
929             0.008330563433362871256469318, /* 10.0 */
930             0.007934114564314020547248100, /* 10.5 */
931             0.007573675487951840794972024, /* 11.0 */
932             0.007244554301320383179543912, /* 11.5 */
933             0.006942840107209529865664152, /* 12.0 */
934             0.006665247032707682442354394, /* 12.5 */
935             0.006408994188004207068439631, /* 13.0 */
936             0.006171712263039457647532867, /* 13.5 */
937             0.005951370112758847735624416, /* 14.0 */
938             0.005746216513010115682023589, /* 14.5 */
939             0.005554733551962801371038690 /* 15.0 */
940             };
941            
942            
943             if (n <= 15.0) {
944             nn = n + n;
945             if (nn == (int)nn) {
946             return (sferr_halves[(int)nn]);
947             } else {
948             fprintf(stderr, "%f not integer\n", nn);
949             exit(1);
950             }
951             } else {
952             nn = n*n;
953             if (n>500) return ((S0-S1/nn)/n);
954             if (n> 80) return ((S0-(S1-S2/nn)/nn)/n);
955             if (n> 35) return ((S0-(S1-(S2-S3/nn)/nn)/nn)/n);
956             /* 15 < n <= 35 : */
957             return ((S0-(S1-(S2-(S3-S4/nn)/nn)/nn)/nn)/n);
958             }
959            
960             }
961            
962            
963             double
964             bd0 (double x,
965             double np) {
966            
967             double ej, s, s1, v;
968             int j;
969            
970             if ((abs(x-np)) < (0.1*(x+np))) {
971             v = (x-np)/(x+np);
972             s = (x-np)*v; /* s using v -- change by MM */
973             ej = 2*x*v;
974             v = v*v;
975             for (j=1; ; j++) { /* Taylor series */
976             ej *= v;
977             s1 = s+ej/((j<<1)+1);
978             if (s1==s) { /* last term was effectively 0 */
979             return (s1);
980             }
981             s = s1;
982             }
983             } else {
984             // | x - np | is not too small */
985             return (x*log(x/np)+np-x);
986             }
987            
988             }
989            
990            
991             double
992             pdhyper (double q,
993             double m,
994             double n,
995             double k) {
996            
997             double sum = 0;
998             double term = 1;
999            
1000             while (q > 0 && term >= DBL_EPSILON * sum) {
1001             term *= q * (n - k + q) / (k + 1 - q) / (m + 1 - q);
1002             sum += term;
1003             q--;
1004             }
1005            
1006             return 1 + sum;
1007            
1008             }
1009            
1010             END_OF_C_CODE
1011            
1012              
1013             __END__