File Coverage

blib/lib/GO/TermFinder.pm

Criterion	Covered	Total	%
statement	306	360	85.0
branch	80	120	66.6
condition	22	38	57.8
subroutine	45	47	95.7
pod	6	6	100.0
total	459	571	80.3

line	stmt	bran	cond	sub	pod	time	code
1							package GO::TermFinder;
2
3							# File : TermFinder.pm
4							# Author : Gavin Sherlock
5							# Date Begun : December 31st 2002
6
7							# $Id: TermFinder.pm,v 1.52 2009/11/19 17:27:52 sherlock Exp $
8
9							# License information (the MIT license)
10
11							# Copyright (c) 2003-2006 Gavin Sherlock; Stanford University
12
13							# Permission is hereby granted, free of charge, to any person
14							# obtaining a copy of this software and associated documentation files
15							# (the "Software"), to deal in the Software without restriction,
16							# including without limitation the rights to use, copy, modify, merge,
17							# publish, distribute, sublicense, and/or sell copies of the Software,
18							# and to permit persons to whom the Software is furnished to do so,
19							# subject to the following conditions:
20
21							# The above copyright notice and this permission notice shall be
22							# included in all copies or substantial portions of the Software.
23
24							# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25							# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26							# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27							# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28							# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29							# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30							# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31							# SOFTWARE.
32
33							=pod
34
35							=head1 NAME
36
37							GO::TermFinder - identify GO nodes that annotate a group of genes with a significant p-value
38
39							=head1 DESCRIPTION
40
41							This package is intended to provide a method whereby the P-values of a
42							set of GO annotations can be determined for a set of genes, based on
43							the number of genes that exist in the particular genome (or in a
44							selected background distribution from the genome), and their
45							annotation, and the frequency with which the GO nodes are annotated
46							across the provided set of genes. The P-value is simply calculated
47							using the hypergeometric distribution as the probability of x or more
48							out of n genes having a given annotation, given that G of N have that
49							annotation in the genome in general. We chose the hypergeometric
50							distribution (sampling without replacement) since it is more accurate,
51							though slower to calculate, than the binomial distribution (sampling
52							with replacement).
53
54							In addition, a corrected p-value can be calculated, to correct for
55							multiple hypothesis testing. The correction factor used is the total
56							number of nodes to which the provided list of genes are annotated,
57							excepting any nodes which have only a single annotation in the
58							background, as a priori, we know that these cannot be significantly
59							enriched. The client has access to both the corrected and uncorrected
60							values. It is also possible to correct the p-value using 1000
61							simulations, which control the Family Wise Error Rate - using this
62							option suggests that the Bonferroni correction is in fact somewhat
63							liberal, rather than conservative, as might be expected. Finally, the
64							False Discovery Rate can also be calculated.
65
66							The general idea is that a list of genes may have been identified for
67							some reason, e.g. they are co-regulated, and TermFinder can be used to
68							find out if any nodes annotate the set of genes to a level which is
69							extremely improbable if the genes had simply been picked at random.
70
71							=head1 TODO
72
73							1. May want the client to decide the behavior for ambiguous names,
74							rather than having it hard coded (e.g. always ignore; use if
75							standard name (current implementation); use all databaseIds for
76							the ambiguous name; decide on a case by case basis (potentially
77							useful if running on command line)).
78
79							2. Create new GO::Hypothesis and GO::HypothesisSet objects, so that
80							it is easier to access the information generated about the p-value
81							etc. of any particular GO node that annotates a set of genes.
82
83							3. Instead of all the global variables, $k..., replace them with
84							constants, which may improve runtime, as the optimizer should
85							optimize the hash look ups to look like hard-coded strings at
86							runtime, rather than variable lookups.
87
88							4. Lots of other stuff....
89
90							=cut
91
92	1			1		210558	use strict;
	1					3
	1					44
93	1			1		6	use warnings;
	1					3
	1					36
94	1			1		5	use diagnostics;
	1					2
	1					8
95
96	1			1		37	use vars qw ($PACKAGE $VERSION $WARNINGS);
	1					2
	1					64
97
98	1			1		620	use GO::Node;
	1					2
	1					30
99	1			1		906	use GO::TermFinder::Native;
	1					3
	1					5965
100
101							$VERSION = '0.86';
102							$PACKAGE = 'GO::TermFinder';
103
104							$WARNINGS = 1; # toggle this to zero if you don't want warnings
105
106							# class variables
107
108							my @kRequiredArgs = qw (annotationProvider ontologyProvider aspect);
109
110							my $kArgs = $PACKAGE.'::__args';
111							my $kPopulationNamesHash = $PACKAGE.'::__populationNamesHash';
112							my $kBackgroundDatabaseIds = $PACKAGE.'::__backgroundDatabaseIds';
113							my $kTotalGoNodeCounts = $PACKAGE.'::__totalGoNodeCounts';
114							my $kGoCounts = $PACKAGE.'::__goCounts';
115							my $kGOIDsForDatabaseIds = $PACKAGE.'::__goidsForDatabaseIds';
116							my $kDatabaseIds = $PACKAGE.'::__databaseIds';
117							my $kTotalNumAnnotatedGenes = $PACKAGE.'::__totalNumAnnotatedGenes';
118							my $kCorrectionMethod = $PACKAGE.'::__correctionMethod';
119							my $kShouldCalculateFDR = $PACKAGE.'::__shouldCalculateFDR';
120							my $kPvalues = $PACKAGE.'::__pValues';
121							my $kDatabaseId2OrigName = $PACKAGE.'::__databaseId2OrigName';
122							my $kDistributions = $PACKAGE.'::__distributions';
123							my $kDiscardedGenes = $PACKAGE.'::__discardedGenes';
124							my $kDirectAnnotationToAspect = $PACKAGE.'::__directAnnotationToAspect';
125
126							# the methods by which the p-value can be corrected
127
128							my %kAllowedCorrectionMethods = ('bonferroni' => undef,
129							'none' => undef,
130							'simulation' => undef);
131
132							# set up a GO node that corresponds to anything passed in that has no
133							# annotation
134
135							my $kUnannotatedNode = GO::Node->new(goid => "GO:XXXXXXX",
136							term => "unannotated");
137
138							my $kFakeIdPrefix = "NO_DETERMINED_DATABASE_ID_";
139
140							#####################################################################
141							sub new{
142							#####################################################################
143							=pod
144
145							=head1 Instance Constructor
146
147							=head2 new
148
149							This is the constructor. It expects to be passed named arguments for
150							an annotationProvider, and an ontologyProvider. In addition, it must
151							be told the aspect of the ontology provider, so that it knows how to
152							query the annotationProvider.
153
154							There are also some additional, optional arguments:
155
156							population:
157
158							This argument allows a client to indicate the population that should
159							used to calculate a background distribution of GO terms. In the
160							absence of population argument, then the background distribution will
161							be drawn from all genes in the annotationProvider. This should be
162							provided as an array reference, and no ambiguous names should be
163							provided (see AnnotationProvider for details of name ambiguity). This
164							option is particularly pertinent in a case where for example you
165							assayed only 2000 genes in a two hybrid experiment, and found 20
166							interesting ones. To find significant terms, you need to do it in the
167							context of the genes that you assayed, not in the context of all genes
168							with annotation.
169
170							Note, new in version 0.71, if you provided a population as the
171							background distribution from which genes have been drawn, any genes
172							provided to the findTerms method that are not in the background
173							distribution will be discarded from the calculations. The identity of
174							these genes can be retrieved using the discardedGenes() method, after
175							the findTerms() method has been called.
176
177							totalNumGenes:
178
179							This argument allows a client to indicate that the size of the
180							background distribution is in fact larger that the number of genes
181							that exist in the annotation provider, and the extra genes are merely
182							assumed to be entirely unannotated.
183
184							NB: This is an API change, as totalNumGenes was previously required.
185
186							Thus - if using 'population', the total number of genes considered as
187							the background will be the number of genes in the provided population.
188							If not using 'population', then the number of genes that will be
189							considered as the total population will be the number of genes in the
190							annotationProvider. However, if the totalNumGenes argument is
191							provided, then that number will be used as the size of the population.
192							If it is not larger than the total number of genes in the
193							annotationParser, then the number of genes in the annotationParser
194							will be used. The totalNumGenes and the population arguments are
195							mutually exclusive, and both should not be provided at the same time.
196
197							Usage ($num is larger than the number of genes with annotations):
198
199							my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider,
200							ontologyProvider => $ontologyProvider,
201							totalNumGenes => $num,
202							aspect => );
203
204
205							Usage (use all annotated genes as population):
206
207							my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider,
208							ontologyProvider => $ontologyProvider,
209							aspect => );
210
211							Usage (use a subset of genes as the background population):
212
213							my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider,
214							ontologyProvider => $ontologyProvider,
215							population => \@genes,
216							aspect => );
217
218							=cut
219
220	4			4	1	120	my ($class, %args) = @_;
221
222	4					15	my $self = {};
223
224	4					21	bless $self, $class;
225
226	4					36	$self->__checkAndStoreArgs(%args);
227
228	4					26	$self->__init; # initialize counts for all GO nodes
229
230	4					48	return $self;
231
232							}
233
234							#####################################################################
235							sub __checkAndStoreArgs{
236							#####################################################################
237							# This private method simply checks that all the required arguments
238							# have been provided, and stores them within the object
239
240	4			4		19	my ($self, %args) = @_;
241
242							# first check that the required arguments were provided
243
244	4					16	foreach my $arg (@kRequiredArgs){
245
246	12	50				64	if (!exists ($args{$arg})){
		50
247
248	0					0	die "You did not provide a $arg argument.";
249
250							}elsif (!defined ($args{$arg})){
251
252	0					0	die "Your $arg argument is not defined";
253
254							}
255
256	12					59	$self->{$kArgs}{$arg} = $args{$arg}; # store in object
257
258							}
259
260							# store the population, and also create a hash of the population
261							# names for quick look up
262
263	4	100				21	if (exists($args{'population'})){
264
265	2					9	$self->{$kArgs}{'population'} = $args{'population'};
266
267	2					5	my %population;
268
269	2					4	foreach my $name (@{$args{'population'}}){
	2					6
270
271	6489					10134	$population{$name} = undef;
272
273							}
274
275	2					13	$self->{$kPopulationNamesHash} = \%population;
276
277							}
278
279	4	100				24	if (exists($args{'totalNumGenes'})){
280
281	1					4	$self->{$kArgs}{'totalNumGenes'} = $args{'totalNumGenes'};
282
283							}
284
285							# now check that we didn't get a funky combination
286
287	4	50	66			52	if (exists($args{'population'}) && exists($args{'totalNumGenes'})){
288
289	0					0	die "The population and totalNumGenes arguments are mutually exclusive, but you have provided both.";
290
291							}
292
293							}
294
295							#####################################################################
296							sub __init{
297							#####################################################################
298							# This private method determines all counts to all GO nodes, as the
299							# background frequency of annotations in the genome
300
301	4			4		12	my ($self) = @_;
302
303							# first we determine the databaseIds for the background
304							# distribution
305
306	4					12	my @databaseIds;
307
308	4	100				24	if ($self->__isUsingPopulation){
309
310							# we need to get databaseids for the provided population
311
312	2					10	my ($databaseIdsRef, $databaseId2OrigNameRef) = $self->__determineDatabaseIdsFromGenes($self->__population);
313
314	2					10	@databaseIds = @{$databaseIdsRef};
	2					24892
315
316							}else{
317
318							# we simply use all databaseIds from the annotationProvider
319
320	2					12	@databaseIds = $self->__annotationProvider->allDatabaseIds();
321
322							}
323
324	4					6428	my $populationSize = scalar(@databaseIds);
325
326							# check that they said there's at least as many genes in total
327							# as the annotation provider says that there is.
328
329	4	100				43	if (! defined $self->totalNumGenes){
		50
330
331							# in this case, no 'totalNumGenes' argument was provided
332
333	3					12	$self->{$kArgs}{totalNumGenes} = $populationSize;
334
335							}elsif ($populationSize > $self->totalNumGenes){
336
337							# in this case, they are using an annotation provider, and
338							# have provided a totalNumGenes that is less than the number
339							# of genes that the annotation provider knows about
340
341	0	0				0	if ($WARNINGS){
342
343	0					0	print STDERR "The annotation provider indicates that there are more genes than the client indicated.\n";
344	0					0	print STDERR "The annotation provider indicates there are $populationSize, while the client indicated only ", $self->totalNumGenes, ".\n";
345	0					0	print STDERR "Thus, assuming the correct total number of genes is that indicated by the annotation provider.\n";
346
347							}
348
349	0					0	$self->{$kArgs}{totalNumGenes} = $populationSize;
350
351							}
352
353							# now determine the level of annotation for each GO node in the
354							# population of genes to be used as the background distribution
355
356	4					28	my $totalNodeCounts = $self->__buildHashRefOfAnnotations(\@databaseIds);
357
358							# adjust those counts if needs be
359
360	4	100				27	if ($populationSize < $self->totalNumGenes){
361
362							# if there are extra, entirely unannotated genes (indicated by
363							# the total number of genes provided being greater than the
364							# number that existed in the annotation provider), we must
365							# make sure that it's treated that they will at least be
366							# annotated to the root (Gene Ontology), and its immediate
367							# child (which is the name of the Ontology, eg
368							# Biological_process, Molecular_function, and
369							# Cellular_component), and the 'unannotated' node
370
371							# so simply add extra annotations
372
373	1					5	my $rootNodeId = $self->__ontologyProvider->rootNode->goid;
374
375	1					3	my $childNodeId = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid;
376
377	1					3	$totalNodeCounts->{$rootNodeId} = $self->totalNumGenes;
378
379	1					4	$totalNodeCounts->{$childNodeId} += ($self->totalNumGenes - $populationSize);
380
381	1					4	$totalNodeCounts->{$kUnannotatedNode->goid} += ($self->totalNumGenes - $populationSize);
382
383							}
384
385							# and now store the information
386
387	4					16	$self->{$kTotalGoNodeCounts} = $totalNodeCounts;
388	4					81	$self->{$kTotalNumAnnotatedGenes} = $populationSize;
389
390							# set the discarded genes to be a reference to an empty list
391							# (technically they shouldn't ask to retrieve the discarded genes
392							# before calling findTerms, but this will prevent such behavior
393							# from being fatal
394
395	4					14	$self->{$kDiscardedGenes} = [];
396
397							# store a hash of the databaseIDs that are in the background set of genes
398
399	4					11	my %databaseIds;
400
401	4					11	foreach my $databaseId (@databaseIds){
402
403	19429					42438	$databaseIds{$databaseId} = undef;
404
405							}
406
407	4					35	$self->{$kBackgroundDatabaseIds} = \%databaseIds;
408
409							# create a Distributions object, which has C code for all the various
410							# Math that we will do.
411
412	4					36	$self->{$kDistributions} = GO::TermFinder::Native::Distributions->new($self->totalNumGenes);
413
414							}
415
416							=pod
417
418							=head1 Instance Methods
419
420							=cut
421
422							#####################################################################
423							sub findTerms{
424							#####################################################################
425							=pod
426
427							=head2 findTerms
428
429							This method returns an array of hash references, one for each GO::Node
430							that was tested as a hypothesis, that indicates which terms annotate
431							the list of genes with what P-values. The contents of the hashes in
432							the returned array depend on some of the run time options. They are:
433
434							key value
435							-------------------------------------------------------------------------
436
437							Always Present:
438
439							NODE A GO::Node
440
441							PVALUE The P-value for having the observed number of
442							annotations that the provided list of genes
443							has to that node.
444
445							NUM_ANNOTATIONS The number of genes within the provided list that
446							are annotated to the node.
447
448							TOTAL_NUM_ANNOTATIONS The number of genes in the population (total
449							or provided) that are annotated to the node.
450
451							ANNOTATED_GENES A hash reference, whose keys are the
452							databaseIds that are annotated to the node,
453							and whose values are the original name
454							supplied to the findTerms() method.
455
456							Present if corrected p-values are calculated:
457
458							CORRECTED_PVALUE The CORRECTED_PVALUE is the PVALUE, but corrected
459							for multiple hypothesis testing, due to the
460							fact that you are more likely to generate
461							significant looking p-values if you test a
462							lot of hypotheses. See below for details of
463							how this pvalue is calculated, and the
464							options associated with it.
465
466							Present if p-values were corrected by simulation:
467
468							NUM_OBSERVATIONS The number of simulations in which a p-value as
469							good as this one, or better, was observed.
470
471							Present if the False Discovery Rate is calculated:
472
473							FDR_RATE The False Discovery Rate - this is a fraction
474							of how many of the nodes with p-values as good or
475							better than the node with this FDR would be expected
476							to be false positives.
477
478							FDR_OBSERVATIONS The average number of nodes during simulations
479							that had an uncorrected p-value as good or better
480							than the p-value of this node.
481
482							EXPECTED_FALSE_POSITIVES The expected number of false positives if this node
483							is chosen as the cut-off.
484
485							The entries in the returned array are sorted by increasing p-value
486							(i.e. least likely is first). If there is a tie in the p-value, then
487							the sort order is determined by GOID, using a cmp comparison.
488
489							findTerm() expects to be passed, by reference, a list of gene names
490							for which terms will be found. If a passed in name is ambiguous (see
491							AnnotationProvider), then the following will occur:
492
493							1) If the name can be used as a standard name, it will assume that
494							it is that.
495
496							2) Otherwise it will not use it.
497
498							Currently a warning will be printed to STDOUT in the case of an
499							ambiguous name being used.
500
501							The passed in gene names are converted into a list of databaseIds. If
502							a gene does not map to a databaseId, then an undef is put in the list
503							- however, if the same gene name, which does not map to a databaseId,
504							is used twice then it will produce only one undef in the list. If
505							more than one gene name maps to the same databaseId (either because
506							you used the same name twice, or you used an alias as well), then that
507							databaseId is only put into the list once, and a warning is printed.
508
509							If a gene name does not have any information returned from the
510							AnnotationProvider, then it is assumed that the gene is entirely
511							unannotated. For these purposes, TermFinder annotates such genes to
512							the root node (Gene_Ontology), its immediate child (which indicates
513							the aspect of the ontology (such as biological_process), and a dummy
514							go node, corresponding to unannotated. This node will have a goid of
515							'GO:XXXXXXX', and a term name of 'unannotated'. No other information
516							will be set up for this GO::Node, so you should not count on being
517							able to retrieve it. What it does mean is that you can determine if
518							the predominant feature of a set of genes is that they have no
519							annotation.
520
521							If more genes are provided that have been indicated exist in the
522							genome (as provided during object construction), then an error message
523							will be printed out, and an empty list will be returned.
524
525							In addition, it is possible that for a small list of genes, that no
526							hypotheses will be tested - in this case, those genes will only have
527							annotated nodes with a count of 1, other than the Gene_Ontology node
528							itself, and the node corresponding to the aspect of the ontology.
529							Neither of these are considered for p-value testing, as a priori they
530							must have a p-value of 1.
531
532							MULTIPLE HYPOTHESIS CORRECTION
533
534							An optional argument, 'correction' may be used, which indicates what
535							method of multiple hypothesis correction should be used. Multiple
536							hypothesis correction attempts to keep the overall chance of getting
537							any false positives at the same level (e.g. 0.05). Acceptable values
538							are:
539
540							bonferroni, none, simulation
541
542							: 'bonferroni' will correct the p-values by using as the correction
543							factor the total number of nodes to which the provided list of
544							genes are annotated, either directly or indirectly, excepting any
545							nodes that are annotated only once in the background distribution,
546							as, a priori, these cannot be overrepresented.
547
548							: 'none' will perform no multiple hypothesis correction
549
550							: 'simulation' will run 1000 simulations with random lists of genes
551							(the same size as the originally provided gene list), and determine
552							a corrected value by how many simulations produced a p-value better
553							than the p-value associated with one of the real hypotheses.
554							E.g. if a node from the real data has a p-value of 0.05, but a
555							p-value that good or better is generated in 500 out of 1000 trials,
556							the corrected pvalue will be 0.5. In the case that a p-value
557							generated from a real list of genes is never seen in the
558							simulations, it will be given a corrected p-value of < 0.001, and
559							the NUM_OBSERVATIONS attribute of the hypothesis will be 0. Using
560							this option takes 1000 time as long!
561
562							The default for this argument, if not provided, is bonferroni.
563
564							FALSE DISCOVERY RATE
565
566							As a way of preempting the potential problems of using p-values
567							corrected for multiple hypothesis testing, the False Discovery Rate
568							can instead be calculated, and you can instead set your cutoff based
569							on an acceptable false discovery rate, such as 0.01 (1%), or 0.05 (5%)
570							etc. Thus, the optional argument 'calculateFDR' can be used. A
571							non-zero value means that the False Discovery Rate will be calculated
572							for each node, such that you can determine, if you chose your p-value
573							cut-off at that node, what the FDR would be. The FDR is calculated by
574							running 50 simulations, and counting the average number of times a
575							p-value as good or better that a p-value generated from the real data
576							is seen. This is used as the numerator. The denominator is the
577							number of p-values in the real data that are as good or better than
578							it.
579
580							Usage example - in this example, the default (Bonferroni) correction
581							is used to calculate a corrected p-value, and in addition, the False
582							Discovery Rate is also calculated:
583
584							my @pvalueStructures = $termFinder->findTerms(genes => \@genes,
585							calculateFDR => 1);
586
587							my $hypothesis = 1;
588
589							foreach my $pvalue (@pvalueStructures){
590
591							print "-- $hypothesis of ", scalar @pvalueStructures, "--\n",
592
593							"GOID\t", $pvalue->{NODE}->goid, "\n",
594
595							"TERM\t", $pvalue->{NODE}->term, "\n",
596
597							"P-VALUE\t", $pvalue->{PVALUE}, "\n",
598
599							"CORRECTED P-VALUE\t", $pvalue->{CORRECTED_PVALUE}, "\n",
600
601							"FALSE DISCOVERY RATE\t", $pvalue->{FDR_RATE}, "\n",
602
603							"NUM_ANNOTATIONS\t", $pvalue->{NUM_ANNOTATIONS}, " (of ", $pvalue->{TOTAL_NUM_ANNOTATIONS}, ")\n",
604
605							"ANNOTATED_GENES\t", join(", ", values (%{$pvalue->{ANNOTATED_GENES}})), "\n\n";
606
607							$hypothesis++;
608
609							}
610
611							If a background population had been provided when the object was
612							constructed, you should check to see if any of your genes for which
613							you are finding terms were discarded, due to not being found in the background
614							population, e.g.:
615
616							my @pvalueStructures = $termFinder->findTerms(genes => \@genes,
617							calculateFDR => 1);
618
619							my @discardedGenes = $termFinder->discardedGenes;
620
621							if (@discardedGenes){
622
623							print "The following genes were not considered in the pvalue
624							calculations, as they were not found in the provided background
625							population.\n\n", join("\n", @discardedGenes), "\n\n";
626
627							}
628
629							=cut
630
631	2116			2116	1	155093	my ($self, %args) = @_;
632
633							# let's check that they have provided the required information
634
635	2116					17338	$self->__checkAndStoreFindTermsArgs(%args);
636
637							# now we determine all the count for direct and indirect
638							# annotations for the provided list of genes.
639
640	2115					8499	$self->{$kGoCounts} = $self->__buildHashRefOfAnnotations([$self->genesDatabaseIds]);
641
642							# now we have these counts, and because we determined the counts
643							# of the background distribution during object construction, we
644							# can determine the p-values for the annotations of our list of
645							# genes of interest.
646
647	2115					113522	$self->__calculatePValues;
648
649							# now we want to add in which genes were annotated to each node
650							# so that the client can determine them
651
652	2115					269020	$self->__addAnnotationsToPValues;
653
654							# now what we want to do is calculate pvalues that are corrected
655							# for multiple hypothesis testing, unless it is specifically
656							# requested not to.
657
658	2115	100				17675	$self->__correctPvalues unless ($self->__correctionMethod eq 'none');
659
660							# now calculate the False Discovery Rate, if requested to
661
662	2115	100				10200	$self->__calculateFDR if ($self->__shouldCalculateFDR);
663
664	2115					10226	return $self->__pValues;
665
666							}
667
668							#####################################################################
669							sub __checkAndStoreFindTermsArgs{
670							#####################################################################
671							# This private method checks the arguments that are passed into the
672							# findTerms() method, and stores various variables internally.
673
674	2116			2116		8237	my ($self, %args) = @_;
675
676							# check they gave us a list of genes
677
678	2116	50				14951	if (!exists ($args{'genes'})){
		50
679
680	0					0	die "You must provide a genes argument";
681
682							}elsif (!defined ($args{'genes'})){
683
684	0					0	die "Your genes argument is undefined";
685
686							}
687
688							# see if they gave us an allowable method by which to correct for
689							# multiple hypotheses
690
691	2116		100			18663	$self->{$kCorrectionMethod} = $args{'correction'} \|\| 'bonferroni';
692
693	2116	50				11549	if (!exists $kAllowedCorrectionMethods{$self->__correctionMethod}){
694
695	0					0	die $self->__correctionMethod." is not an allowed correction method. Use one of :".
696
697							join(", ", keys %kAllowedCorrectionMethods);
698
699							}
700
701							# store whether to calculate the FDR
702
703	2116	100	100			20342	if (exists $args{'calculateFDR'} && $args{'calculateFDR'} != 0){
704
705	2					9	$self->{$kShouldCalculateFDR} = 1;
706
707							}else{
708
709							# default is not to calculate it
710
711	2114					7086	$self->{$kShouldCalculateFDR} = 0;
712
713							}
714
715							# what we want to do now is build up an array of identifiers that
716							# are unambiguous - ie databaseIds
717							#
718							# This means that when retrieving GOID's, we can always retrieve
719							# them by databaseId, which is unambiguous.
720
721	2116					12666	my ($databaseIdsRef, $databaseId2OrigNameRef) = $self->__determineDatabaseIdsFromGenes($args{'genes'});
722
723							# now we want to make sure that if they provided a population as
724							# the background, then all of the provided genes that are being
725							# tested for enriched GO terms are sampled from that population
726
727	2116					4922	my @discardedGenes;
728
729	2116	100				11528	if ($self->__isUsingPopulation){
730
731	1058					3501	my @missingIds;
732
733							# go through each databaseID, and see if it is in the databaseIDs
734							# associated with the GO counts for the background population. If
735							# it's a fake ID, then see if the original name is in the names
736							# that were passed in.
737
738	1058					1959	foreach my $databaseId (@{$databaseIdsRef}){
	1058					3556
739
740							# if it's a fake databaseId, we have to see if the orig
741							# name was in the provided population, otherwise, if it's
742							# a real databaseId, check that the databaseId is in the
743							# background
744
745	20103	100	66			77043	if ((
			66
746
747							$databaseId =~ /^$kFakeIdPrefix/o &&
748							!$self->__origNameInPopulation($databaseId2OrigNameRef->{$databaseId}))
749
750							\|\|
751
752							!$self->__databaseIdIsInBackground($databaseId)){
753
754	16					34	push(@missingIds, $databaseId);
755
756							}
757
758							}
759
760							# Now see if we have any missing names
761
762							# If we have as many missing names as there were genes
763							# provided, then we'll die, as there is nothing that can be
764							# done, as no gene remain for any enrichment calculations
765
766	1058	100				2282	if (@missingIds == @{$databaseIdsRef}){
	1058					3578
767
768	1					12	die "None of the genes provided for analysis are found in the background population.\n";
769
770							}
771
772							# Otherwise, we will print a warning that genes were
773							# discarded, but we also provide an API for them to retrieve
774							# the names of genes that were discarded.
775
776	1057	100				4426	if (@missingIds){
777
778	3	50				13	if ($WARNINGS){
779
780	0					0	print STDERR "\nThe following names in the provided list of genes do not have a\n",
781
782							"counterpart in the background population that you provided.\n",
783
784							"These genes will not be used in the analysis for enriched GO terms.\n\n";
785
786	0					0	foreach my $databaseId (@missingIds){
787
788	0					0	print STDERR $databaseId2OrigNameRef->{$databaseId}, "\n";
789
790							}
791
792	0					0	print STDERR "\n";
793
794							}
795
796							# now we have to actually remove them from the list of
797							# considered genes
798
799							# create a dummy hash of the databaseIds, delete the
800							# elements, and then assign the remaining keys back to the
801							# $databaseIdsRef
802
803							# we'll also remember it
804
805	3					7	my %dummyDatabaseIdsHash = %{$databaseId2OrigNameRef};
	3					57
806
807	3					12	foreach my $databaseId (@missingIds){
808
809	12					23	push (@discardedGenes, $databaseId2OrigNameRef->{$databaseId});
810
811	12					26	delete $dummyDatabaseIdsHash{$databaseId};
812
813							}
814
815	3					33	$databaseIdsRef = [keys %dummyDatabaseIdsHash]
816
817							}
818
819							}
820
821							# now remember the genes that were discarded
822
823	2115					11073	$self->__setDiscardedGenes(\@discardedGenes);
824
825							# now store them the databaseIDs for the genes that can be used to
826							# determine enriched GO terms in the self object
827
828	2115					7881	$self->{$kDatabaseIds} = $databaseIdsRef;
829
830							# also store the mapping of the databaseId to its original name
831
832	2115					25434	$self->{$kDatabaseId2OrigName} = $databaseId2OrigNameRef;
833
834							# note, we need to provide the client with a way of determining
835							# how many genes were used when calculating p-values for
836							# annotations
837
838	2115	50				26829	if (scalar ($self->genesDatabaseIds) > $self->totalNumGenes){
839
840	0	0				0	if ($WARNINGS){
841
842	0					0	print "You have provided a list corresponding to ", scalar ($self->genesDatabaseIds), "genes, ",
843
844							"yet you have indicated that there are only ", $self->totalNumGenes, " in the genome.\n";
845
846	0					0	print "No probabilities can be calculated.\n";
847
848							}
849
850	0					0	return (); # simply return an empty list
851
852							}
853
854
855
856							}
857
858							#####################################################################
859							sub discardedGenes {
860							#####################################################################
861							=pod
862
863							=head2 discardedGenes
864
865							This method returns an array of genes which were discarded from the
866							pvalue calculations, because they could not be found in the background
867							population. It should only be called after findTerms. It will either
868							return an empty list, if no genes were discarded, or an array of genes
869							that were discarded.
870
871							Usage:
872
873							my @pvalueStructures = $termFinder->findTerms(genes => \@genes,
874							calculateFDR => 1);
875
876							my @discardedGenes = $termFinder->discardedGenes;
877
878							if (@discardedGenes){
879
880							print "The following genes were not considered in the pvalue
881							calculations, as they were not found in the provided background
882							population.\n\n", join("\n", @discardedGenes), "\n\n";
883
884							}
885
886							=cut
887
888	3			3	1	37	return @{$_[0]->{$kDiscardedGenes}};
	3					21
889
890							}
891
892
893							#
894							# PRIVATE INSTANCE METHODS
895							#
896
897							#####################################################################
898							sub __databaseIdIsInBackground{
899							#####################################################################
900							# This private method will return a Boolean to indicate whether the
901							# supplied databaseId is in the set of databaseIds determined for the
902							# background set of genes. Note, it does not check if the databaseId
903							# is a fake one, so the client should do that if it needs to
904
905	20087			20087		106417	return exists $_[0]->{$kBackgroundDatabaseIds}{$_[1]};
906
907							}
908
909							#####################################################################
910							sub __isUsingPopulation{
911							#####################################################################
912							# This private method returns a boolean to indicate whether the client
913							# passed in a population of genes to use as the background distribution
914
915	2124			2124		13375	return exists $_[0]->{$kArgs}{population};
916
917							}
918
919							#####################################################################
920							sub __population{
921							#####################################################################
922							# This private method returns a reference to an array of identifiers
923							# that were passed in to be used as a background population
924
925	4			4		3741	return $_[0]->{$kArgs}{population};
926
927							}
928
929							#####################################################################
930							sub __origNameInPopulation{
931							#####################################################################
932							# This private method returns a Boolean to indicate whether the
933							# provided name is in the list of names that were provided as a
934							# background population
935
936	16			16		104	return exists $_[0]->{$kPopulationNamesHash}{$_[1]};
937
938							}
939
940							#####################################################################
941							sub __setDiscardedGenes{
942							#####################################################################
943							# This private method will store the passed in array reference, which
944							# points to a list of genes that had to be discarded.
945
946	2115			2115		9981	$_[0]->{$kDiscardedGenes} = $_[1];
947
948							}
949
950							#####################################################################
951							sub __totalNumAnnotatedGenes{
952							#####################################################################
953							# This private method returns the number of genes that have any annotation,
954							# as determined from the AnnotationProvider. This is set during object
955							# initialization.
956
957	38			38		149	return $_[0]->{$kTotalNumAnnotatedGenes};
958
959							}
960
961							#####################################################################
962							sub __numAnnotatedNodesInBackground{
963							#####################################################################
964							# This private method returns the number of nodes in the ontology that
965							# have any annotation in the background distribution. This is stored
966							# during object initialization as a hash of GOID to the number of
967							# counts.
968
969	0			0		0	return scalar keys %{$_[0]->{$kTotalGoNodeCounts}};
	0					0
970
971							}
972
973							#####################################################################
974							sub __allGoIdsForBackground{
975							#####################################################################
976							# This private method returns as an array all the GOIDs of nodes in
977							# the ontology that have any annotation in the background
978							# distribution. This is stored during object initialization as a hash
979							# of GOID to the number of counts.
980
981	0			0		0	return keys %{$_[0]->{$kTotalGoNodeCounts}};
	0					0
982
983							}
984
985							#####################################################################
986							sub genesDatabaseIds{
987							#####################################################################
988							=pod
989
990							=head2 genesDatabaseIds
991
992							This method returns an array of databaseIds corresponding to the genes
993							that were used for the findTerms() method. Thus it allows a client to
994							find out how many actual entities their list of genes that were passed
995							in mapped to, e.g. they may have passed in the same thing with two
996							different names. Using this method, immediately following use of the
997							findTerms method, they will determine how many genes their list
998							collapsed to.
999
1000							=cut
1001
1002	8464			8464	1	10124	return @{$_[0]->{$kDatabaseIds}};
	8464					68182
1003
1004							}
1005
1006							#####################################################################
1007							sub __origNameForDatabaseId{
1008							#####################################################################
1009							# This method returns the original name that was provided to the term
1010							# finder for the databaseId that it was translated to.
1011
1012	554230			554230		2489849	return $_[0]->{$kDatabaseId2OrigName}->{$_[1]};
1013
1014							}
1015
1016							#####################################################################
1017							sub __pValues{
1018							#####################################################################
1019							# This method returns an array of pValues structures
1020
1021	2125			2125		3799	return @{$_[0]->{$kPvalues}};
	2125					42010
1022
1023							}
1024
1025							#####################################################################
1026							sub __correctionMethod{
1027							#####################################################################
1028							# This method returns the name of the method by which the client has
1029							# chosen to have their p-values corrected - either none, bonferroni,
1030							# custom, or simulation.
1031
1032	4245			4245		26473	return $_[0]->{$kCorrectionMethod};
1033
1034							}
1035
1036							#####################################################################
1037							sub __shouldCalculateFDR{
1038							#####################################################################
1039							# This method returns a boolean, to indicate whether the false discovery
1040							# rate should be calculated
1041
1042	2115			2115		12108	return $_[0]->{$kShouldCalculateFDR};
1043
1044							}
1045
1046							#####################################################################
1047							sub __determineDatabaseIdsFromGenes{
1048							#####################################################################
1049							# This method determines a list of databaseIds for a list of genes
1050							# passed in by reference. It then returns a reference to that list,
1051							# and a reference to a hash that maps the databaseIds to the
1052							# originally supplied name
1053							#
1054							# If more than one gene maps to the same databaseId, then the
1055							# databaseId is only put in the list once, and a warning is printed.
1056							#
1057							# If a gene does not map to a databaseId, then an undef is put in the
1058							# list - however, if the same gene name, which does not map to a
1059							# databaseId, is used twice then it will produce only one undef in the
1060							# list.
1061							#
1062							# In addition, it removes leading and trailing whitespace from supplied
1063							# gene names (assuming they should have none) and will skip any names that
1064							# are either empty, or whitespace only.
1065
1066	2118			2118		4765	my ($self, $genesRef) = @_;
1067
1068	2118					3823	my (@databaseIds, $databaseId, %databaseIds, %genes, %duplicates, %warned);
1069
1070	2118					3790	foreach my $gene (@{$genesRef}){
	2118					5548
1071
1072							# strip leading and trailing spaces
1073
1074	53129					115625	$gene =~ s/^\s+//;
1075	53129					81347	$gene =~ s/\s+$//;
1076
1077	53129	50				105878	next if $gene eq ""; # skip empty names
1078
1079							# skip and warn if we've already seen the gene
1080
1081	53129	50				115059	if (exists ($genes{$gene})){
1082
1083	0	0	0			0	if ($WARNINGS && !exists($warned{$gene})){
1084
1085	0					0	print "The gene name '$gene' was used more than once.\n";
1086	0					0	print "It will only be considered once.\n\n";
1087
1088	0					0	$warned{$gene} = undef;
1089
1090							}
1091
1092	0					0	next; # just skip to the next supplied gene
1093
1094							}
1095
1096							# determine if the gene is ambiguous
1097
1098	53129	50				107115	if ($self->__annotationProvider->nameIsAmbiguous($gene)){
1099
1100	0	0				0	print "$gene is an ambiguous name.\n" if $WARNINGS;
1101
1102	0	0				0	if ($self->__annotationProvider->nameIsStandardName($gene)){
1103
1104	0	0				0	if ($WARNINGS){
1105
1106	0					0	print "Since $gene is used as a standard name, it will be assumed to be one.\n\n";
1107
1108							}
1109
1110	0					0	$databaseId = $self->__annotationProvider->databaseIdByStandardName($gene);
1111
1112	0					0	push (@databaseIds, $databaseId);
1113
1114							}else{
1115
1116	0	0				0	if ($WARNINGS){
1117
1118	0					0	print "Since $gene is an ambiguous alias, it will not be used.\n\n";
1119
1120							}
1121
1122							}
1123
1124							}else{
1125
1126							# note, if the gene has no annotation, then we will want
1127							# to create a fake databaseId, that we can easily
1128							# recognize, and will have to make sure that we deal with
1129							# this later when getting annotations.
1130
1131	53129					103779	$databaseId = $self->__annotationProvider->databaseIdByName($gene);
1132
1133							# if the total number of genes is equal to the number of
1134							# things with some annotation, then there should be no
1135							# genes that do not return a databaseId. If this is the
1136							# case, we will warn them.
1137
1138	53129	100				110433	if (!defined $databaseId){
1139
1140							# If we've already defined the total number of genes
1141							# with annotation, and it's equal to the number of
1142							# genes for the background distribution, and we're not
1143							# using a population, we'll print a warning, as under
1144							# these circumstances we shouldn't not get a
1145							# databaseId.
1146
1147	19	50	66			48	if (defined ($self->__totalNumAnnotatedGenes) &&
			66
			33
1148							$self->__totalNumAnnotatedGenes == $self->totalNumGenes &&
1149							$WARNINGS &&
1150							!$self->__isUsingPopulation){
1151
1152	0					0	print "\nThe name '$gene' did not correspond to an entry from the AnnotationProvider.\n";
1153	0					0	print "However, the client has indicated that all genes have annotation.\n";
1154	0					0	print "You should probably check that '$gene' is a real name.\n\n";
1155
1156							}
1157
1158							# Now we need to deal with the lack of databaseId
1159							# We'll simply create a fake one, that we can easily
1160							# recognize later, so we can deal with it accordingly
1161
1162	19					50	$databaseId = $kFakeIdPrefix.$gene;
1163
1164							}
1165
1166	53129					89340	push (@databaseIds, $databaseId);
1167
1168							}
1169
1170							# if we have a databaseId that we've already seen, we want to
1171							# make sure we only consider it once.
1172
1173	53129	50	33			244646	if (defined ($databaseId) && exists($databaseIds{$databaseId})){
1174
1175	0					0	pop (@databaseIds); # get rid of the extra
1176
1177							# and let's remember what it was, as well as the previous
1178							# name associated with this databaseId, so we can give an
1179							# appropriate warning
1180
1181	0					0	$duplicates{$databaseId}{$gene} = undef;
1182	0					0	$duplicates{$databaseId}{$databaseIds{$databaseId}} = undef;
1183
1184
1185							}
1186
1187							# remember the databaseId and gene, in case we see them again
1188
1189	53129	50				158519	$databaseIds{$databaseId} = $gene if (defined ($databaseId));
1190	53129					102744	$genes{$gene} = undef;
1191
1192							}
1193
1194
1195	2118	0	33			10093	if (%duplicates && $WARNINGS){
1196
1197	0					0	print "The following databaseIds were represented multiple times:\n\n";
1198
1199	0					0	foreach my $duplicate (sort keys %duplicates){
1200
1201	0					0	print $duplicate, " represented by ", join(", ", (sort keys %{$duplicates{$duplicate}})), "\n";
	0					0
1202
1203							}
1204
1205	0					0	print "\nEach of these databaseIds will only be considered once.\n";
1206
1207							}
1208
1209							# return databaseIds, and their mapping to the originally supplied
1210							# name
1211
1212	2118					72567	return (\@databaseIds, \%databaseIds);
1213
1214							}
1215
1216							############################################################################
1217							sub __buildHashRefOfAnnotations{
1218							############################################################################
1219							# This private method takes a reference to an array of databaseIds and
1220							# calculates the level of annotations for all GO nodes that those
1221							# databaseIds have either direct or indirect annotation for. It
1222							# returns a reference to a hash of GO node counts, with the goids
1223							# being the keys, and the number of annotations they have from the
1224							# list of databaseId's being the values.
1225
1226	2119			2119		7269	my ($self, $databaseIdsRef) = @_;
1227
1228	2119					5107	my %goNodeCounts;
1229
1230							# keep track of how many are annotated to the aspect node
1231							# (e.g. such as molecular function). See comments for
1232							# __allGOIDsForDatabaseId for more information
1233
1234	2119					5726	my $aspectNodeDirectAnnotations = 0;
1235
1236	2119					13550	my $aspectNodeGoid = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid;
1237
1238							# If gene has no annotation, annotate it to the top node
1239							# (Gene_Ontology), and its immediate child (the aspect itself) and
1240							# the 'unannotated' node.
1241
1242	2119					12253	my @noAnnotationNodes = ($aspectNodeGoid,
1243							$self->__ontologyProvider->rootNode->goid,
1244							$kUnannotatedNode->goid);
1245
1246	2119					7133	foreach my $databaseId (@{$databaseIdsRef}) {
	2119					6559
1247
1248							# get goids count, if the databaseId is not a fake one
1249
1250	66053					91736	my $goidsRef;
1251
1252	66053	100				151956	if ($databaseId !~ /^$kFakeIdPrefix/o){
1253
1254	66050					167048	$goidsRef = $self->__allGOIDsForDatabaseId($databaseId);
1255
1256							}
1257
1258	66053	100	66			181424	if (!defined $goidsRef \|\| !(@{$goidsRef})) {
	66050					166152
1259
1260							# If gene has no annotation, annotate it to the top node
1261							# (Gene_Ontology), and its immediate child (the aspect itself)
1262							# and the 'unannotated' node, which we cached earlier.
1263
1264	534					2209	$goidsRef = [@noAnnotationNodes];
1265
1266							# now cache the goids for the unnannotated genes. The
1267							# ones that were annotated, had their goids cached in the
1268							# __allGOIDsForDatabaseId. It is an optimization to take
1269							# care of that there, but this here.
1270
1271	534					1697	$self->{$kGOIDsForDatabaseIds}->{$databaseId} = $goidsRef;
1272
1273							}
1274
1275							# increment count for all goids appearing in @goids;
1276
1277	66053					96308	foreach my $goid (@{$goidsRef}) {
	66053					125303
1278
1279	1195884					1977907	$goNodeCounts{$goid}++;
1280
1281							}
1282
1283							# keep count of how many are directly annotated to the aspect node
1284
1285	66053	100				282559	if (exists ($self->{$kDirectAnnotationToAspect}{$databaseId})){
1286
1287	15361					32785	$aspectNodeDirectAnnotations++;
1288
1289							}
1290
1291							}
1292
1293							# now we'd like to replace the counts for the aspect annotations,
1294							# so that they only refer to the direct annotations, rather than
1295							# direct and indirect annotations
1296
1297	2119					6056	$goNodeCounts{$aspectNodeGoid} = $aspectNodeDirectAnnotations;
1298
1299	2119					11387	return \%goNodeCounts;
1300
1301							}
1302
1303							############################################################################
1304							sub __allGOIDsForDatabaseId{
1305							############################################################################
1306							# This method returns a reference to an array of all GOIDs to which a
1307							# databaseId is annotated, whether explicitly, or implicitly, by
1308							# virtue of the GO node being an ancestor of an explicitly annotated
1309							# one. The returned array contains no duplicates.
1310
1311							# Because the Gene Ontology no longer has the unknown terms, then
1312							# direct annotation to the aspect node (e.g. molecular function),
1313							# means what annotation to the unknown terms previously meant. But,
1314							# as all nodes are descendents of the aspect node, then enrichment for
1315							# this node will never happen, unless we only look for enrichment of
1316							# direct annotations to this node. Thus, in this method, we also
1317							# record which databaseIds are directly annotated to the aspect node, which
1318							# will be used elsewhere.
1319
1320	112674			112674		179733	my ($self, $databaseId) = @_;
1321
1322							# cache aspect's ID, so we don't have to repeatedly retrieve it
1323
1324	112674					268225	my $aspectId = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid; #
1325
1326							# generate list of GOIDs if not cached
1327
1328	112674	100				447643	if (!exists($self->{$kGOIDsForDatabaseIds}->{$databaseId})) {
1329
1330	19429					27106	my %goids; # so we keep the list unique
1331
1332							# go through the direct annotations
1333
1334	19429					26102	foreach my $goid (@{$self->__annotationProvider->goIdsByDatabaseId(databaseId => $databaseId,
	19429					47272
1335							aspect => $self->aspect)}){
1336
1337							# just in case an annotation is to a goid not present in the ontology
1338
1339	32710	50				70519	if (!$self->__ontologyProvider->nodeFromId($goid)){
1340
1341	0	0				0	if ($WARNINGS){
1342
1343	0					0	print STDERR "\nWarning : $goid, used to annotate $databaseId with an aspect of ".$self->aspect.", does not appear in the provided ontology.\n";
1344
1345							}
1346
1347							# don't record annotations to this goid
1348
1349	0					0	next;
1350
1351							}
1352
1353							# record the goid and its ancestors
1354
1355	32710					66754	$goids{$goid} = undef;
1356
1357	32710					65139	foreach my $ancestor ($self->__ontologyProvider->nodeFromId($goid)->ancestors){
1358
1359	455507					1079355	$goids{$ancestor->goid} = undef;
1360
1361							}
1362
1363							# record in the self object if it's directly annotated to the aspectId
1364
1365	32710	100				128101	if ($goid eq $aspectId){
1366
1367	4516					17883	$self->{$kDirectAnnotationToAspect}{$databaseId} = undef;
1368
1369							}
1370
1371							}
1372
1373							# cache the value
1374
1375	19429					241384	$self->{$kGOIDsForDatabaseIds}->{$databaseId} = [keys %goids];
1376
1377							}
1378
1379	112674					403529	return ($self->{$kGOIDsForDatabaseIds}->{$databaseId});
1380
1381							}
1382
1383							#####################################################################
1384							sub __calculatePValues{
1385							#####################################################################
1386							# This method actually determines the p-values of the various levels
1387							# of annotation for the particular GO nodes, and stores them within
1388							# the object.
1389
1390	2115			2115		4650	my $self = shift;
1391
1392	2115					6412	my $numDatabaseIds = scalar $self->genesDatabaseIds;
1393
1394	2115					3914	my @pvalueArray;
1395
1396							# cache so we don't have to repeatedly look it up
1397
1398	2115					5976	my $rootGoid = $self->__ontologyProvider->rootNode->goid;
1399
1400							# each node we consider here must have at least one annotation in
1401							# our list of provided genes.
1402
1403	2115					9505	foreach my $goid ($self->__allGoIdsForList) {
1404
1405							# skip the root node, as it has to have a probability of 1,
1406							# but don't skip its immediate child though, as we now test
1407							# for enriched direct annotations
1408
1409	349325	100				710885	next if ($goid eq $rootGoid);
1410
1411							# skip any that has only one (or zero - could happen for the
1412							# aspect goid, as we replaced its counts) annotation in the
1413							# background distribution, as by definition these cannot be
1414							# overrepresented
1415
1416	347210	100				658653	next if ($self->__numAnnotationsToGoId($goid) <= 1);
1417
1418							# if we get here, we should calculate a p-value for this node
1419
1420	105137					214482	push (@pvalueArray, $self->__processOneGOID($goid, $numDatabaseIds));
1421
1422							}
1423
1424							# now sort the pvalueArray by their pValues. If the values are the same,
1425							# then sort by goid (text based comparison).
1426
1427	2115	50				62178	@pvalueArray = sort {$a->{PVALUE} <=> $b->{PVALUE} \|\|
	480335					1020951
1428							$a->{NODE}->goid cmp $b->{NODE}->goid } @pvalueArray;
1429
1430	2115					10606	$self->{$kPvalues} = \@pvalueArray;
1431
1432							}
1433
1434							############################################################################
1435							sub __processOneGOID{
1436							############################################################################
1437							# This processes one GOID. It determines the number of annotations to
1438							# the current GOID, and the P-value of that number of annotations.
1439							# The pvalue is calculated as the probability of observing x or more
1440							# positives in a sample on n, given that there are M positives in a
1441							# population of N. This is calculated using the hypergeometric
1442							# distribution.
1443							#
1444							# It returns a hash reference encoding that information.
1445
1446	105137			105137		136238	my ($self, $goid, $n) = @_;
1447
1448	105137					197130	my $M = $self->__totalNumAnnotationsToGoId($goid);
1449	105137					213976	my $x = $self->__numAnnotationsToGoId($goid);
1450	105137					212609	my $N = $self->totalNumGenes();
1451
1452							# logic checking on data
1453
1454	105137	50				251066	if (($N - $M) < ($n - $x)){
1455
1456							# this situation should never arise, because the number of
1457							# failures in the sampling cannot exceed the total number of
1458							# failures in the population. For example, if all but one
1459							# gene has a particular annotation, then you can't pick 3
1460							# genes and get 2 without it
1461
1462	0					0	die 'For $N, $M, $n, $x being '."$N, $M, $n, $x, ".'($N - $M) < ($n - $x) which is impossible'."\n";
1463
1464							}
1465
1466	105137					110493	my $pvalue;
1467
1468	105137	50				169813	if ($M == $N){
1469
1470							# the p-value must be equal to 1, so we don't even need to
1471							# bother calling the p-value code
1472
1473	0					0	$pvalue = 1;
1474
1475							}else{
1476
1477	105137					1650990	$pvalue = $self->{$kDistributions}->pValueByHypergeometric($x, $n, $M, $N);
1478
1479							}
1480
1481	105137		66			235593	my $node = $self->__ontologyProvider->nodeFromId($goid) \|\| $kUnannotatedNode;
1482
1483	105137					445885	my $hashRef = {
1484
1485							NODE => $node,
1486							PVALUE => $pvalue,
1487							NUM_ANNOTATIONS => $x,
1488							TOTAL_NUM_ANNOTATIONS => $M
1489
1490							};
1491
1492	105137					284548	return $hashRef;
1493
1494							}
1495
1496							############################################################################
1497							sub __numAnnotationsToGoId{
1498							############################################################################
1499							# This private method returns the number of annotations to a
1500							# particular GOID for the list of genes supplied to the findTerms
1501							# method.
1502
1503	452347			452347		551140	my ($self, $goid) = @_;
1504
1505	452347					1414980	return $self->{$kGoCounts}->{$goid};
1506
1507							}
1508
1509							############################################################################
1510							sub __totalNumAnnotationsToGoId{
1511							############################################################################
1512							# This returns the total number of genes that have been annotated to a
1513							# particular GOID based on all annotations.
1514
1515	105137			105137		142317	my ($self, $goid) = @_;
1516
1517	105137					327685	return $self->{$kTotalGoNodeCounts}->{$goid};
1518							}
1519
1520							############################################################################
1521							sub totalNumGenes{
1522							############################################################################
1523							=pod
1524
1525							=head2 totalNumGenes
1526
1527							This returns the total number of genes that are in the background set
1528							of genes from which the genes of interest were drawn. Unannotated
1529							genes are included in this count.
1530
1531							=cut
1532
1533	107295			107295	1	274117	return $_[0]->{$kArgs}{totalNumGenes};
1534
1535							}
1536
1537							############################################################################
1538							sub __allGoIdsForList{
1539							############################################################################
1540							# This returns an array of GOIDs to which genes in the passed in gene
1541							# list were directly or indirectly annotated.
1542
1543	2115			2115		3277	return keys %{$_[0]->{$kGoCounts}};
	2115					62452
1544
1545							}
1546
1547							############################################################################
1548							sub __correctPvalues{
1549							############################################################################
1550							# This method corrects the pvalues for multiple hypothesis testing, by
1551							# dispatching to the appropriate method based on what method was
1552							# requested for hypothesis correction.
1553
1554	14			14		27	my $self = shift;
1555
1556	14					37	my $correctionMethod = "__correctPvaluesBy".$self->__correctionMethod;
1557
1558	14					76	$self->$correctionMethod;
1559
1560							}
1561
1562							#####################################################################
1563							sub __correctPvaluesBybonferroni{
1564							#####################################################################
1565							# This method corrects the p-values using a Bonferroni correction,
1566							# where the correction factor is the total number of nodes for which
1567							# we tested whether there was significant enrichment
1568
1569	12			12		24	my $self = shift;
1570
1571							# now correct the pvalues with the correction factor
1572
1573	12					20	my $correctionFactor = scalar(@{$self->{$kPvalues}});
	12					36
1574
1575							# no correction needs to be done if there is 0 or 1 hypotheses
1576							# that were tested
1577
1578	12	100				42	if ($correctionFactor > 1){
1579
1580							# simply go through each hypothesis and calculate the corrected
1581							# p-value by multiplying the uncorrected p-value by the number of
1582							# nodes in the ontology
1583
1584	10					43	foreach my $hypothesis ($self->__pValues){
1585
1586	1911					3797	$hypothesis->{CORRECTED_PVALUE} = $hypothesis->{PVALUE} * $correctionFactor;
1587
1588							# make sure we have a ceiling of 1
1589
1590	1911	100				4886	$hypothesis->{CORRECTED_PVALUE} = 1 if ($hypothesis->{CORRECTED_PVALUE} > 1);
1591
1592							}
1593
1594							}
1595
1596							}
1597
1598							############################################################################
1599							sub __correctPvaluesBysimulation{
1600							############################################################################
1601							# This method corrects the P-values based on a thousand random trials,
1602							# using the same number of genes for each trial as was used in the
1603							# client query. A p-value will be corrected based on the number of
1604							# simulations in which that p-value was seen, e.g. if an uncorrected
1605							# p-value of 0.05 or better was observed in 100 of 1000 trials, the
1606							# corrected value will be 0.1 (100/1000).
1607
1608	2			2		7	my $self = shift;
1609
1610							# when we run any simulation, any of the variables that get
1611							# modified during the findTerms method will be trampled on - thus
1612							# we have to save them away, and then restore them afterwards
1613
1614	2					13	my $variables = $self->__saveVariables();
1615
1616							# we will need access to the real hypotheses - we'll reverse them
1617							# for now, as it makes them easier when we use them later on
1618
1619	2					5	my @realHypotheses = reverse @{$self->{$kPvalues}};
	2					15
1620
1621							# now let's get the population from which we will sample genes
1622							# randomly
1623
1624	2					11	my @names = $self->__samplingPopulation;
1625
1626	2					902	my $populationSize = scalar @names;
1627
1628							# now get the number of genes in the original test set
1629							# for which terms were found.
1630
1631	2					17	my $numGenes = scalar $self->genesDatabaseIds;
1632
1633							# now we can finally run the simulations
1634
1635	2					6	my $numSimulations = 1000;
1636
1637	2					12	for (my $i = 1; $i <= $numSimulations; $i++) {
1638
1639							# run simulation
1640
1641	2000					10621	my @pvals = $self->__runOneSimulation(\@names, $numGenes, $populationSize);
1642
1643							# go onto a new simulation if no hypothese resulted (which is
1644							# possible if the randomly selected genes did not have more
1645							# than one annotation to any particular GO node)
1646
1647	2000	50				11696	next if !@pvals;
1648
1649							# now we look at the best pvalue for the random genes, and
1650							# determine whether it is more significant that any of the
1651							# p-values generated for the real genes. We will keep a count
1652							# of how many times we see a p-value that is better than one
1653							# calculated with the real genes, on a per simulation basis
1654
1655							# if we go through the p-values for the real nodes in reverse
1656							# order (we reversed them above), then we can quit out of the
1657							# loop as soon as we have a p-value better than the best one
1658							# generated from the random genes
1659
1660	2000					7004	foreach my $realHypothesis (@realHypotheses){
1661
1662							# skip examining, if the real pvalue is better than the
1663							# best one for the random genes
1664
1665	17269	100				74500	last if ($pvals[0]->{PVALUE} > $realHypothesis->{PVALUE});
1666
1667							# if we get here, we know that this simulation has generated
1668							# a P_VALUE that is better than the P_VALUE for the currently
1669							# considered hypothesis. We'll simply keep count for now
1670
1671	15269					29833	$realHypothesis->{NUM_OBSERVATIONS}++;
1672
1673							}
1674
1675							}
1676
1677							# now we've run all the simulations, we should be able to simply divide
1678							# the observed frequency by the number of simulations.
1679
1680	2					4	foreach my $realHypothesis (@realHypotheses){
1681
1682	74	100				158	if (exists $realHypothesis->{NUM_OBSERVATIONS}){
1683
1684	34					87	$realHypothesis->{CORRECTED_PVALUE} = $realHypothesis->{NUM_OBSERVATIONS}/$numSimulations;
1685
1686							}else{
1687
1688							# a pvalue better than this wasn't observed in any
1689							# simulation - just record the minimum
1690
1691	40					61	$realHypothesis->{CORRECTED_PVALUE} = 1/$numSimulations;
1692
1693							# and say that we never saw it
1694
1695	40					71	$realHypothesis->{NUM_OBSERVATIONS} = 0;
1696
1697							}
1698
1699							}
1700
1701	2					6	@realHypotheses = reverse @realHypotheses;
1702
1703							# now restore the variables
1704
1705	2					11	$self->__restoreVariables($variables);
1706
1707							# finally replace the hypotheses with our local copy, which we've
1708							# made some modifications to
1709
1710	2					2658	$self->{$kPvalues} = \@realHypotheses;
1711
1712							}
1713
1714							############################################################################
1715							sub __saveVariables{
1716							############################################################################
1717							# This private method returns a hash containing various of the
1718							# instance variables that might get trampled on during a simulation
1719
1720	4			4		13	my ($self) = @_;
1721
1722	4					8	my %variables;
1723
1724	4					21	my @keys = ($kCorrectionMethod, $kShouldCalculateFDR, $kDatabaseIds,
1725							$kDatabaseId2OrigName, $kGoCounts, $kPvalues, $kDiscardedGenes);
1726
1727	4					12	foreach my $key (@keys){
1728
1729	28					75	$variables{$key} = $self->{$key};
1730
1731							}
1732
1733	4					16	return \%variables;
1734
1735							}
1736
1737							############################################################################
1738							sub __restoreVariables{
1739							############################################################################
1740							# This private method uses a passed in hash (by reference) to restore
1741							# variables within the instance
1742
1743	4			4		9	my ($self, $hashRef) = @_;
1744
1745	4					11	foreach my $key (%{$hashRef}){
	4					22
1746
1747	56					422	$self->{$key} = $hashRef->{$key};
1748
1749							}
1750
1751							}
1752
1753							############################################################################
1754							sub __samplingPopulation{
1755							############################################################################
1756							# This private method returns an array of id's that should be used as
1757							# the sampling population for the simulation
1758
1759	4			4		11	my $self = shift;
1760
1761							# we will need to pick genes randomly from the background
1762							# population. Note that population may be larger than the
1763							# databaseIds that are referenced in the annotations file - if so,
1764							# we have to be able to randomly select unannotated genes too
1765
1766							# alternatively, the user may have specified a population of genes
1767							# that define the background - in which case we should pick only
1768							# from that population
1769
1770	4					8	my @names;
1771
1772	4	100				15	if ($self->__isUsingPopulation){
1773
1774	2					5	@names = @{$self->__population};
	2					10
1775
1776							}else{
1777
1778							# we simply use all databaseIds from the annotationProvider
1779
1780	2					9	@names = $self->__annotationProvider->allDatabaseIds();
1781
1782							}
1783
1784							# note the population size
1785
1786	4					866	my $populationSize;
1787
1788	4	50				32	if (! defined $self->totalNumGenes){
1789
1790	0					0	$populationSize = scalar @names;
1791
1792							}else{
1793
1794	4					39	$populationSize = $self->totalNumGenes;
1795
1796							}
1797
1798							# now, if the population from which we should sample is bigger
1799							# that the number of databaseIds which we have to sample from, we
1800							# want to expand the the list of databaseIds with some fake ones,
1801							# that correspond to unnannotated genes.
1802
1803	4					12	my $numDatabaseIds = scalar @names;
1804
1805	4					23	for (my $n = $numDatabaseIds; $n < $populationSize; $n++){
1806
1807	0					0	push (@names, $kFakeIdPrefix.$n);
1808
1809							}
1810
1811	4					8512	return @names;
1812
1813							}
1814
1815							############################################################################
1816							sub __runOneSimulation{
1817							############################################################################
1818							# This method runs a single simulation of GO::TermFinder, and returns the
1819							# generated hypotheses. It requires a reference to a list of genes that
1820							# should be used to sample from, the number of genes that should be chosen,
1821							# and the size of the background distribution
1822
1823	2100			2100		4769	my ($self, $namesRef, $numGenes, $populationSize) = @_;
1824
1825							# first get a random list of genes
1826
1827	2100					9097	my $listRef = $self->__listOfRandomGenes($namesRef, $numGenes, $populationSize);
1828
1829							# now we have a list of genes, we can findTerms for them
1830
1831							# however, we have to make sure that for these guys, we attempt
1832							# no p-value correction, otherwise we will infinitely recurse,
1833							# and make sure that we don't ask to calculate the FDR
1834
1835	2100					19926	my @pvals = $self->findTerms(genes => $listRef,
1836							correction => 'none',
1837							calculateFDR => 0);
1838
1839							# now return the hypotheses
1840
1841	2100					37096	return (@pvals);
1842
1843							}
1844
1845							############################################################################
1846							sub __listOfRandomGenes{
1847							############################################################################
1848							# This private method returns a reference to an array of randomly
1849							# chosen genes from a population that was passed in by reference
1850
1851	2100			2100		5038	my ($self, $namesRef, $numGenes, $populationSize) = @_;
1852
1853							# create an array with as many indices as there are genes in the
1854							# background set of genes from which those of interest were drawn
1855
1856	2100					3209	my @indices;
1857
1858	2100					7641	for (my $i = 0; $i < $populationSize; $i++){
1859
1860	13587000					26695816	$indices[$i] = $i;
1861
1862							}
1863
1864							# now sample those indices, removing sampled elements as we go.
1865							# Use the randomly chosen index to get a random gene, and select
1866							# as many random genes as were in the test set
1867
1868	2100					4349	my @list;
1869
1870	2100					10583	for (my $i = 0; $i < $numGenes; $i++) {
1871
1872	39900					76583	my $index = int(rand(scalar(@indices))); # random number between 0 and last array index.
1873
1874	39900					155340	my $selectedIndex = splice(@indices, $index, 1); # Remove the randomly selected element from the array.
1875
1876	39900					167507	push(@list, $namesRef->[$selectedIndex]);
1877
1878							}
1879
1880	2100					352553	return \@list;
1881
1882							}
1883
1884							############################################################################
1885							sub __calculateFDR{
1886							############################################################################
1887							# This method calculates the false discovery rate for each hypothesis,
1888							# such that you know if you draw your cut-off at a particular node,
1889							# what the false discovery rate is. It does 50 simulations with
1890							# random genes, and calculates on average the percentage of nodes that
1891							# exceed a given value in the simulation, compared to the number that
1892							# exceed that p-value in the real data.
1893
1894	2			2		5	my $self = shift;
1895
1896							# when we run any simulation, any of the variables that get
1897							# modified during the findTerms method will be trampled on - thus
1898							# we have to save them away, and then restore them afterwards
1899
1900	2					12	my $variables = $self->__saveVariables();
1901
1902							# we will need access to the real hypotheses
1903
1904	2					5	my @realHypotheses = @{$self->{$kPvalues}};
	2					14
1905
1906							# now let's get the population from which we will sample genes
1907							# randomly
1908
1909	2					13	my @names = $self->__samplingPopulation;
1910
1911	2					886	my $populationSize = scalar @names;
1912
1913							# now get the number of genes in the original test set
1914							# for which terms were found.
1915
1916	2					12	my $numGenes = scalar $self->genesDatabaseIds;
1917
1918							# now we can finally run the simulations
1919
1920	2					7	my $numSimulations = 50;
1921
1922	2					12	for (my $i = 1; $i <= $numSimulations; $i++) {
1923
1924							# now run a simulation
1925
1926	100					582	my @pvals = $self->__runOneSimulation(\@names, $numGenes, $populationSize);
1927
1928							# go onto a new simulation if no hypotheses resulted (which is
1929							# theoretically possible if the randomly selected genes did
1930							# not have more than one annotation to any particular GO node)
1931
1932	100	50				570	next if !@pvals;
1933
1934							# now we look at the best pvalue for the random genes, and
1935							# determine whether it is more significant that any of the
1936							# p-values generated for the real genes. We will keep a count
1937							# of how many times we see a p-value that is better than one
1938							# calculated with the real genes, on a per simulation basis
1939
1940							# if we go through the p-values for the real nodes in reverse
1941							# order (we reversed them above), then we can quit out of the
1942							# loop as soon as we have a p-value better than the best one
1943							# generated from the random genes
1944
1945	100					360	foreach my $realHypothesis (@realHypotheses){
1946
1947							# count the number of nodes that this simulation has
1948							# generated a P_VALUE that is better than the P_VALUE for
1949							# the currently considered hypothesis.
1950
1951	3700					4471	foreach my $pval (@pvals){
1952
1953							# finish considering this real hypothesis as soon as
1954							# we see a pvalue that is worse from the simulated
1955							# data
1956
1957	18024	100				40749	last if ($pval->{PVALUE} > $realHypothesis->{PVALUE});
1958
1959							# if we get here, our simulated pvalue must exceed the
1960							# pvalue associated with the real hypothesis
1961
1962	14324					19219	$realHypothesis->{FDR_OBSERVATIONS}++;
1963
1964							}
1965
1966							}
1967
1968							}
1969
1970							# now we've run all the simulations, and counted for each real
1971							# hypothesis how many hypotheses from the simulations were better,
1972							# we calculate on average how many were better per simulation,
1973							# then divide by the number of hypotheses as good or better in our
1974							# real data. We threshold this at a maximum of 1, as we can't
1975							# have a FDR of greater than 100%
1976
1977	2					11	foreach (my $i = 0; $i < @realHypotheses; $i++){
1978
1979	74	100				153	if (exists $realHypotheses[$i]->{FDR_OBSERVATIONS}){
1980
1981							# the rate is the average number in the simulations that
1982							# are better than this pvalue, divided by the number that
1983							# are better in the real data
1984
1985	32					47	$realHypotheses[$i]->{FDR_OBSERVATIONS} /= $numSimulations;
1986
1987	32					88	$realHypotheses[$i]->{FDR_RATE} = $realHypotheses[$i]->{FDR_OBSERVATIONS} / ($i + 1);
1988
1989	32	100				75	if ($realHypotheses[$i]->{FDR_RATE} > 1){
1990
1991	6					10	$realHypotheses[$i]->{FDR_RATE} = 1;
1992
1993							}
1994
1995							}else{
1996
1997							# a pvalue better than this wasn't observed in any
1998							# simulation - so the FDR should be 0
1999
2000	42					62	$realHypotheses[$i]->{FDR_RATE} = 0;
2001
2002							# and say that we never saw it
2003
2004	42					86	$realHypotheses[$i]->{FDR_OBSERVATIONS} = 0;
2005
2006							}
2007
2008							# now based on the FDR, and the number of hypotheses that would
2009							# be chosen at this point, we can calculate the expected number of
2010							# false positives, as the FDR x the number of hypotheses
2011
2012	74					208	$realHypotheses[$i]->{EXPECTED_FALSE_POSITIVES} = $realHypotheses[$i]->{FDR_RATE} * ($i+1);
2013
2014							}
2015
2016							# now restore the variables
2017
2018	2					13	$self->__restoreVariables($variables);
2019
2020							# finally we want to replace our real hypotheses with our local
2021							# copy, as we've made some changes
2022
2023	2					2528	$self->{$kPvalues} = \@realHypotheses;
2024
2025							}
2026
2027							############################################################################
2028							sub __addAnnotationsToPValues{
2029							############################################################################
2030							# This method looks through the annotated nodes, and adds in information
2031							# about which genes are annotated to them, so that the client can retrieve
2032							# that information.
2033
2034	2115			2115		5089	my $self = shift;
2035
2036							# to do this, we can take advantage of the fact that all the
2037							# nodes should have all their databaseIds cached, and we can
2038							# retrieve them through the __allGOIDsForDatabaseId() method
2039
2040							# first go through the annotated nodes, and simply hash the goid to the
2041							# entry in the pValues array
2042
2043	2115					3301	my %nodeToIndex;
2044
2045	2115					5593	for (my $i = 0; $i < @{$self->{$kPvalues}}; $i++){
	107252					288379
2046
2047	105137					324413	$nodeToIndex{$self->{$kPvalues}->[$i]->{NODE}->goid} = $i;
2048
2049							}
2050
2051							# now go through each databaseId, and add the information in
2052
2053	2115					8182	foreach my $databaseId ($self->genesDatabaseIds) {
2054
2055							# look at all goids for this database id
2056
2057	46624					56075	foreach my $goid (@{$self->__allGOIDsForDatabaseId($databaseId)}){
	46624					96271
2058
2059	844729	100				1736931	next if (! exists $nodeToIndex{$goid}); # this node wasn't a hypothesis
2060
2061							# if this goid was a hypothesis, we can annotate the
2062							# corresponding hypothesis with the gene
2063
2064	554230					971226	$self->{$kPvalues}->[$nodeToIndex{$goid}]->{ANNOTATED_GENES}->{$databaseId} = $self->__origNameForDatabaseId($databaseId);
2065
2066							}
2067
2068							}
2069
2070							}
2071
2072							############################################################################
2073							sub __annotationProvider{
2074							############################################################################
2075							# This private method returns the annotationProvider that was used
2076							# during construction.
2077
2078	125691			125691		493338	return $_[0]->{$kArgs}{annotationProvider};
2079
2080							}
2081
2082							############################################################################
2083							sub __ontologyProvider{
2084							############################################################################
2085							# This private methid returns the ontologyProvider that was used
2086							# during construction.
2087
2088	289586			289586		1211252	return $_[0]->{$kArgs}{ontologyProvider};
2089
2090							}
2091
2092							############################################################################
2093							sub aspect{
2094							############################################################################
2095							=pod
2096
2097							=head2 aspect
2098
2099							Returns the aspect with the the GO::TermFinder object was constructed.
2100
2101							Usage:
2102
2103							my $aspect = $termFinder->aspect;
2104
2105							=cut
2106
2107	19429			19429	1	91971	return $_[0]->{$kArgs}{aspect};
2108
2109							}
2110
2111							1; # to make perl happy
2112
2113
2114							__END__