File Coverage

blib/lib/Bio/MaxQuant/Evidence/Statistics.pm

Criterion	Covered	Total	%
statement	336	343	97.9
branch	108	132	81.8
condition	32	51	62.7
subroutine	37	39	94.8
pod	32	32	100.0
total	545	597	91.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MaxQuant::Evidence::Statistics;
2
3	2			2		116425	use 5.006;
	2					10
	2					78
4	2			2		11	use strict;
	2					5
	2					504
5	2			2		13	use warnings;
	2					34
	2					61
6
7	2			2		25849	use Text::CSV;
	2					41290
	2					71
8	2			2		76	use Carp;
	2					5
	2					174
9	2			2		2970	use Storable;
	2					7531
	2					132
10	2			2		1769	use Statistics::Distributions;
	2					6904
	2					8860
11
12							=head1 NAME
13
14							Bio::MaxQuant::Evidence::Statistics - Additional statistics on your SILAC evidence
15
16							=head1 VERSION
17
18							Version 0.01
19
20							=cut
21
22							our $VERSION = '0.01';
23
24
25							=head1 SYNOPSIS
26
27							Read/convert your evidence file to a more rapidly processable format,
28							and perform various operations and statistics across/between multiple
29							experiments. Supports multidimensional experiments with replicate
30							analyses.
31
32
33							use Bio::MaxQuant::Evidence::Statistics;
34
35							my $foo = Bio::MaxQuant::Evidence::Statistics->new();
36
37							# get the essential data from an evidence file
38							$foo->parseEssentials(filename=>$evidencePath);
39
40							# store the essentials for later
41							$foo->writeEssentials(filename=>$essentialsPath);
42
43							# laod previously stored essentials
44							$foo->readEssentials(filename=>$essentialsPath);
45
46
47							=head1 SUBROUTINES/METHODS
48
49							=head2 new
50
51							Create a new object:
52
53							my $foo = Bio::MaxQuant::Evidence::Statistics->new();
54
55							=cut
56
57							sub new {
58	8526			8526	1	244897	my $p = shift;
59	8526		66			39854	my $c = ref($p) \|\| $p;
60	8526					212771	my %defaults = (
61							separator => "\t",
62							essential_column_names => {
63							'Protein group IDs' => 1,
64							'Modified' => 1,
65							'Leading Proteins' => 1,
66							'PEP' => 1,
67							'Ratio H/L' => 1,
68							'Intensity H' => 1,
69							'Intensity L' => 1,
70							'Contaminant' => 1,
71							'Reverse' => 1,
72							},
73							list_column_names => {
74							'Modified' => 1,
75							'PEP' => 1,
76							'Ratio H/L' => 1,
77							'Intensity H' => 1,
78							'Intensity L' => 1,
79							},
80							key_column_name => 'id',
81							experiment_column_name => 'Experiment',
82							csv_options => {sep_char=>"\t"},
83							);
84	8526					61038	my %options = (%defaults, @_);
85	8526					38674	my $o = {defaults=>\%options};
86	8526					45526	bless $o, $c;
87	8526					44095	return $o;
88							}
89
90							=head2 parseEssentials(%options)
91
92							Reads the essential data from an evidence file. Evidence files
93							for large analyses can be very big and take a long time to process,
94							to we only read what's necessary, and can save this for convenience
95							and speed too, using writeEssentials().
96
97							The data are stored by Protein group IDs, i.e. one entry per protein
98							group. Other data stored here are:
99
100							=over
101
102							=item id
103
104							=item Protein group IDs
105
106							=item Modified -- is this actually the right name??
107
108							=item Leading Proteins
109
110							=item Experiment
111
112							=item PEP
113
114							=item Ratio H/L
115
116							=item Intensity H
117
118							=item Intensity L
119
120							=item Contaminant
121
122							=item Reverse
123
124							=back
125
126							The column names used for storage are defined in the default option
127							essential_column_names, and can be changed when you call new, or when you call
128							parseEssentials. The option is a hash of column names whose values
129							detmerine whether the column is kept by their truthness... e.g.
130
131							$o->parseEssentials(essential_column_names=>(
132							'id' => 1, # kept
133							'PEP' => 0, # discarded
134							#foo => ?, # discarded
135							));
136
137							If a column doesn't exist, it does not complain!
138
139							The method takes a hash of options.
140
141							options:
142
143							=over
144
145							=item filename - path of the file to process
146
147							=item separator - passed to Text::CSV (default is tab)
148
149							=item key_column_name - change the column keyed on (default is id)
150
151							=item experiment_column_name - change the column the data are split on
152
153							=item list_column_names - change the columns stored as lists
154
155							=back
156
157							=head3 list_column_names
158
159							Some columns are the same across all the evidence in a protein group,
160							eg, the id is obviously the same, Contaminant and Reverse, Protein IDs,
161							and so on. The default, therefore, is to overwrite the column with
162							the value seen in an evidence. BUT, some columns have a different value
163							in each evidence, e.g. Ratio H/L or PEP. Whatever columns are given in
164							list_column_names, which true values, will be appended as lists, so in the
165							final data, there will be one row per protein and any column bearing multiple
166							evidences for that protein will be a list.
167
168							If that makes no sense, write to me and I'll try to change it.
169
170							=cut
171
172							sub parseEssentials {
173	1			1	1	32	my $o = shift;
174	1					3	my %defaults = %{$o->{defaults}};
	1					7
175	1					9	my %options = (%defaults, @_);
176	1					109	my $io = IO::File->new($options{filename}, 'r');
177	1					10403	my $csv = Text::CSV->new($options{csv_options});
178							# head the column names, just like in the pod...
179	1					104	$csv->column_names ($csv->getline ($io));
180							# now getline_hr will give us hashrefs :-)
181							# we just need to know which to keep...
182	1					2762	my %k = %{$options{essential_column_names}};
	1					11
183	1					3	my $i = $options{key_column_name};
184	1					2	my $e = $options{experiment_column_name};
185	1					1	my %l = %{$options{list_column_names}};
	1					5
186	1					3	my %data = ();
187	1					3	my @ids = ();
188	1					2	my @sharedids = ();
189	1					2	my @uniqueids = ();
190
191	1					6	while(! eof($io)){
192	2793					14698	my $hr = $csv->getline_hr($io);
193	192717	100	66			733196	my %h = map {
194	2793					10827691	exists $k{$_} && $k{$_} # exists and true
195							? ($_=>$hr->{$_}) # key => value
196							: () # empty
197							} keys %$hr;
198	2793					20018	my $key = $hr->{$i};
199	2793					6330	my $expt = $hr->{$e};
200							# store it...
201	2793					5536	push @ids, $key; # keep track of what we've got
202							# store stuff by expt, then id, then column
203	2793	100				10747	$data{$expt} = {} unless exists $data{$expt};
204	22344	100	66			168005	$data{$expt}->{$key} = { # set up this expt/key... unless it exists
205							map {
206	2793	50				18909	exists $l{$_} && $l{$_}
207							? ($_ => []) # it's an array
208							: ($_ => '') # it's a scalar
209							} keys %h
210							} unless exists $data{$expt}->{$key};
211							# add the data...
212	2793					35190	foreach (keys %h){ # each column
213	22344	100	66			106263	if(exists $l{$_} && $l{$_}){ # is it a list column?
214	11172					15756	push @{$data{$expt}->{$key}->{$_}}, $h{$_}; # push it
	11172					57249
215							}
216							else {
217	11172					43575	$data{$expt}->{$key}->{$_} = $h{$_}; # set it
218							}
219							}
220	2793	100				24661	if($data{$expt}->{$key}->{'Protein group IDs'} =~ /;/){
221	400					19618	push @sharedids, $key;
222							}
223							else {
224	2393					62966	push @uniqueids, $key;
225							}
226							}
227	1					5	$o->{data} = \%data;
228	1					176	$o->{ids} = [sort {$a <=> $b} @ids];
	2792					5087
229	1					35	$o->{sharedids} = [sort {$a <=> $b} @sharedids];
	399					615
230	1					104	$o->{uniqueids} = [sort {$a <=> $b} @uniqueids];
	2392					4168
231	1					8937	$o->{cache} = {};
232							}
233
234							=head2 experiments
235
236							Returns a list of the experiments in the data.
237
238							=cut
239
240							sub experiments {
241	876			876	1	4189	my $o = shift;
242	876					3042	my $data = $o->{data};
243	876					10294	return keys %$data;
244							}
245
246							=head2 replicated
247
248							Returns a list of the experiment names without the replicate portion.
249
250							The names are assumed to be Cell.Condition.Replicate, i.e. full-stop (period) separated.
251
252							=cut
253
254							sub replicated {
255	5			5	1	11	my $o = shift;
256	5	100				26	return @{$o->{cache}->{replicated}} if exists $o->{cache}->{replicated};
	3					20
257	2					5	my %repl = ();
258	2					9	my @expts = $o->experiments();
259	2					12	foreach (@expts){
260	54					177	s/\.[^.]+$//;
261	54					112	$repl{$_} = 1;
262							}
263	2					16	$o->{cache}->{replicated} = [keys %repl];
264	2					12	return @{$o->{cache}->{replicated}};
	2					29
265							}
266
267							=head2 orthogonals
268
269							Returns a list of sets of orthogonal experiments, that is 3 experiments in which the first has
270							one condition in common with the other two, but they have nothing in common with each other.
271
272							e.g. A.X A.Y B.X
273
274							The rationale behind this is that quantitative differences across this set indicate mechanistic
275							links between, for example, cell line and drug treatment. If a reponse is seen to a drug, and
276							a different repsonse is seen in a different cell-type, this system will pick that up. The
277							fourth member of the comparison (in the example that would be B.Y) could be anything... and the
278							interpretation would still be that there is a differential response.
279
280							=cut
281
282							sub orthogonals {
283	9			9	1	26	my $o = shift;
284	9	100				49	return @{$o->{cache}->{orthogonals}} if exists $o->{cache}->{orthogonals};
	7					167
285	2					6	my @repls = $o->replicated();
286	2					7	my @orths = ();
287	2					5	foreach my $c1(@repls){
288	18					47	my ($p,$x) = split(/\./, $c1, 2);
289	18					31	foreach my $c2(@repls){
290	162	100				298	next if $c2 eq $c1;
291	144					312	my ($q,$y) = split(/\./, $c2, 2);
292	144	100				344	next unless $p eq $q;
293	36					56	foreach my $c3(@repls){
294	324	100	100			1218	next if $c3 eq $c1 \|\| $c3 eq $c2;
295	252					513	my ($r,$z) = split(/\./, $c3, 2);
296	252	100				631	next unless $x eq $z;
297	72					212	push @orths, "$c1 $c2 $c3";
298							}
299							}
300							}
301	2					9	$o->{cache}->{orthogonals} = \@orths;
302	2					60	return @orths;
303							}
304
305							=head2 pairs
306
307							Returns a list of pairs of replicated experiments (e.g. A.X A.Y, A.X B.X ...)
308							that represents all possible comparisons.
309
310							=cut
311
312							sub pairs {
313	9			9	1	32	my $o = shift;
314	9	100				67	return @{$o->{cache}->{pairs}} if exists $o->{cache}->{pairs};
	7					180
315	2					12	my @r = $o->replicated();
316	2					7	my @pairs = ();
317	2					17	foreach my $r1(sort @r){
318	18					68	foreach my $r2(sort @r){
319	162	100				322	next unless $r1 lt $r2;
320	72					191	push @pairs, "$r1 $r2";
321							}
322							}
323	2					9	$o->{cache}->{pairs} = \@pairs;
324	2					38	return @pairs;
325							}
326
327							=head2 ids
328
329							Returns a list of evidence ids in the data.
330
331							=cut
332
333							sub ids {
334	2			2	1	5	return @{shift()->{ids}};
	2					18
335							}
336
337							=head2 sharedIds
338
339							Returns a list containing the ids of those evidences shared between protein groups.
340
341							=cut
342
343							sub sharedIds {
344	2			2	1	826	return @{shift()->{sharedids}};
	2					13
345							}
346
347							=head2 uniqueIds
348
349							Returns a list containing the ids of those evidences unique to one protein group.
350
351							=cut
352
353							sub uniqueIds {
354	2			2	1	6	return @{shift()->{uniqueids}};
	2					14
355							}
356
357							=head2 saveEssentials(%options)
358
359							Save the essential data (quicker to read again in future)
360
361							=cut
362
363							sub saveEssentials {
364	1			1	1	811	my $o = shift;
365	1					2	my %defaults = %{$o->{defaults}};
	1					14
366	1					8	my %options = (%defaults, @_);
367							# here we want to save everything
368	1					10	store $o, $options{filename};
369							}
370
371							=head2 loadEssentials
372
373							Load up previously saved essentials
374
375							=cut
376
377							sub loadEssentials {
378	12			12	1	488851	my $o = shift;
379	12					46	my %defaults = %{$o->{defaults}};
	12					112
380	12					247	my %options = (%defaults, @_);
381	12					98	my $p = retrieve($options{filename});
382	12					517213	%$o = %$p;
383	12					257	return $o;
384							}
385
386
387							=head2 extractColumnValues
388
389							=cut
390
391							sub extractColumnValues {
392	8379			8379	1	31340	my ($o, %options) = @_;
393							# options:
394	8379					43524	my %defaults = (
395							column => 'id', # which column to collect
396							experiment => '', # only extract this expt (all if false)
397							'split' => 1, # true = split cell on ; before adding to results
398							'nodup' => 1, # true = remove duplicates
399							'emptiesok' => 0, # true = include empty values in output
400							);
401	8379					72925	%options = (%defaults, %options);
402	8379					29336	my $data = $o->{data};
403	8379	100				35181	my $results = $options{nodup} ? {} : [];
404	8379	50				65973	my @expts = $options{experiment} ? ($options{experiment}) : (keys %$data);
405	8379					21929	foreach my $e(@expts){
406	9009					15520	foreach my $k(keys %{$data->{$e}}){
	9009					143376
407	457848					1621126	my $value = $data->{$e}->{$k}->{$options{column}};
408	457848	100				1460648	if(ref($value) eq ''){
409	8379					41310	$value = [split /;/, $value];
410							}
411	457848	50				1687922	my @values = $options{'split'} ? (@$value) : (join(';',@$value));
412	457848					873372	foreach (@values){
413	459048	100	66			2430962	next unless $_ ne '' \|\| $options{'emptiesok'};
414	437416	100				1012527	if($options{nodup}){
415	51727					377677	$results->{$_} = 1;
416							}
417							else {
418	385689					1858907	push @$results, $_;
419							}
420							}
421							}
422							}
423							# use Data::Dumper;
424							# print STDERR Dumper $results;
425	8379	100				179184	return $options{nodup} ? (keys %$results) : (@$results);
426							}
427
428							=head2 proteinCount
429
430							=cut
431
432							sub proteinCount {
433	2			2	1	4689705	my $o = shift;
434	2	100				22	return $o->{cache}->{proteinCount} if exists $o->{cache}->{proteinCount};
435	1					8	my @proteins = $o->getLeadingProteins();
436	1					6	$o->{cache}->{proteinCount} = scalar @proteins;
437	1					14	return $o->{cache}->{proteinCount};
438							}
439
440							=head2 getProteinGroupIds
441
442							=cut
443
444							sub getProteinGroupIds {
445	2			2	1	8	my $o = shift;
446	2	100				16	$o->{cache}->{proteinGroupIds} = [sort $o->extractColumnValues(column=>'Protein group IDs')] unless exists $o->{cache}->{proteinGroupIds};
447	2					7	return @{$o->{cache}->{proteinGroupIds}}
	2					32
448							}
449
450							=head2 getLeadingProteins
451
452							=cut
453
454							sub getLeadingProteins {
455	4			4	1	1343	my $o = shift;
456	4	100				37	$o->{cache}->{leadingProteins} = [sort $o->extractColumnValues(column=>'Leading Proteins')] unless exists $o->{cache}->{proteinGroupIds};
457	4					13	return @{$o->{cache}->{leadingProteins}};
	4					54
458							}
459
460							=head2 logRatios
461
462							Logs ratios (base 2) throughout the dataset, and sets a flag so it can't get logged again.
463
464							Treatment of "special values": empty string, <= 0, NaN, and any other non-number are removed
465							from the data!
466
467							=cut
468
469							sub logRatios {
470	17			17	1	1120	my $o = shift;
471	17	100				120	return 0 if $o->{logged};
472	11					41	$o->{logged} = 1;
473	11					31	my $data = $o->{data};
474	11					350	foreach my $exptname(keys %$data){
475	297					1436	my $experiment = $data->{$exptname};
476	297					17597	foreach my $proteinGroupId(keys %$experiment){
477	30724					73697	my $proteinGroup = $experiment->{$proteinGroupId};
478	30724					94740	my $ratios = $proteinGroup->{'Ratio H/L'};
479	30724					53972	my @newRatios = ();
480	30724					82672	foreach (0..$#$ratios){
481	30724	100				265522	$ratios->[$_] = $ratios->[$_] =~ /^\d+\.?\d*$/
482							? log($ratios->[$_])/log(2)
483							: '';
484							}
485							}
486							}
487	11					166	return 1;
488							}
489
490							=head2 filter
491
492							returns a set of protein records based on filter parameters...
493
494							=head3 options
495
496							=over
497
498							=item experiment - regular expression to match experiment name
499
500							=item proteinGroupId - regular expression to match protein group id
501
502							=item leadingProteins - regular expression to match leading protein ids
503
504							=item notLeadingProteins - regular expression to not match leading protein ids
505
506							=back
507
508							Returns a filtered object of the same type, with relevant flags set (e.g. whether
509							data has been logged, etc).
510
511							Warning, intentionally does not perform a deep clone!
512
513							=cut
514
515							sub filter {
516	8511			8511	1	47300	my ($o,%opts) = @_;
517							# options :
518							# use Data::Dumper;
519							# print STDERR 'OPTS: ', Dumper \%opts;
520	8511					24747	my $data = $o->{data};
521	8511					29499	my $result = {};
522	8511					122390	foreach my $experiment(keys %$data){
523	229797	100	66			1723510	if(! exists $opts{experiment} \|\| $experiment =~ /$opts{experiment}/){
524	9063					34023	$result->{$experiment} = {};
525	9063					24743	my $exptdata = $data->{$experiment};
526	9063					578225	foreach my $pgid(keys %$exptdata){
527	944201	50	33			2852151	if(! exists $opts{proteinGroupId} \|\| $pgid =~ /$opts{proteinGroupId}/){
528	944201					1904095	my $pgdata = $exptdata->{$pgid};
529	944201	100	100			5914064	if(! exists $opts{leadingProteins} \|\| $pgdata->{'Leading Proteins'} =~ /$opts{leadingProteins}/){
530	466181	100	66			1402744	if(! exists $opts{notLeadingProteins} \|\| $pgdata->{'Leading Proteins'} !~ /$opts{notLeadingProteins}/){
531	463434					1914190	$result->{$experiment}->{$pgid} = $pgdata;
532							}
533							}
534							}
535							}
536							}
537							}
538							# print STDERR Dumper $result if $opts{experiment} eq qr/^LCC1.nE.r2$/;
539	8511					80978	my $p = $o->new;
540	8511					143809	%$p = %$o;
541	8511					36483	$p->{data} = $result;
542	8511					20120	$o->{lastfiltered} = $p;
543	8511					37450	return $p;
544							}
545
546							=head2 replicateMedian
547
548							options are passed to filter.
549
550							=cut
551
552							sub replicateMedian {
553	355			355	1	174821	my ($o,%opts) = @_;
554	355					1563	my $f = $o->filter(%opts);
555	355					1707	return $f->median(
556							$f->extractColumnValues(
557							column => 'Ratio H/L',
558							nodup => 0,
559							)
560							);
561							}
562
563							=head2 deviations
564
565							returns an hashref with the following keys
566
567							=over
568
569							=item n - the number of items
570
571							=item sd - the standard deviation (from the mean)
572
573							=item mad - the median absolute deviation (from the median)
574
575							=item sd_via_mad - the standard deviation estimated from the median absolute deviation
576
577							=back
578
579							=cut
580
581							sub deviations {
582	2826			2826	1	79414	my ($o,%opts) = @_;
583							# cache
584	2826	100				38227	$o->{cache}->{deviations} = {} unless exists $o->{cache}->{deviations};
585	2826					22879	my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
	16780					59712
586							# print STDERR "$cachekey\n";
587	2826	100				21707	return $o->{cache}->{deviations}->{$cachekey} if exists $o->{cache}->{deviations}->{$cachekey};
588							##
589	2825					18148	my $f = $o->filter(%opts);
590	2825					17091	my @values = $f->extractColumnValues(
591							column => 'Ratio H/L',
592							nodup => 0,
593							);
594	2825					24307	my $n = scalar @values;
595	2825	50				30591	my $d = $n > 1 ? $o->sd(@values) : '';
596	2825					12770	$d->{'values'} = \@values;
597	2825	50				22829	$d->{mad} = $n ? $o->mad(@values) : '';
598
599	2825	50				25093	$d->{sd_from_mad} = $d->{sd_via_mad} = $n ? $d->{mad} * 1.4826016694 : '';
600	2825	50				20046	$d->{usv_mad} = $n ? $d->{sd_from_mad} ** 2 : '';
601	2825	50				23226	$d->{median} = $n ? $o->median(@values) : '';
602							# I should think about caching here!!! DONE!
603	2825					28047	$o->{cache}->{deviations}->{$cachekey} = $d;
604	2825					36098	return $d;
605							}
606
607							=head2 mean
608
609							given a list of values, returns the mean
610
611							=cut
612
613							sub mean {
614	2827			2827	1	68611	my ($o,@values) = @_;
615	2827	50				19824	if(scalar(@values) < 1){ return ''; }
	0					0
616	2827					26156	return $o->sum(@values) / scalar @values;
617							}
618
619							=head2 sd (unbiased standard deviation)
620
621							given a list of values, returns a hash with keys mean and sd (standard deviation).
622
623							=cut
624
625							sub sd {
626	2827			2827	1	37100	my ($o,@values) = @_;
627	2827					6132	my $n = scalar(@values);
628	2827	50				19764	my $mean = $n ? $o->mean(@values) : '';
629	2827					12553	my $sos = $o->sum(map {($_ - $mean)**2} @values);
	330832					586274
630	2827	50				21744	if($n > 1){
631	2827					10641	$sos /= ($n-1);
632							}
633							else {
634	0					0	$sos = '';
635							}
636							return {
637	2827					37144	sd => sqrt($sos),
638							usv => $sos,
639							mean => $mean,
640							n => $n,
641							};
642							}
643
644							=head2 sum
645
646							given a list of values, returns the sum
647
648							=cut
649
650							sub sum {
651	5654			5654	1	114118	my ($o,@values) = @_;
652	5654					12944	my $t = 0;
653	5654					326071	$t += $_ foreach @values;
654	5654					58407	return $t;
655							}
656
657							=head2 mad
658
659							given a list of values, returns the median absolute deviation
660
661							=cut
662
663							sub mad {
664	2825			2825	1	43932	my ($o,@values) = @_;
665	2825	50				13610	if(scalar(@values) < 1){
666	0					0	return '';
667							}
668	2825					17105	my $median = $o->median(@values);
669	2825					9318	my @ads = map {abs ($_ - $median)} @values;
	330822					613525
670	2825					37064	return $o->median(@ads);
671							}
672
673							=head2 ttest
674
675							Given options, experiment1, experiment2 and optional filters,
676							returns a hash of statistics...
677
678							stats1 and stats2 are hashes of deviations: sd, mad, sd_via_mad, usv, n, values
679
680							ttest is hash of Welch's ttest results: t, df, p
681
682							ttest_mad is like ttest but based on median and median absolute deviateions.
683
684							The p-values are derived using Welch's Ttest and the t-distribution function from
685							Statistics::Distributions.
686
687							MAD and medians are much more robust to outliers, which are significant in peptide ratios.
688
689
690							=cut
691
692							sub ttest {
693	5202			5202	1	52181	my ($o,%opts) = @_;
694							# cache
695	5202	100				19464	if($opts{experiment1} gt $opts{experiment2}){ # sort requested expts
696	1723					5586	($opts{experiment1}, $opts{experiment2}) = ($opts{experiment2}, $opts{experiment1});
697							}
698	5202	100				24355	$o->{cache}->{ttests} = {} unless exists $o->{cache}->{ttests};
699	5202					34198	my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
	25966					90980
700	5202	100				57857	return $o->{cache}->{ttests}->{$cachekey} if exists $o->{cache}->{ttests}->{$cachekey};
701							##
702	1402					5744	$opts{experiment} = $opts{experiment1};
703	1402					8534	my $d1 = $o->deviations(%opts);
704	1402					12589	$opts{experiment} = $opts{experiment2};
705	1402					11812	my $d2 = $o->deviations(%opts);
706	1402					15876	my $tt = $o->welchs_ttest(
707							mean1 => $d1->{mean},
708							mean2 => $d2->{mean},
709							usv1 => $d1->{usv},
710							usv2 => $d2->{usv},
711							n1 => $d1->{n},
712							n2 => $d2->{n},
713							);
714	1402	50	33			27797	$tt->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tt->{df}), $tt->{t}) : '';
715	1402					371766	my $tm = $o->welchs_ttest(
716							mean1 => $d1->{median},
717							mean2 => $d2->{median},
718							usv1 => $d1->{usv_mad},
719							usv2 => $d2->{usv_mad},
720							n1 => $d1->{n},
721							n2 => $d2->{n},
722							);
723	1402	50	33			18817	$tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
724
725	1402					307460	my $r = {
726							stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm
727							};
728	1402					12334	$o->{cache}->{ttests}->{$cachekey} = $r;
729	1402					13433	return $r;
730							}
731
732							=head2 welchs_ttest
733
734							performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.
735
736							e.g.
737
738							$o->welchs_ttest( mean1 => 4, mean2 => 3, # sample mean
739							usv1 => 1, usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
740							n1 => 4, n2=> 7 # number of observations
741
742							also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)
743
744							Returns hashref containing t and df.
745
746							=cut
747
748							sub welchs_ttest {
749	2805			2805	1	27807	my ($o, %t) = @_;
750	2805					9249	my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
	16830					37434
751	2805					9893	my ($vn1,$vn2) = ($v1/$n1, $v2/$n2);
752	2805					8904	my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
753	2805					12687	my $df = ($vn1 + $vn2)2 / ( $vn12/($n1-1) + $vn2**2/($n2-1) );
754	2805					21831	return {t => $t, df => $df};
755							}
756
757							=head2 replicateMedianSubtractions
758
759							Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.
760
761							=cut
762
763							sub replicateMedianSubtractions {
764	5			5	1	369	my ($o, %opts) = @_; # can set filter here
765	5					28	$o->logRatios();
766	5					34	foreach my $replicate($o->experiments()){
767	135					941	my $median = $o->replicateMedian(%opts, experiment=>$replicate);
768	135					868	my $p = $o->filter(experiment=>$replicate);
769	135					280	foreach my $pgid(keys %{$p->{data}->{$replicate}}){
	135					2847
770	13965					18686	foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
	13965					69328
771	13965	100				100753	if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
772	13280					78911	$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
773							}
774							}
775							}
776							}
777							# i guess we should do something better with generating this status:
778	5					80	return 1;
779							}
780
781							=head2 median
782
783							given a list of numbers, returns the median... assumes all items are numbers!
784
785							=cut
786
787							sub median {
788	8834			8834	1	18817	my $o = shift;
789	8834					55317	my @list = sort {$a <=> $b} @_;
	5892786					7771332
790	8834					17954	my $n = scalar @list;
791	8834	100				49007	if($n % 2){ # remainder on division by two -> it's odd!
792	3940					50834	return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
793							}
794							else { # it's not odd... so it's even
795	4894					66267	return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.
796							}
797							}
798
799
800							=head2 experimentMaximumPvalue
801
802							=cut
803
804							sub experimentMaximumPvalue {
805	866			866	1	13973	my ($o,%opts) = @_;
806							# run through experiments and collect replicate names for comparisons...
807							# this should be filtered for individual proteins using the leadingProteins option.
808	866					3287	my @reps1 = ();
809	866					2720	my @reps2 = ();
810	866					4963	foreach my $rep($o->experiments){
811	23382	100				105896	if($rep =~ /^$opts{experiment1}/){
812	2598					5303	push @reps1, $rep;
813							}
814	23382	100				127597	if($rep =~ /^$opts{experiment2}/){
815	2598					8756	push @reps2, $rep;
816							}
817							}
818							# now there must be enough replicates with enough observations in each...
819	866	50				7896	$opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
820	866	50				4606	$opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
821	866					12660	my $reps1 = 0;
822	866					1966	my $reps2 = 0;
823	866					2609	foreach my $rep(@reps1){
824	2598					11161	my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
825	2598					11830	my @values = $f->extractColumnValues(column => 'Ratio H/L');
826	2598	100				23886	$reps1 ++ if scalar(@values) > $opts{minimum_observations};
827							}
828	866					3446	foreach my $rep(@reps2){
829	2598					11475	my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
830	2598					11934	my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup => 0);
831	2598	100				18547	$reps2 ++ if scalar(@values) > $opts{minimum_observations};
832							}
833	866	100	100			12666	return {p_max => -1, p_mad_max => -1} if $reps1 < $opts{minimum_replicates} \|\| $reps2 < $opts{minimum_replicates};
834
835							# compare each combination of replicates
836	518					1623	my $p_max = 0;
837	518					1734	my $p_mad_max = 0;
838	518					1947	foreach my $r1(@reps1){
839	1554					3874	foreach my $r2(@reps2){
840	4662					27775	my $tt = $o->ttest(%opts, experiment1=>$r1, experiment2=>$r2);
841	4662	100				48833	$p_max = $tt->{ttest}->{p} if $tt->{ttest}->{p} > $p_max;
842	4662	100				30063	$p_mad_max = $tt->{ttest_mad}->{p} if $tt->{ttest_mad}->{p} > $p_mad_max;
843							}
844							}
845							# compare experiments overall
846	518					3385	my $tt = $o->ttest(%opts);
847	518	100				5314	$p_max = $tt->{ttest}->{p} if $tt->{ttest}->{p} > $p_max;
848	518	50				3888	$p_mad_max = $tt->{ttest_mad}->{p} if $tt->{ttest_mad}->{p} > $p_mad_max;
849
850							# report the maxima
851	518					7798	return {p_max=>$p_max, p_mad_max=>$p_mad_max};
852							}
853
854							=head2 fullProteinComparison
855
856							Does a full comparison on a particular protein, i.e. compares all pairs of conditions, also does
857							differential response analysis. Allows limitation of analysis to proteotypic peptides.
858
859							=cut
860
861							sub fullProteinComparison {
862	8			8	1	126	my ($o, %opts) = @_;
863							# %opts should have our protein listed as "filter"
864	8					47	my @pairs = $o->pairs();
865	8					59	my @orths = $o->orthogonals();
866	8					43	my %results = ();
867	8					24	foreach my $p(@pairs){
868	288					2410	my ($e1,$e2) = split(/\s+/, $p);
869	288					2154	$results{$p} = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e2);
870							}
871	8					40	foreach my $p(@orths){
872	288					2239	my ($e1,$e2,$e3) = split(/\s+/, $p);
873	288					2014	my $r1 = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e2);
874	288					2414	my $r2 = $o->experimentMaximumPvalue(%opts, experiment1=>$e1, experiment2=>$e3);
875	288	100	100			4190	if($r1->{p_max} < 0 \|\| $r2->{p_max} < 0){
		100
876	120					2579	$r1->{p_max} = -1;
877							}
878							elsif($r2->{p_max} > $r1->{p_max}) {
879	54					174	$r1->{p_max} = $r2->{p_max};
880							}
881	288	100	100			2818	if($r1->{p_mad_max} < 0 \|\| $r2->{p_mad_max} < 0){
		100
882	120					548	$r1->{p_mad_max} = -1;
883							}
884							elsif($r2->{p_mad_max} > $r1->{p_mad_max}) {
885	52					272	$r1->{p_mad_max} = $r2->{p_mad_max};
886							}
887	288					2604	$results{$p} = $r1;
888							}
889	8					274	return \%results;
890							}
891
892							=head2 fullComparison
893
894							Does a full comparison for each protein. Returns hash of hashes.
895
896							=cut
897
898							sub fullComparison {
899	1			1	1	2005	my $o = shift;
900	1					7	my @leadingProteins = $o->getLeadingProteins();
901	1					5	my %results = ();
902	1					4	foreach my $lp(@leadingProteins){
903	7					39	$results{$lp} = $o->fullProteinComparison(filter=>$lp);
904							}
905	1					15	return \%results;
906							}
907
908							=head2 direction
909
910							given two values, returns whether the different between first and second is positive or negative
911
912							returns '+' or '-'
913
914							=cut
915
916							sub direction {
917	0	0		0	1		return $_[1] > $_[2] ? '-' : '+';
918							}
919
920							=head2 directionsDisagree
921
922							given two directions, which could be '+', '-' or '', returns true if one is '+' and the other is '-'
923
924							=cut
925
926							sub directionsDisagree {
927	0	0	0	0	1		return if $_[1] eq '-' && $_[2] eq '+';
928	0	0	0				return if $_[1] eq '+' && $_[2] eq '-';
929	0						return 1; # must be the same or one is blank.
930							}
931
932
933
934
935							=head1 AUTHOR
936
937							jimi, C<< >>
938
939							=head1 BUGS
940
941							Please report any bugs or feature requests to C, or through
942							the web interface at L. I will be notified, and then you'll
943							automatically be notified of progress on your bug as I make changes.
944
945
946
947
948							=head1 SUPPORT
949
950							You can find documentation for this module with the perldoc command.
951
952							perldoc Bio::MaxQuant::Evidence::Statistics
953
954
955							You can also look for information at:
956
957							=over 4
958
959							=item * RT: CPAN's request tracker (report bugs here)
960
961							L
962
963							=item * AnnoCPAN: Annotated CPAN documentation
964
965							L
966
967							=item * CPAN Ratings
968
969							L
970
971							=item * Search CPAN
972
973							L
974
975							=back
976
977
978							=head1 ACKNOWLEDGEMENTS
979
980
981							=head1 LICENSE AND COPYRIGHT
982
983							Copyright 2014 jimi.
984
985							This program is released under the following license: artistic2
986
987
988							=cut
989
990							1; # End of Bio::MaxQuant::Evidence::Statistics