File Coverage

blib/lib/Bio/SDRS.pm
Criterion Covered Total %
statement 373 430 86.7
branch 92 148 62.1
condition 29 51 56.8
subroutine 32 34 94.1
pod 10 10 100.0
total 536 673 79.6


line stmt bran cond sub pod time code
1             package Bio::SDRS;
2              
3 9     9   138438 use 5.008;
  9         27  
  9         360  
4 9     9   63 use strict;
  9         9  
  9         414  
5 9     9   45 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION $AUTOLOAD);
  9         36  
  9         738  
6 9     9   27 use warnings;
  9         18  
  9         261  
7 9     9   36 use Carp qw(cluck croak carp);
  9         0  
  9         657  
8 9     9   4176 use POSIX;
  9         43002  
  9         54  
9 9     9   31158 use Math::NumberCruncher;
  9         666387  
  9         594  
10 9     9   6030 use Statistics::Distributions;
  9         25641  
  9         11214  
11              
12             require Exporter;
13              
14             @ISA = qw(Exporter);
15              
16             # Items to export into callers namespace by default. Note: do not export
17             # names by default without a very good reason. Use EXPORT_OK instead.
18             # Do not simply export all your public functions/methods/constants.
19              
20             # This allows declaration
21             # use Bio::SDRS ':all';
22             # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
23             # will save memory.
24             our %EXPORT_TAGS = ( 'all' => [ qw() ] );
25              
26             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
27              
28             our @EXPORT = qw();
29              
30             our $AUTOLOAD;
31              
32             my %component = map { ($_ => 1) } ('MULTIPLE', 'LDOSE', 'HDOSE', 'STEP',
33             'MAXPROC', 'TRIM', 'SIGNIFICANCE',
34             'TMPDIR', 'DEBUG');
35              
36             our $VERSION = '0.11';
37              
38             1;
39            
40             =head1 NAME
41              
42             Bio::SDRS - Perl extension for Sigmoidal Dose Response Search, a tool
43             for characterizing biological responses to compounds.
44              
45             =head1 SYNOPSIS
46              
47             use Bio::SDRS;
48             my $sdrs = new Bio::SDRS;
49              
50             $sdrs->doses(0.423377, 1.270132, 3.810395, 11.431184, 34.293553,
51             102.880658, 308.641975, 925.925926, 2777.777778, 8333.333333,
52             25000);
53             $sdrs->set_assay('C8-BMS-208882',
54             1.885, 1.82, 2.2, 2.205, 2.78,
55             4.965, 9.21, 31.275, 74.445, 99.03,
56             100);
57             $sdrs->calculate;
58             foreach my $assay ($sdrs->assays) {
59             print "$assay\n";
60             foreach my $prop (('MAX', 'MIN', 'LOW', 'HIGH', 'EC50',
61             'PVALUE', 'EC50RANGE', 'PEAK', 'A', 'B',
62             'D', 'FOLD')) {
63             printf " %s = %s\n", $prop, $sdrs->ec50data($assay, $prop);
64             }
65             print "\n";
66             }
67              
68             =head2 Constructor
69              
70             $obj = new Bio::SDRS;
71              
72             # You can also use $obj = Bio::SDRS->new();
73              
74             =head2 Object Methods
75              
76             =head3 Input Methods
77              
78             $sdrs->multiple([$new_multiple]);
79             $sdrs->ldose([$new_ldose]);
80             $sdrs->hdose([$new_hdose]);
81             $sdrs->step([$new_step]);
82             $sdrs->maxproc([$new_maxproc]);
83             $sdrs->trim([$new_trim]);
84             $sdrs->significance([$new_significance]);
85             $sdrs->tmpdir([$new_tmpdir]);
86             $sdrs->debug([$new_debug]);
87             $sdrs->doses(doses...);
88             $sdrs->set_assay(assay, {response}...)
89              
90             =head3 Output Methods
91              
92             $sdrs->assays;
93             $sdrs->scandata;
94             $sdrs->score_doses;
95             $sdrs->sorted_assays_by_dose([$dose]);
96             $sdrs->pvalues_by_dose([$dose])
97             $sdrs->ec50data([$assay[, $property[, $precision]]]);
98              
99             =head3 Other Methods
100              
101             $sdrs->calculate;
102              
103             =head1 DESCRIPTION
104              
105             Bio::SDRS implements the Sigmoidal Dose Response Search of assay responses
106             described in the paper by
107              
108             Rui-Ru Ji, Nathan O. Siemers, Lei Ming, Liang Schweizer, and Robert E
109             Bruccoleri.
110              
111             The module is implemented using a simple object oriented paradigm
112             where the object stores all the information needed for a calculation
113             along with a state variable, C. The state variable has three
114             possible values, C<'setup'>, C<'calculating'> and C<'calculated'>. The
115             value of C<'setup'> indicates that the object is being setup with
116             data, and any results in the object are inconsistent with the data.
117             The value of C<'calculating'> indicates the object's computations are
118             in progress and tells the code not to delete intermediate files. This
119             object runs in parallel, and the object destruction code gets called
120             when each thread exits. Intermediate files must be protected at that
121             time. The value of C<'calculated'> indicates that the object's
122             computational results are consistent with the data, and may be
123             returned to a calling program.
124              
125             The C<'calculate'> method is used to update all the calculated values
126             from the input data. It checks the state variable first, and only does
127             the calculation if the state is C<'setup'>. Once the calculations are
128             complete, then the state variable is set to C<'calculated'>. Thus, the
129             C method can be called whenever a calculated value is
130             needed, and there is no performance penalty.
131              
132             The module initializes the C object with a state of
133             C<'setup'>. Any data input sets the state to C<'setup'>. Any requests
134             for calculated data, calls C<'calculate'>, which updates the state
135             variable so futures requests for calculated data return quickly.
136              
137             B This module uses parallel programming via a fork call to get
138             high performance. I
139             calling the C method, and reopen them afterwards. In
140             addition, you must ensure that any automated DESTROY methods take in
141             account their execution when the child processes terminated.>
142              
143             =head1 METHODS
144              
145             The following methods are provided:
146              
147             =over 4
148              
149             =cut
150            
151              
152             =item C
153              
154             Creates a new Bio::SDRS object.
155              
156             =cut
157              
158             sub new {
159 14     14 1 484 my $pkg;
160 14         43 my $class = shift;
161 14         39 eval {($pkg) = caller(0);};
  14         162  
162 14 50       62 if ($class ne $pkg) {
163 0         0 unshift @_, $class;
164             }
165 14         33 my $self = {};
166 14         28 bless $self;
167 14         52 $self->_init_data;
168 14         37 $self->{MULTIPLE} = 1.18;
169 14         28 $self->{LDOSE} = 0.17;
170 14         28 $self->{HDOSE} = 30000;
171 14         28 $self->{STEP} = 60;
172 14         3433 $self->{MAXPROC} = 2;
173 14         28 $self->{TRIM} = 0.0;
174 14         28 $self->{SIGNIFICANCE} = 0.05;
175 14         33 $self->{DEBUG} = 0;
176 14         106 $self->_init_tmp;
177            
178 14         33 return $self;
179             }
180              
181             sub _init_data {
182 28     28   65 my $self = shift;
183 28         140 $self->{STATE} = "setup";
184 28         93 $self->{DOSES} = [];
185 28         104 $self->{RESPONSES} = {};
186             }
187              
188             sub _init_tmp {
189 14     14   19 my $self = shift;
190 14 50       61 my $tmp = exists($ENV{TMPDIR}) ? $ENV{TMPDIR} : "/tmp";
191 14         28 $tmp .= "/sdrs";
192 14 50       61 if (exists($ENV{USER})) {
193 0         0 $tmp .= "." . $ENV{USER};
194             }
195 14         57 $tmp .= "." . $$;
196 14         33 $self->{TMPDIR} = $tmp;
197 14         37 $self->{TMP_CREATED} = 0;
198             }
199              
200             sub DESTROY {
201            
202             # There's an important multiprocessing issue to keep in mind here.
203             # When this process forks, this method will be invoked for all the subprocesses.
204             # Thus, we need to prevent the deletion of the temporary array until the
205             # calculation is completed. That is why there is a check on the STATE.
206            
207 14     14   69208518 my $self = shift;
208 14 100 33     4432 if (not $self->{DEBUG} and
      66        
209             $self->{TMP_CREATED} and
210             $self->{STATE} eq 'calculated') {
211 6         88 &_system_with_check("rm -rf " . $self->{TMPDIR},
212             $self->{DEBUG});
213             }
214             }
215              
216             # documentation for autoloaded methods goes here.
217              
218             =item C
219              
220             Retrieves the current setting for the C value,
221             and optionally sets it. This value specifies the multiplicity factor
222             for increasing the dose in the search. It must be greater than one.
223              
224             =item C
225              
226             Retrieves the current setting for the C value,
227             and optionally sets it. This value specifies the lowest dose in the search.
228             It must be greater than zero.
229              
230             =item C
231              
232             Retrieves the current setting for the C value,
233             and optionally sets it. This value specifies the highest
234             dose in the search. It must be greater than the ldose value.
235              
236             =item C
237              
238             Retrieves the current setting for the C value, and optionally
239             sets it. This value specifies the maximum change in doses in the
240             search. In the search process, this module starts at the ldose
241             value. It tries multiplying the current dose by the C value,
242             but it will only increase the dose by no more than the C value
243             specified here. It must be positive.
244              
245             =item C
246              
247             Retrieves the current setting for the C value,
248             and optionally sets it. This value specifies the maximum number of processes that
249             can be used for the search. The upper limit is 64 and the lower limit is 1.
250              
251             =item C
252              
253             Retrieves the current setting for the C value, and optionally
254             sets it. This value specifies the trimming factor for the number of
255             assays. If the C is 0, then all assays will be used, and if 1,
256             no assays will be used. It must be between 0 and 1.
257              
258             =item C
259              
260             Retrieves the current setting for the C value, and
261             optionally sets it. This value specifies the minimum permitted
262             significance value for the F score cutoff. It must be between zero and
263             1.
264              
265             =item C
266              
267             Retrieves the current setting for the C value, and optionally
268             sets it. This value specifies the temporary directory where scans of
269             the dose calculation are written. The default is
270             /tmp/sdrs.C.C.
271              
272             =item C
273              
274             Retrieves the current setting for the C variable, and
275             optionally sets it. This value specifies whether debugging information
276             is printed and if the temporary directory (see above) is deleted after
277             execution of this module.
278              
279             =cut
280              
281             sub AUTOLOAD {
282 112     112   9063 my $self = shift;
283 112         190 my $type = ref($self);
284 112         122 my $pkg = __PACKAGE__;
285 112         84 my ($pos, $new_val);
286              
287 112 50       251 croak "$self is not an object\n" if not $type;
288 112 50       868 croak "$pkg AUTOLOAD function fails on $type\n"
289             if $type !~ m/^$pkg$/;
290 112         239 my $name = uc($AUTOLOAD);
291 112         1262 $name =~ s/^.*:://;
292 112 50       617 if (not exists($component{$name})) {
    50          
    50          
293 0         0 croak "$name is not a valid method for $type.\n";
294             }
295             elsif (not exists($self->{$name})) {
296 0         0 croak "$name is not a valid method for $type\n";
297             }
298             elsif (defined $_[0]) {
299 112         151 $self->{$name} = shift;
300 112         193 $self->{STATE} = "setup";
301             }
302 112 50 66     366 if ($name eq "MULTIPLE" and
303             $self->{MULTIPLE} <= 1.0) {
304 0         0 croak "multiple parameter must be greater than 1.\n";
305             }
306 112 50 66     346 if ($name eq "LDOSE" and
307             $self->{LDOSE} <= 0.0) {
308 0         0 croak "ldose parameter must be positive.\n";
309             }
310 112 50 66     311 if ($name eq "HDOSE" and
311             $self->{HDOSE} <= $self->{LDOSE}) {
312 0         0 croak "hdose parameter must be greater than the ldose parameter.\n";
313             }
314 112 50 66     370 if ($name eq "STEP" and
315             $self->{STEP} <= 0.0) {
316 0         0 croak "step parameter must be positive.\n";
317             }
318 112 50 33     306 if ($name eq "MAXPROC" and
      66        
319             ($self->{MAXPROC} > 64 or $self->{MAXPROC} < 1)) {
320 0         0 croak "maxproc parameter must be between 1 and 64.\n";
321             }
322 112 50 33     483 if ($name eq "TRIM" and
      66        
323             ($self->{TRIM} < 0.0 or
324             $self->{TRIM} > 1.0)) {
325 0         0 croak "trim parameter must be between 0 and 1, inclusive\n";
326             }
327 112 50 33     327 if ($name eq "SIGNIFICANCE" and
      66        
328             ($self->{SIGNIFICANCE} <= 0.0 or
329             $self->{SIGNIFICANCE} >= 1.0)) {
330 0         0 croak "trim parameter must be between 0 and 1, exclusive\n";
331             }
332 112         684 return $self->{$name};
333             }
334              
335             =item C
336              
337             Specify the list of compound doses used in the all the assays in the
338             experiment. The doses must be in numerical order, from smallest to
339             largest.
340              
341             =cut
342              
343             sub doses {
344 14     14 1 37 my $self = shift;
345 14 50       75 if (scalar(@_) == 0) {
346 0         0 return @{$self->{DOSES}};
  0         0  
347             }
348             else {
349 14         79 $self->_init_data;
350 14         465 my $dose1 = shift;
351 14         32 push(@{$self->{DOSES}}, $dose1);
  14         128  
352 14         70 foreach my $dose (@_) {
353 125 50       462 if ($dose < $dose1) {
354 0         0 croak "Doses not in order. Current dose is $dose, and previous does is $dose1\n";
355             }
356 125         102 push(@{$self->{DOSES}}, $dose);
  125         274  
357 125         158 $dose1 = $dose;
358             }
359 14         71 $self->{STATE} = "setup";
360             }
361             }
362              
363             =item C
364              
365             Adds an assay to the list to be searched over. Each response in the
366             list corresponds to the doses specified in the C method. The
367             number must match, and they must be in numerical order, from smallest
368             to largest.
369              
370             =cut
371              
372             sub set_assay {
373 275     275 1 1589 my $self = shift;
374 275         244 my $assay = shift;
375              
376 275         267 $self->{STATE} = "setup";
377 275 50       438 if (exists($self->{RESPONSES}->{$assay})) {
378 0         0 croak "Duplicate assay ($assay) response specified.\n";
379             }
380 275         979 $self->{RESPONSES}->{$assay} = [ @_ ];
381 275 50       199 if (scalar(@{$self->{DOSES}}) == 0) {
  275         429  
382 0         0 croak "You must call the doses method before calling the set_assay method.\n";
383             }
384 275         190 my $nresp = scalar(@{$self->{RESPONSES}->{$assay}});
  275         307  
385 275         185 my $ndoses = scalar(@{$self->{DOSES}});
  275         235  
386 275 50       905 if ($nresp != $ndoses) {
387 0         0 croak "The number of assay responses ($nresp) does not equal the number of doses ($ndoses)\n";
388             }
389             }
390              
391             =item C
392              
393             Return the list of assay names;
394              
395             =cut
396            
397             sub assays {
398 6     6 1 813 my $self = shift;
399 6         12 return keys %{$self->{RESPONSES}};
  6         74  
400             }
401              
402             =item C
403              
404             Perform the SDRS calculation.
405              
406             =cut
407              
408             sub calculate {
409 14     14 1 312 my $self = shift;
410 14 50       51 return if $self->{STATE} eq 'calculated';
411 14         66 $self->_make_tmp;
412 14         89 $self->{STATE} = "calculating";
413             #
414             # Calculate F Test degrees of freedom.
415             #
416 14         60 my $ndoses = scalar(@{$self->{DOSES}});
  14         79  
417 14         37 my $p = 4; #number of parameters
418 14         93 $self->{FDISTR_N} = $p - 1;
419 14         222 $self->{FDISTR_M} = $ndoses - $p;
420 14         264 $self->{CUTOFF} = Statistics::Distributions::fdistr($self->{FDISTR_N},
421             $self->{FDISTR_M},
422             $self->{SIGNIFICANCE});
423              
424 14         5813 my $count = scalar(keys %{$self->{RESPONSES}});
  14         85  
425 14         37 foreach my $assay (keys %{$self->{RESPONSES}}) {
  14         174  
426 275         586 my @data = @{$self->{RESPONSES}->{$assay}};
  275         2342  
427 275         2156 my ($max, $min) = Math::NumberCruncher::Range(\@data);
428 275         14121 $self->{MAX}->{$assay} = $max;
429 275         835 $self->{MIN}->{$assay} = $min;
430 275         1496 my $value = _compute_sst(\@data); #total sum of squares
431 275         1416 $self->_define_ab_boundary($assay, \@data);
432 275         1505 $self->{SST}->{$assay} = $value;
433 275         1451 $self->{VALUES}->{$assay} = \@data;
434             }
435 14         110 my $assayNum = int($count * (1 - $self->{TRIM}));
436 14         28 my %expressedAssays;
437 14         23 $count = 0;
438 14         37 foreach my $g (sort {$self->{MAX}->{$b} <=> $self->{MAX}->{$a}} (keys %{$self->{MAX}})) {
  1008         1071  
  14         264  
439 275 50       348 last if ($count >= $assayNum);
440 275         299 $expressedAssays{$g} = 1;
441 275         226 $count++;
442             }
443              
444 14         46 my (%pvalues, %sortedprobes, $pid, @pids, $file, $iproc, $err);
445 14         37 my $tmpdir = $self->{TMPDIR};
446 14         65 for ($iproc = 0; $iproc < $self->{MAXPROC}; $iproc++) {
447 44 100       47934 if ($pid = fork) {
    50          
448 36 50       478 print STDERR "Process $pid forked.\n" if $self->{DEBUG};
449 36         1058 push @pids, $pid;
450             }
451             elsif (defined $pid) {
452 8         12001256 sleep $iproc;
453 8         591 $file = "$tmpdir/sdrs.out.$iproc";
454 8 50       2655 open (ECOUT, ">$file") || die "can not open EC50 output file: $!\n";
455             #
456             # The following statement causes the system to write each output line
457             # to disk immediately, rather than waiting for a buffer to fill.
458             #
459 9     9   5868 select((select(ECOUT), $| = 1)[$[]); # Change buffering to line by line.
  9         3645  
  9         24975  
  8         945  
460 8         143 my $last_dose = $self->{LDOSE};
461 8         84 my $icount = 0;
462 8         401 for (my $dose = $self->{LDOSE};
463             $dose < $self->{HDOSE} * $self->{MULTIPLE};
464             $dose *= $self->{MULTIPLE}) {
465 10984 100       21057 if ($dose - $last_dose > $self->{STEP}) {
466 9840         10146 $dose = $last_dose + $self->{STEP};
467             }
468 10984 100       23596 if (($icount++ % $self->{MAXPROC}) == $iproc) {
469 2746         13282 print ECOUT $self->_scan_dose_point($dose);
470             }
471 10984         36413 $last_dose = $dose;
472             }
473 8         401 close ECOUT;
474 8         867 exit 0;
475             }
476             else {
477 0         0 die "Can't fork: $!\n";
478             }
479             }
480 6         166 $err = 0;
481 6         72 foreach $pid (@pids) {
482 24         286628847 waitpid($pid, 0);
483 24 50       495 if ($? != 0) {
484 0         0 print STDERR "Process $pid return status was $?\n";
485 0         0 $err = 1;
486             }
487             }
488            
489             # Now collect all the outputs together.
490 6         34 %{$self->{GSCORE}} = ();
  6         69  
491 6         17 %{$self->{GPARAM}} = ();
  6         34  
492 6         6 %{$self->{FSCORES}} = ();
  6         25  
493 6         36 my $multiple = $self->{MULTIPLE};
494 6         25 my $step = $self->{STEP};
495 6         12 @{$self->{SCANDATA}} = ();
  6         26  
496            
497 6         31 for ($iproc = 0; $iproc < $self->{MAXPROC}; $iproc++) {
498 24         118 my $infile = "$tmpdir/sdrs.out.$iproc";
499 24 50       1493 open (IN, "<$infile") || die "can not open input file $infile: $!\n";
500 24         574 while () {
501 207323         146337 push (@{$self->{SCANDATA}}, $_);
  207323         295781  
502 207323         187921 chomp;
503 207323         530684 my ($assay, $dose, $fscore, $a, $b, $d) = split (/\t/, $_);
504 207323         420726 $self->{GSCORE}->{$assay}->{$dose} = $fscore;
505 207323         433276 $self->{GPARAM}->{$assay}->{$dose} = "$a:$b:$d";
506 207323 50       326099 if (defined $expressedAssays{$assay}) {
507 207323         593001 $self->{FSCORES}->{$dose}->{$assay} = $fscore;
508             }
509             }
510 24         817 close IN;
511             }
512            
513             #sort probesets based on F scores for every selected doses, output P values for FDR calculation
514 6         12 %{$self->{SORTED_DATA}} = ();
  6         35  
515 6         67 %{$self->{PVAL_DATA}} = ();
  6         13  
516 6         11 foreach my $dose (keys %{$self->{FSCORES}}) {
  6         3358  
517 8238         8268 my %fs = %{$self->{FSCORES}->{$dose}};
  8238         221508  
518 8238 50       67764 foreach my $assay (sort {$fs{$b} <=> $fs{$a} || $a cmp $b } (keys %fs)) {
  745540         1261965  
519 207323         164415 push (@{$self->{SORTED_DATA}->{$dose}}, $assay);
  207323         406669  
520 207323         188970 my $pvalue;
521 207323 50       377916 if ($fs{$assay} < 0) {
522 0         0 $pvalue = 1.0;
523             }
524             else {
525 207323         499208 $pvalue = Statistics::Distributions::fprob($self->{FDISTR_N},
526             $self->{FDISTR_M},
527             $fs{$assay});
528             }
529 207323         9396046 push (@{$self->{PVAL_DATA}->{$dose}}, $pvalue);
  207323         745905  
530             }
531             }
532            
533             #calculates EC50 for every probeset
534 6         1859 %{$self->{EC50DATA}} = ();
  6         31  
535 6         17 foreach my $assay (keys %{$self->{GPARAM}}) {
  6         75  
536 151         367 my (%fscore, %param);
537 151         201 %fscore = %{$self->{GSCORE}->{$assay}};
  151         221885  
538 151         14542 %param = %{$self->{GPARAM}->{$assay}};
  151         150879  
539              
540 151         29908 my @sorted_list = sort {$a<=>$b} (values %fscore);
  1884222         1515218  
541 151         534 my $max = pop @sorted_list;
542 151         402 my $min = shift @sorted_list;
543 151         71745 @sorted_list = sort {$fscore{$a}<=>$fscore{$b}} (keys %fscore);
  1884222         1811816  
544 151         20461 my $ec50 = pop @sorted_list; #highest f score
545 151         292 my ($range, $high, $low, $peak, $ec50range, $fold);
546 151 100       1139 if ($max >= $self->{CUTOFF}) {
547 111         950 $range = $self->_find_ec50_range(\%fscore, $ec50);
548 111         819 ($high, $low, $peak) = split (/\-/, $range);
549 111         832 $ec50range = $high - $low;
550             } else {
551 40         195 $high = $low = $peak = $ec50range = '';
552             }
553 151         196 my $pvalue;
554 151 50       493 if ($max <= 0.0) {
555 0         0 $pvalue = 1.0;
556             }
557             else {
558 151         1454 $pvalue = Statistics::Distributions::fprob($self->{FDISTR_N},
559             $self->{FDISTR_M},
560             $max);
561             }
562 151         18415 my ($a, $b, $d) = split (/\:/, $param{$ec50});
563 151 100       688 if ($d >= 0) {
564 11 100       52 if ($a != 0) {
565 10         30 $fold = -($b/$a);
566             } else {
567 1         9 $fold = -99999;
568             }
569             } else {
570 140 50       450 if ($a != 0) {
571 140         295 $fold = $b/$a;
572             } else {
573 0         0 $fold = 99999;
574             }
575             }
576 151         123762 $self->{EC50DATA}->{$assay} = { MAX => $max,
577             MIN => $min,
578             LOW => $low,
579             HIGH => $high,
580             EC50 => $ec50,
581             PVALUE => $pvalue,
582             EC50RANGE => $ec50range,
583             PEAK => $peak,
584             A => $a,
585             B => $b,
586             D => $d,
587             FOLD => $fold };
588             }
589 6         132 $self->{STATE} = "calculated";
590             }
591              
592             =item C
593              
594             Return the complete list of scan data used in the SDRS calculation. This is just an array of lines containing the values of the EC50 calculations.
595              
596             =cut
597              
598             sub scandata {
599 6     6 1 3783 my $self = shift;
600 6         17 return @{$self->{SCANDATA}};
  6         99035  
601             }
602              
603             =item C
604              
605             Return the list of doses used in the SDRS calculation that can be
606             used as arguments for assays and pvalues.
607              
608             =cut
609              
610             sub score_doses {
611 5     5 1 27755 my $self = shift;
612 5         25 return keys %{$self->{FSCORES}};
  5         2865  
613             }
614              
615             =item C
616              
617             Return a list of assays for each dose in $dose sorted by F-score.
618              
619             =cut
620            
621             sub sorted_assays_by_dose {
622 6865     6865 1 52065 my $self = shift;
623 6865         6200 my $dose = shift;
624 6865 50       12225 if ($self->{STATE} ne 'calculated') {
625 0         0 $self->calculate;
626             }
627              
628 6865 50       8680 if (not defined $dose) {
629 0         0 return %{$self->{SORTED_DATA}};
  0         0  
630             }
631             else {
632 6865 50       10125 if (not exists($self->{SORTED_DATA}->{$dose})) {
633 0         0 carp "Sorted assays by dose = $dose does not exist.\n";
634 0         0 return undef;
635             }
636             else {
637 6865         5185 return @{$self->{SORTED_DATA}->{$dose}};
  6865         66040  
638             }
639             }
640             }
641            
642              
643             =item C
644              
645             Return a list of P values for the assays returned by sorted_assays_by_dose
646             for each dose in $dose sorted by F-score.
647              
648             =cut
649            
650             sub pvalues_by_dose {
651 6865     6865 1 22765 my $self = shift;
652 6865         6415 my $dose = shift;
653 6865 50       11810 if ($self->{STATE} ne 'calculated') {
654 0         0 $self->calculate;
655             }
656              
657 6865 50       8645 if (not defined $dose) {
658 0         0 return %{$self->{PVAL_DATA}};
  0         0  
659             }
660             else {
661 6865 50       10370 if (not exists($self->{PVAL_DATA}->{$dose})) {
662 0         0 carp "P values by dose = $dose does not exist.\n";
663 0         0 return undef;
664             }
665             else {
666 6865         4915 return @{$self->{PVAL_DATA}->{$dose}};
  6865         66585  
667             }
668             }
669             }
670              
671             =item C
672              
673             Returns EC50 data for the calculation. If no arguments are specified,
674             then the internal hash for the EC50 calculation are returned. If just
675             an C<$assay> is specified, then the internal hash for all the EC50
676             values associated with that C<$assay> is returned. If an C<$assay> and
677             C<$property> are specified, then the property for that assay is
678             returned. If the C<$precision> operand is specified, then it controls
679             how many digits of precision are returned for the value.
680              
681             Here is the list of possible properties.
682              
683             MAX Maximum F score
684             MIN Minimum F score. If this property is negative, then an error
685             was encountered in the calculation of F scores. This is likely due
686             insufficient range in the responses.
687             LOW Lower bound of 95% confidence interval for the estimated EC50.
688             HIGH Upper bound of 95% confidence interval for the estimated EC50.
689             EC50 Estimated EC50.
690             PVALUE P value of the best fitting model
691             EC50RANGE range of 95% confidence interval for the estimated EC50.
692             PEAK Number of peaks in the F scores at search doses across experimental dose range.
693             A Estimated value for A in the best model.
694             B Estimated value for B in the best model.
695             D Estimated value for D in the best model.
696             FOLD Ratio of B/A or 99999.0. If A == 0. Positive if D < 0, negative otherwise.
697              
698             =cut
699              
700             sub ec50data {
701 1812     1812 1 4502 my $self = shift;
702 1812         1259 my $assay = shift;
703 1812         1214 my $property = shift;
704 1812         1108 my $precision = shift;
705              
706 1812 50       2620 $precision = 6 if not defined $precision;
707 1812 50       2604 if ($self->{STATE} ne 'calculated') {
708 0         0 $self->calculate;
709             }
710 1812 50       1929 if (not defined $assay) {
711 0         0 return %{$self->{EC50DATA}};
  0         0  
712             }
713             else {
714 1812 50       2528 if (not exists($self->{EC50DATA}->{$assay})) {
715 0         0 carp "EC50 data for $assay does not exist.\n";
716 0         0 return undef;
717             }
718 1812         1455 my $assaydata = $self->{EC50DATA}->{$assay};
719 1812 50       1744 if (not defined $property) {
720 0         0 return %{$self->{EC50DATA}->{$assay}};
  0         0  
721             }
722             else {
723 1812 50       1977 if (not exists($assaydata->{$property})) {
724 0         0 carp "EC50 data for property $property for assay $assay does not exist.\n";
725 0         0 return undef;
726             }
727             else {
728 1812         1652 my $ret = $assaydata->{$property};
729 1812 100 100     5663 if ($ret ne "" and $ret != int($ret)) {
730 1394         3794 $ret = sprintf("%.${precision}g", $ret) + 0.0;
731             }
732 1812         5371 return $ret;
733             }
734             }
735             }
736             }
737              
738             sub _make_tmp {
739 14     14   14 my $self = shift;
740 14 50 33     930 if (-d $self->{TMPDIR} and -w $self->{TMPDIR}) {
741 0         0 return;
742             }
743 14 50       123 if (-d $self->{TMPDIR}) {
744 0         0 croak "Unable to write " . $self->{TMPDIR} . "\n";
745             }
746 14 50       131 if (-w $self->{TMPDIR}) {
747 0 0       0 unlink($self->{TMPDIR}) ||
748             croak "Unable to delete " . $self->{TMPDIR} . "\n";
749             }
750 14         113 &_system_with_check("mkdir -p " . $self->{TMPDIR},
751             $self->{DEBUG});
752 14         118 $self->{TMP_CREATED} = 1;
753             }
754              
755             sub _find_ec50_range {
756 111     111   268 my $self = shift;
757 111         232 my ($scores, $ec50) = @_;
758 111         237 my @range = ();
759 111         177 my $off_flag = 1;
760 111         181 my $peak = 0;
761 111         111 my $current;
762 111         136 my $seen = 0;
763 111         196 my $boundary;
764 111         21081 foreach my $dose (sort {$a<=>$b} keys %$scores) {
  1402431         1060682  
765 152403         216193 $current = $$scores{$dose};
766 152403 100       209265 $seen = 1 if ($dose == $ec50);
767 152403 100       184691 if ($current >= $self->{CUTOFF}) {
768 99063         93671 push @range, $dose;
769 99063 100       161602 $off_flag = 0 if ($off_flag == 1);
770             } else {
771 53340 100       68950 if ($seen == 1) {
772 55         350 $boundary = _find_boundary(\@range);
773 55         105 $seen = 0;
774             }
775 53340         46980 @range = ();
776 53340 100       81230 if ($off_flag == 0) {
777 55         95 $off_flag = 1;
778 55         130 $peak++;
779             }
780             }
781             }
782 111 100       20760 $boundary = _find_boundary(\@range) if ($seen == 1);
783 111 100       448 $peak++ if ($current >= $self->{CUTOFF});
784            
785 111         5109 return "$boundary-$peak";
786             }
787            
788             sub _find_boundary {
789 111     111   259 my $range = shift;
790 111         156 my ($high, $low);
791              
792 111 50       323 if (@$range >= 2) {
793 111         202 $low = shift @$range;
794 111         273 $high = pop @$range;
795             } else {
796 0         0 $low = shift @$range;
797 0         0 $high = $low;
798             }
799 111         508 return "$high-$low";
800             }
801              
802             sub _scan_dose_point {
803 2746     2746   4492 my $self = shift;
804 2746         3594 my $dose = shift;
805 2746         6604 my $ecstring = '';
806 2746         4603 my $m = $self->{FDISTR_M};
807 2746         4236 my $n = $self->{FDISTR_N};
808 2746         3424 foreach my $assay (keys %{$self->{VALUES}}) {
  2746         17709  
809 42563         283631 my ($al, $ah, $astep) = split (/\:/, $self->{ARANGE}->{$assay});
810 42563         170063 my ($bl, $bh, $bstep) = split (/\:/, $self->{BRANGE}->{$assay});
811 42563         211820795 my ($a, $b, $d, $best_sse) = _compute_best_sse($al, $ah, $astep,
812             $bl, $bh, $bstep,
813             $dose,
814             $self->{VALUES}->{$assay},
815             $self->{DOSES});
816 42563 50 33     459636 if ($best_sse == 0 or $n == 0) {
817 0         0 carp sprintf("best_sse($best_sse) = 0 or n($n) = 0 for $assay. arange = %s brange = %s\n",
818             $self->{ARANGE}->{$assay},
819             $self->{BRANGE}->{$assay});
820 0         0 $ecstring .= "$assay\t" .
821             sprintf("%.5f", $dose) .
822             "\t-1\t-1\t-1\t-1\n";
823             }
824             else {
825 42563         221174 my $f_score = ($self->{SST}->{$assay} - $best_sse) * $m / ($best_sse * $n);
826 42563         499043 $f_score = sprintf("%.3f", $f_score);
827 42563         137600 $a = sprintf("%.3f", $a);
828 42563         128574 $b = sprintf("%.3f", $b);
829 42563         116794 $d = sprintf("%.3f", $d);
830 42563         364682 $ecstring .= "$assay\t" .
831             sprintf("%.5f", $dose) .
832             "\t$f_score\t$a\t$b\t$d\n";
833             }
834             }
835 2746         865087 return $ecstring;
836             }
837              
838             sub _define_ab_boundary {
839 275     275   460 my $self = shift;
840 275         509 my ($assay, $data) = @_;
841 275         1832 my @sorted = sort {$a<=>$b} @$data;
  6950         6678  
842 275         1160 my @sorted_copy = @sorted;
843 275         1371 my @brange = splice (@sorted, $#sorted-5, 6);
844 275         1374 $self->_find_step($assay, 'b', \@brange);
845 275         1912 my @arange = splice (@sorted_copy, 0, 6);
846 275         1442 $self->_find_step($assay, 'a', \@arange);
847             }
848              
849             sub _find_step {
850              
851 550     550   1009 my $self = shift;
852 550         1086 my ($assay, $pam, $data) = @_;
853 550         2740 my $stdev = Math::NumberCruncher::StandardDeviation($data);
854 550         3161617 my $mean = Math::NumberCruncher::Mean($data);
855 550         11127 my $cutoff = $mean / 10;
856 550 100 100     2712 $stdev = $cutoff if ($stdev eq 'NaN' || $stdev < $cutoff);
857 550         285996 my ($l, $h, $step);
858 550         2185 $step = $stdev / 2.5;
859 550         432010 $step = sprintf("%.3f", $step);
860 550 100       61433 if ($step == 0.0) {
861 5         1265 carp "Data range too small for $assay -- step size raised to 0.001.\n";
862 5         400 $step = "0.001";
863             }
864 550 100       2510 if ($pam eq 'a') {
    50          
865 275         1090 $h = $mean + 2.3*$stdev;
866 275         272314 $l = $mean - 2*$stdev;
867 275 100       224417 $l = $self->{MIN}->{$assay} if ($l <= 0);
868 275         44166 $h = sprintf("%.3f", $h);
869 275         24036 $l = sprintf("%.3f", $l);
870 275         18527 $self->{ARANGE}->{$assay} = "$l:$h:$step";
871             } elsif ($pam eq 'b') {
872 275         1167 $h = $mean + 2*$stdev;
873 275         263235 $l = $mean - 2.3*$stdev;
874 275 100       311141 $l = $self->{MIN}->{$assay} if ($l <= 0);
875 275         46294 $h = sprintf("%.3f", $h);
876 275         27362 $l = sprintf("%.3f", $l);
877 275         14486 $self->{BRANGE}->{$assay} = "$l:$h:$step";
878             }
879             }
880              
881             sub _compute_sst {
882 275     275   622 my $data = shift;
883 275         383 my $sst = 0;
884 275         983 my $mean = Math::NumberCruncher::Mean($data);
885            
886 275         6152 foreach my $data (@$data) {
887 3010         4509 $sst += ($data - $mean)**2;
888             }
889 275         658 return $sst;
890             }
891              
892             sub _system_with_check {
893              
894 20     20   46 my $command = shift;
895 20         46 my $echo = shift;
896              
897 20 50 33     151 if (defined $echo and $echo) {
898 0         0 &_dated_mesg($command);
899             }
900 20         113506 my $status = system("$command");
901 20 50       474625 if ($status != 0) {
902 0           croak "Shell command: $command returned $status error code.\n";
903             }
904             }
905              
906             sub _dated_mesg {
907              
908             # Print a dated message to STDERR
909 0     0     my $mesg = shift;
910              
911 0           my $date = &_date;
912 0 0         if (substr($mesg, -1, 1) ne "\n") {
913 0           $mesg .= "\n";
914             }
915 0           print STDERR "At $date: $mesg";
916             }
917              
918             sub _date {
919 0     0     return strftime("%a %b %d %T %Z %Y", localtime);
920             }
921              
922             =back
923              
924             =head1 SEE ALSO
925              
926             sdrs.pl
927              
928             =head1 AUTHORS
929              
930             Ruiru Ji
931             Nathan O. Siemers
932             Lei Ming
933             Liang Schweizer
934             Robert Bruccoleri
935              
936             =cut
937              
938            
939 9     9   8136 use Inline C => <<'END_OF_C_CODE', NAME => 'Bio::SDRS::FastSSE';
  9         161550  
  9         54  
940              
941             void extract_double_array(SV *arrayp,
942             double a[],
943             I32 asize,
944             I32 *limitp);
945              
946             void _compute_best_sse(double al,
947             double ah,
948             double astep,
949             double bl,
950             double bh,
951             double bstep,
952             double dose,
953             SV* valuesp,
954             SV* dosesp)
955             {
956             double a, b, d, diff, sse, tmp, y_hat, d2;
957             double a_best, b_best, d_best, sse_best;
958             SV* param_ret;
959             SV* best_sse_ret;
960             I32 count = 0;
961             I32 i;
962             I32 vlimit, dlimit;
963             #define DOSE_SIZE 100
964             double doses[DOSE_SIZE];
965             double values[DOSE_SIZE];
966             Inline_Stack_Vars;
967              
968             extract_double_array(valuesp, values, DOSE_SIZE, &vlimit);
969             extract_double_array(dosesp, doses, DOSE_SIZE, &dlimit);
970             if (vlimit != dlimit) {
971             fprintf(stderr, "Error in compute_best_sse: limit mismatch. vlimit = %d dlimit = %s\n",
972             vlimit, dlimit);
973             exit(1);
974             }
975            
976             for (a = al; a < ah; a += astep) {
977             // fprintf(stderr, "a = %14.10g\n", a);
978             for (b = bh; b > bl && a <= b; b -= bstep) {
979             // fprintf(stderr, "b = %14.10g\n", b);
980             diff = b - a;
981             for (d = -6; d < 6.3; d += 0.3) {
982             // fprintf(stderr, "d = %14.10g dlimit = %d\n", d, dlimit);
983             sse = 0;
984             for (i = 0; i <= dlimit; i++) {
985             tmp = doses[i] / dose;
986             tmp = pow(tmp, d);
987             y_hat = a + diff / (1 + tmp);
988             d2 = (y_hat - values[i]);
989             sse += d2 * d2;
990             }
991             if (++count == 1 || sse < sse_best) {
992             a_best = a;
993             b_best = b;
994             d_best = d;
995             sse_best = sse;
996             }
997             }
998             }
999             }
1000             Inline_Stack_Reset;
1001             Inline_Stack_Push(sv_2mortal(newSVnv(a_best)));
1002             Inline_Stack_Push(sv_2mortal(newSVnv(b_best)));
1003             Inline_Stack_Push(sv_2mortal(newSVnv(d_best)));
1004             Inline_Stack_Push(sv_2mortal(newSVnv(sse_best)));
1005             Inline_Stack_Done;
1006             }
1007              
1008             void extract_double_array(SV *arrayp,
1009             double a[],
1010             I32 asize,
1011             I32 *limitp)
1012             /*
1013             * Extract the elements of the Perl array, pointed to by arrayp,
1014             * into the C array a. The size of a is asize, and the code will check
1015             * for overflow. The limiting subscript for arrayp will be returned in
1016             * *limitp. */
1017              
1018             {
1019             AV* array;
1020             I32 limit, i;
1021              
1022             array = SvRV(arrayp);
1023             limit = av_len(array);
1024             if (limit+1 > asize) {
1025             fprintf(stderr,
1026             "extract_double_array: Perl array is too big. limit = %d asize = %d\n",
1027             limit, asize);
1028             exit(1);
1029             }
1030             for (i = 0; i <= limit; i++) {
1031             SV *element, **elementp;
1032             elementp = av_fetch(array, i, 0);
1033             element = *elementp;
1034             a[i] = SvNV(element);
1035             }
1036             *limitp = limit;
1037             }
1038              
1039             END_OF_C_CODE