File Coverage

blib/lib/diffReps/DiffRes.pm
Criterion Covered Total %
statement 27 288 9.3
branch 0 78 0.0
condition 0 18 0.0
subroutine 9 39 23.0
pod 0 22 0.0
total 36 445 8.0


line stmt bran cond sub pod time code
1 1     1   6810 use 5.006;
  1         3  
  1         64  
2             our $VERSION = '1.11';
3              
4             # Package to analyze the ChIP-seq differential analysis results.
5             # Contains functions to manipulate and score a cluster of diff sites.
6             # Can be used to identify chromatin modifiation hotspots.
7              
8             package diffReps::DiffRes;
9 1     1   6 use strict;
  1         1  
  1         40  
10             #use MyShortRead::MyShortRead qw(compare2);
11 1     1   3428 use Math::CDF qw(ppois);
  1         8597  
  1         152  
12 1     1   1241 use MyBioinfo::Common qw( min );
  1         4  
  1         108  
13 1     1   7 use constant INF => 1e10;
  1         1  
  1         66  
14 1     1   5 use constant WIN1 => 2e5; # 200kb.
  1         1  
  1         56  
15 1     1   5 use constant WIN2 => 1e6; # 1Mb.
  1         1  
  1         3441  
16              
17             require Exporter;
18             our @ISA = qw(Exporter);
19             our @EXPORT_OK = qw( cmpsite );
20              
21              
22             # Constructor.
23             # Lambda: defined as the avg site-to-site distance which can be used to
24             # calculate an expected segment number for a cluster.
25             # Expected segments = cluster distance / lambda
26             # P-value is evaluated as Poisson(Observed segment |Expected segment).
27             # With the same Observed / Expected ratio, larger cluster is always favored
28             # vs. smaller one.
29             sub new{
30 0     0 0   my $class = shift;
31 0           my $self = {
32             siteList => [], # diff site list.
33             # each site is a hash: (chrom, start, end, siteCenter, recLoc).
34             curPos => 0, # current position in site list.
35             glbLambda => undef, # global lambda: avg inter-distance between two sites.
36             clust => {
37             from => undef, # start position (in the list) of cluster.
38             to => undef, # end position (in the list) of cluster.
39             seg => undef, # cluster segments.
40             dist => undef, # left-to-right distance.
41             enrich => undef, # observed / expected segment.
42             pval => undef # P-value based on Poisson distribution.
43             }
44             };
45 0           bless $self, $class;
46 0           return $self;
47             }
48              
49             # Evaluate the significance of current cluster.
50             sub sigeval{
51 0     0 0   my($self) = @_;
52 0           $self->dist_seg_clust(); # calc actual distance, segments for cluster.
53             # Trivial case: the new pushed site is on another chromosome or the cluster contains a single site.
54 0 0 0       if($self->{'clust'}{'dist'} == INF or $self->{'clust'}{'dist'} == 0){
55 0           $self->{'clust'}{'seg'} = undef;
56 0           $self->{'clust'}{'enrich'} = undef;
57 0           $self->{'clust'}{'pval'} = 1.0;
58 0           return 1.0;
59             }
60 0           my($l_d1,$l_s1,$l_d2,$l_s2) = $self->calc_locL(-1); # search left.
61 0           my($r_d1,$r_s1,$r_d2,$r_s2) = $self->calc_locL(+1); # search right.
62 0           my $d1 = $l_d1 + $r_d1;
63 0           my $s1 = $l_s1 + $r_s1;
64 0           my $d2 = $l_d2 + $r_d2;
65 0           my $s2 = $l_s2 + $r_s2;
66              
67 0 0         my $L1 = $s1 > 0? $d1 / $s1 : INF; # local lambda 1.
68 0 0         my $L2 = $s2 > 0? $d2 / $s2 : INF; # local lambda 2.
69              
70 0           my $L_min = &min($L1, $L2, $self->{'glbLambda'});
71 0 0 0       if($L_min != INF and defined $self->{'clust'}{'dist'}){
72 0           my $exp_seg = $self->{'clust'}{'dist'} / $L_min;
73 0           $self->{'clust'}{'enrich'} = $self->{'clust'}{'seg'} / $exp_seg;
74 0           $self->{'clust'}{'pval'} = 1 - ppois($self->{'clust'}{'seg'}, $exp_seg);
75             }else{
76 0           $self->{'clust'}{'enrich'} = undef;
77 0           $self->{'clust'}{'pval'} = 1.0;
78             }
79              
80 0           return $self->{'clust'}{'pval'};
81             }
82              
83             # Search either direction of a differential list to calculate local lambda.
84             sub calc_locL{
85 0     0 0   my($self,$di) = @_; # di must be: -1 or 1 for left or right.
86             # Calc site distance considering direction.
87             sub site_dist_di{
88 0     0 0   my($di,$w_site,$c_site) = @_;
89 0 0         return $di < 0? &site_dist($w_site, $c_site) : &site_dist($c_site, $w_site);
90             }
91             # Calc segment number considering direction.
92             sub seg_di{
93 0     0 0   my($di,$w_loc,$c_loc) = @_;
94 0 0         return $di < 0? $c_loc - $w_loc : $w_loc - $c_loc;
95             }
96             # Move back to the last site within window size and calculate distance and segments
97             # considering direction.
98             sub last_dist_seg_di{
99 0     0 0   my($self,$di,$w_loc,$c_loc,$c_site) = @_;
100 0           my $w_loc_ = $w_loc - $di; # recover to the last site that is still on the same chromosome.
101 0           my $w_site = $self->{'siteList'}->[$w_loc_];
102 0           my $d = &site_dist_di($di, $w_site, $c_site);
103 0           my $s = &seg_di($di, $w_loc_, $c_loc);
104              
105 0           return ($d,$s);
106             }
107              
108             # Setup initial parameter values.
109 0 0         my $c_loc = $di < 0? $self->{'clust'}{'from'} : $self->{'clust'}{'to'}; # cluster edge location.
110 0           my $c_site = $self->{'siteList'}->[$c_loc]; # cluster edge site.
111 0           my($dist1,$seg1,$dist2,$seg2); # local distance and #segment 1 and 2.
112 0           $dist1 = $dist2 = $seg1 = $seg2 = 0;
113 0           my $tag1 = 0; # tag for window1.
114 0           my $nsite = @{$self->{'siteList'}}; # number of total sites.
  0            
115 0           my $w_loc;
116 0   0       for($w_loc = $c_loc + $di; $w_loc >= 0 and $w_loc < $nsite; $w_loc += $di){
117 0           my $w_site = $self->{'siteList'}->[$w_loc]; # window site.
118 0           my $d = &site_dist_di($di, $w_site, $c_site); # distance between two sites.
119 0 0 0       if(!$tag1 and $d > WIN1){ # pass window1 for the first time?
120 0           ($dist1,$seg1) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site);
121 0           $tag1 = 1; # set tag for window1.
122             }
123 0 0         if($d > WIN2){ # pass window2.
124 0           ($dist2,$seg2) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site);
125 0           last;
126             }
127             }
128 0 0 0       if($w_loc < 0 or $w_loc >= $nsite){ # exceeded the begining or end of the diff list.
129 0           my($d,$s) = $self->last_dist_seg_di($di, $w_loc, $c_loc, $c_site);
130 0 0         if(!$tag1){
131 0           ($dist1,$seg1) = ($d, $s);
132 0           $tag1 = 1;
133             }
134 0           ($dist2,$seg2) = ($d, $s);
135             }
136              
137 0           return ($dist1, $seg1, $dist2, $seg2);
138             }
139              
140             # Read genomic coordinates of chromatin modification sites from a list
141             # of files. Record the line number for each diff site.
142             sub read_modsite{
143 0     0 0   my($self,$rfile,$info) = @_;
144             # Figure out which columns are used in output.
145             # Format eg: 1,2,3-6,7-8,9
146 0           my @icol = (); # columns.
147 0 0         if($info ne '0'){
148 0           my @iseg = split /,/, $info; # segments.
149 0           foreach my $s(@iseg){
150 0           my($sta,$end) = split /-/, $s;
151 0 0         $end = $sta unless defined $end;
152 0           for my $c($sta..$end){
153 0           push @icol, $c;
154             }
155             }
156             }
157             # Start extracting sites from inputs.
158 0           $self->{siteList} = []; # deplete site list first.
159 0           foreach my $f(@{$rfile}){
  0            
160 0 0         open DIFF, '<', $f or die "Open diff site file error: $!\n";
161 0           my $ln = 0;
162 0           my $site;
163 0           while(){ # Get rid of header lines.
164 0           $ln++;
165 0 0         last if ($site = &parse_site($_, $f, $ln, \@icol));
166             }
167 0 0         push @{$self->{siteList}}, $site if $site;
  0            
168 0           while(){
169 0           $ln++;
170 0 0         if($site = &parse_site($_, $f, $ln, \@icol)){
  0            
171 0           push @{$self->{siteList}}, $site;
  0            
172             }
173             else {die "Un-recognized diff site format in file $f: $ln\n";}
174             }
175 0           close DIFF;
176             }
177 0           $self->{curPos} = 0; # reset current position.
178             }
179              
180              
181             # Parse a line of diff site file and store it in hash table.
182             sub parse_site{
183 0     0 0   my($rec,$file,$ln,$rcol) = @_;
184 0           my $mark;
185             # Try to figure out the histone mark name from the input file...
186 0 0         if($file =~ /((h[0-9]+k[0-9]+((me[0-9]+)|ac))|(polII)|(pol2))/i) {$mark = $1;}
  0            
  0            
187             else {$mark = $file;} # if not matching pattern, use file name instead.
188 0 0         if($rec =~ /^(\S+)\t([0-9]+)\t([0-9]+)/){
  0            
189 0           my @cells = split /\t/, $rec;
190             # Extract output info from specified columns.
191 0           my @vinfo;
192 0           foreach my $c(@{$rcol}){
  0            
193 0 0 0       if($c > 0 and $c <= @cells){
194 0           push @vinfo, $cells[$c-1];
195             }
196             }
197 0           my $sinfo = '';
198 0 0         if(@vinfo > 0){
199 0           $sinfo = join(',', @vinfo);
200             }
201             # Return record as a hash table.
202             return {
203 0           chrom => $cells[0], # chromosome name.
204             start => $cells[1],
205             end => $cells[2],
206             siteCenter => ($cells[1] + $cells[2]) / 2, # diff site center.
207             recLoc => $file . ':(' . $sinfo . '):' . $ln, # record location in file:(info):ln format.
208             markType => $mark
209             };
210             }
211             else {return 0;}
212             }
213              
214             # Sort diff sites according to genomic coordinates.
215             sub sort_list{
216 0     0 0   my($self) = @_;
217 0 0         if(@{$self->{siteList}} > 1){
  0            
218 0           @{$self->{siteList}} = sort cmpsite @{$self->{siteList}};
  0            
  0            
219             }
220             }
221              
222             # Comparing two sites according to their centers.
223             sub cmpsite{
224 0     0 0   $a->{chrom} =~ /^(chr)?(\w+)$/;
225 0           my $chr1 = $2;
226 0           $b->{chrom} =~ /^(chr)?(\w+)$/;
227 0           my $chr2 = $2;
228            
229             # $chr1 = 100 if uc $chr1 eq 'X';
230             # $chr1 = 101 if uc $chr1 eq 'Y';
231             # $chr1 = 102 if uc $chr1 eq 'M';
232             # $chr1 = 199 if uc $chr1 eq 'Z';
233             # $chr2 = 100 if uc $chr2 eq 'X';
234             # $chr2 = 101 if uc $chr2 eq 'Y';
235             # $chr2 = 102 if uc $chr2 eq 'M';
236             # $chr2 = 199 if uc $chr2 eq 'Z';
237              
238             # return $chr1 <=> $chr2 unless $chr1 == $chr2;
239              
240 0 0         return $chr1 cmp $chr2 unless $chr1 eq $chr2;
241              
242 0           return $a->{siteCenter} <=> $b->{siteCenter};
243             }
244              
245             # Calculate global lambda. Return negative value if it cannot be determined.
246             sub calc_glbL{
247 0     0 0   my($self) = @_;
248 0 0         if(@{$self->{siteList}} < 2){
  0            
249 0           $self->{'glbLambda'} = INF;
250 0           return;
251             }
252 0           my $tot_dist = 0; # total internal distance.
253 0           my $tot_seg = 0; # total distance segments.
254 0           my $left = 0;
255             # Go through each chromosome and do:
256             # identify the left and right-most site, calculate the distance and
257             # record the number of internal segments; add them to global variables.
258             # Finally, calcualte an averaged distance.
259 0           my $l_site = $self->{siteList}->[$left];
260 0           my $r_site;
261             my $right;
262 0           for($right = 1; $right < @{$self->{siteList}}; $right++){
  0            
263 0           $r_site = $self->{siteList}->[$right];
264 0 0         if(!same_chr($l_site, $r_site)){
265 0           $r_site = $self->{siteList}->[$right-1]; # the last site on same chromosome.
266 0           my $d = site_dist($l_site, $r_site);
267 0 0         if($d < INF){
268 0           $tot_dist += $d;
269 0           $tot_seg += $right - $left -1;
270             }
271 0           $left = $right; # move into new chromosome.
272 0           $l_site = $self->{siteList}->[$left];
273             }
274             } # Upon exit: left and right sites are always on the same chromosome.
275             # Add the last chromosome.
276 0           $tot_dist += site_dist($l_site, $r_site);
277 0           $tot_seg += $right - $left -1;
278 0 0         $self->{'glbLambda'} = $tot_seg > 0? $tot_dist / $tot_seg : INF;
279             }
280              
281             # Determine if two sites are on the same chromosome.
282             sub same_chr{
283 0     0 0   my($l_site,$r_site) = @_;
284 0           return $l_site->{'chrom'} eq $r_site->{'chrom'};
285             }
286              
287             # Calcualte distance between two sites. Always return a positive value.
288             # If Left > Right, stop the program and issue a fatal error.
289             sub site_dist{
290 0     0 0   my($l_site,$r_site) = @_;
291 0 0         if(!same_chr($l_site, $r_site)){
292 0           return INF;
293             }
294 0 0         if($l_site->{'siteCenter'} > $r_site->{'siteCenter'}){
295 0           die "Left site has larger coordinate than right site when calculating distance. Check program logic. Stop now.\n";
296             }
297 0           return $r_site->{'siteCenter'} - $l_site->{'siteCenter'};
298             }
299              
300             # Initialize a cluster with the current site.
301             sub init_clust{
302 0     0 0   my($self) = @_;
303 0           $self->{clust}{from} = $self->{curPos};
304 0           $self->{clust}{to} = $self->{curPos};
305 0           $self->{clust}{seg} = undef;
306 0           $self->{clust}{dist} = undef;
307 0           $self->{clust}{enrich} = undef;
308 0           $self->{clust}{pval} = 1.0;
309              
310 0           return $self->{clust}{pval};
311             }
312              
313             # Are we at the end of the site list?
314             sub is_list_end{
315 0     0 0   my($self) = @_;
316 0           return $self->{curPos} >= @{$self->{siteList}} - 1;
  0            
317             }
318              
319             # Shift the leftmost site out of the cluster.
320             sub shift_clust{
321 0     0 0   my($self) = @_;
322 0 0         if($self->{clust}{from} < $self->{clust}{to}){
323 0           $self->{clust}{from}++;
324 0           return $self->sigeval();
325             }else{
326 0           warn "Only one site left in cluster. Shift was not performed.\n";
327 0           return $self->{clust}{pval};
328             }
329             }
330              
331             # Un-shift the leftmost site into the cluster.
332             sub unshift_clust{
333 0     0 0   my($self) = @_;
334 0 0         if($self->{clust}{from} > 0){
335 0           $self->{clust}{from}--;
336 0           return $self->sigeval();
337             }else{
338 0           warn "Leftmost site is already at the beginning of the difflist. Unshift was not performed.\n";
339 0           return $self->{clust}{pval};
340             }
341             }
342              
343             # Add a site to the rightmost of the cluster.
344             sub push_clust{
345 0     0 0   my($self) = @_;
346 0 0         if($self->{curPos} < @{$self->{siteList}}){
  0            
347 0           $self->{curPos}++; # move current position to the right.
348 0           $self->{clust}{to}++;
349 0           return $self->sigeval();
350             }
351             else {
352 0           warn "Current position is already at the end of the list. Push was not performed.\n";
353 0           return $self->{clust}{pval};
354             }
355             }
356              
357             # Remove the rightmost site from the cluster.
358             sub pop_clust{
359 0     0 0   my($self) = @_;
360 0 0         if($self->{clust}{to} > $self->{clust}{from}){
361 0           $self->{clust}{to}--;
362 0           return $self->sigeval();
363             }
364             else{
365 0           warn "Site list cannot be empty. Pop was not performed.\n";
366 0           return $self->{clust}{pval};
367             }
368             }
369              
370             # Update distance and segments for current cluster.
371             sub dist_seg_clust{
372 0     0 0   my($self) = @_;
373 0           my $l_loc = $self->{clust}{from};
374 0           my $r_loc = $self->{clust}{to};
375 0           $self->{'clust'}{'seg'} = $r_loc - $l_loc;
376 0           my $l_site = $self->{'siteList'}->[$l_loc];
377 0           my $r_site = $self->{'siteList'}->[$r_loc];
378 0           $self->{'clust'}{'dist'} = &site_dist($l_site, $r_site);
379             }
380              
381             # Extract a cluster's info to external procedure.
382             sub get_clustinfo{
383 0     0 0   my($self,$h) = @_;
384 0           my $from = $self->{clust}{from};
385 0           my $to = $self->{clust}{to};
386 0 0         if($from < $to){
387 0           my @v_rec; # vector of records of cluster members.
388 0           for my $i($from..$to){
389 0           push @v_rec, $self->{siteList}[$i]{recLoc};
390             }
391 0           my %h_mark; # hash of mark types;
392 0           for my $i($from..$to){
393 0           $h_mark{$self->{siteList}[$i]{markType}} = 1;
394             }
395 0           my @v_mark = sort {$a cmp $b} keys %h_mark; # vector of unique mark types.
  0            
396 0           my $chrom = $self->{siteList}[$from]{chrom};
397 0           my $start = $self->{siteList}[$from]{start};
398 0           my $end = $self->{siteList}[$to]{end};
399             return { # return an anonymous hash.
400 0           chrom => $chrom,
401             start => $start,
402             end => $end,
403             len => $end-$start+1,
404             enrich => $self->{clust}{enrich},
405             pval => $self->{clust}{pval},
406             v_rec => [@v_rec],
407             v_mark => [@v_mark]
408             };
409             }else{
410 0           warn "Cluster contains less than two sites. No print was done.\n";
411 0           return {};
412             }
413             }
414              
415             # Get number of sites.
416             sub get_siteN{
417 0     0 0   my($self) = @_;
418 0           return scalar @{$self->{siteList}};
  0            
419             }
420              
421              
422             # Class to manage cluster list.
423             package diffReps::ClustList;
424 1     1   8 use strict;
  1         2  
  1         48  
425 1     1   5 use MyBioinfo::Common qw(padjBH);
  1         2  
  1         709  
426              
427             # Constructor.
428             sub new{
429 0     0     my $class = shift;
430 0           my $self = {
431             clustList => [],
432             f_handler => undef
433             };
434 0           bless $self, $class;
435 0           return $self;
436             }
437              
438             # Add a cluster to the list.
439             sub add{
440 0     0     my($self,$rc) = @_;
441 0           push @{$self->{clustList}}, $rc;
  0            
442             }
443              
444             # Adjust cluster P-values by BH.
445             sub adjPval{
446 0     0     my($self,$N) = @_;
447 0           my @p;
448 0           foreach my $c(@{$self->{clustList}}) {push @p, $c->{pval};}
  0            
  0            
449 0           my @padj = padjBH(\@p, $N);
450 0           my $i = 0; # iterator for adjusted P-values.
451 0           foreach my $c(@{$self->{clustList}}) {$c->{padj} = $padj[$i++];}
  0            
  0            
452             }
453              
454             # Output all clusters' info to a file.
455             sub output{
456 0     0     my($self) = @_;
457 0 0         if(!defined $self->{f_handler}){
458 0           warn "File handler has not been opened yet.\n";
459 0           return;
460             }
461 0           my $h = $self->{f_handler};
462 0           foreach my $c(@{$self->{clustList}}){
  0            
463 0           print $h join("\t", $c->{chrom},
464             $c->{start}, $c->{end}, $c->{len},
465             $c->{enrich},
466             $c->{pval}, $c->{padj},
467 0           scalar @{$c->{v_rec}}, join(';', @{$c->{v_rec}}),
  0            
468 0           scalar @{$c->{v_mark}}, join(';', @{$c->{v_mark}})), "\n";
  0            
469             }
470             }
471              
472             # Output header line.
473             sub print_header{
474 0     0     my($self) = @_;
475 0           my $h = $self->{f_handler};
476 0 0         if(defined $h){
477 0           print $h "Chrom\tStart\tEnd\tLength\tenrich\tpval\tpadj\tnsite\tSites\tntype\tMarkType\n";
478             }else{
479 0           warn "File handler has not been opened yet. Nothing was done.\n";
480             }
481             }
482              
483             # Spit a line into opened file.
484             sub spit{
485 0     0     my($self,$line) = @_;
486 0           my $h = $self->{f_handler};
487 0 0         if(defined $h){
488 0           print $h "$line\n";
489             }else{
490 0           warn "File handler has not been opened yet. Nothing was done.\n";
491             }
492             }
493              
494             # Open file handler.
495             sub open_file{
496 0     0     my($self,$f) = @_;
497 0           my $h;
498 0 0         open $h, '>', $f or die "Open output file error: $!\n";
499 0           $self->{f_handler} = $h;
500             }
501              
502             # Close file handler.
503             sub done{
504 0     0     my($self) = @_;
505 0 0         close $self->{f_handler} if defined $self->{f_handler};
506             }
507              
508              
509              
510              
511             # Preloaded methods go here.
512              
513             # Autoload methods go after =cut, and are processed by the autosplit program.
514              
515             1;
516             __END__