File Coverage

blib/lib/MyShortRead/MyShortRead.pm
Criterion Covered Total %
statement 15 181 8.2
branch 0 64 0.0
condition 0 9 0.0
subroutine 5 16 31.2
pod 0 11 0.0
total 20 281 7.1


line stmt bran cond sub pod time code
1             #
2             # My Perl library to deal with short read sequencing data.
3             # created by Li Shen, Nov 2009.
4             #
5             # strGCPercent added March 2010 - Li Shen
6             # bin_genome_count,read_chrom_len,sep_chrom_bed,del_chrfile added April 2010 - Li Shen
7             # bin_genome_count_direction added June 2010 - Li Shen
8             #
9             #
10              
11 1     1   15 use 5.006;
  1         4  
  1         41  
12 1     1   5 use strict;
  1         1  
  1         62  
13 1     1   5 use warnings;
  1         2  
  1         55  
14             package MyShortRead::MyShortRead;
15              
16              
17 1     1   5 use POSIX;
  1         1  
  1         9  
18 1     1   4522 use Time::HiRes qw(time);
  1         3622  
  1         5  
19              
20             require Exporter;
21             our @ISA = qw(Exporter);
22              
23             # Items to export into callers namespace by default. Note: do not export
24             # names by default without a very good reason. Use EXPORT_OK instead.
25             # Do not simply export all your public functions/methods/constants.
26              
27             # This allows declaration use MyShortRead::MyShortRead ':all';
28             # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
29             # will save memory.
30             our %EXPORT_TAGS = ( 'all' => [ qw(
31            
32             ) ] );
33              
34             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
35              
36             our @EXPORT = qw(compare2 overlap2 strGCPercent bin_genome_count bin_genome_count_direction read_chrlen_tbl read_chrlen_ordered sep_chrom_bed del_chrfile order_chr);
37              
38             our $VERSION = '1.00';
39              
40              
41             # Preloaded methods go here.
42              
43             # Comparing two genomic intervals according to start or end position.
44             sub compare2{
45 0     0 0   my($a,$b) = @_;
46 0           my $cmpby = 'start';
47 0 0         if(@_ > 2) {$cmpby = $_[2];}
  0            
48              
49 0           $a->{chrom} =~ /^(chr)?(\w+)$/;
50 0           my $chr1 = $2;
51 0           $b->{chrom} =~ /^(chr)?(\w+)$/;
52 0           my $chr2 = $2;
53            
54             # $chr1 = 100 if uc $chr1 eq 'X';
55             # $chr1 = 101 if uc $chr1 eq 'Y';
56             # $chr1 = 102 if uc $chr1 eq 'M';
57             # $chr1 = 199 if uc $chr1 eq 'Z';
58             # $chr2 = 100 if uc $chr2 eq 'X';
59             # $chr2 = 101 if uc $chr2 eq 'Y';
60             # $chr2 = 102 if uc $chr2 eq 'M';
61             # $chr2 = 199 if uc $chr2 eq 'Z';
62              
63             # return $chr1 <=> $chr2 unless $chr1 == $chr2;
64              
65 0 0         return $chr1 cmp $chr2 unless $chr1 eq $chr2;
66              
67 0 0         if($cmpby eq 'end') {return $a->{'end'} <=> $b->{'end'};}
  0            
  0            
68             else {return $a->{'start'} <=> $b->{'start'};}
69             }
70              
71             # Order chromosome names.
72             sub order_chr{
73 0     0 0   my($a,$b) = @_;
74 0           my($chr1, $chr2);
75             # Find chromosome names.
76 0 0         if(exists $a->{'chr'}) {$chr1 = $a->{'chr'};}
  0 0          
  0            
77 0           elsif(exists $a->{chrom}) {$chr1 = $a->{chrom};}
78 0           else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;}
79 0 0         if(exists $b->{'chr'}) {$chr2 = $b->{'chr'};}
  0 0          
  0            
80 0           elsif(exists $b->{chrom}) {$chr2 = $b->{chrom};}
81 0           else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;}
82              
83 0           return $chr1 cmp $chr2;
84              
85             # # Get rid of 'chr' if exists.
86             # $chr1 =~ /^(chr)?(\w+)$/; $chr1 = $2;
87             # $chr2 =~ /^(chr)?(\w+)$/; $chr2 = $2;
88             # # Convert letters to large numbers.
89             # $chr1 = 100 if uc $chr1 eq 'X';
90             # $chr1 = 101 if uc $chr1 eq 'Y';
91             # $chr1 = 102 if uc $chr1 eq 'M';
92             # $chr1 = 199 if uc $chr1 eq 'Z';
93             # $chr2 = 100 if uc $chr2 eq 'X';
94             # $chr2 = 101 if uc $chr2 eq 'Y';
95             # $chr2 = 102 if uc $chr2 eq 'M';
96             # $chr2 = 199 if uc $chr2 eq 'Z';
97              
98             # return $chr1 <=> $chr2;
99             }
100              
101             # Find the interval list with the lowest start or end position.
102             # return the index of the lowest list and set the position through reference.
103             sub find_lowest_list{
104 0     0 0   my($rlists, $rpos) = @_;
105 0           my $cmpby = 'start';
106 0 0         if(@_ > 2) {$cmpby = $_[2];}
  0            
107 0           my $ll = 0; # initialize lowest list.
108 0           my $lowest_intrv = $rlists->[$ll]{cur_intrv}; # intrv with the lowest end position.
109 0           for my $i(1..$#{$rlists}){
  0            
110 0           my $cur_intrv = $rlists->[$i]{cur_intrv};
111 0 0         if(compare2($cur_intrv, $lowest_intrv, $cmpby) < 0){
112 0           $lowest_intrv = $cur_intrv;
113 0           $ll = $i;
114             }
115             }
116             # return the lowest position through reference.
117 0 0         ${$rpos} = $cmpby eq 'end'? $lowest_intrv->{'end'} : $lowest_intrv->{'start'};
  0            
118 0           return $ll;
119             }
120              
121             # Judge whether two intervals overlap.
122             sub overlap2{
123 0     0 0   my($refintrv1, $refintrv2) = @_;
124 0 0         if($refintrv1->{chrom} eq $refintrv2->{chrom}){
125 0 0 0       if($refintrv1->{'start'} <= $refintrv2->{'end'} and
126             $refintrv1->{'end'} >= $refintrv2->{'start'}){
127 0           return 1;
128             }
129             }
130 0           return 0;
131             }
132              
133             # Calculate the GC percentage given a DNA sequence in string representation.
134             sub strGCPercent{
135 0     0 0   my $dnaSeq = shift;
136 0           my $c = ($dnaSeq =~ s/[gc]/X/gi); # Count the # of G or C without regarding to case.
137 0           my $n = ($dnaSeq =~ s/n/Y/gi); # count the # of 'N's.
138 0           return ($c+$n/2) / length($dnaSeq);
139             }
140              
141             # Given a BED file, bin the genome and count the reads in bins.
142             sub bin_genome_count{
143             # BED file,bin size,chromosome length,fragment size,reference to bin vector.
144 0     0 0   my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec) = @_;
145 0 0         open HBED, "<", $bedfile or die "Open bed file error: $!\n";
146 0           $#{$r_binVec} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
  0            
147 0           foreach my $i(0..$#{$r_binVec}) {$r_binVec->[$i] = 0;} # Initialize bin vector to zeros.
  0            
  0            
148 0           my $line = 0;
149 0           while(){
150 0           $line++;
151 0           chomp;
152 0           my @cells = split /\t/;
153 0 0         if(@cells < 6){
154 0           warn "Format error at $bedfile, line:$line. Skip!\n";
155 0           next;
156             }
157 0           my($chrom,$start,$end,$name,$score,$strand) = @cells;
158             # Shift read location by fragment size.
159 0           my $loc;
160 0 0         if($strand eq '+') {$loc = $start + $fragsize/2;}
  0            
  0            
161             else {$loc = $end - $fragsize/2;}
162             # Increment read count in bin.
163             # $loc--; # Decrement location to correctly assign the read to bin vector.(NO! BED is 0-based.)
164 0           $r_binVec->[floor($loc/$binsize)]++;
165             }
166 0           close HBED;
167             }
168              
169             # Bin the genome and count the reads with direction.
170             sub bin_genome_count_direction{
171             # BED file,bin size,chromosome length,fragment size,reference to bin vector.
172 0     0 0   my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec_F,$r_binVec_R) = @_;
173 0 0         open HBED, "<", $bedfile or die "Open bed file error: $!\n";
174 0           $#{$r_binVec_F} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
  0            
175 0           $#{$r_binVec_R} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
  0            
176 0           foreach my $i(0..$#{$r_binVec_F}) {$r_binVec_F->[$i] = 0;} # Initialize bin vector to zeros.
  0            
  0            
177 0           foreach my $i(0..$#{$r_binVec_R}) {$r_binVec_R->[$i] = 0;} # Initialize bin vector to zeros.
  0            
  0            
178 0           my $line = 0;
179 0           while(){
180 0           $line++;
181 0           chomp;
182 0           my @cells = split;
183 0 0         if(@cells < 6){
184 0           warn "Format error at $bedfile, line:$line. Skip!\n";
185 0           next;
186             }
187 0           my($chrom,$start,$end,$name,$score,$strand) = @cells;
188             # Shift read location by fragment size and then increment the read count at the correct location.
189 0           my $loc;
190 0 0         if($strand eq '+') {
191 0           $loc = $start + $fragsize/2;
192 0           $r_binVec_F->[floor($loc/$binsize)]++;
193             }
194             else {
195 0           $loc = $end - $fragsize/2;
196 0           $r_binVec_R->[floor($loc/$binsize)]++;
197             }
198             }
199 0           close HBED;
200             }
201              
202             # Read chromosome name and length into a hash table.
203             sub read_chrlen_tbl{
204 0     0 0   my $chrfile = shift;
205 0 0         open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n";
206 0           my $line = 0;
207 0           my %chrlen; # hash table for chromosome length.
208 0           while(){
209 0           $line++;
210 0           chomp;
211 0 0         next if $_ eq ''; # skip empty line.
212 0           my @cells = split;
213 0 0         warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2;
214 0           my($chrom,$len) = @cells;
215 0 0 0       if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){
  0            
216 0           $chrlen{$chrom} = $len;
217             }
218             else {warn "Format error at $chrfile, line:$line. Skip!\n"}
219             }
220 0           return %chrlen;
221             }
222              
223             # Read chromosome name and length into an array and order them by name.
224             sub read_chrlen_ordered{
225 0     0 0   my $chrfile = shift;
226 0 0         open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n";
227 0           my $line = 0;
228 0           my @chrlen; # array for chromosome length.
229 0           while(){
230 0           $line++;
231 0           chomp;
232 0 0         next if $_ eq ''; # skip empty line.
233 0           my @cells = split;
234 0 0         warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2;
235 0           my($chrom,$len) = @cells;
236 0 0 0       if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){
  0            
237 0           my %rec = (
238             chrom => $chrom,
239             len => $len);
240 0           push @chrlen, \%rec;
241             }
242             else {warn "Format error at $chrfile, line:$line. Skip!\n"}
243             }
244 0           return sort {order_chr($a,$b)} @chrlen; # return ordered list.
  0            
245             }
246              
247              
248             # Separate a BED file into small ones by chromosome names.
249             sub sep_chrom_bed{
250             # BED file, reference to chromosome length hash table.
251 0     0 0   my($bedfile,@chrname) = @_;
252 0           my $timestamp = time;
253 0           my $timehead = ".$timestamp."; # timestamp file name head.
254 0           my %hsh_chrfile; # hash of chromosome file handles.
255             my %hsh_chrname; # hash of chromosome file names.
256             # Create chromosome files for writing.
257 0           foreach my $chrom(@chrname){
258 0           my $chrfile = $timehead . $chrom . ".bed";
259 0           while(-e $chrfile){ # prevent existing files from being overridden.
260 0           $chrfile = ($timehead + rand) . $chrom . ".bed";
261             }
262 0           my $fh;
263 0 0         open $fh, ">", $chrfile or die "Cannot create bed file:$!\n";
264 0           $hsh_chrfile{$chrom} = $fh;
265 0           $hsh_chrname{$chrom} = $chrfile;
266             }
267 0 0         open HBED, "<", $bedfile or die "Cannot open $bedfile:$!\n";
268 0           my $line = 0; # line number in BED file.
269 0           my $nread = 0; # actual number of reads splitted.
270             # Go through bed file and assign each line to corresponding file.
271 0           while(){
272 0           $line++;
273 0           chomp;
274 0 0         next if $_ eq '';
275 0           my @cells = split;
276 0 0         if(@cells < 6){
277 0           warn "Insufficient tag information at $bedfile, line:$line. Skip!\n";
278 0           next;
279             }
280 0           my($chrom,$start,$end,$name,$score,$strand) = @cells;
281 0 0         if(exists $hsh_chrfile{$chrom}){
282 0           $nread++;
283 0           print {$hsh_chrfile{$chrom}} $_ . "\n";
  0            
284             }
285             else{
286 0           warn "Unspecified chromosome name at $bedfile, line:$line.\nCheck your chromosome length file. Skip!\n";
287             }
288             }
289             # Close all files.
290 0           close HBED;
291 0           foreach my $hdl(values %hsh_chrfile) {close $hdl;}
  0            
292              
293             # Return chromosome file name vector.
294 0           return ($nread, \%hsh_chrname);
295             }
296              
297             # Delete all separated chromosome files.
298             sub del_chrfile{
299 0     0 0   foreach my $file(@_) {
300 0 0         unlink $file or warn "Cannot delete $file: $!\n";
301             }
302             }
303              
304             1;
305              
306             __END__