File Coverage

blib/lib/MyShortRead/MyShortRead.pm

Criterion	Covered	Total	%
statement	15	181	8.2
branch	0	64	0.0
condition	0	9	0.0
subroutine	5	16	31.2
pod	0	11	0.0
total	20	281	7.1

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# My Perl library to deal with short read sequencing data.
3							# created by Li Shen, Nov 2009.
4							#
5							# strGCPercent added March 2010 - Li Shen
6							# bin_genome_count,read_chrom_len,sep_chrom_bed,del_chrfile added April 2010 - Li Shen
7							# bin_genome_count_direction added June 2010 - Li Shen
8							#
9							#
10
11	1			1		15	use 5.006;
	1					4
	1					41
12	1			1		5	use strict;
	1					1
	1					62
13	1			1		5	use warnings;
	1					2
	1					55
14							package MyShortRead::MyShortRead;
15
16
17	1			1		5	use POSIX;
	1					1
	1					9
18	1			1		4522	use Time::HiRes qw(time);
	1					3622
	1					5
19
20							require Exporter;
21							our @ISA = qw(Exporter);
22
23							# Items to export into callers namespace by default. Note: do not export
24							# names by default without a very good reason. Use EXPORT_OK instead.
25							# Do not simply export all your public functions/methods/constants.
26
27							# This allows declaration use MyShortRead::MyShortRead ':all';
28							# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
29							# will save memory.
30							our %EXPORT_TAGS = ( 'all' => [ qw(
31
32							) ] );
33
34							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
35
36							our @EXPORT = qw(compare2 overlap2 strGCPercent bin_genome_count bin_genome_count_direction read_chrlen_tbl read_chrlen_ordered sep_chrom_bed del_chrfile order_chr);
37
38							our $VERSION = '1.00';
39
40
41							# Preloaded methods go here.
42
43							# Comparing two genomic intervals according to start or end position.
44							sub compare2{
45	0			0	0		my($a,$b) = @_;
46	0						my $cmpby = 'start';
47	0	0					if(@_ > 2) {$cmpby = $_[2];}
	0
48
49	0						$a->{chrom} =~ /^(chr)?(\w+)$/;
50	0						my $chr1 = $2;
51	0						$b->{chrom} =~ /^(chr)?(\w+)$/;
52	0						my $chr2 = $2;
53
54							# $chr1 = 100 if uc $chr1 eq 'X';
55							# $chr1 = 101 if uc $chr1 eq 'Y';
56							# $chr1 = 102 if uc $chr1 eq 'M';
57							# $chr1 = 199 if uc $chr1 eq 'Z';
58							# $chr2 = 100 if uc $chr2 eq 'X';
59							# $chr2 = 101 if uc $chr2 eq 'Y';
60							# $chr2 = 102 if uc $chr2 eq 'M';
61							# $chr2 = 199 if uc $chr2 eq 'Z';
62
63							# return $chr1 <=> $chr2 unless $chr1 == $chr2;
64
65	0	0					return $chr1 cmp $chr2 unless $chr1 eq $chr2;
66
67	0	0					if($cmpby eq 'end') {return $a->{'end'} <=> $b->{'end'};}
	0
	0
68							else {return $a->{'start'} <=> $b->{'start'};}
69							}
70
71							# Order chromosome names.
72							sub order_chr{
73	0			0	0		my($a,$b) = @_;
74	0						my($chr1, $chr2);
75							# Find chromosome names.
76	0	0					if(exists $a->{'chr'}) {$chr1 = $a->{'chr'};}
	0	0
	0
77	0						elsif(exists $a->{chrom}) {$chr1 = $a->{chrom};}
78	0						else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;}
79	0	0					if(exists $b->{'chr'}) {$chr2 = $b->{'chr'};}
	0	0
	0
80	0						elsif(exists $b->{chrom}) {$chr2 = $b->{chrom};}
81	0						else {warn "Cannot find chromosome name! Order is not determined.\n"; return 0;}
82
83	0						return $chr1 cmp $chr2;
84
85							# # Get rid of 'chr' if exists.
86							# $chr1 =~ /^(chr)?(\w+)$/; $chr1 = $2;
87							# $chr2 =~ /^(chr)?(\w+)$/; $chr2 = $2;
88							# # Convert letters to large numbers.
89							# $chr1 = 100 if uc $chr1 eq 'X';
90							# $chr1 = 101 if uc $chr1 eq 'Y';
91							# $chr1 = 102 if uc $chr1 eq 'M';
92							# $chr1 = 199 if uc $chr1 eq 'Z';
93							# $chr2 = 100 if uc $chr2 eq 'X';
94							# $chr2 = 101 if uc $chr2 eq 'Y';
95							# $chr2 = 102 if uc $chr2 eq 'M';
96							# $chr2 = 199 if uc $chr2 eq 'Z';
97
98							# return $chr1 <=> $chr2;
99							}
100
101							# Find the interval list with the lowest start or end position.
102							# return the index of the lowest list and set the position through reference.
103							sub find_lowest_list{
104	0			0	0		my($rlists, $rpos) = @_;
105	0						my $cmpby = 'start';
106	0	0					if(@_ > 2) {$cmpby = $_[2];}
	0
107	0						my $ll = 0; # initialize lowest list.
108	0						my $lowest_intrv = $rlists->[$ll]{cur_intrv}; # intrv with the lowest end position.
109	0						for my $i(1..$#{$rlists}){
	0
110	0						my $cur_intrv = $rlists->[$i]{cur_intrv};
111	0	0					if(compare2($cur_intrv, $lowest_intrv, $cmpby) < 0){
112	0						$lowest_intrv = $cur_intrv;
113	0						$ll = $i;
114							}
115							}
116							# return the lowest position through reference.
117	0	0					${$rpos} = $cmpby eq 'end'? $lowest_intrv->{'end'} : $lowest_intrv->{'start'};
	0
118	0						return $ll;
119							}
120
121							# Judge whether two intervals overlap.
122							sub overlap2{
123	0			0	0		my($refintrv1, $refintrv2) = @_;
124	0	0					if($refintrv1->{chrom} eq $refintrv2->{chrom}){
125	0	0	0				if($refintrv1->{'start'} <= $refintrv2->{'end'} and
126							$refintrv1->{'end'} >= $refintrv2->{'start'}){
127	0						return 1;
128							}
129							}
130	0						return 0;
131							}
132
133							# Calculate the GC percentage given a DNA sequence in string representation.
134							sub strGCPercent{
135	0			0	0		my $dnaSeq = shift;
136	0						my $c = ($dnaSeq =~ s/[gc]/X/gi); # Count the # of G or C without regarding to case.
137	0						my $n = ($dnaSeq =~ s/n/Y/gi); # count the # of 'N's.
138	0						return ($c+$n/2) / length($dnaSeq);
139							}
140
141							# Given a BED file, bin the genome and count the reads in bins.
142							sub bin_genome_count{
143							# BED file,bin size,chromosome length,fragment size,reference to bin vector.
144	0			0	0		my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec) = @_;
145	0	0					open HBED, "<", $bedfile or die "Open bed file error: $!\n";
146	0						$#{$r_binVec} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
	0
147	0						foreach my $i(0..$#{$r_binVec}) {$r_binVec->[$i] = 0;} # Initialize bin vector to zeros.
	0
	0
148	0						my $line = 0;
149	0						while(){
150	0						$line++;
151	0						chomp;
152	0						my @cells = split /\t/;
153	0	0					if(@cells < 6){
154	0						warn "Format error at $bedfile, line:$line. Skip!\n";
155	0						next;
156							}
157	0						my($chrom,$start,$end,$name,$score,$strand) = @cells;
158							# Shift read location by fragment size.
159	0						my $loc;
160	0	0					if($strand eq '+') {$loc = $start + $fragsize/2;}
	0
	0
161							else {$loc = $end - $fragsize/2;}
162							# Increment read count in bin.
163							# $loc--; # Decrement location to correctly assign the read to bin vector.(NO! BED is 0-based.)
164	0						$r_binVec->[floor($loc/$binsize)]++;
165							}
166	0						close HBED;
167							}
168
169							# Bin the genome and count the reads with direction.
170							sub bin_genome_count_direction{
171							# BED file,bin size,chromosome length,fragment size,reference to bin vector.
172	0			0	0		my($bedfile,$binsize,$chrlen,$fragsize,$r_binVec_F,$r_binVec_R) = @_;
173	0	0					open HBED, "<", $bedfile or die "Open bed file error: $!\n";
174	0						$#{$r_binVec_F} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
	0
175	0						$#{$r_binVec_R} = ceil($chrlen/$binsize)-1; # Calculate bin vector size.\
	0
176	0						foreach my $i(0..$#{$r_binVec_F}) {$r_binVec_F->[$i] = 0;} # Initialize bin vector to zeros.
	0
	0
177	0						foreach my $i(0..$#{$r_binVec_R}) {$r_binVec_R->[$i] = 0;} # Initialize bin vector to zeros.
	0
	0
178	0						my $line = 0;
179	0						while(){
180	0						$line++;
181	0						chomp;
182	0						my @cells = split;
183	0	0					if(@cells < 6){
184	0						warn "Format error at $bedfile, line:$line. Skip!\n";
185	0						next;
186							}
187	0						my($chrom,$start,$end,$name,$score,$strand) = @cells;
188							# Shift read location by fragment size and then increment the read count at the correct location.
189	0						my $loc;
190	0	0					if($strand eq '+') {
191	0						$loc = $start + $fragsize/2;
192	0						$r_binVec_F->[floor($loc/$binsize)]++;
193							}
194							else {
195	0						$loc = $end - $fragsize/2;
196	0						$r_binVec_R->[floor($loc/$binsize)]++;
197							}
198							}
199	0						close HBED;
200							}
201
202							# Read chromosome name and length into a hash table.
203							sub read_chrlen_tbl{
204	0			0	0		my $chrfile = shift;
205	0	0					open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n";
206	0						my $line = 0;
207	0						my %chrlen; # hash table for chromosome length.
208	0						while(){
209	0						$line++;
210	0						chomp;
211	0	0					next if $_ eq ''; # skip empty line.
212	0						my @cells = split;
213	0	0					warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2;
214	0						my($chrom,$len) = @cells;
215	0	0	0				if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){
	0
216	0						$chrlen{$chrom} = $len;
217							}
218							else {warn "Format error at $chrfile, line:$line. Skip!\n"}
219							}
220	0						return %chrlen;
221							}
222
223							# Read chromosome name and length into an array and order them by name.
224							sub read_chrlen_ordered{
225	0			0	0		my $chrfile = shift;
226	0	0					open HCHR, "<", $chrfile or die "Cannot open chromosome length file:$!\n";
227	0						my $line = 0;
228	0						my @chrlen; # array for chromosome length.
229	0						while(){
230	0						$line++;
231	0						chomp;
232	0	0					next if $_ eq ''; # skip empty line.
233	0						my @cells = split;
234	0	0					warn "Insufficient chromosome information at $chrfile, line:$line. Skip!\n" if @cells < 2;
235	0						my($chrom,$len) = @cells;
236	0	0	0				if($chrom =~ /^\S+$/ and $len =~ /^[0-9]+$/){
	0
237	0						my %rec = (
238							chrom => $chrom,
239							len => $len);
240	0						push @chrlen, \%rec;
241							}
242							else {warn "Format error at $chrfile, line:$line. Skip!\n"}
243							}
244	0						return sort {order_chr($a,$b)} @chrlen; # return ordered list.
	0
245							}
246
247
248							# Separate a BED file into small ones by chromosome names.
249							sub sep_chrom_bed{
250							# BED file, reference to chromosome length hash table.
251	0			0	0		my($bedfile,@chrname) = @_;
252	0						my $timestamp = time;
253	0						my $timehead = ".$timestamp."; # timestamp file name head.
254	0						my %hsh_chrfile; # hash of chromosome file handles.
255							my %hsh_chrname; # hash of chromosome file names.
256							# Create chromosome files for writing.
257	0						foreach my $chrom(@chrname){
258	0						my $chrfile = $timehead . $chrom . ".bed";
259	0						while(-e $chrfile){ # prevent existing files from being overridden.
260	0						$chrfile = ($timehead + rand) . $chrom . ".bed";
261							}
262	0						my $fh;
263	0	0					open $fh, ">", $chrfile or die "Cannot create bed file:$!\n";
264	0						$hsh_chrfile{$chrom} = $fh;
265	0						$hsh_chrname{$chrom} = $chrfile;
266							}
267	0	0					open HBED, "<", $bedfile or die "Cannot open $bedfile:$!\n";
268	0						my $line = 0; # line number in BED file.
269	0						my $nread = 0; # actual number of reads splitted.
270							# Go through bed file and assign each line to corresponding file.
271	0						while(){
272	0						$line++;
273	0						chomp;
274	0	0					next if $_ eq '';
275	0						my @cells = split;
276	0	0					if(@cells < 6){
277	0						warn "Insufficient tag information at $bedfile, line:$line. Skip!\n";
278	0						next;
279							}
280	0						my($chrom,$start,$end,$name,$score,$strand) = @cells;
281	0	0					if(exists $hsh_chrfile{$chrom}){
282	0						$nread++;
283	0						print {$hsh_chrfile{$chrom}} $_ . "\n";
	0
284							}
285							else{
286	0						warn "Unspecified chromosome name at $bedfile, line:$line.\nCheck your chromosome length file. Skip!\n";
287							}
288							}
289							# Close all files.
290	0						close HBED;
291	0						foreach my $hdl(values %hsh_chrfile) {close $hdl;}
	0
292
293							# Return chromosome file name vector.
294	0						return ($nread, \%hsh_chrname);
295							}
296
297							# Delete all separated chromosome files.
298							sub del_chrfile{
299	0			0	0		foreach my $file(@_) {
300	0	0					unlink $file or warn "Cannot delete $file: $!\n";
301							}
302							}
303
304							1;
305
306							__END__