File Coverage

blib/lib/CracTools/Utils.pm

Criterion	Covered	Total	%
statement	187	271	69.0
branch	41	98	41.8
condition	13	30	43.3
subroutine	27	40	67.5
pod	27	27	100.0
total	295	466	63.3

line	stmt	bran	cond	sub	pod	time	code
1							package CracTools::Utils;
2
3							{
4							$CracTools::Utils::DIST = 'CracTools';
5							}
6							# ABSTRACT: A set of useful functions
7							$CracTools::Utils::VERSION = '1.251';
8	8			8		18571	use strict;
	8					19
	8					202
9	8			8		38	use warnings;
	8					16
	8					207
10
11	8			8		37	use Carp;
	8					22
	8					432
12	8			8		42	use Fcntl qw( SEEK_SET );
	8					16
	8					22530
13
14
15							sub reverseComplement($) {
16	1			1	1	11	my $dna = shift;
17
18							# reverse the DNA sequence
19	1					4	my $revcomp = reverse $dna;
20
21							# complement the reversed DNA sequence
22	1					4	$revcomp =~ tr/ACGTacgt/TGCAtgca/;
23	1					10	return $revcomp;
24							}
25
26
27							sub reverse_tab($) {
28	1			1	1	4	my $string = shift;
29	1					7	my @tab = split(/,/,$string);
30	1					4	my $newString;
31	1	50				6	if(@tab > 0) {
32	1					7	for (my $i=$#tab ; $i > 0 ; $i--){
33	3					7	$newString .= $tab[$i];
34	3					13	$newString .= ",";
35							}
36	1					3	$newString .= $tab[0];
37							}
38	1					7	return $newString;
39							}
40
41
42							sub isVersionGreaterOrEqual($$) {
43	0			0	1	0	my ($v1,$v2) = @_;
44	0					0	my @v1_nums = split(/\./,$v1);
45	0					0	my @v2_nums = split(/\./,$v2);
46	0					0	for(my $i = 0; $i < @v1_nums; $i++) {
47	0	0				0	if($v1_nums[$i] >= $v2_nums[$i]) {
48	0					0	return 1;
49							}else {
50	0					0	return 0;
51							}
52							}
53	0	0				0	if(scalar @v2_nums > @v1_nums) {
54	0					0	return 0;
55							} else {
56	0					0	return 1;
57							}
58							}
59
60
61							my %conversion_hash = ( '+' => 1, '-' => '-1', '1' => '+', '-1' => '-');
62							sub convertStrand($) {
63	50			50	1	90	my $strand = shift;
64	50	50				197	return defined $strand? $conversion_hash{$strand} : undef;
65							}
66
67							sub removeChrPrefix($) {
68	0			0	1	0	my $string = shift;
69	0					0	$string =~ s/^chr//i;
70	0					0	return $string;
71							}
72
73							sub addChrPrefix($) {
74	0			0	1	0	my $string = shift;
75	0					0	return "chr".removeChrPrefix($string);
76							}
77
78
79							our $Base64_BITNESS = 6;
80							our @Base64_ENCODING = qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 + /);
81
82							# Encode error list into base64
83							sub encodePosListToBase64 {
84	1			1	1	5331	my @pos_list = @_;
85	1					3	my @bitList;
86							my $encoded_str;
87
88	1	50				9	return "" if @pos_list == 0;
89
90							# Compute the appropriate length for the bitList
91	1					3	my $length = $pos_list[-1] + 1;
92	1					4	my $bitList_length = int($length / $Base64_BITNESS);
93	1	50				4	if ($length % $Base64_BITNESS > 0) {
94	1					3	$bitList_length++;
95							}
96
97							# Put 0 at all position
98	1					4	for (my $i = 0; $i < $bitList_length; $i++) {
99	6					14	$bitList[$i] = 0;
100							}
101
102							# Create bit vector in Base64_ENCODING
103	1					4	foreach my $j (@pos_list) {
104	6					13	$bitList[int($j / $Base64_BITNESS)] \|= (1 << ($j % $Base64_BITNESS));
105							}
106
107							# Convert bitList into string_Base64_ENCODING
108	1					4	for(my $k=0; $k < $bitList_length; $k++) {
109	6					15	$encoded_str .= scalar($Base64_ENCODING[$bitList[$k]]);
110							}
111	1					4	return $encoded_str;
112							}
113
114
115							# Decode base 64 error list
116							sub decodePosListInBase64 {
117	1			1	1	5	my $encoded_str = shift;
118
119	1					6	my @encoded_list = split "", $encoded_str;
120	1					3	my (@index,@decoded_list);
121
122							#Looking for index of each char
123	1					2	foreach my $j (@encoded_list) {
124	6					19	my @ind = grep { $Base64_ENCODING[$_] eq $j } 0 .. $#Base64_ENCODING;
	384					581
125	6					37	push(@index,$ind[0]);
126							}
127
128							# Convert into list position
129	1					5	for (my $e = 0; $e < (0+@encoded_list);$e++) {
130	6					12	for(my $i=0; $i<$Base64_BITNESS; $i++) {
131	36	100				88	if ($index[$e] & (1 << $i)) {
132	6					19	my $pos = (int($i+$Base64_BITNESS*$e));
133	6					16	push(@decoded_list,$pos);
134							}
135							}
136							}
137
138	1					4	return @decoded_list;
139							}
140
141
142
143
144							sub seqFileIterator {
145	5			5	1	6265	my ($file,$format) = @_;
146
147	5	50				26	croak "Missing file in argument of seqFileIterator" if !defined $file;
148
149							# Get file handle for $file
150	5					26	my $fh = getReadingFileHandle($file);
151
152							# Automatic file extension detection
153	5	50				20	if(!defined $format) {
154	5	100				26	if($file =~ /\.(fasta\|fa)(\.\|$)/) {
		50
155	2					62	$format = 'fasta';
156							} elsif($file =~ /\.(fastq\|fq)(\.\|$)/) {
157	3					112	$format = 'fastq';
158							} else {
159	0					0	croak "Undefined file extension";
160							}
161							} else {
162	0					0	$format = lc $format;
163							}
164
165							# FASTA ITERATOR
166	5	100				28	if ($format eq 'fasta') {
		50
167							# Read prev line for FASTA because we dont know the number
168							# of line used for the sequence
169	2					27	my $prev_line = <$fh>;
170	2					9	chomp $prev_line;
171							return sub {
172	7			7		9068	my ($name,$seq,$qual);
173	7	100				26	if(defined $prev_line) {
174	5					31	($name) = $prev_line =~ />(.*)$/;
175	5					25	$prev_line = <$fh>;
176							# Until we find a new sequence identifier ">", we
177							# concatenate the lines corresponding to the sequence
178	5		100			43	while(defined $prev_line && $prev_line !~ /^>/) {
179	10					26	chomp $prev_line;
180	10					31	$seq .= $prev_line;
181	10					73	$prev_line = <$fh>;
182							}
183	5					37	return {name => $name, seq => $seq, qual => $qual};
184							} else {
185	2					9	return undef;
186							}
187	2					19	};
188							# FASTQ ITERATOR
189							} elsif ($format eq 'fastq') {
190							return sub {
191	7			7		3846	my ($name,$seq,$qual);
192	7					87	my $line = <$fh>;
193	7	100				59	($name) = $line =~ /@(.*)$/ if defined $line;
194	7	100				33	if(defined $name) {
195	6					23	$seq = <$fh>;
196	6					20	chomp $seq;
197	6					21	<$fh>; # skip second seq name (useless line)
198	6					21	$qual = <$fh>;
199	6					18	chomp $qual;
200	6					38	return {name => $name, seq => $seq, qual => $qual};
201							} else {
202	1					6	return undef;
203							}
204	3					30	};
205							} else {
206	0					0	croak "Undefined file format";
207							}
208							}
209
210
211							sub pairedEndSeqFileIterator {
212	1			1	1	3429	my($file1,$file2,$format) = @_;
213
214	1					7	my $it1 = seqFileIterator($file1,$format);
215	1					10	my $it2 = seqFileIterator($file2,$format);
216
217							return sub {
218	2			2		4983	my $entry1 = $it1->();
219	2					10	my $entry2 = $it2->();
220	2	50	33			26	if(defined $entry1 && defined $entry2) {
221	2					13	return { read1 => $entry1, read2 => $entry2 };
222							} else {
223	0					0	return undef;
224							}
225	1					9	};
226							}
227
228
229							sub writeSeq {
230	0			0	1	0	my ($fh,$format,$name,$seq,$qual) = @_;
231	0	0				0	if($format eq 'fasta') {
		0
232	0					0	print $fh ">$name\n$seq\n";
233							} elsif($format eq 'fastq') {
234	0					0	print $fh '@'."$name\n$seq\n+\n$qual\n";
235							} else {
236	0					0	die "Unknown file format. Must be either a FASTA or a FASTQ";
237							}
238							}
239
240
241							sub bedFileIterator {
242	1			1	1	9317	return getFileIterator(file => shift, type => 'bed');
243							}
244
245
246							sub gffFileIterator {
247	1			1	1	6634	my $file = shift;
248	1					5	my $type = shift;
249	1					7	return getFileIterator(file => $file, type => $type);
250							}
251
252
253							sub vcfFileIterator {
254	1			1	1	10588	my $file = shift;
255	1					6	return getFileIterator(file => $file, type => 'vcf');
256							}
257
258
259							sub chimCTFileIterator {
260	0			0	1	0	return getFileIterator(file => shift, type => 'chimCT');
261							}
262
263
264							sub bamFileIterator {
265	0			0	1	0	my ($file,$region) = @_;
266	0	0				0	$region = "" if !defined $region;
267
268	0					0	my $fh;
269	0	0				0	if($file =~ /\.bam$/) {
270	0	0				0	open($fh, "-\|", "samtools view $file $region" )or die "Cannot open $file, check if samtools are installed.";
271							} else {
272	0					0	$fh = getReadingFileHandle($file);
273							}
274
275							return sub {
276	0			0		0	return <$fh>;
277							}
278
279	0					0	}
280
281
282							sub getSeqFromIndexedRef {
283	0			0	1	0	my ($ref_file,$chr,$pos,$length,$format) = @_;
284	0	0				0	$format = 'fasta' if !defined $format;
285	0					0	$pos++; # +1 because samtools faidx is 1-based
286							# First we try without the "chr" prefix
287	0					0	my $region = removeChrPrefix($chr).":$pos-".($pos+$length-1);
288	0					0	my $fasta_seq = `samtools faidx $ref_file "$region"`;
289							# If it has not worked, we try without
290	0	0				0	if($fasta_seq !~ /^>\S+\n\S+/) {
291	0					0	$region = addChrPrefix($chr).":$pos-".($pos+$length-1);
292	0					0	$fasta_seq = `samtools faidx $ref_file "$region"`;
293							}
294	0	0				0	if($format eq 'raw') {
295	0					0	my ($seq) = $fasta_seq =~ /^>\S+\n(\S+)/;
296	0					0	return $seq;
297							}
298	0					0	return $fasta_seq;
299							}
300
301
302							sub parseBedLine {
303	1			1	1	5	my $line = shift;
304	1					5	my %args = @_;
305	1					11	my($chr,$start,$end,$name,$score,$strand,$thick_start,$thick_end,$rgb,$block_count,$block_size,$block_starts) = split("\t",$line);
306	1					4	my @blocks;
307
308							# Manage blocks if we have a bed12
309	1	50				7	if(defined $block_starts) {
310	1					6	my @block_size = split(",",$block_size);
311	1					7	my @block_starts = split(",",$block_starts);
312	1					4	my $cumulated_block_size = 0;
313	1					9	for(my $i = 0; $i < $block_count; $i++) {
314	2					23	push(@blocks,{size => $block_size[$i],
315							start => $block_starts[$i],
316							end => $block_starts[$i] + $block_size[$i],
317							block_start => $cumulated_block_size,
318							block_end => $cumulated_block_size + $block_size[$i],
319							ref_start => $block_starts[$i] + $start,
320							ref_end => $block_starts[$i] + $start + $block_size[$i],
321							});
322	2					11	$cumulated_block_size += $block_size[$i];
323							}
324							}
325
326	1					16	return { chr => $chr,
327							start => $start,
328							end => $end,
329							name => $name,
330							score => $score,
331							strand => $strand,
332							thick_start => $thick_start,
333							thick_end => $thick_end,
334							rgb => $rgb,
335							blocks => \@blocks,
336							};
337							}
338
339
340							sub parseGFFLine {
341	1			1	1	5	my $line = shift;
342	1					4	my $type = shift;
343	1					3	my $attribute_split;
344	1	50	0			7	if($type =~ /gff3/i) {
		0
345	1			4		7	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)=(.*)/;};
	4					9
	4					28
346							} elsif ($type eq 'gtf' \|\| $type eq 'gff2') {
347	0			0		0	$attribute_split = sub {my $attr = shift; return $attr =~ /(\S+)\s+"?(.*)"?/;};
	0					0
	0					0
348							} else {
349	0					0	croak "Undefined GFF format (must be either gff3,gtf, or gff2)";
350							}
351	1					10	my($chr,$source,$feature,$start,$end,$score,$strand,$frame,$attributes) = split("\t",$line);
352	1					7	my @attributes_tab = split(";",$attributes);
353	1					3	my %attributes_hash;
354	1					4	foreach my $attr (@attributes_tab) {
355	4					15	my ($k,$v) = $attribute_split->($attr);
356	4	50	33			28	if(defined $k and defined $v) {
357	4					18	$attributes_hash{$k} = $v;
358							}
359							}
360	1					17	return { chr => $chr,
361							source => $source,
362							feature => $feature,
363							start => $start,
364							end => $end,
365							score => $score,
366							strand => $strand,
367							frame => $frame,
368							attributes => \%attributes_hash,
369							};
370							}
371
372
373							sub parseVCFLine {
374	1			1	1	2	my $line = shift;
375	1					6	my($chr,$pos,$id,$ref,$alt,$qual,$filter,$info) = split("\t",$line);
376
377	1	50				10	my @alts = defined $alt? split(",",$alt) : ();
378							my %infos = defined $info?
379	1	50				7	map { my ($k,$v) = split '='; $k => $v } split ';', $info : ();
	5					12
	5					15
380
381	1					11	return { chr => $chr,
382							pos => $pos,
383							id => $id,
384							ref => $ref,
385							alt => \@alts,
386							qual => $qual,
387							filter => $filter,
388							info => \%infos,
389							};
390							}
391
392
393							sub parseChimCTLine {
394	0			0	1	0	my $line = shift;
395	0					0	my($id,$name,$chr1,$pos1,$strand1,$chr2,$pos2,$strand2,$chim_value,$spanning_junction,$spanning_PE,$class,$comments,@others) = split("\t",$line);
396	0					0	my($sample,$chim_key) = split(":",$id);
397	0					0	my @comments = split(",",$comments);
398	0					0	my %comments;
399	0					0	foreach my $com (@comments){
400	0					0	my ($key,$val) = split("=",$com);
401	0					0	$comments{$key} = $val;
402							}
403	0					0	my %extend_fields;
404	0					0	foreach my $field (@others) {
405	0					0	my ($key,$val) = split("=",$field);
406	0	0	0			0	if(defined $key && defined $val) {
407	0					0	$extend_fields{$key} = $val;
408							}
409							}
410							return {
411	0					0	sample => $sample,
412							chim_key => $chim_key,
413							name => $name,
414							chr1 => $chr1,
415							pos1 => $pos1,
416							strand1 => $strand1,
417							chr2 => $chr2,
418							pos2 => $pos2,
419							strand2 => $strand2,
420							chim_value => $chim_value,
421							spanning_junction => $spanning_junction,
422							spanning_PE => $spanning_PE,
423							class => $class,
424							comments => \%comments,
425							extended_fields => \%extend_fields,
426							};
427							}
428
429
430							sub parseSAMLineLite {
431	1			1	1	5723	my $line = shift;
432	1					7	my ($qname,$flag,$rname,$pos,$mapq,$cigar,$rnext,$pnext,$tlen,$seq,$qual,@others) = split("\t",$line);
433	1					9	my @cigar_hash = map { { op => substr($_,-1), nb => substr($_,0,length($_)-1)} } $cigar =~ /(\d+\D)/g;
	3					13
434							return {
435	1					16	qname => $qname,
436							flag => $flag,
437							rname => $rname,
438							pos => $pos,
439							mapq => $mapq,
440							cigar => \@cigar_hash,
441							original_cigar => $cigar,
442							rnext => $rnext,
443							pnext => $pnext,
444							tlen => $tlen,
445							seq => $seq,
446							qual => $qual,
447							extended_fields => \@others,
448							};
449							}
450
451
452							sub parseCigarChain {
453	1			1	1	10	my $cigar_string = shift;
454	1					2	my @cigar;
455	1					7	while($cigar_string =~ /(\d+)(\S)/g) {
456	5					22	push @cigar, { op => $2, nb => $1 };
457							}
458	1					5	return \@cigar;
459							}
460
461
462
463							sub getFileIterator {
464	4			4	1	40	my %args = @_;
465	4					19	my $file = $args{file};
466	4					13	my $type = $args{type};
467	4					14	my $skip = $args{skip};
468	4					13	my $header_regex = $args{header_regex};
469	4					13	my $parsing_method = $args{parsing_method};
470	4					15	my @parsing_arguments = ();
471
472
473	4	50				22	croak "Missing arguments in getFileIterator" if !defined $file;
474
475	4	100	66			36	if(!defined $parsing_method && defined $type) {
476	3	100	66			51	if($type =~ /gff3/i \|\| $type eq 'gtf' \|\| $type eq 'gff2') {
		100	66
		50	0
		0
		0
477	1					4	$header_regex = '^#';
478	1			1		9	$parsing_method = sub { return parseGFFLine(@_,$type) };
	1					6
479	1					5	push (@parsing_arguments,$type);
480							} elsif($type =~ /bed/i) {
481	1					3	$header_regex = '(^track\s)\|(^#)';
482	1					5	$parsing_method = \&parseBedLine;
483							} elsif ($type =~ /vcf/i) {
484	1					4	$header_regex = '^#';
485	1					4	$parsing_method = \&parseVCFLine;
486							} elsif ($type =~ /chimCT/i) {
487	0					0	$header_regex = '^#';
488	0					0	$parsing_method = \&parseChimCTLine;
489							} elsif ($type =~ /SAM/i \|\| $type =~ /BAM/i) {
490	0					0	$header_regex = '^@';
491	0					0	$parsing_method = \&parseSAMLineLite;
492							} else {
493	0					0	croak "Undefined format type";
494							}
495							}
496
497							# set defaults
498							#$header_regex = '^#' if !defined $header_regex;
499	4	50		0		18	$parsing_method = sub{ return { line => shift } } if !defined $parsing_method;
	0					0
500
501							# We get a filehandle on the file
502	4					21	my $fh = getReadingFileHandle($file);
503
504							# $curr_pos will hold the SEEK_POS of the current_line
505	4					26	my $curr_pos = tell($fh);
506							# $line will hold the content of the current_line
507	4					51	my $line = <$fh>;
508
509
510							# if we need to skip lines
511	4	50				20	if(defined $skip) {
512	0					0	for(my $i = 0; $i < $skip; $i++) {
513	0					0	$curr_pos = tell($fh);
514	0					0	$line = <$fh>;
515							}
516							}
517
518							# Skip line that match a specific regex
519	4	50				17	if(defined $header_regex) {
520	4		66			98	while($line && $line =~ /$header_regex/) {
521	22					41	$curr_pos = tell($fh);
522	22					114	$line = <$fh>;
523							}
524							}
525
526							# The iterator itself (an unnamed subroutine
527							return sub {
528	35	100		35		91	if (defined $line) {
529	34					63	chomp $line;
530							# We parse the line with the appropriate methd
531	34					82	my $parsed_line = $parsing_method->($line,@parsing_arguments);
532							# Add seek pos to the output hashref
533	34					64	$parsed_line->{seek_pos} = $curr_pos;
534	34					88	$parsed_line->{_original_line} = $line;
535	34					69	$curr_pos = tell($fh);
536	34					86	$line = <$fh>; # Get next line
537	34					107	return $parsed_line;
538							} else {
539	1					17	return undef;
540							}
541	4					38	};
542							}
543
544
545							sub getReadingFileHandle {
546	10			10	1	27	my $file = shift;
547	10					27	my $fh;
548	10	50				59	if($file =~ /\.gz$/) {
549	0	0				0	open($fh,"gunzip -c $file \|") or die ("Cannot open $file");
550							} else {
551	10	50				238	open($fh,"< $file") or die ("Cannot open $file");
552							}
553	10					480	return $fh;
554							}
555
556
557							sub getWritingFileHandle {
558	0			0	1		my $file = shift;
559	0						my $fh;
560	0	0					if($file =~ /\.gz$/) {
561	0	0					open($fh,"\| gzip > $file") or die ("Cannot open $file");
562							} else {
563	0	0					open($fh,"> $file") or die ("Cannot open $file");
564							}
565	0						return $fh;
566							}
567
568
569							sub getLineFromSeekPos {
570	0			0	1		my($fh,$seek_pos) = @_;
571	0						seek($fh,$seek_pos,SEEK_SET);
572	0						my $line = <$fh>;
573	0						chomp($line);
574	0						return $line;
575							}
576
577							1;
578
579							__END__