File Coverage

blib/lib/BioUtil/Util.pm

Criterion	Covered	Total	%
statement	112	234	47.8
branch	27	104	25.9
condition	8	27	29.6
subroutine	20	34	58.8
pod	21	21	100.0
total	188	420	44.7

line	stmt	bran	cond	sub	pod	time	code
1							package BioUtil::Util;
2
3	1			1		37646	use File::Path qw/remove_tree/;
	1					2
	1					140
4	1			1		905	use Time::HiRes qw/time/;
	1					3113
	1					6
5
6							require Exporter;
7							@ISA = (Exporter);
8							@EXPORT = qw(
9							getopt
10
11							file_list_from_argv
12							get_file_list
13
14							delete_string_elements_by_indexes
15							delete_array_elements_by_indexes
16
17							extract_parameters_from_string
18							get_parameters_from_file
19
20							get_list_from_file
21							get_column_data
22
23							read_json_file
24							write_json_file
25
26							run
27							run_time
28							readable_second
29
30							check_positive_integer
31							mean_and_stdev
32
33							filename_prefix
34							check_all_files_exist
35							check_in_out_dir
36							rm_and_mkdir
37
38							get_paired_fq_gz_file_from_dir
39
40							);
41
42	1			1		1734	use vars qw($VERSION);
	1					2
	1					65
43
44	1			1		34	use 5.010_000;
	1					3
	1					46
45	1			1		5	use strict;
	1					2
	1					53
46	1			1		5	use warnings FATAL => 'all';
	1					2
	1					57
47
48	1			1		2893	use Encode qw/ encode_utf8 /;
	1					19923
	1					149
49	1			1		13	use File::Path qw(make_path remove_tree);
	1					2
	1					89
50	1			1		9	use File::Find;
	1					2
	1					93
51	1			1		23	use File::Basename;
	1					2
	1					105
52	1			1		1838	use JSON;
	1					21669
	1					12
53
54							=head1 NAME
55
56							BioUtil::Util - Utilities for operation on data or file
57
58							Some great modules like BioPerl provide many robust solutions.
59							However, it is not easy to install for someone in some platforms.
60							And for some simple task scripts, a lite module may be a good choice.
61							So I reinvented some wheels and added some useful utilities into this module,
62							hoping it would be helpful.
63
64							=head1 VERSION
65
66							Version 2015.0228
67
68							=cut
69
70							our $VERSION = 2015.0228;
71
72							=head1 EXPORT
73							getopt
74
75							file_list_from_argv
76							get_file_list
77
78							delete_string_elements_by_indexes
79							delete_array_elements_by_indexes
80
81							extract_parameters_from_string
82							get_parameters_from_file
83
84							get_list_from_file
85							get_column_data
86
87							read_json_file
88							write_json_file
89
90							run
91							run_time
92							readable_second
93
94							check_positive_integer
95							mean_and_stdev
96
97							filename_prefix
98							check_all_files_exist
99							check_in_out_dir
100							rm_and_mkdir
101
102							get_paired_fq_gz_file_from_dir
103
104							=head1 SYNOPSIS
105
106							use BioUtil::Util;
107
108
109							=head1 SUBROUTINES/METHODS
110
111							=head2 getopt
112
113							getopt FOR ME
114
115							Example
116							-a b -c t tt -d bb -dbtype asdfafd -test
117
118							-a: b
119							-c: ARRAY(0xee25e8)
120							-d: bb
121							-dbtype: asdfafd
122							-infmt: fasta
123							-test: 1
124
125							=cut
126
127							sub getopt {
128	0			0	1	0	my ( $opts, $list ) = @_;
129	0	0	0			0	return "\$opts should be ref of hash, and \$list should be ref of list\n"
130							unless ref $opts eq ref {}
131							and ref $list eq ref [];
132	0					0	my ( $o, $opt ) = (undef) x 2;
133	0					0	while (@$list) {
134	0					0	$o = shift @$list;
135	0	0				0	if ( $o =~ /^\-/ ) {
136	0					0	$opt = $o;
137	0	0				0	$$opts{$opt} = 'http:shenwei.me' unless exists $$opts{$opt};
138							}
139							else {
140	0	0				0	if ( $$opts{$opt} ne 'http:shenwei.me' ) {
141	0	0				0	$$opts{$opt} = [ $$opts{$opt} ]
142							if ref $$opts{$opt} ne ref [];
143	0					0	push @{ $$opts{$opt} }, $o;
	0					0
144							}
145							else {
146	0					0	$$opts{$opt} = $o;
147							}
148							}
149							}
150	0	0				0	for ( keys %$opts ) { $$opts{$_} = 1 if $$opts{$_} eq 'http:shenwei.me'; }
	0					0
151	0					0	return $opts;
152							}
153
154							=head2 file_list_from_argv
155
156							Get file list from @ARGV. You should use this after parsing options!
157
158							When no arguments given, 'STDIN' will be added to
159							the list, which could be further used by, e.g. FastaReader.
160
161							=cut
162
163							sub file_list_from_argv {
164	0			0	1	0	my @files = ();
165	0					0	for my $file (@_) {
166	0					0	for my $f ( glob $file ) {
167	0					0	push @files, $f;
168							}
169							}
170	0	0				0	if ( @files == 0 ) {
171	0					0	push @files, 'STDIN';
172							}
173	0					0	return @files;
174							}
175
176							=head2 get_file_list
177
178							Find files/directories with custom filter,
179							max serach depth could be specified.
180
181							Example (searching perl scripts)
182
183							my $dir = "~";
184							my $depth = 2;
185
186							my $list = get_file_list(
187							$dir,
188							sub {
189							if ( -d or /^\./i ) { # ignore configuration file and folders
190							return 0;
191							}
192							if (/\.pm/i or /\.pl/i) {
193							return 1;
194							}
195							return 0;
196							},
197							$depth
198							);
199							print "$_\n" for @$list;
200
201							=cut
202
203							sub get_file_list {
204
205							# filter is a subroutine to filter a file
206	1			1	1	15	my ( $dir, $filter, $depth ) = @_;
207	1					10	$dir =~ s/\/+/\//g;
208	1					6	$dir =~ s/\/$//;
209
210	1	50				6	$depth = 1 << 30 unless defined $depth;
211	1	50	33			17	unless ( $depth =~ /^\d+$/ and $depth > 0 ) {
212	0					0	warn "depth should be positive integer\n";
213	0					0	return [];
214							}
215
216							# print "$dir\n";
217	1					4	my $depth0 = $dir =~ tr/\//\//;
218
219	1					4	my $files = [];
220							my $wanted = sub {
221	57	100		57		1006	return if /^\.+$/;
222	56	50				118	return if $_ eq $dir;
223
224							# check depth
225	56	100				1659	return if $File::Find::name =~ tr/\//\// - $depth0 > $depth;
226
227	14	100				42	if ( &$filter($_) ) {
228	1					37	push @$files, $File::Find::name;
229							}
230	1					10	};
231
232	1					127	find( $wanted, ($dir) );
233
234	1					12	return $files;
235							}
236
237							=head2 delete_string_elements_by_indexes
238
239							Delete string elements by indexes, it uses delete_array_elements_by_indexes
240
241							=cut
242
243							sub delete_string_elements_by_indexes {
244	0			0	1	0	my ( $str, $ids ) = @_;
245	0					0	my $t = '';
246	0	0	0			0	unless ( ref $str eq ref \$t and ref $ids eq ref [] ) {
247	0					0	die "both arguments should be array reference\n";
248							}
249	0					0	my @bytes = split //, $$str;
250	0					0	return join "", @{ delete_array_elements_by_indexes( \@bytes, $ids ) };
	0					0
251							}
252
253							=head2 delete_array_elements_by_indexes
254
255							Delete array elements by given indexes.
256
257							Example:
258
259							@list = qw(a b c d e f);
260							@idx = (1, 2, 4);
261							$list2 = delete_array_elements_by_indexes(\@list, \@idx);
262							print "@$list2\n"; # result: a, d, f
263
264							=cut
265
266							sub delete_array_elements_by_indexes {
267	0			0	1	0	my ( $array, $ids ) = @_;
268	0	0	0			0	unless ( ref $array eq ref [] and ref $ids eq ref [] ) {
269	0					0	die "both arguments should be array reference\n";
270							}
271	0					0	my %omitted = map { $_ => 1 } @$ids;
	0					0
272	0					0	my @newarray = ();
273	0					0	for my $i ( 0 .. ( scalar(@$array) - 1 ) ) {
274	0	0				0	next if exists $omitted{$i};
275	0					0	push @newarray, $$array[$i];
276							}
277	0					0	return \@newarray;
278							}
279
280							=head2 extract_parameters_from_string
281
282							Extract parameters from string.
283
284							The regular expression is
285
286							/([\w\d\_\-\.]+)\s=\s([^\=;])[\s;]/
287
288							Example:
289
290							# bad format, but could also be parsed
291							# my $s = " s = b; a=test; b_c=12 3; a.b =; b
292							# = asdf
293							# sd; ads-f = 12313";
294
295							# recommended
296							my $s = "key1=abcde; key2=123; conf.a=file; conf.b=12; ";
297
298							my $pa = extract_parameters_from_string($s);
299							print "=$_:$$p{$_}=\n" for sort keys %$pa;
300
301							=cut
302
303							sub extract_parameters_from_string {
304	1			1	1	1132	my ($s) = @_;
305	1					4	my $parameters = {};
306	1					16	while ( $s =~ /([\w\d\_\-\.]+)\s=\s([^\=;])[\s;]/gm ) {
307	4	50				17	warn "$1 was defined more than once\n" if defined $$parameters{$1};
308	4					21	$$parameters{$1} = $2;
309							}
310	1					5	return $parameters;
311							}
312
313							=head2 get_parameters_from_file
314
315							Get parameters from a file.
316							Comments start with # are allowed in file.
317
318							Example:
319
320							my $pa = get_parameters_from_file("d.txt");
321							print "$_: $$pa{$_}\n" for sort keys %$pa;
322
323							For a file with content:
324
325							# cell phone
326							apple = 1 # note
327
328							nokia = 2 #
329
330							output is:
331
332							apple: 1
333							nokia: 2
334
335							=cut
336
337							sub get_parameters_from_file {
338	1			1	1	403	my ($file) = @_;
339	1					4	my $parameters = {};
340	1	50				72	open my $fh, $file or die "fail to open file $file\n";
341	1					35	while (<$fh>) {
342	4					27	s/^\s+\|\s+$//g;
343	4	100	100			32	next if $_ eq '' # blank line
344							or /^#/; # annotation
345	2					8	s/#.*//g; # delete annotation
346
347	2	50				14	next unless /([\w\d\_\-\.]+)\s=\s(.+)/;
348	2					21	$$parameters{$1} = $2;
349							}
350	1					13	close $fh;
351	1					7	return $parameters;
352							}
353
354							=head2 get_list_from_file
355
356							Get list from a file.
357							Comments start with # are allowed in file.
358
359							Example:
360
361							my $list = get_list_from_file("d.txt");
362							print "$_\n" for @$list;
363
364							For a file with content:
365
366							# cell phone
367							apple # note
368
369							nokia
370
371							output is:
372
373							apple
374							nokia
375
376							=cut
377
378							sub get_list_from_file {
379	1			1	1	322	my ($file) = @_;
380	1	50				42	open my $fh, "<", $file or die "fail to open file $file\n";
381	1					3	my @list = ();
382	1					13	while (<$fh>) {
383	4					20	s/\r?\n//g;
384	4					16	s/^\s+\|\s+$//g;
385	4	100	100			26	next if $_ eq '' # blank line
386							or /^#/; # annotation
387	2					6	s/#.*//g; # delete annotation
388
389	2					12	push @list, $_;
390							}
391	1					8	close $fh;
392	1					8	return \@list;
393							}
394
395							=head2 get_column_data
396
397							Get one column of a file.
398
399							Example:
400
401							my $list = get_column_data("d.txt", 2);
402							print "$_\n" for @$list;
403
404							=cut
405
406							sub get_column_data {
407	1			1	1	317	my ( $file, $column, $delimiter ) = @_;
408	1	50	33			20	unless ( $column =~ /^(\d+)$/ and $column > 0 ) {
409	0					0	warn "column number ($column) should be positive integer\n";
410	0					0	$column = 1;
411							}
412	1	50				5	$delimiter = "\t" unless defined $delimiter;
413
414	1	50				42	open my $fh, "<", $file or die "failed to open file: $file\n";
415	1					3	my @linedata = ();
416	1					2	my @data = ();
417	1					3	my $n = 0;
418	1					12	while (<$fh>) {
419	4					21	s/\r?\n//;
420	4	50				12	next if /^\s*#/;
421	4					40	@linedata = split /$delimiter/, $_;
422	4					7	$n = scalar @linedata;
423	4	100				19	next unless $n > 0;
424
425	3	50				5	if ( $column > $n ) {
426	0					0	die
427							"number of columns of this line ($n) is less than given column number ($column)\n$_";
428							}
429
430	3					14	push @data, $linedata[ $column - 1 ];
431							}
432	1					9	close $fh;
433
434	1					8	return \@data;
435							}
436
437							=head2 read_json_file
438
439							Read json file and decode it into a hash ref.
440
441							Example:
442
443							my $hashref = read_json_file($file);
444
445							=cut
446
447							sub read_json_file {
448	1			1	1	8	my ($file) = @_;
449	1	50				33	open my $fh, "<:encoding(utf8)", $file
450							or die "fail to open json file: $file\n";
451	1					51	my $text;
452	1					29	while (<$fh>) {
453	5					40	s/\s#+.\r?\n?//g; # remove annotation
454	5	50				59	$text .= $1 if / *(.+)/;
455							}
456	1					16	close $fh;
457	1					24	my $hash = decode_json($text);
458	1					9	return $hash;
459							}
460
461							=head2 write_json_file
462
463							Write a hash ref into a file.
464
465							Example:
466
467							my $hashref = { "a" => 1, "b" => 2 };
468							write_json_file($hashref, $file);
469
470							=cut
471
472							sub write_json_file {
473	1			1	1	343	my ( $hash, $file ) = @_;
474	1					28	my $json = JSON->new->allow_nonref;
475	1					37	my $text = $json->pretty->encode($hash);
476	1					9	$text = encode_utf8($text);
477	1	50		1		13	open my $fh2, ">:encoding(utf8)", $file
	1					2
	1					9
	1					71
478							or die "fail to open json file: $file\n";
479	1					1868	print $fh2 $text;
480	1					92	close $fh2;
481							}
482
483							=head2 run
484
485							Run a command
486
487							Example:
488
489							my $fail = run($cmd);
490							die "failed to run:$cmd\n" if $fail;
491
492							=cut
493
494							sub run {
495	0			0	1		my ($cmd) = @_;
496	0						system($cmd);
497
498	0	0					if ( $? == -1 ) {
		0
499	0						die "[ERROR] fail to run: $cmd. Command ("
500							. ( split /\s+/, $cmd )[0]
501							. ") not found\n";
502							}
503							elsif ( $? & 127 ) {
504	0	0					printf STDERR "[ERROR] command died with signal %d, %s coredump\n",
505							( $? & 127 ), ( $? & 128 ) ? 'with' : 'without';
506							}
507							else {
508							# 0, ok
509							}
510	0						return $?;
511							}
512
513							=head2 run_time
514
515							Run a subroutine with given arguments N times, and return the mean and stdev
516							of time.
517
518							Example:
519
520							my $read_by_record = sub {
521							my ($file) = @_;
522							my $next_seq = FastaReader($file);
523							while ( my $fa = &$next_seq() ) {
524							my ( $header, $seq ) = @$fa;
525							# print ">$header\n$seq\n";
526							}
527							};
528
529							my ($mean, $stdev) = run_time( 8, $read_by_record, $file );
530							printf STDERR "\n## Compute time: %0.03f ± %0.03f s\n\n", $mean, $stdev;
531
532							=cut
533
534							sub run_time {
535	0			0	1		my ( $n, $sub, @args ) = @_;
536	0	0	0				die "first argument should be positive integer"
537							unless $n =~ /^\d+$/ and $n > 0;
538
539	0						my $t0 = 0;
540	0						my @ts = ();
541	0						for ( 1 .. $n ) {
542	0						$t0 = time;
543	0						&$sub(@args); # call $sub
544	0						push @ts, time - $t0;
545							}
546
547	0						return mean_and_stdev( \@ts );
548							}
549
550							=head2 readable_second
551
552							readable_second
553
554							Example:
555
556							print readable_second(11312314),"\n"; # 130 day 22 hour 18 min 34 sec
557
558							=cut
559
560							sub readable_second ($) {
561	0			0	1		my ($seconds) = @_;
562	0	0					return "Positive integer need." unless $seconds =~ /^\d+$/;
563
564	0						my $time = "";
565	0						my $has_bigger_unit = 0;
566
567	0						my $days = $seconds / 86400;
568	0	0					if ( $days >= 1 ) {
569	0						$time .= ( int $days ) . " day ";
570	0						$has_bigger_unit = 1;
571							}
572	0						$seconds = $seconds % 86400;
573
574	0						my $hours = $seconds / 3600;
575	0	0					if ( $hours >= 1 ) {
		0
576	0						$time .= ( int $hours ) . " hour ";
577	0						$has_bigger_unit = 1;
578							}
579	0						elsif ($has_bigger_unit) { $time .= 0 . " hour "; }
580	0						$seconds = $seconds % 3600;
581
582	0						my $minutes = $seconds / 60;
583	0	0					if ( $minutes >= 1 ) {
		0
584	0						$time .= ( int $minutes ) . " min ";
585	0						$has_bigger_unit = 1;
586							}
587	0						elsif ($has_bigger_unit) { $time .= 0 . " min "; }
588	0						$seconds = $seconds % 60;
589
590	0						$time .= $seconds . " sec";
591	0						return $time;
592							}
593
594							=head2 check_positive_integer
595
596							Check Positive Integer
597
598							Example:
599
600							check_positive_integer(1);
601
602							=cut
603
604							sub check_positive_integer {
605	0			0	1		my ($n) = @_;
606	0	0	0				die "positive integer needed ($n given)"
607							unless $n =~ /^\d+$/ and $n != 0;
608							}
609
610							=head2 mean_and_stdev
611
612							return mean and stdev of a list
613
614							Example:
615							my @list = qq/1 2 3/;
616							mean_and_stdev(\@list);
617
618							=cut
619							sub mean_and_stdev($) {
620	0			0	1		my ($list) = @_;
621	0	0					return ( 0, 0 ) if @$list == 0;
622	0						my $sum = 0;
623	0						$sum += $_ for @$list;
624	0						my $sum_square = 0;
625	0						$sum_square += $_ * $_ for @$list;
626	0						my $mean = $sum / @$list;
627	0						my $variance = $sum_square / @$list - $mean * $mean;
628	0						my $std = sqrt $variance;
629	0						return ( $mean, $std );
630							}
631
632							=head2 filename_prefix
633
634							Get filename prefix
635
636							Example:
637
638							filename_prefix("test.fa"); # "test"
639							filename_prefix("tmp"); # "tmp"
640
641							=cut
642
643							sub filename_prefix {
644	0			0	1		my ($file) = @_;
645	0	0					if ( $file =~ /(.+)\..+?$/ ) {
646	0						return $1;
647							}
648							else {
649	0						return $file;
650							}
651							}
652
653							=head2 check_all_files_exist
654
655							Check whether all files existed.
656
657							=cut
658
659							sub check_all_files_exist {
660	0			0	1		my $flag = 1;
661	0						for (@_) {
662	0	0					if ( not -e $_ ) {
663	0						return 0;
664							}
665							}
666	0						return 1;
667							}
668
669							=head2 check_in_out_dir
670
671							Check in and $fh2 directory.
672
673							Example:
674
675							check_in_out_dir("~/dir", "~/dir.out");
676
677							=cut
678
679							sub check_in_out_dir {
680	0			0	1		my ( $in, $out ) = @_;
681	0	0					die "dir $in not found."
682							unless -e $in;
683
684	0	0					die "$in is not a directory.\n"
685							unless -d $in;
686
687	0						$in =~ s/\/$//;
688	0						$out =~ s/\/$//;
689	0	0					die "out dir shoud be different from in dir!\n"
690							if $in eq $out;
691							}
692
693							=head2 rm_and_mkdir
694
695							Make a directory, remove it firstly if it exists.
696
697							Example:
698
699							rm_and_mkdir("out")
700
701							=cut
702
703							sub rm_and_mkdir {
704	0			0	1		my ($dir) = @_;
705	0	0					if ( -e $dir ) {
706	0	0					remove_tree($dir) or die "fail to remove: $dir\n";
707							}
708	0	0					mkdir $dir or die "fail to mkdir: $dir\n";
709							}
710
711							=head2 get_paired_fq_gz_file_from_dir
712
713							Example:
714
715							# .
716							# ├── test_1.fq.gz
717							# └── test_2.fq.gz
718							for my $pe ( get_paired_fq_gz_file_from_dir($indir) ) {
719							# test_1.fq.gz, test_1.fq.gz, test
720							my ( $fqfile1, $fqfile2, $id ) = @$pe;
721
722							}
723
724							=cut
725
726							sub get_paired_fq_gz_file_from_dir {
727	0			0	1		my ($dir) = @_;
728	0						my @files;
729	0						for ( sort glob "$dir/*_1.fq.gz" ) {
730	0						/\/?([^\/]+)_1.fq.gz/;
731	0						my $id = $1;
732	0	0					next unless -e "$dir/${id}_2.fq.gz";
733	0						push @files, [ "$dir/${id}_1.fq.gz", "$dir/${id}_2.fq.gz", "$dir/${id}", $id ];
734							}
735	0						return @files;
736							}
737							1;