File Coverage

blib/lib/BioUtil/Util.pm

Criterion	Covered	Total	%
statement	111	241	46.0
branch	27	106	25.4
condition	8	27	29.6
subroutine	20	35	57.1
pod	21	22	95.4
total	187	431	43.3

line	stmt	bran	cond	sub	pod	time	code
1							package BioUtil::Util;
2
3	1			1		18364	use File::Path qw/remove_tree/;
	1					2
	1					74
4	1			1		729	use Time::HiRes qw/time/;
	1					1380
	1					4
5
6							require Exporter;
7							@ISA = (Exporter);
8							@EXPORT = qw(
9							getopt
10
11							file_list_from_argv
12							get_file_list
13
14							delete_string_elements_by_indexes
15							delete_array_elements_by_indexes
16
17							extract_parameters_from_string
18							get_parameters_from_file
19
20							get_list_from_file
21							get_column_data
22
23							read_json_file
24							write_json_file
25
26							run
27							run_time
28							readable_second
29
30							check_positive_integer
31							mean_and_stdev
32
33							filename_prefix
34							check_all_files_exist
35							check_in_out_dir
36							rm_and_mkdir
37
38							get_paired_fq_gz_file_from_dir
39							get_paired_fa_gz_file_from_dir
40
41							);
42
43	1			1		175	use vars qw($VERSION);
	1					1
	1					44
44
45	1			1		15	use 5.010_000;
	1					3
46	1			1		4	use strict;
	1					2
	1					26
47	1			1		4	use warnings FATAL => 'all';
	1					2
	1					53
48
49	1			1		797	use Encode qw/ encode_utf8 /;
	1					10199
	1					68
50	1			1		7	use File::Path qw(make_path remove_tree);
	1					1
	1					44
51	1			1		5	use File::Find;
	1					2
	1					59
52	1			1		11	use File::Basename;
	1					3
	1					80
53	1			1		837	use JSON;
	1					15111
	1					4
54
55							=head1 NAME
56
57							BioUtil::Util - Utilities for operation on data or file
58
59							Some great modules like BioPerl provide many robust solutions.
60							However, it is not easy to install for someone in some platforms.
61							And for some simple task scripts, a lite module may be a good choice.
62							So I reinvented some wheels and added some useful utilities into this module,
63							hoping it would be helpful.
64
65							=head1 VERSION
66
67							Version 2015.0228
68
69							=cut
70
71							our $VERSION = 2015.0228;
72
73							=head1 EXPORT
74							getopt
75
76							file_list_from_argv
77							get_file_list
78
79							delete_string_elements_by_indexes
80							delete_array_elements_by_indexes
81
82							extract_parameters_from_string
83							get_parameters_from_file
84
85							get_list_from_file
86							get_column_data
87
88							read_json_file
89							write_json_file
90
91							run
92							run_time
93							readable_second
94
95							check_positive_integer
96							mean_and_stdev
97
98							filename_prefix
99							check_all_files_exist
100							check_in_out_dir
101							rm_and_mkdir
102
103							get_paired_fq_gz_file_from_dir
104							get_paired_fa_gz_file_from_dir
105
106							=head1 SYNOPSIS
107
108							use BioUtil::Util;
109
110
111							=head1 SUBROUTINES/METHODS
112
113							=head2 getopt
114
115							getopt FOR ME
116
117							Example
118							-a b -c t tt -d bb -dbtype asdfafd -test
119
120							-a: b
121							-c: ARRAY(0xee25e8)
122							-d: bb
123							-dbtype: asdfafd
124							-infmt: fasta
125							-test: 1
126
127							=cut
128
129							sub getopt {
130	0			0	1	0	my ( $opts, $list ) = @_;
131	0	0	0			0	return "\$opts should be ref of hash, and \$list should be ref of list\n"
132							unless ref $opts eq ref {}
133							and ref $list eq ref [];
134	0					0	my ( $o, $opt ) = (undef) x 2;
135	0					0	while (@$list) {
136	0					0	$o = shift @$list;
137	0	0				0	if ( $o =~ /^\-/ ) {
138	0					0	$opt = $o;
139	0	0				0	$$opts{$opt} = 'http:shenwei.me' unless exists $$opts{$opt};
140							}
141							else {
142	0	0				0	if ( $$opts{$opt} ne 'http:shenwei.me' ) {
143							$$opts{$opt} = [ $$opts{$opt} ]
144	0	0				0	if ref $$opts{$opt} ne ref [];
145	0					0	push @{ $$opts{$opt} }, $o;
	0					0
146							}
147							else {
148	0					0	$$opts{$opt} = $o;
149							}
150							}
151							}
152	0	0				0	for ( keys %$opts ) { $$opts{$_} = 1 if $$opts{$_} eq 'http:shenwei.me'; }
	0					0
153	0					0	return $opts;
154							}
155
156							=head2 file_list_from_argv
157
158							Get file list from @ARGV. You should use this after parsing options!
159
160							When no arguments given, 'STDIN' will be added to
161							the list, which could be further used by, e.g. FastaReader.
162
163							=cut
164
165							sub file_list_from_argv {
166	0			0	1	0	my @files = ();
167	0					0	for my $file (@_) {
168	0					0	for my $f ( glob $file ) {
169	0					0	push @files, $f;
170							}
171							}
172	0	0				0	if ( @files == 0 ) {
173	0					0	push @files, 'STDIN';
174							}
175	0					0	return @files;
176							}
177
178							=head2 get_file_list
179
180							Find files/directories with custom filter,
181							max serach depth could be specified.
182
183							Example (searching perl scripts)
184
185							my $dir = "~";
186							my $depth = 2;
187
188							my $list = get_file_list(
189							$dir,
190							sub {
191							if ( -d or /^\./i ) { # ignore configuration file and folders
192							return 0;
193							}
194							if (/\.pm/i or /\.pl/i) {
195							return 1;
196							}
197							return 0;
198							},
199							$depth
200							);
201							print "$_\n" for @$list;
202
203							=cut
204
205							sub get_file_list {
206
207							# filter is a subroutine to filter a file
208	1			1	1	11	my ( $dir, $filter, $depth ) = @_;
209	1					7	$dir =~ s/\/+/\//g;
210	1					5	$dir =~ s/\/$//;
211
212	1	50				4	$depth = 1 << 30 unless defined $depth;
213	1	50	33			14	unless ( $depth =~ /^\d+$/ and $depth > 0 ) {
214	0					0	warn "depth should be positive integer\n";
215	0					0	return [];
216							}
217
218							# print "$dir\n";
219	1					4	my $depth0 = $dir =~ tr/\//\//;
220
221	1					2	my $files = [];
222							my $wanted = sub {
223	56	100		56		955	return if /^\.+$/;
224	55	50				117	return if $_ eq $dir;
225
226							# check depth
227	55	100				1253	return if $File::Find::name =~ tr/\//\// - $depth0 > $depth;
228
229	14	100				32	if ( &$filter($_) ) {
230	1					28	push @$files, $File::Find::name;
231							}
232	1					6	};
233
234	1					88	find( $wanted, ($dir) );
235
236	1					8	return $files;
237							}
238
239							=head2 delete_string_elements_by_indexes
240
241							Delete string elements by indexes, it uses delete_array_elements_by_indexes
242
243							=cut
244
245							sub delete_string_elements_by_indexes {
246	0			0	1	0	my ( $str, $ids ) = @_;
247	0					0	my $t = '';
248	0	0	0			0	unless ( ref $str eq ref \$t and ref $ids eq ref [] ) {
249	0					0	die "both arguments should be array reference\n";
250							}
251	0					0	my @bytes = split //, $$str;
252	0					0	return join "", @{ delete_array_elements_by_indexes( \@bytes, $ids ) };
	0					0
253							}
254
255							=head2 delete_array_elements_by_indexes
256
257							Delete array elements by given indexes.
258
259							Example:
260
261							@list = qw(a b c d e f);
262							@idx = (1, 2, 4);
263							$list2 = delete_array_elements_by_indexes(\@list, \@idx);
264							print "@$list2\n"; # result: a, d, f
265
266							=cut
267
268							sub delete_array_elements_by_indexes {
269	0			0	1	0	my ( $array, $ids ) = @_;
270	0	0	0			0	unless ( ref $array eq ref [] and ref $ids eq ref [] ) {
271	0					0	die "both arguments should be array reference\n";
272							}
273	0					0	my %omitted = map { $_ => 1 } @$ids;
	0					0
274	0					0	my @newarray = ();
275	0					0	for my $i ( 0 .. ( scalar(@$array) - 1 ) ) {
276	0	0				0	next if exists $omitted{$i};
277	0					0	push @newarray, $$array[$i];
278							}
279	0					0	return \@newarray;
280							}
281
282							=head2 extract_parameters_from_string
283
284							Extract parameters from string.
285
286							The regular expression is
287
288							/([\w\d\_\-\.]+)\s=\s([^\=;])[\s;]/
289
290							Example:
291
292							# bad format, but could also be parsed
293							# my $s = " s = b; a=test; b_c=12 3; a.b =; b
294							# = asdf
295							# sd; ads-f = 12313";
296
297							# recommended
298							my $s = "key1=abcde; key2=123; conf.a=file; conf.b=12; ";
299
300							my $pa = extract_parameters_from_string($s);
301							print "=$_:$$p{$_}=\n" for sort keys %$pa;
302
303							=cut
304
305							sub extract_parameters_from_string {
306	1			1	1	786	my ($s) = @_;
307	1					3	my $parameters = {};
308	1					11	while ( $s =~ /([\w\d\_\-\.]+)\s=\s([^\=;])[\s;]/gm ) {
309	4	50				15	warn "$1 was defined more than once\n" if defined $$parameters{$1};
310	4					24	$$parameters{$1} = $2;
311							}
312	1					3	return $parameters;
313							}
314
315							=head2 get_parameters_from_file
316
317							Get parameters from a file.
318							Comments start with # are allowed in file.
319
320							Example:
321
322							my $pa = get_parameters_from_file("d.txt");
323							print "$_: $$pa{$_}\n" for sort keys %$pa;
324
325							For a file with content:
326
327							# cell phone
328							apple = 1 # note
329
330							nokia = 2 #
331
332							output is:
333
334							apple: 1
335							nokia: 2
336
337							=cut
338
339							sub get_parameters_from_file {
340	1			1	1	432	my ($file) = @_;
341	1					3	my $parameters = {};
342	1	50				41	open my $fh, $file or die "fail to open file $file\n";
343	1					22	while (<$fh>) {
344	4					22	s/^\s+\|\s+$//g;
345	4	100	100			30	next if $_ eq '' # blank line
346							or /^#/; # annotation
347	2					8	s/#.*//g; # delete annotation
348
349	2	50				11	next unless /([\w\d\_\-\.]+)\s=\s(.+)/;
350	2					16	$$parameters{$1} = $2;
351							}
352	1					9	close $fh;
353	1					5	return $parameters;
354							}
355
356							=head2 get_list_from_file
357
358							Get list from a file.
359							Comments start with # are allowed in file.
360
361							Example:
362
363							my $list = get_list_from_file("d.txt");
364							print "$_\n" for @$list;
365
366							For a file with content:
367
368							# cell phone
369							apple # note
370
371							nokia
372
373							output is:
374
375							apple
376							nokia
377
378							=cut
379
380							sub get_list_from_file {
381	1			1	1	490	my ($file) = @_;
382	1	50				46	open my $fh, "<", $file or die "fail to open file $file\n";
383	1					3	my @list = ();
384	1					13	while (<$fh>) {
385	4					19	s/\r?\n//g;
386	4					18	s/^\s+\|\s+$//g;
387	4	100	100			27	next if $_ eq '' # blank line
388							or /^#/; # annotation
389	2					7	s/#.*//g; # delete annotation
390
391	2					10	push @list, $_;
392							}
393	1					9	close $fh;
394	1					7	return \@list;
395							}
396
397							=head2 get_column_data
398
399							Get one column of a file.
400
401							Example:
402
403							my $list = get_column_data("d.txt", 2);
404							print "$_\n" for @$list;
405
406							=cut
407
408							sub get_column_data {
409	1			1	1	425	my ( $file, $column, $delimiter ) = @_;
410	1	50	33			14	unless ( $column =~ /^(\d+)$/ and $column > 0 ) {
411	0					0	warn "column number ($column) should be positive integer\n";
412	0					0	$column = 1;
413							}
414	1	50				5	$delimiter = "\t" unless defined $delimiter;
415
416	1	50				35	open my $fh, "<", $file or die "failed to open file: $file\n";
417	1					3	my @linedata = ();
418	1					2	my @data = ();
419	1					2	my $n = 0;
420	1					13	while (<$fh>) {
421	4					19	s/\r?\n//;
422	4	50				13	next if /^\s*#/;
423	4					27	@linedata = split /$delimiter/, $_;
424	4					7	$n = scalar @linedata;
425	4	100				15	next unless $n > 0;
426
427	3	50				7	if ( $column > $n ) {
428	0					0	die
429							"number of columns of this line ($n) is less than given column number ($column)\n$_";
430							}
431
432	3					13	push @data, $linedata[ $column - 1 ];
433							}
434	1					9	close $fh;
435
436	1					6	return \@data;
437							}
438
439							=head2 read_json_file
440
441							Read json file and decode it into a hash ref.
442
443							Example:
444
445							my $hashref = read_json_file($file);
446
447							=cut
448
449							sub read_json_file {
450	1			1	1	6	my ($file) = @_;
451	1	50				30	open my $fh, "<:encoding(utf8)", $file
452							or die "fail to open json file: $file\n";
453	1					41	my $text;
454	1					20	while (<$fh>) {
455	5					27	s/\s#+.\r?\n?//g; # remove annotation
456	5	50				40	$text .= $1 if / *(.+)/;
457							}
458	1					11	close $fh;
459	1					12	my $hash = decode_json($text);
460	1					5	return $hash;
461							}
462
463							=head2 write_json_file
464
465							Write a hash ref into a file.
466
467							Example:
468
469							my $hashref = { "a" => 1, "b" => 2 };
470							write_json_file($hashref, $file);
471
472							=cut
473
474							sub write_json_file {
475	1			1	1	421	my ( $hash, $file ) = @_;
476	1					22	my $json = JSON->new->allow_nonref;
477	1					35	my $text = $json->pretty->encode($hash);
478	1					6	$text = encode_utf8($text);
479	1	50		1		6	open my $fh2, ">:encoding(utf8)", $file
	1					2
	1					7
	1					45
480							or die "fail to open json file: $file\n";
481	1					1673	print $fh2 $text;
482	1					99	close $fh2;
483							}
484
485							=head2 run
486
487							Run a command
488
489							Example:
490
491							my $fail = run($cmd);
492							die "failed to run:$cmd\n" if $fail;
493
494							=cut
495
496							sub run {
497	0			0	1		my ($cmd) = @_;
498	0						system($cmd);
499
500	0	0					if ( $? == -1 ) {
		0
501	0						die "[ERROR] fail to run: $cmd. Command ("
502							. ( split /\s+/, $cmd )[0]
503							. ") not found\n";
504							}
505							elsif ( $? & 127 ) {
506	0	0					printf STDERR "[ERROR] command died with signal %d, %s coredump\n",
507							( $? & 127 ), ( $? & 128 ) ? 'with' : 'without';
508							}
509							else {
510							# 0, ok
511							}
512	0						return $?;
513							}
514
515							=head2 run_time
516
517							Run a subroutine with given arguments N times, and return the mean and stdev
518							of time.
519
520							Example:
521
522							my $read_by_record = sub {
523							my ($file) = @_;
524							my $next_seq = FastaReader($file);
525							while ( my $fa = &$next_seq() ) {
526							my ( $header, $seq ) = @$fa;
527							# print ">$header\n$seq\n";
528							}
529							};
530
531							my ($mean, $stdev) = run_time( 8, $read_by_record, $file );
532							printf STDERR "\n## Compute time: %0.03f ± %0.03f s\n\n", $mean, $stdev;
533
534							=cut
535
536							sub run_time {
537	0			0	1		my ( $n, $sub, @args ) = @_;
538	0	0	0				die "first argument should be positive integer"
539							unless $n =~ /^\d+$/ and $n > 0;
540
541	0						my $t0 = 0;
542	0						my @ts = ();
543	0						for ( 1 .. $n ) {
544	0						$t0 = time;
545	0						&$sub(@args); # call $sub
546	0						push @ts, time - $t0;
547							}
548
549	0						return mean_and_stdev( \@ts );
550							}
551
552							=head2 readable_second
553
554							readable_second
555
556							Example:
557
558							print readable_second(11312314),"\n"; # 130 day 22 hour 18 min 34 sec
559
560							=cut
561
562							sub readable_second ($) {
563	0			0	1		my ($seconds) = @_;
564	0	0					return "Positive integer need." unless $seconds =~ /^\d+$/;
565
566	0						my $time = "";
567	0						my $has_bigger_unit = 0;
568
569	0						my $days = $seconds / 86400;
570	0	0					if ( $days >= 1 ) {
571	0						$time .= ( int $days ) . " day ";
572	0						$has_bigger_unit = 1;
573							}
574	0						$seconds = $seconds % 86400;
575
576	0						my $hours = $seconds / 3600;
577	0	0					if ( $hours >= 1 ) {
		0
578	0						$time .= ( int $hours ) . " hour ";
579	0						$has_bigger_unit = 1;
580							}
581	0						elsif ($has_bigger_unit) { $time .= 0 . " hour "; }
582	0						$seconds = $seconds % 3600;
583
584	0						my $minutes = $seconds / 60;
585	0	0					if ( $minutes >= 1 ) {
		0
586	0						$time .= ( int $minutes ) . " min ";
587	0						$has_bigger_unit = 1;
588							}
589	0						elsif ($has_bigger_unit) { $time .= 0 . " min "; }
590	0						$seconds = $seconds % 60;
591
592	0						$time .= $seconds . " sec";
593	0						return $time;
594							}
595
596							=head2 check_positive_integer
597
598							Check Positive Integer
599
600							Example:
601
602							check_positive_integer(1);
603
604							=cut
605
606							sub check_positive_integer {
607	0			0	1		my ($n) = @_;
608	0	0	0				die "positive integer needed ($n given)"
609							unless $n =~ /^\d+$/ and $n != 0;
610							}
611
612							=head2 mean_and_stdev
613
614							return mean and stdev of a list
615
616							Example:
617							my @list = qq/1 2 3/;
618							mean_and_stdev(\@list);
619
620							=cut
621							sub mean_and_stdev($) {
622	0			0	1		my ($list) = @_;
623	0	0					return ( 0, 0 ) if @$list == 0;
624	0						my $sum = 0;
625	0						$sum += $_ for @$list;
626	0						my $sum_square = 0;
627	0						$sum_square += $_ * $_ for @$list;
628	0						my $mean = $sum / @$list;
629	0						my $variance = $sum_square / @$list - $mean * $mean;
630	0						my $std = sqrt $variance;
631	0						return ( $mean, $std );
632							}
633
634							=head2 filename_prefix
635
636							Get filename prefix
637
638							Example:
639
640							filename_prefix("test.fa"); # "test"
641							filename_prefix("tmp"); # "tmp"
642
643							=cut
644
645							sub filename_prefix {
646	0			0	1		my ($file) = @_;
647	0	0					if ( $file =~ /(.+)\..+?$/ ) {
648	0						return $1;
649							}
650							else {
651	0						return $file;
652							}
653							}
654
655							=head2 check_all_files_exist
656
657							Check whether all files existed.
658
659							=cut
660
661							sub check_all_files_exist {
662	0			0	1		my $flag = 1;
663	0						for (@_) {
664	0	0					if ( not -e $_ ) {
665	0						return 0;
666							}
667							}
668	0						return 1;
669							}
670
671							=head2 check_in_out_dir
672
673							Check in and $fh2 directory.
674
675							Example:
676
677							check_in_out_dir("~/dir", "~/dir.out");
678
679							=cut
680
681							sub check_in_out_dir {
682	0			0	1		my ( $in, $out ) = @_;
683	0	0					die "dir $in not found."
684							unless -e $in;
685
686	0	0					die "$in is not a directory.\n"
687							unless -d $in;
688
689	0						$in =~ s/\/$//;
690	0						$out =~ s/\/$//;
691	0	0					die "out dir shoud be different from in dir!\n"
692							if $in eq $out;
693							}
694
695							=head2 rm_and_mkdir
696
697							Make a directory, remove it firstly if it exists.
698
699							Example:
700
701							rm_and_mkdir("out")
702
703							=cut
704
705							sub rm_and_mkdir {
706	0			0	1		my ($dir) = @_;
707	0	0					if ( -e $dir ) {
708	0	0					remove_tree($dir) or die "fail to remove: $dir\n";
709							}
710	0	0					mkdir $dir or die "fail to mkdir: $dir\n";
711							}
712
713							=head2 get_paired_fq_gz_file_from_dir
714
715							Example:
716
717							# .
718							# ├── test_1.fq.gz
719							# └── test_2.fq.gz
720							for my $pe ( get_paired_fq_gz_file_from_dir($indir) ) {
721							# test_1.fq.gz, test_1.fq.gz, test
722							my ( $fqfile1, $fqfile2, $id ) = @$pe;
723
724							}
725
726							=cut
727
728							sub get_paired_fq_gz_file_from_dir {
729	0			0	1		my ($dir) = @_;
730	0						my @files;
731	0						for ( sort glob "$dir/*_1.fq.gz" ) {
732	0						/\/?([^\/]+)_1.fq.gz/;
733	0						my $id = $1;
734	0	0					next unless -e "$dir/${id}_2.fq.gz";
735	0						push @files, [ "$dir/${id}_1.fq.gz", "$dir/${id}_2.fq.gz", "$dir/${id}", $id ];
736							}
737	0						return @files;
738							}
739
740							sub get_paired_fa_gz_file_from_dir {
741	0			0	0		my ($dir) = @_;
742	0						my @files;
743	0						for ( sort glob "$dir/*_1.fa.gz" ) {
744	0						/\/?([^\/]+)_1.fa.gz/;
745	0						my $id = $1;
746	0	0					next unless -e "$dir/${id}_2.fa.gz";
747	0						push @files, [ "$dir/${id}_1.fa.gz", "$dir/${id}_2.fa.gz", "$dir/${id}", $id ];
748							}
749	0						return @files;
750							}
751
752							1;