File Coverage

blib/lib/Bio/ToolBox/utility.pm

Criterion	Covered	Total	%
statement	110	141	78.0
branch	37	52	71.1
condition	1	8	12.5
subroutine	7	8	87.5
pod	5	5	100.0
total	160	214	74.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::utility;
2							our $VERSION = '1.69';
3
4							=head1 NAME
5
6							Bio::ToolBox::utility - common utility functions for Bio::ToolBox
7
8							=head1 DESCRIPTION
9
10							These are general subroutines that don't fit in with the other modules.
11
12							=head1 REGULAR SUBROUTINES
13
14							The following subroutines are automatically exported when you use this module.
15
16							=over 4
17
18							=item parse_list
19
20							my $index_request = '1,2,5-7';
21							my @indices = parse_list($index_request); # returns [1,2,5,6,7]
22
23							This subroutine parses a scalar value into a list of values. The scalar is
24							a text string of numbers (usually column or dataset indices) delimited by
25							commas and/or including a range. For example, a string "1,2,5-7" would become
26							an array of [1,2,5,6,7].
27
28							Pass the module the scalar string.
29
30							It will return the array of numbers.
31
32							=item format_with_commas
33
34							my $count = '4327908475';
35							printf " The final count was %s\n", format_with_commas($count);
36
37							This subroutine process a large number (e.g. 4327908475) into a human-friendly
38							version with commas delimiting the thousands (4,327,908,475).
39
40							Pass the module a scalar string with a number value.
41
42							It will return a scalar value containing the formatted number.
43
44							=item ask_user_for_index
45
46							my @answers = ask_user_for_index($Data, 'Please enter 2 or more columns ');
47
48							This subroutine will present the list of column names from a L
49							structure along with their numeric indexes to the user and prompt for one
50							or more to be selected and entered. The function is smart enough to only print
51							the list once (if it hasn't changed) so as not to annoy the user with repeated
52							lists of header names when used more than once. A text prompt should be provided,
53							or a generic one is used. The list of indices are validated, and a warning printed
54							for invalid responses. The responses are then returned as a single value or array,
55							depending on context.
56
57							=item simplify_dataset_name
58
59							my $simple_name = simplify_dataset_name($dataset);
60
61							This subroutine will take a dataset name and simplify it. Dataset names may
62							often be file names of data files, such as Bam and bigWig files. These may
63							include a C, C, or C prefix, one or more directory paths,
64							and one or more file name extensions. Additionally, more than one dataset
65							may be combined, for example two stranded bigWig files, with an ampersand.
66							This function will safely remove the prefix, directories, and everything after
67							the first period.
68
69							=item sane_chromo_sort
70
71							my @chromo = $db->seq_ids;
72							my @sorted = sane_chromo_sort(@chromo);
73
74							This subroutine will take a list of chromosome or sequence identifiers and sort
75							them into a reasonably sane order: standard numeric identifiers first (numeric
76							order), sex chromosomes (alphabetical), mitochondrial, names with text and
77							numbers (text first alphabetically, then numbers numerically) for contigs and
78							such, and finally anything else (aciibetically). Any 'chr' prefix is ignored.
79							Roman numerals are properly handled numerically.
80
81							The provided list may be a list of SCALAR values (chromosome names) or ARRAY
82							references, with the first element assumed to be the name, e.g.
83							C<[$name, $length]>.
84
85							=back
86
87							=cut
88
89	4			4		606	use strict;
	4					4
	4					90
90	4			4		13	use Carp;
	4					7
	4					158
91	4			4		25	use File::Spec;
	4					7
	4					5572
92							require Exporter;
93
94
95							### Variables
96							# Export
97							our @ISA = qw(Exporter);
98							our @EXPORT = qw(
99							parse_list
100							format_with_commas
101							ask_user_for_index
102							simplify_dataset_name
103							sane_chromo_sort
104							);
105							our $DATA_COLNAMES = undef;
106							our $DATA_FILENAME = undef;
107
108							### The True Statement
109							1;
110
111
112
113							################# The Subroutines ###################
114
115							### Parse string into list
116							sub parse_list {
117							# this subroutine will parse a string into an array
118							# it is designed for a string of numbers delimited by commas
119							# a range of numbers may be specified using a dash
120							# hence 1,2,5-7 would become an array of 1,2,5,6,7
121
122	2			2	1	858	my $string = shift;
123	2	50				5	return unless defined $string;
124	2	50				5	if ($string =~ /[^\d,\-\s\&]/) {
125	0					0	carp " the string contains characters that can't be parsed\n";
126	0					0	return;
127							}
128	2					2	my @list;
129	2					7	foreach (split /[,\s+]/, $string) {
130							# check for a range
131	5	100				10	if (/\-/) {
132	2					5	my ($start, $stop) = split /\-/;
133							# add each item in the range to the list
134	2					6	for (my $i = $start; $i <= $stop; $i++) {
135	13					19	push @list, $i;
136							}
137	2					2	next;
138							}
139							else {
140							# either an ordinary number or an "&"ed list of numbers
141	3					4	push @list, $_;
142							}
143							}
144	2					7	return @list;
145							}
146
147
148
149							### Format a number into readable comma-delimited by thousands number
150							sub format_with_commas {
151							# for formatting a number with commas
152	3			3	1	1408	my $number = shift;
153
154							# check number
155	3					3	my ($integers, $decimals, $sign);
156	3	100				24	if ($number =~ /^(\-)?(\d+)\.(\d+)$/) {
		50
157	2					5	$sign = $1;
158	2					3	$integers = $2;
159	2					3	$decimals = $3;
160							}
161							elsif ($number =~ /^(\-)?(\d+)$/) {
162	1					2	$sign = $1;
163	1					2	$integers = $2;
164							}
165							else {
166	0					0	carp " the string contains characters that can't be parsed\n";
167	0					0	return $number;
168							}
169
170							# format
171	3					9	my @digits = split //, $integers;
172	3					4	my @formatted;
173	3					5	while (@digits) {
174	10	100				13	if (@digits > 3) {
175	7					11	unshift @formatted, pop @digits;
176	7					8	unshift @formatted, pop @digits;
177	7					9	unshift @formatted, pop @digits;
178	7					11	unshift @formatted, ',';
179							}
180							else {
181	3					5	while (@digits) {
182	4					7	unshift @formatted, pop @digits;
183							}
184							}
185							}
186
187							# finished
188	3	100				6	my $final = $sign ? $sign : '';
189	3					7	$final .= join("", @formatted);
190	3	100				5	$final .= '.' . $decimals if defined $decimals;
191	3					14	return $final;
192							}
193
194
195							sub ask_user_for_index {
196	0			0	1	0	my $Data = shift;
197	0		0			0	my $line = shift \|\| ' Enter the desired column index ';
198	0	0				0	unless (ref($Data) =~ /Bio::ToolBox::Data/) {
199	0					0	carp "Must pass a Bio::ToolBox::Data object!\n";
200	0					0	return;
201							}
202
203							# print column header names only if we have not done so before
204	0	0	0			0	unless (
205							# we use filename and column number as indicators
206							$Data->filename eq $DATA_FILENAME and
207							join(";", $Data->list_columns) eq $DATA_COLNAMES
208							) {
209	0					0	print " These are the columns in the file\n";
210	0					0	my $i = 0;
211	0					0	foreach ($Data->list_columns) {
212	0					0	print " $i\t$_\n";
213	0					0	$i++;
214							}
215							# remember for next time
216	0					0	$DATA_FILENAME = $Data->filename;
217	0					0	$DATA_COLNAMES = join(";", $Data->list_columns);
218							}
219	0					0	print $line;
220
221							# get response
222	0					0	my $response = ;
223	0					0	chomp $response;
224	0					0	my @indices = parse_list($response);
225
226							# verify
227	0					0	my @good;
228	0					0	foreach (@indices) {
229	0	0				0	if ($Data->name($_)) {
230	0					0	push @good, $_;
231							}
232							else {
233	0					0	print " $_ is not a valid index!\n";
234							}
235							}
236	0	0				0	return wantarray ? @good : $good[0];
237							}
238
239
240							sub simplify_dataset_name {
241	14			14	1	4483	my $dataset = shift;
242	14					15	my $new_name;
243
244							# strip any file prefix
245	14					37	$dataset =~ s/^(?:file\|http\|ftp):\/*//;
246
247	14	100				30	if ($dataset =~ /&/) {
248							# a combination dataset
249	2					4	foreach (split /&/, $dataset) {
250	4					7	my $n = simplify_dataset_name($_);
251	4	100				6	if ($new_name) {
252	2					4	$new_name .= '&' . $n;
253							}
254							else {
255	2					3	$new_name = $n;
256							}
257							}
258							}
259							else {
260							# a single dataset
261							# this could be either a file name or an entry in a BioPerl or BigWigSet database
262							# remove any possible paths
263	12					91	(undef, undef, $new_name) = File::Spec->splitpath($dataset);
264
265							# remove any known file extensions
266	12					41	$new_name =~ s/\.(?:bw\|bam\|bb\|useq\|bigwig\|bigbed\|g[tf]f3?\|cram\|wig\|bdg\|bedgraph)(?:\.gz)?$//i;
267
268							# remove common non-useful stuff
269							# trying to imagine all sorts of possible things
270	12					24	$new_name =~ s/[_\.\-](?:sort\|sorted\|dedup\|dedupe\|deduplicated\|rmdup\|mkdup\|markdup\|dup\|unique\|filt\|filtered)\b//gi;
271	12					21	$new_name =~ s/[_\.\-](?:coverage\|rpm\|ext\d\|extend\d\|log2fe\|log\d+\|qvalue\|fragment\|count\|lambda_control\|fe\|fold.?enrichment\|ratio\|log\d*ratio)\b//gi;
272							}
273	14					25	return $new_name;
274							}
275
276
277							sub sane_chromo_sort {
278	4			4	1	1405	my @chroms = @_;
279	4	50				9	return unless scalar @chroms;
280
281							# let's try and sort in some kind of rational order
282	4					16	my @numeric;
283							my @romanic;
284	4					0	my @mixed;
285	4					0	my @alphic;
286	4					0	my @sex;
287	4					0	my @mito;
288	4					7	foreach my $c (@chroms) {
289
290	42					44	my $name;
291	42	50				48	if (ref($c) eq 'ARRAY') {
292	0					0	$name = $c->[0];
293							}
294							else {
295	42					41	$name = $c;
296							}
297
298							# identify the type of chromosome name to sort
299	42	100				156	if ($name =~ /^(?:chr)?([wxyz])$/i) {
		100
		100
		100
		100
300							# sex chromosomes
301	4					17	push @sex, [$1, $c];
302							}
303							elsif ($name =~ /^(?:chr)?(?:m\|mt\|mito)(?:dna)?$/i) {
304							# mitochondrial
305	3					5	push @mito, [$name, $c];
306							}
307							elsif ($name =~ /^(?:chr)?(\d+)$/i) {
308							# standard numeric chromosome
309	22					43	push @numeric, [$1, $c];
310							}
311							elsif ($name =~ /^(?:chr)?([IVX]+)$/) {
312							# Roman numerals - silly Saccharomyces cerevisiae
313	7					17	push @romanic, [$1, $c];
314							}
315							elsif ($name =~ /^([a-zA-Z_\-\.]+)(\d+)/) {
316							# presumed contigs and such?
317	5					11	push @mixed, [$1, $2, $name, $c];
318							}
319							else {
320							# everything else
321	1					2	push @alphic, [$name, $c];
322							}
323							}
324
325							# check romanic
326	4	100				9	if (scalar @romanic) {
327							# looks like we have romanic chromosomes
328	2	100				4	if (scalar @sex) {
329							# probably caught up chrX, unlikely WYZ
330	1					2	my @x = grep { $sex[$_]->[0] =~ m/^X$/ } (0 .. $#sex);
	2					6
331	1					2	foreach (reverse @x) {
332							# I'm assuming and hoping there's only one chrX found
333							# but reverse the list, just in case - assuming grep returns in order
334	1					2	push @romanic, ( splice(@sex, $_, 1) );
335							}
336							}
337	2	50				5	if (scalar @numeric) {
338							# well, shoot, this is weird, mix of both numeric and romanic chromosomes?
339							# just merge romanic with alphic and hope for the best
340	0					0	push @alphic, @romanic;
341							}
342							else {
343							# convert the romanic to numeric
344	2					5	while (@romanic) {
345	8					18	my $r = shift @romanic;
346	8					11	my $c = $r->[0];
347	8					11	$c =~ s/IV/4/;
348	8					12	$c =~ s/IX/9/;
349	8					12	$c =~ s/V/5/;
350	8					12	$c =~ s/I/1/g;
351	8					10	my $n = 0;
352	8					14	foreach (split q(), $c) {
353	15	100				17	if ($_ eq 'X') {
354	2					2	$n += 10;
355							}
356							else {
357	13					15	$n += $_;
358							}
359							}
360	8					24	push @numeric, [$n, $r->[1]];
361							}
362							}
363							}
364
365							# sort
366	4					5	my @sorted;
367	4					11	push @sorted, map { $_->[1] } sort {$a->[0] <=> $b->[0]} @numeric;
	30					36
	66					65
368	4					9	push @sorted, map { $_->[1] } sort {$a->[0] cmp $b->[0]} @sex;
	3					5
	1					2
369	4					6	push @sorted, map { $_->[1] } sort {$a->[0] cmp $b->[0]} @mito;
	3					3
	0					0
370	5					7	push @sorted, map { $_->[3] } sort {
371	4	50	33			9	$a->[0] cmp $b->[0] or
	4					14
372							$a->[1] <=> $b->[1] or
373							$a->[2] cmp $b->[2]
374							} @mixed;
375	4					5	push @sorted, map { $_->[1] } sort { $a->[0] cmp $b->[0] } @alphic;
	1					1
	0					0
376
377	4					21	return @sorted;
378							}
379
380
381							__END__