File Coverage

blib/lib/Bio/ToolBox/utility.pm
Criterion Covered Total %
statement 12 89 13.4
branch 0 28 0.0
condition 0 5 0.0
subroutine 4 11 36.3
pod 7 7 100.0
total 23 140 16.4


line stmt bran cond sub pod time code
1             package Bio::ToolBox::utility;
2             our $VERSION = '1.68';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::utility - common utility functions for Bio::ToolBox
7              
8             =head1 DESCRIPTION
9              
10             These are general subroutines that don't fit in with the other modules.
11              
12             =head1 REGULAR SUBROUTINES
13              
14             The following subroutines are automatically exported when you use this module.
15              
16             =over 4
17              
18             =item parse_list
19              
20             my $index_request = '1,2,5-7';
21             my @indices = parse_list($index_request); # returns [1,2,5,6,7]
22              
23             This subroutine parses a scalar value into a list of values. The scalar is
24             a text string of numbers (usually column or dataset indices) delimited by
25             commas and/or including a range. For example, a string "1,2,5-7" would become
26             an array of [1,2,5,6,7].
27              
28             Pass the module the scalar string.
29              
30             It will return the array of numbers.
31              
32             =item format_with_commas
33              
34             my $count = '4327908475';
35             printf " The final count was %s\n", format_with_commas($count);
36              
37             This subroutine process a large number (e.g. 4327908475) into a human-friendly
38             version with commas delimiting the thousands (4,327,908,475).
39              
40             Pass the module a scalar string with a number value.
41              
42             It will return a scalar value containing the formatted number.
43              
44             =item ask_user_for_index
45              
46             my @answers = ask_user_for_index($Data, 'Please enter 2 or more columns ');
47              
48             This subroutine will present the list of column names from a L
49             structure along with their numeric indexes to the user and prompt for one
50             or more to be selected and entered. The function is smart enough to only print
51             the list once (if it hasn't changed) so as not to annoy the user with repeated
52             lists of header names when used more than once. A text prompt should be provided,
53             or a generic one is used. The list of indices are validated, and a warning printed
54             for invalid responses. The responses are then returned as a single value or array,
55             depending on context.
56              
57             =item simplify_dataset_name
58              
59             my $simple_name = simplify_dataset_name($dataset);
60              
61             This subroutine will take a dataset name and simplify it. Dataset names may
62             often be file names of data files, such as Bam and bigWig files. These may
63             include a C, C, or C prefix, one or more directory paths,
64             and one or more file name extensions. Additionally, more than one dataset
65             may be combined, for example two stranded bigWig files, with an ampersand.
66             This function will safely remove the prefix, directories, and everything after
67             the first period.
68              
69             =back
70              
71             =head1 LEGACY SUBROUTINES
72              
73             These are additional functions that can be optionally exported. These provide
74             accessibility to the L functions that might be needed
75             for old scripts that do not implement L objects. You normally
76             should not need these. If you import these, be sure to import the ones above
77             if you need those too.
78              
79             =over 4
80              
81             =item open_to_read_fh
82              
83             Wrapper around L.
84             Opens a file as an L read only object. Transparently handles gzip
85             and bzip2 compression.
86              
87             =item open_to_write_fh
88              
89             my $fh = open_to_write_fh($file, $gz, $append);
90              
91             Wrapper around L.
92             Opens a file as an L write only object. Pass the file name as the option.
93             Optionally provide a boolean value if you want the file to be written as a compressed
94             gzip file. Pass another boolean value if you want to append to an existing file;
95             otherwise an existing file with the same name will be overwritten!
96              
97             =item check_file
98              
99             Wrapper around the L method.
100             Checks to see if a file exists. If not, some common missing extensions are appended
101             and then existence is re-checked. If a file is found, the name is returned so that
102             it could be opened. Useful, for example, if you forget the F<.txt> or F<.gz> extensions.
103              
104             =back
105              
106             =cut
107              
108 3     3   22 use strict;
  3         5  
  3         127  
109 3     3   19 use Carp;
  3         6  
  3         163  
110 3     3   19 use File::Spec;
  3         13  
  3         86  
111             require Exporter;
112 3     3   15 use Bio::ToolBox::Data::file;
  3         6  
  3         4102  
113              
114              
115             ### Variables
116             # Export
117             our @ISA = qw(Exporter);
118             our @EXPORT = qw(
119             parse_list
120             format_with_commas
121             ask_user_for_index
122             simplify_dataset_name
123             );
124             our @EXPORT_OK = qw(
125             open_to_read_fh
126             open_to_write_fh
127             check_file
128             );
129             our $DATA_COLNAMES = undef;
130             our $DATA_FILENAME = undef;
131              
132             ### The True Statement
133             1;
134              
135              
136              
137             ################# The Subroutines ###################
138              
139             ### Parse string into list
140             sub parse_list {
141             # this subroutine will parse a string into an array
142             # it is designed for a string of numbers delimited by commas
143             # a range of numbers may be specified using a dash
144             # hence 1,2,5-7 would become an array of 1,2,5,6,7
145            
146 0     0 1   my $string = shift;
147 0 0         return unless defined $string;
148 0 0         if ($string =~ /[^\d,\-\s\&]/) {
149 0           carp " the string contains characters that can't be parsed\n";
150 0           return;
151             }
152 0           my @list;
153 0           foreach (split /[,\s+]/, $string) {
154             # check for a range
155 0 0         if (/\-/) {
156 0           my ($start, $stop) = split /\-/;
157             # add each item in the range to the list
158 0           for (my $i = $start; $i <= $stop; $i++) {
159 0           push @list, $i;
160             }
161 0           next;
162             }
163             else {
164             # either an ordinary number or an "&"ed list of numbers
165 0           push @list, $_;
166             }
167             }
168 0           return @list;
169             }
170              
171              
172              
173             ### Format a number into readable comma-delimited by thousands number
174             sub format_with_commas {
175             # for formatting a number with commas
176 0     0 1   my $number = shift;
177 0 0         if ($number =~ /[^\d,\-\.]/) {
178 0           carp " the string contains characters that can't be parsed\n";
179 0           return $number;
180             }
181            
182             # check for decimals
183 0           my ($integers, $decimals);
184 0 0         if ($number =~ /^\-?(\d+)\.(\d+)$/) {
185 0           $integers = $1;
186 0           $decimals = $2;
187             }
188             else {
189 0           $integers = $number;
190             }
191            
192             # format
193 0           my @digits = split //, $integers;
194 0           my @formatted;
195 0           while (@digits) {
196 0 0         if (@digits > 3) {
197 0           unshift @formatted, pop @digits;
198 0           unshift @formatted, pop @digits;
199 0           unshift @formatted, pop @digits;
200 0           unshift @formatted, ',';
201             }
202             else {
203 0           while (@digits) {
204 0           unshift @formatted, pop @digits;
205             }
206             }
207             }
208            
209             # finished
210 0           my $final = join("", @formatted);
211 0 0         $final .= $decimals if defined $decimals;
212 0           return $final;
213             }
214              
215              
216             sub ask_user_for_index {
217 0     0 1   my $Data = shift;
218 0   0       my $line = shift || ' Enter the desired column index ';
219 0 0         unless (ref($Data) =~ /Bio::ToolBox::Data/) {
220 0           carp "Must pass a Bio::ToolBox::Data object!\n";
221 0           return;
222             }
223            
224             # print column header names only if we have not done so before
225 0 0 0       unless (
226             # we use filename and column number as indicators
227             $Data->filename eq $DATA_FILENAME and
228             join(";", $Data->list_columns) eq $DATA_COLNAMES
229             ) {
230 0           print " These are the columns in the file\n";
231 0           my $i = 0;
232 0           foreach ($Data->list_columns) {
233 0           print " $i\t$_\n";
234 0           $i++;
235             }
236             # remember for next time
237 0           $DATA_FILENAME = $Data->filename;
238 0           $DATA_COLNAMES = join(";", $Data->list_columns);
239             }
240 0           print $line;
241            
242             # get response
243 0           my $response = ;
244 0           chomp $response;
245 0           my @indices = parse_list($response);
246            
247             # verify
248 0           my @good;
249 0           foreach (@indices) {
250 0 0         if ($Data->name($_)) {
251 0           push @good, $_;
252             }
253             else {
254 0           print " $_ is not a valid index!\n";
255             }
256             }
257 0 0         return wantarray ? @good : $good[0];
258             }
259              
260              
261             sub simplify_dataset_name {
262 0     0 1   my $dataset = shift;
263 0           my $new_name;
264            
265             # strip any file prefix
266 0           $dataset =~ s/^(?:file|http|ftp):\/*//;
267            
268 0 0         if ($dataset =~ /&/) {
    0          
269             # a combination dataset
270 0           foreach (split /&/, $dataset) {
271 0           my $n = simplify_dataset_name($_);
272 0 0         if ($new_name) {
273 0           $new_name .= '&' . $n;
274             }
275             else {
276 0           $new_name = $n;
277             }
278             }
279             }
280             elsif ($dataset =~ m|/|) {
281             # appears to have paths
282 0           my (undef, undef, $file_name) = File::Spec->splitpath($dataset);
283 0           $file_name =~ s/^([\w\d\-\_]+)\..+$/$1/i; # take everything up to first .
284 0           $new_name = $file_name;
285             }
286             else {
287             # strip everything after first period, like above
288 0           $dataset =~ s/^([\w\d\-\_]+)\..+$/$1/i; # take everything up to first .
289 0           $new_name = $dataset;
290             }
291 0           return $new_name;
292             }
293              
294              
295             sub open_to_read_fh {
296 0     0 1   return Bio::ToolBox::Data::file->open_to_read_fh(@_);
297             }
298              
299              
300             sub open_to_write_fh {
301 0     0 1   return Bio::ToolBox::Data::file->open_to_write_fh(@_);
302             }
303              
304             sub check_file {
305 0     0 1   return Bio::ToolBox::Data::file->check_file(@_);
306             }
307              
308              
309             __END__