File Coverage

blib/lib/Bio/ToolBox/Data/file.pm
Criterion Covered Total %
statement 353 616 57.3
branch 186 474 39.2
condition 65 211 30.8
subroutine 25 27 92.5
pod 19 21 90.4
total 648 1349 48.0


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::file;
2             our $VERSION = '1.68';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::file - File functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             File methods for reading and writing data files for both L
11             and L objects. This module should not be used
12             directly. See the respective modules for more information.
13              
14             =cut
15              
16 4     4   30 use strict;
  4         9  
  4         165  
17 4     4   25 use Carp qw(carp cluck croak confess);
  4         10  
  4         219  
18 4     4   23 use File::Basename qw(fileparse);
  4         9  
  4         280  
19 4     4   2150 use File::Which;
  4         4428  
  4         224  
20 4     4   1606 use IO::File;
  4         19252  
  4         33486  
21              
22             # List of acceptable filename extensions
23             our $SUFFIX = qr/\.(?:txt|gff3?|gtf|bed|bg|bdg|bedgraph|sgr|kgg|cdt|vcf|narrowpeak|broadpeak|gappedpeak|reff?lat|genepred|ucsc|maf)(?:\.gz|\.bz2)?/i;
24              
25             # gzip application
26             our $gzip_app;
27             our $bgzip_app;
28              
29             ### The True Statement
30             1;
31              
32              
33              
34             ### Load new version data table from file
35              
36             sub load_file {
37 10     10 1 28 my $self = shift;
38 10         23 my $file = shift;
39 10   50     48 my $noheader = shift || 0; # may not be present
40            
41             # check that we have an empty table
42 10 50 33     60 if ($self->last_row != 0 or $self->number_columns != 0 or $self->filename) {
      33        
43 0         0 carp "Cannot load file onto an existing data table!";
44 0         0 return;
45             }
46            
47             # open the file and load metadata
48 10         46 my $filename = $self->check_file($file);
49 10 50       43 unless ($filename) {
50 0         0 print " file '$file' does not exist!\n";
51 0         0 return;
52             }
53 10         54 $self->add_file_metadata($filename);
54 10 50       48 $self->open_to_read_fh or return;
55 10         86 $self->parse_headers($noheader);
56            
57             # Load the data table
58 10         194 while (my $line = $self->{fh}->getline) {
59             # the current file position should be at the beginning of the
60             # data table information
61 198 50       4817 next if $line !~ m/\w+/;
62            
63             # skip comment and empty lines
64 198 50       455 if (substr($line,0,1) eq '#') {
65 0         0 $self->add_comment($line);
66 0         0 next;
67             }
68            
69             # process the line
70 198         447 $self->add_data_line($line);
71             }
72            
73             # completed loading the file
74 10         378 $self->close_fh;
75 10         190 delete $self->{fh};
76            
77             # verify the structure
78 10 50       48 return unless $self->verify;
79            
80             # finished
81 10         53 return 1;
82             }
83              
84             sub taste_file {
85 24     24 1 2459 my $self = shift;
86 24         47 my $file = shift;
87 24 50       69 my $filename = $self->check_file($file) or return;
88 24         109 my $Taste = $self->new;
89 24         93 $Taste->add_file_metadata($filename);
90 24 50       76 $Taste->open_to_read_fh or return;
91 24         105 $Taste->parse_headers;
92            
93             # load first 10 data lines
94 24         76 $Taste->{data_table}->[0] = $Taste->{'column_names'}; # set table header names
95 24         78 for (my $i = 1; $i <= 10; $i++) {
96 192 100       384 my $line = $Taste->fh->getline or last;
97 180 50       4406 next if $line !~ m/\w+/;
98 180 50       362 next if $line =~ /^#/;
99 180         377 $Taste->add_data_line($line);
100             }
101 24         471 $Taste->close_fh;
102 24         552 $Taste->verify(1); # silently check the integrity of the file
103            
104             # check existing metadata
105 24 100       77 if ($Taste->gff) {
    100          
    50          
106 4         94 return 'gff';
107             }
108             elsif ($Taste->bed) {
109 12         253 return 'bed';
110             }
111             elsif ($Taste->ucsc) {
112 0         0 return 'ucsc';
113             }
114            
115            
116             # check if the number of columns match a known format, then verify
117 8         26 my $number = $Taste->number_columns;
118 8 50       70 if ($number == 9) {
    50          
    50          
    50          
    50          
    100          
119             # possibly a GFF file
120 0         0 $Taste->gff(2);
121 0         0 $Taste->verify(1);
122 0 0       0 return 'gff' if $Taste->gff == 2;
123             }
124             elsif ($number == 10) {
125             # possibly a genePred file
126 0         0 $Taste->ucsc(10);
127 0         0 $Taste->verify(1);
128 0 0       0 return 'ucsc' if $Taste->ucsc == 10;
129 0         0 $Taste->add_ucsc_metadata(10,1); # force metadata
130 0         0 $Taste->verify(1);
131 0 0       0 return 'ucsc' if $Taste->ucsc == 10;
132             }
133             elsif ($number == 11) {
134             # possibly a refFlat file
135 0         0 $Taste->ucsc(11);
136 0         0 $Taste->verify(1);
137 0 0       0 return 'ucsc' if $Taste->ucsc == 11;
138 0         0 $Taste->add_ucsc_metadata(11,1); # force metadata
139 0         0 $Taste->verify(1);
140 0 0       0 return 'ucsc' if $Taste->ucsc == 11;
141             }
142             elsif ($number == 12) {
143             # possibly a knownGene or BED12 file
144 0         0 $Taste->ucsc(12);
145 0         0 $Taste->verify(1);
146 0 0       0 return 'ucsc' if $Taste->ucsc == 12;
147 0         0 $Taste->bed(12);
148 0         0 $Taste->verify(1);
149 0 0       0 return 'bed' if $Taste->bed == 12;
150 0         0 $Taste->add_ucsc_metadata(12,1); # force metadata
151 0         0 $Taste->verify(1);
152 0 0       0 return 'ucsc' if $Taste->ucsc == 12;
153 0         0 $Taste->add_bed_metadata(12,1); # force metadata
154 0         0 $Taste->verify(1);
155 0 0       0 return 'bed' if $Taste->bed == 12;
156             }
157             elsif ($number == 15) {
158             # possibly a genePredExt file
159 0         0 $Taste->ucsc(15);
160 0         0 $Taste->verify(1);
161 0 0       0 return 'ucsc' if $Taste->ucsc == 15;
162 0         0 $Taste->add_ucsc_metadata(15,1); # force metadata
163 0         0 $Taste->verify(1);
164 0 0       0 return 'ucsc' if $Taste->ucsc == 15;
165             }
166             elsif ($number == 16) {
167             # possibly a genePredExt file
168 3         10 $Taste->ucsc(16);
169 3         11 $Taste->verify(1);
170 3 50       25 return 'ucsc' if $Taste->ucsc == 16;
171 3         26 $Taste->add_ucsc_metadata(16,1); # force metadata
172 3         11 $Taste->verify(1);
173 3 50       10 return 'ucsc' if $Taste->ucsc == 16;
174             }
175 5         71 return;
176             }
177              
178             sub sample_gff_type_list {
179 11     11 1 31 my ($self, $file) = @_;
180 11 50       81 return unless $file =~ /\.g[tf]f\d?(?:\.gz)?$/i; # assume extension is accurate
181 11 50       43 my $fh = $self->open_to_read_fh($file) or die "can't open $file!\n";
182 11         27 my %types;
183 11         21 my $count = 0;
184 11         41 while ($count < 1000) {
185 1288 100       20748 my $line = $fh->getline or last;
186 1277 50       30790 next if $line !~ m/\w+/;
187 1277 100       2494 next if $line =~ /^#/;
188 1241         3735 my @fields = split('\t', $line);
189 1241         1969 $types{ $fields[2] } += 1;
190 1241         2868 $count++;
191             }
192 11         390 $fh->close;
193 11         351 return join(',', keys %types);
194             }
195              
196              
197             sub parse_headers {
198 60     60 1 118 my $self = shift;
199 60   50     240 my $noheader = shift || 0; # boolean to indicate no headers are present
200            
201             # filehandle
202 60         171 my $fh;
203 60 50 33     267 if (exists $self->{fh} and $self->{fh}) {
    0          
204 60         111 $fh = $self->{fh};
205             }
206             elsif ($self->filename) {
207 0         0 $fh = $self->open_to_read_fh($self->filename);
208             }
209             else {
210 0         0 confess " file metadata and/or open filehandle must be set before parsing headers!";
211             }
212            
213             # check that we have an empty table
214 60 50 33     203 if ($self->last_row != 0 or $self->number_columns != 0) {
215 0         0 cluck "Cannot parse file headers onto an existing data table!";
216 0         0 return;
217             }
218            
219             # read and parse the file
220             # we will ONLY parse the header lines prefixed with a #, as well as the
221             # first data row which contains the column names
222 60         258 $self->program(undef); # reset this to blank, it will be filled by file metadata
223 60         100 my $header_line_count = 0;
224 60         1611 my $line = $fh->getline; # first line
225            
226             # check the first line for proper line endings
227             {
228 60         6116 my $line2 = $line;
  60         128  
229 60         142 chomp($line2);
230 60 50       277 if ($line2 =~ /[\r\n]+/) {
231 0         0 my $filename = $self->filename;
232 0         0 die "File '$filename' does not have expected line endings!\n" .
233             " Try converting to native line endings and try again\n";
234             }
235             }
236            
237 60         139 while ($line) {
238             # we are not chomping the line here because of possible side effects
239             # with UCSC tables where we have to count elements in the first line
240             # and potential header lines, and the first line has a null value at
241             # the end
242            
243             # Parse the datafile metadata headers
244            
245             # no real line, just empty space
246 137 50       2902 if ($line !~ m/\w+/) {
    50          
    100          
    100          
    50          
    100          
    50          
    100          
    100          
247 0         0 $header_line_count++;
248 0         0 next;
249             }
250            
251             # the generating program
252             elsif ($line =~ m/^# Program (.+)$/) {
253 0         0 my $p = $1;
254 0         0 $self->program($p);
255 0         0 $header_line_count++;
256             }
257            
258             # the source database
259             elsif ($line =~ m/^# Database (.+)$/) {
260 10         33 my $d = $1;
261 10         43 $self->database($d);
262 10         20 $header_line_count++;
263             }
264            
265             # the type of feature in this datafile
266             elsif ($line =~ m/^# Feature (.+)$/) {
267 10         28 my $f = $1;
268 10         34 $self->feature($f);
269 10         22 $header_line_count++;
270             }
271            
272             # column or dataset specific information
273             elsif ($line =~ m/^# Column_(\d+)/) {
274             # the column number will become the index number
275 0         0 my $index = $1;
276 0         0 $self->add_column_metadata($line, $index);
277 0         0 $header_line_count++;
278             }
279            
280             # gff version header
281             elsif ($line =~ /^##gff-version\s+([\d\.]+)$/) {
282             # store the gff version in the hash
283             # this may or may not be present in the gff file, but want to keep
284             # it if it is
285 5         19 my $g = $1;
286 5         36 $self->gff($g);
287 5         11 $header_line_count++;
288             }
289            
290             # VCF version header
291             elsif ($line =~ /^##fileformat=VCFv([\d\.]+)$/) {
292             # store the VCF version in the hash
293             # this may or may not be present in the vcf file, but want to keep
294             # it if it is
295 0         0 my $v = $1;
296 0         0 $self->vcf($v);
297 0         0 $self->add_comment($line); # so that it is written properly
298 0         0 $header_line_count++;
299             }
300            
301             # any other nonstandard header
302             elsif ($line =~ /^#/) {
303 44         196 $self->add_comment($line);
304 44         58 $header_line_count++;
305             }
306            
307             # a track or browser line
308             elsif ($line =~ /^(?:track|browser)\s+/i) {
309             # common with wig, bed, or bedgraph files for use with UCSC genome browser
310             # treat as a comment line, there's not that much useful info here
311 8 50       39 if ($line =~ /type=(\w+)/i) {
312 8         52 $self->format($1);
313             }
314 8         40 $self->add_comment($line);
315 8         14 $header_line_count++;
316             }
317            
318             # the remainder is the data table itself
319             else {
320             # the first row in the data table are (usually) the column names
321             # we only want the names, not the rest of the table
322            
323             # specific file formats have implicit predefined column formats
324             # these file formats do NOT have column headers
325             # we will first check for those file formats and process accordingly
326            
327             ### Data tables with a commented header line
328 60 50       269 if ( $self->_commented_header_line($line) ) {
329             # these will have one comment line marked with #
330             # that really contains the column headers
331            
332             # process the real header line
333 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
334             }
335             # we will continue here in case the commented header line was part of a
336             # formatted file type, which will be checked by extension or possibly
337             # the format (if determined) below
338 60   66     197 my $format = $self->format || $self->extension;
339            
340             ### a GFF file
341 60 100       419 if ($format =~ /g[tf]f/i) {
    100          
    100          
    50          
    50          
342 5         30 $self->add_gff_metadata;
343             }
344            
345             ### a peak file
346             elsif ($format =~ /peak/i) {
347 16         44 my $count = scalar(split '\t', $line);
348 16         79 $self->add_peak_metadata($count);
349             }
350            
351             ### a Bed or BedGraph file
352             elsif ($format =~ /bg|bdg|bed/i) {
353 26         68 my $count = scalar(split '\t', $line);
354 26         114 $self->add_bed_metadata($count);
355             }
356            
357             ### a UCSC gene table
358             elsif ($format =~ /ref+lat|genepred|ucsc/i) {
359 0         0 my $count = scalar(split '\t', $line);
360 0         0 $self->add_ucsc_metadata($count);
361             }
362            
363             ### a SGR file
364             elsif ($format =~ /sgr/i) {
365 0         0 $self->add_sgr_metadata;
366             }
367            
368             ### standard text file with headers, i.e. everything else
369 60 100       256 unless ($self->number_columns) {
370             # we have not yet parsed the row of data column names
371             # we will do so now
372 13         55 $self->add_standard_metadata($line);
373            
374             # count as a header line
375 13         20 $header_line_count++;
376             }
377             }
378            
379             # keep processing by going to the next line until we have identified columns
380 137 100       327 if ($self->number_columns == 0) {
381 77         1475 $line = $fh->getline;
382             }
383             else {
384 60         164 undef $line;
385             }
386             }
387            
388             # if we didn't find columns, check that it wasn't actually commented
389             # for example, an empty vcf file
390 60 0 50     131 if ($self->number_columns == 0 and
      0        
391 0         0 scalar(@{ $self->{comments} }) and
392             $self->{comments}->[-1] =~ /\t/
393             ) {
394             # process the real header line
395 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
396             }
397            
398            
399             # No header was requested
400 60 0 33     167 if ($noheader and not $self->bed and not $self->gff and
      33        
      0        
      0        
401             not $self->vcf and not $self->ucsc
402             ) {
403             # which means that what we used as a header is actually the first data row
404            
405             # fix the column names
406 0         0 for (my $i = 0; $i < $self->number_columns; $i++) {
407 0         0 my $name = $self->name($i);
408 0         0 $self->name($i, "Column$i ($name)");
409 0         0 $self->{$i}{'AUTO'} = 3;
410             }
411             # adjust metadata
412 0         0 $header_line_count -= 1;
413 0         0 $self->{'headers'} = -1; # special case, we never write headers here
414             }
415            
416             # Header sanity check
417             # some exported file formats, such as from R, do not include a proper
418             # header for the first column, as these are assumed to be row names
419             # this will result in incorrectly parsed files where the last columns
420             # will be merged into one column with an internal tab - not good
421             # need to handle these
422 60         1364 my $nextline = $fh->getline;
423 60 50       1634 if ($nextline) {
424 60         281 my @nextdata = split '\t', $nextline;
425 60 50       211 if (scalar(@nextdata) - 1 == $self->number_columns) {
426             # whoops! we caught a off-by-one discrepancy between header and data row
427 0         0 my $old_last = $self->last_column;
428 0         0 $fh->close; # having this open complicates changing columns....
429            
430             # add a new "column" (just metadata for now) and move it to the beginning
431 0         0 my $i = $self->add_column('Column1');
432 0         0 $self->reorder_column($i, 0 .. $old_last);
433             }
434             }
435            
436             # re-open the file
437             # I tried using seek functions - but they don't work with binary gzip
438             # files, and I can't get the seek function to return the same position
439             # as simply advancing through the file like below
440             # so I'll just do it the old way and close/open and advance
441 60 50       300 $fh->close if $fh; # may have been closed from the sanity check above
442 60         1661 $fh = $self->open_to_read_fh;
443 60         229 for (1 .. $header_line_count) {
444 90         3406 my $line = $fh->getline;
445             }
446 60         1215 $self->{header_line_count} = $header_line_count;
447 60         120 $self->{fh} = $fh;
448 60         152 return 1;
449             }
450              
451              
452             sub add_data_line {
453 574     574 1 993 my ($self, $line) = @_;
454            
455             # do not chomp the line yet, just split into an array
456 574         2776 my @linedata = split '\t', $line, $self->{number_columns};
457            
458             # chomp the last element
459             # we do this here to ensure the tab split above gets all of the values
460             # otherwise trailing null values aren't included in @linedata
461 574         1021 chomp $linedata[-1];
462            
463             # check for extra remaining tabs
464 574 50       1262 if (index($linedata[-1], "\t") != -1) {
465 0         0 die "FILE INCONSISTENCY ERRORS! line has additional tabs (columns) than headers!\n $line";
466             }
467            
468             # add the line of data
469 574         751 push @{ $self->{data_table} }, \@linedata;
  574         1241  
470 574         948 $self->{last_row} += 1;
471 574         4371 return 1;
472             }
473              
474              
475             ### Parse the filename using the list suffix list
476             sub add_file_metadata {
477 70     70 1 170 my ($self, $filename) = @_;
478 70 50       174 confess "no valid filename!" unless defined $filename;
479 70         2770 my ($basename, $path, $extension) = fileparse($filename, $SUFFIX);
480 70 50       264 unless ($extension) {
481             # look for a nonstandard extension, allowing for .gz extension
482 0 0       0 if ($filename =~ /(\.\w+(?:\.gz)?)$/i) {
483 0         0 $extension = $1;
484 0         0 $basename =~ s/$extension\Z//;
485             }
486             }
487 70         168 $self->{filename} = $filename;
488 70         148 $self->{basename} = $basename;
489 70         120 $self->{path} = $path;
490 70         167 $self->{extension} = $extension;
491             }
492              
493              
494             ### Write out a data file
495             sub write_file {
496 8     8 1 20 my $self = shift;
497            
498             # collect passed arguments
499 8         16 my %args;
500 8 100       32 if (scalar(@_) == 1) {
501 7         19 $args{'filename'} = $_[0];
502             }
503             else {
504 1         5 %args = @_;
505             }
506 8   0     29 $args{'filename'} ||= $args{'file'} || undef;
      33        
507 8   50     47 $args{'format'} ||= undef;
508 8 50       24 unless (exists $args{'gz'}) {$args{'gz'} = undef}
  8         22  
509            
510             # check the data
511 8 50       35 unless ($self->verify) {
512 0         0 cluck "bad data structure!";
513 0         0 return;
514             }
515            
516             # check filename
517 8 50 33     31 unless ($args{'filename'} or $self->filename) {
518 0         0 cluck "no filename given!\n";
519 0         0 return;
520             }
521            
522             # split filename into its base components
523             my ($name, $path, $extension) =
524 8   33     298 fileparse($args{'filename'} || $self->filename, $SUFFIX);
525            
526             # Adjust filename extension if necessary
527 8 50       102 if ($extension =~ /(g[tf]f)/i) {
    100          
    50          
    50          
    50          
    50          
    0          
528 0 0       0 if (not $self->gff) {
529             # let's set it to true and see if it passes verification
530 0 0       0 $self->{'gff'} = $extension =~ /gtf/i ? 2.5 : 3; # default
531 0 0 0     0 unless ($self->verify and $self->gff) {
532 0         0 warn " GFF structure changed, re-setting extension from $extension to .txt\n";
533 0         0 $extension =~ s/g[tf]f3?/txt/i;
534             }
535             }
536             }
537             elsif ($extension =~ /bedgraph|bed|bdg|narrowpeak|broadpeak/i) {
538 3 100       13 if (not $self->bed) {
539             # let's set it to true and see if it passes verification
540 1         4 $self->{'bed'} = 1; # a fake true
541 1 50 33     4 unless ($self->verify and $self->bed) {
542 0         0 warn " BED structure changed, re-setting extension from $extension to .txt\n";
543 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
544             }
545             }
546             }
547             elsif ($extension =~ /vcf/i) {
548 0 0       0 if (not $self->vcf) {
549             # let's set it to true and see if it passes verification
550 0         0 $self->{'vcf'} = 1; # a fake true
551 0 0 0     0 unless ($self->verify and $self->vcf) {
552 0         0 warn " VCF structure changed, re-setting extension from $extension to .txt\n";
553 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
554             }
555             }
556             }
557             elsif ($extension =~ /sgr/i) {
558 0 0       0 unless ($self->{'extension'} =~ /sgr/i) {
559             # original file was not SGR
560             # let's pretend it was and see if still passes
561             # the sgr verification relies on the recorded extension
562 0         0 $self->{'extension'} = '.sgr';
563 0         0 $self->verify;
564 0 0       0 if ($self->extension =~ /txt/) {
565 0         0 warn " SGR structure changed, re-setting extension from $extension to .txt\n";
566             }
567 0         0 $extension = $self->{'extension'};
568             }
569             }
570             elsif ($extension =~ /reff?lat|genepred|ucsc/i) {
571 0 0       0 if ($self->ucsc != $self->number_columns) {
572             # it's not set as a ucsc data
573             # let's set it to true and see if it passes verification
574 0         0 $self->ucsc($self->number_columns);
575 0 0 0     0 unless ($self->verify and $self->ucsc) {
576 0         0 warn " UCSC structure changed, re-setting extension from $extension to .txt\n";
577 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
578             }
579             }
580             }
581             elsif ($extension =~ /txt/i) {
582             # plain old text file, sounds good to me
583             # make sure headers are enabled
584 5         13 $self->{'headers'} = 1;
585             }
586             elsif (not $extension) {
587             # no extension was available
588             # try and determine one from metadata
589            
590 0 0       0 if ($self->gff) {
    0          
    0          
    0          
    0          
    0          
591 0 0       0 $extension = $self->gff == 3 ? '.gff3' : $self->gff == 2.5 ? '.gtf' : '.gff';
    0          
592             }
593             elsif ($self->bed) {
594 0 0 0     0 if ($self->format) {
    0          
595             # re-use the format value as the extension
596 0         0 $extension = sprintf(".%s", $self->format);
597             }
598             elsif (
599             $self->number_columns == 4 and
600             $self->name(3) =~ /score/i
601             ) {
602 0         0 $extension = '.bdg'; # looks like a bedGraph file
603             }
604             else {
605 0         0 $extension = '.bed'; # a regular bed file
606             }
607             }
608             elsif ($self->ucsc) {
609 0 0       0 if ($self->format) {
610             # re-use the format value as the extension
611 0         0 $extension = sprintf(".%s", $self->format);
612             }
613             else {
614             # use a generic ucsc format
615 0         0 $extension = '.ucsc';
616             }
617             }
618             elsif ($self->vcf) {
619 0         0 $extension = '.vcf';
620             }
621             elsif ($name =~ /(\.\w{3}(?:\.gz)?)$/i) {
622             # a non-standard 3 letter file extension
623             # anything else might be construed as part of the filename, so run the
624             # risk of adding a default extension below
625 0         0 $extension = $1;
626 0         0 $name =~ s/$extension\Z//;
627             }
628             elsif ($self->extension) {
629             # original file had an extension, re-use it if appropriate
630             # why wouldn't this get picked up above???? probably old cruft,
631             # or a non-standard or unknown file extension
632             # leave it in for the time being, shouldn't hurt anything
633 0 0       0 if ($self->extension =~ /g[tf]f/i) {
    0          
634 0 0       0 $extension = $self->gff ? $self->extension : '.txt';
635             }
636             elsif ($self->extension =~ /bed|bdg|peak/i) {
637 0 0       0 $extension = $self->bed ? $self->extension : '.txt';
638             }
639             else {
640             # an unstructured format
641 0         0 $extension = $self->extension;
642             }
643             }
644             else {
645             # normal data text file
646 0         0 $extension = '.txt';
647             }
648             }
649             # otherwise the extension must be good, hope for the best
650            
651             # determine format
652             # this is an arcane specification of whether we want a "simple" no metadata
653             # format, or an ordinary text format that may or may not have metadata
654             # it's currently not hurting much, so leave it in for now?
655 8 50       29 unless ($args{'format'}) {
656 8 50       32 if (defined $args{'simple'}) {
    50          
657             # an old method of specifying simple
658 0         0 $args{'format'} = 'simple';
659             }
660             elsif ($extension) {
661             # check extension from the parsed filename, if present
662 8 50       38 if ($extension =~ /sgr|cdt/i) {
663             # sgr is simple format, no headers
664 0         0 $args{'format'} = 'simple';
665             }
666             else {
667             # everything else is text
668 8         20 $args{'format'} = 'text';
669             }
670             }
671             else {
672             # somehow we got this far without defining? use default text
673 0         0 $args{'format'} = 'text';
674             }
675             }
676            
677             # check zip status if necessary
678 8 50       25 unless (defined $args{'gz'}) {
679             # look at filename extension as a clue
680             # in case we're overwriting the input file, keep the zip status
681 8 50       33 if ($extension =~ m/\.vcf\.gz/i) {
    50          
682             # vcf requires bgzip
683 0         0 $args{'gz'} = 2;
684             }
685             elsif ($extension =~ m/\.gz$/i) {
686 0         0 $args{'gz'} = 1;
687             }
688             else {
689 8         26 $args{'gz'} = 0; # default
690             }
691             }
692            
693             # adjust gzip extension as necessary
694 8 50 33     61 if ($args{'gz'} and $extension !~ m/\.gz$/i) {
    50 33        
695 0         0 $extension .= '.gz';
696             }
697             elsif (not $args{'gz'} and $extension =~ /\.gz$/i) {
698 0         0 $extension =~ s/\.gz$//i;
699             }
700            
701             # check filename length
702             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
703             # don't even get me started on Windows NTFS path length limitation
704 8 50       32 if (length($name . $extension) > 255) {
705 0         0 my $limit = 253 - length($extension);
706 0         0 $name = substr($name, 0, $limit) . '..';
707 0         0 warn " filename too long! Truncating to $limit characters\n";
708             }
709            
710             # generate the new filename
711 8         39 my $newname = $path . $name . $extension;
712            
713            
714             # Convert strand information
715 8         52 my $strand_i = $self->strand_column;
716 8 0 0     27 if (defined $strand_i and ($self->gff or $self->bed or $self->ucsc) ) {
      33        
717             # convert to +/-/. nomenclature as necessary
718 0 0 0     0 if ($self->gff) {
    0          
719 0         0 for my $row (1 .. $self->last_row) {
720 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
721 0 0       0 if ($s =~ /\d/) {
722 0 0       0 $s = $s == 1 ? '+' : $s == -1 ? '-' : '.';
    0          
723             }
724 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
725             }
726             }
727             elsif ($self->bed or $self->ucsc) {
728 0         0 for my $row (1 .. $self->last_row) {
729 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
730 0 0       0 if ($s =~ /\d/) {
731 0 0       0 $s = $s >= 0 ? '+' : '-';
732             }
733 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
734             }
735             }
736             }
737            
738            
739             # Open file for writing
740 8         51 my $fh = $self->open_to_write_fh($newname, $args{'gz'});
741 8 50       26 return unless defined $fh;
742            
743            
744             # Write the headers
745 8 50       35 if ($args{'format'} eq 'text') {
746             # default text format has metadata headers
747            
748             # write gff statement if gff format
749 8 50       32 if ($self->gff) {
750 0         0 $fh->print('##gff-version ' . $self->gff . "\n");
751             }
752            
753             # Write the primary headers
754 8 50 66     35 unless (
      66        
      66        
      33        
755             $self->gff or $self->bed or $self->ucsc or $self->vcf or
756             $extension =~ m/sgr|kgg|cdt|peak/i
757             ) {
758             # we only write these for normal text files, not defined format files
759            
760 5 50       19 if ($self->program) {
761 0         0 $fh->print('# Program ' . $self->program . "\n");
762             }
763 5 50       17 if ($self->database) {
764 5         22 $fh->print('# Database ' . $self->database . "\n");
765             }
766 5 50       136 if ($self->feature) {
767 5         15 $fh->print('# Feature ' . $self->feature . "\n");
768             }
769             }
770            
771             # Write the miscellaneous headers
772 8         43 foreach ( @{ $self->{'comments'} } ) {
  8         31  
773             # write remaining miscellaneous header lines if present
774             # we do this for all files
775 6 100       63 unless (/\n$/s) {
776             # append newline if not present
777 4         11 $_ .= "\n";
778             }
779             # check for comment character at beginning
780 6 100       19 if (/^#/) {
781 5         33 $fh->print($_);
782             }
783             else {
784 1         5 $fh->print("# " . $_);
785             }
786             }
787            
788             # Write the column metadata headers
789 8         43 for (my $i = 0; $i < $self->number_columns; $i++) {
790             # each column metadata in the hash is referenced by the column's
791             # index number as the key
792             # we will take each index one at a time in increasing order
793            
794             # some files do not need or tolerate metadata lines, for those
795             # known files the metadata lines will be skipped
796            
797             # these column metadata lines do not need to be written if they
798             # only have two values, presumably name and index, for files
799             # that don't normally have column headers, e.g. gff
800 22 100 66     71 if (
    50 0        
    0 0        
801             exists $self->{$i}{'AUTO'} and
802 8         26 scalar( keys %{ $self->{$i} } ) == $self->{$i}{'AUTO'}
803             ) {
804             # some of the metadata values were autogenerated and
805             # we have the same number of keys as were autogenerated
806             # no need to write these
807 8         19 next;
808             }
809 14         59 elsif (scalar( keys %{ $self->{$i} } ) == 2) {
810             # only two metadata keys exist, name and index
811             # these are so simple it's not worth writing them
812 14         40 next;
813             }
814             elsif ($extension =~ /sgr|kgg|cdt/i or $self->ucsc or $self->vcf) {
815             # these do not support metadata lines
816 0         0 next;
817             }
818            
819             # we will put each key=value pair into @pairs, listed asciibetically
820 0         0 my @pairs; # an array of the key value pairs from the metadata hash
821             # put name first
822             # we are no longer writing the index number
823 0         0 push @pairs, 'name=' . $self->{$i}{'name'};
824             # put remainder in alphabetical order
825 0         0 foreach (sort {$a cmp $b} keys %{ $self->{$i} } ) {
  0         0  
  0         0  
826 0 0       0 next if $_ eq 'name'; # already written
827 0 0       0 next if $_ eq 'index'; # internal use only
828 0 0       0 next if $_ eq 'AUTO'; # internal use only
829 0         0 push @pairs, $_ . '=' . $self->{$i}{$_};
830             }
831            
832             # Finally write the header line, joining the pairs with a
833             # semi-colon into a single string.
834             # The column identifier is comprised of the word 'Column'
835             # and the index number joined by '_'.
836 0         0 $fh->print("# Column_$i ", join(";", @pairs), "\n");
837             }
838             }
839            
840            
841             # Write the table column headers
842 8 100       40 if ($self->{'headers'} == 1) {
843 5         12 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[0] }));
  5         34  
844             }
845            
846            
847             # Write the data table
848 8 50       69 if ($args{'format'} eq 'simple') {
849            
850             # the simple format will strip the non-value '.' from the table
851 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
852             # we will step though the data_table array one row at a time
853             # convert the non-value '.' to undefined
854             # and print using a tab-delimited format
855 0         0 my @linedata;
856 0         0 foreach ( @{ $self->{'data_table'}[$i] }) {
  0         0  
857 0 0       0 if ($_ eq '.') {
858 0         0 push @linedata, undef;
859             } else {
860 0         0 push @linedata, $_;
861             }
862             }
863 0         0 $fh->printf("%s\n", join("\t", @linedata));
864             }
865             }
866            
867             else {
868             # normal data files
869 8         30 for (my $i = 1; $i <= $self->last_row; $i++) {
870             # we will step though the data_table array one row at a time
871             # we will join each row's array of elements into a string to print
872             # using a tab-delimited format
873 104         145 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[$i] }));
  104         280  
874             }
875             }
876            
877             # done writing
878 8         45 $fh->close;
879            
880             # if we made it this far, it should've been a success!
881             # return the new file name as indication of success
882 8         471 return $newname;
883             }
884              
885             sub save {
886 6     6 1 40 return shift->write_file(@_);
887             }
888              
889              
890             #### Open a file for reading
891             sub open_to_read_fh {
892 161     161 1 305 my $self = shift;
893 161   100     555 my $file = shift || undef;
894 161 100       641 my $obj = ref($self) =~ /^Bio::ToolBox/ ? 1 : 0;
895            
896             # check file
897 161 100 66     611 if (not $file and $obj) {
898 120   50     401 $file = $self->filename || undef;
899             }
900 161 50       394 unless ($file) {
901 0         0 carp " no filename provided or associated with object!";
902 0         0 return;
903             }
904            
905             # Open filehandle object as appropriate
906 161         229 my $fh;
907 161 50       695 if ($file =~ /\.gz$/i) {
    50          
908             # the file is compressed with gzip
909 0 0       0 $fh = IO::File->new("gzip -dc $file |") or
910             carp "unable to read '$file' $!\n";
911             }
912             elsif ($file =~ /\.bz2$/i) {
913             # the file is compressed with bzip2
914 0 0       0 $fh = IO::File->new("bzip2 -dc $file |") or
915             carp "unable to read '$file' $!\n";
916             }
917             else {
918             # the file is uncompressed and space hogging
919 161 50       873 $fh = IO::File->new($file, 'r') or
920             carp "unable to read '$file' $!\n";
921             }
922            
923 161 100       17658 if ($obj) {
924 120         353 $self->{fh} = $fh;
925             }
926 161         694 return $fh;
927             }
928              
929              
930             #### Open a file for writing
931             sub open_to_write_fh {
932 10     10 1 50 my ($self, $filename, $gz, $append) = @_;
933            
934             # check filename
935 10 50       25 unless ($filename) {
936 0         0 carp " no filename to write!";
937 0         0 return;
938             }
939            
940             # check filename length
941             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
942 10         164 my $name = fileparse($filename);
943 10 50       41 if (length $name > 255) {
944 0         0 carp " filename is too long! please shorten\n";
945 0         0 return;
946             }
947            
948             # check zip status if necessary
949 10 100       27 unless (defined $gz) {
950             # look at filename extension as a clue
951             # in case we're overwriting the input file, keep the zip status
952 2 50       19 if ($filename =~ m/\.vcf(\.gz)?$/i) {
    50          
953 0         0 $gz = 2; # bgzip
954             }
955             elsif ($filename =~ m/\.gz$/i) {
956 0         0 $gz = 1; # regular gzip
957             }
958             else {
959 2         4 $gz = 0; # default
960             }
961             }
962            
963             # gzip compression application
964 10 50 33     64 if ($gz == 1 and not $gzip_app) {
    50 33        
965             # use parallel gzip if possible
966             # this is stored in a global variable so we only have to look once
967 0         0 $gzip_app = which('pigz');
968 0 0       0 if ($gzip_app) {
969 0         0 $gzip_app .= ' -p 3'; # use a conservative 3 processes, plus perl, so 4 total
970             }
971             else {
972             # default is the standard gzip application
973             # should be available in any application
974 0         0 $gzip_app = which('gzip');
975             }
976 0 0       0 unless ($gzip_app) {
977 0         0 croak "no gzip application in PATH to open compressed file handle output!\n";
978             }
979             }
980             elsif ($gz == 2 and not $bgzip_app) {
981             # use parallel bgzip if possible
982             # this is stored in a global variable so we only have to look once
983 0         0 $bgzip_app = which('bgzip');
984 0 0       0 if ($bgzip_app) {
985             # I'm going to assume this is a recent bgzip with multi-threading
986             # use 3 threads, same as with pigz
987 0         0 $bgzip_app .= ' -@ 3 -c';
988             }
989 0 0       0 unless ($bgzip_app) {
990 0         0 croak "no bgzip application in PATH to open compressed file handle output!\n";
991             }
992             }
993 10 50       38 my $gzipper = $gz == 1 ? $gzip_app : $gz == 2 ? $bgzip_app : undef;
    50          
994            
995             # check file append mode
996 10 100       37 unless (defined $append) {
997             # default is not to append
998 8         16 $append = 0;
999             }
1000            
1001            
1002             # Generate appropriate filehandle object
1003 10         17 my $fh;
1004 10 100 66     77 if (not $gzipper and not $append) {
    50 33        
    50 33        
    0 0        
1005 8 50       60 $fh = IO::File->new($filename, 'w') or
1006             carp "cannot write to file '$filename' $!\n";
1007             }
1008             elsif ($gzipper and !$append) {
1009 0 0       0 $fh = IO::File->new("| $gzipper >$filename") or
1010             carp "cannot write to compressed file '$filename' $!\n";
1011             }
1012             elsif (not $gzipper and $append) {
1013 2 50       13 $fh = IO::File->new(">> $filename") or
1014             carp "cannot append to file '$filename' $!\n";
1015             }
1016             elsif ($gzipper and $append) {
1017 0 0       0 $fh = IO::File->new("| $gzipper >>$filename") or
1018             carp "cannot append to compressed file '$filename' $!\n";
1019             }
1020 10 50       1746 return $fh if defined $fh;
1021             }
1022              
1023              
1024             ### Subroutine to check for file existance
1025             sub check_file {
1026 60     60 1 142 my ($self, $filename) = @_;
1027            
1028             # check for file existance
1029 60 50       1538 if (-e $filename) {
1030             # confirmed full filename and path
1031 60         341 return $filename;
1032             }
1033             else {
1034             # file name is either incomplete or non-existent
1035             # try adding some common file extensions in case those are missing
1036 0         0 my $new_filename;
1037 0         0 foreach my $ext (qw(.gz .txt .txt.gz .bed .bed.gz)) {
1038 0 0       0 if (-e $filename . $ext) {
1039 0         0 $new_filename = $filename . $ext;
1040 0         0 last;
1041             }
1042             }
1043 0         0 return $new_filename;
1044             }
1045             }
1046              
1047              
1048             sub fh {
1049 192     192 0 318 my $self = shift;
1050 192 50       3552 return $self->{fh} if exists $self->{fh};
1051 0         0 return;
1052             }
1053              
1054             sub close_fh {
1055 70     70 0 249 my $self = shift;
1056 70 50 33     532 $self->{fh}->close if (exists $self->{fh} and $self->{fh});
1057             }
1058              
1059              
1060             ### Internal subroutine to check if a comment line contains headers
1061             sub _commented_header_line {
1062 60     60   143 my ($data, $line) = @_;
1063            
1064             # prepare arrays from the other lines and current line
1065 60         113 my @commentdata;
1066 60 100       102 if ( scalar @{ $data->{'comments'} } >= 1 ) {
  60         191  
1067             # take the last line in the other array
1068 30         128 @commentdata = split '\t', $data->{'comments'}->[-1];
1069             }
1070 60         286 my @linedata = split '\t', $line;
1071            
1072             # check if the counts are equal
1073 60 50       170 if (scalar @commentdata == scalar @linedata) {
1074 0         0 return 1;
1075             }
1076             else {
1077 60         220 return 0;
1078             }
1079             }
1080              
1081              
1082             ### Internal subroutine to process metadata for standard columns
1083             sub add_column_metadata {
1084 0     0 1 0 my ($data, $line, $index) = @_;
1085            
1086             # strip the Column metadata identifier
1087 0         0 chomp $line;
1088 0         0 $line =~ s/^# Column_\d+ //;
1089            
1090             # break up the column metadata
1091 0         0 my %temphash; # a temporary hash to put the column metadata into
1092 0         0 foreach (split ';', $line) {
1093 0         0 my ($key, $value) = split '=';
1094 0 0       0 if ($key eq 'index') {
1095 0 0       0 if ($index != $value) {
1096             # the value from the metadata index key should be
1097             # correct, so we will use that
1098 0         0 $index = $value;
1099             }
1100             }
1101             # store the key & value
1102 0         0 $temphash{$key} = $value;
1103             }
1104            
1105             # create a index metadata key if not already present
1106             # the rest of biotoolbox may expect this to be present
1107 0 0       0 unless (exists $temphash{'index'}) {
1108 0         0 $temphash{'index'} = $index;
1109             }
1110            
1111             # store the column metadata hash into the main data hash
1112             # use the index as the key
1113 0 0       0 if (exists $data->{$index}) {
1114             # we will simply overwrite the previous metadata hash
1115             # harsh, I know, but what to do?
1116             # if it was canned metadata for a gff file, that's ok
1117 0         0 warn "Warning: more than one metadata line exists for index $index!\n";
1118 0         0 $data->{$index} = \%temphash;
1119             }
1120             else {
1121             # metadata hash doesn't exist, so we will add it
1122 0         0 $data->{$index} = \%temphash;
1123             }
1124 0         0 return 1;
1125             }
1126              
1127             ### Subroutine to generate metadata for gff files
1128             # gff files have nine defined columns
1129             # there are different specifications and variants:
1130             # gff (v.1), gff v.2, gff v.2.5 (aka gtf), gff v.3 (gff3)
1131             # however, the columns are the same in all versions
1132             # more info on gff can be found http://gmod.org/wiki/GFF3
1133             sub add_gff_metadata {
1134 5     5 1 12 my $self = shift;
1135 5   50     26 my $version = shift || undef;
1136 5   50     26 my $force = shift || 0;
1137            
1138             # set the gff version based on the extension if it isn't already
1139 5 50 33     17 if (not($self->gff) or $force) {
1140 0 0       0 if (defined $version) {
    0          
    0          
1141 0         0 $self->gff($version);
1142             }
1143             elsif ($self->extension =~ /gtf/i) {
1144 0         0 $self->gff(2.5);
1145             }
1146             elsif ($self->extension =~ /gff3/i) {
1147 0         0 $self->gff(3);
1148             }
1149             else {
1150 0         0 $self->gff(2); # hope for the best
1151             }
1152             }
1153            
1154             # set the metadata for the each column
1155             # some of these may already be defined if there was a
1156             # column metadata specific column in the file
1157 5         27 my $column_names = $self->standard_column_names('gff');
1158 5         24 for (my $i = 0; $i < 9; $i++) {
1159             # loop for each column
1160             # set metadata unless it's already loaded
1161 45 50 33     169 if ($force or not exists $self->{$i}) {
1162 45         138 $self->{$i}{'name'} = $column_names->[$i];
1163 45         81 $self->{$i}{'index'} = $i;
1164 45         74 $self->{$i}{'AUTO'} = 3;
1165             }
1166             # assign the name to the column header
1167 45 50 33     134 if ($force or not defined $self->{'column_names'}->[$i]) {
1168 45         132 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1169             }
1170             }
1171 5         21 $self->{data_table}->[0] = $self->{'column_names'};
1172            
1173             # set column number always to 9
1174 5 50 33     35 if ($force or $self->{'number_columns'} == 0) {
1175 5         20 $self->{'number_columns'} = 9;
1176             }
1177            
1178             # set headers flag to false
1179 5 50       31 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1180            
1181             # set the feature type
1182 5 50       21 unless (defined $self->{'feature'}) {
1183 5         9 $self->{'feature'} = 'region';
1184             }
1185 5         19 return 1;
1186             }
1187              
1188              
1189             ### Subroutine to generate metadata for BED files
1190             # bed files have a loose format
1191             # they require a minimum of 3 columns, and have a max of 12
1192             # 3, 6, and 12 column files are the most common
1193             # there are also something called paired bed files floating around
1194             # The official details and specifications may be found at
1195             # http://genome.ucsc.edu/FAQ/FAQformat#format1
1196              
1197             # a special type of bed file is the bedgraph, using
1198             # either a bdg, bedgraph, or simply bed extension
1199             # these only have four columns, no more, no less
1200             # the fourth column is score, not name
1201             sub add_bed_metadata {
1202 27     27 1 63 my ($self, $column_count) = @_;
1203 27   50     78 my $force = shift || 0;
1204            
1205             # check bed type and set metadata appropriately
1206 27         41 my $bed_names;
1207 27 50 33     75 if ($self->format =~ /bedgraph/i or $self->extension =~ /bg|bdg|graph/i) {
1208 0         0 $self->format('bedGraph'); # possibly redundant
1209 0         0 $self->bed($column_count);
1210 0         0 $bed_names = $self->standard_column_names('bdg');
1211             }
1212             else {
1213 27         89 $self->format('bed');
1214 27         108 $self->bed($column_count);
1215 27         98 $bed_names = $self->standard_column_names('bed12');
1216             }
1217 27         79 $self->{'number_columns'} = $column_count;
1218            
1219             # set the metadata for each column
1220             # some of these may already be defined if there was a
1221             # column metadata specific column in the file
1222 27         140 for (my $i = 0; $i < $column_count; $i++) {
1223             # loop for each column
1224             # set name unless it already has one from metadata
1225 200 50 33     396 if ($force or not exists $self->{$i}) {
1226 200   50     566 $self->{$i}{'name'} = $bed_names->[$i] || 'extraColumn';
1227 200         350 $self->{$i}{'index'} = $i;
1228 200         307 $self->{$i}{'AUTO'} = 3;
1229             }
1230             # assign the name to the column header
1231 200 50 33     392 if ($force or not defined $self->{'column_names'}->[$i]) {
1232 200         565 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1233             }
1234             }
1235 27         131 $self->{data_table}->[0] = $self->{'column_names'};
1236              
1237             # set the feature type
1238 27 50       85 unless (defined $self->{'feature'}) {
1239 27         54 $self->{'feature'} = 'region';
1240             }
1241            
1242             # set headers flag to false
1243 27 50       104 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1244 27         83 return 1;
1245             }
1246              
1247              
1248             ### Subroutine to generate metadata for broadpeak and narrowpeak files
1249             # three different types of peak files are available
1250             # see http://genome.ucsc.edu/FAQ/FAQformat.html
1251             sub add_peak_metadata {
1252 16     16 1 40 my ($self, $column_count) = @_;
1253 16   50     42 my $force = shift || 0;
1254              
1255             # check bed type and set metadata appropriately
1256             # most of these are bed6 plus extra columns
1257 16         23 my $column_names;
1258 16 100 66     41 if ($self->format =~ /narrow/i or $self->extension =~ /narrow/i) {
    50 33        
    50 33        
1259 8         30 $self->format('narrowPeak');
1260 8         33 $self->bed($column_count);
1261 8         29 $column_names = $self->standard_column_names('narrowpeak');
1262             }
1263             elsif ($self->format =~ /broad/i or $self->extension =~ /broad/i) {
1264 0         0 $self->format('broadPeak'); # possibly redundant
1265 0         0 $self->bed($column_count);
1266 0         0 $column_names = $self->standard_column_names('broadpeak');
1267             }
1268             elsif ($self->format =~ /gapped/i or $self->extension =~ /gapped/i) {
1269 8         45 $self->format('gappedPeak'); # possibly redundant
1270 8         29 $self->bed($column_count);
1271 8         25 $column_names = $self->standard_column_names('gappedpeak');
1272             }
1273             else {
1274             # how did we get here???? Hope for the best.....
1275 0         0 $self->bed($column_count);
1276 0         0 $column_names = $self->standard_column_names('bed12');
1277             }
1278 16         42 $self->{'number_columns'} = $column_count;
1279            
1280             # add metadata
1281 16         48 for (my $i = 0; $i < $column_count; $i++) {
1282 200 50 33     385 if ($force or not exists $self->{$i}) {
1283 200   50     548 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1284 200         319 $self->{$i}{'index'} = $i;
1285 200         313 $self->{$i}{'AUTO'} = 3;
1286             }
1287             # assign the name to the column header
1288 200 50 33     356 if ($force or not defined $self->{'column_names'}->[$i]) {
1289 200         465 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1290             }
1291             }
1292 16         33 $self->{data_table}->[0] = $self->{'column_names'};
1293            
1294             # set the feature type
1295 16 50       40 unless (defined $self->{'feature'}) {
1296 16         34 $self->{'feature'} = 'region';
1297             }
1298            
1299             # set headers flag to false
1300 16 50       51 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1301 16         49 return 1;
1302             }
1303              
1304              
1305             ### Subroutine to generate metadata for various UCSC gene files
1306             # these are tricky, as we will try to determine contents by counting
1307             # the columns, which may not be accurate
1308             # not only that, but this presumes the extension even makes it recognizable
1309             # see http://genome.ucsc.edu/FAQ/FAQformat.html#format9 for details
1310             # also biotoolbox script ucsc_table2gff3.pl
1311             sub add_ucsc_metadata {
1312 3     3 1 10 my ($self, $column_count) = @_;
1313 3   50     12 my $force = shift || 0;
1314            
1315             # set metadata
1316 3         5 $self->{'number_columns'} = $column_count;
1317 3         8 $self->{'ucsc'} = $column_count;
1318            
1319             # set format and determine column names;
1320 3         5 my $column_names;
1321 3 50       8 if ($column_count == 16) {
    0          
    0          
    0          
    0          
1322 3         11 $self->format('genePredExt');
1323 3         9 $column_names = $self->standard_column_names('ucsc16');
1324             }
1325             elsif ($column_count == 15) {
1326 0         0 $self->format('genePredExt');
1327 0         0 $column_names = $self->standard_column_names('ucsc15');
1328             }
1329             elsif ($column_count == 12) {
1330 0         0 $self->format('knownGene');
1331 0         0 $column_names = $self->standard_column_names('ucsc12');
1332             }
1333             elsif ($column_count == 11) {
1334 0         0 $self->format('refFlat');
1335 0         0 $column_names = $self->standard_column_names('ucsc11');
1336             }
1337             elsif ($column_count == 10) {
1338 0         0 $self->format('genePred');
1339 0         0 $column_names = $self->standard_column_names('ucsc10');
1340             }
1341            
1342             # assign the column names and metadata
1343 3         11 for (my $i = 0; $i < $column_count; $i++) {
1344             # loop for each column
1345             # set name unless it already has one from metadata
1346 48 50 33     86 if ($force or not exists $self->{$i}) {
1347 48   50     94 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1348 48         67 $self->{$i}{'index'} = $i;
1349 48         68 $self->{$i}{'AUTO'} = 3;
1350             }
1351             # assign the name to the column header
1352 48 50 33     88 if ($force or not defined $self->{'column_names'}->[$i]) {
1353 48         97 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1354             }
1355             }
1356 3         7 $self->{data_table}->[0] = $self->{'column_names'};
1357            
1358             # set the feature type
1359 3 50       8 unless (defined $self->{'feature'}) {
1360 3         8 $self->{'feature'} = 'gene';
1361             }
1362            
1363             # set headers flag to false
1364 3 50       13 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1365 3         8 return 1;
1366             }
1367              
1368              
1369             ### Subroutine to generate metadata for SGR files
1370             # a sgr file contains three columns: chromo, position, score
1371             # this is a very simple file format, useful in exporting and
1372             # importing to binary BAR files used in T2, USeq, and IGB
1373             sub add_sgr_metadata {
1374 0     0 1 0 my $self = shift;
1375 0   0     0 my $force = shift || 0;
1376            
1377             # set column metadata
1378 0         0 my $column_names = $self->standard_column_names('sgr');
1379 0         0 for (my $i = 0; $i < 3; $i++) {
1380             # loop for each column
1381             # set name unless it already has one from metadata
1382 0 0 0     0 if ($force or not exists $self->{$i}) {
1383 0   0     0 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1384 0         0 $self->{$i}{'index'} = $i;
1385 0         0 $self->{$i}{'AUTO'} = 3;
1386             }
1387             # assign the name to the column header
1388 0 0 0     0 if ($force or not defined $self->{'column_names'}->[$i]) {
1389 0         0 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1390             }
1391             }
1392 0         0 $self->{data_table}->[0] = $self->{'column_names'};
1393 0         0 $self->{'number_columns'} = 3;
1394              
1395              
1396             # set headers flag to false
1397 0 0       0 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1398            
1399             # set the feature type
1400 0 0       0 unless (defined $self->{'feature'}) {
1401 0         0 $self->{'feature'} = 'region';
1402             }
1403 0         0 return 1;
1404             }
1405              
1406              
1407             ### Internal subroutine to generate metadata for standard files
1408             sub add_standard_metadata {
1409 13     13 1 26 my ($self, $line) = @_;
1410            
1411 13         51 my @namelist = split '\t', $line;
1412 13         28 chomp $namelist[-1];
1413            
1414             # we will define the columns based on
1415 13         38 for my $i (0..$#namelist) {
1416             # make up a name if one doesn't exist
1417 68   66     140 $namelist[$i] ||= "Column_$i";
1418            
1419             # confirm that a file metadata exists for this column
1420 68 50       122 if (exists $self->{$i}) {
1421 0 0       0 unless ($namelist[$i] eq $self->{$i}->{'name'}) {
1422 0         0 warn "metadata and header names for column $i do not match!";
1423             # set the name to match the actual column name
1424 0         0 $self->{$i}->{'name'} = $namelist[$i];
1425             }
1426             }
1427            
1428             # otherwise be nice and generate it here
1429             else {
1430 68         257 $self->{$i} = {
1431             'name' => $namelist[$i],
1432             'index' => $i,
1433             'AUTO' => 3,
1434             };
1435             }
1436             }
1437            
1438             # check the number of columns
1439 13 50       47 if (scalar @namelist != $self->{'number_columns'} ) {
1440             # adjust to match actual content
1441 13         31 $self->{'number_columns'} = scalar @namelist;
1442             }
1443            
1444             # put the column names in the metadata
1445 13         33 $self->{'column_names'} = \@namelist;
1446 13         77 $self->{data_table}->[0] = $self->{'column_names'};
1447            
1448             # set headers flag to true
1449 13         29 $self->{'headers'} = 1;
1450 13         26 return 1;
1451             }
1452              
1453              
1454             ### Internal subroutine to generate hash of standard file format column names
1455             sub standard_column_names {
1456 51     51 1 167 my ($self, $type) = @_;
1457            
1458 51 100 0     241 if ($type eq 'gff') {
    100 0        
    50 0        
    50 0        
    100          
    50          
    100          
    50          
    50          
    0          
    0          
    0          
    0          
1459 5         29 return [qw(Chromosome Source Type Start Stop Score Strand Phase Group)];
1460             }
1461             elsif ($type eq 'bed12') {
1462 27         128 return [qw(Chromosome Start0 End Name Score Strand
1463             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0)];
1464             }
1465             elsif ($type eq 'bed6') {
1466 0         0 return [qw(Chromosome Start0 End Name Score Strand)];
1467             }
1468             elsif ($type eq 'bdg') {
1469 0         0 return [qw(Chromosome Start0 End Score)];
1470             }
1471             elsif ($type eq 'narrowpeak') {
1472 8         44 return [qw(Chromosome Start0 End Name Score Strand signalValue
1473             pValue qValue peak)];
1474             }
1475             elsif ($type eq 'broadpeak') {
1476 0         0 return [qw(Chromosome Start0 End Name Score Strand signalValue
1477             pValue qValue)];
1478             }
1479             elsif ($type eq 'gappedpeak') {
1480 8         45 return [qw(Chromosome Start0 End Name Score Strand
1481             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0
1482             signalValue pValue qValue)];
1483             }
1484             elsif ($type eq 'sgr') {
1485 0         0 return [qw(Chromo Start Score)];
1486             }
1487             elsif ($type eq 'ucsc16') {
1488 3         17 return [qw(bin name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1489             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1490             }
1491             elsif ($type eq 'ucsc15' or $type eq 'genepredext') {
1492 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1493             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1494             }
1495             elsif ($type eq 'ucsc12' or $type eq 'knowngene') {
1496 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1497             exonStarts0 exonEnds proteinID alignID)];
1498             }
1499             elsif ($type eq 'ucsc11' or $type eq 'refflat') {
1500 0           return [qw(geneName transcriptName chrom strand txStart0 txEnd cdsStart0
1501             cdsEnd exonCount exonStarts0 exonEnds)];
1502             }
1503             elsif ($type eq 'ucsc10' or $type eq 'genepred') {
1504 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1505             exonStarts exonEnds)];
1506             }
1507             else {
1508 0           confess "unrecognized standard column name format '$type'!";
1509             }
1510             }
1511              
1512              
1513             __END__