File Coverage

blib/lib/Bio/ToolBox/Data/file.pm
Criterion Covered Total %
statement 365 638 57.2
branch 192 486 39.5
condition 65 211 30.8
subroutine 25 27 92.5
pod 19 21 90.4
total 666 1383 48.1


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::file;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::file - File functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             File methods for reading and writing data files for both L
11             and L objects. This module should not be used
12             directly. See the respective modules for more information.
13              
14             =cut
15              
16 4     4   24 use strict;
  4         5  
  4         102  
17 4     4   14 use Carp qw(carp cluck croak confess);
  4         6  
  4         151  
18 4     4   16 use File::Basename qw(fileparse);
  4         7  
  4         163  
19 4     4   1405 use File::Which;
  4         3181  
  4         153  
20 4     4   1139 use IO::File;
  4         13560  
  4         24558  
21              
22             # List of acceptable filename extensions
23             our $SUFFIX = qr/\.(?:txt|gff3?|gtf|bed|bg|bdg|bedgraph|sgr|kgg|cdt|vcf|narrowpeak|broadpeak|gappedpeak|reff?lat|genepred|ucsc|maf)(?:\.gz|\.bz2)?/i;
24              
25             # gzip application
26             our $gzip_app;
27             our $bgzip_app;
28              
29             ### The True Statement
30             1;
31              
32              
33              
34             ### Load new version data table from file
35              
36             sub load_file {
37 10     10 1 19 my $self = shift;
38 10         15 my $file = shift;
39 10   50     31 my $noheader = shift || 0; # may not be present
40            
41             # check that we have an empty table
42 10 50 33     31 if ($self->last_row != 0 or $self->number_columns != 0 or $self->filename) {
      33        
43 0         0 carp "Cannot load file onto an existing data table!";
44 0         0 return;
45             }
46            
47             # open the file and load metadata
48 10         42 my $filename = $self->check_file($file);
49 10 50       30 unless ($filename) {
50 0         0 print " file '$file' does not exist!\n";
51 0         0 return;
52             }
53 10         39 $self->add_file_metadata($filename);
54 10 50       29 $self->open_to_read_fh or return;
55 10         41 $self->parse_headers($noheader);
56            
57             # Load the data table
58 10         137 while (my $line = $self->{fh}->getline) {
59             # the current file position should be at the beginning of the
60             # data table information
61 198 50       3514 next if $line !~ m/\w+/;
62            
63             # skip comment and empty lines
64 198 50       325 if (substr($line,0,1) eq '#') {
65 0         0 $self->add_comment($line);
66 0         0 next;
67             }
68            
69             # process the line
70 198         298 $self->add_data_line($line);
71             }
72            
73             # completed loading the file
74 10         292 $self->close_fh;
75 10         149 delete $self->{fh};
76            
77             # verify the structure
78 10 50       37 return unless $self->verify;
79            
80             # finished
81 10         30 return 1;
82             }
83              
84             sub taste_file {
85 24     24 1 1916 my $self = shift;
86 24         43 my $file = shift;
87 24 50       79 my $filename = $self->check_file($file) or return;
88 24         98 my $Taste = $self->new;
89 24         85 $Taste->add_file_metadata($filename);
90 24 50       75 $Taste->open_to_read_fh or return;
91 24         115 $Taste->parse_headers;
92            
93             # load first 10 data lines
94 24         86 $Taste->{data_table}->[0] = $Taste->{'column_names'}; # set table header names
95 24         79 for (my $i = 1; $i <= 10; $i++) {
96 192 100       302 my $line = $Taste->fh->getline or last;
97 180 50       3150 next if $line !~ m/\w+/;
98 180 50       268 next if $line =~ /^#/;
99 180         304 $Taste->add_data_line($line);
100             }
101 24         402 $Taste->close_fh;
102 24         446 $Taste->verify(1); # silently check the integrity of the file
103            
104             # check existing metadata
105 24 100       73 if ($Taste->gff) {
    100          
    50          
106 4         10 return ('gff', $Taste->format);
107             }
108             elsif ($Taste->bed) {
109 12         28 return ('bed', $Taste->format);
110             }
111             elsif ($Taste->ucsc) {
112 0         0 return ('ucsc', $Taste->format);
113             }
114            
115            
116             # check if the number of columns match a known format, then verify
117 8         21 my $number = $Taste->number_columns;
118 8 50       65 if ($number == 9) {
    50          
    50          
    50          
    50          
    100          
119             # possibly a GFF file
120 0         0 $Taste->gff(2);
121 0         0 $Taste->verify(1);
122 0 0       0 if ($Taste->gff == 2) {
123 0         0 return ('gff', $Taste->format);
124             }
125             }
126             elsif ($number == 10) {
127             # possibly a genePred file
128 0         0 $Taste->ucsc(10);
129 0         0 $Taste->verify(1);
130 0 0       0 return 'ucsc' if $Taste->ucsc == 10;
131 0         0 $Taste->add_ucsc_metadata(10,1); # force metadata
132 0         0 $Taste->verify(1);
133 0 0       0 if ($Taste->ucsc == 10) {
134 0         0 return ('ucsc', $Taste->format);
135             }
136             }
137             elsif ($number == 11) {
138             # possibly a refFlat file
139 0         0 $Taste->ucsc(11);
140 0         0 $Taste->verify(1);
141 0 0       0 return 'ucsc' if $Taste->ucsc == 11;
142 0         0 $Taste->add_ucsc_metadata(11,1); # force metadata
143 0         0 $Taste->verify(1);
144 0 0       0 if ($Taste->ucsc == 11) {
145 0         0 return ('ucsc', $Taste->format);
146             }
147             }
148             elsif ($number == 12) {
149             # possibly a knownGene or BED12 file
150 0         0 $Taste->ucsc(12);
151 0         0 $Taste->verify(1);
152 0 0       0 if ($Taste->ucsc == 12) {
153 0         0 return ('ucsc', $Taste->format);
154             }
155 0         0 $Taste->bed(12);
156 0         0 $Taste->verify(1);
157 0 0       0 if ($Taste->bed == 12) {
158 0         0 return ('bed', $Taste->format);
159             }
160 0         0 $Taste->add_ucsc_metadata(12,1); # force metadata
161 0         0 $Taste->verify(1);
162 0 0       0 if ($Taste->ucsc == 12) {
163 0         0 return ('ucsc', $Taste->format);
164             }
165 0         0 $Taste->add_bed_metadata(12,1); # force metadata
166 0         0 $Taste->verify(1);
167 0 0       0 if ($Taste->bed == 12) {
168 0         0 return ('bed', $Taste->format);
169             }
170             }
171             elsif ($number == 15) {
172             # possibly a genePredExt file
173 0         0 $Taste->ucsc(15);
174 0         0 $Taste->verify(1);
175 0 0       0 if ($Taste->ucsc == 15) {
176 0         0 return ('ucsc', $Taste->format);
177             }
178 0         0 $Taste->add_ucsc_metadata(15,1); # force metadata
179 0         0 $Taste->verify(1);
180 0 0       0 if ($Taste->ucsc == 15) {
181 0         0 return ('ucsc', $Taste->format);
182             }
183             }
184             elsif ($number == 16) {
185             # possibly a genePredExt file
186 3         8 $Taste->ucsc(16);
187 3         7 $Taste->verify(1);
188 3 50       8 if ($Taste->ucsc == 16) {
189 0         0 return ('ucsc', $Taste->format);
190             }
191 3 50       10 return 'ucsc' if $Taste->ucsc == 16;
192 3         16 $Taste->add_ucsc_metadata(16,1); # force metadata
193 3         9 $Taste->verify(1);
194 3 50       7 if ($Taste->ucsc == 16) {
195 3         8 return ('ucsc', $Taste->format);
196             }
197             }
198 5         63 return;
199             }
200              
201             sub sample_gff_type_list {
202 11     11 1 21 my ($self, $file) = @_;
203 11 50       58 return unless $file =~ /\.g[tf]f\d?(?:\.gz)?$/i; # assume extension is accurate
204 11 50       25 my $fh = $self->open_to_read_fh($file) or die "can't open $file!\n";
205 11         15 my %types;
206 11         15 my $count = 0;
207 11         27 while ($count < 1000) {
208 1288 100       14703 my $line = $fh->getline or last;
209 1277 50       22304 next if $line !~ m/\w+/;
210 1277 100       1828 next if $line =~ /^#/;
211 1241         2774 my @fields = split('\t', $line);
212 1241         1476 $types{ $fields[2] } += 1;
213 1241         2148 $count++;
214             }
215 11         303 $fh->close;
216 11         275 return join(',', keys %types);
217             }
218              
219              
220             sub parse_headers {
221 60     60 1 114 my $self = shift;
222 60   50     206 my $noheader = shift || 0; # boolean to indicate no headers are present
223            
224             # filehandle
225 60         93 my $fh;
226 60 50 33     224 if (exists $self->{fh} and $self->{fh}) {
    0          
227 60         95 $fh = $self->{fh};
228             }
229             elsif ($self->filename) {
230 0         0 $fh = $self->open_to_read_fh($self->filename);
231             }
232             else {
233 0         0 confess " file metadata and/or open filehandle must be set before parsing headers!";
234             }
235            
236             # check that we have an empty table
237 60 50 33     184 if ($self->last_row != 0 or $self->number_columns != 0) {
238 0         0 cluck "Cannot parse file headers onto an existing data table!";
239 0         0 return;
240             }
241            
242             # read and parse the file
243             # we will ONLY parse the header lines prefixed with a #, as well as the
244             # first data row which contains the column names
245 60         208 $self->program(undef); # reset this to blank, it will be filled by file metadata
246 60         96 my $header_line_count = 0;
247 60         1258 my $line = $fh->getline; # first line
248            
249             # check the first line for proper line endings
250             {
251 60         4205 my $line2 = $line;
  60         105  
252 60         122 chomp($line2);
253 60 50       208 if ($line2 =~ /[\r\n]+/) {
254 0         0 my $filename = $self->filename;
255 0         0 die "File '$filename' does not have expected line endings!\n" .
256             " Try converting to native line endings and try again\n";
257             }
258             }
259            
260 60         126 while ($line) {
261             # we are not chomping the line here because of possible side effects
262             # with UCSC tables where we have to count elements in the first line
263             # and potential header lines, and the first line has a null value at
264             # the end
265            
266             # Parse the datafile metadata headers
267            
268             # no real line, just empty space
269 137 50       2213 if ($line !~ m/\w+/) {
    50          
    100          
    100          
    50          
    100          
    50          
    100          
    100          
270 0         0 $header_line_count++;
271 0         0 next;
272             }
273            
274             # the generating program
275             elsif ($line =~ m/^# Program (.+)$/) {
276 0         0 my $p = $1;
277 0         0 $self->program($p);
278 0         0 $header_line_count++;
279             }
280            
281             # the source database
282             elsif ($line =~ m/^# Database (.+)$/) {
283 10         29 my $d = $1;
284 10         35 $self->database($d);
285 10         16 $header_line_count++;
286             }
287            
288             # the type of feature in this datafile
289             elsif ($line =~ m/^# Feature (.+)$/) {
290 10         24 my $f = $1;
291 10         29 $self->feature($f);
292 10         11 $header_line_count++;
293             }
294            
295             # column or dataset specific information
296             elsif ($line =~ m/^# Column_(\d+)/) {
297             # the column number will become the index number
298 0         0 my $index = $1;
299 0         0 $self->add_column_metadata($line, $index);
300 0         0 $header_line_count++;
301             }
302            
303             # gff version header
304             elsif ($line =~ /^##gff-version\s+([\d\.]+)$/) {
305             # store the gff version in the hash
306             # this may or may not be present in the gff file, but want to keep
307             # it if it is
308 5         14 my $g = $1;
309 5         28 $self->gff($g);
310 5         6 $header_line_count++;
311             }
312            
313             # VCF version header
314             elsif ($line =~ /^##fileformat=VCFv([\d\.]+)$/) {
315             # store the VCF version in the hash
316             # this may or may not be present in the vcf file, but want to keep
317             # it if it is
318 0         0 my $v = $1;
319 0         0 $self->vcf($v);
320 0         0 $self->add_comment($line); # so that it is written properly
321 0         0 $header_line_count++;
322             }
323            
324             # any other nonstandard header
325             elsif ($line =~ /^#/) {
326 44         157 $self->add_comment($line);
327 44         53 $header_line_count++;
328             }
329            
330             # a track or browser line
331             elsif ($line =~ /^(?:track|browser)\s+/i) {
332             # common with wig, bed, or bedgraph files for use with UCSC genome browser
333             # treat as a comment line, there's not that much useful info here
334 8 50       36 if ($line =~ /type=(\w+)/i) {
335 8         31 $self->format($1);
336             }
337 8         33 $self->add_comment($line);
338 8         12 $header_line_count++;
339             }
340            
341             # the remainder is the data table itself
342             else {
343             # the first row in the data table are (usually) the column names
344             # we only want the names, not the rest of the table
345            
346             # specific file formats have implicit predefined column formats
347             # these file formats do NOT have column headers
348             # we will first check for those file formats and process accordingly
349            
350             ### Data tables with a commented header line
351 60 50       221 if ( $self->_commented_header_line($line) ) {
352             # these will have one comment line marked with #
353             # that really contains the column headers
354            
355             # process the real header line
356 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
357             }
358             # we will continue here in case the commented header line was part of a
359             # formatted file type, which will be checked by extension or possibly
360             # the format (if determined) below
361 60   66     169 my $format = $self->format || $self->extension;
362            
363             ### a GFF file
364 60 100       379 if ($format =~ /g[tf]f/i) {
    100          
    100          
    50          
    50          
365 5         19 $self->add_gff_metadata;
366             }
367            
368             ### a peak file
369             elsif ($format =~ /peak/i) {
370 16         35 my $count = scalar(split '\t', $line);
371 16         74 $self->add_peak_metadata($count);
372             }
373            
374             ### a Bed or BedGraph file
375             elsif ($format =~ /bg|bdg|bed/i) {
376 26         55 my $count = scalar(split '\t', $line);
377 26         90 $self->add_bed_metadata($count);
378             }
379            
380             ### a UCSC gene table
381             elsif ($format =~ /ref+lat|genepred|ucsc/i) {
382 0         0 my $count = scalar(split '\t', $line);
383 0         0 $self->add_ucsc_metadata($count);
384             }
385            
386             ### a SGR file
387             elsif ($format =~ /sgr/i) {
388 0         0 $self->add_sgr_metadata;
389             }
390            
391             ### standard text file with headers, i.e. everything else
392 60 100       138 unless ($self->number_columns) {
393             # we have not yet parsed the row of data column names
394             # we will do so now
395 13         54 $self->add_standard_metadata($line);
396            
397             # count as a header line
398 13         18 $header_line_count++;
399             }
400             }
401            
402             # keep processing by going to the next line until we have identified columns
403 137 100       240 if ($self->number_columns == 0) {
404 77         1125 $line = $fh->getline;
405             }
406             else {
407 60         142 undef $line;
408             }
409             }
410            
411             # if we didn't find columns, check that it wasn't actually commented
412             # for example, an empty vcf file
413 60 0 50     124 if ($self->number_columns == 0 and
      0        
414 0         0 scalar(@{ $self->{comments} }) and
415             $self->{comments}->[-1] =~ /\t/
416             ) {
417             # process the real header line
418 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
419             }
420            
421            
422             # No header was requested
423 60 0 33     166 if ($noheader and not $self->bed and not $self->gff and
      33        
      0        
      0        
424             not $self->vcf and not $self->ucsc
425             ) {
426             # which means that what we used as a header is actually the first data row
427            
428             # fix the column names
429 0         0 for (my $i = 0; $i < $self->number_columns; $i++) {
430 0         0 my $name = $self->name($i);
431 0         0 $self->name($i, "Column$i ($name)");
432 0         0 $self->{$i}{'AUTO'} = 3;
433             }
434             # adjust metadata
435 0         0 $header_line_count -= 1;
436 0         0 $self->{'headers'} = -1; # special case, we never write headers here
437             }
438            
439             # Header sanity check
440             # some exported file formats, such as from R, do not include a proper
441             # header for the first column, as these are assumed to be row names
442             # this will result in incorrectly parsed files where the last columns
443             # will be merged into one column with an internal tab - not good
444             # need to handle these
445 60         1054 my $nextline = $fh->getline;
446 60 50       1087 if ($nextline) {
447 60         219 my @nextdata = split '\t', $nextline;
448 60 50       164 if (scalar(@nextdata) - 1 == $self->number_columns) {
449             # whoops! we caught a off-by-one discrepancy between header and data row
450 0         0 my $old_last = $self->last_column;
451 0         0 $fh->close; # having this open complicates changing columns....
452            
453             # add a new "column" (just metadata for now) and move it to the beginning
454 0         0 my $i = $self->add_column('Column1');
455 0         0 $self->reorder_column($i, 0 .. $old_last);
456             }
457 60 100       168 if (ref($self) eq 'Bio::ToolBox::Data::Stream') {
458             # store an example first line for Stream objects
459 26         45 chomp $nextdata[-1];
460 26         52 $self->{example} = \@nextdata;
461             }
462             }
463            
464             # re-open the file
465             # I tried using seek functions - but they don't work with binary gzip
466             # files, and I can't get the seek function to return the same position
467             # as simply advancing through the file like below
468             # so I'll just do it the old way and close/open and advance
469 60 50       291 $fh->close if $fh; # may have been closed from the sanity check above
470 60         1279 $fh = $self->open_to_read_fh;
471 60         161 for (1 .. $header_line_count) {
472 90         2485 my $line = $fh->getline;
473             }
474 60         901 $self->{header_line_count} = $header_line_count;
475 60         85 $self->{fh} = $fh;
476 60         105 return 1;
477             }
478              
479              
480             sub add_data_line {
481 574     574 1 741 my ($self, $line) = @_;
482            
483             # do not chomp the line yet, just split into an array
484 574         2092 my @linedata = split '\t', $line, $self->{number_columns};
485            
486             # chomp the last element
487             # we do this here to ensure the tab split above gets all of the values
488             # otherwise trailing null values aren't included in @linedata
489 574         779 chomp $linedata[-1];
490            
491             # check for extra remaining tabs
492 574 50       911 if (index($linedata[-1], "\t") != -1) {
493 0         0 die "FILE INCONSISTENCY ERRORS! line has additional tabs (columns) than headers!\n $line";
494             }
495            
496             # add the line of data
497 574         557 push @{ $self->{data_table} }, \@linedata;
  574         919  
498 574         702 $self->{last_row} += 1;
499 574         3199 return 1;
500             }
501              
502              
503             ### Parse the filename using the list suffix list
504             sub add_file_metadata {
505 70     70 1 173 my ($self, $filename) = @_;
506 70 50       155 confess "no valid filename!" unless defined $filename;
507 70         2352 my ($basename, $path, $extension) = fileparse($filename, $SUFFIX);
508 70 50       206 unless ($extension) {
509             # look for a nonstandard extension, allowing for .gz extension
510 0 0       0 if ($filename =~ /(\.\w+(?:\.gz)?)$/i) {
511 0         0 $extension = $1;
512 0         0 $basename =~ s/$extension\Z//;
513             }
514             }
515 70         156 $self->{filename} = $filename;
516 70         119 $self->{basename} = $basename;
517 70         101 $self->{path} = $path;
518 70         152 $self->{extension} = $extension;
519             }
520              
521              
522             ### Write out a data file
523             sub write_file {
524 8     8 1 14 my $self = shift;
525            
526             # collect passed arguments
527 8         11 my %args;
528 8 100       27 if (scalar(@_) == 1) {
529 7         21 $args{'filename'} = $_[0];
530             }
531             else {
532 1         3 %args = @_;
533             }
534 8   0     22 $args{'filename'} ||= $args{'file'} || undef;
      33        
535 8   50     39 $args{'format'} ||= undef;
536 8 50       21 unless (exists $args{'gz'}) {$args{'gz'} = undef}
  8         13  
537            
538             # check the data
539 8 50       30 unless ($self->verify) {
540 0         0 cluck "bad data structure!";
541 0         0 return;
542             }
543            
544             # check filename
545 8 50 33     22 unless ($args{'filename'} or $self->filename) {
546 0         0 cluck "no filename given!\n";
547 0         0 return;
548             }
549            
550             # split filename into its base components
551             my ($name, $path, $extension) =
552 8   33     260 fileparse($args{'filename'} || $self->filename, $SUFFIX);
553            
554             # Adjust filename extension if necessary
555 8 50       100 if ($extension =~ /(g[tf]f)/i) {
    100          
    50          
    50          
    50          
    50          
    0          
556 0 0       0 if (not $self->gff) {
557             # let's set it to true and see if it passes verification
558 0 0       0 $self->{'gff'} = $extension =~ /gtf/i ? 2.5 : 3; # default
559 0 0 0     0 unless ($self->verify and $self->gff) {
560 0         0 warn " GFF structure changed, re-setting extension from $extension to .txt\n";
561 0         0 $extension =~ s/g[tf]f3?/txt/i;
562             }
563             }
564             }
565             elsif ($extension =~ /bedgraph|bed|bdg|narrowpeak|broadpeak/i) {
566 3 100       7 if (not $self->bed) {
567             # let's set it to true and see if it passes verification
568 1         1 $self->{'bed'} = 1; # a fake true
569 1 50 33     3 unless ($self->verify and $self->bed) {
570 0         0 warn " BED structure changed, re-setting extension from $extension to .txt\n";
571 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
572             }
573             }
574             }
575             elsif ($extension =~ /vcf/i) {
576 0 0       0 if (not $self->vcf) {
577             # let's set it to true and see if it passes verification
578 0         0 $self->{'vcf'} = 1; # a fake true
579 0 0 0     0 unless ($self->verify and $self->vcf) {
580 0         0 warn " VCF structure changed, re-setting extension from $extension to .txt\n";
581 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
582             }
583             }
584             }
585             elsif ($extension =~ /sgr/i) {
586 0 0       0 unless ($self->{'extension'} =~ /sgr/i) {
587             # original file was not SGR
588             # let's pretend it was and see if still passes
589             # the sgr verification relies on the recorded extension
590 0         0 $self->{'extension'} = '.sgr';
591 0         0 $self->verify;
592 0 0       0 if ($self->extension =~ /txt/) {
593 0         0 warn " SGR structure changed, re-setting extension from $extension to .txt\n";
594             }
595 0         0 $extension = $self->{'extension'};
596             }
597             }
598             elsif ($extension =~ /reff?lat|genepred|ucsc/i) {
599 0 0       0 if ($self->ucsc != $self->number_columns) {
600             # it's not set as a ucsc data
601             # let's set it to true and see if it passes verification
602 0         0 $self->ucsc($self->number_columns);
603 0 0 0     0 unless ($self->verify and $self->ucsc) {
604 0         0 warn " UCSC structure changed, re-setting extension from $extension to .txt\n";
605 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
606             }
607             }
608             }
609             elsif ($extension =~ /txt/i) {
610             # plain old text file, sounds good to me
611             # make sure headers are enabled
612 5 50       22 $self->{'headers'} = 1 unless $self->{'headers'} == -1; # original noheader
613             }
614             elsif (not $extension) {
615             # no extension was available
616             # try and determine one from metadata
617            
618 0 0       0 if ($self->gff) {
    0          
    0          
    0          
    0          
    0          
619 0 0       0 $extension = $self->gff == 3 ? '.gff3' : $self->gff == 2.5 ? '.gtf' : '.gff';
    0          
620             }
621             elsif ($self->bed) {
622 0 0 0     0 if ($self->format) {
    0          
623             # re-use the format value as the extension
624 0         0 $extension = sprintf(".%s", $self->format);
625             }
626             elsif (
627             $self->number_columns == 4 and
628             $self->name(3) =~ /score/i
629             ) {
630 0         0 $extension = '.bdg'; # looks like a bedGraph file
631             }
632             else {
633 0         0 $extension = '.bed'; # a regular bed file
634             }
635             }
636             elsif ($self->ucsc) {
637 0 0       0 if ($self->format) {
638             # re-use the format value as the extension
639 0         0 $extension = sprintf(".%s", $self->format);
640             }
641             else {
642             # use a generic ucsc format
643 0         0 $extension = '.ucsc';
644             }
645             }
646             elsif ($self->vcf) {
647 0         0 $extension = '.vcf';
648             }
649             elsif ($name =~ /(\.\w{3}(?:\.gz)?)$/i) {
650             # a non-standard 3 letter file extension
651             # anything else might be construed as part of the filename, so run the
652             # risk of adding a default extension below
653 0         0 $extension = $1;
654 0         0 $name =~ s/$extension\Z//;
655             }
656             elsif ($self->extension) {
657             # original file had an extension, re-use it if appropriate
658             # why wouldn't this get picked up above???? probably old cruft,
659             # or a non-standard or unknown file extension
660             # leave it in for the time being, shouldn't hurt anything
661 0 0       0 if ($self->extension =~ /g[tf]f/i) {
    0          
662 0 0       0 $extension = $self->gff ? $self->extension : '.txt';
663             }
664             elsif ($self->extension =~ /bed|bdg|peak/i) {
665 0 0       0 $extension = $self->bed ? $self->extension : '.txt';
666             }
667             else {
668             # an unstructured format
669 0         0 $extension = $self->extension;
670             }
671             }
672             else {
673             # normal data text file
674 0         0 $extension = '.txt';
675             }
676             }
677             # otherwise the extension must be good, hope for the best
678            
679             # determine format
680             # this is an arcane specification of whether we want a "simple" no metadata
681             # format, or an ordinary text format that may or may not have metadata
682             # it's currently not hurting much, so leave it in for now?
683 8 50       18 unless ($args{'format'}) {
684 8 50       20 if (defined $args{'simple'}) {
    50          
685             # an old method of specifying simple
686 0         0 $args{'format'} = 'simple';
687             }
688             elsif ($extension) {
689             # check extension from the parsed filename, if present
690 8 50       27 if ($extension =~ /sgr|cdt/i) {
691             # sgr is simple format, no headers
692 0         0 $args{'format'} = 'simple';
693             }
694             else {
695             # everything else is text
696 8         17 $args{'format'} = 'text';
697             }
698             }
699             else {
700             # somehow we got this far without defining? use default text
701 0         0 $args{'format'} = 'text';
702             }
703             }
704            
705             # check zip status if necessary
706 8 50       17 unless (defined $args{'gz'}) {
707             # look at filename extension as a clue
708             # in case we're overwriting the input file, keep the zip status
709 8 50       31 if ($extension =~ m/\.vcf\.gz/i) {
    50          
710             # vcf requires bgzip
711 0         0 $args{'gz'} = 2;
712             }
713             elsif ($extension =~ m/\.gz$/i) {
714 0         0 $args{'gz'} = 1;
715             }
716             else {
717 8         14 $args{'gz'} = 0; # default
718             }
719             }
720            
721             # adjust gzip extension as necessary
722 8 50 33     48 if ($args{'gz'} and $extension !~ m/\.gz$/i) {
    50 33        
723 0         0 $extension .= '.gz';
724             }
725             elsif (not $args{'gz'} and $extension =~ /\.gz$/i) {
726 0         0 $extension =~ s/\.gz$//i;
727             }
728            
729             # check filename length
730             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
731             # don't even get me started on Windows NTFS path length limitation
732 8 50       30 if (length($name . $extension) > 255) {
733 0         0 my $limit = 253 - length($extension);
734 0         0 $name = substr($name, 0, $limit) . '..';
735 0         0 warn " filename too long! Truncating to $limit characters\n";
736             }
737            
738             # generate the new filename
739 8         19 my $newname = $path . $name . $extension;
740            
741            
742             # Convert strand information
743 8         31 my $strand_i = $self->strand_column;
744 8 0 0     20 if (defined $strand_i and ($self->gff or $self->bed or $self->ucsc) ) {
      33        
745             # convert to +/-/. nomenclature as necessary
746 0 0 0     0 if ($self->gff) {
    0          
747 0         0 for my $row (1 .. $self->last_row) {
748 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
749 0 0       0 if ($s =~ /\d/) {
750 0 0       0 $s = $s == 1 ? '+' : $s == -1 ? '-' : '.';
    0          
751             }
752 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
753             }
754             }
755             elsif ($self->bed or $self->ucsc) {
756 0         0 for my $row (1 .. $self->last_row) {
757 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
758 0 0       0 if ($s =~ /\d/) {
759 0 0       0 $s = $s >= 0 ? '+' : '-';
760             }
761 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
762             }
763             }
764             }
765            
766            
767             # Open file for writing
768 8         39 my $fh = $self->open_to_write_fh($newname, $args{'gz'});
769 8 50       22 return unless defined $fh;
770            
771            
772             # Write the headers
773 8 50       26 if ($args{'format'} eq 'text') {
774             # default text format has metadata headers
775            
776             # write gff statement if gff format
777 8 50       29 if ($self->gff) {
778 0         0 $fh->print('##gff-version ' . $self->gff . "\n");
779             }
780            
781             # Write the primary headers
782 8 50 66     19 unless (
      66        
      66        
      33        
783             $self->gff or $self->bed or $self->ucsc or $self->vcf or
784             $extension =~ m/sgr|kgg|cdt|peak/i
785             ) {
786             # we only write these for normal text files, not defined format files
787            
788 5 50       20 if ($self->program) {
789 0         0 $fh->print('# Program ' . $self->program . "\n");
790             }
791 5 50       17 if ($self->database) {
792 5         13 $fh->print('# Database ' . $self->database . "\n");
793             }
794 5 50       101 if ($self->feature) {
795 5         13 $fh->print('# Feature ' . $self->feature . "\n");
796             }
797             }
798            
799             # Write the miscellaneous headers
800 8         27 foreach ( @{ $self->{'comments'} } ) {
  8         25  
801             # write remaining miscellaneous header lines if present
802             # we do this for all files
803 6 100       46 unless (/\n$/s) {
804             # append newline if not present
805 4         7 $_ .= "\n";
806             }
807             # check for comment character at beginning
808 6 100       14 if (/^#/) {
809 5         15 $fh->print($_);
810             }
811             else {
812 1         3 $fh->print("# " . $_);
813             }
814             }
815            
816             # Write the column metadata headers
817 8         32 for (my $i = 0; $i < $self->number_columns; $i++) {
818             # each column metadata in the hash is referenced by the column's
819             # index number as the key
820             # we will take each index one at a time in increasing order
821            
822             # some files do not need or tolerate metadata lines, for those
823             # known files the metadata lines will be skipped
824            
825             # these column metadata lines do not need to be written if they
826             # only have two values, presumably name and index, for files
827             # that don't normally have column headers, e.g. gff
828 22 100 66     54 if (
    50 0        
    0 0        
829             exists $self->{$i}{'AUTO'} and
830 8         20 scalar( keys %{ $self->{$i} } ) == $self->{$i}{'AUTO'}
831             ) {
832             # some of the metadata values were autogenerated and
833             # we have the same number of keys as were autogenerated
834             # no need to write these
835 8         14 next;
836             }
837 14         32 elsif (scalar( keys %{ $self->{$i} } ) == 2) {
838             # only two metadata keys exist, name and index
839             # these are so simple it's not worth writing them
840 14         29 next;
841             }
842             elsif ($extension =~ /sgr|kgg|cdt/i or $self->ucsc or $self->vcf) {
843             # these do not support metadata lines
844 0         0 next;
845             }
846            
847             # we will put each key=value pair into @pairs, listed asciibetically
848 0         0 my @pairs; # an array of the key value pairs from the metadata hash
849             # put name first
850             # we are no longer writing the index number
851 0         0 push @pairs, 'name=' . $self->{$i}{'name'};
852             # put remainder in alphabetical order
853 0         0 foreach (sort {$a cmp $b} keys %{ $self->{$i} } ) {
  0         0  
  0         0  
854 0 0       0 next if $_ eq 'name'; # already written
855 0 0       0 next if $_ eq 'index'; # internal use only
856 0 0       0 next if $_ eq 'AUTO'; # internal use only
857 0         0 push @pairs, $_ . '=' . $self->{$i}{$_};
858             }
859            
860             # Finally write the header line, joining the pairs with a
861             # semi-colon into a single string.
862             # The column identifier is comprised of the word 'Column'
863             # and the index number joined by '_'.
864 0         0 $fh->print("# Column_$i ", join(";", @pairs), "\n");
865             }
866             }
867            
868            
869             # Write the table column headers
870 8 100       25 if ($self->{'headers'} == 1) {
871 5         10 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[0] }));
  5         28  
872             }
873            
874            
875             # Write the data table
876 8 50       59 if ($args{'format'} eq 'simple') {
877            
878             # the simple format will strip the non-value '.' from the table
879 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
880             # we will step though the data_table array one row at a time
881             # convert the non-value '.' to undefined
882             # and print using a tab-delimited format
883 0         0 my @linedata;
884 0         0 foreach ( @{ $self->{'data_table'}[$i] }) {
  0         0  
885 0 0       0 if ($_ eq '.') {
886 0         0 push @linedata, undef;
887             } else {
888 0         0 push @linedata, $_;
889             }
890             }
891 0         0 $fh->printf("%s\n", join("\t", @linedata));
892             }
893             }
894            
895             else {
896             # normal data files
897 8         26 for (my $i = 1; $i <= $self->last_row; $i++) {
898             # we will step though the data_table array one row at a time
899             # we will join each row's array of elements into a string to print
900             # using a tab-delimited format
901 104         125 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[$i] }));
  104         202  
902             }
903             }
904            
905             # done writing
906 8         29 $fh->close;
907            
908             # if we made it this far, it should've been a success!
909             # return the new file name as indication of success
910 8         433 return $newname;
911             }
912              
913             sub save {
914 6     6 1 35 return shift->write_file(@_);
915             }
916              
917              
918             #### Open a file for reading
919             sub open_to_read_fh {
920 161     161 1 264 my $self = shift;
921 161   100     455 my $file = shift || undef;
922 161 100       517 my $obj = ref($self) =~ /^Bio::ToolBox/ ? 1 : 0;
923            
924             # check file
925 161 100 66     475 if (not $file and $obj) {
926 120   50     334 $file = $self->filename || undef;
927             }
928 161 50       271 unless ($file) {
929 0         0 carp " no filename provided or associated with object!";
930 0         0 return;
931             }
932            
933             # Open filehandle object as appropriate
934 161         172 my $fh;
935 161 50       509 if ($file =~ /\.gz$/i) {
    50          
936             # the file is compressed with gzip
937 0 0       0 $fh = IO::File->new("gzip -dc $file |") or
938             carp "unable to read '$file' $!\n";
939             }
940             elsif ($file =~ /\.bz2$/i) {
941             # the file is compressed with bzip2
942 0 0       0 $fh = IO::File->new("bzip2 -dc $file |") or
943             carp "unable to read '$file' $!\n";
944             }
945             else {
946             # the file is uncompressed and space hogging
947 161 50       727 $fh = IO::File->new($file, 'r') or
948             carp "unable to read '$file' $!\n";
949             }
950            
951 161 100       13535 if ($obj) {
952 120         291 $self->{fh} = $fh;
953             }
954 161         499 return $fh;
955             }
956              
957              
958             #### Open a file for writing
959             sub open_to_write_fh {
960 10     10 1 23 my ($self, $filename, $gz, $append) = @_;
961            
962             # check filename
963 10 50       19 unless ($filename) {
964 0         0 carp " no filename to write!";
965 0         0 return;
966             }
967            
968             # check filename length
969             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
970 10         105 my $name = fileparse($filename);
971 10 50       58 if (length $name > 255) {
972 0         0 carp " filename is too long! please shorten\n";
973 0         0 return;
974             }
975            
976             # check zip status if necessary
977 10 100       22 unless (defined $gz) {
978             # look at filename extension as a clue
979             # in case we're overwriting the input file, keep the zip status
980 2 50       8 if ($filename =~ m/\.vcf(\.gz)?$/i) {
    50          
981 0         0 $gz = 2; # bgzip
982             }
983             elsif ($filename =~ m/\.gz$/i) {
984 0         0 $gz = 1; # regular gzip
985             }
986             else {
987 2         3 $gz = 0; # default
988             }
989             }
990            
991             # gzip compression application
992 10 50 33     47 if ($gz == 1 and not $gzip_app) {
    50 33        
993             # use parallel gzip if possible
994             # this is stored in a global variable so we only have to look once
995 0         0 $gzip_app = which('pigz');
996 0 0       0 if ($gzip_app) {
997 0         0 $gzip_app .= ' -p 3'; # use a conservative 3 processes, plus perl, so 4 total
998             }
999             else {
1000             # default is the standard gzip application
1001             # should be available in any application
1002 0         0 $gzip_app = which('gzip');
1003             }
1004 0 0       0 unless ($gzip_app) {
1005 0         0 croak "no gzip application in PATH to open compressed file handle output!\n";
1006             }
1007             }
1008             elsif ($gz == 2 and not $bgzip_app) {
1009             # use parallel bgzip if possible
1010             # this is stored in a global variable so we only have to look once
1011 0         0 $bgzip_app = which('bgzip');
1012 0 0       0 if ($bgzip_app) {
1013             # I'm going to assume this is a recent bgzip with multi-threading
1014             # use 3 threads, same as with pigz
1015 0         0 $bgzip_app .= ' -@ 3 -c';
1016             }
1017 0 0       0 unless ($bgzip_app) {
1018 0         0 croak "no bgzip application in PATH to open compressed file handle output!\n";
1019             }
1020             }
1021 10 50       24 my $gzipper = $gz == 1 ? $gzip_app : $gz == 2 ? $bgzip_app : undef;
    50          
1022            
1023             # check file append mode
1024 10 100       22 unless (defined $append) {
1025             # default is not to append
1026 8         12 $append = 0;
1027             }
1028            
1029            
1030             # Generate appropriate filehandle object
1031 10         15 my $fh;
1032 10 100 66     41 if (not $gzipper and not $append) {
    50 33        
    50 33        
    0 0        
1033 8 50       51 $fh = IO::File->new($filename, 'w') or
1034             carp "cannot write to file '$filename' $!\n";
1035             }
1036             elsif ($gzipper and !$append) {
1037 0 0       0 $fh = IO::File->new("| $gzipper >$filename") or
1038             carp "cannot write to compressed file '$filename' $!\n";
1039             }
1040             elsif (not $gzipper and $append) {
1041 2 50       10 $fh = IO::File->new(">> $filename") or
1042             carp "cannot append to file '$filename' $!\n";
1043             }
1044             elsif ($gzipper and $append) {
1045 0 0       0 $fh = IO::File->new("| $gzipper >>$filename") or
1046             carp "cannot append to compressed file '$filename' $!\n";
1047             }
1048 10 50       1465 return $fh if defined $fh;
1049             }
1050              
1051              
1052             ### Subroutine to check for file existance
1053             sub check_file {
1054 60     60 1 129 my ($self, $filename) = @_;
1055            
1056             # check for file existance
1057 60 50       1269 if (-e $filename) {
1058             # confirmed full filename and path
1059 60         292 return $filename;
1060             }
1061             else {
1062             # file name is either incomplete or non-existent
1063             # try adding some common file extensions in case those are missing
1064 0         0 my $new_filename;
1065 0         0 foreach my $ext (qw(.gz .txt .txt.gz .bed .bed.gz)) {
1066 0 0       0 if (-e $filename . $ext) {
1067 0         0 $new_filename = $filename . $ext;
1068 0         0 last;
1069             }
1070             }
1071 0         0 return $new_filename;
1072             }
1073             }
1074              
1075              
1076             sub fh {
1077 192     192 0 212 my $self = shift;
1078 192 50       2643 return $self->{fh} if exists $self->{fh};
1079 0         0 return;
1080             }
1081              
1082             sub close_fh {
1083 70     70 0 192 my $self = shift;
1084 70 50 33     482 $self->{fh}->close if (exists $self->{fh} and $self->{fh});
1085             }
1086              
1087              
1088             ### Internal subroutine to check if a comment line contains headers
1089             sub _commented_header_line {
1090 60     60   120 my ($data, $line) = @_;
1091            
1092             # prepare arrays from the other lines and current line
1093 60         104 my @commentdata;
1094 60 100       68 if ( scalar @{ $data->{'comments'} } >= 1 ) {
  60         162  
1095             # take the last line in the other array
1096 30         99 @commentdata = split '\t', $data->{'comments'}->[-1];
1097             }
1098 60         206 my @linedata = split '\t', $line;
1099            
1100             # check if the counts are equal
1101 60 50       132 if (scalar @commentdata == scalar @linedata) {
1102 0         0 return 1;
1103             }
1104             else {
1105 60         191 return 0;
1106             }
1107             }
1108              
1109              
1110             ### Internal subroutine to process metadata for standard columns
1111             sub add_column_metadata {
1112 0     0 1 0 my ($data, $line, $index) = @_;
1113            
1114             # strip the Column metadata identifier
1115 0         0 chomp $line;
1116 0         0 $line =~ s/^# Column_\d+ //;
1117            
1118             # break up the column metadata
1119 0         0 my %temphash; # a temporary hash to put the column metadata into
1120 0         0 foreach (split ';', $line) {
1121 0         0 my ($key, $value) = split '=';
1122 0 0       0 if ($key eq 'index') {
1123 0 0       0 if ($index != $value) {
1124             # the value from the metadata index key should be
1125             # correct, so we will use that
1126 0         0 $index = $value;
1127             }
1128             }
1129             # store the key & value
1130 0         0 $temphash{$key} = $value;
1131             }
1132            
1133             # create a index metadata key if not already present
1134             # the rest of biotoolbox may expect this to be present
1135 0 0       0 unless (exists $temphash{'index'}) {
1136 0         0 $temphash{'index'} = $index;
1137             }
1138            
1139             # store the column metadata hash into the main data hash
1140             # use the index as the key
1141 0 0       0 if (exists $data->{$index}) {
1142             # we will simply overwrite the previous metadata hash
1143             # harsh, I know, but what to do?
1144             # if it was canned metadata for a gff file, that's ok
1145 0         0 warn "Warning: more than one metadata line exists for index $index!\n";
1146 0         0 $data->{$index} = \%temphash;
1147             }
1148             else {
1149             # metadata hash doesn't exist, so we will add it
1150 0         0 $data->{$index} = \%temphash;
1151             }
1152 0         0 return 1;
1153             }
1154              
1155             ### Subroutine to generate metadata for gff files
1156             # gff files have nine defined columns
1157             # there are different specifications and variants:
1158             # gff (v.1), gff v.2, gff v.2.5 (aka gtf), gff v.3 (gff3)
1159             # however, the columns are the same in all versions
1160             # more info on gff can be found http://gmod.org/wiki/GFF3
1161             sub add_gff_metadata {
1162 5     5 1 9 my $self = shift;
1163 5   50     23 my $version = shift || undef;
1164 5   50     22 my $force = shift || 0;
1165            
1166             # set the gff version based on the extension if it isn't already
1167 5 50 33     11 if (not($self->gff) or $force) {
1168 0 0       0 if (defined $version) {
    0          
    0          
1169 0         0 $self->gff($version);
1170             }
1171             elsif ($self->extension =~ /gtf/i) {
1172 0         0 $self->gff(2.5);
1173             }
1174             elsif ($self->extension =~ /gff3/i) {
1175 0         0 $self->gff(3);
1176             }
1177             else {
1178 0         0 $self->gff(2); # hope for the best
1179             }
1180             }
1181             # set format based on version
1182 5 50       18 if (not $self->format) {
1183 5         11 my $v = $self->gff;
1184 5 0       21 my $f = $v == 3 ? 'gff3' : $v > 2 ? 'gtf' : 'gff';
    50          
1185 5         12 $self->format($f);
1186             }
1187            
1188             # set the metadata for the each column
1189             # some of these may already be defined if there was a
1190             # column metadata specific column in the file
1191 5         22 my $column_names = $self->standard_column_names('gff');
1192 5         17 for (my $i = 0; $i < 9; $i++) {
1193             # loop for each column
1194             # set metadata unless it's already loaded
1195 45 50 33     100 if ($force or not exists $self->{$i}) {
1196 45         99 $self->{$i}{'name'} = $column_names->[$i];
1197 45         57 $self->{$i}{'index'} = $i;
1198 45         69 $self->{$i}{'AUTO'} = 3;
1199             }
1200             # assign the name to the column header
1201 45 50 33     108 if ($force or not defined $self->{'column_names'}->[$i]) {
1202 45         84 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1203             }
1204             }
1205 5         11 $self->{data_table}->[0] = $self->{'column_names'};
1206            
1207             # set column number always to 9
1208 5 50 33     17 if ($force or $self->{'number_columns'} == 0) {
1209 5         10 $self->{'number_columns'} = 9;
1210             }
1211            
1212             # set headers flag to false
1213 5 50       16 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1214            
1215             # set the feature type
1216 5 50       10 unless (defined $self->{'feature'}) {
1217 5         9 $self->{'feature'} = 'region';
1218             }
1219 5         11 return 1;
1220             }
1221              
1222              
1223             ### Subroutine to generate metadata for BED files
1224             # bed files have a loose format
1225             # they require a minimum of 3 columns, and have a max of 12
1226             # 3, 6, and 12 column files are the most common
1227             # there are also something called paired bed files floating around
1228             # The official details and specifications may be found at
1229             # http://genome.ucsc.edu/FAQ/FAQformat#format1
1230              
1231             # a special type of bed file is the bedgraph, using
1232             # either a bdg, bedgraph, or simply bed extension
1233             # these only have four columns, no more, no less
1234             # the fourth column is score, not name
1235             sub add_bed_metadata {
1236 27     27 1 48 my ($self, $column_count) = @_;
1237 27   50     57 my $force = shift || 0;
1238            
1239             # check bed type and set metadata appropriately
1240 27         40 my $bed_names;
1241 27 50 33     57 if ($self->format =~ /bedgraph/i or $self->extension =~ /bg|bdg|graph/i) {
1242 0         0 $self->format('bedGraph'); # possibly redundant
1243 0         0 $self->bed($column_count);
1244 0         0 $bed_names = $self->standard_column_names('bdg');
1245             }
1246             else {
1247 27         67 $self->format('bed');
1248 27         75 $self->bed($column_count);
1249 27         74 $bed_names = $self->standard_column_names('bed12');
1250             }
1251 27         53 $self->{'number_columns'} = $column_count;
1252 27         39 $self->{'zerostart'} = 1;
1253            
1254             # set the metadata for each column
1255             # some of these may already be defined if there was a
1256             # column metadata specific column in the file
1257 27         79 for (my $i = 0; $i < $column_count; $i++) {
1258             # loop for each column
1259             # set name unless it already has one from metadata
1260 200 50 33     277 if ($force or not exists $self->{$i}) {
1261 200   50     427 $self->{$i}{'name'} = $bed_names->[$i] || 'extraColumn';
1262 200         254 $self->{$i}{'index'} = $i;
1263 200         219 $self->{$i}{'AUTO'} = 3;
1264             }
1265             # assign the name to the column header
1266 200 50 33     282 if ($force or not defined $self->{'column_names'}->[$i]) {
1267 200         361 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1268             }
1269             }
1270 27         51 $self->{data_table}->[0] = $self->{'column_names'};
1271              
1272             # set the feature type
1273 27 50       53 unless (defined $self->{'feature'}) {
1274 27         43 $self->{'feature'} = 'region';
1275             }
1276            
1277             # set headers flag to false
1278 27 50       75 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1279 27         61 return 1;
1280             }
1281              
1282              
1283             ### Subroutine to generate metadata for broadpeak and narrowpeak files
1284             # three different types of peak files are available
1285             # see http://genome.ucsc.edu/FAQ/FAQformat.html
1286             sub add_peak_metadata {
1287 16     16 1 31 my ($self, $column_count) = @_;
1288 16   50     47 my $force = shift || 0;
1289              
1290             # check bed type and set metadata appropriately
1291             # most of these are bed6 plus extra columns
1292 16         28 my $column_names;
1293 16 100 66     38 if ($self->format =~ /narrow/i or $self->extension =~ /narrow/i) {
    50 33        
    50 33        
1294 8         25 $self->format('narrowPeak');
1295 8         26 $self->bed($column_count);
1296 8         24 $column_names = $self->standard_column_names('narrowpeak');
1297             }
1298             elsif ($self->format =~ /broad/i or $self->extension =~ /broad/i) {
1299 0         0 $self->format('broadPeak'); # possibly redundant
1300 0         0 $self->bed($column_count);
1301 0         0 $column_names = $self->standard_column_names('broadpeak');
1302             }
1303             elsif ($self->format =~ /gapped/i or $self->extension =~ /gapped/i) {
1304 8         27 $self->format('gappedPeak'); # possibly redundant
1305 8         27 $self->bed($column_count);
1306 8         25 $column_names = $self->standard_column_names('gappedpeak');
1307             }
1308             else {
1309             # how did we get here???? Hope for the best.....
1310 0         0 $self->bed($column_count);
1311 0         0 $column_names = $self->standard_column_names('bed12');
1312             }
1313 16         39 $self->{'number_columns'} = $column_count;
1314 16         34 $self->{'zerostart'} = 1;
1315            
1316             # add metadata
1317 16         47 for (my $i = 0; $i < $column_count; $i++) {
1318 200 50 33     271 if ($force or not exists $self->{$i}) {
1319 200   50     464 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1320 200         254 $self->{$i}{'index'} = $i;
1321 200         219 $self->{$i}{'AUTO'} = 3;
1322             }
1323             # assign the name to the column header
1324 200 50 33     291 if ($force or not defined $self->{'column_names'}->[$i]) {
1325 200         330 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1326             }
1327             }
1328 16         32 $self->{data_table}->[0] = $self->{'column_names'};
1329            
1330             # set the feature type
1331 16 50       36 unless (defined $self->{'feature'}) {
1332 16         25 $self->{'feature'} = 'region';
1333             }
1334            
1335             # set headers flag to false
1336 16 50       64 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1337 16         43 return 1;
1338             }
1339              
1340              
1341             ### Subroutine to generate metadata for various UCSC gene files
1342             # these are tricky, as we will try to determine contents by counting
1343             # the columns, which may not be accurate
1344             # not only that, but this presumes the extension even makes it recognizable
1345             # see http://genome.ucsc.edu/FAQ/FAQformat.html#format9 for details
1346             # also biotoolbox script ucsc_table2gff3.pl
1347             sub add_ucsc_metadata {
1348 3     3 1 7 my ($self, $column_count) = @_;
1349 3   50     10 my $force = shift || 0;
1350            
1351             # set metadata
1352 3         6 $self->{'number_columns'} = $column_count;
1353 3         7 $self->{'ucsc'} = $column_count;
1354            
1355             # set format and determine column names;
1356 3         3 my $column_names;
1357 3 50       8 if ($column_count == 16) {
    0          
    0          
    0          
    0          
1358 3         9 $self->format('genePredExt');
1359 3         10 $column_names = $self->standard_column_names('ucsc16');
1360             }
1361             elsif ($column_count == 15) {
1362 0         0 $self->format('genePredExt');
1363 0         0 $column_names = $self->standard_column_names('ucsc15');
1364             }
1365             elsif ($column_count == 12) {
1366 0         0 $self->format('knownGene');
1367 0         0 $column_names = $self->standard_column_names('ucsc12');
1368             }
1369             elsif ($column_count == 11) {
1370 0         0 $self->format('refFlat');
1371 0         0 $column_names = $self->standard_column_names('ucsc11');
1372             }
1373             elsif ($column_count == 10) {
1374 0         0 $self->format('genePred');
1375 0         0 $column_names = $self->standard_column_names('ucsc10');
1376             }
1377 3         6 $self->{'zerostart'} = 1;
1378            
1379             # assign the column names and metadata
1380 3         11 for (my $i = 0; $i < $column_count; $i++) {
1381             # loop for each column
1382             # set name unless it already has one from metadata
1383 48 50 33     70 if ($force or not exists $self->{$i}) {
1384 48   50     74 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1385 48         52 $self->{$i}{'index'} = $i;
1386 48         51 $self->{$i}{'AUTO'} = 3;
1387             }
1388             # assign the name to the column header
1389 48 50 33     65 if ($force or not defined $self->{'column_names'}->[$i]) {
1390 48         107 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1391             }
1392             }
1393 3         5 $self->{data_table}->[0] = $self->{'column_names'};
1394            
1395             # set the feature type
1396 3 50       8 unless (defined $self->{'feature'}) {
1397 3         5 $self->{'feature'} = 'gene';
1398             }
1399            
1400             # set headers flag to false
1401 3 50       12 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1402 3         8 return 1;
1403             }
1404              
1405              
1406             ### Subroutine to generate metadata for SGR files
1407             # a sgr file contains three columns: chromo, position, score
1408             # this is a very simple file format, useful in exporting and
1409             # importing to binary BAR files used in T2, USeq, and IGB
1410             sub add_sgr_metadata {
1411 0     0 1 0 my $self = shift;
1412 0   0     0 my $force = shift || 0;
1413            
1414             # set column metadata
1415 0         0 my $column_names = $self->standard_column_names('sgr');
1416 0         0 for (my $i = 0; $i < 3; $i++) {
1417             # loop for each column
1418             # set name unless it already has one from metadata
1419 0 0 0     0 if ($force or not exists $self->{$i}) {
1420 0   0     0 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1421 0         0 $self->{$i}{'index'} = $i;
1422 0         0 $self->{$i}{'AUTO'} = 3;
1423             }
1424             # assign the name to the column header
1425 0 0 0     0 if ($force or not defined $self->{'column_names'}->[$i]) {
1426 0         0 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1427             }
1428             }
1429 0         0 $self->{data_table}->[0] = $self->{'column_names'};
1430 0         0 $self->{'number_columns'} = 3;
1431              
1432              
1433             # set headers flag to false
1434 0 0       0 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1435            
1436             # set the feature type
1437 0 0       0 unless (defined $self->{'feature'}) {
1438 0         0 $self->{'feature'} = 'region';
1439             }
1440 0         0 return 1;
1441             }
1442              
1443              
1444             ### Internal subroutine to generate metadata for standard files
1445             sub add_standard_metadata {
1446 13     13 1 23 my ($self, $line) = @_;
1447            
1448 13         42 my @namelist = split '\t', $line;
1449 13         15 chomp $namelist[-1];
1450            
1451             # we will define the columns based on
1452 13         36 for my $i (0..$#namelist) {
1453             # make up a name if one doesn't exist
1454 68   66     99 $namelist[$i] ||= "Column_$i";
1455            
1456             # confirm that a file metadata exists for this column
1457 68 50       96 if (exists $self->{$i}) {
1458 0 0       0 unless ($namelist[$i] eq $self->{$i}->{'name'}) {
1459 0         0 warn "metadata and header names for column $i do not match!";
1460             # set the name to match the actual column name
1461 0         0 $self->{$i}->{'name'} = $namelist[$i];
1462             }
1463             }
1464            
1465             # otherwise be nice and generate it here
1466             else {
1467 68         193 $self->{$i} = {
1468             'name' => $namelist[$i],
1469             'index' => $i,
1470             'AUTO' => 3,
1471             };
1472             }
1473             }
1474            
1475             # check the number of columns
1476 13 50       34 if (scalar @namelist != $self->{'number_columns'} ) {
1477             # adjust to match actual content
1478 13         29 $self->{'number_columns'} = scalar @namelist;
1479             }
1480            
1481             # put the column names in the metadata
1482 13         27 $self->{'column_names'} = \@namelist;
1483 13         24 $self->{data_table}->[0] = $self->{'column_names'};
1484            
1485             # set headers flag to true
1486 13         18 $self->{'headers'} = 1;
1487 13         19 return 1;
1488             }
1489              
1490              
1491             ### Internal subroutine to generate hash of standard file format column names
1492             sub standard_column_names {
1493 51     51 1 95 my ($self, $type) = @_;
1494            
1495 51 100 0     217 if ($type eq 'gff') {
    100 0        
    50 0        
    50 0        
    100          
    50          
    100          
    50          
    50          
    0          
    0          
    0          
    0          
1496 5         23 return [qw(Chromosome Source Type Start Stop Score Strand Phase Group)];
1497             }
1498             elsif ($type eq 'bed12') {
1499 27         89 return [qw(Chromosome Start0 End Name Score Strand
1500             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0)];
1501             }
1502             elsif ($type eq 'bed6') {
1503 0         0 return [qw(Chromosome Start0 End Name Score Strand)];
1504             }
1505             elsif ($type eq 'bdg') {
1506 0         0 return [qw(Chromosome Start0 End Score)];
1507             }
1508             elsif ($type eq 'narrowpeak') {
1509 8         27 return [qw(Chromosome Start0 End Name Score Strand signalValue
1510             pValue qValue peak)];
1511             }
1512             elsif ($type eq 'broadpeak') {
1513 0         0 return [qw(Chromosome Start0 End Name Score Strand signalValue
1514             pValue qValue)];
1515             }
1516             elsif ($type eq 'gappedpeak') {
1517 8         31 return [qw(Chromosome Start0 End Name Score Strand
1518             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0
1519             signalValue pValue qValue)];
1520             }
1521             elsif ($type eq 'sgr') {
1522 0         0 return [qw(Chromo Start Score)];
1523             }
1524             elsif ($type eq 'ucsc16') {
1525 3         16 return [qw(bin name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1526             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1527             }
1528             elsif ($type eq 'ucsc15' or $type eq 'genepredext') {
1529 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1530             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1531             }
1532             elsif ($type eq 'ucsc12' or $type eq 'knowngene') {
1533 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1534             exonStarts0 exonEnds proteinID alignID)];
1535             }
1536             elsif ($type eq 'ucsc11' or $type eq 'refflat') {
1537 0           return [qw(geneName transcriptName chrom strand txStart0 txEnd cdsStart0
1538             cdsEnd exonCount exonStarts0 exonEnds)];
1539             }
1540             elsif ($type eq 'ucsc10' or $type eq 'genepred') {
1541 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1542             exonStarts exonEnds)];
1543             }
1544             else {
1545 0           confess "unrecognized standard column name format '$type'!";
1546             }
1547             }
1548              
1549              
1550             __END__