File Coverage

blib/lib/Bio/ToolBox/Data/file.pm

Criterion	Covered	Total	%
statement	365	638	57.2
branch	192	486	39.5
condition	65	211	30.8
subroutine	25	27	92.5
pod	19	21	90.4
total	666	1383	48.1

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data::file;
2							our $VERSION = '1.69';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data::file - File functions to Bio:ToolBox::Data family
7
8							=head1 DESCRIPTION
9
10							File methods for reading and writing data files for both L
11							and L objects. This module should not be used
12							directly. See the respective modules for more information.
13
14							=cut
15
16	4			4		24	use strict;
	4					5
	4					102
17	4			4		14	use Carp qw(carp cluck croak confess);
	4					6
	4					151
18	4			4		16	use File::Basename qw(fileparse);
	4					7
	4					163
19	4			4		1405	use File::Which;
	4					3181
	4					153
20	4			4		1139	use IO::File;
	4					13560
	4					24558
21
22							# List of acceptable filename extensions
23							our $SUFFIX = qr/\.(?:txt\|gff3?\|gtf\|bed\|bg\|bdg\|bedgraph\|sgr\|kgg\|cdt\|vcf\|narrowpeak\|broadpeak\|gappedpeak\|reff?lat\|genepred\|ucsc\|maf)(?:\.gz\|\.bz2)?/i;
24
25							# gzip application
26							our $gzip_app;
27							our $bgzip_app;
28
29							### The True Statement
30							1;
31
32
33
34							### Load new version data table from file
35
36							sub load_file {
37	10			10	1	19	my $self = shift;
38	10					15	my $file = shift;
39	10		50			31	my $noheader = shift \|\| 0; # may not be present
40
41							# check that we have an empty table
42	10	50	33			31	if ($self->last_row != 0 or $self->number_columns != 0 or $self->filename) {
			33
43	0					0	carp "Cannot load file onto an existing data table!";
44	0					0	return;
45							}
46
47							# open the file and load metadata
48	10					42	my $filename = $self->check_file($file);
49	10	50				30	unless ($filename) {
50	0					0	print " file '$file' does not exist!\n";
51	0					0	return;
52							}
53	10					39	$self->add_file_metadata($filename);
54	10	50				29	$self->open_to_read_fh or return;
55	10					41	$self->parse_headers($noheader);
56
57							# Load the data table
58	10					137	while (my $line = $self->{fh}->getline) {
59							# the current file position should be at the beginning of the
60							# data table information
61	198	50				3514	next if $line !~ m/\w+/;
62
63							# skip comment and empty lines
64	198	50				325	if (substr($line,0,1) eq '#') {
65	0					0	$self->add_comment($line);
66	0					0	next;
67							}
68
69							# process the line
70	198					298	$self->add_data_line($line);
71							}
72
73							# completed loading the file
74	10					292	$self->close_fh;
75	10					149	delete $self->{fh};
76
77							# verify the structure
78	10	50				37	return unless $self->verify;
79
80							# finished
81	10					30	return 1;
82							}
83
84							sub taste_file {
85	24			24	1	1916	my $self = shift;
86	24					43	my $file = shift;
87	24	50				79	my $filename = $self->check_file($file) or return;
88	24					98	my $Taste = $self->new;
89	24					85	$Taste->add_file_metadata($filename);
90	24	50				75	$Taste->open_to_read_fh or return;
91	24					115	$Taste->parse_headers;
92
93							# load first 10 data lines
94	24					86	$Taste->{data_table}->[0] = $Taste->{'column_names'}; # set table header names
95	24					79	for (my $i = 1; $i <= 10; $i++) {
96	192	100				302	my $line = $Taste->fh->getline or last;
97	180	50				3150	next if $line !~ m/\w+/;
98	180	50				268	next if $line =~ /^#/;
99	180					304	$Taste->add_data_line($line);
100							}
101	24					402	$Taste->close_fh;
102	24					446	$Taste->verify(1); # silently check the integrity of the file
103
104							# check existing metadata
105	24	100				73	if ($Taste->gff) {
		100
		50
106	4					10	return ('gff', $Taste->format);
107							}
108							elsif ($Taste->bed) {
109	12					28	return ('bed', $Taste->format);
110							}
111							elsif ($Taste->ucsc) {
112	0					0	return ('ucsc', $Taste->format);
113							}
114
115
116							# check if the number of columns match a known format, then verify
117	8					21	my $number = $Taste->number_columns;
118	8	50				65	if ($number == 9) {
		50
		50
		50
		50
		100
119							# possibly a GFF file
120	0					0	$Taste->gff(2);
121	0					0	$Taste->verify(1);
122	0	0				0	if ($Taste->gff == 2) {
123	0					0	return ('gff', $Taste->format);
124							}
125							}
126							elsif ($number == 10) {
127							# possibly a genePred file
128	0					0	$Taste->ucsc(10);
129	0					0	$Taste->verify(1);
130	0	0				0	return 'ucsc' if $Taste->ucsc == 10;
131	0					0	$Taste->add_ucsc_metadata(10,1); # force metadata
132	0					0	$Taste->verify(1);
133	0	0				0	if ($Taste->ucsc == 10) {
134	0					0	return ('ucsc', $Taste->format);
135							}
136							}
137							elsif ($number == 11) {
138							# possibly a refFlat file
139	0					0	$Taste->ucsc(11);
140	0					0	$Taste->verify(1);
141	0	0				0	return 'ucsc' if $Taste->ucsc == 11;
142	0					0	$Taste->add_ucsc_metadata(11,1); # force metadata
143	0					0	$Taste->verify(1);
144	0	0				0	if ($Taste->ucsc == 11) {
145	0					0	return ('ucsc', $Taste->format);
146							}
147							}
148							elsif ($number == 12) {
149							# possibly a knownGene or BED12 file
150	0					0	$Taste->ucsc(12);
151	0					0	$Taste->verify(1);
152	0	0				0	if ($Taste->ucsc == 12) {
153	0					0	return ('ucsc', $Taste->format);
154							}
155	0					0	$Taste->bed(12);
156	0					0	$Taste->verify(1);
157	0	0				0	if ($Taste->bed == 12) {
158	0					0	return ('bed', $Taste->format);
159							}
160	0					0	$Taste->add_ucsc_metadata(12,1); # force metadata
161	0					0	$Taste->verify(1);
162	0	0				0	if ($Taste->ucsc == 12) {
163	0					0	return ('ucsc', $Taste->format);
164							}
165	0					0	$Taste->add_bed_metadata(12,1); # force metadata
166	0					0	$Taste->verify(1);
167	0	0				0	if ($Taste->bed == 12) {
168	0					0	return ('bed', $Taste->format);
169							}
170							}
171							elsif ($number == 15) {
172							# possibly a genePredExt file
173	0					0	$Taste->ucsc(15);
174	0					0	$Taste->verify(1);
175	0	0				0	if ($Taste->ucsc == 15) {
176	0					0	return ('ucsc', $Taste->format);
177							}
178	0					0	$Taste->add_ucsc_metadata(15,1); # force metadata
179	0					0	$Taste->verify(1);
180	0	0				0	if ($Taste->ucsc == 15) {
181	0					0	return ('ucsc', $Taste->format);
182							}
183							}
184							elsif ($number == 16) {
185							# possibly a genePredExt file
186	3					8	$Taste->ucsc(16);
187	3					7	$Taste->verify(1);
188	3	50				8	if ($Taste->ucsc == 16) {
189	0					0	return ('ucsc', $Taste->format);
190							}
191	3	50				10	return 'ucsc' if $Taste->ucsc == 16;
192	3					16	$Taste->add_ucsc_metadata(16,1); # force metadata
193	3					9	$Taste->verify(1);
194	3	50				7	if ($Taste->ucsc == 16) {
195	3					8	return ('ucsc', $Taste->format);
196							}
197							}
198	5					63	return;
199							}
200
201							sub sample_gff_type_list {
202	11			11	1	21	my ($self, $file) = @_;
203	11	50				58	return unless $file =~ /\.g[tf]f\d?(?:\.gz)?$/i; # assume extension is accurate
204	11	50				25	my $fh = $self->open_to_read_fh($file) or die "can't open $file!\n";
205	11					15	my %types;
206	11					15	my $count = 0;
207	11					27	while ($count < 1000) {
208	1288	100				14703	my $line = $fh->getline or last;
209	1277	50				22304	next if $line !~ m/\w+/;
210	1277	100				1828	next if $line =~ /^#/;
211	1241					2774	my @fields = split('\t', $line);
212	1241					1476	$types{ $fields[2] } += 1;
213	1241					2148	$count++;
214							}
215	11					303	$fh->close;
216	11					275	return join(',', keys %types);
217							}
218
219
220							sub parse_headers {
221	60			60	1	114	my $self = shift;
222	60		50			206	my $noheader = shift \|\| 0; # boolean to indicate no headers are present
223
224							# filehandle
225	60					93	my $fh;
226	60	50	33			224	if (exists $self->{fh} and $self->{fh}) {
		0
227	60					95	$fh = $self->{fh};
228							}
229							elsif ($self->filename) {
230	0					0	$fh = $self->open_to_read_fh($self->filename);
231							}
232							else {
233	0					0	confess " file metadata and/or open filehandle must be set before parsing headers!";
234							}
235
236							# check that we have an empty table
237	60	50	33			184	if ($self->last_row != 0 or $self->number_columns != 0) {
238	0					0	cluck "Cannot parse file headers onto an existing data table!";
239	0					0	return;
240							}
241
242							# read and parse the file
243							# we will ONLY parse the header lines prefixed with a #, as well as the
244							# first data row which contains the column names
245	60					208	$self->program(undef); # reset this to blank, it will be filled by file metadata
246	60					96	my $header_line_count = 0;
247	60					1258	my $line = $fh->getline; # first line
248
249							# check the first line for proper line endings
250							{
251	60					4205	my $line2 = $line;
	60					105
252	60					122	chomp($line2);
253	60	50				208	if ($line2 =~ /[\r\n]+/) {
254	0					0	my $filename = $self->filename;
255	0					0	die "File '$filename' does not have expected line endings!\n" .
256							" Try converting to native line endings and try again\n";
257							}
258							}
259
260	60					126	while ($line) {
261							# we are not chomping the line here because of possible side effects
262							# with UCSC tables where we have to count elements in the first line
263							# and potential header lines, and the first line has a null value at
264							# the end
265
266							# Parse the datafile metadata headers
267
268							# no real line, just empty space
269	137	50				2213	if ($line !~ m/\w+/) {
		50
		100
		100
		50
		100
		50
		100
		100
270	0					0	$header_line_count++;
271	0					0	next;
272							}
273
274							# the generating program
275							elsif ($line =~ m/^# Program (.+)$/) {
276	0					0	my $p = $1;
277	0					0	$self->program($p);
278	0					0	$header_line_count++;
279							}
280
281							# the source database
282							elsif ($line =~ m/^# Database (.+)$/) {
283	10					29	my $d = $1;
284	10					35	$self->database($d);
285	10					16	$header_line_count++;
286							}
287
288							# the type of feature in this datafile
289							elsif ($line =~ m/^# Feature (.+)$/) {
290	10					24	my $f = $1;
291	10					29	$self->feature($f);
292	10					11	$header_line_count++;
293							}
294
295							# column or dataset specific information
296							elsif ($line =~ m/^# Column_(\d+)/) {
297							# the column number will become the index number
298	0					0	my $index = $1;
299	0					0	$self->add_column_metadata($line, $index);
300	0					0	$header_line_count++;
301							}
302
303							# gff version header
304							elsif ($line =~ /^##gff-version\s+([\d\.]+)$/) {
305							# store the gff version in the hash
306							# this may or may not be present in the gff file, but want to keep
307							# it if it is
308	5					14	my $g = $1;
309	5					28	$self->gff($g);
310	5					6	$header_line_count++;
311							}
312
313							# VCF version header
314							elsif ($line =~ /^##fileformat=VCFv([\d\.]+)$/) {
315							# store the VCF version in the hash
316							# this may or may not be present in the vcf file, but want to keep
317							# it if it is
318	0					0	my $v = $1;
319	0					0	$self->vcf($v);
320	0					0	$self->add_comment($line); # so that it is written properly
321	0					0	$header_line_count++;
322							}
323
324							# any other nonstandard header
325							elsif ($line =~ /^#/) {
326	44					157	$self->add_comment($line);
327	44					53	$header_line_count++;
328							}
329
330							# a track or browser line
331							elsif ($line =~ /^(?:track\|browser)\s+/i) {
332							# common with wig, bed, or bedgraph files for use with UCSC genome browser
333							# treat as a comment line, there's not that much useful info here
334	8	50				36	if ($line =~ /type=(\w+)/i) {
335	8					31	$self->format($1);
336							}
337	8					33	$self->add_comment($line);
338	8					12	$header_line_count++;
339							}
340
341							# the remainder is the data table itself
342							else {
343							# the first row in the data table are (usually) the column names
344							# we only want the names, not the rest of the table
345
346							# specific file formats have implicit predefined column formats
347							# these file formats do NOT have column headers
348							# we will first check for those file formats and process accordingly
349
350							### Data tables with a commented header line
351	60	50				221	if ( $self->_commented_header_line($line) ) {
352							# these will have one comment line marked with #
353							# that really contains the column headers
354
355							# process the real header line
356	0					0	$self->add_standard_metadata( pop @{ $self->{'comments'} } );
	0					0
357							}
358							# we will continue here in case the commented header line was part of a
359							# formatted file type, which will be checked by extension or possibly
360							# the format (if determined) below
361	60		66			169	my $format = $self->format \|\| $self->extension;
362
363							### a GFF file
364	60	100				379	if ($format =~ /g[tf]f/i) {
		100
		100
		50
		50
365	5					19	$self->add_gff_metadata;
366							}
367
368							### a peak file
369							elsif ($format =~ /peak/i) {
370	16					35	my $count = scalar(split '\t', $line);
371	16					74	$self->add_peak_metadata($count);
372							}
373
374							### a Bed or BedGraph file
375							elsif ($format =~ /bg\|bdg\|bed/i) {
376	26					55	my $count = scalar(split '\t', $line);
377	26					90	$self->add_bed_metadata($count);
378							}
379
380							### a UCSC gene table
381							elsif ($format =~ /ref+lat\|genepred\|ucsc/i) {
382	0					0	my $count = scalar(split '\t', $line);
383	0					0	$self->add_ucsc_metadata($count);
384							}
385
386							### a SGR file
387							elsif ($format =~ /sgr/i) {
388	0					0	$self->add_sgr_metadata;
389							}
390
391							### standard text file with headers, i.e. everything else
392	60	100				138	unless ($self->number_columns) {
393							# we have not yet parsed the row of data column names
394							# we will do so now
395	13					54	$self->add_standard_metadata($line);
396
397							# count as a header line
398	13					18	$header_line_count++;
399							}
400							}
401
402							# keep processing by going to the next line until we have identified columns
403	137	100				240	if ($self->number_columns == 0) {
404	77					1125	$line = $fh->getline;
405							}
406							else {
407	60					142	undef $line;
408							}
409							}
410
411							# if we didn't find columns, check that it wasn't actually commented
412							# for example, an empty vcf file
413	60	0	50			124	if ($self->number_columns == 0 and
			0
414	0					0	scalar(@{ $self->{comments} }) and
415							$self->{comments}->[-1] =~ /\t/
416							) {
417							# process the real header line
418	0					0	$self->add_standard_metadata( pop @{ $self->{'comments'} } );
	0					0
419							}
420
421
422							# No header was requested
423	60	0	33			166	if ($noheader and not $self->bed and not $self->gff and
			33
			0
			0
424							not $self->vcf and not $self->ucsc
425							) {
426							# which means that what we used as a header is actually the first data row
427
428							# fix the column names
429	0					0	for (my $i = 0; $i < $self->number_columns; $i++) {
430	0					0	my $name = $self->name($i);
431	0					0	$self->name($i, "Column$i ($name)");
432	0					0	$self->{$i}{'AUTO'} = 3;
433							}
434							# adjust metadata
435	0					0	$header_line_count -= 1;
436	0					0	$self->{'headers'} = -1; # special case, we never write headers here
437							}
438
439							# Header sanity check
440							# some exported file formats, such as from R, do not include a proper
441							# header for the first column, as these are assumed to be row names
442							# this will result in incorrectly parsed files where the last columns
443							# will be merged into one column with an internal tab - not good
444							# need to handle these
445	60					1054	my $nextline = $fh->getline;
446	60	50				1087	if ($nextline) {
447	60					219	my @nextdata = split '\t', $nextline;
448	60	50				164	if (scalar(@nextdata) - 1 == $self->number_columns) {
449							# whoops! we caught a off-by-one discrepancy between header and data row
450	0					0	my $old_last = $self->last_column;
451	0					0	$fh->close; # having this open complicates changing columns....
452
453							# add a new "column" (just metadata for now) and move it to the beginning
454	0					0	my $i = $self->add_column('Column1');
455	0					0	$self->reorder_column($i, 0 .. $old_last);
456							}
457	60	100				168	if (ref($self) eq 'Bio::ToolBox::Data::Stream') {
458							# store an example first line for Stream objects
459	26					45	chomp $nextdata[-1];
460	26					52	$self->{example} = \@nextdata;
461							}
462							}
463
464							# re-open the file
465							# I tried using seek functions - but they don't work with binary gzip
466							# files, and I can't get the seek function to return the same position
467							# as simply advancing through the file like below
468							# so I'll just do it the old way and close/open and advance
469	60	50				291	$fh->close if $fh; # may have been closed from the sanity check above
470	60					1279	$fh = $self->open_to_read_fh;
471	60					161	for (1 .. $header_line_count) {
472	90					2485	my $line = $fh->getline;
473							}
474	60					901	$self->{header_line_count} = $header_line_count;
475	60					85	$self->{fh} = $fh;
476	60					105	return 1;
477							}
478
479
480							sub add_data_line {
481	574			574	1	741	my ($self, $line) = @_;
482
483							# do not chomp the line yet, just split into an array
484	574					2092	my @linedata = split '\t', $line, $self->{number_columns};
485
486							# chomp the last element
487							# we do this here to ensure the tab split above gets all of the values
488							# otherwise trailing null values aren't included in @linedata
489	574					779	chomp $linedata[-1];
490
491							# check for extra remaining tabs
492	574	50				911	if (index($linedata[-1], "\t") != -1) {
493	0					0	die "FILE INCONSISTENCY ERRORS! line has additional tabs (columns) than headers!\n $line";
494							}
495
496							# add the line of data
497	574					557	push @{ $self->{data_table} }, \@linedata;
	574					919
498	574					702	$self->{last_row} += 1;
499	574					3199	return 1;
500							}
501
502
503							### Parse the filename using the list suffix list
504							sub add_file_metadata {
505	70			70	1	173	my ($self, $filename) = @_;
506	70	50				155	confess "no valid filename!" unless defined $filename;
507	70					2352	my ($basename, $path, $extension) = fileparse($filename, $SUFFIX);
508	70	50				206	unless ($extension) {
509							# look for a nonstandard extension, allowing for .gz extension
510	0	0				0	if ($filename =~ /(\.\w+(?:\.gz)?)$/i) {
511	0					0	$extension = $1;
512	0					0	$basename =~ s/$extension\Z//;
513							}
514							}
515	70					156	$self->{filename} = $filename;
516	70					119	$self->{basename} = $basename;
517	70					101	$self->{path} = $path;
518	70					152	$self->{extension} = $extension;
519							}
520
521
522							### Write out a data file
523							sub write_file {
524	8			8	1	14	my $self = shift;
525
526							# collect passed arguments
527	8					11	my %args;
528	8	100				27	if (scalar(@_) == 1) {
529	7					21	$args{'filename'} = $_[0];
530							}
531							else {
532	1					3	%args = @_;
533							}
534	8		0			22	$args{'filename'} \|\|= $args{'file'} \|\| undef;
			33
535	8		50			39	$args{'format'} \|\|= undef;
536	8	50				21	unless (exists $args{'gz'}) {$args{'gz'} = undef}
	8					13
537
538							# check the data
539	8	50				30	unless ($self->verify) {
540	0					0	cluck "bad data structure!";
541	0					0	return;
542							}
543
544							# check filename
545	8	50	33			22	unless ($args{'filename'} or $self->filename) {
546	0					0	cluck "no filename given!\n";
547	0					0	return;
548							}
549
550							# split filename into its base components
551							my ($name, $path, $extension) =
552	8		33			260	fileparse($args{'filename'} \|\| $self->filename, $SUFFIX);
553
554							# Adjust filename extension if necessary
555	8	50				100	if ($extension =~ /(g[tf]f)/i) {
		100
		50
		50
		50
		50
		0
556	0	0				0	if (not $self->gff) {
557							# let's set it to true and see if it passes verification
558	0	0				0	$self->{'gff'} = $extension =~ /gtf/i ? 2.5 : 3; # default
559	0	0	0			0	unless ($self->verify and $self->gff) {
560	0					0	warn " GFF structure changed, re-setting extension from $extension to .txt\n";
561	0					0	$extension =~ s/g[tf]f3?/txt/i;
562							}
563							}
564							}
565							elsif ($extension =~ /bedgraph\|bed\|bdg\|narrowpeak\|broadpeak/i) {
566	3	100				7	if (not $self->bed) {
567							# let's set it to true and see if it passes verification
568	1					1	$self->{'bed'} = 1; # a fake true
569	1	50	33			3	unless ($self->verify and $self->bed) {
570	0					0	warn " BED structure changed, re-setting extension from $extension to .txt\n";
571	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
572							}
573							}
574							}
575							elsif ($extension =~ /vcf/i) {
576	0	0				0	if (not $self->vcf) {
577							# let's set it to true and see if it passes verification
578	0					0	$self->{'vcf'} = 1; # a fake true
579	0	0	0			0	unless ($self->verify and $self->vcf) {
580	0					0	warn " VCF structure changed, re-setting extension from $extension to .txt\n";
581	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
582							}
583							}
584							}
585							elsif ($extension =~ /sgr/i) {
586	0	0				0	unless ($self->{'extension'} =~ /sgr/i) {
587							# original file was not SGR
588							# let's pretend it was and see if still passes
589							# the sgr verification relies on the recorded extension
590	0					0	$self->{'extension'} = '.sgr';
591	0					0	$self->verify;
592	0	0				0	if ($self->extension =~ /txt/) {
593	0					0	warn " SGR structure changed, re-setting extension from $extension to .txt\n";
594							}
595	0					0	$extension = $self->{'extension'};
596							}
597							}
598							elsif ($extension =~ /reff?lat\|genepred\|ucsc/i) {
599	0	0				0	if ($self->ucsc != $self->number_columns) {
600							# it's not set as a ucsc data
601							# let's set it to true and see if it passes verification
602	0					0	$self->ucsc($self->number_columns);
603	0	0	0			0	unless ($self->verify and $self->ucsc) {
604	0					0	warn " UCSC structure changed, re-setting extension from $extension to .txt\n";
605	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
606							}
607							}
608							}
609							elsif ($extension =~ /txt/i) {
610							# plain old text file, sounds good to me
611							# make sure headers are enabled
612	5	50				22	$self->{'headers'} = 1 unless $self->{'headers'} == -1; # original noheader
613							}
614							elsif (not $extension) {
615							# no extension was available
616							# try and determine one from metadata
617
618	0	0				0	if ($self->gff) {
		0
		0
		0
		0
		0
619	0	0				0	$extension = $self->gff == 3 ? '.gff3' : $self->gff == 2.5 ? '.gtf' : '.gff';
		0
620							}
621							elsif ($self->bed) {
622	0	0	0			0	if ($self->format) {
		0
623							# re-use the format value as the extension
624	0					0	$extension = sprintf(".%s", $self->format);
625							}
626							elsif (
627							$self->number_columns == 4 and
628							$self->name(3) =~ /score/i
629							) {
630	0					0	$extension = '.bdg'; # looks like a bedGraph file
631							}
632							else {
633	0					0	$extension = '.bed'; # a regular bed file
634							}
635							}
636							elsif ($self->ucsc) {
637	0	0				0	if ($self->format) {
638							# re-use the format value as the extension
639	0					0	$extension = sprintf(".%s", $self->format);
640							}
641							else {
642							# use a generic ucsc format
643	0					0	$extension = '.ucsc';
644							}
645							}
646							elsif ($self->vcf) {
647	0					0	$extension = '.vcf';
648							}
649							elsif ($name =~ /(\.\w{3}(?:\.gz)?)$/i) {
650							# a non-standard 3 letter file extension
651							# anything else might be construed as part of the filename, so run the
652							# risk of adding a default extension below
653	0					0	$extension = $1;
654	0					0	$name =~ s/$extension\Z//;
655							}
656							elsif ($self->extension) {
657							# original file had an extension, re-use it if appropriate
658							# why wouldn't this get picked up above???? probably old cruft,
659							# or a non-standard or unknown file extension
660							# leave it in for the time being, shouldn't hurt anything
661	0	0				0	if ($self->extension =~ /g[tf]f/i) {
		0
662	0	0				0	$extension = $self->gff ? $self->extension : '.txt';
663							}
664							elsif ($self->extension =~ /bed\|bdg\|peak/i) {
665	0	0				0	$extension = $self->bed ? $self->extension : '.txt';
666							}
667							else {
668							# an unstructured format
669	0					0	$extension = $self->extension;
670							}
671							}
672							else {
673							# normal data text file
674	0					0	$extension = '.txt';
675							}
676							}
677							# otherwise the extension must be good, hope for the best
678
679							# determine format
680							# this is an arcane specification of whether we want a "simple" no metadata
681							# format, or an ordinary text format that may or may not have metadata
682							# it's currently not hurting much, so leave it in for now?
683	8	50				18	unless ($args{'format'}) {
684	8	50				20	if (defined $args{'simple'}) {
		50
685							# an old method of specifying simple
686	0					0	$args{'format'} = 'simple';
687							}
688							elsif ($extension) {
689							# check extension from the parsed filename, if present
690	8	50				27	if ($extension =~ /sgr\|cdt/i) {
691							# sgr is simple format, no headers
692	0					0	$args{'format'} = 'simple';
693							}
694							else {
695							# everything else is text
696	8					17	$args{'format'} = 'text';
697							}
698							}
699							else {
700							# somehow we got this far without defining? use default text
701	0					0	$args{'format'} = 'text';
702							}
703							}
704
705							# check zip status if necessary
706	8	50				17	unless (defined $args{'gz'}) {
707							# look at filename extension as a clue
708							# in case we're overwriting the input file, keep the zip status
709	8	50				31	if ($extension =~ m/\.vcf\.gz/i) {
		50
710							# vcf requires bgzip
711	0					0	$args{'gz'} = 2;
712							}
713							elsif ($extension =~ m/\.gz$/i) {
714	0					0	$args{'gz'} = 1;
715							}
716							else {
717	8					14	$args{'gz'} = 0; # default
718							}
719							}
720
721							# adjust gzip extension as necessary
722	8	50	33			48	if ($args{'gz'} and $extension !~ m/\.gz$/i) {
		50	33
723	0					0	$extension .= '.gz';
724							}
725							elsif (not $args{'gz'} and $extension =~ /\.gz$/i) {
726	0					0	$extension =~ s/\.gz$//i;
727							}
728
729							# check filename length
730							# assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
731							# don't even get me started on Windows NTFS path length limitation
732	8	50				30	if (length($name . $extension) > 255) {
733	0					0	my $limit = 253 - length($extension);
734	0					0	$name = substr($name, 0, $limit) . '..';
735	0					0	warn " filename too long! Truncating to $limit characters\n";
736							}
737
738							# generate the new filename
739	8					19	my $newname = $path . $name . $extension;
740
741
742							# Convert strand information
743	8					31	my $strand_i = $self->strand_column;
744	8	0	0			20	if (defined $strand_i and ($self->gff or $self->bed or $self->ucsc) ) {
			33
745							# convert to +/-/. nomenclature as necessary
746	0	0	0			0	if ($self->gff) {
		0
747	0					0	for my $row (1 .. $self->last_row) {
748	0					0	my $s = $self->{'data_table'}->[$row][$strand_i];
749	0	0				0	if ($s =~ /\d/) {
750	0	0				0	$s = $s == 1 ? '+' : $s == -1 ? '-' : '.';
		0
751							}
752	0					0	$self->{'data_table'}->[$row][$strand_i] = $s;
753							}
754							}
755							elsif ($self->bed or $self->ucsc) {
756	0					0	for my $row (1 .. $self->last_row) {
757	0					0	my $s = $self->{'data_table'}->[$row][$strand_i];
758	0	0				0	if ($s =~ /\d/) {
759	0	0				0	$s = $s >= 0 ? '+' : '-';
760							}
761	0					0	$self->{'data_table'}->[$row][$strand_i] = $s;
762							}
763							}
764							}
765
766
767							# Open file for writing
768	8					39	my $fh = $self->open_to_write_fh($newname, $args{'gz'});
769	8	50				22	return unless defined $fh;
770
771
772							# Write the headers
773	8	50				26	if ($args{'format'} eq 'text') {
774							# default text format has metadata headers
775
776							# write gff statement if gff format
777	8	50				29	if ($self->gff) {
778	0					0	$fh->print('##gff-version ' . $self->gff . "\n");
779							}
780
781							# Write the primary headers
782	8	50	66			19	unless (
			66
			66
			33
783							$self->gff or $self->bed or $self->ucsc or $self->vcf or
784							$extension =~ m/sgr\|kgg\|cdt\|peak/i
785							) {
786							# we only write these for normal text files, not defined format files
787
788	5	50				20	if ($self->program) {
789	0					0	$fh->print('# Program ' . $self->program . "\n");
790							}
791	5	50				17	if ($self->database) {
792	5					13	$fh->print('# Database ' . $self->database . "\n");
793							}
794	5	50				101	if ($self->feature) {
795	5					13	$fh->print('# Feature ' . $self->feature . "\n");
796							}
797							}
798
799							# Write the miscellaneous headers
800	8					27	foreach ( @{ $self->{'comments'} } ) {
	8					25
801							# write remaining miscellaneous header lines if present
802							# we do this for all files
803	6	100				46	unless (/\n$/s) {
804							# append newline if not present
805	4					7	$_ .= "\n";
806							}
807							# check for comment character at beginning
808	6	100				14	if (/^#/) {
809	5					15	$fh->print($_);
810							}
811							else {
812	1					3	$fh->print("# " . $_);
813							}
814							}
815
816							# Write the column metadata headers
817	8					32	for (my $i = 0; $i < $self->number_columns; $i++) {
818							# each column metadata in the hash is referenced by the column's
819							# index number as the key
820							# we will take each index one at a time in increasing order
821
822							# some files do not need or tolerate metadata lines, for those
823							# known files the metadata lines will be skipped
824
825							# these column metadata lines do not need to be written if they
826							# only have two values, presumably name and index, for files
827							# that don't normally have column headers, e.g. gff
828	22	100	66			54	if (
		50	0
		0	0
829							exists $self->{$i}{'AUTO'} and
830	8					20	scalar( keys %{ $self->{$i} } ) == $self->{$i}{'AUTO'}
831							) {
832							# some of the metadata values were autogenerated and
833							# we have the same number of keys as were autogenerated
834							# no need to write these
835	8					14	next;
836							}
837	14					32	elsif (scalar( keys %{ $self->{$i} } ) == 2) {
838							# only two metadata keys exist, name and index
839							# these are so simple it's not worth writing them
840	14					29	next;
841							}
842							elsif ($extension =~ /sgr\|kgg\|cdt/i or $self->ucsc or $self->vcf) {
843							# these do not support metadata lines
844	0					0	next;
845							}
846
847							# we will put each key=value pair into @pairs, listed asciibetically
848	0					0	my @pairs; # an array of the key value pairs from the metadata hash
849							# put name first
850							# we are no longer writing the index number
851	0					0	push @pairs, 'name=' . $self->{$i}{'name'};
852							# put remainder in alphabetical order
853	0					0	foreach (sort {$a cmp $b} keys %{ $self->{$i} } ) {
	0					0
	0					0
854	0	0				0	next if $_ eq 'name'; # already written
855	0	0				0	next if $_ eq 'index'; # internal use only
856	0	0				0	next if $_ eq 'AUTO'; # internal use only
857	0					0	push @pairs, $_ . '=' . $self->{$i}{$_};
858							}
859
860							# Finally write the header line, joining the pairs with a
861							# semi-colon into a single string.
862							# The column identifier is comprised of the word 'Column'
863							# and the index number joined by '_'.
864	0					0	$fh->print("# Column_$i ", join(";", @pairs), "\n");
865							}
866							}
867
868
869							# Write the table column headers
870	8	100				25	if ($self->{'headers'} == 1) {
871	5					10	$fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[0] }));
	5					28
872							}
873
874
875							# Write the data table
876	8	50				59	if ($args{'format'} eq 'simple') {
877
878							# the simple format will strip the non-value '.' from the table
879	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
880							# we will step though the data_table array one row at a time
881							# convert the non-value '.' to undefined
882							# and print using a tab-delimited format
883	0					0	my @linedata;
884	0					0	foreach ( @{ $self->{'data_table'}[$i] }) {
	0					0
885	0	0				0	if ($_ eq '.') {
886	0					0	push @linedata, undef;
887							} else {
888	0					0	push @linedata, $_;
889							}
890							}
891	0					0	$fh->printf("%s\n", join("\t", @linedata));
892							}
893							}
894
895							else {
896							# normal data files
897	8					26	for (my $i = 1; $i <= $self->last_row; $i++) {
898							# we will step though the data_table array one row at a time
899							# we will join each row's array of elements into a string to print
900							# using a tab-delimited format
901	104					125	$fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[$i] }));
	104					202
902							}
903							}
904
905							# done writing
906	8					29	$fh->close;
907
908							# if we made it this far, it should've been a success!
909							# return the new file name as indication of success
910	8					433	return $newname;
911							}
912
913							sub save {
914	6			6	1	35	return shift->write_file(@_);
915							}
916
917
918							#### Open a file for reading
919							sub open_to_read_fh {
920	161			161	1	264	my $self = shift;
921	161		100			455	my $file = shift \|\| undef;
922	161	100				517	my $obj = ref($self) =~ /^Bio::ToolBox/ ? 1 : 0;
923
924							# check file
925	161	100	66			475	if (not $file and $obj) {
926	120		50			334	$file = $self->filename \|\| undef;
927							}
928	161	50				271	unless ($file) {
929	0					0	carp " no filename provided or associated with object!";
930	0					0	return;
931							}
932
933							# Open filehandle object as appropriate
934	161					172	my $fh;
935	161	50				509	if ($file =~ /\.gz$/i) {
		50
936							# the file is compressed with gzip
937	0	0				0	$fh = IO::File->new("gzip -dc $file \|") or
938							carp "unable to read '$file' $!\n";
939							}
940							elsif ($file =~ /\.bz2$/i) {
941							# the file is compressed with bzip2
942	0	0				0	$fh = IO::File->new("bzip2 -dc $file \|") or
943							carp "unable to read '$file' $!\n";
944							}
945							else {
946							# the file is uncompressed and space hogging
947	161	50				727	$fh = IO::File->new($file, 'r') or
948							carp "unable to read '$file' $!\n";
949							}
950
951	161	100				13535	if ($obj) {
952	120					291	$self->{fh} = $fh;
953							}
954	161					499	return $fh;
955							}
956
957
958							#### Open a file for writing
959							sub open_to_write_fh {
960	10			10	1	23	my ($self, $filename, $gz, $append) = @_;
961
962							# check filename
963	10	50				19	unless ($filename) {
964	0					0	carp " no filename to write!";
965	0					0	return;
966							}
967
968							# check filename length
969							# assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
970	10					105	my $name = fileparse($filename);
971	10	50				58	if (length $name > 255) {
972	0					0	carp " filename is too long! please shorten\n";
973	0					0	return;
974							}
975
976							# check zip status if necessary
977	10	100				22	unless (defined $gz) {
978							# look at filename extension as a clue
979							# in case we're overwriting the input file, keep the zip status
980	2	50				8	if ($filename =~ m/\.vcf(\.gz)?$/i) {
		50
981	0					0	$gz = 2; # bgzip
982							}
983							elsif ($filename =~ m/\.gz$/i) {
984	0					0	$gz = 1; # regular gzip
985							}
986							else {
987	2					3	$gz = 0; # default
988							}
989							}
990
991							# gzip compression application
992	10	50	33			47	if ($gz == 1 and not $gzip_app) {
		50	33
993							# use parallel gzip if possible
994							# this is stored in a global variable so we only have to look once
995	0					0	$gzip_app = which('pigz');
996	0	0				0	if ($gzip_app) {
997	0					0	$gzip_app .= ' -p 3'; # use a conservative 3 processes, plus perl, so 4 total
998							}
999							else {
1000							# default is the standard gzip application
1001							# should be available in any application
1002	0					0	$gzip_app = which('gzip');
1003							}
1004	0	0				0	unless ($gzip_app) {
1005	0					0	croak "no gzip application in PATH to open compressed file handle output!\n";
1006							}
1007							}
1008							elsif ($gz == 2 and not $bgzip_app) {
1009							# use parallel bgzip if possible
1010							# this is stored in a global variable so we only have to look once
1011	0					0	$bgzip_app = which('bgzip');
1012	0	0				0	if ($bgzip_app) {
1013							# I'm going to assume this is a recent bgzip with multi-threading
1014							# use 3 threads, same as with pigz
1015	0					0	$bgzip_app .= ' -@ 3 -c';
1016							}
1017	0	0				0	unless ($bgzip_app) {
1018	0					0	croak "no bgzip application in PATH to open compressed file handle output!\n";
1019							}
1020							}
1021	10	50				24	my $gzipper = $gz == 1 ? $gzip_app : $gz == 2 ? $bgzip_app : undef;
		50
1022
1023							# check file append mode
1024	10	100				22	unless (defined $append) {
1025							# default is not to append
1026	8					12	$append = 0;
1027							}
1028
1029
1030							# Generate appropriate filehandle object
1031	10					15	my $fh;
1032	10	100	66			41	if (not $gzipper and not $append) {
		50	33
		50	33
		0	0
1033	8	50				51	$fh = IO::File->new($filename, 'w') or
1034							carp "cannot write to file '$filename' $!\n";
1035							}
1036							elsif ($gzipper and !$append) {
1037	0	0				0	$fh = IO::File->new("\| $gzipper >$filename") or
1038							carp "cannot write to compressed file '$filename' $!\n";
1039							}
1040							elsif (not $gzipper and $append) {
1041	2	50				10	$fh = IO::File->new(">> $filename") or
1042							carp "cannot append to file '$filename' $!\n";
1043							}
1044							elsif ($gzipper and $append) {
1045	0	0				0	$fh = IO::File->new("\| $gzipper >>$filename") or
1046							carp "cannot append to compressed file '$filename' $!\n";
1047							}
1048	10	50				1465	return $fh if defined $fh;
1049							}
1050
1051
1052							### Subroutine to check for file existance
1053							sub check_file {
1054	60			60	1	129	my ($self, $filename) = @_;
1055
1056							# check for file existance
1057	60	50				1269	if (-e $filename) {
1058							# confirmed full filename and path
1059	60					292	return $filename;
1060							}
1061							else {
1062							# file name is either incomplete or non-existent
1063							# try adding some common file extensions in case those are missing
1064	0					0	my $new_filename;
1065	0					0	foreach my $ext (qw(.gz .txt .txt.gz .bed .bed.gz)) {
1066	0	0				0	if (-e $filename . $ext) {
1067	0					0	$new_filename = $filename . $ext;
1068	0					0	last;
1069							}
1070							}
1071	0					0	return $new_filename;
1072							}
1073							}
1074
1075
1076							sub fh {
1077	192			192	0	212	my $self = shift;
1078	192	50				2643	return $self->{fh} if exists $self->{fh};
1079	0					0	return;
1080							}
1081
1082							sub close_fh {
1083	70			70	0	192	my $self = shift;
1084	70	50	33			482	$self->{fh}->close if (exists $self->{fh} and $self->{fh});
1085							}
1086
1087
1088							### Internal subroutine to check if a comment line contains headers
1089							sub _commented_header_line {
1090	60			60		120	my ($data, $line) = @_;
1091
1092							# prepare arrays from the other lines and current line
1093	60					104	my @commentdata;
1094	60	100				68	if ( scalar @{ $data->{'comments'} } >= 1 ) {
	60					162
1095							# take the last line in the other array
1096	30					99	@commentdata = split '\t', $data->{'comments'}->[-1];
1097							}
1098	60					206	my @linedata = split '\t', $line;
1099
1100							# check if the counts are equal
1101	60	50				132	if (scalar @commentdata == scalar @linedata) {
1102	0					0	return 1;
1103							}
1104							else {
1105	60					191	return 0;
1106							}
1107							}
1108
1109
1110							### Internal subroutine to process metadata for standard columns
1111							sub add_column_metadata {
1112	0			0	1	0	my ($data, $line, $index) = @_;
1113
1114							# strip the Column metadata identifier
1115	0					0	chomp $line;
1116	0					0	$line =~ s/^# Column_\d+ //;
1117
1118							# break up the column metadata
1119	0					0	my %temphash; # a temporary hash to put the column metadata into
1120	0					0	foreach (split ';', $line) {
1121	0					0	my ($key, $value) = split '=';
1122	0	0				0	if ($key eq 'index') {
1123	0	0				0	if ($index != $value) {
1124							# the value from the metadata index key should be
1125							# correct, so we will use that
1126	0					0	$index = $value;
1127							}
1128							}
1129							# store the key & value
1130	0					0	$temphash{$key} = $value;
1131							}
1132
1133							# create a index metadata key if not already present
1134							# the rest of biotoolbox may expect this to be present
1135	0	0				0	unless (exists $temphash{'index'}) {
1136	0					0	$temphash{'index'} = $index;
1137							}
1138
1139							# store the column metadata hash into the main data hash
1140							# use the index as the key
1141	0	0				0	if (exists $data->{$index}) {
1142							# we will simply overwrite the previous metadata hash
1143							# harsh, I know, but what to do?
1144							# if it was canned metadata for a gff file, that's ok
1145	0					0	warn "Warning: more than one metadata line exists for index $index!\n";
1146	0					0	$data->{$index} = \%temphash;
1147							}
1148							else {
1149							# metadata hash doesn't exist, so we will add it
1150	0					0	$data->{$index} = \%temphash;
1151							}
1152	0					0	return 1;
1153							}
1154
1155							### Subroutine to generate metadata for gff files
1156							# gff files have nine defined columns
1157							# there are different specifications and variants:
1158							# gff (v.1), gff v.2, gff v.2.5 (aka gtf), gff v.3 (gff3)
1159							# however, the columns are the same in all versions
1160							# more info on gff can be found http://gmod.org/wiki/GFF3
1161							sub add_gff_metadata {
1162	5			5	1	9	my $self = shift;
1163	5		50			23	my $version = shift \|\| undef;
1164	5		50			22	my $force = shift \|\| 0;
1165
1166							# set the gff version based on the extension if it isn't already
1167	5	50	33			11	if (not($self->gff) or $force) {
1168	0	0				0	if (defined $version) {
		0
		0
1169	0					0	$self->gff($version);
1170							}
1171							elsif ($self->extension =~ /gtf/i) {
1172	0					0	$self->gff(2.5);
1173							}
1174							elsif ($self->extension =~ /gff3/i) {
1175	0					0	$self->gff(3);
1176							}
1177							else {
1178	0					0	$self->gff(2); # hope for the best
1179							}
1180							}
1181							# set format based on version
1182	5	50				18	if (not $self->format) {
1183	5					11	my $v = $self->gff;
1184	5	0				21	my $f = $v == 3 ? 'gff3' : $v > 2 ? 'gtf' : 'gff';
		50
1185	5					12	$self->format($f);
1186							}
1187
1188							# set the metadata for the each column
1189							# some of these may already be defined if there was a
1190							# column metadata specific column in the file
1191	5					22	my $column_names = $self->standard_column_names('gff');
1192	5					17	for (my $i = 0; $i < 9; $i++) {
1193							# loop for each column
1194							# set metadata unless it's already loaded
1195	45	50	33			100	if ($force or not exists $self->{$i}) {
1196	45					99	$self->{$i}{'name'} = $column_names->[$i];
1197	45					57	$self->{$i}{'index'} = $i;
1198	45					69	$self->{$i}{'AUTO'} = 3;
1199							}
1200							# assign the name to the column header
1201	45	50	33			108	if ($force or not defined $self->{'column_names'}->[$i]) {
1202	45					84	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1203							}
1204							}
1205	5					11	$self->{data_table}->[0] = $self->{'column_names'};
1206
1207							# set column number always to 9
1208	5	50	33			17	if ($force or $self->{'number_columns'} == 0) {
1209	5					10	$self->{'number_columns'} = 9;
1210							}
1211
1212							# set headers flag to false
1213	5	50				16	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1214
1215							# set the feature type
1216	5	50				10	unless (defined $self->{'feature'}) {
1217	5					9	$self->{'feature'} = 'region';
1218							}
1219	5					11	return 1;
1220							}
1221
1222
1223							### Subroutine to generate metadata for BED files
1224							# bed files have a loose format
1225							# they require a minimum of 3 columns, and have a max of 12
1226							# 3, 6, and 12 column files are the most common
1227							# there are also something called paired bed files floating around
1228							# The official details and specifications may be found at
1229							# http://genome.ucsc.edu/FAQ/FAQformat#format1
1230
1231							# a special type of bed file is the bedgraph, using
1232							# either a bdg, bedgraph, or simply bed extension
1233							# these only have four columns, no more, no less
1234							# the fourth column is score, not name
1235							sub add_bed_metadata {
1236	27			27	1	48	my ($self, $column_count) = @_;
1237	27		50			57	my $force = shift \|\| 0;
1238
1239							# check bed type and set metadata appropriately
1240	27					40	my $bed_names;
1241	27	50	33			57	if ($self->format =~ /bedgraph/i or $self->extension =~ /bg\|bdg\|graph/i) {
1242	0					0	$self->format('bedGraph'); # possibly redundant
1243	0					0	$self->bed($column_count);
1244	0					0	$bed_names = $self->standard_column_names('bdg');
1245							}
1246							else {
1247	27					67	$self->format('bed');
1248	27					75	$self->bed($column_count);
1249	27					74	$bed_names = $self->standard_column_names('bed12');
1250							}
1251	27					53	$self->{'number_columns'} = $column_count;
1252	27					39	$self->{'zerostart'} = 1;
1253
1254							# set the metadata for each column
1255							# some of these may already be defined if there was a
1256							# column metadata specific column in the file
1257	27					79	for (my $i = 0; $i < $column_count; $i++) {
1258							# loop for each column
1259							# set name unless it already has one from metadata
1260	200	50	33			277	if ($force or not exists $self->{$i}) {
1261	200		50			427	$self->{$i}{'name'} = $bed_names->[$i] \|\| 'extraColumn';
1262	200					254	$self->{$i}{'index'} = $i;
1263	200					219	$self->{$i}{'AUTO'} = 3;
1264							}
1265							# assign the name to the column header
1266	200	50	33			282	if ($force or not defined $self->{'column_names'}->[$i]) {
1267	200					361	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1268							}
1269							}
1270	27					51	$self->{data_table}->[0] = $self->{'column_names'};
1271
1272							# set the feature type
1273	27	50				53	unless (defined $self->{'feature'}) {
1274	27					43	$self->{'feature'} = 'region';
1275							}
1276
1277							# set headers flag to false
1278	27	50				75	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1279	27					61	return 1;
1280							}
1281
1282
1283							### Subroutine to generate metadata for broadpeak and narrowpeak files
1284							# three different types of peak files are available
1285							# see http://genome.ucsc.edu/FAQ/FAQformat.html
1286							sub add_peak_metadata {
1287	16			16	1	31	my ($self, $column_count) = @_;
1288	16		50			47	my $force = shift \|\| 0;
1289
1290							# check bed type and set metadata appropriately
1291							# most of these are bed6 plus extra columns
1292	16					28	my $column_names;
1293	16	100	66			38	if ($self->format =~ /narrow/i or $self->extension =~ /narrow/i) {
		50	33
		50	33
1294	8					25	$self->format('narrowPeak');
1295	8					26	$self->bed($column_count);
1296	8					24	$column_names = $self->standard_column_names('narrowpeak');
1297							}
1298							elsif ($self->format =~ /broad/i or $self->extension =~ /broad/i) {
1299	0					0	$self->format('broadPeak'); # possibly redundant
1300	0					0	$self->bed($column_count);
1301	0					0	$column_names = $self->standard_column_names('broadpeak');
1302							}
1303							elsif ($self->format =~ /gapped/i or $self->extension =~ /gapped/i) {
1304	8					27	$self->format('gappedPeak'); # possibly redundant
1305	8					27	$self->bed($column_count);
1306	8					25	$column_names = $self->standard_column_names('gappedpeak');
1307							}
1308							else {
1309							# how did we get here???? Hope for the best.....
1310	0					0	$self->bed($column_count);
1311	0					0	$column_names = $self->standard_column_names('bed12');
1312							}
1313	16					39	$self->{'number_columns'} = $column_count;
1314	16					34	$self->{'zerostart'} = 1;
1315
1316							# add metadata
1317	16					47	for (my $i = 0; $i < $column_count; $i++) {
1318	200	50	33			271	if ($force or not exists $self->{$i}) {
1319	200		50			464	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1320	200					254	$self->{$i}{'index'} = $i;
1321	200					219	$self->{$i}{'AUTO'} = 3;
1322							}
1323							# assign the name to the column header
1324	200	50	33			291	if ($force or not defined $self->{'column_names'}->[$i]) {
1325	200					330	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1326							}
1327							}
1328	16					32	$self->{data_table}->[0] = $self->{'column_names'};
1329
1330							# set the feature type
1331	16	50				36	unless (defined $self->{'feature'}) {
1332	16					25	$self->{'feature'} = 'region';
1333							}
1334
1335							# set headers flag to false
1336	16	50				64	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1337	16					43	return 1;
1338							}
1339
1340
1341							### Subroutine to generate metadata for various UCSC gene files
1342							# these are tricky, as we will try to determine contents by counting
1343							# the columns, which may not be accurate
1344							# not only that, but this presumes the extension even makes it recognizable
1345							# see http://genome.ucsc.edu/FAQ/FAQformat.html#format9 for details
1346							# also biotoolbox script ucsc_table2gff3.pl
1347							sub add_ucsc_metadata {
1348	3			3	1	7	my ($self, $column_count) = @_;
1349	3		50			10	my $force = shift \|\| 0;
1350
1351							# set metadata
1352	3					6	$self->{'number_columns'} = $column_count;
1353	3					7	$self->{'ucsc'} = $column_count;
1354
1355							# set format and determine column names;
1356	3					3	my $column_names;
1357	3	50				8	if ($column_count == 16) {
		0
		0
		0
		0
1358	3					9	$self->format('genePredExt');
1359	3					10	$column_names = $self->standard_column_names('ucsc16');
1360							}
1361							elsif ($column_count == 15) {
1362	0					0	$self->format('genePredExt');
1363	0					0	$column_names = $self->standard_column_names('ucsc15');
1364							}
1365							elsif ($column_count == 12) {
1366	0					0	$self->format('knownGene');
1367	0					0	$column_names = $self->standard_column_names('ucsc12');
1368							}
1369							elsif ($column_count == 11) {
1370	0					0	$self->format('refFlat');
1371	0					0	$column_names = $self->standard_column_names('ucsc11');
1372							}
1373							elsif ($column_count == 10) {
1374	0					0	$self->format('genePred');
1375	0					0	$column_names = $self->standard_column_names('ucsc10');
1376							}
1377	3					6	$self->{'zerostart'} = 1;
1378
1379							# assign the column names and metadata
1380	3					11	for (my $i = 0; $i < $column_count; $i++) {
1381							# loop for each column
1382							# set name unless it already has one from metadata
1383	48	50	33			70	if ($force or not exists $self->{$i}) {
1384	48		50			74	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1385	48					52	$self->{$i}{'index'} = $i;
1386	48					51	$self->{$i}{'AUTO'} = 3;
1387							}
1388							# assign the name to the column header
1389	48	50	33			65	if ($force or not defined $self->{'column_names'}->[$i]) {
1390	48					107	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1391							}
1392							}
1393	3					5	$self->{data_table}->[0] = $self->{'column_names'};
1394
1395							# set the feature type
1396	3	50				8	unless (defined $self->{'feature'}) {
1397	3					5	$self->{'feature'} = 'gene';
1398							}
1399
1400							# set headers flag to false
1401	3	50				12	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1402	3					8	return 1;
1403							}
1404
1405
1406							### Subroutine to generate metadata for SGR files
1407							# a sgr file contains three columns: chromo, position, score
1408							# this is a very simple file format, useful in exporting and
1409							# importing to binary BAR files used in T2, USeq, and IGB
1410							sub add_sgr_metadata {
1411	0			0	1	0	my $self = shift;
1412	0		0			0	my $force = shift \|\| 0;
1413
1414							# set column metadata
1415	0					0	my $column_names = $self->standard_column_names('sgr');
1416	0					0	for (my $i = 0; $i < 3; $i++) {
1417							# loop for each column
1418							# set name unless it already has one from metadata
1419	0	0	0			0	if ($force or not exists $self->{$i}) {
1420	0		0			0	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1421	0					0	$self->{$i}{'index'} = $i;
1422	0					0	$self->{$i}{'AUTO'} = 3;
1423							}
1424							# assign the name to the column header
1425	0	0	0			0	if ($force or not defined $self->{'column_names'}->[$i]) {
1426	0					0	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1427							}
1428							}
1429	0					0	$self->{data_table}->[0] = $self->{'column_names'};
1430	0					0	$self->{'number_columns'} = 3;
1431
1432
1433							# set headers flag to false
1434	0	0				0	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1435
1436							# set the feature type
1437	0	0				0	unless (defined $self->{'feature'}) {
1438	0					0	$self->{'feature'} = 'region';
1439							}
1440	0					0	return 1;
1441							}
1442
1443
1444							### Internal subroutine to generate metadata for standard files
1445							sub add_standard_metadata {
1446	13			13	1	23	my ($self, $line) = @_;
1447
1448	13					42	my @namelist = split '\t', $line;
1449	13					15	chomp $namelist[-1];
1450
1451							# we will define the columns based on
1452	13					36	for my $i (0..$#namelist) {
1453							# make up a name if one doesn't exist
1454	68		66			99	$namelist[$i] \|\|= "Column_$i";
1455
1456							# confirm that a file metadata exists for this column
1457	68	50				96	if (exists $self->{$i}) {
1458	0	0				0	unless ($namelist[$i] eq $self->{$i}->{'name'}) {
1459	0					0	warn "metadata and header names for column $i do not match!";
1460							# set the name to match the actual column name
1461	0					0	$self->{$i}->{'name'} = $namelist[$i];
1462							}
1463							}
1464
1465							# otherwise be nice and generate it here
1466							else {
1467	68					193	$self->{$i} = {
1468							'name' => $namelist[$i],
1469							'index' => $i,
1470							'AUTO' => 3,
1471							};
1472							}
1473							}
1474
1475							# check the number of columns
1476	13	50				34	if (scalar @namelist != $self->{'number_columns'} ) {
1477							# adjust to match actual content
1478	13					29	$self->{'number_columns'} = scalar @namelist;
1479							}
1480
1481							# put the column names in the metadata
1482	13					27	$self->{'column_names'} = \@namelist;
1483	13					24	$self->{data_table}->[0] = $self->{'column_names'};
1484
1485							# set headers flag to true
1486	13					18	$self->{'headers'} = 1;
1487	13					19	return 1;
1488							}
1489
1490
1491							### Internal subroutine to generate hash of standard file format column names
1492							sub standard_column_names {
1493	51			51	1	95	my ($self, $type) = @_;
1494
1495	51	100	0			217	if ($type eq 'gff') {
		100	0
		50	0
		50	0
		100
		50
		100
		50
		50
		0
		0
		0
		0
1496	5					23	return [qw(Chromosome Source Type Start Stop Score Strand Phase Group)];
1497							}
1498							elsif ($type eq 'bed12') {
1499	27					89	return [qw(Chromosome Start0 End Name Score Strand
1500							thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0)];
1501							}
1502							elsif ($type eq 'bed6') {
1503	0					0	return [qw(Chromosome Start0 End Name Score Strand)];
1504							}
1505							elsif ($type eq 'bdg') {
1506	0					0	return [qw(Chromosome Start0 End Score)];
1507							}
1508							elsif ($type eq 'narrowpeak') {
1509	8					27	return [qw(Chromosome Start0 End Name Score Strand signalValue
1510							pValue qValue peak)];
1511							}
1512							elsif ($type eq 'broadpeak') {
1513	0					0	return [qw(Chromosome Start0 End Name Score Strand signalValue
1514							pValue qValue)];
1515							}
1516							elsif ($type eq 'gappedpeak') {
1517	8					31	return [qw(Chromosome Start0 End Name Score Strand
1518							thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0
1519							signalValue pValue qValue)];
1520							}
1521							elsif ($type eq 'sgr') {
1522	0					0	return [qw(Chromo Start Score)];
1523							}
1524							elsif ($type eq 'ucsc16') {
1525	3					16	return [qw(bin name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1526							exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1527							}
1528							elsif ($type eq 'ucsc15' or $type eq 'genepredext') {
1529	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1530							exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1531							}
1532							elsif ($type eq 'ucsc12' or $type eq 'knowngene') {
1533	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1534							exonStarts0 exonEnds proteinID alignID)];
1535							}
1536							elsif ($type eq 'ucsc11' or $type eq 'refflat') {
1537	0						return [qw(geneName transcriptName chrom strand txStart0 txEnd cdsStart0
1538							cdsEnd exonCount exonStarts0 exonEnds)];
1539							}
1540							elsif ($type eq 'ucsc10' or $type eq 'genepred') {
1541	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1542							exonStarts exonEnds)];
1543							}
1544							else {
1545	0						confess "unrecognized standard column name format '$type'!";
1546							}
1547							}
1548
1549
1550							__END__