File Coverage

blib/lib/Bio/ToolBox/Data/file.pm

Criterion	Covered	Total	%
statement	353	616	57.3
branch	186	474	39.2
condition	65	211	30.8
subroutine	25	27	92.5
pod	19	21	90.4
total	648	1349	48.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data::file;
2							our $VERSION = '1.68';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data::file - File functions to Bio:ToolBox::Data family
7
8							=head1 DESCRIPTION
9
10							File methods for reading and writing data files for both L
11							and L objects. This module should not be used
12							directly. See the respective modules for more information.
13
14							=cut
15
16	4			4		30	use strict;
	4					9
	4					165
17	4			4		25	use Carp qw(carp cluck croak confess);
	4					10
	4					219
18	4			4		23	use File::Basename qw(fileparse);
	4					9
	4					280
19	4			4		2150	use File::Which;
	4					4428
	4					224
20	4			4		1606	use IO::File;
	4					19252
	4					33486
21
22							# List of acceptable filename extensions
23							our $SUFFIX = qr/\.(?:txt\|gff3?\|gtf\|bed\|bg\|bdg\|bedgraph\|sgr\|kgg\|cdt\|vcf\|narrowpeak\|broadpeak\|gappedpeak\|reff?lat\|genepred\|ucsc\|maf)(?:\.gz\|\.bz2)?/i;
24
25							# gzip application
26							our $gzip_app;
27							our $bgzip_app;
28
29							### The True Statement
30							1;
31
32
33
34							### Load new version data table from file
35
36							sub load_file {
37	10			10	1	28	my $self = shift;
38	10					23	my $file = shift;
39	10		50			48	my $noheader = shift \|\| 0; # may not be present
40
41							# check that we have an empty table
42	10	50	33			60	if ($self->last_row != 0 or $self->number_columns != 0 or $self->filename) {
			33
43	0					0	carp "Cannot load file onto an existing data table!";
44	0					0	return;
45							}
46
47							# open the file and load metadata
48	10					46	my $filename = $self->check_file($file);
49	10	50				43	unless ($filename) {
50	0					0	print " file '$file' does not exist!\n";
51	0					0	return;
52							}
53	10					54	$self->add_file_metadata($filename);
54	10	50				48	$self->open_to_read_fh or return;
55	10					86	$self->parse_headers($noheader);
56
57							# Load the data table
58	10					194	while (my $line = $self->{fh}->getline) {
59							# the current file position should be at the beginning of the
60							# data table information
61	198	50				4817	next if $line !~ m/\w+/;
62
63							# skip comment and empty lines
64	198	50				455	if (substr($line,0,1) eq '#') {
65	0					0	$self->add_comment($line);
66	0					0	next;
67							}
68
69							# process the line
70	198					447	$self->add_data_line($line);
71							}
72
73							# completed loading the file
74	10					378	$self->close_fh;
75	10					190	delete $self->{fh};
76
77							# verify the structure
78	10	50				48	return unless $self->verify;
79
80							# finished
81	10					53	return 1;
82							}
83
84							sub taste_file {
85	24			24	1	2459	my $self = shift;
86	24					47	my $file = shift;
87	24	50				69	my $filename = $self->check_file($file) or return;
88	24					109	my $Taste = $self->new;
89	24					93	$Taste->add_file_metadata($filename);
90	24	50				76	$Taste->open_to_read_fh or return;
91	24					105	$Taste->parse_headers;
92
93							# load first 10 data lines
94	24					76	$Taste->{data_table}->[0] = $Taste->{'column_names'}; # set table header names
95	24					78	for (my $i = 1; $i <= 10; $i++) {
96	192	100				384	my $line = $Taste->fh->getline or last;
97	180	50				4406	next if $line !~ m/\w+/;
98	180	50				362	next if $line =~ /^#/;
99	180					377	$Taste->add_data_line($line);
100							}
101	24					471	$Taste->close_fh;
102	24					552	$Taste->verify(1); # silently check the integrity of the file
103
104							# check existing metadata
105	24	100				77	if ($Taste->gff) {
		100
		50
106	4					94	return 'gff';
107							}
108							elsif ($Taste->bed) {
109	12					253	return 'bed';
110							}
111							elsif ($Taste->ucsc) {
112	0					0	return 'ucsc';
113							}
114
115
116							# check if the number of columns match a known format, then verify
117	8					26	my $number = $Taste->number_columns;
118	8	50				70	if ($number == 9) {
		50
		50
		50
		50
		100
119							# possibly a GFF file
120	0					0	$Taste->gff(2);
121	0					0	$Taste->verify(1);
122	0	0				0	return 'gff' if $Taste->gff == 2;
123							}
124							elsif ($number == 10) {
125							# possibly a genePred file
126	0					0	$Taste->ucsc(10);
127	0					0	$Taste->verify(1);
128	0	0				0	return 'ucsc' if $Taste->ucsc == 10;
129	0					0	$Taste->add_ucsc_metadata(10,1); # force metadata
130	0					0	$Taste->verify(1);
131	0	0				0	return 'ucsc' if $Taste->ucsc == 10;
132							}
133							elsif ($number == 11) {
134							# possibly a refFlat file
135	0					0	$Taste->ucsc(11);
136	0					0	$Taste->verify(1);
137	0	0				0	return 'ucsc' if $Taste->ucsc == 11;
138	0					0	$Taste->add_ucsc_metadata(11,1); # force metadata
139	0					0	$Taste->verify(1);
140	0	0				0	return 'ucsc' if $Taste->ucsc == 11;
141							}
142							elsif ($number == 12) {
143							# possibly a knownGene or BED12 file
144	0					0	$Taste->ucsc(12);
145	0					0	$Taste->verify(1);
146	0	0				0	return 'ucsc' if $Taste->ucsc == 12;
147	0					0	$Taste->bed(12);
148	0					0	$Taste->verify(1);
149	0	0				0	return 'bed' if $Taste->bed == 12;
150	0					0	$Taste->add_ucsc_metadata(12,1); # force metadata
151	0					0	$Taste->verify(1);
152	0	0				0	return 'ucsc' if $Taste->ucsc == 12;
153	0					0	$Taste->add_bed_metadata(12,1); # force metadata
154	0					0	$Taste->verify(1);
155	0	0				0	return 'bed' if $Taste->bed == 12;
156							}
157							elsif ($number == 15) {
158							# possibly a genePredExt file
159	0					0	$Taste->ucsc(15);
160	0					0	$Taste->verify(1);
161	0	0				0	return 'ucsc' if $Taste->ucsc == 15;
162	0					0	$Taste->add_ucsc_metadata(15,1); # force metadata
163	0					0	$Taste->verify(1);
164	0	0				0	return 'ucsc' if $Taste->ucsc == 15;
165							}
166							elsif ($number == 16) {
167							# possibly a genePredExt file
168	3					10	$Taste->ucsc(16);
169	3					11	$Taste->verify(1);
170	3	50				25	return 'ucsc' if $Taste->ucsc == 16;
171	3					26	$Taste->add_ucsc_metadata(16,1); # force metadata
172	3					11	$Taste->verify(1);
173	3	50				10	return 'ucsc' if $Taste->ucsc == 16;
174							}
175	5					71	return;
176							}
177
178							sub sample_gff_type_list {
179	11			11	1	31	my ($self, $file) = @_;
180	11	50				81	return unless $file =~ /\.g[tf]f\d?(?:\.gz)?$/i; # assume extension is accurate
181	11	50				43	my $fh = $self->open_to_read_fh($file) or die "can't open $file!\n";
182	11					27	my %types;
183	11					21	my $count = 0;
184	11					41	while ($count < 1000) {
185	1288	100				20748	my $line = $fh->getline or last;
186	1277	50				30790	next if $line !~ m/\w+/;
187	1277	100				2494	next if $line =~ /^#/;
188	1241					3735	my @fields = split('\t', $line);
189	1241					1969	$types{ $fields[2] } += 1;
190	1241					2868	$count++;
191							}
192	11					390	$fh->close;
193	11					351	return join(',', keys %types);
194							}
195
196
197							sub parse_headers {
198	60			60	1	118	my $self = shift;
199	60		50			240	my $noheader = shift \|\| 0; # boolean to indicate no headers are present
200
201							# filehandle
202	60					171	my $fh;
203	60	50	33			267	if (exists $self->{fh} and $self->{fh}) {
		0
204	60					111	$fh = $self->{fh};
205							}
206							elsif ($self->filename) {
207	0					0	$fh = $self->open_to_read_fh($self->filename);
208							}
209							else {
210	0					0	confess " file metadata and/or open filehandle must be set before parsing headers!";
211							}
212
213							# check that we have an empty table
214	60	50	33			203	if ($self->last_row != 0 or $self->number_columns != 0) {
215	0					0	cluck "Cannot parse file headers onto an existing data table!";
216	0					0	return;
217							}
218
219							# read and parse the file
220							# we will ONLY parse the header lines prefixed with a #, as well as the
221							# first data row which contains the column names
222	60					258	$self->program(undef); # reset this to blank, it will be filled by file metadata
223	60					100	my $header_line_count = 0;
224	60					1611	my $line = $fh->getline; # first line
225
226							# check the first line for proper line endings
227							{
228	60					6116	my $line2 = $line;
	60					128
229	60					142	chomp($line2);
230	60	50				277	if ($line2 =~ /[\r\n]+/) {
231	0					0	my $filename = $self->filename;
232	0					0	die "File '$filename' does not have expected line endings!\n" .
233							" Try converting to native line endings and try again\n";
234							}
235							}
236
237	60					139	while ($line) {
238							# we are not chomping the line here because of possible side effects
239							# with UCSC tables where we have to count elements in the first line
240							# and potential header lines, and the first line has a null value at
241							# the end
242
243							# Parse the datafile metadata headers
244
245							# no real line, just empty space
246	137	50				2902	if ($line !~ m/\w+/) {
		50
		100
		100
		50
		100
		50
		100
		100
247	0					0	$header_line_count++;
248	0					0	next;
249							}
250
251							# the generating program
252							elsif ($line =~ m/^# Program (.+)$/) {
253	0					0	my $p = $1;
254	0					0	$self->program($p);
255	0					0	$header_line_count++;
256							}
257
258							# the source database
259							elsif ($line =~ m/^# Database (.+)$/) {
260	10					33	my $d = $1;
261	10					43	$self->database($d);
262	10					20	$header_line_count++;
263							}
264
265							# the type of feature in this datafile
266							elsif ($line =~ m/^# Feature (.+)$/) {
267	10					28	my $f = $1;
268	10					34	$self->feature($f);
269	10					22	$header_line_count++;
270							}
271
272							# column or dataset specific information
273							elsif ($line =~ m/^# Column_(\d+)/) {
274							# the column number will become the index number
275	0					0	my $index = $1;
276	0					0	$self->add_column_metadata($line, $index);
277	0					0	$header_line_count++;
278							}
279
280							# gff version header
281							elsif ($line =~ /^##gff-version\s+([\d\.]+)$/) {
282							# store the gff version in the hash
283							# this may or may not be present in the gff file, but want to keep
284							# it if it is
285	5					19	my $g = $1;
286	5					36	$self->gff($g);
287	5					11	$header_line_count++;
288							}
289
290							# VCF version header
291							elsif ($line =~ /^##fileformat=VCFv([\d\.]+)$/) {
292							# store the VCF version in the hash
293							# this may or may not be present in the vcf file, but want to keep
294							# it if it is
295	0					0	my $v = $1;
296	0					0	$self->vcf($v);
297	0					0	$self->add_comment($line); # so that it is written properly
298	0					0	$header_line_count++;
299							}
300
301							# any other nonstandard header
302							elsif ($line =~ /^#/) {
303	44					196	$self->add_comment($line);
304	44					58	$header_line_count++;
305							}
306
307							# a track or browser line
308							elsif ($line =~ /^(?:track\|browser)\s+/i) {
309							# common with wig, bed, or bedgraph files for use with UCSC genome browser
310							# treat as a comment line, there's not that much useful info here
311	8	50				39	if ($line =~ /type=(\w+)/i) {
312	8					52	$self->format($1);
313							}
314	8					40	$self->add_comment($line);
315	8					14	$header_line_count++;
316							}
317
318							# the remainder is the data table itself
319							else {
320							# the first row in the data table are (usually) the column names
321							# we only want the names, not the rest of the table
322
323							# specific file formats have implicit predefined column formats
324							# these file formats do NOT have column headers
325							# we will first check for those file formats and process accordingly
326
327							### Data tables with a commented header line
328	60	50				269	if ( $self->_commented_header_line($line) ) {
329							# these will have one comment line marked with #
330							# that really contains the column headers
331
332							# process the real header line
333	0					0	$self->add_standard_metadata( pop @{ $self->{'comments'} } );
	0					0
334							}
335							# we will continue here in case the commented header line was part of a
336							# formatted file type, which will be checked by extension or possibly
337							# the format (if determined) below
338	60		66			197	my $format = $self->format \|\| $self->extension;
339
340							### a GFF file
341	60	100				419	if ($format =~ /g[tf]f/i) {
		100
		100
		50
		50
342	5					30	$self->add_gff_metadata;
343							}
344
345							### a peak file
346							elsif ($format =~ /peak/i) {
347	16					44	my $count = scalar(split '\t', $line);
348	16					79	$self->add_peak_metadata($count);
349							}
350
351							### a Bed or BedGraph file
352							elsif ($format =~ /bg\|bdg\|bed/i) {
353	26					68	my $count = scalar(split '\t', $line);
354	26					114	$self->add_bed_metadata($count);
355							}
356
357							### a UCSC gene table
358							elsif ($format =~ /ref+lat\|genepred\|ucsc/i) {
359	0					0	my $count = scalar(split '\t', $line);
360	0					0	$self->add_ucsc_metadata($count);
361							}
362
363							### a SGR file
364							elsif ($format =~ /sgr/i) {
365	0					0	$self->add_sgr_metadata;
366							}
367
368							### standard text file with headers, i.e. everything else
369	60	100				256	unless ($self->number_columns) {
370							# we have not yet parsed the row of data column names
371							# we will do so now
372	13					55	$self->add_standard_metadata($line);
373
374							# count as a header line
375	13					20	$header_line_count++;
376							}
377							}
378
379							# keep processing by going to the next line until we have identified columns
380	137	100				327	if ($self->number_columns == 0) {
381	77					1475	$line = $fh->getline;
382							}
383							else {
384	60					164	undef $line;
385							}
386							}
387
388							# if we didn't find columns, check that it wasn't actually commented
389							# for example, an empty vcf file
390	60	0	50			131	if ($self->number_columns == 0 and
			0
391	0					0	scalar(@{ $self->{comments} }) and
392							$self->{comments}->[-1] =~ /\t/
393							) {
394							# process the real header line
395	0					0	$self->add_standard_metadata( pop @{ $self->{'comments'} } );
	0					0
396							}
397
398
399							# No header was requested
400	60	0	33			167	if ($noheader and not $self->bed and not $self->gff and
			33
			0
			0
401							not $self->vcf and not $self->ucsc
402							) {
403							# which means that what we used as a header is actually the first data row
404
405							# fix the column names
406	0					0	for (my $i = 0; $i < $self->number_columns; $i++) {
407	0					0	my $name = $self->name($i);
408	0					0	$self->name($i, "Column$i ($name)");
409	0					0	$self->{$i}{'AUTO'} = 3;
410							}
411							# adjust metadata
412	0					0	$header_line_count -= 1;
413	0					0	$self->{'headers'} = -1; # special case, we never write headers here
414							}
415
416							# Header sanity check
417							# some exported file formats, such as from R, do not include a proper
418							# header for the first column, as these are assumed to be row names
419							# this will result in incorrectly parsed files where the last columns
420							# will be merged into one column with an internal tab - not good
421							# need to handle these
422	60					1364	my $nextline = $fh->getline;
423	60	50				1634	if ($nextline) {
424	60					281	my @nextdata = split '\t', $nextline;
425	60	50				211	if (scalar(@nextdata) - 1 == $self->number_columns) {
426							# whoops! we caught a off-by-one discrepancy between header and data row
427	0					0	my $old_last = $self->last_column;
428	0					0	$fh->close; # having this open complicates changing columns....
429
430							# add a new "column" (just metadata for now) and move it to the beginning
431	0					0	my $i = $self->add_column('Column1');
432	0					0	$self->reorder_column($i, 0 .. $old_last);
433							}
434							}
435
436							# re-open the file
437							# I tried using seek functions - but they don't work with binary gzip
438							# files, and I can't get the seek function to return the same position
439							# as simply advancing through the file like below
440							# so I'll just do it the old way and close/open and advance
441	60	50				300	$fh->close if $fh; # may have been closed from the sanity check above
442	60					1661	$fh = $self->open_to_read_fh;
443	60					229	for (1 .. $header_line_count) {
444	90					3406	my $line = $fh->getline;
445							}
446	60					1215	$self->{header_line_count} = $header_line_count;
447	60					120	$self->{fh} = $fh;
448	60					152	return 1;
449							}
450
451
452							sub add_data_line {
453	574			574	1	993	my ($self, $line) = @_;
454
455							# do not chomp the line yet, just split into an array
456	574					2776	my @linedata = split '\t', $line, $self->{number_columns};
457
458							# chomp the last element
459							# we do this here to ensure the tab split above gets all of the values
460							# otherwise trailing null values aren't included in @linedata
461	574					1021	chomp $linedata[-1];
462
463							# check for extra remaining tabs
464	574	50				1262	if (index($linedata[-1], "\t") != -1) {
465	0					0	die "FILE INCONSISTENCY ERRORS! line has additional tabs (columns) than headers!\n $line";
466							}
467
468							# add the line of data
469	574					751	push @{ $self->{data_table} }, \@linedata;
	574					1241
470	574					948	$self->{last_row} += 1;
471	574					4371	return 1;
472							}
473
474
475							### Parse the filename using the list suffix list
476							sub add_file_metadata {
477	70			70	1	170	my ($self, $filename) = @_;
478	70	50				174	confess "no valid filename!" unless defined $filename;
479	70					2770	my ($basename, $path, $extension) = fileparse($filename, $SUFFIX);
480	70	50				264	unless ($extension) {
481							# look for a nonstandard extension, allowing for .gz extension
482	0	0				0	if ($filename =~ /(\.\w+(?:\.gz)?)$/i) {
483	0					0	$extension = $1;
484	0					0	$basename =~ s/$extension\Z//;
485							}
486							}
487	70					168	$self->{filename} = $filename;
488	70					148	$self->{basename} = $basename;
489	70					120	$self->{path} = $path;
490	70					167	$self->{extension} = $extension;
491							}
492
493
494							### Write out a data file
495							sub write_file {
496	8			8	1	20	my $self = shift;
497
498							# collect passed arguments
499	8					16	my %args;
500	8	100				32	if (scalar(@_) == 1) {
501	7					19	$args{'filename'} = $_[0];
502							}
503							else {
504	1					5	%args = @_;
505							}
506	8		0			29	$args{'filename'} \|\|= $args{'file'} \|\| undef;
			33
507	8		50			47	$args{'format'} \|\|= undef;
508	8	50				24	unless (exists $args{'gz'}) {$args{'gz'} = undef}
	8					22
509
510							# check the data
511	8	50				35	unless ($self->verify) {
512	0					0	cluck "bad data structure!";
513	0					0	return;
514							}
515
516							# check filename
517	8	50	33			31	unless ($args{'filename'} or $self->filename) {
518	0					0	cluck "no filename given!\n";
519	0					0	return;
520							}
521
522							# split filename into its base components
523							my ($name, $path, $extension) =
524	8		33			298	fileparse($args{'filename'} \|\| $self->filename, $SUFFIX);
525
526							# Adjust filename extension if necessary
527	8	50				102	if ($extension =~ /(g[tf]f)/i) {
		100
		50
		50
		50
		50
		0
528	0	0				0	if (not $self->gff) {
529							# let's set it to true and see if it passes verification
530	0	0				0	$self->{'gff'} = $extension =~ /gtf/i ? 2.5 : 3; # default
531	0	0	0			0	unless ($self->verify and $self->gff) {
532	0					0	warn " GFF structure changed, re-setting extension from $extension to .txt\n";
533	0					0	$extension =~ s/g[tf]f3?/txt/i;
534							}
535							}
536							}
537							elsif ($extension =~ /bedgraph\|bed\|bdg\|narrowpeak\|broadpeak/i) {
538	3	100				13	if (not $self->bed) {
539							# let's set it to true and see if it passes verification
540	1					4	$self->{'bed'} = 1; # a fake true
541	1	50	33			4	unless ($self->verify and $self->bed) {
542	0					0	warn " BED structure changed, re-setting extension from $extension to .txt\n";
543	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
544							}
545							}
546							}
547							elsif ($extension =~ /vcf/i) {
548	0	0				0	if (not $self->vcf) {
549							# let's set it to true and see if it passes verification
550	0					0	$self->{'vcf'} = 1; # a fake true
551	0	0	0			0	unless ($self->verify and $self->vcf) {
552	0					0	warn " VCF structure changed, re-setting extension from $extension to .txt\n";
553	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
554							}
555							}
556							}
557							elsif ($extension =~ /sgr/i) {
558	0	0				0	unless ($self->{'extension'} =~ /sgr/i) {
559							# original file was not SGR
560							# let's pretend it was and see if still passes
561							# the sgr verification relies on the recorded extension
562	0					0	$self->{'extension'} = '.sgr';
563	0					0	$self->verify;
564	0	0				0	if ($self->extension =~ /txt/) {
565	0					0	warn " SGR structure changed, re-setting extension from $extension to .txt\n";
566							}
567	0					0	$extension = $self->{'extension'};
568							}
569							}
570							elsif ($extension =~ /reff?lat\|genepred\|ucsc/i) {
571	0	0				0	if ($self->ucsc != $self->number_columns) {
572							# it's not set as a ucsc data
573							# let's set it to true and see if it passes verification
574	0					0	$self->ucsc($self->number_columns);
575	0	0	0			0	unless ($self->verify and $self->ucsc) {
576	0					0	warn " UCSC structure changed, re-setting extension from $extension to .txt\n";
577	0	0				0	$extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
578							}
579							}
580							}
581							elsif ($extension =~ /txt/i) {
582							# plain old text file, sounds good to me
583							# make sure headers are enabled
584	5					13	$self->{'headers'} = 1;
585							}
586							elsif (not $extension) {
587							# no extension was available
588							# try and determine one from metadata
589
590	0	0				0	if ($self->gff) {
		0
		0
		0
		0
		0
591	0	0				0	$extension = $self->gff == 3 ? '.gff3' : $self->gff == 2.5 ? '.gtf' : '.gff';
		0
592							}
593							elsif ($self->bed) {
594	0	0	0			0	if ($self->format) {
		0
595							# re-use the format value as the extension
596	0					0	$extension = sprintf(".%s", $self->format);
597							}
598							elsif (
599							$self->number_columns == 4 and
600							$self->name(3) =~ /score/i
601							) {
602	0					0	$extension = '.bdg'; # looks like a bedGraph file
603							}
604							else {
605	0					0	$extension = '.bed'; # a regular bed file
606							}
607							}
608							elsif ($self->ucsc) {
609	0	0				0	if ($self->format) {
610							# re-use the format value as the extension
611	0					0	$extension = sprintf(".%s", $self->format);
612							}
613							else {
614							# use a generic ucsc format
615	0					0	$extension = '.ucsc';
616							}
617							}
618							elsif ($self->vcf) {
619	0					0	$extension = '.vcf';
620							}
621							elsif ($name =~ /(\.\w{3}(?:\.gz)?)$/i) {
622							# a non-standard 3 letter file extension
623							# anything else might be construed as part of the filename, so run the
624							# risk of adding a default extension below
625	0					0	$extension = $1;
626	0					0	$name =~ s/$extension\Z//;
627							}
628							elsif ($self->extension) {
629							# original file had an extension, re-use it if appropriate
630							# why wouldn't this get picked up above???? probably old cruft,
631							# or a non-standard or unknown file extension
632							# leave it in for the time being, shouldn't hurt anything
633	0	0				0	if ($self->extension =~ /g[tf]f/i) {
		0
634	0	0				0	$extension = $self->gff ? $self->extension : '.txt';
635							}
636							elsif ($self->extension =~ /bed\|bdg\|peak/i) {
637	0	0				0	$extension = $self->bed ? $self->extension : '.txt';
638							}
639							else {
640							# an unstructured format
641	0					0	$extension = $self->extension;
642							}
643							}
644							else {
645							# normal data text file
646	0					0	$extension = '.txt';
647							}
648							}
649							# otherwise the extension must be good, hope for the best
650
651							# determine format
652							# this is an arcane specification of whether we want a "simple" no metadata
653							# format, or an ordinary text format that may or may not have metadata
654							# it's currently not hurting much, so leave it in for now?
655	8	50				29	unless ($args{'format'}) {
656	8	50				32	if (defined $args{'simple'}) {
		50
657							# an old method of specifying simple
658	0					0	$args{'format'} = 'simple';
659							}
660							elsif ($extension) {
661							# check extension from the parsed filename, if present
662	8	50				38	if ($extension =~ /sgr\|cdt/i) {
663							# sgr is simple format, no headers
664	0					0	$args{'format'} = 'simple';
665							}
666							else {
667							# everything else is text
668	8					20	$args{'format'} = 'text';
669							}
670							}
671							else {
672							# somehow we got this far without defining? use default text
673	0					0	$args{'format'} = 'text';
674							}
675							}
676
677							# check zip status if necessary
678	8	50				25	unless (defined $args{'gz'}) {
679							# look at filename extension as a clue
680							# in case we're overwriting the input file, keep the zip status
681	8	50				33	if ($extension =~ m/\.vcf\.gz/i) {
		50
682							# vcf requires bgzip
683	0					0	$args{'gz'} = 2;
684							}
685							elsif ($extension =~ m/\.gz$/i) {
686	0					0	$args{'gz'} = 1;
687							}
688							else {
689	8					26	$args{'gz'} = 0; # default
690							}
691							}
692
693							# adjust gzip extension as necessary
694	8	50	33			61	if ($args{'gz'} and $extension !~ m/\.gz$/i) {
		50	33
695	0					0	$extension .= '.gz';
696							}
697							elsif (not $args{'gz'} and $extension =~ /\.gz$/i) {
698	0					0	$extension =~ s/\.gz$//i;
699							}
700
701							# check filename length
702							# assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
703							# don't even get me started on Windows NTFS path length limitation
704	8	50				32	if (length($name . $extension) > 255) {
705	0					0	my $limit = 253 - length($extension);
706	0					0	$name = substr($name, 0, $limit) . '..';
707	0					0	warn " filename too long! Truncating to $limit characters\n";
708							}
709
710							# generate the new filename
711	8					39	my $newname = $path . $name . $extension;
712
713
714							# Convert strand information
715	8					52	my $strand_i = $self->strand_column;
716	8	0	0			27	if (defined $strand_i and ($self->gff or $self->bed or $self->ucsc) ) {
			33
717							# convert to +/-/. nomenclature as necessary
718	0	0	0			0	if ($self->gff) {
		0
719	0					0	for my $row (1 .. $self->last_row) {
720	0					0	my $s = $self->{'data_table'}->[$row][$strand_i];
721	0	0				0	if ($s =~ /\d/) {
722	0	0				0	$s = $s == 1 ? '+' : $s == -1 ? '-' : '.';
		0
723							}
724	0					0	$self->{'data_table'}->[$row][$strand_i] = $s;
725							}
726							}
727							elsif ($self->bed or $self->ucsc) {
728	0					0	for my $row (1 .. $self->last_row) {
729	0					0	my $s = $self->{'data_table'}->[$row][$strand_i];
730	0	0				0	if ($s =~ /\d/) {
731	0	0				0	$s = $s >= 0 ? '+' : '-';
732							}
733	0					0	$self->{'data_table'}->[$row][$strand_i] = $s;
734							}
735							}
736							}
737
738
739							# Open file for writing
740	8					51	my $fh = $self->open_to_write_fh($newname, $args{'gz'});
741	8	50				26	return unless defined $fh;
742
743
744							# Write the headers
745	8	50				35	if ($args{'format'} eq 'text') {
746							# default text format has metadata headers
747
748							# write gff statement if gff format
749	8	50				32	if ($self->gff) {
750	0					0	$fh->print('##gff-version ' . $self->gff . "\n");
751							}
752
753							# Write the primary headers
754	8	50	66			35	unless (
			66
			66
			33
755							$self->gff or $self->bed or $self->ucsc or $self->vcf or
756							$extension =~ m/sgr\|kgg\|cdt\|peak/i
757							) {
758							# we only write these for normal text files, not defined format files
759
760	5	50				19	if ($self->program) {
761	0					0	$fh->print('# Program ' . $self->program . "\n");
762							}
763	5	50				17	if ($self->database) {
764	5					22	$fh->print('# Database ' . $self->database . "\n");
765							}
766	5	50				136	if ($self->feature) {
767	5					15	$fh->print('# Feature ' . $self->feature . "\n");
768							}
769							}
770
771							# Write the miscellaneous headers
772	8					43	foreach ( @{ $self->{'comments'} } ) {
	8					31
773							# write remaining miscellaneous header lines if present
774							# we do this for all files
775	6	100				63	unless (/\n$/s) {
776							# append newline if not present
777	4					11	$_ .= "\n";
778							}
779							# check for comment character at beginning
780	6	100				19	if (/^#/) {
781	5					33	$fh->print($_);
782							}
783							else {
784	1					5	$fh->print("# " . $_);
785							}
786							}
787
788							# Write the column metadata headers
789	8					43	for (my $i = 0; $i < $self->number_columns; $i++) {
790							# each column metadata in the hash is referenced by the column's
791							# index number as the key
792							# we will take each index one at a time in increasing order
793
794							# some files do not need or tolerate metadata lines, for those
795							# known files the metadata lines will be skipped
796
797							# these column metadata lines do not need to be written if they
798							# only have two values, presumably name and index, for files
799							# that don't normally have column headers, e.g. gff
800	22	100	66			71	if (
		50	0
		0	0
801							exists $self->{$i}{'AUTO'} and
802	8					26	scalar( keys %{ $self->{$i} } ) == $self->{$i}{'AUTO'}
803							) {
804							# some of the metadata values were autogenerated and
805							# we have the same number of keys as were autogenerated
806							# no need to write these
807	8					19	next;
808							}
809	14					59	elsif (scalar( keys %{ $self->{$i} } ) == 2) {
810							# only two metadata keys exist, name and index
811							# these are so simple it's not worth writing them
812	14					40	next;
813							}
814							elsif ($extension =~ /sgr\|kgg\|cdt/i or $self->ucsc or $self->vcf) {
815							# these do not support metadata lines
816	0					0	next;
817							}
818
819							# we will put each key=value pair into @pairs, listed asciibetically
820	0					0	my @pairs; # an array of the key value pairs from the metadata hash
821							# put name first
822							# we are no longer writing the index number
823	0					0	push @pairs, 'name=' . $self->{$i}{'name'};
824							# put remainder in alphabetical order
825	0					0	foreach (sort {$a cmp $b} keys %{ $self->{$i} } ) {
	0					0
	0					0
826	0	0				0	next if $_ eq 'name'; # already written
827	0	0				0	next if $_ eq 'index'; # internal use only
828	0	0				0	next if $_ eq 'AUTO'; # internal use only
829	0					0	push @pairs, $_ . '=' . $self->{$i}{$_};
830							}
831
832							# Finally write the header line, joining the pairs with a
833							# semi-colon into a single string.
834							# The column identifier is comprised of the word 'Column'
835							# and the index number joined by '_'.
836	0					0	$fh->print("# Column_$i ", join(";", @pairs), "\n");
837							}
838							}
839
840
841							# Write the table column headers
842	8	100				40	if ($self->{'headers'} == 1) {
843	5					12	$fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[0] }));
	5					34
844							}
845
846
847							# Write the data table
848	8	50				69	if ($args{'format'} eq 'simple') {
849
850							# the simple format will strip the non-value '.' from the table
851	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
852							# we will step though the data_table array one row at a time
853							# convert the non-value '.' to undefined
854							# and print using a tab-delimited format
855	0					0	my @linedata;
856	0					0	foreach ( @{ $self->{'data_table'}[$i] }) {
	0					0
857	0	0				0	if ($_ eq '.') {
858	0					0	push @linedata, undef;
859							} else {
860	0					0	push @linedata, $_;
861							}
862							}
863	0					0	$fh->printf("%s\n", join("\t", @linedata));
864							}
865							}
866
867							else {
868							# normal data files
869	8					30	for (my $i = 1; $i <= $self->last_row; $i++) {
870							# we will step though the data_table array one row at a time
871							# we will join each row's array of elements into a string to print
872							# using a tab-delimited format
873	104					145	$fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[$i] }));
	104					280
874							}
875							}
876
877							# done writing
878	8					45	$fh->close;
879
880							# if we made it this far, it should've been a success!
881							# return the new file name as indication of success
882	8					471	return $newname;
883							}
884
885							sub save {
886	6			6	1	40	return shift->write_file(@_);
887							}
888
889
890							#### Open a file for reading
891							sub open_to_read_fh {
892	161			161	1	305	my $self = shift;
893	161		100			555	my $file = shift \|\| undef;
894	161	100				641	my $obj = ref($self) =~ /^Bio::ToolBox/ ? 1 : 0;
895
896							# check file
897	161	100	66			611	if (not $file and $obj) {
898	120		50			401	$file = $self->filename \|\| undef;
899							}
900	161	50				394	unless ($file) {
901	0					0	carp " no filename provided or associated with object!";
902	0					0	return;
903							}
904
905							# Open filehandle object as appropriate
906	161					229	my $fh;
907	161	50				695	if ($file =~ /\.gz$/i) {
		50
908							# the file is compressed with gzip
909	0	0				0	$fh = IO::File->new("gzip -dc $file \|") or
910							carp "unable to read '$file' $!\n";
911							}
912							elsif ($file =~ /\.bz2$/i) {
913							# the file is compressed with bzip2
914	0	0				0	$fh = IO::File->new("bzip2 -dc $file \|") or
915							carp "unable to read '$file' $!\n";
916							}
917							else {
918							# the file is uncompressed and space hogging
919	161	50				873	$fh = IO::File->new($file, 'r') or
920							carp "unable to read '$file' $!\n";
921							}
922
923	161	100				17658	if ($obj) {
924	120					353	$self->{fh} = $fh;
925							}
926	161					694	return $fh;
927							}
928
929
930							#### Open a file for writing
931							sub open_to_write_fh {
932	10			10	1	50	my ($self, $filename, $gz, $append) = @_;
933
934							# check filename
935	10	50				25	unless ($filename) {
936	0					0	carp " no filename to write!";
937	0					0	return;
938							}
939
940							# check filename length
941							# assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
942	10					164	my $name = fileparse($filename);
943	10	50				41	if (length $name > 255) {
944	0					0	carp " filename is too long! please shorten\n";
945	0					0	return;
946							}
947
948							# check zip status if necessary
949	10	100				27	unless (defined $gz) {
950							# look at filename extension as a clue
951							# in case we're overwriting the input file, keep the zip status
952	2	50				19	if ($filename =~ m/\.vcf(\.gz)?$/i) {
		50
953	0					0	$gz = 2; # bgzip
954							}
955							elsif ($filename =~ m/\.gz$/i) {
956	0					0	$gz = 1; # regular gzip
957							}
958							else {
959	2					4	$gz = 0; # default
960							}
961							}
962
963							# gzip compression application
964	10	50	33			64	if ($gz == 1 and not $gzip_app) {
		50	33
965							# use parallel gzip if possible
966							# this is stored in a global variable so we only have to look once
967	0					0	$gzip_app = which('pigz');
968	0	0				0	if ($gzip_app) {
969	0					0	$gzip_app .= ' -p 3'; # use a conservative 3 processes, plus perl, so 4 total
970							}
971							else {
972							# default is the standard gzip application
973							# should be available in any application
974	0					0	$gzip_app = which('gzip');
975							}
976	0	0				0	unless ($gzip_app) {
977	0					0	croak "no gzip application in PATH to open compressed file handle output!\n";
978							}
979							}
980							elsif ($gz == 2 and not $bgzip_app) {
981							# use parallel bgzip if possible
982							# this is stored in a global variable so we only have to look once
983	0					0	$bgzip_app = which('bgzip');
984	0	0				0	if ($bgzip_app) {
985							# I'm going to assume this is a recent bgzip with multi-threading
986							# use 3 threads, same as with pigz
987	0					0	$bgzip_app .= ' -@ 3 -c';
988							}
989	0	0				0	unless ($bgzip_app) {
990	0					0	croak "no bgzip application in PATH to open compressed file handle output!\n";
991							}
992							}
993	10	50				38	my $gzipper = $gz == 1 ? $gzip_app : $gz == 2 ? $bgzip_app : undef;
		50
994
995							# check file append mode
996	10	100				37	unless (defined $append) {
997							# default is not to append
998	8					16	$append = 0;
999							}
1000
1001
1002							# Generate appropriate filehandle object
1003	10					17	my $fh;
1004	10	100	66			77	if (not $gzipper and not $append) {
		50	33
		50	33
		0	0
1005	8	50				60	$fh = IO::File->new($filename, 'w') or
1006							carp "cannot write to file '$filename' $!\n";
1007							}
1008							elsif ($gzipper and !$append) {
1009	0	0				0	$fh = IO::File->new("\| $gzipper >$filename") or
1010							carp "cannot write to compressed file '$filename' $!\n";
1011							}
1012							elsif (not $gzipper and $append) {
1013	2	50				13	$fh = IO::File->new(">> $filename") or
1014							carp "cannot append to file '$filename' $!\n";
1015							}
1016							elsif ($gzipper and $append) {
1017	0	0				0	$fh = IO::File->new("\| $gzipper >>$filename") or
1018							carp "cannot append to compressed file '$filename' $!\n";
1019							}
1020	10	50				1746	return $fh if defined $fh;
1021							}
1022
1023
1024							### Subroutine to check for file existance
1025							sub check_file {
1026	60			60	1	142	my ($self, $filename) = @_;
1027
1028							# check for file existance
1029	60	50				1538	if (-e $filename) {
1030							# confirmed full filename and path
1031	60					341	return $filename;
1032							}
1033							else {
1034							# file name is either incomplete or non-existent
1035							# try adding some common file extensions in case those are missing
1036	0					0	my $new_filename;
1037	0					0	foreach my $ext (qw(.gz .txt .txt.gz .bed .bed.gz)) {
1038	0	0				0	if (-e $filename . $ext) {
1039	0					0	$new_filename = $filename . $ext;
1040	0					0	last;
1041							}
1042							}
1043	0					0	return $new_filename;
1044							}
1045							}
1046
1047
1048							sub fh {
1049	192			192	0	318	my $self = shift;
1050	192	50				3552	return $self->{fh} if exists $self->{fh};
1051	0					0	return;
1052							}
1053
1054							sub close_fh {
1055	70			70	0	249	my $self = shift;
1056	70	50	33			532	$self->{fh}->close if (exists $self->{fh} and $self->{fh});
1057							}
1058
1059
1060							### Internal subroutine to check if a comment line contains headers
1061							sub _commented_header_line {
1062	60			60		143	my ($data, $line) = @_;
1063
1064							# prepare arrays from the other lines and current line
1065	60					113	my @commentdata;
1066	60	100				102	if ( scalar @{ $data->{'comments'} } >= 1 ) {
	60					191
1067							# take the last line in the other array
1068	30					128	@commentdata = split '\t', $data->{'comments'}->[-1];
1069							}
1070	60					286	my @linedata = split '\t', $line;
1071
1072							# check if the counts are equal
1073	60	50				170	if (scalar @commentdata == scalar @linedata) {
1074	0					0	return 1;
1075							}
1076							else {
1077	60					220	return 0;
1078							}
1079							}
1080
1081
1082							### Internal subroutine to process metadata for standard columns
1083							sub add_column_metadata {
1084	0			0	1	0	my ($data, $line, $index) = @_;
1085
1086							# strip the Column metadata identifier
1087	0					0	chomp $line;
1088	0					0	$line =~ s/^# Column_\d+ //;
1089
1090							# break up the column metadata
1091	0					0	my %temphash; # a temporary hash to put the column metadata into
1092	0					0	foreach (split ';', $line) {
1093	0					0	my ($key, $value) = split '=';
1094	0	0				0	if ($key eq 'index') {
1095	0	0				0	if ($index != $value) {
1096							# the value from the metadata index key should be
1097							# correct, so we will use that
1098	0					0	$index = $value;
1099							}
1100							}
1101							# store the key & value
1102	0					0	$temphash{$key} = $value;
1103							}
1104
1105							# create a index metadata key if not already present
1106							# the rest of biotoolbox may expect this to be present
1107	0	0				0	unless (exists $temphash{'index'}) {
1108	0					0	$temphash{'index'} = $index;
1109							}
1110
1111							# store the column metadata hash into the main data hash
1112							# use the index as the key
1113	0	0				0	if (exists $data->{$index}) {
1114							# we will simply overwrite the previous metadata hash
1115							# harsh, I know, but what to do?
1116							# if it was canned metadata for a gff file, that's ok
1117	0					0	warn "Warning: more than one metadata line exists for index $index!\n";
1118	0					0	$data->{$index} = \%temphash;
1119							}
1120							else {
1121							# metadata hash doesn't exist, so we will add it
1122	0					0	$data->{$index} = \%temphash;
1123							}
1124	0					0	return 1;
1125							}
1126
1127							### Subroutine to generate metadata for gff files
1128							# gff files have nine defined columns
1129							# there are different specifications and variants:
1130							# gff (v.1), gff v.2, gff v.2.5 (aka gtf), gff v.3 (gff3)
1131							# however, the columns are the same in all versions
1132							# more info on gff can be found http://gmod.org/wiki/GFF3
1133							sub add_gff_metadata {
1134	5			5	1	12	my $self = shift;
1135	5		50			26	my $version = shift \|\| undef;
1136	5		50			26	my $force = shift \|\| 0;
1137
1138							# set the gff version based on the extension if it isn't already
1139	5	50	33			17	if (not($self->gff) or $force) {
1140	0	0				0	if (defined $version) {
		0
		0
1141	0					0	$self->gff($version);
1142							}
1143							elsif ($self->extension =~ /gtf/i) {
1144	0					0	$self->gff(2.5);
1145							}
1146							elsif ($self->extension =~ /gff3/i) {
1147	0					0	$self->gff(3);
1148							}
1149							else {
1150	0					0	$self->gff(2); # hope for the best
1151							}
1152							}
1153
1154							# set the metadata for the each column
1155							# some of these may already be defined if there was a
1156							# column metadata specific column in the file
1157	5					27	my $column_names = $self->standard_column_names('gff');
1158	5					24	for (my $i = 0; $i < 9; $i++) {
1159							# loop for each column
1160							# set metadata unless it's already loaded
1161	45	50	33			169	if ($force or not exists $self->{$i}) {
1162	45					138	$self->{$i}{'name'} = $column_names->[$i];
1163	45					81	$self->{$i}{'index'} = $i;
1164	45					74	$self->{$i}{'AUTO'} = 3;
1165							}
1166							# assign the name to the column header
1167	45	50	33			134	if ($force or not defined $self->{'column_names'}->[$i]) {
1168	45					132	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1169							}
1170							}
1171	5					21	$self->{data_table}->[0] = $self->{'column_names'};
1172
1173							# set column number always to 9
1174	5	50	33			35	if ($force or $self->{'number_columns'} == 0) {
1175	5					20	$self->{'number_columns'} = 9;
1176							}
1177
1178							# set headers flag to false
1179	5	50				31	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1180
1181							# set the feature type
1182	5	50				21	unless (defined $self->{'feature'}) {
1183	5					9	$self->{'feature'} = 'region';
1184							}
1185	5					19	return 1;
1186							}
1187
1188
1189							### Subroutine to generate metadata for BED files
1190							# bed files have a loose format
1191							# they require a minimum of 3 columns, and have a max of 12
1192							# 3, 6, and 12 column files are the most common
1193							# there are also something called paired bed files floating around
1194							# The official details and specifications may be found at
1195							# http://genome.ucsc.edu/FAQ/FAQformat#format1
1196
1197							# a special type of bed file is the bedgraph, using
1198							# either a bdg, bedgraph, or simply bed extension
1199							# these only have four columns, no more, no less
1200							# the fourth column is score, not name
1201							sub add_bed_metadata {
1202	27			27	1	63	my ($self, $column_count) = @_;
1203	27		50			78	my $force = shift \|\| 0;
1204
1205							# check bed type and set metadata appropriately
1206	27					41	my $bed_names;
1207	27	50	33			75	if ($self->format =~ /bedgraph/i or $self->extension =~ /bg\|bdg\|graph/i) {
1208	0					0	$self->format('bedGraph'); # possibly redundant
1209	0					0	$self->bed($column_count);
1210	0					0	$bed_names = $self->standard_column_names('bdg');
1211							}
1212							else {
1213	27					89	$self->format('bed');
1214	27					108	$self->bed($column_count);
1215	27					98	$bed_names = $self->standard_column_names('bed12');
1216							}
1217	27					79	$self->{'number_columns'} = $column_count;
1218
1219							# set the metadata for each column
1220							# some of these may already be defined if there was a
1221							# column metadata specific column in the file
1222	27					140	for (my $i = 0; $i < $column_count; $i++) {
1223							# loop for each column
1224							# set name unless it already has one from metadata
1225	200	50	33			396	if ($force or not exists $self->{$i}) {
1226	200		50			566	$self->{$i}{'name'} = $bed_names->[$i] \|\| 'extraColumn';
1227	200					350	$self->{$i}{'index'} = $i;
1228	200					307	$self->{$i}{'AUTO'} = 3;
1229							}
1230							# assign the name to the column header
1231	200	50	33			392	if ($force or not defined $self->{'column_names'}->[$i]) {
1232	200					565	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1233							}
1234							}
1235	27					131	$self->{data_table}->[0] = $self->{'column_names'};
1236
1237							# set the feature type
1238	27	50				85	unless (defined $self->{'feature'}) {
1239	27					54	$self->{'feature'} = 'region';
1240							}
1241
1242							# set headers flag to false
1243	27	50				104	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1244	27					83	return 1;
1245							}
1246
1247
1248							### Subroutine to generate metadata for broadpeak and narrowpeak files
1249							# three different types of peak files are available
1250							# see http://genome.ucsc.edu/FAQ/FAQformat.html
1251							sub add_peak_metadata {
1252	16			16	1	40	my ($self, $column_count) = @_;
1253	16		50			42	my $force = shift \|\| 0;
1254
1255							# check bed type and set metadata appropriately
1256							# most of these are bed6 plus extra columns
1257	16					23	my $column_names;
1258	16	100	66			41	if ($self->format =~ /narrow/i or $self->extension =~ /narrow/i) {
		50	33
		50	33
1259	8					30	$self->format('narrowPeak');
1260	8					33	$self->bed($column_count);
1261	8					29	$column_names = $self->standard_column_names('narrowpeak');
1262							}
1263							elsif ($self->format =~ /broad/i or $self->extension =~ /broad/i) {
1264	0					0	$self->format('broadPeak'); # possibly redundant
1265	0					0	$self->bed($column_count);
1266	0					0	$column_names = $self->standard_column_names('broadpeak');
1267							}
1268							elsif ($self->format =~ /gapped/i or $self->extension =~ /gapped/i) {
1269	8					45	$self->format('gappedPeak'); # possibly redundant
1270	8					29	$self->bed($column_count);
1271	8					25	$column_names = $self->standard_column_names('gappedpeak');
1272							}
1273							else {
1274							# how did we get here???? Hope for the best.....
1275	0					0	$self->bed($column_count);
1276	0					0	$column_names = $self->standard_column_names('bed12');
1277							}
1278	16					42	$self->{'number_columns'} = $column_count;
1279
1280							# add metadata
1281	16					48	for (my $i = 0; $i < $column_count; $i++) {
1282	200	50	33			385	if ($force or not exists $self->{$i}) {
1283	200		50			548	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1284	200					319	$self->{$i}{'index'} = $i;
1285	200					313	$self->{$i}{'AUTO'} = 3;
1286							}
1287							# assign the name to the column header
1288	200	50	33			356	if ($force or not defined $self->{'column_names'}->[$i]) {
1289	200					465	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1290							}
1291							}
1292	16					33	$self->{data_table}->[0] = $self->{'column_names'};
1293
1294							# set the feature type
1295	16	50				40	unless (defined $self->{'feature'}) {
1296	16					34	$self->{'feature'} = 'region';
1297							}
1298
1299							# set headers flag to false
1300	16	50				51	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1301	16					49	return 1;
1302							}
1303
1304
1305							### Subroutine to generate metadata for various UCSC gene files
1306							# these are tricky, as we will try to determine contents by counting
1307							# the columns, which may not be accurate
1308							# not only that, but this presumes the extension even makes it recognizable
1309							# see http://genome.ucsc.edu/FAQ/FAQformat.html#format9 for details
1310							# also biotoolbox script ucsc_table2gff3.pl
1311							sub add_ucsc_metadata {
1312	3			3	1	10	my ($self, $column_count) = @_;
1313	3		50			12	my $force = shift \|\| 0;
1314
1315							# set metadata
1316	3					5	$self->{'number_columns'} = $column_count;
1317	3					8	$self->{'ucsc'} = $column_count;
1318
1319							# set format and determine column names;
1320	3					5	my $column_names;
1321	3	50				8	if ($column_count == 16) {
		0
		0
		0
		0
1322	3					11	$self->format('genePredExt');
1323	3					9	$column_names = $self->standard_column_names('ucsc16');
1324							}
1325							elsif ($column_count == 15) {
1326	0					0	$self->format('genePredExt');
1327	0					0	$column_names = $self->standard_column_names('ucsc15');
1328							}
1329							elsif ($column_count == 12) {
1330	0					0	$self->format('knownGene');
1331	0					0	$column_names = $self->standard_column_names('ucsc12');
1332							}
1333							elsif ($column_count == 11) {
1334	0					0	$self->format('refFlat');
1335	0					0	$column_names = $self->standard_column_names('ucsc11');
1336							}
1337							elsif ($column_count == 10) {
1338	0					0	$self->format('genePred');
1339	0					0	$column_names = $self->standard_column_names('ucsc10');
1340							}
1341
1342							# assign the column names and metadata
1343	3					11	for (my $i = 0; $i < $column_count; $i++) {
1344							# loop for each column
1345							# set name unless it already has one from metadata
1346	48	50	33			86	if ($force or not exists $self->{$i}) {
1347	48		50			94	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1348	48					67	$self->{$i}{'index'} = $i;
1349	48					68	$self->{$i}{'AUTO'} = 3;
1350							}
1351							# assign the name to the column header
1352	48	50	33			88	if ($force or not defined $self->{'column_names'}->[$i]) {
1353	48					97	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1354							}
1355							}
1356	3					7	$self->{data_table}->[0] = $self->{'column_names'};
1357
1358							# set the feature type
1359	3	50				8	unless (defined $self->{'feature'}) {
1360	3					8	$self->{'feature'} = 'gene';
1361							}
1362
1363							# set headers flag to false
1364	3	50				13	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1365	3					8	return 1;
1366							}
1367
1368
1369							### Subroutine to generate metadata for SGR files
1370							# a sgr file contains three columns: chromo, position, score
1371							# this is a very simple file format, useful in exporting and
1372							# importing to binary BAR files used in T2, USeq, and IGB
1373							sub add_sgr_metadata {
1374	0			0	1	0	my $self = shift;
1375	0		0			0	my $force = shift \|\| 0;
1376
1377							# set column metadata
1378	0					0	my $column_names = $self->standard_column_names('sgr');
1379	0					0	for (my $i = 0; $i < 3; $i++) {
1380							# loop for each column
1381							# set name unless it already has one from metadata
1382	0	0	0			0	if ($force or not exists $self->{$i}) {
1383	0		0			0	$self->{$i}{'name'} = $column_names->[$i] \|\| 'extraColumn';
1384	0					0	$self->{$i}{'index'} = $i;
1385	0					0	$self->{$i}{'AUTO'} = 3;
1386							}
1387							# assign the name to the column header
1388	0	0	0			0	if ($force or not defined $self->{'column_names'}->[$i]) {
1389	0					0	$self->{'column_names'}->[$i] = $self->{$i}{'name'};
1390							}
1391							}
1392	0					0	$self->{data_table}->[0] = $self->{'column_names'};
1393	0					0	$self->{'number_columns'} = 3;
1394
1395
1396							# set headers flag to false
1397	0	0				0	$self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1398
1399							# set the feature type
1400	0	0				0	unless (defined $self->{'feature'}) {
1401	0					0	$self->{'feature'} = 'region';
1402							}
1403	0					0	return 1;
1404							}
1405
1406
1407							### Internal subroutine to generate metadata for standard files
1408							sub add_standard_metadata {
1409	13			13	1	26	my ($self, $line) = @_;
1410
1411	13					51	my @namelist = split '\t', $line;
1412	13					28	chomp $namelist[-1];
1413
1414							# we will define the columns based on
1415	13					38	for my $i (0..$#namelist) {
1416							# make up a name if one doesn't exist
1417	68		66			140	$namelist[$i] \|\|= "Column_$i";
1418
1419							# confirm that a file metadata exists for this column
1420	68	50				122	if (exists $self->{$i}) {
1421	0	0				0	unless ($namelist[$i] eq $self->{$i}->{'name'}) {
1422	0					0	warn "metadata and header names for column $i do not match!";
1423							# set the name to match the actual column name
1424	0					0	$self->{$i}->{'name'} = $namelist[$i];
1425							}
1426							}
1427
1428							# otherwise be nice and generate it here
1429							else {
1430	68					257	$self->{$i} = {
1431							'name' => $namelist[$i],
1432							'index' => $i,
1433							'AUTO' => 3,
1434							};
1435							}
1436							}
1437
1438							# check the number of columns
1439	13	50				47	if (scalar @namelist != $self->{'number_columns'} ) {
1440							# adjust to match actual content
1441	13					31	$self->{'number_columns'} = scalar @namelist;
1442							}
1443
1444							# put the column names in the metadata
1445	13					33	$self->{'column_names'} = \@namelist;
1446	13					77	$self->{data_table}->[0] = $self->{'column_names'};
1447
1448							# set headers flag to true
1449	13					29	$self->{'headers'} = 1;
1450	13					26	return 1;
1451							}
1452
1453
1454							### Internal subroutine to generate hash of standard file format column names
1455							sub standard_column_names {
1456	51			51	1	167	my ($self, $type) = @_;
1457
1458	51	100	0			241	if ($type eq 'gff') {
		100	0
		50	0
		50	0
		100
		50
		100
		50
		50
		0
		0
		0
		0
1459	5					29	return [qw(Chromosome Source Type Start Stop Score Strand Phase Group)];
1460							}
1461							elsif ($type eq 'bed12') {
1462	27					128	return [qw(Chromosome Start0 End Name Score Strand
1463							thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0)];
1464							}
1465							elsif ($type eq 'bed6') {
1466	0					0	return [qw(Chromosome Start0 End Name Score Strand)];
1467							}
1468							elsif ($type eq 'bdg') {
1469	0					0	return [qw(Chromosome Start0 End Score)];
1470							}
1471							elsif ($type eq 'narrowpeak') {
1472	8					44	return [qw(Chromosome Start0 End Name Score Strand signalValue
1473							pValue qValue peak)];
1474							}
1475							elsif ($type eq 'broadpeak') {
1476	0					0	return [qw(Chromosome Start0 End Name Score Strand signalValue
1477							pValue qValue)];
1478							}
1479							elsif ($type eq 'gappedpeak') {
1480	8					45	return [qw(Chromosome Start0 End Name Score Strand
1481							thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0
1482							signalValue pValue qValue)];
1483							}
1484							elsif ($type eq 'sgr') {
1485	0					0	return [qw(Chromo Start Score)];
1486							}
1487							elsif ($type eq 'ucsc16') {
1488	3					17	return [qw(bin name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1489							exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1490							}
1491							elsif ($type eq 'ucsc15' or $type eq 'genepredext') {
1492	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1493							exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1494							}
1495							elsif ($type eq 'ucsc12' or $type eq 'knowngene') {
1496	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1497							exonStarts0 exonEnds proteinID alignID)];
1498							}
1499							elsif ($type eq 'ucsc11' or $type eq 'refflat') {
1500	0						return [qw(geneName transcriptName chrom strand txStart0 txEnd cdsStart0
1501							cdsEnd exonCount exonStarts0 exonEnds)];
1502							}
1503							elsif ($type eq 'ucsc10' or $type eq 'genepred') {
1504	0						return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1505							exonStarts exonEnds)];
1506							}
1507							else {
1508	0						confess "unrecognized standard column name format '$type'!";
1509							}
1510							}
1511
1512
1513							__END__