File Coverage

blib/lib/Bio/ToolBox/parser/bed.pm

Criterion	Covered	Total	%
statement	221	306	72.2
branch	75	144	52.0
condition	21	60	35.0
subroutine	26	32	81.2
pod	18	22	81.8
total	361	564	64.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::parser::bed;
2							our $VERSION = '1.69';
3
4							=head1 NAME
5
6							Bio::ToolBox::parser::bed - Parser for BED-style formats
7
8							=head1 SYNOPSIS
9
10							use Bio::ToolBox::parser::bed;
11
12							### Quick-n-easy bed parser
13							my $bed = Bio::ToolBox::parser::bed->new('file.bed');
14
15							### Full-powered bed parser, mostly for bed12 functionality
16							my $bed = Bio::ToolBox::parser::bed->new(
17							file => 'regions.bed',
18							do_exon => 1,
19							do_cds => 1,
20							do_codon => 1,
21							);
22
23							# what type of bed file is being parsed, determined when opening file
24							my $type = $bed->version; # returns narrowPeak, bedGraph, bed12, bed6, etc
25
26							# Retrieve one feature or line at a time
27							my $feature = $bed->next_feature;
28
29							# Retrieve array of all features
30							my @genes = $bed->top_features;
31
32							# each returned feature is a SeqFeature object
33							foreach my $f ($bed->next_top_feature) {
34							printf "%s:%d-%d\n", $f->seq_id, $f->start, $f->end;
35							}
36
37
38							=head1 DESCRIPTION
39
40							This is a parser for converting BED-style and related formats into SeqFeature objects.
41							File formats include the following.
42
43							=over 4
44
45							=item Bed
46
47							L files may have 3-12 columns,
48							where the first 3-6 columns are basic information about the feature itself, and
49							columns 7-12 are usually for defining subfeatures of a transcript model, including
50							exons, UTRs (thin portions), and CDS (thick portions) subfeatures. This parser will
51							parse these extra fields as appropriate into subfeature SeqFeature objects. Bed files
52							are recognized with the file extension F<.bed>.
53
54							=item Bedgraph
55
56							L files are a type of
57							wiggle format in Bed format, where the 4th column is a score instead of a name. BedGraph
58							files are recognized by the file extension F<.bedgraph> or F<.bdg>.
59
60							=item narrowPeak
61
62							L files are a specialized
63							Encode variant of bed files with 10 columns (typically denoted as bed6+4), where the
64							extra 4 fields represent score attributes to a narrow ChIPSeq peak. These files are
65							parsed as a typical bed6 file, and the extra four fields are assigned to SeqFeature
66							attribute tags C, C, C, and C, respectively.
67							NarrowPeak files are recognized by the file extension F<.narrowPeak>.
68
69							=item broadPeak
70
71							L files, like narrowPeak,
72							are an Encode variant with 9 columns (bed6+3) representing a broad or extended interval
73							of ChIP enrichment without a single "peak". The extra three fields are assigned to
74							SeqFeature attribute tags C, C, and C, respectively.
75							BroadPeak files are recognized by the file extension F<.broadPeak>.
76
77							=back
78
79							C and C lines are generally ignored, although a C definition
80							line containing a C key will be interpreted if it matches one of the above file
81							types.
82
83							=head2 SeqFeature default values
84
85							The SeqFeature objects built from the bed file intervals will have some inferred defaults.
86
87							=over 4
88
89							=item Coordinate system
90
91							SeqFeature objects use the 1-based coordinate system, per the specification of
92							L, so the 0-based start coordinates of bed files will always be
93							parsed into 1-based coordinates.
94
95							=item C
96
97							SeqFeature objects will use the name field (4th column in bed files), if present, as the
98							C. The SeqFeature object should default to the C if a name was
99							not provided.
100
101							=item C
102
103							It will use a concatenation of the sequence ID, start (original 0-based), and
104							stop coordinates as the C, for example 'chr1:0-100'.
105
106							=item C
107
108							Bed files don't have a concept of feature type (they're all the same type), so a
109							default C of 'region' is set. For bed12 files with transcript models,
110							the transcripts will be set to either 'mRNA' or 'ncRNA', depending on the presence of
111							interpreted CDS start and stop (thick coordinates).
112
113							=item C
114
115							Bed files don't have a concept of a source. The basename of the provided file is
116							therefore used to set the C.
117
118							=item attribute tags
119
120							Extra columns in the narrowPeak and broadPeak formats are assigned to attribute tags
121							as described above. The C values set in bed12 files are also set to an attribute tag.
122
123							=back
124
125							=head1 METHODS
126
127							=head2 Initializing the parser object
128
129							=over 4
130
131							=item new
132
133							Initiate a new Bed file parser object. Pass a single value (the bed file name) to
134							open the file for parsing. Alternatively, pass an array of key
135							value pairs to control how the table is parsed. These options are primarily for
136							parsing bed12 files with subfeatures. Options include the following.
137
138							=over 4
139
140							=item file
141
142							Provide the path and file name for a Bed file. The file may be gzip compressed.
143
144							=item source
145
146							Pass a string to be added as the source tag value of the SeqFeature objects.
147							The default value is the basename of the file to be parsed.
148
149							=item do_exon
150
151							=item do_cds
152
153							=item do_utr
154
155							=item do_codon
156
157							Pass a boolean (1 or 0) value to parse certain subfeatures, including C,
158							C, C, C, C, and C
159							features. Default is false.
160
161							=item class
162
163							Pass the name of a L compliant class that will be used to
164							create the SeqFeature objects. The default is to use L.
165
166							=back
167
168							=back
169
170							=head2 Modify the parser object
171
172							These methods set or retrieve parameters that modify parser functionality.
173
174							=over 4
175
176							=item source
177
178							=item do_exon
179
180							=item do_cds
181
182							=item do_utr
183
184							=item do_codon
185
186							These methods retrieve or set parameters to the parsing engine, same as
187							the options to the new method.
188
189							=item open_file
190
191							Pass the name of a file to parse. This function is called automatically by the
192							L method if a filename was passed. This will open the file, check its format,
193							and set the parsers appropriately.
194
195							=back
196
197							=head2 Parser or file attributes
198
199							These retrieve attributes for the parser or file.
200
201							=over 4
202
203							=item version
204
205							This returns a string representation of the opened bed file format. For standard
206							bed files, it returns 'bed' followed by the number columns, e.g. C or C.
207							For recognized special bed variants, it will return C, C, or
208							C.
209
210							=item fh
211
212							Retrieves the file handle of the current file. This module uses
213							L objects. Be careful manipulating file handles of open files!
214
215							=item typelist
216
217							Returns a string representation of the type of SeqFeature types to be encountered in
218							the file. Currently this returns generic strings, 'mRNA,ncRNA,exon,CDS' for bed12
219							and 'region' for everything else.
220
221							=back
222
223							=head2 Feature retrieval
224
225							The following methods parse the table lines into SeqFeature objects.
226							It is best if methods are not mixed; unexpected results may occur.
227
228							For bed12 files, it will return a transcript model SeqFeature with appropriate subfeatures.
229
230							=over 4
231
232							=item next_feature
233
234							This will read the next line of the table, parse it into a feature object, and
235							immediately return it.
236
237							=item next_top_feature
238
239							This method will first parse the entire file into memory. It will then return each
240							feature one at a time. Call this method repeatedly using a while loop to get all features.
241
242							=item top_features
243
244							This method is similar to L, but instead returns an array
245							of all the top features.
246
247							=back
248
249							=head2 Other methods
250
251							Additional methods for working with the parser object and the parsed
252							SeqFeature objects.
253
254							=over 4
255
256							=item parse_file
257
258							Parses the entire file into memory without returning any objects.
259
260							=item find_gene
261
262							my $gene = $bed->find_gene(
263							display_name => 'ABC1',
264							primary_id => 'chr1:123-456',
265							);
266
267							Pass a feature name, or an array of key =E values (name, display_name,
268							ID, primary_ID, and/or coordinate information), that can be used
269							to find a feature already loaded into memory. Only really successful if the
270							entire table is loaded into memory. Features with a matching name are
271							confirmed by a matching ID or overlapping coordinates, if available.
272							Otherwise the first match is returned.
273
274							=item comments
275
276							This method will return an array of the comment, track, or browser lines that may have
277							been in the parsed file. These may or may not be useful.
278
279							=item seq_ids
280
281							Returns an array or array reference of the names of the chromosomes or
282							reference sequences present in the file. Must parse the entire file before using.
283
284							=item seq_id_lengths
285
286							Returns a hash reference to the chromosomes or reference sequences and
287							their corresponding lengths. In this case, the length is inferred by the
288							greatest feature end position. Must parse the entire file before using.
289
290							=back
291
292							=head1 SEE ALSO
293
294							L, L, L,
295
296
297							=cut
298
299	1			1		1126	use strict;
	1					1
	1					27
300	1			1		4	use Carp qw(carp cluck croak confess);
	1					1
	1					52
301	1			1		453	use Bio::ToolBox::Data::Stream;
	1					4
	1					2770
302
303							my $SFCLASS = 'Bio::ToolBox::SeqFeature';
304							eval "require $SFCLASS" or croak $@;
305
306							1;
307
308							sub new {
309	20			20	1	2792	my $class = shift;
310	20					178	my $self = {
311							'stream' => undef,
312							'fh' => undef,
313							'version' => undef,
314							'bed' => undef,
315							'convertor_sub' => undef,
316							'source' => '.',
317							'top_features' => [],
318							'do_gene' => 0,
319							'do_exon' => 0,
320							'do_cds' => 0,
321							'do_utr' => 0,
322							'do_codon' => 0,
323							'share' => 0,
324							'eof' => 0,
325							'line_count' => 0,
326							'seq_ids' => {},
327							};
328	20					36	bless $self, $class;
329
330							# check for options
331	20	100				91	if (@_) {
332	12	50				26	if (scalar(@_) == 1) {
333							# short and sweet, just a file, we assume
334	0					0	my $file = shift @_;
335	0					0	$self->open_file($file);
336							}
337							else {
338	12					33	my %options = @_;
339	12	50	33			36	if (exists $options{file} or $options{table}) {
340	12		33			27	$options{file} \|\|= $options{table};
341	12					28	$self->open_file( $options{file} );
342							}
343	12	100				26	if (exists $options{do_exon}) {
344	4					11	$self->do_exon($options{do_exon});
345							}
346	12	50				21	if (exists $options{do_cds}) {
347	0					0	$self->do_cds($options{do_cds});
348							}
349	12	50				24	if (exists $options{do_utr}) {
350	0					0	$self->do_utr($options{do_utr});
351							}
352	12	50				19	if (exists $options{do_codon}) {
353	0					0	$self->do_codon($options{do_codon});
354							}
355	12	50				17	if (exists $options{source}) {
356	0					0	$self->source($options{source});
357							}
358	12	100				21	if (exists $options{class}) {
359	8					10	my $class = $options{class};
360	8	50				335	if (eval "require $class; 1") {
361	8					19	$SFCLASS = $class;
362							}
363							else {
364	0					0	croak $@;
365							}
366							}
367							}
368							}
369
370							# done
371	20					52	return $self;
372							}
373
374							sub version {
375	28			28	1	1037	return shift->{version};
376							}
377
378							sub source {
379	106			106	1	132	my $self = shift;
380	106	100				144	if (@_) {
381	20					33	$self->{'source'} = shift;
382							}
383	106					182	return $self->{'source'};
384							}
385
386							sub simplify {
387							# this doesn't do anything, for now, but maintain compatibility with gff parser
388	8			8	0	9	return 0;
389							}
390
391							sub do_gene {
392							# this does nothing
393	12			12	0	1154	return 0;
394							}
395
396							sub do_exon {
397							# this only pertains to bed-12 files
398	98			98	1	110	my $self = shift;
399	98	100				151	if (@_) {
400	8					21	$self->{'do_exon'} = shift;
401							}
402	98					193	return $self->{'do_exon'};
403							}
404
405							sub do_cds {
406							# this only pertains to bed-12 files
407	62			62	1	70	my $self = shift;
408	62	50				87	if (@_) {
409	0					0	$self->{'do_cds'} = shift;
410							}
411	62					114	return $self->{'do_cds'};
412							}
413
414							sub do_utr {
415							# this only pertains to bed-12 files
416	58			58	1	61	my $self = shift;
417	58	50				81	if (@_) {
418	0					0	$self->{'do_utr'} = shift;
419							}
420	58					107	return $self->{'do_utr'};
421							}
422
423							sub do_codon {
424							# this only pertains to bed-12 files
425	62			62	1	70	my $self = shift;
426	62	50				96	if (@_) {
427	0					0	$self->{'do_codon'} = shift;
428							}
429	62					113	return $self->{'do_codon'};
430							}
431
432							sub do_name {
433							# this does nothing
434	229			229	0	374	return 0;
435							}
436
437							sub share {
438							# this does nothing as we do not parse genes, so no opportunity for sharing
439	229			229	0	398	return 0;
440							}
441
442							sub fh {
443	346			346	1	2135	my $self = shift;
444	346					2719	return $self->{'fh'};
445							}
446
447							sub open_file {
448	20			20	1	28	my $self = shift;
449
450	20	50				62	if ($self->{stream}) {
451	0					0	confess " Cannot open a new file with the same Bed parser object!";
452							}
453
454							# check file
455	20					26	my $filename = shift;
456	20	50				37	unless ($filename) {
457	0					0	cluck("no file name passed!");
458	0					0	return;
459							}
460
461							# Open file
462							# we are using a Stream object for simplicity to handle comment lines,
463							# identify columns and structures, etc
464	20	50				99	my $Stream = Bio::ToolBox::Data::Stream->new(in => $filename) or
465							croak " cannot open file '$filename'!\n";
466	20					50	my $bed = $Stream->bed; # this is the number of bed columns observed
467	20	50				34	unless ($bed) {
468	0					0	croak " file '$filename' doesn't appear to be proper Bed format!\n";
469							}
470	20					29	$self->{bed} = $bed;
471
472							# check for special formats
473	20	100				37	my $peak = $Stream->extension =~ /peak/i ? $bed : 0;
474	20	50	33			48	my $bdg = ($bed == 4 and $Stream->extension =~ /bdg\|bedgraph/i) ? 1 : 0;
475
476							# check for a track or browser definition line
477	20					69	foreach my $c ($Stream->comments) {
478							# these may or may not be present
479							# if so, we can check for a track type which may hint at what the file is
480	8	100				49	if ($c =~ /^track.+type=gappedpeak/i) {
		50
		50
		50
		50
		50
481							# looks like we have a gappedPeak file
482	4					6	$peak = 15;
483							}
484							elsif ($c =~ /^track.+type=narrowpeak/i) {
485							# looks like we have a narrowPeak file
486	0					0	$peak = 10;
487							}
488							elsif ($c =~ /^track.+type=broadpeak/i) {
489							# looks like we have a broadPeak file
490	0					0	$peak = 9;
491							}
492							elsif ($c =~ /^track.+type=bedgraph/i) {
493							# this is unlikely to occur, but you never know
494	0					0	$bdg = 1;
495							}
496							elsif ($c =~ /^track.+type=bed\s/i) {
497							# obviously, but just in case
498							}
499							elsif ($c =~ /^track.+type=(\w+)\s/i) {
500							# something weird
501	0					0	printf " file track definition type of '%s' is unrecognized, proceeding as bed file\n", $1;
502							}
503							}
504
505							# assign source
506	20					51	$self->source( $Stream->basename );
507
508							# assign the parsing subroutine
509	20	100				62	if ($peak == 15) {
		100
		50
		50
		100
510							# gappedPeak file
511	4					8	$self->{version} = 'gappedPeak';
512	4					8	$self->{convertor_sub} = \&_parse_gappedPeak;
513	4					12	$self->do_exon(1); # always parse sub peaks as exons
514
515							# gappedPeak is essentially bed12 with extra columns
516							# we will use existing code from the ucsc parser to convert bed12 to seqfeatures
517							# we need more object stuff that the ucsc parser expects
518	4					196	eval "require Bio::ToolBox::parser::ucsc;";
519	4					13	$self->{id2count} = {};
520	4					11	$self->{refseqsum} = {};
521	4					6	$self->{refseqstat} = {};
522	4					9	$self->{kgxref} = {};
523	4					6	$self->{ensembldata} = {};
524	4					5	$self->{gene2seqf} = {};
525	4					7	$self->{sfclass} = $SFCLASS;
526							}
527							elsif ($peak == 10) {
528							# narrowPeak file
529	4					7	$self->{version} = 'narrowPeak';
530	4					8	$self->{convertor_sub} = \&_parse_narrowPeak;
531							}
532							elsif ($peak == 9) {
533							# broadPeak file
534	0					0	$self->{version} = 'broadPeak';
535	0					0	$self->{convertor_sub} = \&_parse_broadPeak;
536							}
537							elsif ($bdg) {
538							# bedGraph file
539	0					0	$self->{version} = 'bedGraph';
540	0					0	$self->{convertor_sub} = \&_parse_bedGraph;
541							}
542							elsif ($bed > 6) {
543							# a gene table bed 12 format
544	6					24	$self->{version} = 'bed12';
545	6					14	$self->{convertor_sub} = \&_parse_bed12;
546
547							# we will use existing code from the ucsc parser to convert bed12 to seqfeatures
548							# we need more object stuff that the ucsc parser expects
549	6					285	eval "require Bio::ToolBox::parser::ucsc;";
550	6					21	$self->{id2count} = {};
551	6					10	$self->{refseqsum} = {};
552	6					11	$self->{refseqstat} = {};
553	6					10	$self->{kgxref} = {};
554	6					15	$self->{ensembldata} = {};
555	6					17	$self->{gene2seqf} = {};
556	6					12	$self->{sfclass} = $SFCLASS;
557							}
558							else {
559							# an ordinary bed file
560	6					21	$self->{version} = sprintf("bed%d", $bed);
561	6					14	$self->{convertor_sub} = \&_parse_bed;
562							}
563
564							# store stuff
565	20					28	$self->{stream} = $Stream; # keep this around, but probably won't use it....
566	20					26	$self->{fh} = $Stream->{fh};
567	20					37	return 1;
568							}
569
570							sub typelist {
571	8			8	1	11	my $self = shift;
572	8	50				17	return unless $self->{stream};
573
574	8	100				17	if ($self->version eq 'bed12') {
575							# return generic list based on what I could expect
576	2					5	return 'mRNA,ncRNA,exon,CDS';
577							}
578							else {
579							# return generic
580	6					14	return 'region';
581							}
582							}
583
584							sub next_feature {
585	138			138	1	335	my $self = shift;
586
587							# check that we have an open filehandle
588	138	50				196	unless ($self->fh) {
589	0					0	croak("no Bed file loaded to parse!");
590							}
591
592							# loop through the file
593	138					193	while (my $line = $self->fh->getline) {
594	124					2664	chomp $line;
595	124	50	33			660	if ($line =~ /^#/ or $line =~ /^(?:track\|browser)/ or $line !~ /\w+/) {
			33
596	0					0	$self->{line_count}++;
597	0					0	next;
598							}
599	124					155	my $feature = &{$self->{convertor_sub}}($self, $line);
	124					209
600	124					431	$self->{line_count}++;
601	124	50				198	unless ($feature) {
602	0					0	printf STDERR "unable to make feature for line %d!\n", $self->{line_count};
603	0					0	next;
604							}
605
606							# return the object, we do this while loop once per valid line
607	124					281	return $feature;
608							}
609
610							# presumed end of file
611	14					401	$self->{'eof'} = 1;
612	14					43	$self->fh->close;
613	14					203	return;
614							}
615
616							sub next_top_feature {
617	36			36	1	34	my $self = shift;
618							# check that we have an open filehandle
619	36	50				48	unless ($self->fh) {
620	0					0	croak("no Bed file loaded to parse!");
621							}
622	36	50				47	unless ($self->{'eof'}) {
623	0	0				0	$self->parse_file or croak "unable to parse file!";
624							}
625	36					33	return shift @{ $self->{top_features} };
	36					64
626							}
627
628							sub top_features {
629	6			6	1	26	my $self = shift;
630	6	50				14	unless ($self->{'eof'}) {
631	6					13	$self->parse_file;
632							}
633	6					8	my @features = @{ $self->{top_features} };
	6					18
634	6	50				24	return wantarray ? @features : \@features;
635							}
636
637							*parse_table = \&parse_file;
638
639							sub parse_file {
640	14			14	1	17	my $self = shift;
641							# check that we have an open filehandle
642	14	50				29	unless ($self->fh) {
643	0					0	confess("no file loaded to parse!");
644							}
645	14	50				30	return 1 if $self->{'eof'};
646
647	14					23	printf " Parsing %s format file....\n", $self->version;
648
649	14					70	while (my $feature = $self->next_feature) {
650							# there are possibly lots and lots of features here
651							# we are not going to bother checking IDs for uniqueness or sort them
652							# especially since we don't have to assign subfeatures to them
653	118					132	push @{ $self->{top_features} }, $feature;
	118					188
654
655							# check chromosome
656	118					219	my $s = $feature->seq_id;
657	118	100				341	unless (exists $self->{seq_ids}{$s}) {
658	18					33	$self->{seq_ids}{$s} = $feature->end;
659							}
660	118	100				212	$self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
661							}
662	14					31	return 1;
663							}
664
665
666							sub _parse_narrowPeak {
667	16			16		26	my ($self, $line) = @_;
668	16					47	my @data = split /\t/, $line;
669	16	50				29	unless (scalar(@data) == 10) {
670							croak sprintf("narrowPeak line %d '%s' doesn't have 10 elements!",
671	0					0	$self->{line_count}, $line);
672							}
673
674							# generate the basic SeqFeature
675							my $feature = $SFCLASS->new(
676							-seq_id => $data[0],
677							-start => $data[1] + 1,
678							-end => $data[2],
679							-name => $data[3],
680							-score => $data[4],
681							-strand => $data[5],
682							-primary_tag => 'region',
683							-source => $self->{source},
684	16					131	-primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
685							);
686
687							# add extra columns
688	16					44	$feature->add_tag_value('signalValue', $data[6]);
689	16					34	$feature->add_tag_value('pValue', $data[7]);
690	16					28	$feature->add_tag_value('qValue', $data[8]);
691	16					30	$feature->add_tag_value('peak', $data[9]);
692
693	16					29	return $feature;
694							}
695
696							sub _parse_broadPeak {
697	0			0		0	my ($self, $line) = @_;
698	0					0	my @data = split /\t/, $line;
699	0	0				0	unless (scalar(@data) == 9) {
700							croak sprintf("broadPeak line %d '%s' doesn't have 9 elements!",
701	0					0	$self->{line_count}, $line);
702							}
703
704							# generate the basic SeqFeature
705							my $feature = $SFCLASS->new(
706							-seq_id => $data[0],
707							-start => $data[1] + 1,
708							-end => $data[2],
709							-name => $data[3],
710							-score => $data[4],
711							-strand => $data[5],
712							-primary_tag => 'region',
713							-source => $self->{source},
714	0					0	-primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
715							);
716
717							# add extra columns
718	0					0	$feature->add_tag_value('signalValue', $data[6]);
719	0					0	$feature->add_tag_value('pValue', $data[7]);
720	0					0	$feature->add_tag_value('qValue', $data[8]);
721
722	0					0	return $feature;
723							}
724
725							sub _parse_bedGraph {
726	0			0		0	my ($self, $line) = @_;
727	0					0	my @data = split /\t/, $line;
728	0	0				0	unless (scalar(@data) == 9) {
729							croak sprintf("bedGraph line %d '%s' doesn't have 4 elements!",
730	0					0	$self->{line_count}, $line);
731							}
732
733							# generate the basic SeqFeature
734							return $SFCLASS->new(
735							-seq_id => $data[0],
736							-start => $data[1] + 1,
737							-end => $data[2],
738							-score => $data[3],
739							-primary_tag => 'region',
740							-source => $self->{source},
741	0					0	-primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
742							);
743							}
744
745							sub _parse_bed {
746	22			22		37	my ($self, $line) = @_;
747	22					55	my @data = split /\t/, $line;
748	22	50				58	unless (scalar(@data) == $self->{bed}) {
749							croak sprintf("Bed line %d '%s' doesn't have %d elements!",
750	0					0	$self->{line_count}, $line, $self->{bed});
751							}
752
753							# generate the basic SeqFeature
754							return $SFCLASS->new(
755							-seq_id => $data[0],
756							-start => $data[1] + 1,
757							-end => $data[2],
758							-name => $data[3] \|\| undef,
759							-score => $data[4] \|\| undef,
760							-strand => $data[5] \|\| undef,
761							-primary_tag => 'region',
762							-source => $self->{source},
763	22		50			231	-primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
			50
			50
764							);
765							}
766
767							sub _parse_bed12 {
768	70			70		115	my ($self, $line) = @_;
769	70					231	my @data = split /\t/, $line;
770	70	50				143	unless (scalar(@data) == $self->{bed}) {
771							croak sprintf("Bed line %d '%s' doesn't have %d elements!",
772	0					0	$self->{line_count}, $line, $self->{bed});
773							}
774
775							# we will take advantage of pre-existing code in the UCSC parser to convert
776							# a bed12 line to a fully-fledged processed transcript
777							# we just have to go through a genePred format first
778							# fortunately, the two are pretty similar in structure
779
780							# now convert from bed12
781							# Chromosome Start End Name Score Strand thickStart thickEnd itemRGB blockCount blockSizes blockStarts
782							# to genePred
783							# name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds
784
785							# add missing data
786	70		33			107	$data[6] \|\|= $data[2]; # cdsStart
787	70		33			88	$data[7] \|\|= $data[2]; # cdsEnd
788	70		50			180	$data[8] \|\|= 0; # rgb values
789	70		50			88	$data[9] \|\|= 1; # exonCount
790	70		33			79	$data[10] \|\|= $data[7] - $data[6]; # block size
791	70		100			112	$data[11] \|\|= 0; # block starts
792
793							# calculate exons
794	70					121	my @exonSizes = split(',', $data[10]);
795	70					142	my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
	366					484
796	70					106	my @exonEnds;
797	70					131	for (my $i = 0; $i < $data[9]; $i++) {
798	366					860	push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
799							}
800
801							# calculate new genePred elements
802	70					295	my @new = (
803							$data[3], # name
804							$data[0], # chrom
805							$data[5], # strand
806							$data[1], # txStart
807							$data[2], # txStop
808							$data[6], # cdsStart
809							$data[7], # cdsEnd
810							$data[9], # exonCount
811							join(',', @exonStarts), # exonStarts
812							join(',', @exonEnds), # exonEnds
813							);
814
815							# create builder
816	70					246	my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
817	70					157	my $feature = $builder->build_transcript;
818	70					165	$feature->add_tag_value('itemRGB', $data[8]);
819	70					248	$feature->score($data[4]);
820	70					412	$feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
821							# change the primary ID to match other bed file behavior, not UCSC files'
822	70					437	return $feature;
823							}
824
825							sub _parse_gappedPeak {
826	16			16		25	my ($self, $line) = @_;
827	16					56	my @data = split /\t/, $line;
828	16	50				31	unless (scalar(@data) == 15) {
829							croak sprintf("GappedPeak line %d '%s' doesn't have 15 elements!",
830	0					0	$self->{line_count}, $line);
831							}
832
833							# we will take advantage of pre-existing code in the UCSC parser to convert
834							# a gappedPeak line into main peak with subpeaks.
835							# we just have to go through a genePred format first
836							# fortunately, the two are pretty similar in structure
837
838							# calculate exons, er, sub peaks
839	16					27	my @exonSizes = split(',', $data[10]);
840	16					31	my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
	41					71
841	16					32	my @exonEnds;
842	16					31	for (my $i = 0; $i < $data[9]; $i++) {
843	41					68	push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
844							}
845
846							# calculate new genePred elements
847	16					66	my @new = (
848							$data[3], # name
849							$data[0], # chrom
850							$data[5], # strand
851							$data[1], # txStart
852							$data[2], # txStop
853							$data[6], # cdsStart
854							$data[7], # cdsEnd
855							$data[9], # exonCount
856							join(',', @exonStarts), # exonStarts
857							join(',', @exonEnds), # exonEnds
858							);
859
860							# create builder and process
861	16					64	my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
862	16					38	my $feature = $builder->build_transcript;
863
864							# clean up feature and add extra values
865	16					47	$feature->add_tag_value('itemRGB', $data[8]);
866	16					35	$feature->score($data[4]);
867	16					36	$feature->primary_tag('region'); # it is not a RNA
868	16					78	$feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
869							# change the primary ID to match other bed file behavior, not UCSC files'
870	16					30	$feature->add_tag_value('signalValue', $data[12]);
871	16					31	$feature->add_tag_value('pValue', $data[13]);
872	16					28	$feature->add_tag_value('qValue', $data[14]);
873	16					83	return $feature;
874							}
875
876							sub seq_ids {
877	0			0	1	0	my $self = shift;
878	0	0				0	unless (scalar keys %{$self->{seq_ids}}) {
	0					0
879	0					0	$self->_get_seq_ids;
880							}
881	0					0	my @s = keys %{$self->{seq_ids}};
	0					0
882	0	0				0	return wantarray ? @s : \@s;
883							}
884
885
886							sub seq_id_lengths {
887	0			0	1	0	my $self = shift;
888	0	0				0	unless (scalar keys %{$self->{seq_ids}}) {
	0					0
889	0					0	$self->_get_seq_ids;
890							}
891	0					0	return $self->{seq_ids};
892							}
893
894							sub _get_seq_ids {
895	0			0		0	my $self = shift;
896	0	0				0	return unless $self->{'eof'};
897	0					0	foreach (@{ $self->{top_features} }) {
	0					0
898	0					0	my $s = $_->seq_id;
899	0	0				0	unless (exists $self->{seq_ids}{$s}) {
900	0					0	$self->{seq_ids}{$s} = 1;
901							}
902	0	0				0	$self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
903							}
904							}
905
906							sub comments {
907	0			0	1	0	my $self = shift;
908	0	0				0	return unless $self->{stream};
909	0					0	return $self->{stream}->comments;
910							}
911
912
913							sub find_gene {
914	38			38	1	6325	my $self = shift;
915
916							# check that we have id2seqf table
917							# we lazy load this as it might not be needed every time
918	38	100				73	unless (exists $self->{id2seqf}) {
919	10	50				23	croak "must parse file first!" unless $self->{'eof'};
920	10					17	$self->{id2seqf} = {};
921	10					16	foreach (@{ $self->{top_features} }) {
	10					20
922	86					120	my $name = lc $_->display_name;
923	86	50				223	if (exists $self->{id2seqf}->{$name}) {
924	0					0	push @{ $self->{id2seqf}->{$name} }, $_;
	0					0
925							}
926							else {
927	86					147	$self->{id2seqf}->{$name} = [$_];
928							}
929							}
930							}
931
932							# get the name and coordinates from arguments
933	38					44	my ($name, $id, $chrom, $start, $end, $strand);
934	38	50				64	if (scalar @_ == 0) {
		100
935	0					0	carp "must provide information to find_gene method!";
936	0					0	return;
937							}
938							elsif (scalar @_ == 1) {
939	6					8	$name = $_[0];
940							}
941							else {
942	32					78	my %opt = @_;
943	32		0			46	$name = $opt{name} \|\| $opt{display_name} \|\| undef;
944	32		0			45	$id = $opt{id} \|\| $opt{primary_id} \|\| undef;
945	32		50			88	$chrom = $opt{chrom} \|\| $opt{seq_id} \|\| undef;
946	32		50			49	$start = $opt{start} \|\| undef;
947	32		50			76	$end = $opt{stop} \|\| $opt{end} \|\| undef;
948	32		50			63	$strand = $opt{strand} \|\| 0;
949							}
950	38	50				56	unless ($name) {
951	0					0	carp "name is required for find_gene!";
952	0					0	return;
953							}
954
955							# check if a gene with this name exists
956	38	50				79	if (exists $self->{id2seqf}->{lc $name} ) {
957							# we found a matching gene
958
959							# pull out the gene seqfeature(s) array reference
960							# there may be more than one gene
961	38					61	my $genes = $self->{id2seqf}->{ lc $name };
962
963							# go through a series of checks to find the appropriate
964	38	100				63	if ($id) {
965	32					40	foreach my $g (@$genes) {
966	32	50				56	if ($g->primary_id eq $id) {
967	32					58	return $g;
968							}
969							}
970	0					0	return; # none of these matched despite having an ID
971							}
972	6	0	33			14	if ($chrom and $start and $end) {
			33
973	0					0	foreach my $g (@$genes) {
974	0	0	0			0	if (
			0
975							# overlap method borrowed from Bio::RangeI
976							($g->strand == $strand) and not (
977							$g->start > $end or
978							$g->end < $start
979							)
980							) {
981							# gene and transcript overlap on the same strand
982							# we found the intersecting gene
983	0					0	return $g;
984							}
985							}
986	0					0	return; # none of these matched despite having coordinate info
987							}
988	6	50				9	if (scalar @$genes == 1) {
		0
989							# going on trust here that this is the one
990	6					15	return $genes->[0];
991							}
992							elsif (scalar @$genes > 1) {
993	0						carp "more than one gene named $name found!";
994	0						return $genes->[0];
995							}
996
997							# nothing suitable found
998	0						return;
999							}
1000							}
1001
1002
1003							__END__