File Coverage

blib/lib/Bio/ToolBox/parser/bed.pm
Criterion Covered Total %
statement 222 307 72.3
branch 75 144 52.0
condition 21 60 35.0
subroutine 26 32 81.2
pod 18 22 81.8
total 362 565 64.0


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::bed;
2             our $VERSION = '1.64';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::parser::bed - Parser for BED-style formats
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::parser::bed;
11            
12             ### Quick-n-easy bed parser
13             my $bed = Bio::ToolBox::parser::bed->new('file.bed');
14            
15             ### Full-powered bed parser, mostly for bed12 functionality
16             my $bed = Bio::ToolBox::parser::bed->new(
17             file => 'regions.bed',
18             do_exon => 1,
19             do_cds => 1,
20             do_codon => 1,
21             );
22            
23             # what type of bed file is being parsed, determined when opening file
24             my $type = $bed->version; # returns narrowPeak, bedGraph, bed12, bed6, etc
25            
26             # Retrieve one feature or line at a time
27             my $feature = $bed->next_feature;
28              
29             # Retrieve array of all features
30             my @genes = $bed->top_features;
31            
32             # each returned feature is a SeqFeature object
33             foreach my $f ($bed->next_top_feature) {
34             printf "%s:%d-%d\n", $f->seq_id, $f->start, $f->end;
35             }
36            
37              
38             =head1 DESCRIPTION
39              
40             This is a parser for converting BED-style and related formats into SeqFeature objects.
41             File formats include the following.
42              
43             =over 4
44              
45             =item Bed
46              
47             L files may have 3-12 columns,
48             where the first 3-6 columns are basic information about the feature itself, and
49             columns 7-12 are usually for defining subfeatures of a transcript model, including
50             exons, UTRs (thin portions), and CDS (thick portions) subfeatures. This parser will
51             parse these extra fields as appropriate into subfeature SeqFeature objects. Bed files
52             are recognized with the file extension F<.bed>.
53              
54             =item Bedgraph
55              
56             L files are a type of
57             wiggle format in Bed format, where the 4th column is a score instead of a name. BedGraph
58             files are recognized by the file extension F<.bedgraph> or F<.bdg>.
59              
60             =item narrowPeak
61              
62             L files are a specialized
63             Encode variant of bed files with 10 columns (typically denoted as bed6+4), where the
64             extra 4 fields represent score attributes to a narrow ChIPSeq peak. These files are
65             parsed as a typical bed6 file, and the extra four fields are assigned to SeqFeature
66             attribute tags C, C, C, and C, respectively.
67             NarrowPeak files are recognized by the file extension F<.narrowPeak>.
68              
69             =item broadPeak
70              
71             L files, like narrowPeak,
72             are an Encode variant with 9 columns (bed6+3) representing a broad or extended interval
73             of ChIP enrichment without a single "peak". The extra three fields are assigned to
74             SeqFeature attribute tags C, C, and C, respectively.
75             BroadPeak files are recognized by the file extension F<.broadPeak>.
76              
77             =back
78              
79             C and C lines are generally ignored, although a C definition
80             line containing a C key will be interpreted if it matches one of the above file
81             types.
82              
83             =head2 SeqFeature default values
84              
85             The SeqFeature objects built from the bed file intervals will have some inferred defaults.
86              
87             =over 4
88              
89             =item Coordinate system
90              
91             SeqFeature objects use the 1-based coordinate system, per the specification of
92             L, so the 0-based start coordinates of bed files will always be
93             parsed into 1-based coordinates.
94              
95             =item C
96              
97             SeqFeature objects will use the name field (4th column in bed files), if present, as the
98             C. The SeqFeature object should default to the C if a name was
99             not provided.
100              
101             =item C
102              
103             It will use a concatenation of the sequence ID, start (original 0-based), and
104             stop coordinates as the C, for example 'chr1:0-100'.
105              
106             =item C
107              
108             Bed files don't have a concept of feature type (they're all the same type), so a
109             default C of 'region' is set. For bed12 files with transcript models,
110             the transcripts will be set to either 'mRNA' or 'ncRNA', depending on the presence of
111             interpreted CDS start and stop (thick coordinates).
112              
113             =item C
114              
115             Bed files don't have a concept of a source. The basename of the provided file is
116             therefore used to set the C.
117              
118             =item attribute tags
119              
120             Extra columns in the narrowPeak and broadPeak formats are assigned to attribute tags
121             as described above. The C values set in bed12 files are also set to an attribute tag.
122              
123             =back
124              
125             =head1 METHODS
126              
127             =head2 Initializing the parser object
128              
129             =over 4
130              
131             =item new
132              
133             Initiate a new Bed file parser object. Pass a single value (the bed file name) to
134             open the file for parsing. Alternatively, pass an array of key
135             value pairs to control how the table is parsed. These options are primarily for
136             parsing bed12 files with subfeatures. Options include the following.
137              
138             =over 4
139              
140             =item file
141              
142             Provide the path and file name for a Bed file. The file may be gzip compressed.
143              
144             =item source
145              
146             Pass a string to be added as the source tag value of the SeqFeature objects.
147             The default value is the basename of the file to be parsed.
148              
149             =item do_exon
150              
151             =item do_cds
152              
153             =item do_utr
154              
155             =item do_codon
156              
157             Pass a boolean (1 or 0) value to parse certain subfeatures, including C,
158             C, C, C, C, and C
159             features. Default is false.
160              
161             =item class
162              
163             Pass the name of a L compliant class that will be used to
164             create the SeqFeature objects. The default is to use L.
165              
166             =back
167              
168             =back
169              
170             =head2 Modify the parser object
171              
172             These methods set or retrieve parameters that modify parser functionality.
173              
174             =over 4
175              
176             =item source
177              
178             =item do_exon
179              
180             =item do_cds
181              
182             =item do_utr
183              
184             =item do_codon
185              
186             These methods retrieve or set parameters to the parsing engine, same as
187             the options to the new method.
188              
189             =item open_file
190              
191             Pass the name of a file to parse. This function is called automatically by the
192             L method if a filename was passed. This will open the file, check its format,
193             and set the parsers appropriately.
194              
195             =back
196              
197             =head2 Parser or file attributes
198              
199             These retrieve attributes for the parser or file.
200              
201             =over 4
202              
203             =item version
204              
205             This returns a string representation of the opened bed file format. For standard
206             bed files, it returns 'bed' followed by the number columns, e.g. C or C.
207             For recognized special bed variants, it will return C, C, or
208             C.
209              
210             =item fh
211              
212             Retrieves the file handle of the current file. This module uses
213             L objects. Be careful manipulating file handles of open files!
214              
215             =item typelist
216              
217             Returns a string representation of the type of SeqFeature types to be encountered in
218             the file. Currently this returns generic strings, 'mRNA,ncRNA,exon,CDS' for bed12
219             and 'region' for everything else.
220              
221             =back
222              
223             =head2 Feature retrieval
224              
225             The following methods parse the table lines into SeqFeature objects.
226             It is best if methods are not mixed; unexpected results may occur.
227              
228             For bed12 files, it will return a transcript model SeqFeature with appropriate subfeatures.
229              
230             =over 4
231              
232             =item next_feature
233              
234             This will read the next line of the table, parse it into a feature object, and
235             immediately return it.
236              
237             =item next_top_feature
238              
239             This method will first parse the entire file into memory. It will then return each
240             feature one at a time. Call this method repeatedly using a while loop to get all features.
241              
242             =item top_features
243              
244             This method is similar to L, but instead returns an array
245             of all the top features.
246              
247             =back
248              
249             =head2 Other methods
250              
251             Additional methods for working with the parser object and the parsed
252             SeqFeature objects.
253              
254             =over 4
255              
256             =item parse_file
257              
258             Parses the entire file into memory without returning any objects.
259              
260             =item find_gene
261              
262             my $gene = $bed->find_gene(
263             display_name => 'ABC1',
264             primary_id => 'chr1:123-456',
265             );
266              
267             Pass a feature name, or an array of key =E values (name, display_name,
268             ID, primary_ID, and/or coordinate information), that can be used
269             to find a feature already loaded into memory. Only really successful if the
270             entire table is loaded into memory. Features with a matching name are
271             confirmed by a matching ID or overlapping coordinates, if available.
272             Otherwise the first match is returned.
273              
274             =item comments
275              
276             This method will return an array of the comment, track, or browser lines that may have
277             been in the parsed file. These may or may not be useful.
278              
279             =item seq_ids
280              
281             Returns an array or array reference of the names of the chromosomes or
282             reference sequences present in the file. Must parse the entire file before using.
283              
284             =item seq_id_lengths
285              
286             Returns a hash reference to the chromosomes or reference sequences and
287             their corresponding lengths. In this case, the length is inferred by the
288             greatest feature end position. Must parse the entire file before using.
289              
290             =back
291              
292             =head1 SEE ALSO
293              
294             L, L, L,
295              
296              
297             =cut
298              
299 1     1   1158 use strict;
  1         2  
  1         36  
300 1     1   6 use Carp qw(carp cluck croak confess);
  1         2  
  1         56  
301 1     1   680 use Bio::ToolBox::Data::Stream;
  1         4  
  1         4297  
302              
303             my $SFCLASS = 'Bio::ToolBox::SeqFeature';
304             eval "require $SFCLASS" or croak $@;
305              
306             1;
307              
308             sub new {
309 20     20 1 4147 my $class = shift;
310 20         269 my $self = {
311             'stream' => undef,
312             'fh' => undef,
313             'version' => undef,
314             'bed' => undef,
315             'convertor_sub' => undef,
316             'source' => '.',
317             'top_features' => [],
318             'do_gene' => 0,
319             'do_exon' => 0,
320             'do_cds' => 0,
321             'do_utr' => 0,
322             'do_codon' => 0,
323             'share' => 0,
324             'eof' => 0,
325             'line_count' => 0,
326             'seq_ids' => {},
327             };
328 20         53 bless $self, $class;
329            
330             # check for options
331 20 100       57 if (@_) {
332 12 50       39 if (scalar(@_) == 1) {
333             # short and sweet, just a file, we assume
334 0         0 my $file = shift @_;
335 0         0 $self->open_file($file);
336             }
337             else {
338 12         49 my %options = @_;
339 12 50 33     44 if (exists $options{file} or $options{table}) {
340 12   33     31 $options{file} ||= $options{table};
341 12         43 $self->open_file( $options{file} );
342             }
343 12 100       39 if (exists $options{do_exon}) {
344 4         17 $self->do_exon($options{do_exon});
345             }
346 12 50       36 if (exists $options{do_cds}) {
347 0         0 $self->do_cds($options{do_cds});
348             }
349 12 50       28 if (exists $options{do_utr}) {
350 0         0 $self->do_utr($options{do_utr});
351             }
352 12 50       26 if (exists $options{do_codon}) {
353 0         0 $self->do_codon($options{do_codon});
354             }
355 12 50       25 if (exists $options{source}) {
356 0         0 $self->source($options{source});
357             }
358 12 100       30 if (exists $options{class}) {
359 8         18 my $class = $options{class};
360 8 50       458 if (eval "require $class; 1") {
361 8         25 $SFCLASS = $class;
362             }
363             else {
364 0         0 croak $@;
365             }
366             }
367             }
368             }
369            
370             # done
371 20         74 return $self;
372             }
373              
374             sub version {
375 28     28 1 1456 return shift->{version};
376             }
377              
378             sub source {
379 106     106 1 164 my $self = shift;
380 106 100       223 if (@_) {
381 20         35 $self->{'source'} = shift;
382             }
383 106         268 return $self->{'source'};
384             }
385              
386             sub simplify {
387             # this doesn't do anything, for now, but maintain compatibility with gff parser
388 8     8 0 12 return 0;
389             }
390              
391             sub do_gene {
392             # this does nothing
393 12     12 0 1613 return 0;
394             }
395              
396             sub do_exon {
397             # this only pertains to bed-12 files
398 98     98 1 164 my $self = shift;
399 98 100       207 if (@_) {
400 8         19 $self->{'do_exon'} = shift;
401             }
402 98         238 return $self->{'do_exon'};
403             }
404              
405             sub do_cds {
406             # this only pertains to bed-12 files
407 62     62 1 91 my $self = shift;
408 62 50       127 if (@_) {
409 0         0 $self->{'do_cds'} = shift;
410             }
411 62         167 return $self->{'do_cds'};
412             }
413              
414             sub do_utr {
415             # this only pertains to bed-12 files
416 58     58 1 84 my $self = shift;
417 58 50       109 if (@_) {
418 0         0 $self->{'do_utr'} = shift;
419             }
420 58         140 return $self->{'do_utr'};
421             }
422              
423             sub do_codon {
424             # this only pertains to bed-12 files
425 62     62 1 99 my $self = shift;
426 62 50       131 if (@_) {
427 0         0 $self->{'do_codon'} = shift;
428             }
429 62         145 return $self->{'do_codon'};
430             }
431              
432             sub do_name {
433             # this does nothing
434 229     229 0 560 return 0;
435             }
436              
437             sub share {
438             # this does nothing as we do not parse genes, so no opportunity for sharing
439 229     229 0 509 return 0;
440             }
441              
442             sub fh {
443 346     346 1 3191 my $self = shift;
444 346         3871 return $self->{'fh'};
445             }
446              
447             sub open_file {
448 20     20 1 62 my $self = shift;
449            
450 20 50       69 if ($self->{stream}) {
451 0         0 confess " Cannot open a new file with the same Bed parser object!";
452             }
453            
454             # check file
455 20         33 my $filename = shift;
456 20 50       53 unless ($filename) {
457 0         0 cluck("no file name passed!");
458 0         0 return;
459             }
460            
461             # Open file
462             # we are using a Stream object for simplicity to handle comment lines,
463             # identify columns and structures, etc
464 20 50       130 my $Stream = Bio::ToolBox::Data::Stream->new(in => $filename) or
465             croak " cannot open file '$filename'!\n";
466 20         79 my $bed = $Stream->bed; # this is the number of bed columns observed
467 20 50       50 unless ($bed) {
468 0         0 croak " file '$filename' doesn't appear to be proper Bed format!\n";
469             }
470 20         45 $self->{bed} = $bed;
471            
472             # check for special formats
473 20 100       63 my $peak = $Stream->extension =~ /peak/i ? $bed : 0;
474 20 50 33     69 my $bdg = ($bed == 4 and $Stream->extension =~ /bdg|bedgraph/i) ? 1 : 0;
475            
476             # check for a track or browser definition line
477 20         70 foreach my $c ($Stream->comments) {
478             # these may or may not be present
479             # if so, we can check for a track type which may hint at what the file is
480 8 100       64 if ($c =~ /^track.+type=gappedpeak/i) {
    50          
    50          
    50          
    50          
    50          
481             # looks like we have a gappedPeak file
482 4         14 $peak = 15;
483             }
484             elsif ($c =~ /^track.+type=narrowpeak/i) {
485             # looks like we have a narrowPeak file
486 0         0 $peak = 10;
487             }
488             elsif ($c =~ /^track.+type=broadpeak/i) {
489             # looks like we have a broadPeak file
490 0         0 $peak = 9;
491             }
492             elsif ($c =~ /^track.+type=bedgraph/i) {
493             # this is unlikely to occur, but you never know
494 0         0 $bdg = 1;
495             }
496             elsif ($c =~ /^track.+type=bed\s/i) {
497             # obviously, but just in case
498             }
499             elsif ($c =~ /^track.+type=(\w+)\s/i) {
500             # something weird
501 0         0 printf " file track definition type of '%s' is unrecognized, proceeding as bed file\n", $1;
502             }
503             }
504            
505             # assign source
506 20         71 $self->source( $Stream->basename );
507            
508             # assign the parsing subroutine
509 20 100       89 if ($peak == 15) {
    100          
    50          
    50          
    100          
510             # gappedPeak file
511 4         31 $self->{version} = 'gappedPeak';
512 4         13 $self->{convertor_sub} = \&_parse_gappedPeak;
513 4         14 $self->do_exon(1); # always parse sub peaks as exons
514            
515             # gappedPeak is essentially bed12 with extra columns
516             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
517             # we need more object stuff that the ucsc parser expects
518 4         279 eval "require Bio::ToolBox::parser::ucsc;";
519 4         23 $self->{id2count} = {};
520 4         14 $self->{refseqsum} = {};
521 4         10 $self->{refseqstat} = {};
522 4         12 $self->{kgxref} = {};
523 4         10 $self->{ensembldata} = {};
524 4         10 $self->{gene2seqf} = {};
525 4         9 $self->{sfclass} = $SFCLASS;
526             }
527             elsif ($peak == 10) {
528             # narrowPeak file
529 4         12 $self->{version} = 'narrowPeak';
530 4         14 $self->{convertor_sub} = \&_parse_narrowPeak;
531             }
532             elsif ($peak == 9) {
533             # broadPeak file
534 0         0 $self->{version} = 'broadPeak';
535 0         0 $self->{convertor_sub} = \&_parse_broadPeak;
536             }
537             elsif ($bdg) {
538             # bedGraph file
539 0         0 $self->{version} = 'bedGraph';
540 0         0 $self->{convertor_sub} = \&_parse_bedGraph;
541             }
542             elsif ($bed > 6) {
543             # a gene table bed 12 format
544 6         17 $self->{version} = 'bed12';
545 6         26 $self->{convertor_sub} = \&_parse_bed12;
546            
547             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
548             # we need more object stuff that the ucsc parser expects
549 6         384 eval "require Bio::ToolBox::parser::ucsc;";
550 6         27 $self->{id2count} = {};
551 6         19 $self->{refseqsum} = {};
552 6         14 $self->{refseqstat} = {};
553 6         14 $self->{kgxref} = {};
554 6         15 $self->{ensembldata} = {};
555 6         16 $self->{gene2seqf} = {};
556 6         14 $self->{sfclass} = $SFCLASS;
557             }
558             else {
559             # an ordinary bed file
560 6         31 $self->{version} = sprintf("bed%d", $bed);
561 6         17 $self->{convertor_sub} = \&_parse_bed;
562             }
563            
564             # store stuff
565 20         54 $self->{stream} = $Stream; # keep this around, but probably won't use it....
566 20         36 $self->{fh} = $Stream->{fh};
567 20         56 return 1;
568             }
569              
570             sub typelist {
571 8     8 1 15 my $self = shift;
572 8 50       23 return unless $self->{stream};
573            
574 8 100       20 if ($self->version eq 'bed12') {
575             # return generic list based on what I could expect
576 2         6 return 'mRNA,ncRNA,exon,CDS';
577             }
578             else {
579             # return generic
580 6         20 return 'region';
581             }
582             }
583              
584             sub next_feature {
585 138     138 1 471 my $self = shift;
586            
587             # check that we have an open filehandle
588 138 50       271 unless ($self->fh) {
589 0         0 croak("no Bed file loaded to parse!");
590             }
591            
592             # loop through the file
593 138         270 while (my $line = $self->fh->getline) {
594 124         3253 chomp $line;
595 124 50 33     1034 if ($line =~ /^#/ or $line =~ /^(?:track|browser)/ or $line !~ /\w+/) {
      33        
596 0         0 $self->{line_count}++;
597 0         0 next;
598             }
599 124         219 my $feature = &{$self->{convertor_sub}}($self, $line);
  124         304  
600 124         641 $self->{line_count}++;
601 124 50       280 unless ($feature) {
602 0         0 printf STDERR "unable to make feature for line %d!\n", $self->{line_count};
603 0         0 next;
604             }
605            
606             # return the object, we do this while loop once per valid line
607 124         389 return $feature;
608             }
609            
610             # presumed end of file
611 14         525 $self->{'eof'} = 1;
612 14         53 $self->fh->close;
613 14         273 return;
614             }
615              
616             sub next_top_feature {
617 36     36 1 54 my $self = shift;
618             # check that we have an open filehandle
619 36 50       56 unless ($self->fh) {
620 0         0 croak("no Bed file loaded to parse!");
621             }
622 36 50       68 unless ($self->{'eof'}) {
623 0 0       0 $self->parse_file or croak "unable to parse file!";
624             }
625 36         48 return shift @{ $self->{top_features} };
  36         158  
626             }
627              
628             sub top_features {
629 6     6 1 35 my $self = shift;
630 6 50       20 unless ($self->{'eof'}) {
631 6         17 $self->parse_file;
632             }
633 6         13 my @features = @{ $self->{top_features} };
  6         25  
634 6 50       34 return wantarray ? @features : \@features;
635             }
636              
637             *parse_table = \&parse_file;
638              
639             sub parse_file {
640 14     14 1 27 my $self = shift;
641             # check that we have an open filehandle
642 14 50       29 unless ($self->fh) {
643 0         0 confess("no file loaded to parse!");
644             }
645 14 50       38 return 1 if $self->{'eof'};
646            
647 14         71 printf " Parsing %s format file....\n", $self->version;
648            
649 14         98 while (my $feature = $self->next_feature) {
650             # there are possibly lots and lots of features here
651             # we are not going to bother checking IDs for uniqueness or sort them
652             # especially since we don't have to assign subfeatures to them
653 118         173 push @{ $self->{top_features} }, $feature;
  118         267  
654            
655             # check chromosome
656 118         317 my $s = $feature->seq_id;
657 118 100       482 unless (exists $self->{seq_ids}{$s}) {
658 18         54 $self->{seq_ids}{$s} = $feature->end;
659             }
660 118 100       281 $self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
661             }
662 14         119 return 1;
663             }
664              
665              
666             sub _parse_narrowPeak {
667 16     16   41 my ($self, $line) = @_;
668 16         70 my @data = split /\t/, $line;
669 16 50       47 unless (scalar(@data) == 10) {
670             croak sprintf("narrowPeak line %d '%s' doesn't have 10 elements!",
671 0         0 $self->{line_count}, $line);
672             }
673            
674             # generate the basic SeqFeature
675             my $feature = $SFCLASS->new(
676             -seq_id => $data[0],
677             -start => $data[1] + 1,
678             -end => $data[2],
679             -name => $data[3],
680             -score => $data[4],
681             -strand => $data[5],
682             -primary_tag => 'region',
683             -source => $self->{source},
684 16         190 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
685             );
686            
687             # add extra columns
688 16         65 $feature->add_tag_value('signalValue', $data[6]);
689 16         49 $feature->add_tag_value('pValue', $data[7]);
690 16         48 $feature->add_tag_value('qValue', $data[8]);
691 16         43 $feature->add_tag_value('peak', $data[9]);
692            
693 16         41 return $feature;
694             }
695              
696             sub _parse_broadPeak {
697 0     0   0 my ($self, $line) = @_;
698 0         0 my @data = split /\t/, $line;
699 0 0       0 unless (scalar(@data) == 9) {
700             croak sprintf("broadPeak line %d '%s' doesn't have 9 elements!",
701 0         0 $self->{line_count}, $line);
702             }
703            
704             # generate the basic SeqFeature
705             my $feature = $SFCLASS->new(
706             -seq_id => $data[0],
707             -start => $data[1] + 1,
708             -end => $data[2],
709             -name => $data[3],
710             -score => $data[4],
711             -strand => $data[5],
712             -primary_tag => 'region',
713             -source => $self->{source},
714 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
715             );
716            
717             # add extra columns
718 0         0 $feature->add_tag_value('signalValue', $data[6]);
719 0         0 $feature->add_tag_value('pValue', $data[7]);
720 0         0 $feature->add_tag_value('qValue', $data[8]);
721            
722 0         0 return $feature;
723             }
724              
725             sub _parse_bedGraph {
726 0     0   0 my ($self, $line) = @_;
727 0         0 my @data = split /\t/, $line;
728 0 0       0 unless (scalar(@data) == 9) {
729             croak sprintf("bedGraph line %d '%s' doesn't have 4 elements!",
730 0         0 $self->{line_count}, $line);
731             }
732            
733             # generate the basic SeqFeature
734             return $SFCLASS->new(
735             -seq_id => $data[0],
736             -start => $data[1] + 1,
737             -end => $data[2],
738             -score => $data[3],
739             -primary_tag => 'region',
740             -source => $self->{source},
741 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
742             );
743             }
744              
745             sub _parse_bed {
746 22     22   47 my ($self, $line) = @_;
747 22         83 my @data = split /\t/, $line;
748 22 50       61 unless (scalar(@data) == $self->{bed}) {
749             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
750 0         0 $self->{line_count}, $line, $self->{bed});
751             }
752            
753             # generate the basic SeqFeature
754             return $SFCLASS->new(
755             -seq_id => $data[0],
756             -start => $data[1] + 1,
757             -end => $data[2],
758             -name => $data[3] || undef,
759             -score => $data[4] || undef,
760             -strand => $data[5] || undef,
761             -primary_tag => 'region',
762             -source => $self->{source},
763 22   50     294 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
      50        
      50        
764             );
765             }
766              
767             sub _parse_bed12 {
768 70     70   154 my ($self, $line) = @_;
769 70         306 my @data = split /\t/, $line;
770 70 50       199 unless (scalar(@data) == $self->{bed}) {
771             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
772 0         0 $self->{line_count}, $line, $self->{bed});
773             }
774            
775             # we will take advantage of pre-existing code in the UCSC parser to convert
776             # a bed12 line to a fully-fledged processed transcript
777             # we just have to go through a genePred format first
778             # fortunately, the two are pretty similar in structure
779            
780             # now convert from bed12
781             # Chromosome Start End Name Score Strand thickStart thickEnd itemRGB blockCount blockSizes blockStarts
782             # to genePred
783             # name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds
784            
785             # add missing data
786 70   33     145 $data[6] ||= $data[2]; # cdsStart
787 70   33     144 $data[7] ||= $data[2]; # cdsEnd
788 70   50     315 $data[8] ||= 0; # rgb values
789 70   50     122 $data[9] ||= 1; # exonCount
790 70   33     118 $data[10] ||= $data[7] - $data[6]; # block size
791 70   100     180 $data[11] ||= 0; # block starts
792            
793             # calculate exons
794 70         173 my @exonSizes = split(',', $data[10]);
795 70         202 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  366         696  
796 70         148 my @exonEnds;
797 70         172 for (my $i = 0; $i < $data[9]; $i++) {
798 366         755 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
799             }
800            
801             # calculate new genePred elements
802 70         448 my @new = (
803             $data[3], # name
804             $data[0], # chrom
805             $data[5], # strand
806             $data[1], # txStart
807             $data[2], # txStop
808             $data[6], # cdsStart
809             $data[7], # cdsEnd
810             $data[9], # exonCount
811             join(',', @exonStarts), # exonStarts
812             join(',', @exonEnds), # exonEnds
813             );
814            
815             # create builder
816 70         363 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
817 70         236 my $feature = $builder->build_transcript;
818 70         255 $feature->add_tag_value('itemRGB', $data[8]);
819 70         378 $feature->score($data[4]);
820 70         593 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
821             # change the primary ID to match other bed file behavior, not UCSC files'
822 70         629 return $feature;
823             }
824              
825             sub _parse_gappedPeak {
826 16     16   38 my ($self, $line) = @_;
827 16         90 my @data = split /\t/, $line;
828 16 50       48 unless (scalar(@data) == 15) {
829             croak sprintf("GappedPeak line %d '%s' doesn't have 15 elements!",
830 0         0 $self->{line_count}, $line);
831             }
832            
833             # we will take advantage of pre-existing code in the UCSC parser to convert
834             # a gappedPeak line into main peak with subpeaks.
835             # we just have to go through a genePred format first
836             # fortunately, the two are pretty similar in structure
837            
838             # calculate exons, er, sub peaks
839 16         41 my @exonSizes = split(',', $data[10]);
840 16         50 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  41         111  
841 16         32 my @exonEnds;
842 16         49 for (my $i = 0; $i < $data[9]; $i++) {
843 41         102 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
844             }
845            
846             # calculate new genePred elements
847 16         95 my @new = (
848             $data[3], # name
849             $data[0], # chrom
850             '+', # strand is typically unstranded, so just pretend to be
851             $data[1], # txStart
852             $data[2], # txStop
853             $data[6], # cdsStart
854             $data[7], # cdsEnd
855             $data[9], # exonCount
856             join(',', @exonStarts), # exonStarts
857             join(',', @exonEnds), # exonEnds
858             );
859            
860             # create builder and process
861 16         96 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
862 16         55 my $feature = $builder->build_transcript;
863            
864             # clean up feature and add extra values
865 16         62 $feature->add_tag_value('itemRGB', $data[8]);
866 16         48 $feature->score($data[4]);
867 16         53 $feature->strand($data[5]);
868 16         46 $feature->primary_tag('region'); # it is not a RNA
869 16         137 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
870             # change the primary ID to match other bed file behavior, not UCSC files'
871 16         46 $feature->add_tag_value('signalValue', $data[12]);
872 16         45 $feature->add_tag_value('pValue', $data[13]);
873 16         51 $feature->add_tag_value('qValue', $data[14]);
874 16         122 return $feature;
875             }
876              
877             sub seq_ids {
878 0     0 1 0 my $self = shift;
879 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
880 0         0 $self->_get_seq_ids;
881             }
882 0         0 my @s = keys %{$self->{seq_ids}};
  0         0  
883 0 0       0 return wantarray ? @s : \@s;
884             }
885              
886              
887             sub seq_id_lengths {
888 0     0 1 0 my $self = shift;
889 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
890 0         0 $self->_get_seq_ids;
891             }
892 0         0 return $self->{seq_ids};
893             }
894              
895             sub _get_seq_ids {
896 0     0   0 my $self = shift;
897 0 0       0 return unless $self->{'eof'};
898 0         0 foreach (@{ $self->{top_features} }) {
  0         0  
899 0         0 my $s = $_->seq_id;
900 0 0       0 unless (exists $self->{seq_ids}{$s}) {
901 0         0 $self->{seq_ids}{$s} = 1;
902             }
903 0 0       0 $self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
904             }
905             }
906              
907             sub comments {
908 0     0 1 0 my $self = shift;
909 0 0       0 return unless $self->{stream};
910 0         0 return $self->{stream}->comments;
911             }
912              
913              
914             sub find_gene {
915 38     38 1 8957 my $self = shift;
916            
917             # check that we have id2seqf table
918             # we lazy load this as it might not be needed every time
919 38 100       92 unless (exists $self->{id2seqf}) {
920 10 50       36 croak "must parse file first!" unless $self->{'eof'};
921 10         28 $self->{id2seqf} = {};
922 10         19 foreach (@{ $self->{top_features} }) {
  10         34  
923 86         177 my $name = lc $_->display_name;
924 86 50       337 if (exists $self->{id2seqf}->{$name}) {
925 0         0 push @{ $self->{id2seqf}->{$name} }, $_;
  0         0  
926             }
927             else {
928 86         216 $self->{id2seqf}->{$name} = [$_];
929             }
930             }
931             }
932            
933             # get the name and coordinates from arguments
934 38         107 my ($name, $id, $chrom, $start, $end, $strand);
935 38 50       106 if (scalar @_ == 0) {
    100          
936 0         0 carp "must provide information to find_gene method!";
937 0         0 return;
938             }
939             elsif (scalar @_ == 1) {
940 6         23 $name = $_[0];
941             }
942             else {
943 32         84 my %opt = @_;
944 32   0     75 $name = $opt{name} || $opt{display_name} || undef;
945 32   0     57 $id = $opt{id} || $opt{primary_id} || undef;
946 32   50     130 $chrom = $opt{chrom} || $opt{seq_id} || undef;
947 32   50     81 $start = $opt{start} || undef;
948 32   50     108 $end = $opt{stop} || $opt{end} || undef;
949 32   50     99 $strand = $opt{strand} || 0;
950             }
951 38 50       82 unless ($name) {
952 0         0 carp "name is required for find_gene!";
953 0         0 return;
954             }
955            
956             # check if a gene with this name exists
957 38 50       119 if (exists $self->{id2seqf}->{lc $name} ) {
958             # we found a matching gene
959            
960             # pull out the gene seqfeature(s) array reference
961             # there may be more than one gene
962 38         81 my $genes = $self->{id2seqf}->{ lc $name };
963            
964             # go through a series of checks to find the appropriate
965 38 100       79 if ($id) {
966 32         54 foreach my $g (@$genes) {
967 32 50       78 if ($g->primary_id eq $id) {
968 32         138 return $g;
969             }
970             }
971 0         0 return; # none of these matched despite having an ID
972             }
973 6 0 33     17 if ($chrom and $start and $end) {
      33        
974 0         0 foreach my $g (@$genes) {
975 0 0 0     0 if (
      0        
976             # overlap method borrowed from Bio::RangeI
977             ($g->strand == $strand) and not (
978             $g->start > $end or
979             $g->end < $start
980             )
981             ) {
982             # gene and transcript overlap on the same strand
983             # we found the intersecting gene
984 0         0 return $g;
985             }
986             }
987 0         0 return; # none of these matched despite having coordinate info
988             }
989 6 50       17 if (scalar @$genes == 1) {
    0          
990             # going on trust here that this is the one
991 6         19 return $genes->[0];
992             }
993             elsif (scalar @$genes > 1) {
994 0           carp "more than one gene named $name found!";
995 0           return $genes->[0];
996             }
997            
998             # nothing suitable found
999 0           return;
1000             }
1001             }
1002              
1003              
1004             __END__