File Coverage

blib/lib/Bio/ToolBox/parser/bed.pm
Criterion Covered Total %
statement 221 306 72.2
branch 75 144 52.0
condition 21 60 35.0
subroutine 26 32 81.2
pod 18 22 81.8
total 361 564 64.0


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::bed;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::parser::bed - Parser for BED-style formats
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::parser::bed;
11            
12             ### Quick-n-easy bed parser
13             my $bed = Bio::ToolBox::parser::bed->new('file.bed');
14            
15             ### Full-powered bed parser, mostly for bed12 functionality
16             my $bed = Bio::ToolBox::parser::bed->new(
17             file => 'regions.bed',
18             do_exon => 1,
19             do_cds => 1,
20             do_codon => 1,
21             );
22            
23             # what type of bed file is being parsed, determined when opening file
24             my $type = $bed->version; # returns narrowPeak, bedGraph, bed12, bed6, etc
25            
26             # Retrieve one feature or line at a time
27             my $feature = $bed->next_feature;
28              
29             # Retrieve array of all features
30             my @genes = $bed->top_features;
31            
32             # each returned feature is a SeqFeature object
33             foreach my $f ($bed->next_top_feature) {
34             printf "%s:%d-%d\n", $f->seq_id, $f->start, $f->end;
35             }
36            
37              
38             =head1 DESCRIPTION
39              
40             This is a parser for converting BED-style and related formats into SeqFeature objects.
41             File formats include the following.
42              
43             =over 4
44              
45             =item Bed
46              
47             L files may have 3-12 columns,
48             where the first 3-6 columns are basic information about the feature itself, and
49             columns 7-12 are usually for defining subfeatures of a transcript model, including
50             exons, UTRs (thin portions), and CDS (thick portions) subfeatures. This parser will
51             parse these extra fields as appropriate into subfeature SeqFeature objects. Bed files
52             are recognized with the file extension F<.bed>.
53              
54             =item Bedgraph
55              
56             L files are a type of
57             wiggle format in Bed format, where the 4th column is a score instead of a name. BedGraph
58             files are recognized by the file extension F<.bedgraph> or F<.bdg>.
59              
60             =item narrowPeak
61              
62             L files are a specialized
63             Encode variant of bed files with 10 columns (typically denoted as bed6+4), where the
64             extra 4 fields represent score attributes to a narrow ChIPSeq peak. These files are
65             parsed as a typical bed6 file, and the extra four fields are assigned to SeqFeature
66             attribute tags C, C, C, and C, respectively.
67             NarrowPeak files are recognized by the file extension F<.narrowPeak>.
68              
69             =item broadPeak
70              
71             L files, like narrowPeak,
72             are an Encode variant with 9 columns (bed6+3) representing a broad or extended interval
73             of ChIP enrichment without a single "peak". The extra three fields are assigned to
74             SeqFeature attribute tags C, C, and C, respectively.
75             BroadPeak files are recognized by the file extension F<.broadPeak>.
76              
77             =back
78              
79             C and C lines are generally ignored, although a C definition
80             line containing a C key will be interpreted if it matches one of the above file
81             types.
82              
83             =head2 SeqFeature default values
84              
85             The SeqFeature objects built from the bed file intervals will have some inferred defaults.
86              
87             =over 4
88              
89             =item Coordinate system
90              
91             SeqFeature objects use the 1-based coordinate system, per the specification of
92             L, so the 0-based start coordinates of bed files will always be
93             parsed into 1-based coordinates.
94              
95             =item C
96              
97             SeqFeature objects will use the name field (4th column in bed files), if present, as the
98             C. The SeqFeature object should default to the C if a name was
99             not provided.
100              
101             =item C
102              
103             It will use a concatenation of the sequence ID, start (original 0-based), and
104             stop coordinates as the C, for example 'chr1:0-100'.
105              
106             =item C
107              
108             Bed files don't have a concept of feature type (they're all the same type), so a
109             default C of 'region' is set. For bed12 files with transcript models,
110             the transcripts will be set to either 'mRNA' or 'ncRNA', depending on the presence of
111             interpreted CDS start and stop (thick coordinates).
112              
113             =item C
114              
115             Bed files don't have a concept of a source. The basename of the provided file is
116             therefore used to set the C.
117              
118             =item attribute tags
119              
120             Extra columns in the narrowPeak and broadPeak formats are assigned to attribute tags
121             as described above. The C values set in bed12 files are also set to an attribute tag.
122              
123             =back
124              
125             =head1 METHODS
126              
127             =head2 Initializing the parser object
128              
129             =over 4
130              
131             =item new
132              
133             Initiate a new Bed file parser object. Pass a single value (the bed file name) to
134             open the file for parsing. Alternatively, pass an array of key
135             value pairs to control how the table is parsed. These options are primarily for
136             parsing bed12 files with subfeatures. Options include the following.
137              
138             =over 4
139              
140             =item file
141              
142             Provide the path and file name for a Bed file. The file may be gzip compressed.
143              
144             =item source
145              
146             Pass a string to be added as the source tag value of the SeqFeature objects.
147             The default value is the basename of the file to be parsed.
148              
149             =item do_exon
150              
151             =item do_cds
152              
153             =item do_utr
154              
155             =item do_codon
156              
157             Pass a boolean (1 or 0) value to parse certain subfeatures, including C,
158             C, C, C, C, and C
159             features. Default is false.
160              
161             =item class
162              
163             Pass the name of a L compliant class that will be used to
164             create the SeqFeature objects. The default is to use L.
165              
166             =back
167              
168             =back
169              
170             =head2 Modify the parser object
171              
172             These methods set or retrieve parameters that modify parser functionality.
173              
174             =over 4
175              
176             =item source
177              
178             =item do_exon
179              
180             =item do_cds
181              
182             =item do_utr
183              
184             =item do_codon
185              
186             These methods retrieve or set parameters to the parsing engine, same as
187             the options to the new method.
188              
189             =item open_file
190              
191             Pass the name of a file to parse. This function is called automatically by the
192             L method if a filename was passed. This will open the file, check its format,
193             and set the parsers appropriately.
194              
195             =back
196              
197             =head2 Parser or file attributes
198              
199             These retrieve attributes for the parser or file.
200              
201             =over 4
202              
203             =item version
204              
205             This returns a string representation of the opened bed file format. For standard
206             bed files, it returns 'bed' followed by the number columns, e.g. C or C.
207             For recognized special bed variants, it will return C, C, or
208             C.
209              
210             =item fh
211              
212             Retrieves the file handle of the current file. This module uses
213             L objects. Be careful manipulating file handles of open files!
214              
215             =item typelist
216              
217             Returns a string representation of the type of SeqFeature types to be encountered in
218             the file. Currently this returns generic strings, 'mRNA,ncRNA,exon,CDS' for bed12
219             and 'region' for everything else.
220              
221             =back
222              
223             =head2 Feature retrieval
224              
225             The following methods parse the table lines into SeqFeature objects.
226             It is best if methods are not mixed; unexpected results may occur.
227              
228             For bed12 files, it will return a transcript model SeqFeature with appropriate subfeatures.
229              
230             =over 4
231              
232             =item next_feature
233              
234             This will read the next line of the table, parse it into a feature object, and
235             immediately return it.
236              
237             =item next_top_feature
238              
239             This method will first parse the entire file into memory. It will then return each
240             feature one at a time. Call this method repeatedly using a while loop to get all features.
241              
242             =item top_features
243              
244             This method is similar to L, but instead returns an array
245             of all the top features.
246              
247             =back
248              
249             =head2 Other methods
250              
251             Additional methods for working with the parser object and the parsed
252             SeqFeature objects.
253              
254             =over 4
255              
256             =item parse_file
257              
258             Parses the entire file into memory without returning any objects.
259              
260             =item find_gene
261              
262             my $gene = $bed->find_gene(
263             display_name => 'ABC1',
264             primary_id => 'chr1:123-456',
265             );
266              
267             Pass a feature name, or an array of key =E values (name, display_name,
268             ID, primary_ID, and/or coordinate information), that can be used
269             to find a feature already loaded into memory. Only really successful if the
270             entire table is loaded into memory. Features with a matching name are
271             confirmed by a matching ID or overlapping coordinates, if available.
272             Otherwise the first match is returned.
273              
274             =item comments
275              
276             This method will return an array of the comment, track, or browser lines that may have
277             been in the parsed file. These may or may not be useful.
278              
279             =item seq_ids
280              
281             Returns an array or array reference of the names of the chromosomes or
282             reference sequences present in the file. Must parse the entire file before using.
283              
284             =item seq_id_lengths
285              
286             Returns a hash reference to the chromosomes or reference sequences and
287             their corresponding lengths. In this case, the length is inferred by the
288             greatest feature end position. Must parse the entire file before using.
289              
290             =back
291              
292             =head1 SEE ALSO
293              
294             L, L, L,
295              
296              
297             =cut
298              
299 1     1   1126 use strict;
  1         1  
  1         27  
300 1     1   4 use Carp qw(carp cluck croak confess);
  1         1  
  1         52  
301 1     1   453 use Bio::ToolBox::Data::Stream;
  1         4  
  1         2770  
302              
303             my $SFCLASS = 'Bio::ToolBox::SeqFeature';
304             eval "require $SFCLASS" or croak $@;
305              
306             1;
307              
308             sub new {
309 20     20 1 2792 my $class = shift;
310 20         178 my $self = {
311             'stream' => undef,
312             'fh' => undef,
313             'version' => undef,
314             'bed' => undef,
315             'convertor_sub' => undef,
316             'source' => '.',
317             'top_features' => [],
318             'do_gene' => 0,
319             'do_exon' => 0,
320             'do_cds' => 0,
321             'do_utr' => 0,
322             'do_codon' => 0,
323             'share' => 0,
324             'eof' => 0,
325             'line_count' => 0,
326             'seq_ids' => {},
327             };
328 20         36 bless $self, $class;
329            
330             # check for options
331 20 100       91 if (@_) {
332 12 50       26 if (scalar(@_) == 1) {
333             # short and sweet, just a file, we assume
334 0         0 my $file = shift @_;
335 0         0 $self->open_file($file);
336             }
337             else {
338 12         33 my %options = @_;
339 12 50 33     36 if (exists $options{file} or $options{table}) {
340 12   33     27 $options{file} ||= $options{table};
341 12         28 $self->open_file( $options{file} );
342             }
343 12 100       26 if (exists $options{do_exon}) {
344 4         11 $self->do_exon($options{do_exon});
345             }
346 12 50       21 if (exists $options{do_cds}) {
347 0         0 $self->do_cds($options{do_cds});
348             }
349 12 50       24 if (exists $options{do_utr}) {
350 0         0 $self->do_utr($options{do_utr});
351             }
352 12 50       19 if (exists $options{do_codon}) {
353 0         0 $self->do_codon($options{do_codon});
354             }
355 12 50       17 if (exists $options{source}) {
356 0         0 $self->source($options{source});
357             }
358 12 100       21 if (exists $options{class}) {
359 8         10 my $class = $options{class};
360 8 50       335 if (eval "require $class; 1") {
361 8         19 $SFCLASS = $class;
362             }
363             else {
364 0         0 croak $@;
365             }
366             }
367             }
368             }
369            
370             # done
371 20         52 return $self;
372             }
373              
374             sub version {
375 28     28 1 1037 return shift->{version};
376             }
377              
378             sub source {
379 106     106 1 132 my $self = shift;
380 106 100       144 if (@_) {
381 20         33 $self->{'source'} = shift;
382             }
383 106         182 return $self->{'source'};
384             }
385              
386             sub simplify {
387             # this doesn't do anything, for now, but maintain compatibility with gff parser
388 8     8 0 9 return 0;
389             }
390              
391             sub do_gene {
392             # this does nothing
393 12     12 0 1154 return 0;
394             }
395              
396             sub do_exon {
397             # this only pertains to bed-12 files
398 98     98 1 110 my $self = shift;
399 98 100       151 if (@_) {
400 8         21 $self->{'do_exon'} = shift;
401             }
402 98         193 return $self->{'do_exon'};
403             }
404              
405             sub do_cds {
406             # this only pertains to bed-12 files
407 62     62 1 70 my $self = shift;
408 62 50       87 if (@_) {
409 0         0 $self->{'do_cds'} = shift;
410             }
411 62         114 return $self->{'do_cds'};
412             }
413              
414             sub do_utr {
415             # this only pertains to bed-12 files
416 58     58 1 61 my $self = shift;
417 58 50       81 if (@_) {
418 0         0 $self->{'do_utr'} = shift;
419             }
420 58         107 return $self->{'do_utr'};
421             }
422              
423             sub do_codon {
424             # this only pertains to bed-12 files
425 62     62 1 70 my $self = shift;
426 62 50       96 if (@_) {
427 0         0 $self->{'do_codon'} = shift;
428             }
429 62         113 return $self->{'do_codon'};
430             }
431              
432             sub do_name {
433             # this does nothing
434 229     229 0 374 return 0;
435             }
436              
437             sub share {
438             # this does nothing as we do not parse genes, so no opportunity for sharing
439 229     229 0 398 return 0;
440             }
441              
442             sub fh {
443 346     346 1 2135 my $self = shift;
444 346         2719 return $self->{'fh'};
445             }
446              
447             sub open_file {
448 20     20 1 28 my $self = shift;
449            
450 20 50       62 if ($self->{stream}) {
451 0         0 confess " Cannot open a new file with the same Bed parser object!";
452             }
453            
454             # check file
455 20         26 my $filename = shift;
456 20 50       37 unless ($filename) {
457 0         0 cluck("no file name passed!");
458 0         0 return;
459             }
460            
461             # Open file
462             # we are using a Stream object for simplicity to handle comment lines,
463             # identify columns and structures, etc
464 20 50       99 my $Stream = Bio::ToolBox::Data::Stream->new(in => $filename) or
465             croak " cannot open file '$filename'!\n";
466 20         50 my $bed = $Stream->bed; # this is the number of bed columns observed
467 20 50       34 unless ($bed) {
468 0         0 croak " file '$filename' doesn't appear to be proper Bed format!\n";
469             }
470 20         29 $self->{bed} = $bed;
471            
472             # check for special formats
473 20 100       37 my $peak = $Stream->extension =~ /peak/i ? $bed : 0;
474 20 50 33     48 my $bdg = ($bed == 4 and $Stream->extension =~ /bdg|bedgraph/i) ? 1 : 0;
475            
476             # check for a track or browser definition line
477 20         69 foreach my $c ($Stream->comments) {
478             # these may or may not be present
479             # if so, we can check for a track type which may hint at what the file is
480 8 100       49 if ($c =~ /^track.+type=gappedpeak/i) {
    50          
    50          
    50          
    50          
    50          
481             # looks like we have a gappedPeak file
482 4         6 $peak = 15;
483             }
484             elsif ($c =~ /^track.+type=narrowpeak/i) {
485             # looks like we have a narrowPeak file
486 0         0 $peak = 10;
487             }
488             elsif ($c =~ /^track.+type=broadpeak/i) {
489             # looks like we have a broadPeak file
490 0         0 $peak = 9;
491             }
492             elsif ($c =~ /^track.+type=bedgraph/i) {
493             # this is unlikely to occur, but you never know
494 0         0 $bdg = 1;
495             }
496             elsif ($c =~ /^track.+type=bed\s/i) {
497             # obviously, but just in case
498             }
499             elsif ($c =~ /^track.+type=(\w+)\s/i) {
500             # something weird
501 0         0 printf " file track definition type of '%s' is unrecognized, proceeding as bed file\n", $1;
502             }
503             }
504            
505             # assign source
506 20         51 $self->source( $Stream->basename );
507            
508             # assign the parsing subroutine
509 20 100       62 if ($peak == 15) {
    100          
    50          
    50          
    100          
510             # gappedPeak file
511 4         8 $self->{version} = 'gappedPeak';
512 4         8 $self->{convertor_sub} = \&_parse_gappedPeak;
513 4         12 $self->do_exon(1); # always parse sub peaks as exons
514            
515             # gappedPeak is essentially bed12 with extra columns
516             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
517             # we need more object stuff that the ucsc parser expects
518 4         196 eval "require Bio::ToolBox::parser::ucsc;";
519 4         13 $self->{id2count} = {};
520 4         11 $self->{refseqsum} = {};
521 4         6 $self->{refseqstat} = {};
522 4         9 $self->{kgxref} = {};
523 4         6 $self->{ensembldata} = {};
524 4         5 $self->{gene2seqf} = {};
525 4         7 $self->{sfclass} = $SFCLASS;
526             }
527             elsif ($peak == 10) {
528             # narrowPeak file
529 4         7 $self->{version} = 'narrowPeak';
530 4         8 $self->{convertor_sub} = \&_parse_narrowPeak;
531             }
532             elsif ($peak == 9) {
533             # broadPeak file
534 0         0 $self->{version} = 'broadPeak';
535 0         0 $self->{convertor_sub} = \&_parse_broadPeak;
536             }
537             elsif ($bdg) {
538             # bedGraph file
539 0         0 $self->{version} = 'bedGraph';
540 0         0 $self->{convertor_sub} = \&_parse_bedGraph;
541             }
542             elsif ($bed > 6) {
543             # a gene table bed 12 format
544 6         24 $self->{version} = 'bed12';
545 6         14 $self->{convertor_sub} = \&_parse_bed12;
546            
547             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
548             # we need more object stuff that the ucsc parser expects
549 6         285 eval "require Bio::ToolBox::parser::ucsc;";
550 6         21 $self->{id2count} = {};
551 6         10 $self->{refseqsum} = {};
552 6         11 $self->{refseqstat} = {};
553 6         10 $self->{kgxref} = {};
554 6         15 $self->{ensembldata} = {};
555 6         17 $self->{gene2seqf} = {};
556 6         12 $self->{sfclass} = $SFCLASS;
557             }
558             else {
559             # an ordinary bed file
560 6         21 $self->{version} = sprintf("bed%d", $bed);
561 6         14 $self->{convertor_sub} = \&_parse_bed;
562             }
563            
564             # store stuff
565 20         28 $self->{stream} = $Stream; # keep this around, but probably won't use it....
566 20         26 $self->{fh} = $Stream->{fh};
567 20         37 return 1;
568             }
569              
570             sub typelist {
571 8     8 1 11 my $self = shift;
572 8 50       17 return unless $self->{stream};
573            
574 8 100       17 if ($self->version eq 'bed12') {
575             # return generic list based on what I could expect
576 2         5 return 'mRNA,ncRNA,exon,CDS';
577             }
578             else {
579             # return generic
580 6         14 return 'region';
581             }
582             }
583              
584             sub next_feature {
585 138     138 1 335 my $self = shift;
586            
587             # check that we have an open filehandle
588 138 50       196 unless ($self->fh) {
589 0         0 croak("no Bed file loaded to parse!");
590             }
591            
592             # loop through the file
593 138         193 while (my $line = $self->fh->getline) {
594 124         2664 chomp $line;
595 124 50 33     660 if ($line =~ /^#/ or $line =~ /^(?:track|browser)/ or $line !~ /\w+/) {
      33        
596 0         0 $self->{line_count}++;
597 0         0 next;
598             }
599 124         155 my $feature = &{$self->{convertor_sub}}($self, $line);
  124         209  
600 124         431 $self->{line_count}++;
601 124 50       198 unless ($feature) {
602 0         0 printf STDERR "unable to make feature for line %d!\n", $self->{line_count};
603 0         0 next;
604             }
605            
606             # return the object, we do this while loop once per valid line
607 124         281 return $feature;
608             }
609            
610             # presumed end of file
611 14         401 $self->{'eof'} = 1;
612 14         43 $self->fh->close;
613 14         203 return;
614             }
615              
616             sub next_top_feature {
617 36     36 1 34 my $self = shift;
618             # check that we have an open filehandle
619 36 50       48 unless ($self->fh) {
620 0         0 croak("no Bed file loaded to parse!");
621             }
622 36 50       47 unless ($self->{'eof'}) {
623 0 0       0 $self->parse_file or croak "unable to parse file!";
624             }
625 36         33 return shift @{ $self->{top_features} };
  36         64  
626             }
627              
628             sub top_features {
629 6     6 1 26 my $self = shift;
630 6 50       14 unless ($self->{'eof'}) {
631 6         13 $self->parse_file;
632             }
633 6         8 my @features = @{ $self->{top_features} };
  6         18  
634 6 50       24 return wantarray ? @features : \@features;
635             }
636              
637             *parse_table = \&parse_file;
638              
639             sub parse_file {
640 14     14 1 17 my $self = shift;
641             # check that we have an open filehandle
642 14 50       29 unless ($self->fh) {
643 0         0 confess("no file loaded to parse!");
644             }
645 14 50       30 return 1 if $self->{'eof'};
646            
647 14         23 printf " Parsing %s format file....\n", $self->version;
648            
649 14         70 while (my $feature = $self->next_feature) {
650             # there are possibly lots and lots of features here
651             # we are not going to bother checking IDs for uniqueness or sort them
652             # especially since we don't have to assign subfeatures to them
653 118         132 push @{ $self->{top_features} }, $feature;
  118         188  
654            
655             # check chromosome
656 118         219 my $s = $feature->seq_id;
657 118 100       341 unless (exists $self->{seq_ids}{$s}) {
658 18         33 $self->{seq_ids}{$s} = $feature->end;
659             }
660 118 100       212 $self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
661             }
662 14         31 return 1;
663             }
664              
665              
666             sub _parse_narrowPeak {
667 16     16   26 my ($self, $line) = @_;
668 16         47 my @data = split /\t/, $line;
669 16 50       29 unless (scalar(@data) == 10) {
670             croak sprintf("narrowPeak line %d '%s' doesn't have 10 elements!",
671 0         0 $self->{line_count}, $line);
672             }
673            
674             # generate the basic SeqFeature
675             my $feature = $SFCLASS->new(
676             -seq_id => $data[0],
677             -start => $data[1] + 1,
678             -end => $data[2],
679             -name => $data[3],
680             -score => $data[4],
681             -strand => $data[5],
682             -primary_tag => 'region',
683             -source => $self->{source},
684 16         131 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
685             );
686            
687             # add extra columns
688 16         44 $feature->add_tag_value('signalValue', $data[6]);
689 16         34 $feature->add_tag_value('pValue', $data[7]);
690 16         28 $feature->add_tag_value('qValue', $data[8]);
691 16         30 $feature->add_tag_value('peak', $data[9]);
692            
693 16         29 return $feature;
694             }
695              
696             sub _parse_broadPeak {
697 0     0   0 my ($self, $line) = @_;
698 0         0 my @data = split /\t/, $line;
699 0 0       0 unless (scalar(@data) == 9) {
700             croak sprintf("broadPeak line %d '%s' doesn't have 9 elements!",
701 0         0 $self->{line_count}, $line);
702             }
703            
704             # generate the basic SeqFeature
705             my $feature = $SFCLASS->new(
706             -seq_id => $data[0],
707             -start => $data[1] + 1,
708             -end => $data[2],
709             -name => $data[3],
710             -score => $data[4],
711             -strand => $data[5],
712             -primary_tag => 'region',
713             -source => $self->{source},
714 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
715             );
716            
717             # add extra columns
718 0         0 $feature->add_tag_value('signalValue', $data[6]);
719 0         0 $feature->add_tag_value('pValue', $data[7]);
720 0         0 $feature->add_tag_value('qValue', $data[8]);
721            
722 0         0 return $feature;
723             }
724              
725             sub _parse_bedGraph {
726 0     0   0 my ($self, $line) = @_;
727 0         0 my @data = split /\t/, $line;
728 0 0       0 unless (scalar(@data) == 9) {
729             croak sprintf("bedGraph line %d '%s' doesn't have 4 elements!",
730 0         0 $self->{line_count}, $line);
731             }
732            
733             # generate the basic SeqFeature
734             return $SFCLASS->new(
735             -seq_id => $data[0],
736             -start => $data[1] + 1,
737             -end => $data[2],
738             -score => $data[3],
739             -primary_tag => 'region',
740             -source => $self->{source},
741 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
742             );
743             }
744              
745             sub _parse_bed {
746 22     22   37 my ($self, $line) = @_;
747 22         55 my @data = split /\t/, $line;
748 22 50       58 unless (scalar(@data) == $self->{bed}) {
749             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
750 0         0 $self->{line_count}, $line, $self->{bed});
751             }
752            
753             # generate the basic SeqFeature
754             return $SFCLASS->new(
755             -seq_id => $data[0],
756             -start => $data[1] + 1,
757             -end => $data[2],
758             -name => $data[3] || undef,
759             -score => $data[4] || undef,
760             -strand => $data[5] || undef,
761             -primary_tag => 'region',
762             -source => $self->{source},
763 22   50     231 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
      50        
      50        
764             );
765             }
766              
767             sub _parse_bed12 {
768 70     70   115 my ($self, $line) = @_;
769 70         231 my @data = split /\t/, $line;
770 70 50       143 unless (scalar(@data) == $self->{bed}) {
771             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
772 0         0 $self->{line_count}, $line, $self->{bed});
773             }
774            
775             # we will take advantage of pre-existing code in the UCSC parser to convert
776             # a bed12 line to a fully-fledged processed transcript
777             # we just have to go through a genePred format first
778             # fortunately, the two are pretty similar in structure
779            
780             # now convert from bed12
781             # Chromosome Start End Name Score Strand thickStart thickEnd itemRGB blockCount blockSizes blockStarts
782             # to genePred
783             # name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds
784            
785             # add missing data
786 70   33     107 $data[6] ||= $data[2]; # cdsStart
787 70   33     88 $data[7] ||= $data[2]; # cdsEnd
788 70   50     180 $data[8] ||= 0; # rgb values
789 70   50     88 $data[9] ||= 1; # exonCount
790 70   33     79 $data[10] ||= $data[7] - $data[6]; # block size
791 70   100     112 $data[11] ||= 0; # block starts
792            
793             # calculate exons
794 70         121 my @exonSizes = split(',', $data[10]);
795 70         142 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  366         484  
796 70         106 my @exonEnds;
797 70         131 for (my $i = 0; $i < $data[9]; $i++) {
798 366         860 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
799             }
800            
801             # calculate new genePred elements
802 70         295 my @new = (
803             $data[3], # name
804             $data[0], # chrom
805             $data[5], # strand
806             $data[1], # txStart
807             $data[2], # txStop
808             $data[6], # cdsStart
809             $data[7], # cdsEnd
810             $data[9], # exonCount
811             join(',', @exonStarts), # exonStarts
812             join(',', @exonEnds), # exonEnds
813             );
814            
815             # create builder
816 70         246 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
817 70         157 my $feature = $builder->build_transcript;
818 70         165 $feature->add_tag_value('itemRGB', $data[8]);
819 70         248 $feature->score($data[4]);
820 70         412 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
821             # change the primary ID to match other bed file behavior, not UCSC files'
822 70         437 return $feature;
823             }
824              
825             sub _parse_gappedPeak {
826 16     16   25 my ($self, $line) = @_;
827 16         56 my @data = split /\t/, $line;
828 16 50       31 unless (scalar(@data) == 15) {
829             croak sprintf("GappedPeak line %d '%s' doesn't have 15 elements!",
830 0         0 $self->{line_count}, $line);
831             }
832            
833             # we will take advantage of pre-existing code in the UCSC parser to convert
834             # a gappedPeak line into main peak with subpeaks.
835             # we just have to go through a genePred format first
836             # fortunately, the two are pretty similar in structure
837            
838             # calculate exons, er, sub peaks
839 16         27 my @exonSizes = split(',', $data[10]);
840 16         31 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  41         71  
841 16         32 my @exonEnds;
842 16         31 for (my $i = 0; $i < $data[9]; $i++) {
843 41         68 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
844             }
845            
846             # calculate new genePred elements
847 16         66 my @new = (
848             $data[3], # name
849             $data[0], # chrom
850             $data[5], # strand
851             $data[1], # txStart
852             $data[2], # txStop
853             $data[6], # cdsStart
854             $data[7], # cdsEnd
855             $data[9], # exonCount
856             join(',', @exonStarts), # exonStarts
857             join(',', @exonEnds), # exonEnds
858             );
859            
860             # create builder and process
861 16         64 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
862 16         38 my $feature = $builder->build_transcript;
863            
864             # clean up feature and add extra values
865 16         47 $feature->add_tag_value('itemRGB', $data[8]);
866 16         35 $feature->score($data[4]);
867 16         36 $feature->primary_tag('region'); # it is not a RNA
868 16         78 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
869             # change the primary ID to match other bed file behavior, not UCSC files'
870 16         30 $feature->add_tag_value('signalValue', $data[12]);
871 16         31 $feature->add_tag_value('pValue', $data[13]);
872 16         28 $feature->add_tag_value('qValue', $data[14]);
873 16         83 return $feature;
874             }
875              
876             sub seq_ids {
877 0     0 1 0 my $self = shift;
878 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
879 0         0 $self->_get_seq_ids;
880             }
881 0         0 my @s = keys %{$self->{seq_ids}};
  0         0  
882 0 0       0 return wantarray ? @s : \@s;
883             }
884              
885              
886             sub seq_id_lengths {
887 0     0 1 0 my $self = shift;
888 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
889 0         0 $self->_get_seq_ids;
890             }
891 0         0 return $self->{seq_ids};
892             }
893              
894             sub _get_seq_ids {
895 0     0   0 my $self = shift;
896 0 0       0 return unless $self->{'eof'};
897 0         0 foreach (@{ $self->{top_features} }) {
  0         0  
898 0         0 my $s = $_->seq_id;
899 0 0       0 unless (exists $self->{seq_ids}{$s}) {
900 0         0 $self->{seq_ids}{$s} = 1;
901             }
902 0 0       0 $self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
903             }
904             }
905              
906             sub comments {
907 0     0 1 0 my $self = shift;
908 0 0       0 return unless $self->{stream};
909 0         0 return $self->{stream}->comments;
910             }
911              
912              
913             sub find_gene {
914 38     38 1 6325 my $self = shift;
915            
916             # check that we have id2seqf table
917             # we lazy load this as it might not be needed every time
918 38 100       73 unless (exists $self->{id2seqf}) {
919 10 50       23 croak "must parse file first!" unless $self->{'eof'};
920 10         17 $self->{id2seqf} = {};
921 10         16 foreach (@{ $self->{top_features} }) {
  10         20  
922 86         120 my $name = lc $_->display_name;
923 86 50       223 if (exists $self->{id2seqf}->{$name}) {
924 0         0 push @{ $self->{id2seqf}->{$name} }, $_;
  0         0  
925             }
926             else {
927 86         147 $self->{id2seqf}->{$name} = [$_];
928             }
929             }
930             }
931            
932             # get the name and coordinates from arguments
933 38         44 my ($name, $id, $chrom, $start, $end, $strand);
934 38 50       64 if (scalar @_ == 0) {
    100          
935 0         0 carp "must provide information to find_gene method!";
936 0         0 return;
937             }
938             elsif (scalar @_ == 1) {
939 6         8 $name = $_[0];
940             }
941             else {
942 32         78 my %opt = @_;
943 32   0     46 $name = $opt{name} || $opt{display_name} || undef;
944 32   0     45 $id = $opt{id} || $opt{primary_id} || undef;
945 32   50     88 $chrom = $opt{chrom} || $opt{seq_id} || undef;
946 32   50     49 $start = $opt{start} || undef;
947 32   50     76 $end = $opt{stop} || $opt{end} || undef;
948 32   50     63 $strand = $opt{strand} || 0;
949             }
950 38 50       56 unless ($name) {
951 0         0 carp "name is required for find_gene!";
952 0         0 return;
953             }
954            
955             # check if a gene with this name exists
956 38 50       79 if (exists $self->{id2seqf}->{lc $name} ) {
957             # we found a matching gene
958            
959             # pull out the gene seqfeature(s) array reference
960             # there may be more than one gene
961 38         61 my $genes = $self->{id2seqf}->{ lc $name };
962            
963             # go through a series of checks to find the appropriate
964 38 100       63 if ($id) {
965 32         40 foreach my $g (@$genes) {
966 32 50       56 if ($g->primary_id eq $id) {
967 32         58 return $g;
968             }
969             }
970 0         0 return; # none of these matched despite having an ID
971             }
972 6 0 33     14 if ($chrom and $start and $end) {
      33        
973 0         0 foreach my $g (@$genes) {
974 0 0 0     0 if (
      0        
975             # overlap method borrowed from Bio::RangeI
976             ($g->strand == $strand) and not (
977             $g->start > $end or
978             $g->end < $start
979             )
980             ) {
981             # gene and transcript overlap on the same strand
982             # we found the intersecting gene
983 0         0 return $g;
984             }
985             }
986 0         0 return; # none of these matched despite having coordinate info
987             }
988 6 50       9 if (scalar @$genes == 1) {
    0          
989             # going on trust here that this is the one
990 6         15 return $genes->[0];
991             }
992             elsif (scalar @$genes > 1) {
993 0           carp "more than one gene named $name found!";
994 0           return $genes->[0];
995             }
996            
997             # nothing suitable found
998 0           return;
999             }
1000             }
1001              
1002              
1003             __END__