File Coverage

blib/lib/Bio/ToolBox/parser/ucsc.pm
Criterion Covered Total %
statement 430 664 64.7
branch 214 384 55.7
condition 71 193 36.7
subroutine 52 58 89.6
pod 22 24 91.6
total 789 1323 59.6


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::ucsc;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::parser::ucsc - Parser for UCSC genePred, refFlat, etc formats
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::parser::ucsc;
11            
12             ### A simple transcript parser
13             my $ucsc = Bio::ToolBox::parser::ucsc->new('file.genePred');
14            
15             ### A full fledged gene parser
16             my $ucsc = Bio::ToolBox::parser::ucsc->new(
17             file => 'ensGene.genePred',
18             do_gene => 1,
19             do_cds => 1,
20             do_utr => 1,
21             ensname => 'ensemblToGene.txt',
22             enssrc => 'ensemblSource.txt',
23             );
24            
25             ### Retrieve one transcript line at a time
26             my $transcript = $ucsc->next_feature;
27            
28             ### Retrieve one assembled gene at a time
29             my $gene = $ucsc->next_top_feature;
30            
31             ### Retrieve array of all assembled genes
32             my @genes = $ucsc->top_features;
33            
34             # Each gene or transcript is a SeqFeatureI compatible object
35             printf "gene %s is located at %s:%s-%s\n",
36             $gene->display_name, $gene->seq_id,
37             $gene->start, $gene->end;
38            
39             # Multiple transcripts can be assembled into a gene
40             foreach my $transcript ($gene->get_SeqFeatures) {
41             # each transcript has exons
42             foreach my $exon ($transcript->get_SeqFeatures) {
43             printf "exon is %sbp long\n", $exon->length;
44             }
45             }
46            
47             # Features can be printed in GFF3 format
48             $gene->version(3);
49             print STDOUT $gene->gff_string(1);
50             # the 1 indicates to recurse through all subfeatures
51            
52              
53             =head1 DESCRIPTION
54              
55             This is a parser for converting UCSC-style gene prediction flat file formats into
56             BioPerl-style L compliant objects, complete with nested objects
57             representing transcripts, exons, CDS, UTRs, start- and stop-codons. Full control
58             is available on what to parse, e.g. exons on, CDS and codons off. Additional gene
59             information can be added by supplying additional tables of information, such as
60             common gene names and descriptions, available from the UCSC repository.
61              
62             =head2 Table formats supported
63              
64             Supported files are tab-delimited text files obtained from UCSC and described
65             at L. Formats are identified
66             by the number of columns, rather than specific file extensions, column name
67             headers, or other metadata. Therefore, unmodified tables should only be used
68             for correct parsing. Some errors are reported for incorrect lines. Unadulterated
69             files can safely be downloaded from L.
70             Files obtained from the UCSC Table Browser can also be used with caution. Files
71             may be gzip compressed.
72              
73             File formats supported include the following.
74              
75             =over 4
76              
77             =item * Gene Prediction (genePred), 10 columns
78              
79             =item * Gene Prediction with RefSeq gene Name (refFlat), 11 columns
80              
81             =item * Extended Gene Prediction (genePredExt), 15 columns
82              
83             =item * Extended Gene Prediction with bin (genePredExt), 16 columns
84              
85             =item * knownGene table, 12 columns
86              
87             =back
88              
89             =head2 Supplemental information
90              
91             The UCSC gene prediction tables include essential information, but not detailed
92             information, such as common gene names, description, protein accession IDs, etc.
93             This additional information can be associated with the genes or transcripts during
94             parsing if the appropriate tables are supplied. These tables can be obtained from
95             the UCSC download site L.
96              
97             Supported tables include the following.
98              
99             =over 4
100              
101             =item * refSeqStatus, for refGene, knownGene, and xenoRefGene tables
102              
103             =item * refSeqSummary, for refGene, knownGene, and xenoRefGene tables
104              
105             =item * ensemblToGeneName, for ensGene tables
106              
107             =item * ensemblSource, for ensGene tables
108              
109             =item * kgXref, for knownGene tables
110              
111             =back
112              
113             =head2 Implementation
114              
115             For an implementation of this module to generate GFF3 formatted files from UCSC
116             data sources, see the L script L.
117              
118             =head1 METHODS
119              
120             =head2 Initalize the parser object
121              
122             =over 4
123              
124             =item new
125              
126             Initiate a UCSC table parser object. Pass a single value (a table file name)
127             to open a table and parse its objects. Alternatively, pass an array of key
128             value pairs to control how the table is parsed. Options include the following.
129              
130             =over 4
131              
132             =item file
133              
134             =item table
135              
136             Provide a file name for a UCSC gene prediction table. The file may be gzip
137             compressed.
138              
139             =item source
140              
141             Pass a string to be added as the source tag value of the SeqFeature objects.
142             The default value is 'UCSC'. If the file name has a recognizable name,
143             such as 'refGene' or 'ensGene', it will be used instead.
144              
145             =item do_gene
146              
147             Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
148             name under a single gene object. Default is true.
149              
150             -item do_exon
151              
152             =item do_cds
153              
154             =item do_utr
155              
156             =item do_codon
157              
158             Pass a boolean (1 or 0) value to parse certain subfeatures, including exon,
159             CDS, five_prime_UTR, three_prime_UTR, stop_codon, and start_codon features.
160             Default is false.
161              
162             =item do_name
163              
164             Pass a boolean (1 or 0) value to assign names to subfeatures, including exons,
165             CDSs, UTRs, and start and stop codons. Default is false.
166              
167             =item share
168              
169             Pass a boolean (1 or 0) value to recycle shared subfeatures (exons and UTRs)
170             between multiple transcripts of the same gene. This results in reduced
171             memory usage, and smaller exported GFF3 files. Default is true.
172              
173             =item refseqsum
174              
175             =item refseqstat
176              
177             =item kgxref
178              
179             =item ensembltogene
180              
181             =item ensemblsource
182              
183             Pass the appropriate file name for additional information.
184              
185             =item class
186              
187             Pass the name of a L compliant class that will be used to
188             create the SeqFeature objects. The default is to use L.
189              
190             =back
191              
192             =back
193              
194             =head2 Modify the parser object
195              
196             These methods set or retrieve parameters, and load supplemental files and
197             new tables.
198              
199             =over 4
200              
201             =item source
202              
203             =item do_gene
204              
205             =item do_exon
206              
207             =item do_cds
208              
209             =item do_utr
210              
211             =item do_codon
212              
213             =item do_name
214              
215             =item share
216              
217             These methods retrieve or set parameters to the parsing engine, same as
218             the options to the new method.
219              
220             =item fh
221              
222             Set or retrieve the file handle of the current table. This module uses
223             L objects. Be careful manipulating file handles of open tables!
224              
225             =item open_file
226              
227             Pass the name of a new table to parse. Existing gene models loaded in
228             memory, if any, are discarded. Counts are reset to 0. Supplemental
229             tables are not discarded.
230              
231             =item load_extra_data($file, $type)
232              
233             my $file = 'hg19_refSeqSummary.txt.gz';
234             my success = $ucsc->load_extra_data($file, 'summary');
235              
236             Pass two values, the file name of the supplemental file and the type
237             of supplemental data. Values can include the following
238              
239             =over 4
240              
241             =item * refseqstatus or status
242              
243             =item * refseqsummary or summary
244              
245             =item * kgxref
246              
247             =item * ensembltogene or ensname
248              
249             =item * ensemblsource or enssrc
250              
251             =back
252              
253             The number of transcripts with information loaded from the supplemental
254             data file is returned.
255              
256             =back
257              
258             =head2 Feature retrieval
259              
260             The following methods parse the table lines into SeqFeature objects.
261             It is best if methods are not mixed; unexpected results may occur.
262              
263             =over 4
264              
265             =item next_feature
266              
267             This will read the next line of the table and parse it into a gene or
268             transcript object. However, multiple transcripts from the same gene are
269             not assembled together under the same gene object.
270              
271             =item next_top_feature
272              
273             This method will return all top features (typically genes), with multiple
274             transcripts of the same gene assembled under the same gene object. Transcripts
275             are assembled together if they share the same gene name and the transcripts
276             overlap. If transcripts share the same gene name but do not overlap, they
277             are placed into separate gene objects with the same name but different
278             C tags. Calling this method will parse the entire table into
279             memory (so that multiple transcripts may be assembled), but only one object
280             is returned at a time. Call this method repeatedly using a while loop to
281             get all features.
282              
283             =item top_features
284              
285             This method is similar to L, but instead returns an array
286             of all the top features.
287              
288             =back
289              
290             =head2 Other methods
291              
292             Additional methods for working with the parser object and the parsed
293             SeqFeature objects.
294              
295             =over 4
296              
297             =item parse_table
298              
299             Parses the table into memory. If a table wasn't provided using the
300             L or L methods, then a filename can be passed to this
301             method and it will automatically be opened for you.
302              
303             =item find_gene
304              
305             my $gene = $ucsc->find_gene(
306             display_name => 'ABC1',
307             primary_id => 'gene000001',
308             );
309              
310             Pass a gene name, or an array of key =E values (name, display_name,
311             ID, primary_ID, and/or coordinate information), that can be used
312             to find a gene already loaded into memory. Only really successful if the
313             entire table is loaded into memory. Genes with a matching name are
314             confirmed by a matching ID or overlapping coordinates, if available.
315             Otherwise the first match is returned.
316              
317             =item counts
318              
319             This method will return a hash of the number of genes and RNA types that
320             have been parsed.
321              
322             =item typelist
323              
324             This method will return a comma-delimited list of the feature types or
325             Cs found in the parsed file. Returns a generic list if a
326             file has not been parsed.
327              
328             =item from_ucsc_string
329              
330             A bare bones method that will convert a tab-delimited text line from a UCSC
331             formatted gene table into a SeqFeature object for you. Don't expect alternate
332             transcripts to be assembled into genes.
333              
334             =item seq_ids
335              
336             Returns an array or array reference of the names of the chromosomes or
337             reference sequences present in the table.
338              
339             =item seq_id_lengths
340              
341             Returns a hash reference to the chromosomes or reference sequences and
342             their corresponding lengths. In this case, the length is inferred by the
343             greatest gene end position.
344              
345             =back
346              
347             =head2 Bio::ToolBox::parser::ucsc::builder
348              
349             This is a private module that is responsible for building SeqFeature
350             objects from UCSC table lines. It is not intended for general public use.
351              
352             =head1 SEE ALSO
353              
354             L, L
355              
356             =cut
357              
358 2     2   83270 use strict;
  2         10  
  2         53  
359 2     2   9 use Carp qw(carp cluck croak);
  2         2  
  2         80  
360 2     2   457 use Bio::ToolBox::Data::file; # only used to get an open filehandle
  2         5  
  2         4748  
361              
362             1;
363              
364             sub new {
365 5     5 1 3355 my $class = shift;
366 5         84 my $self = {
367             'fh' => undef,
368             'version' => undef,
369             'source' => 'UCSC',
370             'top_features' => [],
371             'gene2seqf' => {},
372             'id2count' => {},
373             'counts' => {},
374             'do_gene' => 1,
375             'do_exon' => 0,
376             'do_cds' => 0,
377             'do_utr' => 0,
378             'do_codon' => 0,
379             'do_name' => 0,
380             'share' => 1,
381             'refseqsum' => {},
382             'refseqstat' => {},
383             'kgxref' => {},
384             'ensembldata' => {},
385             'eof' => 0,
386             'line_count' => 0,
387             'sfclass' => 'Bio::ToolBox::SeqFeature', # default class
388             'seq_ids' => {},
389             };
390 5         14 bless $self, $class;
391            
392             # check for options
393 5 100       16 if (@_) {
394 3 50       9 if (scalar(@_) == 1) {
395             # short and sweet, just a file, we assume
396 0         0 my $file = shift @_;
397 0         0 $self->open_file($file);
398             }
399             else {
400 3         13 my %options = @_;
401 3 50 33     14 if (exists $options{file} or $options{table}) {
402 3   33     6 $options{file} ||= $options{table};
403 3         11 $self->open_file( $options{file} );
404             }
405 3 100       8 if (exists $options{do_gene}) {
406 1         3 $self->do_gene($options{do_gene});
407             }
408 3 50       8 if (exists $options{do_exon}) {
409 3         10 $self->do_exon($options{do_exon});
410             }
411 3 100       8 if (exists $options{do_cds}) {
412 1         4 $self->do_cds($options{do_cds});
413             }
414 3 100       9 if (exists $options{do_utr}) {
415 1         4 $self->do_utr($options{do_utr});
416             }
417 3 50       6 if (exists $options{do_codon}) {
418 0         0 $self->do_codon($options{do_codon});
419             }
420 3 50       10 if (exists $options{do_name}) {
421 0         0 $self->do_name($options{do_name});
422             }
423 3 50       47 if (exists $options{share}) {
424 0         0 $self->share($options{share});
425             }
426 3 50       10 if (exists $options{source}) {
427 0         0 $self->source($options{source});
428             }
429 3 50       11 if (exists $options{refseqsum}) {
    50          
430 0         0 $self->load_extra_data($options{refseqsum}, 'refseqsum');
431             }
432             elsif (exists $options{summary}) {
433 0         0 $self->load_extra_data($options{summary}, 'refseqsum');
434             }
435 3 50       10 if (exists $options{refseqstat}) {
    50          
436 0         0 $self->load_extra_data($options{refseqstat}, 'refseqstat');
437             }
438             elsif (exists $options{status}) {
439 0         0 $self->load_extra_data($options{status}, 'refseqstat');
440             }
441 3 50       8 if (exists $options{kgxref}) {
442 0         0 $self->load_extra_data($options{kgxref}, 'kgxref');
443             }
444 3 50       10 if (exists $options{ensembltogenename}) {
    50          
445 0         0 $self->load_extra_data($options{ensembltogenename}, 'ensembltogene');
446             }
447             elsif (exists $options{ensname}) {
448 0         0 $self->load_extra_data($options{ensname}, 'ensembltogene');
449             }
450 3 50       11 if (exists $options{ensemblsource}) {
    100          
451 0         0 $self->load_extra_data($options{ensemblsource}, 'ensemblsource');
452             }
453             elsif (exists $options{enssrc}) {
454 1         4 $self->load_extra_data($options{enssrc}, 'ensemblsource');
455             }
456 3 100       10 if (exists $options{class}) {
457 2         5 $self->{sfclass} = $options{class};
458             }
459             }
460             }
461            
462             # done
463 5         15 return $self;
464             }
465              
466             sub version {
467 0     0 0 0 return shift->{version};
468             }
469              
470             sub source {
471 123     123 1 160 my $self = shift;
472 123 100       218 if (@_) {
473 5         9 $self->{'source'} = shift;
474             }
475 123         284 return $self->{'source'};
476             }
477              
478             sub simplify {
479             # this doesn't do anything, for now, but maintain compatibility with gff parser
480 2     2 0 3 return 0;
481             }
482              
483             sub do_gene {
484 95     95 1 665 my $self = shift;
485 95 100       169 if (@_) {
486 3         5 $self->{'do_gene'} = shift;
487             }
488 95         171 return $self->{'do_gene'};
489             }
490              
491             sub do_exon {
492 93     93 1 108 my $self = shift;
493 93 100       169 if (@_) {
494 4         9 $self->{'do_exon'} = shift;
495             }
496 93         171 return $self->{'do_exon'};
497             }
498              
499             sub do_cds {
500 55     55 1 70 my $self = shift;
501 55 100       117 if (@_) {
502 1         2 $self->{'do_cds'} = shift;
503             }
504 55         102 return $self->{'do_cds'};
505             }
506              
507             sub do_utr {
508 53     53 1 68 my $self = shift;
509 53 100       92 if (@_) {
510 1         1 $self->{'do_utr'} = shift;
511             }
512 53         110 return $self->{'do_utr'};
513             }
514              
515             sub do_codon {
516 54     54 1 73 my $self = shift;
517 54 50       107 if (@_) {
518 0         0 $self->{'do_codon'} = shift;
519             }
520 54         130 return $self->{'do_codon'};
521             }
522              
523             sub do_name {
524 231     231 1 278 my $self = shift;
525 231 50       365 if (@_) {
526 0         0 $self->{'do_name'} = shift;
527             }
528 231         403 return $self->{'do_name'};
529             }
530              
531             sub share {
532 394     394 1 449 my $self = shift;
533 394 50       537 if (@_) {
534 0         0 $self->{'share'} = shift;
535             }
536 394         984 return $self->{'share'};
537             }
538              
539             sub fh {
540 234     234 1 880 my $self = shift;
541 234 100       326 if (@_) {
542 7         11 $self->{'fh'} = shift;
543             }
544 234         2748 return $self->{'fh'};
545             }
546              
547             sub open_file {
548 7     7 1 11 my $self = shift;
549            
550             # check file
551 7         12 my $filename = shift;
552 7 50       18 unless ($filename) {
553 0         0 cluck("no file name passed!");
554 0         0 return;
555             }
556            
557             # Open filehandle object
558 7 50       51 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
559             croak " cannot open file '$filename'!\n";
560            
561             # check number of columns
562 7         14 my $ncol;
563 7         158 while (my $line = $fh->getline) {
564 14 100       958 next if $line =~ /^#/;
565 7 50       28 next unless $line =~ /\w+/;
566 7         41 my @fields = split "\t", $line;
567 7         12 $ncol = scalar(@fields);
568 7         20 last;
569             }
570 7 0 33     20 unless ($ncol == 16 or $ncol == 15 or $ncol == 12 or $ncol == 11 or $ncol == 10) {
      33        
      0        
      0        
571 0         0 carp " File '$filename' doesn't have recognizable number of columns! It has $ncol.";
572 0         0 return;
573             }
574 7 50       20 if ($ncol == 10) {
575             # turn off gene processing for simple genePred which has no gene names
576 0         0 $self->do_gene(0);
577             }
578            
579             # reopen file handle
580 7         42 $fh->close;
581 7         155 $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
582            
583             # reset source as necessary
584 7 100 66     52 if ($filename =~ /ensgene/i and $self->source eq 'UCSC') {
    50 33        
    50 33        
    50 33        
    50 33        
585 5         12 $self->source('EnsGene');
586             }
587             elsif ($filename =~ /xenorefgene/i and $self->source eq 'UCSC') {
588 0         0 $self->source('xenoRefGene');
589             }
590             elsif ($filename =~ /refgene/i and $self->source eq 'UCSC') {
591 0         0 $self->source('refGene');
592             }
593             elsif ($filename =~ /refseq/i and $self->source eq 'UCSC') {
594 0         0 $self->source('refSeq');
595             }
596             elsif ($filename =~ /knowngene/i and $self->source eq 'UCSC') {
597 0         0 $self->source('knownGene');
598             }
599            
600             # check existing data
601             # the reason this parser can handle reading a second file without having to make a
602             # new parser object (like gff and bed) is so that we can potentially recycle the
603             # extra UCSC information
604 7 100       23 if ($self->fh) {
605             # close existing
606 2         6 $self->fh->close;
607             # go ahead and clear out existing data
608 2         37 $self->{'version'} = undef;
609 2         5 $self->{'top_features'} = [];
610 2         6 $self->{'duplicate_ids'} = {};
611 2         5 $self->{'gene2seqf'} = {};
612 2         5 $self->{'id2count'} = {};
613 2         5 $self->{'counts'} = {};
614 2         4 $self->{'eof'} = 0;
615 2         4 $self->{'line_count'} = 0;
616             }
617            
618 7         19 $self->fh($fh);
619 7         19 return 1;
620             }
621              
622             sub load_extra_data {
623 5     5 1 1042 my ($self, $file, $type) = @_;
624 5 50       21 unless ($file) {
625 0         0 cluck "no file name passed!";
626 0         0 return;
627             }
628            
629             # check the type
630 5 100       45 if ($type =~ /ensembltogene|ensname/i) {
    50          
    0          
    0          
    0          
631 2         4 $type = 'ensembltogene';
632             }
633             elsif ($type =~ /ensemblsource|enssrc/i) {
634 3         7 $type = 'ensemblsource';
635             }
636             elsif ($type =~ /refseqstat|status/i) {
637 0         0 $type = 'refseqstat';
638             }
639             elsif ($type =~ /refseqsum|summary/i) {
640 0         0 $type = 'refseqsum';
641             }
642             elsif ($type =~ /kgxref/i) {
643 0         0 $type = 'kgxref';
644             }
645             else {
646 0         0 carp "unknown type '$type' to load extra data";
647 0         0 return;
648             }
649            
650 5         29 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($file);
651 5 50       14 unless ($fh) {
652 0         0 carp "unable to open file '$file'! $!";
653 0         0 return;
654             }
655            
656             # load ensembl data
657 5         11 my $count = 0;
658 5 50       17 if ($type =~ /ensembl/) {
659             # we will store gene name in position 0, and source in position 1
660 5 100       13 my $index = $type eq 'ensembltogene' ? 0 : 1;
661 5         120 while (my $line = $fh->getline) {
662            
663             # process line
664 85         2015 chomp $line;
665 85 50       132 next if ($line =~ /^#/);
666 85         129 my @line_data = split /\t/, $line;
667 85 50       125 if (scalar @line_data != 2) {
668 0         0 carp " file $file doesn't seem right!? Line has " .
669             scalar @line_data . " elements!\n";
670 0         0 return;
671             }
672            
673             # store data into hash
674 85         178 $self->{'ensembldata'}{ $line_data[0] }->[$index] = $line_data[1];
675 85         1031 $count++;
676             }
677             }
678            
679             # load various refSeq data
680             else {
681             # we just store the line data based on the gene ID
682             # each table has different elements
683             # here they are for reference
684            
685             ### refSeqStatus table
686             # 0 mrnaAcc RefSeq gene accession name
687             # 1 status Status ('Unknown', 'Reviewed', 'Validated', 'Provisional', 'Predicted', 'Inferred')
688             # 2 molecule type ('DNA', 'RNA', 'ds-RNA', 'ds-mRNA', 'ds-rRNA', 'mRNA', 'ms-DNA', 'ms-RNA', 'rRNA', 'scRNA', 'snRNA', 'snoRNA', 'ss-DNA', 'ss-RNA', 'ss-snoRNA', 'tRNA', 'cRNA', 'ss-cRNA', 'ds-cRNA', 'ms-rRNA') values molecule type
689            
690             ### refSeqSummary table
691             # 0 RefSeq mRNA accession
692             # 1 completeness FullLength ('Unknown', 'Complete5End', 'Complete3End', 'FullLength', 'IncompleteBothEnds', 'Incomplete5End', 'Incomplete3End', 'Partial')
693             # 1 summary text values Summary comments
694            
695             ### kgXref table
696             # 0 kgID Known Gene ID
697             # 1 mRNA mRNA ID
698             # 2 spID SWISS-PROT protein Accession number
699             # 3 spDisplayID SWISS-PROT display ID
700             # 4 geneSymbol Gene Symbol
701             # 5 refseq RefSeq ID
702             # 6 protAcc NCBI protein Accession number
703             # 7 description Description
704            
705             ### ensemblToGeneName table
706             # 0 Ensembl transcript ID
707             # 1 gene name
708            
709             # load the table
710 0         0 while (my $line = $fh->getline) {
711 0         0 chomp $line;
712 0 0       0 next if ($line =~ /^#/);
713 0         0 my @line_data = split /\t/, $line;
714            
715             # the unique id should be the first element in the array
716             # take it off the array, since it doesn't need to be stored there too
717 0         0 my $id = shift @line_data;
718            
719             # check for duplicate lines
720 0 0       0 if (exists $self->{$type}{$id} ) {
721 0         0 warn " $type line for identifier $id exists twice!\n";
722 0         0 next;
723             }
724            
725             # store data into hash
726 0         0 $self->{$type}{$id} = [@line_data];
727 0         0 $count++;
728             }
729             }
730            
731 5         139 $fh->close;
732 5         85 return $count;
733             }
734              
735             sub typelist {
736 3     3 1 5 my $self = shift;
737 3         5 my @items;
738 3         5 foreach my $k (keys %{$self->{counts}}) {
  3         8  
739 0 0       0 push @items, $k if $self->{counts}{$k} > 0;
740             }
741 3 50       8 if (@items) {
742 0         0 return join(',', @items);
743             }
744             else {
745             # return generic list
746 3 100       7 return $self->do_gene ? 'gene,mRNA,ncRNA,exon,CDS' : 'mRNA,ncRNA,exon,CDS';
747             }
748             }
749              
750             sub next_feature {
751 92     92 1 1272 my $self = shift;
752            
753             # check that we have an open filehandle
754 92 50       178 unless ($self->fh) {
755 0         0 croak("no UCSC file loaded to parse!");
756             }
757            
758 92         167 while (my $line = $self->fh->getline) {
759 114         2239 chomp $line;
760 114 100 66     536 if ($line =~ /^#/ or $line !~ /\w+/) {
761 27         64 $self->{line_count}++;
762 27         65 next;
763             }
764 87         273 my $builder = Bio::ToolBox::parser::ucsc::builder->new($line, $self);
765 87         122 $self->{line_count}++;
766 87 50       175 unless ($builder) {
767             # builder will print its own error message if fails
768 0         0 warn " unable to parse line number ", $self->{line_count}, "\n";
769 0         0 next;
770             }
771            
772             # generate the feature from the line
773 87         98 my $feature;
774 87 100       199 if ($self->do_gene) {
775 70         122 $feature = $builder->build_gene;
776             }
777             else {
778 17         37 $feature = $builder->build_transcript;
779             }
780            
781             # return the object, we do this while loop once per valid line
782 87         521 return $feature;
783             }
784            
785             # presumed end of file
786 5         206 $self->{'eof'} = 1;
787 5         14 return;
788             }
789              
790             sub next_top_feature {
791 25     25 1 1058 my $self = shift;
792 25 50       37 unless ($self->{'eof'}) {
793 0         0 $self->parse_table;
794             }
795 25         27 return shift @{ $self->{top_features} };
  25         49  
796             }
797              
798             sub top_features {
799 2     2 1 1389 my $self = shift;
800 2 50       8 unless ($self->{'eof'}) {
801 0         0 $self->parse_table;
802             }
803 2         4 my @features = @{ $self->{top_features} };
  2         5  
804 2 50       8 return wantarray ? @features : \@features;
805             }
806              
807             *parse_file = \&parse_table;
808              
809             sub parse_table {
810 5     5 1 7704 my $self = shift;
811 5 100       15 if (@_) {
812 2 50       6 $self->open_file(shift) or return;
813             }
814 5 50       13 unless ($self->fh) {
815 0         0 carp "must open a file first!";
816 0         0 return;
817             }
818 5 50       16 return if ($self->{'eof'});
819            
820             #### Main Loop
821 5         393 print " Parsing UCSC gene table....\n";
822 5         26 while (my $feature = $self->next_feature) {
823            
824             # add to gene2seqf hash
825 85         196 my $gene = $self->find_gene($feature);
826 85 50       157 unless ($gene) {
827             # the current feature is not in the hash, so add it
828 85         216 $self->{gene2seqf}->{ lc $feature->display_name } = [ $feature ];
829             }
830            
831             # check chromosome
832 85         260 my $s = $feature->seq_id;
833 85 100       275 unless (exists $self->{seq_ids}{$s}) {
834 5         15 $self->{seq_ids}{$s} = $feature->end;
835             }
836 85 100       170 $self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
837             }
838            
839             # add to the top list of features, Schwartzian transform and sort
840             # based on the genes found in the gene2seqf hash
841 5         14 push @{ $self->{top_features} },
842 37         45 map {$_->[2]}
843 83 0       182 sort {$a->[0] cmp $b->[0] or $a->[1] <=> $b->[1]}
844             map [$_->seq_id, $_->start, $_],
845 37         68 map @{ $self->{gene2seqf}->{$_} },
846 5         9 keys %{$self->{gene2seqf}};
  5         25  
847            
848 5         29 return 1;
849             }
850              
851             sub find_gene {
852 87     87 1 20826 my $self = shift;
853            
854             # get the name and coordinates from arguments
855 87         138 my ($name, $id, $chrom, $start, $end, $strand);
856 87 50       212 if (scalar @_ == 0) {
    50          
857 0         0 carp "must provide information to find_gene method!";
858 0         0 return;
859             }
860             elsif (scalar @_ == 1) {
861 87         113 $name = $_[0];
862             }
863             else {
864 0         0 my %opt = @_;
865 0   0     0 $name = $opt{name} || $opt{display_name} || undef;
866 0   0     0 $id = $opt{id} || $opt{primary_id} || undef;
867 0   0     0 $chrom = $opt{chrom} || $opt{seq_id} || undef;
868 0   0     0 $start = $opt{start} || undef;
869 0   0     0 $end = $opt{stop} || $opt{end} || undef;
870 0   0     0 $strand = $opt{strand} || 0;
871             }
872 87 50       149 unless ($name) {
873 0         0 carp "name is required for find_gene!";
874 0         0 return;
875             }
876            
877             # check if a gene with this name exists
878 87 100       384 if (exists $self->{gene2seqf}->{lc $name} ) {
879             # we found a matching gene
880            
881             # pull out the gene seqfeature(s) array reference
882             # there may be more than one gene
883 2         5 my $genes = $self->{gene2seqf}->{ lc $name };
884            
885             # go through a series of checks to find the appropriate
886 2 50       6 if ($id) {
887 0         0 foreach my $g (@$genes) {
888 0 0       0 if ($g->primary_id eq $id) {
889 0         0 return $g;
890             }
891             }
892 0         0 return; # none of these matched despite having an ID
893             }
894 2 0 33     8 if ($chrom and $start and $end) {
      33        
895 0         0 foreach my $g (@$genes) {
896 0 0 0     0 if (
      0        
897             # overlap method borrowed from Bio::RangeI
898             ($g->strand == $strand) and not (
899             $g->start > $end or
900             $g->end < $start
901             )
902             ) {
903             # gene and transcript overlap on the same strand
904             # we found the intersecting gene
905 0         0 return $g;
906             }
907             }
908 0         0 return; # none of these matched despite having coordinate info
909             }
910 2 50       6 if (scalar @$genes == 1) {
    0          
911             # going on trust here that this is the one
912 2         6 return $genes->[0];
913             }
914             elsif (scalar @$genes > 1) {
915 0         0 carp "more than one gene named $name found!";
916 0         0 return $genes->[0];
917             }
918            
919             # nothing suitable found
920 0         0 return;
921             }
922             }
923              
924             sub counts {
925 2     2 1 3041 my $self = shift;
926 2         5 my %counts = %{ $self->{counts} };
  2         14  
927 2 50       22 return wantarray ? %counts : \%counts;
928             }
929              
930             sub from_ucsc_string {
931 0     0 1 0 my ($self, $string) = @_;
932 0 0       0 return unless $string;
933 0         0 my $builder = Bio::ToolBox::parser::ucsc::builder->new($string, $self);
934 0 0       0 return unless $builder;
935 0 0       0 if ($self->do_gene) {
936 0         0 return $builder->build_gene;
937             }
938             else {
939 0         0 return $builder->build_transcript;
940             }
941             }
942              
943             sub seq_ids {
944 0     0 1 0 my $self = shift;
945 0         0 my @s = keys %{$self->{seq_ids}};
  0         0  
946 0 0       0 return wantarray ? @s : \@s;
947             }
948              
949             sub seq_id_lengths {
950 0     0 1 0 my $self = shift;
951 0         0 return $self->{seq_ids};
952             }
953              
954              
955              
956             package Bio::ToolBox::parser::ucsc::builder;
957 2     2   22 use strict;
  2         3  
  2         55  
958 2     2   9 use Carp qw(carp cluck croak);
  2         4  
  2         7689  
959             our $SFCLASS = ''; # SeqFeature class to use
960              
961             1;
962              
963             sub new {
964 173     173   313 my ($class, $line, $ucsc) = @_;
965 173         203 my %self;
966             my $format;
967            
968             # check SeqFeature class
969 173 100       371 if ($ucsc->{sfclass} ne $SFCLASS) {
970 3         9 $SFCLASS = $ucsc->{sfclass};
971 3 50       141 eval "require $SFCLASS" or croak $@;
972             }
973            
974 173         207 chomp $line;
975 173         641 my @linedata = split /\t/, $line;
976            
977             # we're identifying the type of table based on the number of columns
978             # may not be the best or accurate method, but it generally works for valid formats
979            
980             ### Extended Gene Prediction Table ###
981 173 100       474 if (scalar @linedata == 16) {
    50          
    50          
    50          
    50          
982             # an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
983             # as downloaded from the UCSC Table Browser or FTP site
984             # includes the bin number as the first column
985            
986             # 0 bin
987             # 1 name
988             # 2 chrom
989             # 3 strand
990             # 4 txStart
991             # 5 txEnd
992             # 6 cdsStart
993             # 7 cdsEnd
994             # 8 exonCount
995             # 9 exonStarts
996             # 10 exonEnds
997             # 11 score
998             # 12 name2
999             # 13 cdsStartStat
1000             # 14 cdsEndStat
1001             # 15 exonFrames
1002            
1003 87         105 $format = 'genePredExt with bin';
1004 87         157 $self{name} = $linedata[1];
1005 87         130 $self{chrom} = $linedata[2];
1006 87         125 $self{strand} = $linedata[3];
1007 87         171 $self{txStart} = $linedata[4] + 1;
1008 87         136 $self{txEnd} = $linedata[5];
1009 87         160 $self{cdsStart} = $linedata[6] + 1;
1010 87         123 $self{cdsEnd} = $linedata[7];
1011 87         191 $self{exonCount} = $linedata[8];
1012 87         123 $self{exonStarts} = $linedata[9];
1013 87         108 $self{exonEnds} = $linedata[10];
1014 87   50     192 $self{name2} = $linedata[12] || undef;
1015 87   50     352 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] ||
1016             $linedata[12] || undef;
1017 87   50     295 $self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] || undef;
1018 87   50     259 $self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] || undef;
1019 87   50     231 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] || undef;
1020 87 50       185 if ($linedata[1] =~ /^N[MR]_\d+/) {
1021 0         0 $self{refseq} = $linedata[1];
1022             }
1023             }
1024             ### Extended Gene Prediction Table ###
1025             elsif (scalar @linedata == 15) {
1026             # an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
1027             # without the bin value
1028            
1029             # 0 name
1030             # 1 chrom
1031             # 2 strand
1032             # 3 txStart
1033             # 4 txEnd
1034             # 5 cdsStart
1035             # 6 cdsEnd
1036             # 7 exonCount
1037             # 8 exonStarts
1038             # 9 exonEnds
1039             # 10 score
1040             # 11 name2
1041             # 12 cdsStartStat
1042             # 13 cdsEndStat
1043             # 14 exonFrames
1044            
1045 0         0 $format = 'genePredExt';
1046 0         0 $self{name} = $linedata[0];
1047 0         0 $self{chrom} = $linedata[1];
1048 0         0 $self{strand} = $linedata[2];
1049 0         0 $self{txStart} = $linedata[3] + 1;
1050 0         0 $self{txEnd} = $linedata[4];
1051 0         0 $self{cdsStart} = $linedata[5] + 1;
1052 0         0 $self{cdsEnd} = $linedata[6];
1053 0         0 $self{exonCount} = $linedata[7];
1054 0         0 $self{exonStarts} = $linedata[8];
1055 0         0 $self{exonEnds} = $linedata[9];
1056 0   0     0 $self{name2} = $linedata[11] || undef;
1057 0   0     0 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[0] }->[0] ||
1058             $linedata[11] || undef;
1059 0   0     0 $self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] || undef;
1060 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] || undef;
1061 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] || undef;
1062 0 0       0 if ($linedata[0] =~ /^N[MR]_\d+/) {
1063 0         0 $self{refseq} = $linedata[0];
1064             }
1065             }
1066             ### Known Gene Table ###
1067             elsif (scalar @linedata == 12) {
1068            
1069             # 0 name known gene identifier
1070             # 1 chrom Reference sequence chromosome or scaffold
1071             # 2 strand + or - for strand
1072             # 3 txStart Transcription start position
1073             # 4 txEnd Transcription end position
1074             # 5 cdsStart Coding region start
1075             # 6 cdsEnd Coding region end
1076             # 7 exonCount Number of exons
1077             # 8 exonStarts Exon start positions
1078             # 9 exonEnds Exon end positions
1079             # 10 proteinID UniProt display ID for Known Genes, UniProt accession or RefSeq protein ID for UCSC Genes
1080             # 11 alignID Unique identifier for each (known gene, alignment position) pair
1081            
1082 0         0 $format = 'knownGene';
1083 0   0     0 $self{name} = $ucsc->{kgxref}->{ $linedata[0] }->[0] || $linedata[0];
1084 0         0 $self{chrom} = $linedata[1];
1085 0         0 $self{strand} = $linedata[2];
1086 0         0 $self{txStart} = $linedata[3] + 1;
1087 0         0 $self{txEnd} = $linedata[4];
1088 0         0 $self{cdsStart} = $linedata[5] + 1;
1089 0         0 $self{cdsEnd} = $linedata[6];
1090 0         0 $self{exonCount} = $linedata[7];
1091 0         0 $self{exonStarts} = $linedata[8];
1092 0         0 $self{exonEnds} = $linedata[9];
1093 0         0 $self{name2} = $linedata[0];
1094             $self{gene_name} = $ucsc->{kgxref}->{ $linedata[0] }->[3] || # geneSymbol
1095             $ucsc->{kgxref}->{ $linedata[0] }->[0] || # mRNA id
1096 0   0     0 $ucsc->{kgxref}->{ $linedata[0] }->[4] || # refSeq id
1097             $linedata[0]; # ugly default
1098 0   0     0 $self{note} = $ucsc->{kgxref}->{ $linedata[0] }->[6] || undef;
1099 0   0     0 $self{refseq} = $ucsc->{kgxref}->{ $linedata[0] }->[4] || undef;
1100 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $self{refseq} }->[0] || undef;
1101 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $self{refseq} }->[0] || undef;
1102 0   0     0 $self{spid} = $ucsc->{kgxref}->{ $linedata[0] }->[1] || undef; # SwissProt ID
1103 0   0     0 $self{spdid} = $ucsc->{kgxref}->{ $linedata[0] }->[2] || undef; # SwissProt display ID
1104 0   0     0 $self{protacc} = $ucsc->{kgxref}->{ $linedata[0] }->[5] || undef; # NCBI protein accession
1105             }
1106             ### refFlat or Gene Prediction Table ###
1107             elsif (scalar @linedata == 11) {
1108            
1109             # 0 name2 or gene name
1110             # 1 name or transcript name
1111             # 2 chrom
1112             # 3 strand
1113             # 4 txStart
1114             # 5 txEnd
1115             # 6 cdsStart
1116             # 7 cdsEnd
1117             # 8 exonCount
1118             # 9 exonStarts
1119             # 10 exonEnds
1120            
1121 0         0 $format = 'refFlat';
1122 0   0     0 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] ||
1123             $linedata[0] || undef;
1124 0         0 $self{name2} = $linedata[0];
1125 0         0 $self{name} = $linedata[1];
1126 0         0 $self{chrom} = $linedata[2];
1127 0         0 $self{strand} = $linedata[3];
1128 0         0 $self{txStart} = $linedata[4] + 1;
1129 0         0 $self{txEnd} = $linedata[5];
1130 0         0 $self{cdsStart} = $linedata[6] + 1;
1131 0         0 $self{cdsEnd} = $linedata[7];
1132 0         0 $self{exonCount} = $linedata[8];
1133 0         0 $self{exonStarts} = $linedata[9];
1134 0         0 $self{exonEnds} = $linedata[10];
1135 0   0     0 $self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] || undef;
1136 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] || undef;
1137 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] || undef;
1138 0 0       0 if ($linedata[1] =~ /^N[MR]_\d+/) {
1139 0         0 $self{refseq} = $linedata[1];
1140             }
1141             }
1142             ### Gene Prediction Table ###
1143             elsif (scalar @linedata == 10) {
1144             # a simple gene prediction table, e.g. refGene, ensGene, xenoRefGene
1145            
1146             # 0 name
1147             # 1 chrom
1148             # 2 strand
1149             # 3 txStart
1150             # 4 txEnd
1151             # 5 cdsStart
1152             # 6 cdsEnd
1153             # 7 exonCount
1154             # 8 exonStarts
1155             # 9 exonEnds
1156            
1157 86         112 $format = 'genePred';
1158 86         139 $self{name} = $linedata[0];
1159 86         131 $self{chrom} = $linedata[1];
1160 86         109 $self{strand} = $linedata[2];
1161 86         121 $self{txStart} = $linedata[3] + 1;
1162 86         111 $self{txEnd} = $linedata[4];
1163 86         122 $self{cdsStart} = $linedata[5] + 1;
1164 86         114 $self{cdsEnd} = $linedata[6];
1165 86         171 $self{exonCount} = $linedata[7];
1166 86         121 $self{exonStarts} = $linedata[8];
1167 86         109 $self{exonEnds} = $linedata[9];
1168 86         118 $self{name2} = $linedata[0]; # re-use transcript name
1169 86         106 $self{gene_name} = $linedata[0]; # re-use transcript name
1170 86   50     318 $self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] || undef;
1171 86   50     247 $self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] || undef;
1172 86   50     220 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] || undef;
1173 86 50       157 if ($linedata[0] =~ /^N[MR]_\d+/) {
1174 0         0 $self{refseq} = $linedata[0];
1175             }
1176             }
1177             else {
1178             # unrecognized line format
1179 0         0 carp "unrecognized format, line has " . scalar @linedata . "elements";
1180 0         0 return;
1181             }
1182            
1183             # verify
1184 173         253 my @errors;
1185 173 50       454 push @errors, 'strand' unless $self{strand} =~ /^[\+\-\.]$/;
1186 173 50       512 push @errors, 'txStart' unless $self{txStart} =~ /^\d+$/;
1187 173 50       373 push @errors, 'txEnd' unless $self{txEnd} =~ /^\d+$/;
1188 173 50       406 push @errors, 'cdsStart' unless $self{cdsStart} =~ /^\d+$/;
1189 173 50       401 push @errors, 'cdsEnd' unless $self{cdsEnd} =~ /^\d+$/;
1190 173 50       386 push @errors, 'exonCount' unless $self{exonCount} =~ /^\d+$/;
1191 173 50       396 push @errors, 'exonStarts' unless $self{exonStarts} =~ /^[\d,]+$/;
1192 173 50       392 push @errors, 'exonEnds' unless $self{exonEnds} =~ /^[\d,]+$/;
1193 173 50       288 if (@errors) {
1194 0         0 warn "line format for $format has the following errors: @errors";
1195 0         0 return;
1196             }
1197            
1198             # fix values
1199 173 100       368 $self{strand} = $self{strand} eq '+' ? 1 : $self{strand} eq '-' ? -1 : 0;
    100          
1200 173         455 $self{exonStarts} = [ map {$_ += 1} ( split ",", $self{exonStarts} ) ];
  862         1371  
1201 173         566 $self{exonEnds} = [ ( split ",", $self{exonEnds} ) ];
1202            
1203             # Attempt to identify the transcript type
1204 173   100     593 my $type = $ucsc->{ensembldata}->{ $self{name} }->[1] || undef;
1205             # check if we have loaded ensembl source data and use that if available
1206 173 100       392 if ( $self{cdsStart} - 1 == $self{cdsEnd} ) {
1207             # there appears to be no coding potential when
1208             # txEnd = cdsStart = cdsEnd
1209             # if you'll look, all of the exon phases should also be -1
1210            
1211 63 100       236 if ($type) {
    50          
    50          
    50          
1212             # we have an ensembl source type, so prefer to use that
1213 21         70 $self{type} = $type;
1214             }
1215            
1216             # otherwise, we may be able to infer some certain
1217             # types from the gene name
1218             elsif ($self{name2} =~ /^mir/i) {
1219             # a noncoding gene whose name begins with mir is likely a micro RNA
1220 0         0 $self{type} = 'miRNA';
1221             }
1222             elsif ($self{name2} =~ /^snr/i) {
1223             # a noncoding gene whose name begins with snr is likely a snRNA
1224 0         0 $self{type} = 'snRNA';
1225             }
1226             elsif ($self{name2} =~ /^sno/i) {
1227             # a noncoding gene whose name begins with sno is likely a snoRNA
1228 0         0 $self{type} = 'snoRNA';
1229             }
1230             else {
1231             # a generic ncRNA
1232 42         98 $self{type} = 'ncRNA';
1233             }
1234             }
1235             else {
1236             # the transcript has an identifiable CDS so likely a mRNA
1237 110 100       397 $self{type} = defined $type ? $type : 'mRNA';
1238             }
1239            
1240             # add the ucsc object
1241 173         290 $self{ucsc} = $ucsc;
1242            
1243 173         558 return bless \%self, $class;
1244             }
1245              
1246             sub name {
1247 692     692   1757 return shift->{name};
1248             }
1249              
1250             sub name2 {
1251 229     229   470 return shift->{name2};
1252             }
1253              
1254             sub gene_name {
1255 371     371   800 return shift->{gene_name};
1256             }
1257              
1258             sub chrom {
1259 195     195   489 return shift->{chrom};
1260             }
1261              
1262             sub txStart {
1263 291     291   648 return shift->{txStart};
1264             }
1265              
1266             sub txEnd {
1267 295     295   597 return shift->{txEnd};
1268             }
1269              
1270             sub strand {
1271 243     243   537 return shift->{strand};
1272             }
1273              
1274             sub cdsStart {
1275 811     811   1592 return shift->{cdsStart};
1276             }
1277              
1278             sub cdsEnd {
1279 409     409   860 return shift->{cdsEnd};
1280             }
1281              
1282             sub exonCount {
1283 942     942   10705 return shift->{exonCount};
1284             }
1285              
1286             sub exonStarts {
1287 1412     1412   3281 return shift->{exonStarts};
1288             }
1289              
1290             sub exonEnds {
1291 1014     1014   1559 return shift->{exonEnds};
1292             }
1293              
1294             sub type {
1295 120     120   199 return shift->{type};
1296             }
1297              
1298             sub refseq {
1299 243     243   276 my $self = shift;
1300 243 50       490 return exists $self->{refseq} ? $self->{refseq} : undef;
1301             }
1302              
1303             sub note {
1304 243     243   273 my $self = shift;
1305 243 50       538 return exists $self->{note} ? $self->{note} : undef;
1306             }
1307              
1308             sub status {
1309 173     173   191 my $self = shift;
1310 173 50       344 return exists $self->{status} ? $self->{status} : undef;
1311             }
1312              
1313             sub completeness {
1314 173     173   201 my $self = shift;
1315 173 50       365 return exists $self->{completeness} ? $self->{completeness} : undef;
1316             }
1317              
1318             sub ucsc {
1319 493     493   737 return shift->{ucsc};
1320             }
1321              
1322             sub build_gene {
1323 70     70   81 my $self = shift;
1324            
1325             # shortcuts
1326 70         121 my $ucsc = $self->ucsc;
1327 70         96 my $id2count = $ucsc->{id2count};
1328 70         77 my $ensembldata = $ucsc->{ensembldata};
1329            
1330             # find a pre-existing gene to update or build a new one
1331 70         159 my $gene = $self->find_gene;
1332 70 100       104 if ($gene) {
1333             # update as necessary
1334 48 50       68 if ( ($self->txStart) < $gene->start) {
1335             # update the transcription start position
1336 0         0 $gene->start( $self->txStart );
1337             }
1338 48 100       142 if ($self->txEnd > $gene->end) {
1339             # update the transcription stop position
1340 4         16 $gene->end( $self->txEnd );
1341             }
1342             }
1343             else {
1344             # build a new gene
1345 22         44 $gene = $SFCLASS->new(
1346             -seq_id => $self->chrom,
1347             -source => $ucsc->source,
1348             -primary_tag => 'gene',
1349             -start => $self->txStart,
1350             -end => $self->txEnd,
1351             -strand => $self->strand,
1352             -phase => '.',
1353             -display_name => $self->gene_name,
1354             );
1355 22         296 $ucsc->{counts}->{gene} += 1;
1356            
1357             # Add a unique primary ID
1358 22         42 my $id = $self->name2;
1359 22 50       49 if (exists $id2count->{ lc $id }) {
1360             # we've encountered this gene ID before
1361            
1362             # then make name unique by appending the count number
1363 0         0 $id2count->{ lc $id } += 1;
1364 0         0 $id .= '.' . $id2count->{ lc $id };
1365             }
1366             else {
1367             # this is the first transcript with this id
1368             # set the id counter
1369 22         48 $id2count->{lc $id} = 0;
1370             }
1371 22         55 $gene->primary_id($id);
1372            
1373             # Add an alias
1374 22 100       59 if ($self->name2 ne $self->gene_name) {
1375 12         15 $gene->add_tag_value('Alias', $self->name2);
1376             }
1377             }
1378            
1379             # now build the transcript for the gene
1380 70         238 my $transcript = $self->build_transcript($gene);
1381 70         170 $gene->add_SeqFeature($transcript);
1382 70         1236 $transcript->add_tag_value('Parent', $gene->primary_id);
1383            
1384             # update extra attributes as necessary
1385 70         309 $self->update_attributes($gene);
1386            
1387             # finished
1388 70         119 return $gene;
1389             }
1390              
1391             sub build_transcript {
1392 173     173   262 my ($self, $gene) = @_; # gene is not required
1393            
1394             # shortcuts
1395 173         238 my $ucsc = $self->ucsc;
1396 173         235 my $id2count = $ucsc->{id2count};
1397 173         218 my $ensembldata = $ucsc->{ensembldata};
1398 173         232 my $counts = $ucsc->{counts};
1399            
1400             # Uniqueify the transcript ID and name
1401 173         260 my $id = $self->name;
1402 173 50       334 if (exists $id2count->{ lc $id } ) {
1403             # we've encountered this transcript ID before
1404            
1405             # now need to make ID unique by appending a number
1406 0         0 $id2count->{ lc $id } += 1;
1407 0         0 $id .= '.' . $id2count->{ lc $id };
1408             }
1409             else {
1410             # this is the first transcript with this id
1411 173         338 $id2count->{lc $id} = 0;
1412             }
1413            
1414             # identify the primary_tag value
1415 173         201 my ($type, $biotype);
1416 173 50       233 if (exists $ensembldata->{$self->name}) {
1417 173   100     232 my $t = $ensembldata->{$self->name}->[1] || undef;
1418 173 100 100     796 if ($t and $t =~ /protein.coding/i) {
    100 100        
    100          
1419 26         40 $type = 'mRNA';
1420 26         38 $biotype = $t;
1421             }
1422             elsif ($t and $t =~ /rna|transcript/i) {
1423 15         23 $type = $t;
1424 15         21 $biotype = $t;
1425             }
1426             elsif ($t) {
1427 12         17 $type = 'transcript';
1428 12         24 $biotype = $t;
1429             }
1430             else {
1431 120         189 $type = $self->type;
1432             }
1433             }
1434             else {
1435 0         0 $type = $self->type;
1436             }
1437            
1438             # Generate the transcript SeqFeature object
1439 173         332 my $transcript = $SFCLASS->new(
1440             -seq_id => $self->chrom,
1441             -source => $ucsc->source,
1442             -primary_tag => $type,
1443             -start => $self->txStart,
1444             -end => $self->txEnd,
1445             -strand => $self->strand,
1446             -phase => '.',
1447             -display_name => $self->name,
1448             -primary_id => $id,
1449             );
1450            
1451             # add gene name as an alias
1452 173 100       1976 if ($self->gene_name ne $self->name2) {
1453 36         59 $transcript->add_tag_value('Alias', $self->gene_name);
1454             }
1455            
1456             # update extra attributes as necessary
1457 173         487 $self->update_attributes($transcript);
1458            
1459             # add transcript specific attributes
1460 173 50       274 if (defined $self->completeness ) {
1461 0         0 $transcript->add_tag_value( 'completeness', $self->completeness );
1462             }
1463 173 50       267 if (defined $self->status ) {
1464 0         0 $transcript->add_tag_value( 'status', $self->status );
1465             }
1466 173 100       266 if ($biotype) {
1467 53         93 $transcript->add_tag_value( 'biotype', $biotype);
1468             }
1469            
1470             # add the exons
1471 173 100       459 if ($ucsc->do_exon) {
1472 122         220 $self->add_exons($transcript, $gene);
1473             }
1474            
1475             # add CDS, UTRs, and codons if necessary
1476 173 100       323 if ( $self->cdsStart - 1 != $self->cdsEnd ) {
1477            
1478 110 100       261 if ($ucsc->do_utr) {
1479 10         33 $self->add_utrs($transcript, $gene);
1480             }
1481            
1482 110 50       215 if ($ucsc->do_codon) {
1483 0         0 $self->add_codons($transcript, $gene);
1484             }
1485            
1486 110 100       218 if ($ucsc->do_cds) {
1487 10         29 $self->add_cds($transcript);
1488             }
1489             }
1490            
1491             # record the type of transcript
1492 173         340 $counts->{$type} += 1;
1493            
1494             # transcript is complete
1495 173         357 return $transcript;
1496             }
1497              
1498             sub update_attributes {
1499 243     243   355 my ($self, $seqf) = @_;
1500            
1501             # add Note if possible
1502 243 50       389 if (defined $self->note ) {
1503 0         0 $self->add_unique_attribute($seqf, 'Note', $self->note );
1504             }
1505            
1506             # add refSeq identifier if possible
1507 243 50       445 if (defined $self->refseq) {
1508 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->refseq);
1509             }
1510            
1511             # add SwissProt identifier if possible
1512 243 50 33     496 if (exists $self->{spid} and defined $self->{spid}) {
1513 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'Swiss-Prot:' . $self->{spid});
1514             }
1515            
1516             # add SwissProt display identifier if possible
1517 243 50 33     429 if (exists $self->{spdid} and defined $self->{spdid}) {
1518 0         0 $self->add_unique_attribute($seqf, 'swiss-prot_display_id', $self->{spdid});
1519             }
1520            
1521             # add NCBI protein access identifier if possible
1522 243 50 33     408 if (exists $self->{protacc} and defined $self->{protacc}) {
1523 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->{protacc});
1524             }
1525             }
1526              
1527             sub add_unique_attribute {
1528 0     0   0 my ($self, $seqf, $tag, $value) = @_;
1529            
1530             # look for a pre-existing identical tag value
1531 0         0 my $check = 1;
1532 0         0 foreach ($seqf->get_tag_values($tag)) {
1533 0 0       0 if ($_ eq $value) {
1534 0         0 $check = 0;
1535 0         0 last;
1536             }
1537             }
1538            
1539             # add it if our value is unique
1540 0 0       0 $seqf->add_tag_value($tag, $value) if $check;
1541             }
1542              
1543             sub add_exons {
1544 122     122   175 my ($self, $transcript, $gene) = @_;
1545 122         196 my $ucsc = $self->ucsc;
1546            
1547             # Add the exons
1548             EXON_LOOP:
1549 122         214 for (my $i = 0; $i < $self->exonCount; $i++) {
1550            
1551             # first look for existing
1552 595 100 100     862 if ($ucsc->share and $gene) {
1553 277         382 my $exon = $self->find_existing_subfeature($gene, 'exon',
1554             $self->exonStarts->[$i], $self->exonEnds->[$i]);
1555 277 100       435 if ($exon) {
1556             # we found an existing exon to reuse
1557             # associate with this transcript
1558 156         349 $transcript->add_SeqFeature($exon);
1559 156         3488 next EXON_LOOP;
1560             }
1561             }
1562            
1563             # transform index for reverse strands
1564             # this will allow numbering from 5'->3'
1565 439         448 my $number;
1566 439 100       689 if ($transcript->strand == 1) {
1567             # forward strand
1568 386         1066 $number = $i;
1569             }
1570             else {
1571             # reverse strand
1572 53         100 $number = abs( $i - $self->exonCount + 1);
1573             }
1574            
1575             # build the exon seqfeature
1576 439         717 my $exon = $SFCLASS->new(
1577             -seq_id => $transcript->seq_id,
1578             -source => $transcript->source,
1579             -primary_tag => 'exon',
1580             -start => $self->exonStarts->[$i],
1581             -end => $self->exonEnds->[$i],
1582             -strand => $transcript->strand,
1583             -primary_id => $transcript->primary_id . ".exon$number",
1584             );
1585            
1586             # add name if requested
1587 439 50       7196 if ($ucsc->do_name) {
1588 0         0 $exon->display_name( $transcript->display_name . ".exon$number" );
1589             }
1590            
1591             # associate with transcript
1592 439         761 $transcript->add_SeqFeature($exon);
1593             }
1594             }
1595              
1596             sub add_utrs {
1597 10     10   22 my ($self, $transcript, $gene) = @_;
1598 10         23 my $ucsc = $self->ucsc;
1599            
1600             # we will scan each exon and look for a potential utr and build it
1601 10         13 my @utrs;
1602             UTR_LOOP:
1603 10         24 for (my $i = 0; $i < $self->exonCount; $i++) {
1604            
1605             # transform index for reverse strands
1606             # this will allow numbering from 5'->3'
1607 76         74 my $number;
1608 76 50       116 if ($transcript->strand == 1) {
1609             # forward strand
1610 76         77 $number = $i;
1611             }
1612             else {
1613             # reverse strand
1614 0         0 $number = abs( $i - $self->exonCount + 1);
1615             }
1616            
1617             # identify UTRs
1618             # we will identify by comparing the cdsStart and cdsStop relative
1619             # to the exon coordinates
1620             # the primary tag is determined by the exon strand orientation
1621 76         133 my ($start, $stop, $tag);
1622             # in case we need to build two UTRs
1623 76         0 my ($start2, $stop2, $tag2);
1624            
1625             # Split 5'UTR, CDS, and 3'UTR all on the same exon
1626 76 50 66     94 if (
    100 100        
    100 66        
    100 66        
    100 66        
    50 33        
1627             $self->exonStarts->[$i] < $self->cdsStart
1628             and
1629             $self->exonEnds->[$i] > $self->cdsEnd
1630             ) {
1631             # the CDS is entirely within the exon, resulting in two UTRs
1632             # on either side of the exon
1633             # we must build two UTRs
1634            
1635             # the left UTR
1636 0         0 $start = $self->exonStarts->[$i];
1637 0         0 $stop = $self->cdsStart - 1;
1638 0 0       0 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1639            
1640             # the right UTR
1641 0         0 $start2 = $self->cdsEnd + 1;
1642 0         0 $stop2 = $self->exonEnds->[$i];
1643 0 0       0 $tag2 = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1644             }
1645            
1646             # 5'UTR forward, 3'UTR reverse
1647             elsif (
1648             $self->exonStarts->[$i] < $self->cdsStart
1649             and
1650             $self->exonEnds->[$i] < $self->cdsStart
1651             ) {
1652             # the exon start/end is entirely before the cdsStart
1653 3         5 $start = $self->exonStarts->[$i];
1654 3         5 $stop = $self->exonEnds->[$i];
1655 3 50       6 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1656             }
1657            
1658             # Split 5'UTR & CDS on forward, 3'UTR & CDS
1659             elsif (
1660             $self->exonStarts->[$i] < $self->cdsStart
1661             and
1662             $self->exonEnds->[$i] >= $self->cdsStart
1663             ) {
1664             # the start/stop codon is in this exon
1665             # we need to make the UTR out of a portion of this exon
1666 9         15 $start = $self->exonStarts->[$i];
1667 9         17 $stop = $self->cdsStart - 1;
1668 9 50       18 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1669             }
1670            
1671             # CDS only
1672             elsif (
1673             $self->exonStarts->[$i] >= $self->cdsStart
1674             and
1675             $self->exonEnds->[$i] <= $self->cdsEnd
1676             ) {
1677             # CDS only exon
1678 50         89 next UTR_LOOP;
1679             }
1680            
1681             # Split 3'UTR & CDS on forward, 5'UTR & CDS
1682             elsif (
1683             $self->exonStarts->[$i] <= $self->cdsEnd
1684             and
1685             $self->exonEnds->[$i] > $self->cdsEnd
1686             ) {
1687             # the stop/start codon is in this exon
1688             # we need to make the UTR out of a portion of this exon
1689 7         11 $start = $self->cdsEnd + 1;
1690 7         13 $stop = $self->exonEnds->[$i];
1691 7 50       12 $tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1692             }
1693            
1694             # 3'UTR forward, 5'UTR reverse
1695             elsif (
1696             $self->exonStarts->[$i] > $self->cdsEnd
1697             and
1698             $self->exonEnds->[$i] > $self->cdsEnd
1699             ) {
1700             # the exon start/end is entirely after the cdsStop
1701             # we have a 3'UTR
1702 7         10 $start = $self->exonStarts->[$i];
1703 7         11 $stop = $self->exonEnds->[$i];
1704 7 50       10 $tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1705             }
1706            
1707             # Something else?
1708             else {
1709 0         0 my $warning = "Warning: A malformed UTR that doesn't match known criteria: ";
1710 0         0 $warning .= "cdsStart " . $self->cdsStart;
1711 0         0 $warning .= ", cdsEnd " . $self->cdsEnd;
1712 0         0 $warning .= ", exonStart " . $self->exonStarts->[$i];
1713 0         0 $warning .= ", exonEnd " . $self->exonEnds->[$i];
1714 0         0 warn $warning;
1715 0         0 next UTR_LOOP;
1716             }
1717            
1718             ## Generate the UTR objects
1719 26         41 my $utr;
1720            
1721             # look for existing utr
1722 26 50 33     38 if ($ucsc->share and $gene) {
1723 26         48 $utr = $self->find_existing_subfeature($gene, $tag, $start, $stop);
1724             }
1725            
1726             # otherwise build the UTR object
1727 26 100       39 unless ($utr) {
1728 19         66 $utr = $SFCLASS->new(
1729             -seq_id => $transcript->seq_id,
1730             -source => $transcript->source,
1731             -start => $start,
1732             -end => $stop,
1733             -strand => $transcript->strand,
1734             -phase => '.',
1735             -primary_tag => $tag,
1736             -primary_id => $transcript->primary_id . ".utr$number",
1737             );
1738 19 50       77 $utr->display_name( $transcript->display_name . ".utr$number" ) if
1739             $ucsc->do_name;
1740             }
1741            
1742             # store this utr seqfeature in a temporary array
1743 26         36 push @utrs, $utr;
1744            
1745             # build a second UTR object as necessary
1746 26 50       69 if ($start2) {
1747 0         0 my $utr2;
1748            
1749             # look for existing utr
1750 0 0       0 if ($ucsc->share) {
1751 0         0 $utr2 = $self->find_existing_subfeature($gene, $tag2, $start2, $stop2);
1752             }
1753            
1754             # otherwise build the utr
1755 0 0       0 unless ($utr2) {
1756 0         0 $utr2 = $SFCLASS->new(
1757             -seq_id => $transcript->seq_id,
1758             -source => $transcript->source,
1759             -start => $start2,
1760             -end => $stop2,
1761             -strand => $transcript->strand,
1762             -phase => '.',
1763             -primary_tag => $tag2,
1764             -primary_id => $transcript->primary_id . ".utr$number" . "a",
1765             );
1766 0 0       0 $utr2->display_name( $transcript->display_name . ".utr$number" . "a" )
1767             if $ucsc->do_name;
1768             }
1769            
1770             # store this utr seqfeature in a temporary array
1771 0         0 push @utrs, $utr2;
1772             }
1773             }
1774            
1775             # associate found UTRs with the transcript
1776 10         22 foreach my $utr (@utrs) {
1777 26         43 $transcript->add_SeqFeature($utr);
1778             }
1779             }
1780              
1781             sub add_cds {
1782 10     10   19 my ($self, $transcript) = @_;
1783            
1784             # we will NOT collapse CDS features since we cannot guarantee that a shared
1785             # CDS will have the same phase, since phase is dependent on the translation
1786             # start
1787            
1788             # we will scan each exon and look for a potential CDS and build it
1789 10         12 my @cdss;
1790 10         13 my $phase = 0; # initialize CDS phase and keep track as we process CDSs
1791             CDS_LOOP:
1792 10         26 for (my $i = 0; $i < $self->exonCount; $i++) {
1793            
1794             # transform index for reverse strands
1795 76         71 my $j;
1796 76 50       105 if ($transcript->strand == 1) {
1797             # forward strand
1798 76         80 $j = $i;
1799             }
1800             else {
1801             # reverse strand
1802             # flip the index for exon starts and stops so that we
1803             # always progress 5' -> 3'
1804             # this ensures the phase is accurate from the start codon
1805 0         0 $j = abs( $i - $self->exonCount + 1);
1806             }
1807            
1808             # identify CDSs
1809             # we will identify by comparing the cdsStart and cdsStop relative
1810             # to the exon coordinates
1811 76         92 my ($start, $stop);
1812            
1813             # Split 5'UTR, CDS, and 3'UTR all on the same exon
1814 76 50 66     100 if (
    100 100        
    100 66        
    100 66        
    100 66        
    50 33        
1815             $self->exonStarts->[$j] < $self->cdsStart
1816             and
1817             $self->exonEnds->[$j] > $self->cdsEnd
1818             ) {
1819             # exon contains the entire CDS
1820 0         0 $start = $self->cdsStart;
1821 0         0 $stop = $self->cdsEnd;
1822             }
1823            
1824             # 5'UTR forward, 3'UTR reverse
1825             elsif (
1826             $self->exonStarts->[$j] < $self->cdsStart
1827             and
1828             $self->exonEnds->[$j] < $self->cdsStart
1829             ) {
1830             # no CDS in this exon
1831 3         7 next CDS_LOOP;
1832             }
1833            
1834             # Split 5'UTR & CDS on forward, 3'UTR & CDS
1835             elsif (
1836             $self->exonStarts->[$j] < $self->cdsStart
1837             and
1838             $self->exonEnds->[$j] >= $self->cdsStart
1839             ) {
1840             # the start/stop codon is in this exon
1841             # we need to make the CDS out of a portion of this exon
1842 9         11 $start = $self->cdsStart;
1843 9         15 $stop = $self->exonEnds->[$j];
1844             }
1845            
1846             # CDS only
1847             elsif (
1848             $self->exonStarts->[$j] >= $self->cdsStart
1849             and
1850             $self->exonEnds->[$j] <= $self->cdsEnd
1851             ) {
1852             # entire exon is CDS
1853 50         70 $start = $self->exonStarts->[$j];
1854 50         60 $stop = $self->exonEnds->[$j];
1855             }
1856            
1857             # Split 3'UTR & CDS on forward, 5'UTR & CDS
1858             elsif (
1859             $self->exonStarts->[$j] <= $self->cdsEnd
1860             and
1861             $self->exonEnds->[$j] > $self->cdsEnd
1862             ) {
1863             # the stop/start codon is in this exon
1864             # we need to make the CDS out of a portion of this exon
1865 7         11 $start = $self->exonStarts->[$j];
1866 7         13 $stop = $self->cdsEnd;
1867             }
1868            
1869             # 3'UTR forward, 5'UTR reverse
1870             elsif (
1871             $self->exonStarts->[$j] > $self->cdsEnd
1872             and
1873             $self->exonEnds->[$j] > $self->cdsEnd
1874             ) {
1875             # the exon start/end is entirely after the cdsStop
1876             # we have entirely 5' or 3'UTR, no CDS
1877 7         14 next CDS_LOOP;
1878             }
1879            
1880             # Something else?
1881             else {
1882 0         0 my $warning = "Warning: A malformed CDS that doesn't match known criteria: ";
1883 0         0 $warning .= "cdsStart " . $self->cdsStart;
1884 0         0 $warning .= ", cdsEnd " . $self->cdsEnd;
1885 0         0 $warning .= ", exonStart " . $self->exonStarts->[$j];
1886 0         0 $warning .= ", exonEnd " . $self->exonEnds->[$j];
1887 0         0 warn $warning;
1888 0         0 next CDS_LOOP;
1889             }
1890            
1891             # build the CDS object
1892 66         139 my $cds = $SFCLASS->new(
1893             -seq_id => $transcript->seq_id,
1894             -source => $transcript->source,
1895             -start => $start,
1896             -end => $stop,
1897             -strand => $transcript->strand,
1898             -phase => $phase,
1899             -primary_tag => 'CDS',
1900             -primary_id => $transcript->primary_id . ".cds$i",
1901             -display_name => $transcript->display_name . ".cds$i",
1902             );
1903             # the id and name still use $i for labeling to ensure numbering from 0
1904            
1905             # store this utr seqfeature in a temporary array
1906 66         125 push @cdss, $cds;
1907            
1908             # reset the phase for the next CDS
1909             # phase + (3 - (length % 3)), readjust to 0..2 if necessary
1910             # adapted from Barry Moore's gtf2gff3.pl script
1911 66         122 $phase = $phase + (3 - ( $cds->length % 3) );
1912 66 100       157 $phase -=3 if $phase > 2;
1913             }
1914            
1915             # associate found UTRs with the transcript
1916 10         22 foreach my $cds (@cdss) {
1917 66         87 $transcript->add_SeqFeature($cds);
1918             }
1919             }
1920              
1921             sub add_codons {
1922 0     0   0 my ($self, $transcript, $gene) = @_;
1923 0         0 my $ucsc = $self->ucsc;
1924            
1925             # generate the start and stop codons
1926 0         0 my ($start_codon, $stop_codon);
1927 0 0       0 if ($transcript->strand == 1) {
1928             # forward strand
1929            
1930             # share codons if possible
1931 0 0 0     0 if ($ucsc->share and $gene) {
1932 0         0 $start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1933             $self->cdsStart, $self->cdsStart + 2);
1934 0         0 $stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1935             $self->cdsEnd - 2, $self->cdsEnd);
1936             }
1937            
1938             # start codon
1939 0 0       0 unless ($start_codon) {
1940 0         0 $start_codon = $SFCLASS->new(
1941             -seq_id => $transcript->seq_id,
1942             -source => $transcript->source,
1943             -primary_tag => 'start_codon',
1944             -start => $self->cdsStart,
1945             -end => $self->cdsStart + 2,
1946             -strand => 1,
1947             -phase => 0,
1948             -primary_id => $transcript->primary_id . '.start_codon',
1949             );
1950 0 0       0 $start_codon->display_name( $transcript->display_name . '.start_codon' ) if
1951             $ucsc->do_name;
1952             }
1953            
1954             # stop codon
1955 0 0       0 unless ($stop_codon) {
1956 0         0 $stop_codon = $SFCLASS->new(
1957             -seq_id => $transcript->seq_id,
1958             -source => $transcript->source,
1959             -primary_tag => 'stop_codon',
1960             -start => $self->cdsEnd - 2,
1961             -end => $self->cdsEnd,
1962             -strand => 1,
1963             -phase => 0,
1964             -primary_id => $transcript->primary_id . '.stop_codon',
1965             );
1966 0 0       0 $stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1967             $ucsc->do_name;
1968             }
1969             }
1970            
1971             else {
1972             # reverse strand
1973            
1974             # share codons if possible
1975 0 0 0     0 if ($ucsc->share and $gene) {
1976 0         0 $stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1977             $self->cdsStart, $self->cdsStart + 2);
1978 0         0 $start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1979             $self->cdsEnd - 2, $self->cdsEnd);
1980             }
1981            
1982             # stop codon
1983 0 0       0 unless ($stop_codon) {
1984 0         0 $stop_codon = $SFCLASS->new(
1985             -seq_id => $transcript->seq_id,
1986             -source => $transcript->source,
1987             -primary_tag => 'stop_codon',
1988             -start => $self->cdsStart,
1989             -end => $self->cdsStart + 2,
1990             -strand => -1,
1991             -phase => 0,
1992             -primary_id => $transcript->primary_id . '.stop_codon',
1993             );
1994 0 0       0 $stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1995             $ucsc->do_name;
1996             }
1997            
1998             # start codon
1999 0 0       0 unless ($start_codon) {
2000 0         0 $start_codon = $SFCLASS->new(
2001             -seq_id => $transcript->seq_id,
2002             -source => $transcript->source,
2003             -primary_tag => 'start_codon',
2004             -start => $self->cdsEnd - 2,
2005             -end => $self->cdsEnd,
2006             -strand => -1,
2007             -phase => 0,
2008             -primary_id => $transcript->primary_id . '.start_codon',
2009             -display_name => $transcript->primary_id . '.start_codon',
2010             );
2011 0 0       0 $start_codon->display_name( $transcript->display_name . '.start_codon' ) if
2012             $ucsc->do_name;
2013             }
2014             }
2015            
2016             # associate with transcript
2017 0         0 $transcript->add_SeqFeature($start_codon);
2018 0         0 $transcript->add_SeqFeature($stop_codon);
2019             }
2020              
2021             sub find_gene {
2022 70     70   94 my $self = shift;
2023            
2024             # check if a gene with this name exists
2025 70 100       106 if (exists $self->ucsc->{gene2seqf}->{lc $self->gene_name} ) {
2026             # we found a gene with the same name
2027             # pull out the gene seqfeature(s) array reference
2028             # there may be more than one gene
2029 48         76 my $genes = $self->ucsc->{gene2seqf}->{ lc $self->gene_name };
2030            
2031             # check that the current transcript intersects with the gene
2032             # sometimes we can have two separate transcripts with the
2033             # same gene name, but located on opposite ends of the chromosome
2034             # part of a gene family, but unlikely the same gene 200 Mb in
2035             # length
2036 48         93 foreach my $g (@$genes) {
2037 48 50 33     116 if (
      33        
2038             # overlap method borrowed from Bio::RangeI
2039             ($g->strand == $self->strand) and not (
2040             $g->start > $self->txEnd or
2041             $g->end < $self->txStart
2042             )
2043             ) {
2044             # gene and transcript overlap on the same strand
2045             # we found the intersecting gene
2046 48         96 return $g;
2047             }
2048             }
2049             }
2050 22         34 return;
2051             }
2052              
2053             sub find_existing_subfeature {
2054 303     303   462 my ($self, $gene, $type, $start, $stop) = @_;
2055            
2056             # we will try to find a pre-existing subfeature at identical coordinates
2057 303         499 foreach my $transcript ($gene->get_SeqFeatures()) {
2058             # walk through transcripts
2059 988         3679 foreach my $subfeature ($transcript->get_SeqFeatures()) {
2060             # walk through subfeatures of transcripts
2061 8284 100 100     25763 if (
      100        
2062             $subfeature->primary_tag eq $type and
2063             $subfeature->start == $start and
2064             $subfeature->end == $stop
2065             ) {
2066             # we found a match
2067 163         1398 return $subfeature;
2068             }
2069             }
2070             }
2071 140         525 return;
2072             }
2073              
2074              
2075              
2076              
2077             __END__