File Coverage

blib/lib/Bio/ToolBox/parser/ucsc.pm
Criterion Covered Total %
statement 430 664 64.7
branch 212 382 55.5
condition 71 193 36.7
subroutine 52 58 89.6
pod 22 24 91.6
total 787 1321 59.5


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::ucsc;
2             our $VERSION = '1.65';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::parser::ucsc - Parser for UCSC genePred, refFlat, etc formats
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::parser::ucsc;
11            
12             ### A simple transcript parser
13             my $ucsc = Bio::ToolBox::parser::ucsc->new('file.genePred');
14            
15             ### A full fledged gene parser
16             my $ucsc = Bio::ToolBox::parser::ucsc->new(
17             file => 'ensGene.genePred',
18             do_gene => 1,
19             do_cds => 1,
20             do_utr => 1,
21             ensname => 'ensemblToGene.txt',
22             enssrc => 'ensemblSource.txt',
23             );
24            
25             ### Retrieve one transcript line at a time
26             my $transcript = $ucsc->next_feature;
27            
28             ### Retrieve one assembled gene at a time
29             my $gene = $ucsc->next_top_feature;
30            
31             ### Retrieve array of all assembled genes
32             my @genes = $ucsc->top_features;
33            
34             # Each gene or transcript is a SeqFeatureI compatible object
35             printf "gene %s is located at %s:%s-%s\n",
36             $gene->display_name, $gene->seq_id,
37             $gene->start, $gene->end;
38            
39             # Multiple transcripts can be assembled into a gene
40             foreach my $transcript ($gene->get_SeqFeatures) {
41             # each transcript has exons
42             foreach my $exon ($transcript->get_SeqFeatures) {
43             printf "exon is %sbp long\n", $exon->length;
44             }
45             }
46            
47             # Features can be printed in GFF3 format
48             $gene->version(3);
49             print STDOUT $gene->gff_string(1);
50             # the 1 indicates to recurse through all subfeatures
51            
52              
53             =head1 DESCRIPTION
54              
55             This is a parser for converting UCSC-style gene prediction flat file formats into
56             BioPerl-style L compliant objects, complete with nested objects
57             representing transcripts, exons, CDS, UTRs, start- and stop-codons. Full control
58             is available on what to parse, e.g. exons on, CDS and codons off. Additional gene
59             information can be added by supplying additional tables of information, such as
60             common gene names and descriptions, available from the UCSC repository.
61              
62             =head2 Table formats supported
63              
64             Supported files are tab-delimited text files obtained from UCSC and described
65             at L. Formats are identified
66             by the number of columns, rather than specific file extensions, column name
67             headers, or other metadata. Therefore, unmodified tables should only be used
68             for correct parsing. Some errors are reported for incorrect lines. Unadulterated
69             files can safely be downloaded from L.
70             Files obtained from the UCSC Table Browser can also be used with caution. Files
71             may be gzip compressed.
72              
73             File formats supported include the following.
74              
75             =over 4
76              
77             =item * Gene Prediction (genePred), 10 columns
78              
79             =item * Gene Prediction with RefSeq gene Name (refFlat), 11 columns
80              
81             =item * Extended Gene Prediction (genePredExt), 15 columns
82              
83             =item * Extended Gene Prediction with bin (genePredExt), 16 columns
84              
85             =item * knownGene table, 12 columns
86              
87             =back
88              
89             =head2 Supplemental information
90              
91             The UCSC gene prediction tables include essential information, but not detailed
92             information, such as common gene names, description, protein accession IDs, etc.
93             This additional information can be associated with the genes or transcripts during
94             parsing if the appropriate tables are supplied. These tables can be obtained from
95             the UCSC download site L.
96              
97             Supported tables include the following.
98              
99             =over 4
100              
101             =item * refSeqStatus, for refGene, knownGene, and xenoRefGene tables
102              
103             =item * refSeqSummary, for refGene, knownGene, and xenoRefGene tables
104              
105             =item * ensemblToGeneName, for ensGene tables
106              
107             =item * ensemblSource, for ensGene tables
108              
109             =item * kgXref, for knownGene tables
110              
111             =back
112              
113             =head2 Implementation
114              
115             For an implementation of this module to generate GFF3 formatted files from UCSC
116             data sources, see the L script L.
117              
118             =head1 METHODS
119              
120             =head2 Initalize the parser object
121              
122             =over 4
123              
124             =item new
125              
126             Initiate a UCSC table parser object. Pass a single value (a table file name)
127             to open a table and parse its objects. Alternatively, pass an array of key
128             value pairs to control how the table is parsed. Options include the following.
129              
130             =over 4
131              
132             =item file
133              
134             =item table
135              
136             Provide a file name for a UCSC gene prediction table. The file may be gzip
137             compressed.
138              
139             =item source
140              
141             Pass a string to be added as the source tag value of the SeqFeature objects.
142             The default value is 'UCSC'. If the file name has a recognizable name,
143             such as 'refGene' or 'ensGene', it will be used instead.
144              
145             =item do_gene
146              
147             Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
148             name under a single gene object. Default is true.
149              
150             -item do_exon
151              
152             =item do_cds
153              
154             =item do_utr
155              
156             =item do_codon
157              
158             Pass a boolean (1 or 0) value to parse certain subfeatures, including exon,
159             CDS, five_prime_UTR, three_prime_UTR, stop_codon, and start_codon features.
160             Default is false.
161              
162             =item do_name
163              
164             Pass a boolean (1 or 0) value to assign names to subfeatures, including exons,
165             CDSs, UTRs, and start and stop codons. Default is false.
166              
167             =item share
168              
169             Pass a boolean (1 or 0) value to recycle shared subfeatures (exons and UTRs)
170             between multiple transcripts of the same gene. This results in reduced
171             memory usage, and smaller exported GFF3 files. Default is true.
172              
173             =item refseqsum
174              
175             =item refseqstat
176              
177             =item kgxref
178              
179             =item ensembltogene
180              
181             =item ensemblsource
182              
183             Pass the appropriate file name for additional information.
184              
185             =item class
186              
187             Pass the name of a L compliant class that will be used to
188             create the SeqFeature objects. The default is to use L.
189              
190             =back
191              
192             =back
193              
194             =head2 Modify the parser object
195              
196             These methods set or retrieve parameters, and load supplemental files and
197             new tables.
198              
199             =over 4
200              
201             =item source
202              
203             =item do_gene
204              
205             =item do_exon
206              
207             =item do_cds
208              
209             =item do_utr
210              
211             =item do_codon
212              
213             =item do_name
214              
215             =item share
216              
217             These methods retrieve or set parameters to the parsing engine, same as
218             the options to the new method.
219              
220             =item fh
221              
222             Set or retrieve the file handle of the current table. This module uses
223             L objects. Be careful manipulating file handles of open tables!
224              
225             =item open_file
226              
227             Pass the name of a new table to parse. Existing gene models loaded in
228             memory, if any, are discarded. Counts are reset to 0. Supplemental
229             tables are not discarded.
230              
231             =item load_extra_data($file, $type)
232              
233             my $file = 'hg19_refSeqSummary.txt.gz';
234             my success = $ucsc->load_extra_data($file, 'summary');
235              
236             Pass two values, the file name of the supplemental file and the type
237             of supplemental data. Values can include the following
238              
239             =over 4
240              
241             =item * refseqstatus or status
242              
243             =item * refseqsummary or summary
244              
245             =item * kgxref
246              
247             =item * ensembltogene or ensname
248              
249             =item * ensemblsource or enssrc
250              
251             =back
252              
253             The number of transcripts with information loaded from the supplemental
254             data file is returned.
255              
256             =back
257              
258             =head2 Feature retrieval
259              
260             The following methods parse the table lines into SeqFeature objects.
261             It is best if methods are not mixed; unexpected results may occur.
262              
263             =over 4
264              
265             =item next_feature
266              
267             This will read the next line of the table and parse it into a gene or
268             transcript object. However, multiple transcripts from the same gene are
269             not assembled together under the same gene object.
270              
271             =item next_top_feature
272              
273             This method will return all top features (typically genes), with multiple
274             transcripts of the same gene assembled under the same gene object. Transcripts
275             are assembled together if they share the same gene name and the transcripts
276             overlap. If transcripts share the same gene name but do not overlap, they
277             are placed into separate gene objects with the same name but different
278             C tags. Calling this method will parse the entire table into
279             memory (so that multiple transcripts may be assembled), but only one object
280             is returned at a time. Call this method repeatedly using a while loop to
281             get all features.
282              
283             =item top_features
284              
285             This method is similar to L, but instead returns an array
286             of all the top features.
287              
288             =back
289              
290             =head2 Other methods
291              
292             Additional methods for working with the parser object and the parsed
293             SeqFeature objects.
294              
295             =over 4
296              
297             =item parse_table
298              
299             Parses the table into memory. If a table wasn't provided using the
300             L or L methods, then a filename can be passed to this
301             method and it will automatically be opened for you.
302              
303             =item find_gene
304              
305             my $gene = $ucsc->find_gene(
306             display_name => 'ABC1',
307             primary_id => 'gene000001',
308             );
309              
310             Pass a gene name, or an array of key =E values (name, display_name,
311             ID, primary_ID, and/or coordinate information), that can be used
312             to find a gene already loaded into memory. Only really successful if the
313             entire table is loaded into memory. Genes with a matching name are
314             confirmed by a matching ID or overlapping coordinates, if available.
315             Otherwise the first match is returned.
316              
317             =item counts
318              
319             This method will return a hash of the number of genes and RNA types that
320             have been parsed.
321              
322             =item typelist
323              
324             This method will return a comma-delimited list of the feature types or
325             Cs found in the parsed file. Returns a generic list if a
326             file has not been parsed.
327              
328             =item from_ucsc_string
329              
330             A bare bones method that will convert a tab-delimited text line from a UCSC
331             formatted gene table into a SeqFeature object for you. Don't expect alternate
332             transcripts to be assembled into genes.
333              
334             =item seq_ids
335              
336             Returns an array or array reference of the names of the chromosomes or
337             reference sequences present in the table.
338              
339             =item seq_id_lengths
340              
341             Returns a hash reference to the chromosomes or reference sequences and
342             their corresponding lengths. In this case, the length is inferred by the
343             greatest gene end position.
344              
345             =back
346              
347             =head2 Bio::ToolBox::parser::ucsc::builder
348              
349             This is a private module that is responsible for building SeqFeature
350             objects from UCSC table lines. It is not intended for general public use.
351              
352             =head1 SEE ALSO
353              
354             L, L
355              
356             =cut
357              
358 2     2   110107 use strict;
  2         12  
  2         69  
359 2     2   10 use Carp qw(carp cluck croak);
  2         5  
  2         103  
360 2     2   639 use Bio::ToolBox::Data::file; # only used to get an open filehandle
  2         5  
  2         6561  
361              
362             1;
363              
364             sub new {
365 5     5 1 4700 my $class = shift;
366 5         103 my $self = {
367             'fh' => undef,
368             'version' => undef,
369             'source' => 'UCSC',
370             'top_features' => [],
371             'gene2seqf' => {},
372             'id2count' => {},
373             'counts' => {},
374             'do_gene' => 1,
375             'do_exon' => 0,
376             'do_cds' => 0,
377             'do_utr' => 0,
378             'do_codon' => 0,
379             'do_name' => 0,
380             'share' => 1,
381             'refseqsum' => {},
382             'refseqstat' => {},
383             'kgxref' => {},
384             'ensembldata' => {},
385             'eof' => 0,
386             'line_count' => 0,
387             'sfclass' => 'Bio::ToolBox::SeqFeature', # default class
388             'seq_ids' => {},
389             };
390 5         18 bless $self, $class;
391            
392             # check for options
393 5 100       21 if (@_) {
394 3 50       13 if (scalar(@_) == 1) {
395             # short and sweet, just a file, we assume
396 0         0 my $file = shift @_;
397 0         0 $self->open_file($file);
398             }
399             else {
400 3         22 my %options = @_;
401 3 50 33     19 if (exists $options{file} or $options{table}) {
402 3   33     12 $options{file} ||= $options{table};
403 3         16 $self->open_file( $options{file} );
404             }
405 3 100       14 if (exists $options{do_gene}) {
406 1         5 $self->do_gene($options{do_gene});
407             }
408 3 50       11 if (exists $options{do_exon}) {
409 3         14 $self->do_exon($options{do_exon});
410             }
411 3 100       11 if (exists $options{do_cds}) {
412 1         5 $self->do_cds($options{do_cds});
413             }
414 3 100       11 if (exists $options{do_utr}) {
415 1         4 $self->do_utr($options{do_utr});
416             }
417 3 50       9 if (exists $options{do_codon}) {
418 0         0 $self->do_codon($options{do_codon});
419             }
420 3 50       10 if (exists $options{do_name}) {
421 0         0 $self->do_name($options{do_name});
422             }
423 3 50       34 if (exists $options{share}) {
424 0         0 $self->share($options{share});
425             }
426 3 50       12 if (exists $options{source}) {
427 0         0 $self->source($options{source});
428             }
429 3 50       16 if (exists $options{refseqsum}) {
    50          
430 0         0 $self->load_extra_data($options{refseqsum}, 'refseqsum');
431             }
432             elsif (exists $options{summary}) {
433 0         0 $self->load_extra_data($options{summary}, 'refseqsum');
434             }
435 3 50       16 if (exists $options{refseqstat}) {
    50          
436 0         0 $self->load_extra_data($options{refseqstat}, 'refseqstat');
437             }
438             elsif (exists $options{status}) {
439 0         0 $self->load_extra_data($options{status}, 'refseqstat');
440             }
441 3 50       10 if (exists $options{kgxref}) {
442 0         0 $self->load_extra_data($options{kgxref}, 'kgxref');
443             }
444 3 50       16 if (exists $options{ensembltogenename}) {
    50          
445 0         0 $self->load_extra_data($options{ensembltogenename}, 'ensembltogene');
446             }
447             elsif (exists $options{ensname}) {
448 0         0 $self->load_extra_data($options{ensname}, 'ensembltogene');
449             }
450 3 50       24 if (exists $options{ensemblsource}) {
    100          
451 0         0 $self->load_extra_data($options{ensemblsource}, 'ensemblsource');
452             }
453             elsif (exists $options{enssrc}) {
454 1         9 $self->load_extra_data($options{enssrc}, 'ensemblsource');
455             }
456 3 100       14 if (exists $options{class}) {
457 2         7 $self->{sfclass} = $options{class};
458             }
459             }
460             }
461            
462             # done
463 5         20 return $self;
464             }
465              
466             sub version {
467 0     0 0 0 return shift->{version};
468             }
469              
470             sub source {
471 123     123 1 203 my $self = shift;
472 123 100       271 if (@_) {
473 5         27 $self->{'source'} = shift;
474             }
475 123         380 return $self->{'source'};
476             }
477              
478             sub simplify {
479             # this doesn't do anything, for now, but maintain compatibility with gff parser
480 2     2 0 5 return 0;
481             }
482              
483             sub do_gene {
484 95     95 1 920 my $self = shift;
485 95 100       213 if (@_) {
486 3         6 $self->{'do_gene'} = shift;
487             }
488 95         263 return $self->{'do_gene'};
489             }
490              
491             sub do_exon {
492 93     93 1 142 my $self = shift;
493 93 100       193 if (@_) {
494 4         10 $self->{'do_exon'} = shift;
495             }
496 93         215 return $self->{'do_exon'};
497             }
498              
499             sub do_cds {
500 55     55 1 81 my $self = shift;
501 55 100       137 if (@_) {
502 1         2 $self->{'do_cds'} = shift;
503             }
504 55         146 return $self->{'do_cds'};
505             }
506              
507             sub do_utr {
508 53     53 1 86 my $self = shift;
509 53 100       121 if (@_) {
510 1         2 $self->{'do_utr'} = shift;
511             }
512 53         127 return $self->{'do_utr'};
513             }
514              
515             sub do_codon {
516 54     54 1 90 my $self = shift;
517 54 50       120 if (@_) {
518 0         0 $self->{'do_codon'} = shift;
519             }
520 54         118 return $self->{'do_codon'};
521             }
522              
523             sub do_name {
524 231     231 1 355 my $self = shift;
525 231 50       506 if (@_) {
526 0         0 $self->{'do_name'} = shift;
527             }
528 231         517 return $self->{'do_name'};
529             }
530              
531             sub share {
532 394     394 1 620 my $self = shift;
533 394 50       777 if (@_) {
534 0         0 $self->{'share'} = shift;
535             }
536 394         1325 return $self->{'share'};
537             }
538              
539             sub fh {
540 234     234 1 1221 my $self = shift;
541 234 100       478 if (@_) {
542 7         19 $self->{'fh'} = shift;
543             }
544 234         3065 return $self->{'fh'};
545             }
546              
547             sub open_file {
548 7     7 1 14 my $self = shift;
549            
550             # check file
551 7         13 my $filename = shift;
552 7 50       20 unless ($filename) {
553 0         0 cluck("no file name passed!");
554 0         0 return;
555             }
556            
557             # Open filehandle object
558 7 50       85 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
559             croak " cannot open file '$filename'!\n";
560            
561             # check number of columns
562 7         11 my $ncol;
563 7         199 while (my $line = $fh->getline) {
564 14 100       1412 next if $line =~ /^#/;
565 7 50       44 next unless $line =~ /\w+/;
566 7         51 my @fields = split "\t", $line;
567 7         17 $ncol = scalar(@fields);
568 7         25 last;
569             }
570 7 0 33     26 unless ($ncol == 16 or $ncol == 15 or $ncol == 12 or $ncol == 11 or $ncol == 10) {
      33        
      0        
      0        
571 0         0 carp " File '$filename' doesn't have recognizable number of columns! It has $ncol.";
572 0         0 return;
573             }
574 7 50       24 if ($ncol == 10) {
575             # turn off gene processing for simple genePred which has no gene names
576 0         0 $self->do_gene(0);
577             }
578            
579             # reopen file handle
580 7         38 $fh->close;
581 7         214 $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
582            
583             # reset source as necessary
584 7 100 66     78 if ($filename =~ /ensgene/i and $self->source eq 'UCSC') {
    50 33        
    50 33        
    50 33        
    50 33        
585 5         20 $self->source('EnsGene');
586             }
587             elsif ($filename =~ /xenorefgene/i and $self->source eq 'UCSC') {
588 0         0 $self->source('xenoRefGene');
589             }
590             elsif ($filename =~ /refgene/i and $self->source eq 'UCSC') {
591 0         0 $self->source('refGene');
592             }
593             elsif ($filename =~ /refseq/i and $self->source eq 'UCSC') {
594 0         0 $self->source('refSeq');
595             }
596             elsif ($filename =~ /knowngene/i and $self->source eq 'UCSC') {
597 0         0 $self->source('knownGene');
598             }
599            
600             # check existing data
601             # the reason this parser can handle reading a second file without having to make a
602             # new parser object (like gff and bed) is so that we can potentially recycle the
603             # extra UCSC information
604 7 100       26 if ($self->fh) {
605             # close existing
606 2         7 $self->fh->close;
607             # go ahead and clear out existing data
608 2         45 $self->{'version'} = undef;
609 2         8 $self->{'top_features'} = [];
610 2         18 $self->{'duplicate_ids'} = {};
611 2         7 $self->{'gene2seqf'} = {};
612 2         8 $self->{'id2count'} = {};
613 2         5 $self->{'counts'} = {};
614 2         7 $self->{'eof'} = 0;
615 2         5 $self->{'line_count'} = 0;
616             }
617            
618 7         26 $self->fh($fh);
619 7         24 return 1;
620             }
621              
622             sub load_extra_data {
623 5     5 1 1352 my ($self, $file, $type) = @_;
624 5 50       25 unless ($file) {
625 0         0 cluck "no file name passed!";
626 0         0 return;
627             }
628            
629             # check the type
630 5 100       53 if ($type =~ /ensembltogene|ensname/i) {
    50          
    0          
    0          
    0          
631 2         5 $type = 'ensembltogene';
632             }
633             elsif ($type =~ /ensemblsource|enssrc/i) {
634 3         6 $type = 'ensemblsource';
635             }
636             elsif ($type =~ /refseqstat|status/i) {
637 0         0 $type = 'refseqstat';
638             }
639             elsif ($type =~ /refseqsum|summary/i) {
640 0         0 $type = 'refseqsum';
641             }
642             elsif ($type =~ /kgxref/i) {
643 0         0 $type = 'kgxref';
644             }
645             else {
646 0         0 carp "unknown type '$type' to load extra data";
647 0         0 return;
648             }
649            
650 5         34 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($file);
651 5 50       23 unless ($fh) {
652 0         0 carp "unable to open file '$file'! $!";
653 0         0 return;
654             }
655            
656             # load ensembl data
657 5         13 my $count = 0;
658 5 50       24 if ($type =~ /ensembl/) {
659             # we will store gene name in position 0, and source in position 1
660 5 100       18 my $index = $type eq 'ensembltogene' ? 0 : 1;
661 5         129 while (my $line = $fh->getline) {
662            
663             # process line
664 85         2836 chomp $line;
665 85 50       202 next if ($line =~ /^#/);
666 85         193 my @line_data = split /\t/, $line;
667 85 50       170 if (scalar @line_data != 2) {
668 0         0 carp " file $file doesn't seem right!? Line has " .
669             scalar @line_data . " elements!\n";
670 0         0 return;
671             }
672            
673             # store data into hash
674 85         238 $self->{'ensembldata'}{ $line_data[0] }->[$index] = $line_data[1];
675 85         1481 $count++;
676             }
677             }
678            
679             # load various refSeq data
680             else {
681             # we just store the line data based on the gene ID
682             # each table has different elements
683             # here they are for reference
684            
685             ### refSeqStatus table
686             # 0 mrnaAcc RefSeq gene accession name
687             # 1 status Status ('Unknown', 'Reviewed', 'Validated', 'Provisional', 'Predicted', 'Inferred')
688             # 2 molecule type ('DNA', 'RNA', 'ds-RNA', 'ds-mRNA', 'ds-rRNA', 'mRNA', 'ms-DNA', 'ms-RNA', 'rRNA', 'scRNA', 'snRNA', 'snoRNA', 'ss-DNA', 'ss-RNA', 'ss-snoRNA', 'tRNA', 'cRNA', 'ss-cRNA', 'ds-cRNA', 'ms-rRNA') values molecule type
689            
690             ### refSeqSummary table
691             # 0 RefSeq mRNA accession
692             # 1 completeness FullLength ('Unknown', 'Complete5End', 'Complete3End', 'FullLength', 'IncompleteBothEnds', 'Incomplete5End', 'Incomplete3End', 'Partial')
693             # 1 summary text values Summary comments
694            
695             ### kgXref table
696             # 0 kgID Known Gene ID
697             # 1 mRNA mRNA ID
698             # 2 spID SWISS-PROT protein Accession number
699             # 3 spDisplayID SWISS-PROT display ID
700             # 4 geneSymbol Gene Symbol
701             # 5 refseq RefSeq ID
702             # 6 protAcc NCBI protein Accession number
703             # 7 description Description
704            
705             ### ensemblToGeneName table
706             # 0 Ensembl transcript ID
707             # 1 gene name
708            
709             # load the table
710 0         0 while (my $line = $fh->getline) {
711 0         0 chomp $line;
712 0 0       0 next if ($line =~ /^#/);
713 0         0 my @line_data = split /\t/, $line;
714            
715             # the unique id should be the first element in the array
716             # take it off the array, since it doesn't need to be stored there too
717 0         0 my $id = shift @line_data;
718            
719             # check for duplicate lines
720 0 0       0 if (exists $self->{$type}{$id} ) {
721 0         0 warn " $type line for identifier $id exists twice!\n";
722 0         0 next;
723             }
724            
725             # store data into hash
726 0         0 $self->{$type}{$id} = [@line_data];
727 0         0 $count++;
728             }
729             }
730            
731 5         184 $fh->close;
732 5         104 return $count;
733             }
734              
735             sub typelist {
736 3     3 1 5 my $self = shift;
737 3         5 my @items;
738 3         5 foreach my $k (keys %{$self->{counts}}) {
  3         13  
739 0 0       0 push @items, $k if $self->{counts}{$k} > 0;
740             }
741 3 50       8 if (@items) {
742 0         0 return join(',', @items);
743             }
744             else {
745             # return generic list
746 3 100       10 return $self->do_gene ? 'gene,mRNA,ncRNA,exon,CDS' : 'mRNA,ncRNA,exon,CDS';
747             }
748             }
749              
750             sub next_feature {
751 92     92 1 1660 my $self = shift;
752            
753             # check that we have an open filehandle
754 92 50       188 unless ($self->fh) {
755 0         0 croak("no UCSC file loaded to parse!");
756             }
757            
758 92         180 while (my $line = $self->fh->getline) {
759 114         2987 chomp $line;
760 114 100 66     614 if ($line =~ /^#/ or $line !~ /\w+/) {
761 27         91 $self->{line_count}++;
762 27         85 next;
763             }
764 87         306 my $builder = Bio::ToolBox::parser::ucsc::builder->new($line, $self);
765 87         141 $self->{line_count}++;
766 87 50       205 unless ($builder) {
767             # builder will print its own error message if fails
768 0         0 warn " unable to parse line number ", $self->{line_count}, "\n";
769 0         0 next;
770             }
771            
772             # generate the feature from the line
773 87         128 my $feature;
774 87 100       217 if ($self->do_gene) {
775 70         152 $feature = $builder->build_gene;
776             }
777             else {
778 17         41 $feature = $builder->build_transcript;
779             }
780            
781             # return the object, we do this while loop once per valid line
782 87         619 return $feature;
783             }
784            
785             # presumed end of file
786 5         228 $self->{'eof'} = 1;
787 5         18 return;
788             }
789              
790             sub next_top_feature {
791 25     25 1 1157 my $self = shift;
792 25 50       56 unless ($self->{'eof'}) {
793 0         0 $self->parse_table;
794             }
795 25         35 return shift @{ $self->{top_features} };
  25         72  
796             }
797              
798             sub top_features {
799 2     2 1 2018 my $self = shift;
800 2 50       10 unless ($self->{'eof'}) {
801 0         0 $self->parse_table;
802             }
803 2         5 my @features = @{ $self->{top_features} };
  2         8  
804 2 50       10 return wantarray ? @features : \@features;
805             }
806              
807             *parse_file = \&parse_table;
808              
809             sub parse_table {
810 5     5 1 11099 my $self = shift;
811 5 100       19 if (@_) {
812 2 50       9 $self->open_file(shift) or return;
813             }
814 5 50       17 unless ($self->fh) {
815 0         0 carp "must open a file first!";
816 0         0 return;
817             }
818 5 50       22 return if ($self->{'eof'});
819            
820             #### Main Loop
821 5         217 print " Parsing UCSC gene table....\n";
822 5         34 while (my $feature = $self->next_feature) {
823            
824             # add to gene2seqf hash
825 85         224 my $gene = $self->find_gene($feature);
826 85 50       195 unless ($gene) {
827             # the current feature is not in the hash, so add it
828 85         270 $self->{gene2seqf}->{ lc $feature->display_name } = [ $feature ];
829             }
830            
831             # check chromosome
832 85         327 my $s = $feature->seq_id;
833 85 100       318 unless (exists $self->{seq_ids}{$s}) {
834 5         25 $self->{seq_ids}{$s} = $feature->end;
835             }
836 85 100       224 $self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
837             }
838            
839             # add to the top list of features, Schwartzian transform and sort
840             # based on the genes found in the gene2seqf hash
841 5         19 push @{ $self->{top_features} },
842 37         64 map {$_->[2]}
843 79 0       248 sort {$a->[0] cmp $b->[0] or $a->[1] <=> $b->[1]}
844             map [$_->seq_id, $_->start, $_],
845 37         85 map @{ $self->{gene2seqf}->{$_} },
846 5         23 keys %{$self->{gene2seqf}};
  5         31  
847            
848 5         35 return 1;
849             }
850              
851             sub find_gene {
852 87     87 1 29372 my $self = shift;
853            
854             # get the name and coordinates from arguments
855 87         155 my ($name, $id, $chrom, $start, $end, $strand);
856 87 50       246 if (scalar @_ == 0) {
    50          
857 0         0 carp "must provide information to find_gene method!";
858 0         0 return;
859             }
860             elsif (scalar @_ == 1) {
861 87         148 $name = $_[0];
862             }
863             else {
864 0         0 my %opt = @_;
865 0   0     0 $name = $opt{name} || $opt{display_name} || undef;
866 0   0     0 $id = $opt{id} || $opt{primary_id} || undef;
867 0   0     0 $chrom = $opt{chrom} || $opt{seq_id} || undef;
868 0   0     0 $start = $opt{start} || undef;
869 0   0     0 $end = $opt{stop} || $opt{end} || undef;
870 0   0     0 $strand = $opt{strand} || 0;
871             }
872 87 50       212 unless ($name) {
873 0         0 carp "name is required for find_gene!";
874 0         0 return;
875             }
876            
877             # check if a gene with this name exists
878 87 100       410 if (exists $self->{gene2seqf}->{lc $name} ) {
879             # we found a matching gene
880            
881             # pull out the gene seqfeature(s) array reference
882             # there may be more than one gene
883 2         8 my $genes = $self->{gene2seqf}->{ lc $name };
884            
885             # go through a series of checks to find the appropriate
886 2 50       9 if ($id) {
887 0         0 foreach my $g (@$genes) {
888 0 0       0 if ($g->primary_id eq $id) {
889 0         0 return $g;
890             }
891             }
892 0         0 return; # none of these matched despite having an ID
893             }
894 2 0 33     8 if ($chrom and $start and $end) {
      33        
895 0         0 foreach my $g (@$genes) {
896 0 0 0     0 if (
      0        
897             # overlap method borrowed from Bio::RangeI
898             ($g->strand == $strand) and not (
899             $g->start > $end or
900             $g->end < $start
901             )
902             ) {
903             # gene and transcript overlap on the same strand
904             # we found the intersecting gene
905 0         0 return $g;
906             }
907             }
908 0         0 return; # none of these matched despite having coordinate info
909             }
910 2 50       9 if (scalar @$genes == 1) {
    0          
911             # going on trust here that this is the one
912 2         7 return $genes->[0];
913             }
914             elsif (scalar @$genes > 1) {
915 0         0 carp "more than one gene named $name found!";
916 0         0 return $genes->[0];
917             }
918            
919             # nothing suitable found
920 0         0 return;
921             }
922             }
923              
924             sub counts {
925 2     2 1 4298 my $self = shift;
926 2         7 my %counts = %{ $self->{counts} };
  2         19  
927 2 50       20 return wantarray ? %counts : \%counts;
928             }
929              
930             sub from_ucsc_string {
931 0     0 1 0 my ($self, $string) = @_;
932 0 0       0 return unless $string;
933 0         0 my $builder = Bio::ToolBox::parser::ucsc::builder->new($string, $self);
934 0 0       0 return unless $builder;
935 0 0       0 if ($self->do_gene) {
936 0         0 return $builder->build_gene;
937             }
938             else {
939 0         0 return $builder->build_transcript;
940             }
941             }
942              
943             sub seq_ids {
944 0     0 1 0 my $self = shift;
945 0         0 my @s = keys %{$self->{seq_ids}};
  0         0  
946 0 0       0 return wantarray ? @s : \@s;
947             }
948              
949             sub seq_id_lengths {
950 0     0 1 0 my $self = shift;
951 0         0 return $self->{seq_ids};
952             }
953              
954              
955              
956             package Bio::ToolBox::parser::ucsc::builder;
957 2     2   23 use strict;
  2         5  
  2         79  
958 2     2   15 use Carp qw(carp cluck croak);
  2         3  
  2         10658  
959             our $SFCLASS = ''; # SeqFeature class to use
960              
961             1;
962              
963             sub new {
964 173     173   453 my ($class, $line, $ucsc) = @_;
965 173         314 my %self;
966             my $format;
967            
968             # check SeqFeature class
969 173 100       507 if ($ucsc->{sfclass} ne $SFCLASS) {
970 3         11 $SFCLASS = $ucsc->{sfclass};
971 3 50       193 eval "require $SFCLASS" or croak $@;
972             }
973            
974 173         319 chomp $line;
975 173         824 my @linedata = split /\t/, $line;
976            
977             # we're identifying the type of table based on the number of columns
978             # may not be the best or accurate method, but it generally works for valid formats
979            
980             ### Extended Gene Prediction Table ###
981 173 100       575 if (scalar @linedata == 16) {
    50          
    50          
    50          
    50          
982             # an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
983             # as downloaded from the UCSC Table Browser or FTP site
984             # includes the bin number as the first column
985            
986             # 0 bin
987             # 1 name
988             # 2 chrom
989             # 3 strand
990             # 4 txStart
991             # 5 txEnd
992             # 6 cdsStart
993             # 7 cdsEnd
994             # 8 exonCount
995             # 9 exonStarts
996             # 10 exonEnds
997             # 11 score
998             # 12 name2
999             # 13 cdsStartStat
1000             # 14 cdsEndStat
1001             # 15 exonFrames
1002            
1003 87         142 $format = 'genePredExt with bin';
1004 87         209 $self{name} = $linedata[1];
1005 87         163 $self{chrom} = $linedata[2];
1006 87         152 $self{strand} = $linedata[3];
1007 87         251 $self{txStart} = $linedata[4] + 1;
1008 87         154 $self{txEnd} = $linedata[5];
1009 87         149 $self{cdsStart} = $linedata[6] + 1;
1010 87         143 $self{cdsEnd} = $linedata[7];
1011 87         229 $self{exonCount} = $linedata[8];
1012 87         143 $self{exonStarts} = $linedata[9];
1013 87         143 $self{exonEnds} = $linedata[10];
1014 87   50     211 $self{name2} = $linedata[12] || undef;
1015 87   50     422 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] ||
1016             $linedata[12] || undef;
1017 87   50     355 $self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] || undef;
1018 87   50     331 $self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] || undef;
1019 87   50     278 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] || undef;
1020 87 50       241 if ($linedata[1] =~ /^N[MR]_\d+/) {
1021 0         0 $self{refseq} = $linedata[1];
1022             }
1023             }
1024             ### Extended Gene Prediction Table ###
1025             elsif (scalar @linedata == 15) {
1026             # an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
1027             # without the bin value
1028            
1029             # 0 name
1030             # 1 chrom
1031             # 2 strand
1032             # 3 txStart
1033             # 4 txEnd
1034             # 5 cdsStart
1035             # 6 cdsEnd
1036             # 7 exonCount
1037             # 8 exonStarts
1038             # 9 exonEnds
1039             # 10 score
1040             # 11 name2
1041             # 12 cdsStartStat
1042             # 13 cdsEndStat
1043             # 14 exonFrames
1044            
1045 0         0 $format = 'genePredExt';
1046 0         0 $self{name} = $linedata[0];
1047 0         0 $self{chrom} = $linedata[1];
1048 0         0 $self{strand} = $linedata[2];
1049 0         0 $self{txStart} = $linedata[3] + 1;
1050 0         0 $self{txEnd} = $linedata[4];
1051 0         0 $self{cdsStart} = $linedata[5] + 1;
1052 0         0 $self{cdsEnd} = $linedata[6];
1053 0         0 $self{exonCount} = $linedata[7];
1054 0         0 $self{exonStarts} = $linedata[8];
1055 0         0 $self{exonEnds} = $linedata[9];
1056 0   0     0 $self{name2} = $linedata[11] || undef;
1057 0   0     0 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[0] }->[0] ||
1058             $linedata[11] || undef;
1059 0   0     0 $self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] || undef;
1060 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] || undef;
1061 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] || undef;
1062 0 0       0 if ($linedata[0] =~ /^N[MR]_\d+/) {
1063 0         0 $self{refseq} = $linedata[0];
1064             }
1065             }
1066             ### Known Gene Table ###
1067             elsif (scalar @linedata == 12) {
1068            
1069             # 0 name known gene identifier
1070             # 1 chrom Reference sequence chromosome or scaffold
1071             # 2 strand + or - for strand
1072             # 3 txStart Transcription start position
1073             # 4 txEnd Transcription end position
1074             # 5 cdsStart Coding region start
1075             # 6 cdsEnd Coding region end
1076             # 7 exonCount Number of exons
1077             # 8 exonStarts Exon start positions
1078             # 9 exonEnds Exon end positions
1079             # 10 proteinID UniProt display ID for Known Genes, UniProt accession or RefSeq protein ID for UCSC Genes
1080             # 11 alignID Unique identifier for each (known gene, alignment position) pair
1081            
1082 0         0 $format = 'knownGene';
1083 0   0     0 $self{name} = $ucsc->{kgxref}->{ $linedata[0] }->[0] || $linedata[0];
1084 0         0 $self{chrom} = $linedata[1];
1085 0         0 $self{strand} = $linedata[2];
1086 0         0 $self{txStart} = $linedata[3] + 1;
1087 0         0 $self{txEnd} = $linedata[4];
1088 0         0 $self{cdsStart} = $linedata[5] + 1;
1089 0         0 $self{cdsEnd} = $linedata[6];
1090 0         0 $self{exonCount} = $linedata[7];
1091 0         0 $self{exonStarts} = $linedata[8];
1092 0         0 $self{exonEnds} = $linedata[9];
1093 0         0 $self{name2} = $linedata[0];
1094             $self{gene_name} = $ucsc->{kgxref}->{ $linedata[0] }->[3] || # geneSymbol
1095             $ucsc->{kgxref}->{ $linedata[0] }->[0] || # mRNA id
1096 0   0     0 $ucsc->{kgxref}->{ $linedata[0] }->[4] || # refSeq id
1097             $linedata[0]; # ugly default
1098 0   0     0 $self{note} = $ucsc->{kgxref}->{ $linedata[0] }->[6] || undef;
1099 0   0     0 $self{refseq} = $ucsc->{kgxref}->{ $linedata[0] }->[4] || undef;
1100 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $self{refseq} }->[0] || undef;
1101 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $self{refseq} }->[0] || undef;
1102 0   0     0 $self{spid} = $ucsc->{kgxref}->{ $linedata[0] }->[1] || undef; # SwissProt ID
1103 0   0     0 $self{spdid} = $ucsc->{kgxref}->{ $linedata[0] }->[2] || undef; # SwissProt display ID
1104 0   0     0 $self{protacc} = $ucsc->{kgxref}->{ $linedata[0] }->[5] || undef; # NCBI protein accession
1105             }
1106             ### refFlat or Gene Prediction Table ###
1107             elsif (scalar @linedata == 11) {
1108            
1109             # 0 name2 or gene name
1110             # 1 name or transcript name
1111             # 2 chrom
1112             # 3 strand
1113             # 4 txStart
1114             # 5 txEnd
1115             # 6 cdsStart
1116             # 7 cdsEnd
1117             # 8 exonCount
1118             # 9 exonStarts
1119             # 10 exonEnds
1120            
1121 0         0 $format = 'refFlat';
1122 0   0     0 $self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] ||
1123             $linedata[0] || undef;
1124 0         0 $self{name2} = $linedata[0];
1125 0         0 $self{name} = $linedata[1];
1126 0         0 $self{chrom} = $linedata[2];
1127 0         0 $self{strand} = $linedata[3];
1128 0         0 $self{txStart} = $linedata[4] + 1;
1129 0         0 $self{txEnd} = $linedata[5];
1130 0         0 $self{cdsStart} = $linedata[6] + 1;
1131 0         0 $self{cdsEnd} = $linedata[7];
1132 0         0 $self{exonCount} = $linedata[8];
1133 0         0 $self{exonStarts} = $linedata[9];
1134 0         0 $self{exonEnds} = $linedata[10];
1135 0   0     0 $self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] || undef;
1136 0   0     0 $self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] || undef;
1137 0   0     0 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] || undef;
1138 0 0       0 if ($linedata[1] =~ /^N[MR]_\d+/) {
1139 0         0 $self{refseq} = $linedata[1];
1140             }
1141             }
1142             ### Gene Prediction Table ###
1143             elsif (scalar @linedata == 10) {
1144             # a simple gene prediction table, e.g. refGene, ensGene, xenoRefGene
1145            
1146             # 0 name
1147             # 1 chrom
1148             # 2 strand
1149             # 3 txStart
1150             # 4 txEnd
1151             # 5 cdsStart
1152             # 6 cdsEnd
1153             # 7 exonCount
1154             # 8 exonStarts
1155             # 9 exonEnds
1156            
1157 86         129 $format = 'genePred';
1158 86         211 $self{name} = $linedata[0];
1159 86         185 $self{chrom} = $linedata[1];
1160 86         152 $self{strand} = $linedata[2];
1161 86         185 $self{txStart} = $linedata[3] + 1;
1162 86         142 $self{txEnd} = $linedata[4];
1163 86         141 $self{cdsStart} = $linedata[5] + 1;
1164 86         153 $self{cdsEnd} = $linedata[6];
1165 86         250 $self{exonCount} = $linedata[7];
1166 86         143 $self{exonStarts} = $linedata[8];
1167 86         151 $self{exonEnds} = $linedata[9];
1168 86         126 $self{name2} = $linedata[0]; # re-use transcript name
1169 86         140 $self{gene_name} = $linedata[0]; # re-use transcript name
1170 86   50     405 $self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] || undef;
1171 86   50     315 $self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] || undef;
1172 86   50     268 $self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] || undef;
1173 86 50       216 if ($linedata[0] =~ /^N[MR]_\d+/) {
1174 0         0 $self{refseq} = $linedata[0];
1175             }
1176             }
1177             else {
1178             # unrecognized line format
1179 0         0 carp "unrecognized format, line has " . scalar @linedata . "elements";
1180 0         0 return;
1181             }
1182            
1183             # verify
1184 173         353 my @errors;
1185 173 50       612 push @errors, 'strand' unless $self{strand} =~ /^[\+\-]$/;
1186 173 50       651 push @errors, 'txStart' unless $self{txStart} =~ /^\d+$/;
1187 173 50       487 push @errors, 'txEnd' unless $self{txEnd} =~ /^\d+$/;
1188 173 50       555 push @errors, 'cdsStart' unless $self{cdsStart} =~ /^\d+$/;
1189 173 50       479 push @errors, 'cdsEnd' unless $self{cdsEnd} =~ /^\d+$/;
1190 173 50       456 push @errors, 'exonCount' unless $self{exonCount} =~ /^\d+$/;
1191 173 50       455 push @errors, 'exonStarts' unless $self{exonStarts} =~ /^[\d,]+$/;
1192 173 50       497 push @errors, 'exonEnds' unless $self{exonEnds} =~ /^[\d,]+$/;
1193 173 50       400 if (@errors) {
1194 0         0 warn "line format for $format has the following errors: @errors";
1195 0         0 return;
1196             }
1197            
1198             # fix values
1199 173 100       454 $self{strand} = $self{strand} eq '+' ? 1 : -1;
1200 173         582 $self{exonStarts} = [ map {$_ += 1} ( split ",", $self{exonStarts} ) ];
  862         1908  
1201 173         726 $self{exonEnds} = [ ( split ",", $self{exonEnds} ) ];
1202            
1203             # Attempt to identify the transcript type
1204 173   100     711 my $type = $ucsc->{ensembldata}->{ $self{name} }->[1] || undef;
1205             # check if we have loaded ensembl source data and use that if available
1206 173 100       486 if ( $self{cdsStart} - 1 == $self{cdsEnd} ) {
1207             # there appears to be no coding potential when
1208             # txEnd = cdsStart = cdsEnd
1209             # if you'll look, all of the exon phases should also be -1
1210            
1211 63 100       313 if ($type) {
    50          
    50          
    50          
1212             # we have an ensembl source type, so prefer to use that
1213 21         93 $self{type} = $type;
1214             }
1215            
1216             # otherwise, we may be able to infer some certain
1217             # types from the gene name
1218             elsif ($self{name2} =~ /^mir/i) {
1219             # a noncoding gene whose name begins with mir is likely a micro RNA
1220 0         0 $self{type} = 'miRNA';
1221             }
1222             elsif ($self{name2} =~ /^snr/i) {
1223             # a noncoding gene whose name begins with snr is likely a snRNA
1224 0         0 $self{type} = 'snRNA';
1225             }
1226             elsif ($self{name2} =~ /^sno/i) {
1227             # a noncoding gene whose name begins with sno is likely a snoRNA
1228 0         0 $self{type} = 'snoRNA';
1229             }
1230             else {
1231             # a generic ncRNA
1232 42         142 $self{type} = 'ncRNA';
1233             }
1234             }
1235             else {
1236             # the transcript has an identifiable CDS so likely a mRNA
1237 110 100       504 $self{type} = defined $type ? $type : 'mRNA';
1238             }
1239            
1240             # add the ucsc object
1241 173         307 $self{ucsc} = $ucsc;
1242            
1243 173         757 return bless \%self, $class;
1244             }
1245              
1246             sub name {
1247 692     692   2104 return shift->{name};
1248             }
1249              
1250             sub name2 {
1251 229     229   571 return shift->{name2};
1252             }
1253              
1254             sub gene_name {
1255 371     371   1014 return shift->{gene_name};
1256             }
1257              
1258             sub chrom {
1259 195     195   616 return shift->{chrom};
1260             }
1261              
1262             sub txStart {
1263 291     291   783 return shift->{txStart};
1264             }
1265              
1266             sub txEnd {
1267 295     295   828 return shift->{txEnd};
1268             }
1269              
1270             sub strand {
1271 243     243   680 return shift->{strand};
1272             }
1273              
1274             sub cdsStart {
1275 811     811   2031 return shift->{cdsStart};
1276             }
1277              
1278             sub cdsEnd {
1279 409     409   1105 return shift->{cdsEnd};
1280             }
1281              
1282             sub exonCount {
1283 901     901   14974 return shift->{exonCount};
1284             }
1285              
1286             sub exonStarts {
1287 1412     1412   4437 return shift->{exonStarts};
1288             }
1289              
1290             sub exonEnds {
1291 1014     1014   2049 return shift->{exonEnds};
1292             }
1293              
1294             sub type {
1295 120     120   255 return shift->{type};
1296             }
1297              
1298             sub refseq {
1299 243     243   360 my $self = shift;
1300 243 50       645 return exists $self->{refseq} ? $self->{refseq} : undef;
1301             }
1302              
1303             sub note {
1304 243     243   382 my $self = shift;
1305 243 50       704 return exists $self->{note} ? $self->{note} : undef;
1306             }
1307              
1308             sub status {
1309 173     173   266 my $self = shift;
1310 173 50       473 return exists $self->{status} ? $self->{status} : undef;
1311             }
1312              
1313             sub completeness {
1314 173     173   243 my $self = shift;
1315 173 50       421 return exists $self->{completeness} ? $self->{completeness} : undef;
1316             }
1317              
1318             sub ucsc {
1319 493     493   889 return shift->{ucsc};
1320             }
1321              
1322             sub build_gene {
1323 70     70   107 my $self = shift;
1324            
1325             # shortcuts
1326 70         133 my $ucsc = $self->ucsc;
1327 70         115 my $id2count = $ucsc->{id2count};
1328 70         103 my $ensembldata = $ucsc->{ensembldata};
1329            
1330             # find a pre-existing gene to update or build a new one
1331 70         156 my $gene = $self->find_gene;
1332 70 100       144 if ($gene) {
1333             # update as necessary
1334 48 50       82 if ( ($self->txStart) < $gene->start) {
1335             # update the transcription start position
1336 0         0 $gene->start( $self->txStart );
1337             }
1338 48 100       185 if ($self->txEnd > $gene->end) {
1339             # update the transcription stop position
1340 4         51 $gene->end( $self->txEnd );
1341             }
1342             }
1343             else {
1344             # build a new gene
1345 22         52 $gene = $SFCLASS->new(
1346             -seq_id => $self->chrom,
1347             -source => $ucsc->source,
1348             -primary_tag => 'gene',
1349             -start => $self->txStart,
1350             -end => $self->txEnd,
1351             -strand => $self->strand,
1352             -phase => '.',
1353             -display_name => $self->gene_name,
1354             );
1355 22         405 $ucsc->{counts}->{gene} += 1;
1356            
1357             # Add a unique primary ID
1358 22         52 my $id = $self->name2;
1359 22 50       63 if (exists $id2count->{ lc $id }) {
1360             # we've encountered this gene ID before
1361            
1362             # then make name unique by appending the count number
1363 0         0 $id2count->{ lc $id } += 1;
1364 0         0 $id .= '.' . $id2count->{ lc $id };
1365             }
1366             else {
1367             # this is the first transcript with this id
1368             # set the id counter
1369 22         69 $id2count->{lc $id} = 0;
1370             }
1371 22         71 $gene->primary_id($id);
1372            
1373             # Add an alias
1374 22 100       82 if ($self->name2 ne $self->gene_name) {
1375 12         25 $gene->add_tag_value('Alias', $self->name2);
1376             }
1377             }
1378            
1379             # now build the transcript for the gene
1380 70         293 my $transcript = $self->build_transcript($gene);
1381 70         200 $gene->add_SeqFeature($transcript);
1382 70         1775 $transcript->add_tag_value('Parent', $gene->primary_id);
1383            
1384             # update extra attributes as necessary
1385 70         434 $self->update_attributes($gene);
1386            
1387             # finished
1388 70         199 return $gene;
1389             }
1390              
1391             sub build_transcript {
1392 173     173   400 my ($self, $gene) = @_; # gene is not required
1393            
1394             # shortcuts
1395 173         338 my $ucsc = $self->ucsc;
1396 173         283 my $id2count = $ucsc->{id2count};
1397 173         287 my $ensembldata = $ucsc->{ensembldata};
1398 173         267 my $counts = $ucsc->{counts};
1399            
1400             # Uniqueify the transcript ID and name
1401 173         339 my $id = $self->name;
1402 173 50       484 if (exists $id2count->{ lc $id } ) {
1403             # we've encountered this transcript ID before
1404            
1405             # now need to make ID unique by appending a number
1406 0         0 $id2count->{ lc $id } += 1;
1407 0         0 $id .= '.' . $id2count->{ lc $id };
1408             }
1409             else {
1410             # this is the first transcript with this id
1411 173         452 $id2count->{lc $id} = 0;
1412             }
1413            
1414             # identify the primary_tag value
1415 173         281 my ($type, $biotype);
1416 173 50       327 if (exists $ensembldata->{$self->name}) {
1417 173   100     331 my $t = $ensembldata->{$self->name}->[1] || undef;
1418 173 100 100     966 if ($t and $t =~ /protein.coding/i) {
    100 100        
    100          
1419 26         53 $type = 'mRNA';
1420 26         46 $biotype = $t;
1421             }
1422             elsif ($t and $t =~ /rna|transcript/i) {
1423 15         30 $type = $t;
1424 15         30 $biotype = $t;
1425             }
1426             elsif ($t) {
1427 12         41 $type = 'transcript';
1428 12         21 $biotype = $t;
1429             }
1430             else {
1431 120         239 $type = $self->type;
1432             }
1433             }
1434             else {
1435 0         0 $type = $self->type;
1436             }
1437            
1438             # Generate the transcript SeqFeature object
1439 173         427 my $transcript = $SFCLASS->new(
1440             -seq_id => $self->chrom,
1441             -source => $ucsc->source,
1442             -primary_tag => $type,
1443             -start => $self->txStart,
1444             -end => $self->txEnd,
1445             -strand => $self->strand,
1446             -phase => '.',
1447             -display_name => $self->name,
1448             -primary_id => $id,
1449             );
1450            
1451             # add gene name as an alias
1452 173 100       2615 if ($self->gene_name ne $self->name2) {
1453 36         67 $transcript->add_tag_value('Alias', $self->gene_name);
1454             }
1455            
1456             # update extra attributes as necessary
1457 173         636 $self->update_attributes($transcript);
1458            
1459             # add transcript specific attributes
1460 173 50       316 if (defined $self->completeness ) {
1461 0         0 $transcript->add_tag_value( 'completeness', $self->completeness );
1462             }
1463 173 50       345 if (defined $self->status ) {
1464 0         0 $transcript->add_tag_value( 'status', $self->status );
1465             }
1466 173 100       368 if ($biotype) {
1467 53         124 $transcript->add_tag_value( 'biotype', $biotype);
1468             }
1469            
1470             # add the exons
1471 173 100       529 if ($ucsc->do_exon) {
1472 122         257 $self->add_exons($transcript, $gene);
1473             }
1474            
1475             # add CDS, UTRs, and codons if necessary
1476 173 100       412 if ( $self->cdsStart - 1 != $self->cdsEnd ) {
1477            
1478 110 100       287 if ($ucsc->do_utr) {
1479 10         25 $self->add_utrs($transcript, $gene);
1480             }
1481            
1482 110 50       247 if ($ucsc->do_codon) {
1483 0         0 $self->add_codons($transcript, $gene);
1484             }
1485            
1486 110 100       238 if ($ucsc->do_cds) {
1487 10         24 $self->add_cds($transcript);
1488             }
1489             }
1490            
1491             # record the type of transcript
1492 173         458 $counts->{$type} += 1;
1493            
1494             # transcript is complete
1495 173         464 return $transcript;
1496             }
1497              
1498             sub update_attributes {
1499 243     243   457 my ($self, $seqf) = @_;
1500            
1501             # add Note if possible
1502 243 50       503 if (defined $self->note ) {
1503 0         0 $self->add_unique_attribute($seqf, 'Note', $self->note );
1504             }
1505            
1506             # add refSeq identifier if possible
1507 243 50       506 if (defined $self->refseq) {
1508 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->refseq);
1509             }
1510            
1511             # add SwissProt identifier if possible
1512 243 50 33     587 if (exists $self->{spid} and defined $self->{spid}) {
1513 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'Swiss-Prot:' . $self->{spid});
1514             }
1515            
1516             # add SwissProt display identifier if possible
1517 243 50 33     603 if (exists $self->{spdid} and defined $self->{spdid}) {
1518 0         0 $self->add_unique_attribute($seqf, 'swiss-prot_display_id', $self->{spdid});
1519             }
1520            
1521             # add NCBI protein access identifier if possible
1522 243 50 33     585 if (exists $self->{protacc} and defined $self->{protacc}) {
1523 0         0 $self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->{protacc});
1524             }
1525             }
1526              
1527             sub add_unique_attribute {
1528 0     0   0 my ($self, $seqf, $tag, $value) = @_;
1529            
1530             # look for a pre-existing identical tag value
1531 0         0 my $check = 1;
1532 0         0 foreach ($seqf->get_tag_values($tag)) {
1533 0 0       0 if ($_ eq $value) {
1534 0         0 $check = 0;
1535 0         0 last;
1536             }
1537             }
1538            
1539             # add it if our value is unique
1540 0 0       0 $seqf->add_tag_value($tag, $value) if $check;
1541             }
1542              
1543             sub add_exons {
1544 122     122   230 my ($self, $transcript, $gene) = @_;
1545 122         219 my $ucsc = $self->ucsc;
1546            
1547             # Add the exons
1548             EXON_LOOP:
1549 122         311 for (my $i = 0; $i < $self->exonCount; $i++) {
1550            
1551             # first look for existing
1552 595 100 100     1259 if ($ucsc->share and $gene) {
1553 277         517 my $exon = $self->find_existing_subfeature($gene, 'exon',
1554             $self->exonStarts->[$i], $self->exonEnds->[$i]);
1555 277 100       606 if ($exon) {
1556             # we found an existing exon to reuse
1557             # associate with this transcript
1558 156         404 $transcript->add_SeqFeature($exon);
1559 156         4939 next EXON_LOOP;
1560             }
1561             }
1562            
1563             # transform index for reverse strands
1564             # this will allow numbering from 5'->3'
1565 439         657 my $number;
1566 439 100       890 if ($transcript->strand == 1) {
1567             # forward strand
1568 427         1549 $number = $i;
1569             }
1570             else {
1571             # reverse strand
1572 12         72 $number = abs( $i - $self->exonCount + 1);
1573             }
1574            
1575             # build the exon seqfeature
1576 439         945 my $exon = $SFCLASS->new(
1577             -seq_id => $transcript->seq_id,
1578             -source => $transcript->source,
1579             -primary_tag => 'exon',
1580             -start => $self->exonStarts->[$i],
1581             -end => $self->exonEnds->[$i],
1582             -strand => $transcript->strand,
1583             -primary_id => $transcript->primary_id . ".exon$number",
1584             );
1585            
1586             # add name if requested
1587 439 50       10204 if ($ucsc->do_name) {
1588 0         0 $exon->display_name( $transcript->display_name . ".exon$number" );
1589             }
1590            
1591             # associate with transcript
1592 439         1018 $transcript->add_SeqFeature($exon);
1593             }
1594             }
1595              
1596             sub add_utrs {
1597 10     10   22 my ($self, $transcript, $gene) = @_;
1598 10         21 my $ucsc = $self->ucsc;
1599            
1600             # we will scan each exon and look for a potential utr and build it
1601 10         17 my @utrs;
1602             UTR_LOOP:
1603 10         23 for (my $i = 0; $i < $self->exonCount; $i++) {
1604            
1605             # transform index for reverse strands
1606             # this will allow numbering from 5'->3'
1607 76         106 my $number;
1608 76 50       149 if ($transcript->strand == 1) {
1609             # forward strand
1610 76         111 $number = $i;
1611             }
1612             else {
1613             # reverse strand
1614 0         0 $number = abs( $i - $self->exonCount + 1);
1615             }
1616            
1617             # identify UTRs
1618             # we will identify by comparing the cdsStart and cdsStop relative
1619             # to the exon coordinates
1620             # the primary tag is determined by the exon strand orientation
1621 76         167 my ($start, $stop, $tag);
1622             # in case we need to build two UTRs
1623 76         0 my ($start2, $stop2, $tag2);
1624            
1625             # Split 5'UTR, CDS, and 3'UTR all on the same exon
1626 76 50 66     121 if (
    100 100        
    100 66        
    100 66        
    100 66        
    50 33        
1627             $self->exonStarts->[$i] < $self->cdsStart
1628             and
1629             $self->exonEnds->[$i] > $self->cdsEnd
1630             ) {
1631             # the CDS is entirely within the exon, resulting in two UTRs
1632             # on either side of the exon
1633             # we must build two UTRs
1634            
1635             # the left UTR
1636 0         0 $start = $self->exonStarts->[$i];
1637 0         0 $stop = $self->cdsStart - 1;
1638 0 0       0 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1639            
1640             # the right UTR
1641 0         0 $start2 = $self->cdsEnd + 1;
1642 0         0 $stop2 = $self->exonEnds->[$i];
1643 0 0       0 $tag2 = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1644             }
1645            
1646             # 5'UTR forward, 3'UTR reverse
1647             elsif (
1648             $self->exonStarts->[$i] < $self->cdsStart
1649             and
1650             $self->exonEnds->[$i] < $self->cdsStart
1651             ) {
1652             # the exon start/end is entirely before the cdsStart
1653 3         5 $start = $self->exonStarts->[$i];
1654 3         6 $stop = $self->exonEnds->[$i];
1655 3 50       6 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1656             }
1657            
1658             # Split 5'UTR & CDS on forward, 3'UTR & CDS
1659             elsif (
1660             $self->exonStarts->[$i] < $self->cdsStart
1661             and
1662             $self->exonEnds->[$i] >= $self->cdsStart
1663             ) {
1664             # the start/stop codon is in this exon
1665             # we need to make the UTR out of a portion of this exon
1666 9         19 $start = $self->exonStarts->[$i];
1667 9         18 $stop = $self->cdsStart - 1;
1668 9 50       21 $tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1669             }
1670            
1671             # CDS only
1672             elsif (
1673             $self->exonStarts->[$i] >= $self->cdsStart
1674             and
1675             $self->exonEnds->[$i] <= $self->cdsEnd
1676             ) {
1677             # CDS only exon
1678 50         118 next UTR_LOOP;
1679             }
1680            
1681             # Split 3'UTR & CDS on forward, 5'UTR & CDS
1682             elsif (
1683             $self->exonStarts->[$i] <= $self->cdsEnd
1684             and
1685             $self->exonEnds->[$i] > $self->cdsEnd
1686             ) {
1687             # the stop/start codon is in this exon
1688             # we need to make the UTR out of a portion of this exon
1689 7         14 $start = $self->cdsEnd + 1;
1690 7         25 $stop = $self->exonEnds->[$i];
1691 7 50       18 $tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1692             }
1693            
1694             # 3'UTR forward, 5'UTR reverse
1695             elsif (
1696             $self->exonStarts->[$i] > $self->cdsEnd
1697             and
1698             $self->exonEnds->[$i] > $self->cdsEnd
1699             ) {
1700             # the exon start/end is entirely after the cdsStop
1701             # we have a 3'UTR
1702 7         12 $start = $self->exonStarts->[$i];
1703 7         13 $stop = $self->exonEnds->[$i];
1704 7 50       17 $tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1705             }
1706            
1707             # Something else?
1708             else {
1709 0         0 my $warning = "Warning: A malformed UTR that doesn't match known criteria: ";
1710 0         0 $warning .= "cdsStart " . $self->cdsStart;
1711 0         0 $warning .= ", cdsEnd " . $self->cdsEnd;
1712 0         0 $warning .= ", exonStart " . $self->exonStarts->[$i];
1713 0         0 $warning .= ", exonEnd " . $self->exonEnds->[$i];
1714 0         0 warn $warning;
1715 0         0 next UTR_LOOP;
1716             }
1717            
1718             ## Generate the UTR objects
1719 26         50 my $utr;
1720            
1721             # look for existing utr
1722 26 50 33     52 if ($ucsc->share and $gene) {
1723 26         53 $utr = $self->find_existing_subfeature($gene, $tag, $start, $stop);
1724             }
1725            
1726             # otherwise build the UTR object
1727 26 100       57 unless ($utr) {
1728 19         87 $utr = $SFCLASS->new(
1729             -seq_id => $transcript->seq_id,
1730             -source => $transcript->source,
1731             -start => $start,
1732             -end => $stop,
1733             -strand => $transcript->strand,
1734             -phase => '.',
1735             -primary_tag => $tag,
1736             -primary_id => $transcript->primary_id . ".utr$number",
1737             );
1738 19 50       100 $utr->display_name( $transcript->display_name . ".utr$number" ) if
1739             $ucsc->do_name;
1740             }
1741            
1742             # store this utr seqfeature in a temporary array
1743 26         49 push @utrs, $utr;
1744            
1745             # build a second UTR object as necessary
1746 26 50       82 if ($start2) {
1747 0         0 my $utr2;
1748            
1749             # look for existing utr
1750 0 0       0 if ($ucsc->share) {
1751 0         0 $utr2 = $self->find_existing_subfeature($gene, $tag2, $start2, $stop2);
1752             }
1753            
1754             # otherwise build the utr
1755 0 0       0 unless ($utr2) {
1756 0         0 $utr2 = $SFCLASS->new(
1757             -seq_id => $transcript->seq_id,
1758             -source => $transcript->source,
1759             -start => $start2,
1760             -end => $stop2,
1761             -strand => $transcript->strand,
1762             -phase => '.',
1763             -primary_tag => $tag2,
1764             -primary_id => $transcript->primary_id . ".utr$number" . "a",
1765             );
1766 0 0       0 $utr2->display_name( $transcript->display_name . ".utr$number" . "a" )
1767             if $ucsc->do_name;
1768             }
1769            
1770             # store this utr seqfeature in a temporary array
1771 0         0 push @utrs, $utr2;
1772             }
1773             }
1774            
1775             # associate found UTRs with the transcript
1776 10         23 foreach my $utr (@utrs) {
1777 26         51 $transcript->add_SeqFeature($utr);
1778             }
1779             }
1780              
1781             sub add_cds {
1782 10     10   22 my ($self, $transcript) = @_;
1783            
1784             # we will NOT collapse CDS features since we cannot guarantee that a shared
1785             # CDS will have the same phase, since phase is dependent on the translation
1786             # start
1787            
1788             # we will scan each exon and look for a potential CDS and build it
1789 10         12 my @cdss;
1790 10         16 my $phase = 0; # initialize CDS phase and keep track as we process CDSs
1791             CDS_LOOP:
1792 10         22 for (my $i = 0; $i < $self->exonCount; $i++) {
1793            
1794             # transform index for reverse strands
1795 76         112 my $j;
1796 76 50       152 if ($transcript->strand == 1) {
1797             # forward strand
1798 76         102 $j = $i;
1799             }
1800             else {
1801             # reverse strand
1802             # flip the index for exon starts and stops so that we
1803             # always progress 5' -> 3'
1804             # this ensures the phase is accurate from the start codon
1805 0         0 $j = abs( $i - $self->exonCount + 1);
1806             }
1807            
1808             # identify CDSs
1809             # we will identify by comparing the cdsStart and cdsStop relative
1810             # to the exon coordinates
1811 76         124 my ($start, $stop);
1812            
1813             # Split 5'UTR, CDS, and 3'UTR all on the same exon
1814 76 50 66     136 if (
    100 100        
    100 66        
    100 66        
    100 66        
    50 33        
1815             $self->exonStarts->[$j] < $self->cdsStart
1816             and
1817             $self->exonEnds->[$j] > $self->cdsEnd
1818             ) {
1819             # exon contains the entire CDS
1820 0         0 $start = $self->cdsStart;
1821 0         0 $stop = $self->cdsEnd;
1822             }
1823            
1824             # 5'UTR forward, 3'UTR reverse
1825             elsif (
1826             $self->exonStarts->[$j] < $self->cdsStart
1827             and
1828             $self->exonEnds->[$j] < $self->cdsStart
1829             ) {
1830             # no CDS in this exon
1831 3         8 next CDS_LOOP;
1832             }
1833            
1834             # Split 5'UTR & CDS on forward, 3'UTR & CDS
1835             elsif (
1836             $self->exonStarts->[$j] < $self->cdsStart
1837             and
1838             $self->exonEnds->[$j] >= $self->cdsStart
1839             ) {
1840             # the start/stop codon is in this exon
1841             # we need to make the CDS out of a portion of this exon
1842 9         17 $start = $self->cdsStart;
1843 9         16 $stop = $self->exonEnds->[$j];
1844             }
1845            
1846             # CDS only
1847             elsif (
1848             $self->exonStarts->[$j] >= $self->cdsStart
1849             and
1850             $self->exonEnds->[$j] <= $self->cdsEnd
1851             ) {
1852             # entire exon is CDS
1853 50         85 $start = $self->exonStarts->[$j];
1854 50         93 $stop = $self->exonEnds->[$j];
1855             }
1856            
1857             # Split 3'UTR & CDS on forward, 5'UTR & CDS
1858             elsif (
1859             $self->exonStarts->[$j] <= $self->cdsEnd
1860             and
1861             $self->exonEnds->[$j] > $self->cdsEnd
1862             ) {
1863             # the stop/start codon is in this exon
1864             # we need to make the CDS out of a portion of this exon
1865 7         12 $start = $self->exonStarts->[$j];
1866 7         12 $stop = $self->cdsEnd;
1867             }
1868            
1869             # 3'UTR forward, 5'UTR reverse
1870             elsif (
1871             $self->exonStarts->[$j] > $self->cdsEnd
1872             and
1873             $self->exonEnds->[$j] > $self->cdsEnd
1874             ) {
1875             # the exon start/end is entirely after the cdsStop
1876             # we have entirely 5' or 3'UTR, no CDS
1877 7         19 next CDS_LOOP;
1878             }
1879            
1880             # Something else?
1881             else {
1882 0         0 my $warning = "Warning: A malformed CDS that doesn't match known criteria: ";
1883 0         0 $warning .= "cdsStart " . $self->cdsStart;
1884 0         0 $warning .= ", cdsEnd " . $self->cdsEnd;
1885 0         0 $warning .= ", exonStart " . $self->exonStarts->[$j];
1886 0         0 $warning .= ", exonEnd " . $self->exonEnds->[$j];
1887 0         0 warn $warning;
1888 0         0 next CDS_LOOP;
1889             }
1890            
1891             # build the CDS object
1892 66         174 my $cds = $SFCLASS->new(
1893             -seq_id => $transcript->seq_id,
1894             -source => $transcript->source,
1895             -start => $start,
1896             -end => $stop,
1897             -strand => $transcript->strand,
1898             -phase => $phase,
1899             -primary_tag => 'CDS',
1900             -primary_id => $transcript->primary_id . ".cds$i",
1901             -display_name => $transcript->display_name . ".cds$i",
1902             );
1903             # the id and name still use $i for labeling to ensure numbering from 0
1904            
1905             # store this utr seqfeature in a temporary array
1906 66         170 push @cdss, $cds;
1907            
1908             # reset the phase for the next CDS
1909             # phase + (3 - (length % 3)), readjust to 0..2 if necessary
1910             # adapted from Barry Moore's gtf2gff3.pl script
1911 66         154 $phase = $phase + (3 - ( $cds->length % 3) );
1912 66 100       217 $phase -=3 if $phase > 2;
1913             }
1914            
1915             # associate found UTRs with the transcript
1916 10         21 foreach my $cds (@cdss) {
1917 66         127 $transcript->add_SeqFeature($cds);
1918             }
1919             }
1920              
1921             sub add_codons {
1922 0     0   0 my ($self, $transcript, $gene) = @_;
1923 0         0 my $ucsc = $self->ucsc;
1924            
1925             # generate the start and stop codons
1926 0         0 my ($start_codon, $stop_codon);
1927 0 0       0 if ($transcript->strand == 1) {
1928             # forward strand
1929            
1930             # share codons if possible
1931 0 0 0     0 if ($ucsc->share and $gene) {
1932 0         0 $start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1933             $self->cdsStart, $self->cdsStart + 2);
1934 0         0 $stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1935             $self->cdsEnd - 2, $self->cdsEnd);
1936             }
1937            
1938             # start codon
1939 0 0       0 unless ($start_codon) {
1940 0         0 $start_codon = $SFCLASS->new(
1941             -seq_id => $transcript->seq_id,
1942             -source => $transcript->source,
1943             -primary_tag => 'start_codon',
1944             -start => $self->cdsStart,
1945             -end => $self->cdsStart + 2,
1946             -strand => 1,
1947             -phase => 0,
1948             -primary_id => $transcript->primary_id . '.start_codon',
1949             );
1950 0 0       0 $start_codon->display_name( $transcript->display_name . '.start_codon' ) if
1951             $ucsc->do_name;
1952             }
1953            
1954             # stop codon
1955 0 0       0 unless ($stop_codon) {
1956 0         0 $stop_codon = $SFCLASS->new(
1957             -seq_id => $transcript->seq_id,
1958             -source => $transcript->source,
1959             -primary_tag => 'stop_codon',
1960             -start => $self->cdsEnd - 2,
1961             -end => $self->cdsEnd,
1962             -strand => 1,
1963             -phase => 0,
1964             -primary_id => $transcript->primary_id . '.stop_codon',
1965             );
1966 0 0       0 $stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1967             $ucsc->do_name;
1968             }
1969             }
1970            
1971             else {
1972             # reverse strand
1973            
1974             # share codons if possible
1975 0 0 0     0 if ($ucsc->share and $gene) {
1976 0         0 $stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1977             $self->cdsStart, $self->cdsStart + 2);
1978 0         0 $start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1979             $self->cdsEnd - 2, $self->cdsEnd);
1980             }
1981            
1982             # stop codon
1983 0 0       0 unless ($stop_codon) {
1984 0         0 $stop_codon = $SFCLASS->new(
1985             -seq_id => $transcript->seq_id,
1986             -source => $transcript->source,
1987             -primary_tag => 'stop_codon',
1988             -start => $self->cdsStart,
1989             -end => $self->cdsStart + 2,
1990             -strand => -1,
1991             -phase => 0,
1992             -primary_id => $transcript->primary_id . '.stop_codon',
1993             );
1994 0 0       0 $stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1995             $ucsc->do_name;
1996             }
1997            
1998             # start codon
1999 0 0       0 unless ($start_codon) {
2000 0         0 $start_codon = $SFCLASS->new(
2001             -seq_id => $transcript->seq_id,
2002             -source => $transcript->source,
2003             -primary_tag => 'start_codon',
2004             -start => $self->cdsEnd - 2,
2005             -end => $self->cdsEnd,
2006             -strand => -1,
2007             -phase => 0,
2008             -primary_id => $transcript->primary_id . '.start_codon',
2009             -display_name => $transcript->primary_id . '.start_codon',
2010             );
2011 0 0       0 $start_codon->display_name( $transcript->display_name . '.start_codon' ) if
2012             $ucsc->do_name;
2013             }
2014             }
2015            
2016             # associate with transcript
2017 0         0 $transcript->add_SeqFeature($start_codon);
2018 0         0 $transcript->add_SeqFeature($stop_codon);
2019             }
2020              
2021             sub find_gene {
2022 70     70   126 my $self = shift;
2023            
2024             # check if a gene with this name exists
2025 70 100       120 if (exists $self->ucsc->{gene2seqf}->{lc $self->gene_name} ) {
2026             # we found a gene with the same name
2027             # pull out the gene seqfeature(s) array reference
2028             # there may be more than one gene
2029 48         111 my $genes = $self->ucsc->{gene2seqf}->{ lc $self->gene_name };
2030            
2031             # check that the current transcript intersects with the gene
2032             # sometimes we can have two separate transcripts with the
2033             # same gene name, but located on opposite ends of the chromosome
2034             # part of a gene family, but unlikely the same gene 200 Mb in
2035             # length
2036 48         113 foreach my $g (@$genes) {
2037 48 50 33     135 if (
      33        
2038             # overlap method borrowed from Bio::RangeI
2039             ($g->strand == $self->strand) and not (
2040             $g->start > $self->txEnd or
2041             $g->end < $self->txStart
2042             )
2043             ) {
2044             # gene and transcript overlap on the same strand
2045             # we found the intersecting gene
2046 48         138 return $g;
2047             }
2048             }
2049             }
2050 22         48 return;
2051             }
2052              
2053             sub find_existing_subfeature {
2054 303     303   624 my ($self, $gene, $type, $start, $stop) = @_;
2055            
2056             # we will try to find a pre-existing subfeature at identical coordinates
2057 303         657 foreach my $transcript ($gene->get_SeqFeatures()) {
2058             # walk through transcripts
2059 988         5179 foreach my $subfeature ($transcript->get_SeqFeatures()) {
2060             # walk through subfeatures of transcripts
2061 8284 100 100     35171 if (
      100        
2062             $subfeature->primary_tag eq $type and
2063             $subfeature->start == $start and
2064             $subfeature->end == $stop
2065             ) {
2066             # we found a match
2067 163         1532 return $subfeature;
2068             }
2069             }
2070             }
2071 140         743 return;
2072             }
2073              
2074              
2075              
2076              
2077             __END__