File Coverage

blib/lib/Bio/ToolBox/parser/ucsc.pm

Criterion	Covered	Total	%
statement	430	664	64.7
branch	212	382	55.5
condition	71	193	36.7
subroutine	52	58	89.6
pod	22	24	91.6
total	787	1321	59.5

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::parser::ucsc;
2							our $VERSION = '1.65';
3
4							=head1 NAME
5
6							Bio::ToolBox::parser::ucsc - Parser for UCSC genePred, refFlat, etc formats
7
8							=head1 SYNOPSIS
9
10							use Bio::ToolBox::parser::ucsc;
11
12							### A simple transcript parser
13							my $ucsc = Bio::ToolBox::parser::ucsc->new('file.genePred');
14
15							### A full fledged gene parser
16							my $ucsc = Bio::ToolBox::parser::ucsc->new(
17							file => 'ensGene.genePred',
18							do_gene => 1,
19							do_cds => 1,
20							do_utr => 1,
21							ensname => 'ensemblToGene.txt',
22							enssrc => 'ensemblSource.txt',
23							);
24
25							### Retrieve one transcript line at a time
26							my $transcript = $ucsc->next_feature;
27
28							### Retrieve one assembled gene at a time
29							my $gene = $ucsc->next_top_feature;
30
31							### Retrieve array of all assembled genes
32							my @genes = $ucsc->top_features;
33
34							# Each gene or transcript is a SeqFeatureI compatible object
35							printf "gene %s is located at %s:%s-%s\n",
36							$gene->display_name, $gene->seq_id,
37							$gene->start, $gene->end;
38
39							# Multiple transcripts can be assembled into a gene
40							foreach my $transcript ($gene->get_SeqFeatures) {
41							# each transcript has exons
42							foreach my $exon ($transcript->get_SeqFeatures) {
43							printf "exon is %sbp long\n", $exon->length;
44							}
45							}
46
47							# Features can be printed in GFF3 format
48							$gene->version(3);
49							print STDOUT $gene->gff_string(1);
50							# the 1 indicates to recurse through all subfeatures
51
52
53							=head1 DESCRIPTION
54
55							This is a parser for converting UCSC-style gene prediction flat file formats into
56							BioPerl-style L compliant objects, complete with nested objects
57							representing transcripts, exons, CDS, UTRs, start- and stop-codons. Full control
58							is available on what to parse, e.g. exons on, CDS and codons off. Additional gene
59							information can be added by supplying additional tables of information, such as
60							common gene names and descriptions, available from the UCSC repository.
61
62							=head2 Table formats supported
63
64							Supported files are tab-delimited text files obtained from UCSC and described
65							at L. Formats are identified
66							by the number of columns, rather than specific file extensions, column name
67							headers, or other metadata. Therefore, unmodified tables should only be used
68							for correct parsing. Some errors are reported for incorrect lines. Unadulterated
69							files can safely be downloaded from L.
70							Files obtained from the UCSC Table Browser can also be used with caution. Files
71							may be gzip compressed.
72
73							File formats supported include the following.
74
75							=over 4
76
77							=item * Gene Prediction (genePred), 10 columns
78
79							=item * Gene Prediction with RefSeq gene Name (refFlat), 11 columns
80
81							=item * Extended Gene Prediction (genePredExt), 15 columns
82
83							=item * Extended Gene Prediction with bin (genePredExt), 16 columns
84
85							=item * knownGene table, 12 columns
86
87							=back
88
89							=head2 Supplemental information
90
91							The UCSC gene prediction tables include essential information, but not detailed
92							information, such as common gene names, description, protein accession IDs, etc.
93							This additional information can be associated with the genes or transcripts during
94							parsing if the appropriate tables are supplied. These tables can be obtained from
95							the UCSC download site L.
96
97							Supported tables include the following.
98
99							=over 4
100
101							=item * refSeqStatus, for refGene, knownGene, and xenoRefGene tables
102
103							=item * refSeqSummary, for refGene, knownGene, and xenoRefGene tables
104
105							=item * ensemblToGeneName, for ensGene tables
106
107							=item * ensemblSource, for ensGene tables
108
109							=item * kgXref, for knownGene tables
110
111							=back
112
113							=head2 Implementation
114
115							For an implementation of this module to generate GFF3 formatted files from UCSC
116							data sources, see the L script L.
117
118							=head1 METHODS
119
120							=head2 Initalize the parser object
121
122							=over 4
123
124							=item new
125
126							Initiate a UCSC table parser object. Pass a single value (a table file name)
127							to open a table and parse its objects. Alternatively, pass an array of key
128							value pairs to control how the table is parsed. Options include the following.
129
130							=over 4
131
132							=item file
133
134							=item table
135
136							Provide a file name for a UCSC gene prediction table. The file may be gzip
137							compressed.
138
139							=item source
140
141							Pass a string to be added as the source tag value of the SeqFeature objects.
142							The default value is 'UCSC'. If the file name has a recognizable name,
143							such as 'refGene' or 'ensGene', it will be used instead.
144
145							=item do_gene
146
147							Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
148							name under a single gene object. Default is true.
149
150							-item do_exon
151
152							=item do_cds
153
154							=item do_utr
155
156							=item do_codon
157
158							Pass a boolean (1 or 0) value to parse certain subfeatures, including exon,
159							CDS, five_prime_UTR, three_prime_UTR, stop_codon, and start_codon features.
160							Default is false.
161
162							=item do_name
163
164							Pass a boolean (1 or 0) value to assign names to subfeatures, including exons,
165							CDSs, UTRs, and start and stop codons. Default is false.
166
167							=item share
168
169							Pass a boolean (1 or 0) value to recycle shared subfeatures (exons and UTRs)
170							between multiple transcripts of the same gene. This results in reduced
171							memory usage, and smaller exported GFF3 files. Default is true.
172
173							=item refseqsum
174
175							=item refseqstat
176
177							=item kgxref
178
179							=item ensembltogene
180
181							=item ensemblsource
182
183							Pass the appropriate file name for additional information.
184
185							=item class
186
187							Pass the name of a L compliant class that will be used to
188							create the SeqFeature objects. The default is to use L.
189
190							=back
191
192							=back
193
194							=head2 Modify the parser object
195
196							These methods set or retrieve parameters, and load supplemental files and
197							new tables.
198
199							=over 4
200
201							=item source
202
203							=item do_gene
204
205							=item do_exon
206
207							=item do_cds
208
209							=item do_utr
210
211							=item do_codon
212
213							=item do_name
214
215							=item share
216
217							These methods retrieve or set parameters to the parsing engine, same as
218							the options to the new method.
219
220							=item fh
221
222							Set or retrieve the file handle of the current table. This module uses
223							L objects. Be careful manipulating file handles of open tables!
224
225							=item open_file
226
227							Pass the name of a new table to parse. Existing gene models loaded in
228							memory, if any, are discarded. Counts are reset to 0. Supplemental
229							tables are not discarded.
230
231							=item load_extra_data($file, $type)
232
233							my $file = 'hg19_refSeqSummary.txt.gz';
234							my success = $ucsc->load_extra_data($file, 'summary');
235
236							Pass two values, the file name of the supplemental file and the type
237							of supplemental data. Values can include the following
238
239							=over 4
240
241							=item * refseqstatus or status
242
243							=item * refseqsummary or summary
244
245							=item * kgxref
246
247							=item * ensembltogene or ensname
248
249							=item * ensemblsource or enssrc
250
251							=back
252
253							The number of transcripts with information loaded from the supplemental
254							data file is returned.
255
256							=back
257
258							=head2 Feature retrieval
259
260							The following methods parse the table lines into SeqFeature objects.
261							It is best if methods are not mixed; unexpected results may occur.
262
263							=over 4
264
265							=item next_feature
266
267							This will read the next line of the table and parse it into a gene or
268							transcript object. However, multiple transcripts from the same gene are
269							not assembled together under the same gene object.
270
271							=item next_top_feature
272
273							This method will return all top features (typically genes), with multiple
274							transcripts of the same gene assembled under the same gene object. Transcripts
275							are assembled together if they share the same gene name and the transcripts
276							overlap. If transcripts share the same gene name but do not overlap, they
277							are placed into separate gene objects with the same name but different
278							C tags. Calling this method will parse the entire table into
279							memory (so that multiple transcripts may be assembled), but only one object
280							is returned at a time. Call this method repeatedly using a while loop to
281							get all features.
282
283							=item top_features
284
285							This method is similar to L, but instead returns an array
286							of all the top features.
287
288							=back
289
290							=head2 Other methods
291
292							Additional methods for working with the parser object and the parsed
293							SeqFeature objects.
294
295							=over 4
296
297							=item parse_table
298
299							Parses the table into memory. If a table wasn't provided using the
300							L or L methods, then a filename can be passed to this
301							method and it will automatically be opened for you.
302
303							=item find_gene
304
305							my $gene = $ucsc->find_gene(
306							display_name => 'ABC1',
307							primary_id => 'gene000001',
308							);
309
310							Pass a gene name, or an array of key =E values (name, display_name,
311							ID, primary_ID, and/or coordinate information), that can be used
312							to find a gene already loaded into memory. Only really successful if the
313							entire table is loaded into memory. Genes with a matching name are
314							confirmed by a matching ID or overlapping coordinates, if available.
315							Otherwise the first match is returned.
316
317							=item counts
318
319							This method will return a hash of the number of genes and RNA types that
320							have been parsed.
321
322							=item typelist
323
324							This method will return a comma-delimited list of the feature types or
325							Cs found in the parsed file. Returns a generic list if a
326							file has not been parsed.
327
328							=item from_ucsc_string
329
330							A bare bones method that will convert a tab-delimited text line from a UCSC
331							formatted gene table into a SeqFeature object for you. Don't expect alternate
332							transcripts to be assembled into genes.
333
334							=item seq_ids
335
336							Returns an array or array reference of the names of the chromosomes or
337							reference sequences present in the table.
338
339							=item seq_id_lengths
340
341							Returns a hash reference to the chromosomes or reference sequences and
342							their corresponding lengths. In this case, the length is inferred by the
343							greatest gene end position.
344
345							=back
346
347							=head2 Bio::ToolBox::parser::ucsc::builder
348
349							This is a private module that is responsible for building SeqFeature
350							objects from UCSC table lines. It is not intended for general public use.
351
352							=head1 SEE ALSO
353
354							L, L
355
356							=cut
357
358	2			2		110107	use strict;
	2					12
	2					69
359	2			2		10	use Carp qw(carp cluck croak);
	2					5
	2					103
360	2			2		639	use Bio::ToolBox::Data::file; # only used to get an open filehandle
	2					5
	2					6561
361
362							1;
363
364							sub new {
365	5			5	1	4700	my $class = shift;
366	5					103	my $self = {
367							'fh' => undef,
368							'version' => undef,
369							'source' => 'UCSC',
370							'top_features' => [],
371							'gene2seqf' => {},
372							'id2count' => {},
373							'counts' => {},
374							'do_gene' => 1,
375							'do_exon' => 0,
376							'do_cds' => 0,
377							'do_utr' => 0,
378							'do_codon' => 0,
379							'do_name' => 0,
380							'share' => 1,
381							'refseqsum' => {},
382							'refseqstat' => {},
383							'kgxref' => {},
384							'ensembldata' => {},
385							'eof' => 0,
386							'line_count' => 0,
387							'sfclass' => 'Bio::ToolBox::SeqFeature', # default class
388							'seq_ids' => {},
389							};
390	5					18	bless $self, $class;
391
392							# check for options
393	5	100				21	if (@_) {
394	3	50				13	if (scalar(@_) == 1) {
395							# short and sweet, just a file, we assume
396	0					0	my $file = shift @_;
397	0					0	$self->open_file($file);
398							}
399							else {
400	3					22	my %options = @_;
401	3	50	33			19	if (exists $options{file} or $options{table}) {
402	3		33			12	$options{file} \|\|= $options{table};
403	3					16	$self->open_file( $options{file} );
404							}
405	3	100				14	if (exists $options{do_gene}) {
406	1					5	$self->do_gene($options{do_gene});
407							}
408	3	50				11	if (exists $options{do_exon}) {
409	3					14	$self->do_exon($options{do_exon});
410							}
411	3	100				11	if (exists $options{do_cds}) {
412	1					5	$self->do_cds($options{do_cds});
413							}
414	3	100				11	if (exists $options{do_utr}) {
415	1					4	$self->do_utr($options{do_utr});
416							}
417	3	50				9	if (exists $options{do_codon}) {
418	0					0	$self->do_codon($options{do_codon});
419							}
420	3	50				10	if (exists $options{do_name}) {
421	0					0	$self->do_name($options{do_name});
422							}
423	3	50				34	if (exists $options{share}) {
424	0					0	$self->share($options{share});
425							}
426	3	50				12	if (exists $options{source}) {
427	0					0	$self->source($options{source});
428							}
429	3	50				16	if (exists $options{refseqsum}) {
		50
430	0					0	$self->load_extra_data($options{refseqsum}, 'refseqsum');
431							}
432							elsif (exists $options{summary}) {
433	0					0	$self->load_extra_data($options{summary}, 'refseqsum');
434							}
435	3	50				16	if (exists $options{refseqstat}) {
		50
436	0					0	$self->load_extra_data($options{refseqstat}, 'refseqstat');
437							}
438							elsif (exists $options{status}) {
439	0					0	$self->load_extra_data($options{status}, 'refseqstat');
440							}
441	3	50				10	if (exists $options{kgxref}) {
442	0					0	$self->load_extra_data($options{kgxref}, 'kgxref');
443							}
444	3	50				16	if (exists $options{ensembltogenename}) {
		50
445	0					0	$self->load_extra_data($options{ensembltogenename}, 'ensembltogene');
446							}
447							elsif (exists $options{ensname}) {
448	0					0	$self->load_extra_data($options{ensname}, 'ensembltogene');
449							}
450	3	50				24	if (exists $options{ensemblsource}) {
		100
451	0					0	$self->load_extra_data($options{ensemblsource}, 'ensemblsource');
452							}
453							elsif (exists $options{enssrc}) {
454	1					9	$self->load_extra_data($options{enssrc}, 'ensemblsource');
455							}
456	3	100				14	if (exists $options{class}) {
457	2					7	$self->{sfclass} = $options{class};
458							}
459							}
460							}
461
462							# done
463	5					20	return $self;
464							}
465
466							sub version {
467	0			0	0	0	return shift->{version};
468							}
469
470							sub source {
471	123			123	1	203	my $self = shift;
472	123	100				271	if (@_) {
473	5					27	$self->{'source'} = shift;
474							}
475	123					380	return $self->{'source'};
476							}
477
478							sub simplify {
479							# this doesn't do anything, for now, but maintain compatibility with gff parser
480	2			2	0	5	return 0;
481							}
482
483							sub do_gene {
484	95			95	1	920	my $self = shift;
485	95	100				213	if (@_) {
486	3					6	$self->{'do_gene'} = shift;
487							}
488	95					263	return $self->{'do_gene'};
489							}
490
491							sub do_exon {
492	93			93	1	142	my $self = shift;
493	93	100				193	if (@_) {
494	4					10	$self->{'do_exon'} = shift;
495							}
496	93					215	return $self->{'do_exon'};
497							}
498
499							sub do_cds {
500	55			55	1	81	my $self = shift;
501	55	100				137	if (@_) {
502	1					2	$self->{'do_cds'} = shift;
503							}
504	55					146	return $self->{'do_cds'};
505							}
506
507							sub do_utr {
508	53			53	1	86	my $self = shift;
509	53	100				121	if (@_) {
510	1					2	$self->{'do_utr'} = shift;
511							}
512	53					127	return $self->{'do_utr'};
513							}
514
515							sub do_codon {
516	54			54	1	90	my $self = shift;
517	54	50				120	if (@_) {
518	0					0	$self->{'do_codon'} = shift;
519							}
520	54					118	return $self->{'do_codon'};
521							}
522
523							sub do_name {
524	231			231	1	355	my $self = shift;
525	231	50				506	if (@_) {
526	0					0	$self->{'do_name'} = shift;
527							}
528	231					517	return $self->{'do_name'};
529							}
530
531							sub share {
532	394			394	1	620	my $self = shift;
533	394	50				777	if (@_) {
534	0					0	$self->{'share'} = shift;
535							}
536	394					1325	return $self->{'share'};
537							}
538
539							sub fh {
540	234			234	1	1221	my $self = shift;
541	234	100				478	if (@_) {
542	7					19	$self->{'fh'} = shift;
543							}
544	234					3065	return $self->{'fh'};
545							}
546
547							sub open_file {
548	7			7	1	14	my $self = shift;
549
550							# check file
551	7					13	my $filename = shift;
552	7	50				20	unless ($filename) {
553	0					0	cluck("no file name passed!");
554	0					0	return;
555							}
556
557							# Open filehandle object
558	7	50				85	my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
559							croak " cannot open file '$filename'!\n";
560
561							# check number of columns
562	7					11	my $ncol;
563	7					199	while (my $line = $fh->getline) {
564	14	100				1412	next if $line =~ /^#/;
565	7	50				44	next unless $line =~ /\w+/;
566	7					51	my @fields = split "\t", $line;
567	7					17	$ncol = scalar(@fields);
568	7					25	last;
569							}
570	7	0	33			26	unless ($ncol == 16 or $ncol == 15 or $ncol == 12 or $ncol == 11 or $ncol == 10) {
			33
			0
			0
571	0					0	carp " File '$filename' doesn't have recognizable number of columns! It has $ncol.";
572	0					0	return;
573							}
574	7	50				24	if ($ncol == 10) {
575							# turn off gene processing for simple genePred which has no gene names
576	0					0	$self->do_gene(0);
577							}
578
579							# reopen file handle
580	7					38	$fh->close;
581	7					214	$fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
582
583							# reset source as necessary
584	7	100	66			78	if ($filename =~ /ensgene/i and $self->source eq 'UCSC') {
		50	33
		50	33
		50	33
		50	33
585	5					20	$self->source('EnsGene');
586							}
587							elsif ($filename =~ /xenorefgene/i and $self->source eq 'UCSC') {
588	0					0	$self->source('xenoRefGene');
589							}
590							elsif ($filename =~ /refgene/i and $self->source eq 'UCSC') {
591	0					0	$self->source('refGene');
592							}
593							elsif ($filename =~ /refseq/i and $self->source eq 'UCSC') {
594	0					0	$self->source('refSeq');
595							}
596							elsif ($filename =~ /knowngene/i and $self->source eq 'UCSC') {
597	0					0	$self->source('knownGene');
598							}
599
600							# check existing data
601							# the reason this parser can handle reading a second file without having to make a
602							# new parser object (like gff and bed) is so that we can potentially recycle the
603							# extra UCSC information
604	7	100				26	if ($self->fh) {
605							# close existing
606	2					7	$self->fh->close;
607							# go ahead and clear out existing data
608	2					45	$self->{'version'} = undef;
609	2					8	$self->{'top_features'} = [];
610	2					18	$self->{'duplicate_ids'} = {};
611	2					7	$self->{'gene2seqf'} = {};
612	2					8	$self->{'id2count'} = {};
613	2					5	$self->{'counts'} = {};
614	2					7	$self->{'eof'} = 0;
615	2					5	$self->{'line_count'} = 0;
616							}
617
618	7					26	$self->fh($fh);
619	7					24	return 1;
620							}
621
622							sub load_extra_data {
623	5			5	1	1352	my ($self, $file, $type) = @_;
624	5	50				25	unless ($file) {
625	0					0	cluck "no file name passed!";
626	0					0	return;
627							}
628
629							# check the type
630	5	100				53	if ($type =~ /ensembltogene\|ensname/i) {
		50
		0
		0
		0
631	2					5	$type = 'ensembltogene';
632							}
633							elsif ($type =~ /ensemblsource\|enssrc/i) {
634	3					6	$type = 'ensemblsource';
635							}
636							elsif ($type =~ /refseqstat\|status/i) {
637	0					0	$type = 'refseqstat';
638							}
639							elsif ($type =~ /refseqsum\|summary/i) {
640	0					0	$type = 'refseqsum';
641							}
642							elsif ($type =~ /kgxref/i) {
643	0					0	$type = 'kgxref';
644							}
645							else {
646	0					0	carp "unknown type '$type' to load extra data";
647	0					0	return;
648							}
649
650	5					34	my $fh = Bio::ToolBox::Data::file->open_to_read_fh($file);
651	5	50				23	unless ($fh) {
652	0					0	carp "unable to open file '$file'! $!";
653	0					0	return;
654							}
655
656							# load ensembl data
657	5					13	my $count = 0;
658	5	50				24	if ($type =~ /ensembl/) {
659							# we will store gene name in position 0, and source in position 1
660	5	100				18	my $index = $type eq 'ensembltogene' ? 0 : 1;
661	5					129	while (my $line = $fh->getline) {
662
663							# process line
664	85					2836	chomp $line;
665	85	50				202	next if ($line =~ /^#/);
666	85					193	my @line_data = split /\t/, $line;
667	85	50				170	if (scalar @line_data != 2) {
668	0					0	carp " file $file doesn't seem right!? Line has " .
669							scalar @line_data . " elements!\n";
670	0					0	return;
671							}
672
673							# store data into hash
674	85					238	$self->{'ensembldata'}{ $line_data[0] }->[$index] = $line_data[1];
675	85					1481	$count++;
676							}
677							}
678
679							# load various refSeq data
680							else {
681							# we just store the line data based on the gene ID
682							# each table has different elements
683							# here they are for reference
684
685							### refSeqStatus table
686							# 0 mrnaAcc RefSeq gene accession name
687							# 1 status Status ('Unknown', 'Reviewed', 'Validated', 'Provisional', 'Predicted', 'Inferred')
688							# 2 molecule type ('DNA', 'RNA', 'ds-RNA', 'ds-mRNA', 'ds-rRNA', 'mRNA', 'ms-DNA', 'ms-RNA', 'rRNA', 'scRNA', 'snRNA', 'snoRNA', 'ss-DNA', 'ss-RNA', 'ss-snoRNA', 'tRNA', 'cRNA', 'ss-cRNA', 'ds-cRNA', 'ms-rRNA') values molecule type
689
690							### refSeqSummary table
691							# 0 RefSeq mRNA accession
692							# 1 completeness FullLength ('Unknown', 'Complete5End', 'Complete3End', 'FullLength', 'IncompleteBothEnds', 'Incomplete5End', 'Incomplete3End', 'Partial')
693							# 1 summary text values Summary comments
694
695							### kgXref table
696							# 0 kgID Known Gene ID
697							# 1 mRNA mRNA ID
698							# 2 spID SWISS-PROT protein Accession number
699							# 3 spDisplayID SWISS-PROT display ID
700							# 4 geneSymbol Gene Symbol
701							# 5 refseq RefSeq ID
702							# 6 protAcc NCBI protein Accession number
703							# 7 description Description
704
705							### ensemblToGeneName table
706							# 0 Ensembl transcript ID
707							# 1 gene name
708
709							# load the table
710	0					0	while (my $line = $fh->getline) {
711	0					0	chomp $line;
712	0	0				0	next if ($line =~ /^#/);
713	0					0	my @line_data = split /\t/, $line;
714
715							# the unique id should be the first element in the array
716							# take it off the array, since it doesn't need to be stored there too
717	0					0	my $id = shift @line_data;
718
719							# check for duplicate lines
720	0	0				0	if (exists $self->{$type}{$id} ) {
721	0					0	warn " $type line for identifier $id exists twice!\n";
722	0					0	next;
723							}
724
725							# store data into hash
726	0					0	$self->{$type}{$id} = [@line_data];
727	0					0	$count++;
728							}
729							}
730
731	5					184	$fh->close;
732	5					104	return $count;
733							}
734
735							sub typelist {
736	3			3	1	5	my $self = shift;
737	3					5	my @items;
738	3					5	foreach my $k (keys %{$self->{counts}}) {
	3					13
739	0	0				0	push @items, $k if $self->{counts}{$k} > 0;
740							}
741	3	50				8	if (@items) {
742	0					0	return join(',', @items);
743							}
744							else {
745							# return generic list
746	3	100				10	return $self->do_gene ? 'gene,mRNA,ncRNA,exon,CDS' : 'mRNA,ncRNA,exon,CDS';
747							}
748							}
749
750							sub next_feature {
751	92			92	1	1660	my $self = shift;
752
753							# check that we have an open filehandle
754	92	50				188	unless ($self->fh) {
755	0					0	croak("no UCSC file loaded to parse!");
756							}
757
758	92					180	while (my $line = $self->fh->getline) {
759	114					2987	chomp $line;
760	114	100	66			614	if ($line =~ /^#/ or $line !~ /\w+/) {
761	27					91	$self->{line_count}++;
762	27					85	next;
763							}
764	87					306	my $builder = Bio::ToolBox::parser::ucsc::builder->new($line, $self);
765	87					141	$self->{line_count}++;
766	87	50				205	unless ($builder) {
767							# builder will print its own error message if fails
768	0					0	warn " unable to parse line number ", $self->{line_count}, "\n";
769	0					0	next;
770							}
771
772							# generate the feature from the line
773	87					128	my $feature;
774	87	100				217	if ($self->do_gene) {
775	70					152	$feature = $builder->build_gene;
776							}
777							else {
778	17					41	$feature = $builder->build_transcript;
779							}
780
781							# return the object, we do this while loop once per valid line
782	87					619	return $feature;
783							}
784
785							# presumed end of file
786	5					228	$self->{'eof'} = 1;
787	5					18	return;
788							}
789
790							sub next_top_feature {
791	25			25	1	1157	my $self = shift;
792	25	50				56	unless ($self->{'eof'}) {
793	0					0	$self->parse_table;
794							}
795	25					35	return shift @{ $self->{top_features} };
	25					72
796							}
797
798							sub top_features {
799	2			2	1	2018	my $self = shift;
800	2	50				10	unless ($self->{'eof'}) {
801	0					0	$self->parse_table;
802							}
803	2					5	my @features = @{ $self->{top_features} };
	2					8
804	2	50				10	return wantarray ? @features : \@features;
805							}
806
807							*parse_file = \&parse_table;
808
809							sub parse_table {
810	5			5	1	11099	my $self = shift;
811	5	100				19	if (@_) {
812	2	50				9	$self->open_file(shift) or return;
813							}
814	5	50				17	unless ($self->fh) {
815	0					0	carp "must open a file first!";
816	0					0	return;
817							}
818	5	50				22	return if ($self->{'eof'});
819
820							#### Main Loop
821	5					217	print " Parsing UCSC gene table....\n";
822	5					34	while (my $feature = $self->next_feature) {
823
824							# add to gene2seqf hash
825	85					224	my $gene = $self->find_gene($feature);
826	85	50				195	unless ($gene) {
827							# the current feature is not in the hash, so add it
828	85					270	$self->{gene2seqf}->{ lc $feature->display_name } = [ $feature ];
829							}
830
831							# check chromosome
832	85					327	my $s = $feature->seq_id;
833	85	100				318	unless (exists $self->{seq_ids}{$s}) {
834	5					25	$self->{seq_ids}{$s} = $feature->end;
835							}
836	85	100				224	$self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
837							}
838
839							# add to the top list of features, Schwartzian transform and sort
840							# based on the genes found in the gene2seqf hash
841	5					19	push @{ $self->{top_features} },
842	37					64	map {$_->[2]}
843	79	0				248	sort {$a->[0] cmp $b->[0] or $a->[1] <=> $b->[1]}
844							map [$_->seq_id, $_->start, $_],
845	37					85	map @{ $self->{gene2seqf}->{$_} },
846	5					23	keys %{$self->{gene2seqf}};
	5					31
847
848	5					35	return 1;
849							}
850
851							sub find_gene {
852	87			87	1	29372	my $self = shift;
853
854							# get the name and coordinates from arguments
855	87					155	my ($name, $id, $chrom, $start, $end, $strand);
856	87	50				246	if (scalar @_ == 0) {
		50
857	0					0	carp "must provide information to find_gene method!";
858	0					0	return;
859							}
860							elsif (scalar @_ == 1) {
861	87					148	$name = $_[0];
862							}
863							else {
864	0					0	my %opt = @_;
865	0		0			0	$name = $opt{name} \|\| $opt{display_name} \|\| undef;
866	0		0			0	$id = $opt{id} \|\| $opt{primary_id} \|\| undef;
867	0		0			0	$chrom = $opt{chrom} \|\| $opt{seq_id} \|\| undef;
868	0		0			0	$start = $opt{start} \|\| undef;
869	0		0			0	$end = $opt{stop} \|\| $opt{end} \|\| undef;
870	0		0			0	$strand = $opt{strand} \|\| 0;
871							}
872	87	50				212	unless ($name) {
873	0					0	carp "name is required for find_gene!";
874	0					0	return;
875							}
876
877							# check if a gene with this name exists
878	87	100				410	if (exists $self->{gene2seqf}->{lc $name} ) {
879							# we found a matching gene
880
881							# pull out the gene seqfeature(s) array reference
882							# there may be more than one gene
883	2					8	my $genes = $self->{gene2seqf}->{ lc $name };
884
885							# go through a series of checks to find the appropriate
886	2	50				9	if ($id) {
887	0					0	foreach my $g (@$genes) {
888	0	0				0	if ($g->primary_id eq $id) {
889	0					0	return $g;
890							}
891							}
892	0					0	return; # none of these matched despite having an ID
893							}
894	2	0	33			8	if ($chrom and $start and $end) {
			33
895	0					0	foreach my $g (@$genes) {
896	0	0	0			0	if (
			0
897							# overlap method borrowed from Bio::RangeI
898							($g->strand == $strand) and not (
899							$g->start > $end or
900							$g->end < $start
901							)
902							) {
903							# gene and transcript overlap on the same strand
904							# we found the intersecting gene
905	0					0	return $g;
906							}
907							}
908	0					0	return; # none of these matched despite having coordinate info
909							}
910	2	50				9	if (scalar @$genes == 1) {
		0
911							# going on trust here that this is the one
912	2					7	return $genes->[0];
913							}
914							elsif (scalar @$genes > 1) {
915	0					0	carp "more than one gene named $name found!";
916	0					0	return $genes->[0];
917							}
918
919							# nothing suitable found
920	0					0	return;
921							}
922							}
923
924							sub counts {
925	2			2	1	4298	my $self = shift;
926	2					7	my %counts = %{ $self->{counts} };
	2					19
927	2	50				20	return wantarray ? %counts : \%counts;
928							}
929
930							sub from_ucsc_string {
931	0			0	1	0	my ($self, $string) = @_;
932	0	0				0	return unless $string;
933	0					0	my $builder = Bio::ToolBox::parser::ucsc::builder->new($string, $self);
934	0	0				0	return unless $builder;
935	0	0				0	if ($self->do_gene) {
936	0					0	return $builder->build_gene;
937							}
938							else {
939	0					0	return $builder->build_transcript;
940							}
941							}
942
943							sub seq_ids {
944	0			0	1	0	my $self = shift;
945	0					0	my @s = keys %{$self->{seq_ids}};
	0					0
946	0	0				0	return wantarray ? @s : \@s;
947							}
948
949							sub seq_id_lengths {
950	0			0	1	0	my $self = shift;
951	0					0	return $self->{seq_ids};
952							}
953
954
955
956							package Bio::ToolBox::parser::ucsc::builder;
957	2			2		23	use strict;
	2					5
	2					79
958	2			2		15	use Carp qw(carp cluck croak);
	2					3
	2					10658
959							our $SFCLASS = ''; # SeqFeature class to use
960
961							1;
962
963							sub new {
964	173			173		453	my ($class, $line, $ucsc) = @_;
965	173					314	my %self;
966							my $format;
967
968							# check SeqFeature class
969	173	100				507	if ($ucsc->{sfclass} ne $SFCLASS) {
970	3					11	$SFCLASS = $ucsc->{sfclass};
971	3	50				193	eval "require $SFCLASS" or croak $@;
972							}
973
974	173					319	chomp $line;
975	173					824	my @linedata = split /\t/, $line;
976
977							# we're identifying the type of table based on the number of columns
978							# may not be the best or accurate method, but it generally works for valid formats
979
980							### Extended Gene Prediction Table ###
981	173	100				575	if (scalar @linedata == 16) {
		50
		50
		50
		50
982							# an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
983							# as downloaded from the UCSC Table Browser or FTP site
984							# includes the bin number as the first column
985
986							# 0 bin
987							# 1 name
988							# 2 chrom
989							# 3 strand
990							# 4 txStart
991							# 5 txEnd
992							# 6 cdsStart
993							# 7 cdsEnd
994							# 8 exonCount
995							# 9 exonStarts
996							# 10 exonEnds
997							# 11 score
998							# 12 name2
999							# 13 cdsStartStat
1000							# 14 cdsEndStat
1001							# 15 exonFrames
1002
1003	87					142	$format = 'genePredExt with bin';
1004	87					209	$self{name} = $linedata[1];
1005	87					163	$self{chrom} = $linedata[2];
1006	87					152	$self{strand} = $linedata[3];
1007	87					251	$self{txStart} = $linedata[4] + 1;
1008	87					154	$self{txEnd} = $linedata[5];
1009	87					149	$self{cdsStart} = $linedata[6] + 1;
1010	87					143	$self{cdsEnd} = $linedata[7];
1011	87					229	$self{exonCount} = $linedata[8];
1012	87					143	$self{exonStarts} = $linedata[9];
1013	87					143	$self{exonEnds} = $linedata[10];
1014	87		50			211	$self{name2} = $linedata[12] \|\| undef;
1015	87		50			422	$self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] \|\|
1016							$linedata[12] \|\| undef;
1017	87		50			355	$self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] \|\| undef;
1018	87		50			331	$self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] \|\| undef;
1019	87		50			278	$self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] \|\| undef;
1020	87	50				241	if ($linedata[1] =~ /^N[MR]_\d+/) {
1021	0					0	$self{refseq} = $linedata[1];
1022							}
1023							}
1024							### Extended Gene Prediction Table ###
1025							elsif (scalar @linedata == 15) {
1026							# an extended gene prediction table, e.g. refGene, ensGene, xenoRefGene
1027							# without the bin value
1028
1029							# 0 name
1030							# 1 chrom
1031							# 2 strand
1032							# 3 txStart
1033							# 4 txEnd
1034							# 5 cdsStart
1035							# 6 cdsEnd
1036							# 7 exonCount
1037							# 8 exonStarts
1038							# 9 exonEnds
1039							# 10 score
1040							# 11 name2
1041							# 12 cdsStartStat
1042							# 13 cdsEndStat
1043							# 14 exonFrames
1044
1045	0					0	$format = 'genePredExt';
1046	0					0	$self{name} = $linedata[0];
1047	0					0	$self{chrom} = $linedata[1];
1048	0					0	$self{strand} = $linedata[2];
1049	0					0	$self{txStart} = $linedata[3] + 1;
1050	0					0	$self{txEnd} = $linedata[4];
1051	0					0	$self{cdsStart} = $linedata[5] + 1;
1052	0					0	$self{cdsEnd} = $linedata[6];
1053	0					0	$self{exonCount} = $linedata[7];
1054	0					0	$self{exonStarts} = $linedata[8];
1055	0					0	$self{exonEnds} = $linedata[9];
1056	0		0			0	$self{name2} = $linedata[11] \|\| undef;
1057	0		0			0	$self{gene_name} = $ucsc->{ensembldata}->{ $linedata[0] }->[0] \|\|
1058							$linedata[11] \|\| undef;
1059	0		0			0	$self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] \|\| undef;
1060	0		0			0	$self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] \|\| undef;
1061	0		0			0	$self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] \|\| undef;
1062	0	0				0	if ($linedata[0] =~ /^N[MR]_\d+/) {
1063	0					0	$self{refseq} = $linedata[0];
1064							}
1065							}
1066							### Known Gene Table ###
1067							elsif (scalar @linedata == 12) {
1068
1069							# 0 name known gene identifier
1070							# 1 chrom Reference sequence chromosome or scaffold
1071							# 2 strand + or - for strand
1072							# 3 txStart Transcription start position
1073							# 4 txEnd Transcription end position
1074							# 5 cdsStart Coding region start
1075							# 6 cdsEnd Coding region end
1076							# 7 exonCount Number of exons
1077							# 8 exonStarts Exon start positions
1078							# 9 exonEnds Exon end positions
1079							# 10 proteinID UniProt display ID for Known Genes, UniProt accession or RefSeq protein ID for UCSC Genes
1080							# 11 alignID Unique identifier for each (known gene, alignment position) pair
1081
1082	0					0	$format = 'knownGene';
1083	0		0			0	$self{name} = $ucsc->{kgxref}->{ $linedata[0] }->[0] \|\| $linedata[0];
1084	0					0	$self{chrom} = $linedata[1];
1085	0					0	$self{strand} = $linedata[2];
1086	0					0	$self{txStart} = $linedata[3] + 1;
1087	0					0	$self{txEnd} = $linedata[4];
1088	0					0	$self{cdsStart} = $linedata[5] + 1;
1089	0					0	$self{cdsEnd} = $linedata[6];
1090	0					0	$self{exonCount} = $linedata[7];
1091	0					0	$self{exonStarts} = $linedata[8];
1092	0					0	$self{exonEnds} = $linedata[9];
1093	0					0	$self{name2} = $linedata[0];
1094							$self{gene_name} = $ucsc->{kgxref}->{ $linedata[0] }->[3] \|\| # geneSymbol
1095							$ucsc->{kgxref}->{ $linedata[0] }->[0] \|\| # mRNA id
1096	0		0			0	$ucsc->{kgxref}->{ $linedata[0] }->[4] \|\| # refSeq id
1097							$linedata[0]; # ugly default
1098	0		0			0	$self{note} = $ucsc->{kgxref}->{ $linedata[0] }->[6] \|\| undef;
1099	0		0			0	$self{refseq} = $ucsc->{kgxref}->{ $linedata[0] }->[4] \|\| undef;
1100	0		0			0	$self{status} = $ucsc->{refseqstat}->{ $self{refseq} }->[0] \|\| undef;
1101	0		0			0	$self{completeness} = $ucsc->{refseqsum}->{ $self{refseq} }->[0] \|\| undef;
1102	0		0			0	$self{spid} = $ucsc->{kgxref}->{ $linedata[0] }->[1] \|\| undef; # SwissProt ID
1103	0		0			0	$self{spdid} = $ucsc->{kgxref}->{ $linedata[0] }->[2] \|\| undef; # SwissProt display ID
1104	0		0			0	$self{protacc} = $ucsc->{kgxref}->{ $linedata[0] }->[5] \|\| undef; # NCBI protein accession
1105							}
1106							### refFlat or Gene Prediction Table ###
1107							elsif (scalar @linedata == 11) {
1108
1109							# 0 name2 or gene name
1110							# 1 name or transcript name
1111							# 2 chrom
1112							# 3 strand
1113							# 4 txStart
1114							# 5 txEnd
1115							# 6 cdsStart
1116							# 7 cdsEnd
1117							# 8 exonCount
1118							# 9 exonStarts
1119							# 10 exonEnds
1120
1121	0					0	$format = 'refFlat';
1122	0		0			0	$self{gene_name} = $ucsc->{ensembldata}->{ $linedata[1] }->[0] \|\|
1123							$linedata[0] \|\| undef;
1124	0					0	$self{name2} = $linedata[0];
1125	0					0	$self{name} = $linedata[1];
1126	0					0	$self{chrom} = $linedata[2];
1127	0					0	$self{strand} = $linedata[3];
1128	0					0	$self{txStart} = $linedata[4] + 1;
1129	0					0	$self{txEnd} = $linedata[5];
1130	0					0	$self{cdsStart} = $linedata[6] + 1;
1131	0					0	$self{cdsEnd} = $linedata[7];
1132	0					0	$self{exonCount} = $linedata[8];
1133	0					0	$self{exonStarts} = $linedata[9];
1134	0					0	$self{exonEnds} = $linedata[10];
1135	0		0			0	$self{note} = $ucsc->{refseqsum}->{ $linedata[1] }->[1] \|\| undef;
1136	0		0			0	$self{status} = $ucsc->{refseqstat}->{ $linedata[1] }->[0] \|\| undef;
1137	0		0			0	$self{completeness} = $ucsc->{refseqsum}->{ $linedata[1] }->[0] \|\| undef;
1138	0	0				0	if ($linedata[1] =~ /^N[MR]_\d+/) {
1139	0					0	$self{refseq} = $linedata[1];
1140							}
1141							}
1142							### Gene Prediction Table ###
1143							elsif (scalar @linedata == 10) {
1144							# a simple gene prediction table, e.g. refGene, ensGene, xenoRefGene
1145
1146							# 0 name
1147							# 1 chrom
1148							# 2 strand
1149							# 3 txStart
1150							# 4 txEnd
1151							# 5 cdsStart
1152							# 6 cdsEnd
1153							# 7 exonCount
1154							# 8 exonStarts
1155							# 9 exonEnds
1156
1157	86					129	$format = 'genePred';
1158	86					211	$self{name} = $linedata[0];
1159	86					185	$self{chrom} = $linedata[1];
1160	86					152	$self{strand} = $linedata[2];
1161	86					185	$self{txStart} = $linedata[3] + 1;
1162	86					142	$self{txEnd} = $linedata[4];
1163	86					141	$self{cdsStart} = $linedata[5] + 1;
1164	86					153	$self{cdsEnd} = $linedata[6];
1165	86					250	$self{exonCount} = $linedata[7];
1166	86					143	$self{exonStarts} = $linedata[8];
1167	86					151	$self{exonEnds} = $linedata[9];
1168	86					126	$self{name2} = $linedata[0]; # re-use transcript name
1169	86					140	$self{gene_name} = $linedata[0]; # re-use transcript name
1170	86		50			405	$self{note} = $ucsc->{refseqsum}->{ $linedata[0] }->[1] \|\| undef;
1171	86		50			315	$self{status} = $ucsc->{refseqstat}->{ $linedata[0] }->[0] \|\| undef;
1172	86		50			268	$self{completeness} = $ucsc->{refseqsum}->{ $linedata[0] }->[0] \|\| undef;
1173	86	50				216	if ($linedata[0] =~ /^N[MR]_\d+/) {
1174	0					0	$self{refseq} = $linedata[0];
1175							}
1176							}
1177							else {
1178							# unrecognized line format
1179	0					0	carp "unrecognized format, line has " . scalar @linedata . "elements";
1180	0					0	return;
1181							}
1182
1183							# verify
1184	173					353	my @errors;
1185	173	50				612	push @errors, 'strand' unless $self{strand} =~ /^[\+\-]$/;
1186	173	50				651	push @errors, 'txStart' unless $self{txStart} =~ /^\d+$/;
1187	173	50				487	push @errors, 'txEnd' unless $self{txEnd} =~ /^\d+$/;
1188	173	50				555	push @errors, 'cdsStart' unless $self{cdsStart} =~ /^\d+$/;
1189	173	50				479	push @errors, 'cdsEnd' unless $self{cdsEnd} =~ /^\d+$/;
1190	173	50				456	push @errors, 'exonCount' unless $self{exonCount} =~ /^\d+$/;
1191	173	50				455	push @errors, 'exonStarts' unless $self{exonStarts} =~ /^[\d,]+$/;
1192	173	50				497	push @errors, 'exonEnds' unless $self{exonEnds} =~ /^[\d,]+$/;
1193	173	50				400	if (@errors) {
1194	0					0	warn "line format for $format has the following errors: @errors";
1195	0					0	return;
1196							}
1197
1198							# fix values
1199	173	100				454	$self{strand} = $self{strand} eq '+' ? 1 : -1;
1200	173					582	$self{exonStarts} = [ map {$_ += 1} ( split ",", $self{exonStarts} ) ];
	862					1908
1201	173					726	$self{exonEnds} = [ ( split ",", $self{exonEnds} ) ];
1202
1203							# Attempt to identify the transcript type
1204	173		100			711	my $type = $ucsc->{ensembldata}->{ $self{name} }->[1] \|\| undef;
1205							# check if we have loaded ensembl source data and use that if available
1206	173	100				486	if ( $self{cdsStart} - 1 == $self{cdsEnd} ) {
1207							# there appears to be no coding potential when
1208							# txEnd = cdsStart = cdsEnd
1209							# if you'll look, all of the exon phases should also be -1
1210
1211	63	100				313	if ($type) {
		50
		50
		50
1212							# we have an ensembl source type, so prefer to use that
1213	21					93	$self{type} = $type;
1214							}
1215
1216							# otherwise, we may be able to infer some certain
1217							# types from the gene name
1218							elsif ($self{name2} =~ /^mir/i) {
1219							# a noncoding gene whose name begins with mir is likely a micro RNA
1220	0					0	$self{type} = 'miRNA';
1221							}
1222							elsif ($self{name2} =~ /^snr/i) {
1223							# a noncoding gene whose name begins with snr is likely a snRNA
1224	0					0	$self{type} = 'snRNA';
1225							}
1226							elsif ($self{name2} =~ /^sno/i) {
1227							# a noncoding gene whose name begins with sno is likely a snoRNA
1228	0					0	$self{type} = 'snoRNA';
1229							}
1230							else {
1231							# a generic ncRNA
1232	42					142	$self{type} = 'ncRNA';
1233							}
1234							}
1235							else {
1236							# the transcript has an identifiable CDS so likely a mRNA
1237	110	100				504	$self{type} = defined $type ? $type : 'mRNA';
1238							}
1239
1240							# add the ucsc object
1241	173					307	$self{ucsc} = $ucsc;
1242
1243	173					757	return bless \%self, $class;
1244							}
1245
1246							sub name {
1247	692			692		2104	return shift->{name};
1248							}
1249
1250							sub name2 {
1251	229			229		571	return shift->{name2};
1252							}
1253
1254							sub gene_name {
1255	371			371		1014	return shift->{gene_name};
1256							}
1257
1258							sub chrom {
1259	195			195		616	return shift->{chrom};
1260							}
1261
1262							sub txStart {
1263	291			291		783	return shift->{txStart};
1264							}
1265
1266							sub txEnd {
1267	295			295		828	return shift->{txEnd};
1268							}
1269
1270							sub strand {
1271	243			243		680	return shift->{strand};
1272							}
1273
1274							sub cdsStart {
1275	811			811		2031	return shift->{cdsStart};
1276							}
1277
1278							sub cdsEnd {
1279	409			409		1105	return shift->{cdsEnd};
1280							}
1281
1282							sub exonCount {
1283	901			901		14974	return shift->{exonCount};
1284							}
1285
1286							sub exonStarts {
1287	1412			1412		4437	return shift->{exonStarts};
1288							}
1289
1290							sub exonEnds {
1291	1014			1014		2049	return shift->{exonEnds};
1292							}
1293
1294							sub type {
1295	120			120		255	return shift->{type};
1296							}
1297
1298							sub refseq {
1299	243			243		360	my $self = shift;
1300	243	50				645	return exists $self->{refseq} ? $self->{refseq} : undef;
1301							}
1302
1303							sub note {
1304	243			243		382	my $self = shift;
1305	243	50				704	return exists $self->{note} ? $self->{note} : undef;
1306							}
1307
1308							sub status {
1309	173			173		266	my $self = shift;
1310	173	50				473	return exists $self->{status} ? $self->{status} : undef;
1311							}
1312
1313							sub completeness {
1314	173			173		243	my $self = shift;
1315	173	50				421	return exists $self->{completeness} ? $self->{completeness} : undef;
1316							}
1317
1318							sub ucsc {
1319	493			493		889	return shift->{ucsc};
1320							}
1321
1322							sub build_gene {
1323	70			70		107	my $self = shift;
1324
1325							# shortcuts
1326	70					133	my $ucsc = $self->ucsc;
1327	70					115	my $id2count = $ucsc->{id2count};
1328	70					103	my $ensembldata = $ucsc->{ensembldata};
1329
1330							# find a pre-existing gene to update or build a new one
1331	70					156	my $gene = $self->find_gene;
1332	70	100				144	if ($gene) {
1333							# update as necessary
1334	48	50				82	if ( ($self->txStart) < $gene->start) {
1335							# update the transcription start position
1336	0					0	$gene->start( $self->txStart );
1337							}
1338	48	100				185	if ($self->txEnd > $gene->end) {
1339							# update the transcription stop position
1340	4					51	$gene->end( $self->txEnd );
1341							}
1342							}
1343							else {
1344							# build a new gene
1345	22					52	$gene = $SFCLASS->new(
1346							-seq_id => $self->chrom,
1347							-source => $ucsc->source,
1348							-primary_tag => 'gene',
1349							-start => $self->txStart,
1350							-end => $self->txEnd,
1351							-strand => $self->strand,
1352							-phase => '.',
1353							-display_name => $self->gene_name,
1354							);
1355	22					405	$ucsc->{counts}->{gene} += 1;
1356
1357							# Add a unique primary ID
1358	22					52	my $id = $self->name2;
1359	22	50				63	if (exists $id2count->{ lc $id }) {
1360							# we've encountered this gene ID before
1361
1362							# then make name unique by appending the count number
1363	0					0	$id2count->{ lc $id } += 1;
1364	0					0	$id .= '.' . $id2count->{ lc $id };
1365							}
1366							else {
1367							# this is the first transcript with this id
1368							# set the id counter
1369	22					69	$id2count->{lc $id} = 0;
1370							}
1371	22					71	$gene->primary_id($id);
1372
1373							# Add an alias
1374	22	100				82	if ($self->name2 ne $self->gene_name) {
1375	12					25	$gene->add_tag_value('Alias', $self->name2);
1376							}
1377							}
1378
1379							# now build the transcript for the gene
1380	70					293	my $transcript = $self->build_transcript($gene);
1381	70					200	$gene->add_SeqFeature($transcript);
1382	70					1775	$transcript->add_tag_value('Parent', $gene->primary_id);
1383
1384							# update extra attributes as necessary
1385	70					434	$self->update_attributes($gene);
1386
1387							# finished
1388	70					199	return $gene;
1389							}
1390
1391							sub build_transcript {
1392	173			173		400	my ($self, $gene) = @_; # gene is not required
1393
1394							# shortcuts
1395	173					338	my $ucsc = $self->ucsc;
1396	173					283	my $id2count = $ucsc->{id2count};
1397	173					287	my $ensembldata = $ucsc->{ensembldata};
1398	173					267	my $counts = $ucsc->{counts};
1399
1400							# Uniqueify the transcript ID and name
1401	173					339	my $id = $self->name;
1402	173	50				484	if (exists $id2count->{ lc $id } ) {
1403							# we've encountered this transcript ID before
1404
1405							# now need to make ID unique by appending a number
1406	0					0	$id2count->{ lc $id } += 1;
1407	0					0	$id .= '.' . $id2count->{ lc $id };
1408							}
1409							else {
1410							# this is the first transcript with this id
1411	173					452	$id2count->{lc $id} = 0;
1412							}
1413
1414							# identify the primary_tag value
1415	173					281	my ($type, $biotype);
1416	173	50				327	if (exists $ensembldata->{$self->name}) {
1417	173		100			331	my $t = $ensembldata->{$self->name}->[1] \|\| undef;
1418	173	100	100			966	if ($t and $t =~ /protein.coding/i) {
		100	100
		100
1419	26					53	$type = 'mRNA';
1420	26					46	$biotype = $t;
1421							}
1422							elsif ($t and $t =~ /rna\|transcript/i) {
1423	15					30	$type = $t;
1424	15					30	$biotype = $t;
1425							}
1426							elsif ($t) {
1427	12					41	$type = 'transcript';
1428	12					21	$biotype = $t;
1429							}
1430							else {
1431	120					239	$type = $self->type;
1432							}
1433							}
1434							else {
1435	0					0	$type = $self->type;
1436							}
1437
1438							# Generate the transcript SeqFeature object
1439	173					427	my $transcript = $SFCLASS->new(
1440							-seq_id => $self->chrom,
1441							-source => $ucsc->source,
1442							-primary_tag => $type,
1443							-start => $self->txStart,
1444							-end => $self->txEnd,
1445							-strand => $self->strand,
1446							-phase => '.',
1447							-display_name => $self->name,
1448							-primary_id => $id,
1449							);
1450
1451							# add gene name as an alias
1452	173	100				2615	if ($self->gene_name ne $self->name2) {
1453	36					67	$transcript->add_tag_value('Alias', $self->gene_name);
1454							}
1455
1456							# update extra attributes as necessary
1457	173					636	$self->update_attributes($transcript);
1458
1459							# add transcript specific attributes
1460	173	50				316	if (defined $self->completeness ) {
1461	0					0	$transcript->add_tag_value( 'completeness', $self->completeness );
1462							}
1463	173	50				345	if (defined $self->status ) {
1464	0					0	$transcript->add_tag_value( 'status', $self->status );
1465							}
1466	173	100				368	if ($biotype) {
1467	53					124	$transcript->add_tag_value( 'biotype', $biotype);
1468							}
1469
1470							# add the exons
1471	173	100				529	if ($ucsc->do_exon) {
1472	122					257	$self->add_exons($transcript, $gene);
1473							}
1474
1475							# add CDS, UTRs, and codons if necessary
1476	173	100				412	if ( $self->cdsStart - 1 != $self->cdsEnd ) {
1477
1478	110	100				287	if ($ucsc->do_utr) {
1479	10					25	$self->add_utrs($transcript, $gene);
1480							}
1481
1482	110	50				247	if ($ucsc->do_codon) {
1483	0					0	$self->add_codons($transcript, $gene);
1484							}
1485
1486	110	100				238	if ($ucsc->do_cds) {
1487	10					24	$self->add_cds($transcript);
1488							}
1489							}
1490
1491							# record the type of transcript
1492	173					458	$counts->{$type} += 1;
1493
1494							# transcript is complete
1495	173					464	return $transcript;
1496							}
1497
1498							sub update_attributes {
1499	243			243		457	my ($self, $seqf) = @_;
1500
1501							# add Note if possible
1502	243	50				503	if (defined $self->note ) {
1503	0					0	$self->add_unique_attribute($seqf, 'Note', $self->note );
1504							}
1505
1506							# add refSeq identifier if possible
1507	243	50				506	if (defined $self->refseq) {
1508	0					0	$self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->refseq);
1509							}
1510
1511							# add SwissProt identifier if possible
1512	243	50	33			587	if (exists $self->{spid} and defined $self->{spid}) {
1513	0					0	$self->add_unique_attribute($seqf, 'Dbxref', 'Swiss-Prot:' . $self->{spid});
1514							}
1515
1516							# add SwissProt display identifier if possible
1517	243	50	33			603	if (exists $self->{spdid} and defined $self->{spdid}) {
1518	0					0	$self->add_unique_attribute($seqf, 'swiss-prot_display_id', $self->{spdid});
1519							}
1520
1521							# add NCBI protein access identifier if possible
1522	243	50	33			585	if (exists $self->{protacc} and defined $self->{protacc}) {
1523	0					0	$self->add_unique_attribute($seqf, 'Dbxref', 'RefSeq:' . $self->{protacc});
1524							}
1525							}
1526
1527							sub add_unique_attribute {
1528	0			0		0	my ($self, $seqf, $tag, $value) = @_;
1529
1530							# look for a pre-existing identical tag value
1531	0					0	my $check = 1;
1532	0					0	foreach ($seqf->get_tag_values($tag)) {
1533	0	0				0	if ($_ eq $value) {
1534	0					0	$check = 0;
1535	0					0	last;
1536							}
1537							}
1538
1539							# add it if our value is unique
1540	0	0				0	$seqf->add_tag_value($tag, $value) if $check;
1541							}
1542
1543							sub add_exons {
1544	122			122		230	my ($self, $transcript, $gene) = @_;
1545	122					219	my $ucsc = $self->ucsc;
1546
1547							# Add the exons
1548							EXON_LOOP:
1549	122					311	for (my $i = 0; $i < $self->exonCount; $i++) {
1550
1551							# first look for existing
1552	595	100	100			1259	if ($ucsc->share and $gene) {
1553	277					517	my $exon = $self->find_existing_subfeature($gene, 'exon',
1554							$self->exonStarts->[$i], $self->exonEnds->[$i]);
1555	277	100				606	if ($exon) {
1556							# we found an existing exon to reuse
1557							# associate with this transcript
1558	156					404	$transcript->add_SeqFeature($exon);
1559	156					4939	next EXON_LOOP;
1560							}
1561							}
1562
1563							# transform index for reverse strands
1564							# this will allow numbering from 5'->3'
1565	439					657	my $number;
1566	439	100				890	if ($transcript->strand == 1) {
1567							# forward strand
1568	427					1549	$number = $i;
1569							}
1570							else {
1571							# reverse strand
1572	12					72	$number = abs( $i - $self->exonCount + 1);
1573							}
1574
1575							# build the exon seqfeature
1576	439					945	my $exon = $SFCLASS->new(
1577							-seq_id => $transcript->seq_id,
1578							-source => $transcript->source,
1579							-primary_tag => 'exon',
1580							-start => $self->exonStarts->[$i],
1581							-end => $self->exonEnds->[$i],
1582							-strand => $transcript->strand,
1583							-primary_id => $transcript->primary_id . ".exon$number",
1584							);
1585
1586							# add name if requested
1587	439	50				10204	if ($ucsc->do_name) {
1588	0					0	$exon->display_name( $transcript->display_name . ".exon$number" );
1589							}
1590
1591							# associate with transcript
1592	439					1018	$transcript->add_SeqFeature($exon);
1593							}
1594							}
1595
1596							sub add_utrs {
1597	10			10		22	my ($self, $transcript, $gene) = @_;
1598	10					21	my $ucsc = $self->ucsc;
1599
1600							# we will scan each exon and look for a potential utr and build it
1601	10					17	my @utrs;
1602							UTR_LOOP:
1603	10					23	for (my $i = 0; $i < $self->exonCount; $i++) {
1604
1605							# transform index for reverse strands
1606							# this will allow numbering from 5'->3'
1607	76					106	my $number;
1608	76	50				149	if ($transcript->strand == 1) {
1609							# forward strand
1610	76					111	$number = $i;
1611							}
1612							else {
1613							# reverse strand
1614	0					0	$number = abs( $i - $self->exonCount + 1);
1615							}
1616
1617							# identify UTRs
1618							# we will identify by comparing the cdsStart and cdsStop relative
1619							# to the exon coordinates
1620							# the primary tag is determined by the exon strand orientation
1621	76					167	my ($start, $stop, $tag);
1622							# in case we need to build two UTRs
1623	76					0	my ($start2, $stop2, $tag2);
1624
1625							# Split 5'UTR, CDS, and 3'UTR all on the same exon
1626	76	50	66			121	if (
		100	100
		100	66
		100	66
		100	66
		50	33
1627							$self->exonStarts->[$i] < $self->cdsStart
1628							and
1629							$self->exonEnds->[$i] > $self->cdsEnd
1630							) {
1631							# the CDS is entirely within the exon, resulting in two UTRs
1632							# on either side of the exon
1633							# we must build two UTRs
1634
1635							# the left UTR
1636	0					0	$start = $self->exonStarts->[$i];
1637	0					0	$stop = $self->cdsStart - 1;
1638	0	0				0	$tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1639
1640							# the right UTR
1641	0					0	$start2 = $self->cdsEnd + 1;
1642	0					0	$stop2 = $self->exonEnds->[$i];
1643	0	0				0	$tag2 = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1644							}
1645
1646							# 5'UTR forward, 3'UTR reverse
1647							elsif (
1648							$self->exonStarts->[$i] < $self->cdsStart
1649							and
1650							$self->exonEnds->[$i] < $self->cdsStart
1651							) {
1652							# the exon start/end is entirely before the cdsStart
1653	3					5	$start = $self->exonStarts->[$i];
1654	3					6	$stop = $self->exonEnds->[$i];
1655	3	50				6	$tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1656							}
1657
1658							# Split 5'UTR & CDS on forward, 3'UTR & CDS
1659							elsif (
1660							$self->exonStarts->[$i] < $self->cdsStart
1661							and
1662							$self->exonEnds->[$i] >= $self->cdsStart
1663							) {
1664							# the start/stop codon is in this exon
1665							# we need to make the UTR out of a portion of this exon
1666	9					19	$start = $self->exonStarts->[$i];
1667	9					18	$stop = $self->cdsStart - 1;
1668	9	50				21	$tag = $transcript->strand == 1 ? 'five_prime_UTR' : 'three_prime_UTR';
1669							}
1670
1671							# CDS only
1672							elsif (
1673							$self->exonStarts->[$i] >= $self->cdsStart
1674							and
1675							$self->exonEnds->[$i] <= $self->cdsEnd
1676							) {
1677							# CDS only exon
1678	50					118	next UTR_LOOP;
1679							}
1680
1681							# Split 3'UTR & CDS on forward, 5'UTR & CDS
1682							elsif (
1683							$self->exonStarts->[$i] <= $self->cdsEnd
1684							and
1685							$self->exonEnds->[$i] > $self->cdsEnd
1686							) {
1687							# the stop/start codon is in this exon
1688							# we need to make the UTR out of a portion of this exon
1689	7					14	$start = $self->cdsEnd + 1;
1690	7					25	$stop = $self->exonEnds->[$i];
1691	7	50				18	$tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1692							}
1693
1694							# 3'UTR forward, 5'UTR reverse
1695							elsif (
1696							$self->exonStarts->[$i] > $self->cdsEnd
1697							and
1698							$self->exonEnds->[$i] > $self->cdsEnd
1699							) {
1700							# the exon start/end is entirely after the cdsStop
1701							# we have a 3'UTR
1702	7					12	$start = $self->exonStarts->[$i];
1703	7					13	$stop = $self->exonEnds->[$i];
1704	7	50				17	$tag = $transcript->strand == 1 ? 'three_prime_UTR' : 'five_prime_UTR';
1705							}
1706
1707							# Something else?
1708							else {
1709	0					0	my $warning = "Warning: A malformed UTR that doesn't match known criteria: ";
1710	0					0	$warning .= "cdsStart " . $self->cdsStart;
1711	0					0	$warning .= ", cdsEnd " . $self->cdsEnd;
1712	0					0	$warning .= ", exonStart " . $self->exonStarts->[$i];
1713	0					0	$warning .= ", exonEnd " . $self->exonEnds->[$i];
1714	0					0	warn $warning;
1715	0					0	next UTR_LOOP;
1716							}
1717
1718							## Generate the UTR objects
1719	26					50	my $utr;
1720
1721							# look for existing utr
1722	26	50	33			52	if ($ucsc->share and $gene) {
1723	26					53	$utr = $self->find_existing_subfeature($gene, $tag, $start, $stop);
1724							}
1725
1726							# otherwise build the UTR object
1727	26	100				57	unless ($utr) {
1728	19					87	$utr = $SFCLASS->new(
1729							-seq_id => $transcript->seq_id,
1730							-source => $transcript->source,
1731							-start => $start,
1732							-end => $stop,
1733							-strand => $transcript->strand,
1734							-phase => '.',
1735							-primary_tag => $tag,
1736							-primary_id => $transcript->primary_id . ".utr$number",
1737							);
1738	19	50				100	$utr->display_name( $transcript->display_name . ".utr$number" ) if
1739							$ucsc->do_name;
1740							}
1741
1742							# store this utr seqfeature in a temporary array
1743	26					49	push @utrs, $utr;
1744
1745							# build a second UTR object as necessary
1746	26	50				82	if ($start2) {
1747	0					0	my $utr2;
1748
1749							# look for existing utr
1750	0	0				0	if ($ucsc->share) {
1751	0					0	$utr2 = $self->find_existing_subfeature($gene, $tag2, $start2, $stop2);
1752							}
1753
1754							# otherwise build the utr
1755	0	0				0	unless ($utr2) {
1756	0					0	$utr2 = $SFCLASS->new(
1757							-seq_id => $transcript->seq_id,
1758							-source => $transcript->source,
1759							-start => $start2,
1760							-end => $stop2,
1761							-strand => $transcript->strand,
1762							-phase => '.',
1763							-primary_tag => $tag2,
1764							-primary_id => $transcript->primary_id . ".utr$number" . "a",
1765							);
1766	0	0				0	$utr2->display_name( $transcript->display_name . ".utr$number" . "a" )
1767							if $ucsc->do_name;
1768							}
1769
1770							# store this utr seqfeature in a temporary array
1771	0					0	push @utrs, $utr2;
1772							}
1773							}
1774
1775							# associate found UTRs with the transcript
1776	10					23	foreach my $utr (@utrs) {
1777	26					51	$transcript->add_SeqFeature($utr);
1778							}
1779							}
1780
1781							sub add_cds {
1782	10			10		22	my ($self, $transcript) = @_;
1783
1784							# we will NOT collapse CDS features since we cannot guarantee that a shared
1785							# CDS will have the same phase, since phase is dependent on the translation
1786							# start
1787
1788							# we will scan each exon and look for a potential CDS and build it
1789	10					12	my @cdss;
1790	10					16	my $phase = 0; # initialize CDS phase and keep track as we process CDSs
1791							CDS_LOOP:
1792	10					22	for (my $i = 0; $i < $self->exonCount; $i++) {
1793
1794							# transform index for reverse strands
1795	76					112	my $j;
1796	76	50				152	if ($transcript->strand == 1) {
1797							# forward strand
1798	76					102	$j = $i;
1799							}
1800							else {
1801							# reverse strand
1802							# flip the index for exon starts and stops so that we
1803							# always progress 5' -> 3'
1804							# this ensures the phase is accurate from the start codon
1805	0					0	$j = abs( $i - $self->exonCount + 1);
1806							}
1807
1808							# identify CDSs
1809							# we will identify by comparing the cdsStart and cdsStop relative
1810							# to the exon coordinates
1811	76					124	my ($start, $stop);
1812
1813							# Split 5'UTR, CDS, and 3'UTR all on the same exon
1814	76	50	66			136	if (
		100	100
		100	66
		100	66
		100	66
		50	33
1815							$self->exonStarts->[$j] < $self->cdsStart
1816							and
1817							$self->exonEnds->[$j] > $self->cdsEnd
1818							) {
1819							# exon contains the entire CDS
1820	0					0	$start = $self->cdsStart;
1821	0					0	$stop = $self->cdsEnd;
1822							}
1823
1824							# 5'UTR forward, 3'UTR reverse
1825							elsif (
1826							$self->exonStarts->[$j] < $self->cdsStart
1827							and
1828							$self->exonEnds->[$j] < $self->cdsStart
1829							) {
1830							# no CDS in this exon
1831	3					8	next CDS_LOOP;
1832							}
1833
1834							# Split 5'UTR & CDS on forward, 3'UTR & CDS
1835							elsif (
1836							$self->exonStarts->[$j] < $self->cdsStart
1837							and
1838							$self->exonEnds->[$j] >= $self->cdsStart
1839							) {
1840							# the start/stop codon is in this exon
1841							# we need to make the CDS out of a portion of this exon
1842	9					17	$start = $self->cdsStart;
1843	9					16	$stop = $self->exonEnds->[$j];
1844							}
1845
1846							# CDS only
1847							elsif (
1848							$self->exonStarts->[$j] >= $self->cdsStart
1849							and
1850							$self->exonEnds->[$j] <= $self->cdsEnd
1851							) {
1852							# entire exon is CDS
1853	50					85	$start = $self->exonStarts->[$j];
1854	50					93	$stop = $self->exonEnds->[$j];
1855							}
1856
1857							# Split 3'UTR & CDS on forward, 5'UTR & CDS
1858							elsif (
1859							$self->exonStarts->[$j] <= $self->cdsEnd
1860							and
1861							$self->exonEnds->[$j] > $self->cdsEnd
1862							) {
1863							# the stop/start codon is in this exon
1864							# we need to make the CDS out of a portion of this exon
1865	7					12	$start = $self->exonStarts->[$j];
1866	7					12	$stop = $self->cdsEnd;
1867							}
1868
1869							# 3'UTR forward, 5'UTR reverse
1870							elsif (
1871							$self->exonStarts->[$j] > $self->cdsEnd
1872							and
1873							$self->exonEnds->[$j] > $self->cdsEnd
1874							) {
1875							# the exon start/end is entirely after the cdsStop
1876							# we have entirely 5' or 3'UTR, no CDS
1877	7					19	next CDS_LOOP;
1878							}
1879
1880							# Something else?
1881							else {
1882	0					0	my $warning = "Warning: A malformed CDS that doesn't match known criteria: ";
1883	0					0	$warning .= "cdsStart " . $self->cdsStart;
1884	0					0	$warning .= ", cdsEnd " . $self->cdsEnd;
1885	0					0	$warning .= ", exonStart " . $self->exonStarts->[$j];
1886	0					0	$warning .= ", exonEnd " . $self->exonEnds->[$j];
1887	0					0	warn $warning;
1888	0					0	next CDS_LOOP;
1889							}
1890
1891							# build the CDS object
1892	66					174	my $cds = $SFCLASS->new(
1893							-seq_id => $transcript->seq_id,
1894							-source => $transcript->source,
1895							-start => $start,
1896							-end => $stop,
1897							-strand => $transcript->strand,
1898							-phase => $phase,
1899							-primary_tag => 'CDS',
1900							-primary_id => $transcript->primary_id . ".cds$i",
1901							-display_name => $transcript->display_name . ".cds$i",
1902							);
1903							# the id and name still use $i for labeling to ensure numbering from 0
1904
1905							# store this utr seqfeature in a temporary array
1906	66					170	push @cdss, $cds;
1907
1908							# reset the phase for the next CDS
1909							# phase + (3 - (length % 3)), readjust to 0..2 if necessary
1910							# adapted from Barry Moore's gtf2gff3.pl script
1911	66					154	$phase = $phase + (3 - ( $cds->length % 3) );
1912	66	100				217	$phase -=3 if $phase > 2;
1913							}
1914
1915							# associate found UTRs with the transcript
1916	10					21	foreach my $cds (@cdss) {
1917	66					127	$transcript->add_SeqFeature($cds);
1918							}
1919							}
1920
1921							sub add_codons {
1922	0			0		0	my ($self, $transcript, $gene) = @_;
1923	0					0	my $ucsc = $self->ucsc;
1924
1925							# generate the start and stop codons
1926	0					0	my ($start_codon, $stop_codon);
1927	0	0				0	if ($transcript->strand == 1) {
1928							# forward strand
1929
1930							# share codons if possible
1931	0	0	0			0	if ($ucsc->share and $gene) {
1932	0					0	$start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1933							$self->cdsStart, $self->cdsStart + 2);
1934	0					0	$stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1935							$self->cdsEnd - 2, $self->cdsEnd);
1936							}
1937
1938							# start codon
1939	0	0				0	unless ($start_codon) {
1940	0					0	$start_codon = $SFCLASS->new(
1941							-seq_id => $transcript->seq_id,
1942							-source => $transcript->source,
1943							-primary_tag => 'start_codon',
1944							-start => $self->cdsStart,
1945							-end => $self->cdsStart + 2,
1946							-strand => 1,
1947							-phase => 0,
1948							-primary_id => $transcript->primary_id . '.start_codon',
1949							);
1950	0	0				0	$start_codon->display_name( $transcript->display_name . '.start_codon' ) if
1951							$ucsc->do_name;
1952							}
1953
1954							# stop codon
1955	0	0				0	unless ($stop_codon) {
1956	0					0	$stop_codon = $SFCLASS->new(
1957							-seq_id => $transcript->seq_id,
1958							-source => $transcript->source,
1959							-primary_tag => 'stop_codon',
1960							-start => $self->cdsEnd - 2,
1961							-end => $self->cdsEnd,
1962							-strand => 1,
1963							-phase => 0,
1964							-primary_id => $transcript->primary_id . '.stop_codon',
1965							);
1966	0	0				0	$stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1967							$ucsc->do_name;
1968							}
1969							}
1970
1971							else {
1972							# reverse strand
1973
1974							# share codons if possible
1975	0	0	0			0	if ($ucsc->share and $gene) {
1976	0					0	$stop_codon = $self->find_existing_subfeature($gene, 'stop_codon',
1977							$self->cdsStart, $self->cdsStart + 2);
1978	0					0	$start_codon = $self->find_existing_subfeature($gene, 'start_codon',
1979							$self->cdsEnd - 2, $self->cdsEnd);
1980							}
1981
1982							# stop codon
1983	0	0				0	unless ($stop_codon) {
1984	0					0	$stop_codon = $SFCLASS->new(
1985							-seq_id => $transcript->seq_id,
1986							-source => $transcript->source,
1987							-primary_tag => 'stop_codon',
1988							-start => $self->cdsStart,
1989							-end => $self->cdsStart + 2,
1990							-strand => -1,
1991							-phase => 0,
1992							-primary_id => $transcript->primary_id . '.stop_codon',
1993							);
1994	0	0				0	$stop_codon->display_name( $transcript->display_name . '.stop_codon' ) if
1995							$ucsc->do_name;
1996							}
1997
1998							# start codon
1999	0	0				0	unless ($start_codon) {
2000	0					0	$start_codon = $SFCLASS->new(
2001							-seq_id => $transcript->seq_id,
2002							-source => $transcript->source,
2003							-primary_tag => 'start_codon',
2004							-start => $self->cdsEnd - 2,
2005							-end => $self->cdsEnd,
2006							-strand => -1,
2007							-phase => 0,
2008							-primary_id => $transcript->primary_id . '.start_codon',
2009							-display_name => $transcript->primary_id . '.start_codon',
2010							);
2011	0	0				0	$start_codon->display_name( $transcript->display_name . '.start_codon' ) if
2012							$ucsc->do_name;
2013							}
2014							}
2015
2016							# associate with transcript
2017	0					0	$transcript->add_SeqFeature($start_codon);
2018	0					0	$transcript->add_SeqFeature($stop_codon);
2019							}
2020
2021							sub find_gene {
2022	70			70		126	my $self = shift;
2023
2024							# check if a gene with this name exists
2025	70	100				120	if (exists $self->ucsc->{gene2seqf}->{lc $self->gene_name} ) {
2026							# we found a gene with the same name
2027							# pull out the gene seqfeature(s) array reference
2028							# there may be more than one gene
2029	48					111	my $genes = $self->ucsc->{gene2seqf}->{ lc $self->gene_name };
2030
2031							# check that the current transcript intersects with the gene
2032							# sometimes we can have two separate transcripts with the
2033							# same gene name, but located on opposite ends of the chromosome
2034							# part of a gene family, but unlikely the same gene 200 Mb in
2035							# length
2036	48					113	foreach my $g (@$genes) {
2037	48	50	33			135	if (
			33
2038							# overlap method borrowed from Bio::RangeI
2039							($g->strand == $self->strand) and not (
2040							$g->start > $self->txEnd or
2041							$g->end < $self->txStart
2042							)
2043							) {
2044							# gene and transcript overlap on the same strand
2045							# we found the intersecting gene
2046	48					138	return $g;
2047							}
2048							}
2049							}
2050	22					48	return;
2051							}
2052
2053							sub find_existing_subfeature {
2054	303			303		624	my ($self, $gene, $type, $start, $stop) = @_;
2055
2056							# we will try to find a pre-existing subfeature at identical coordinates
2057	303					657	foreach my $transcript ($gene->get_SeqFeatures()) {
2058							# walk through transcripts
2059	988					5179	foreach my $subfeature ($transcript->get_SeqFeatures()) {
2060							# walk through subfeatures of transcripts
2061	8284	100	100			35171	if (
			100
2062							$subfeature->primary_tag eq $type and
2063							$subfeature->start == $start and
2064							$subfeature->end == $stop
2065							) {
2066							# we found a match
2067	163					1532	return $subfeature;
2068							}
2069							}
2070							}
2071	140					743	return;
2072							}
2073
2074
2075
2076
2077							__END__