File Coverage

blib/lib/Bio/ToolBox/GeneTools.pm
Criterion Covered Total %
statement 275 572 48.0
branch 122 362 33.7
condition 16 88 18.1
subroutine 30 38 78.9
pod 30 33 90.9
total 473 1093 43.2


line stmt bran cond sub pod time code
1             package Bio::ToolBox::GeneTools;
2             our $VERSION = '1.67';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::GeneTools - SeqFeature agnostic methods for working with gene models
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::GeneTools qw(:all);
11            
12             my $gene; # a SeqFeatureI compliant gene object obtained elsewhere
13             # for example, from Bio::DB::SeqFeature::Store database
14             # or parsed from a GFF3, GTF, or UCSC-style gene table using
15             # Bio::ToolBox parsers
16            
17             if (is_coding($gene)) { # boolean test
18            
19             # collect all exons from all transcripts in gene
20             my @exons = get_exons($gene);
21            
22             # find just the alternate exons used only once
23             my @alternate_exons = get_alt_exons($gene);
24            
25             # collect UTRs, which may not be defined in the original source
26             my @utrs;
27             foreach my $t (get_transcripts($gene)) {
28             my @u = get_utrs($t);
29             push @utrs, @u;
30             }
31             }
32              
33              
34             =head1 DESCRIPTION
35              
36             This module provides numerous exportable functions for working with gene
37             SeqFeature models. This assumes that the gene models follow the BioPerl
38             L convention with nested SeqFeature objects representing the
39             gene, transcript, and exons. For example,
40              
41             gene
42             transcript
43             exon
44             CDS
45              
46             Depending upon how the SeqFeatures were generated or defined, subfeatures
47             may or may not be defined or be obvious. For example, UTRs or introns may
48             not be present. Furthermore, the C or type may not follow
49             Sequence Ontology terms. Regular expressions are deployed to handle
50             varying naming schemes and exceptions.
51              
52             These functions should work with most or all L compliant
53             objects. It has been tested with L,
54             L, and L classes.
55              
56             New SeqFeature objects that are generated use the same class for
57             simplicity and expediency.
58              
59             =head1 METHODS
60              
61             =head2 Function Import
62              
63             None of the functions are exported by default. Specify which ones you want
64             when you import the module. Alternatively, use one of the tags below.
65              
66             =over 4
67              
68             =item :all
69              
70             Import all of the methods.
71              
72             =item :exon
73              
74             Import all of the exon methods, including L, L,
75             L, L, and L.
76              
77             =item :intron
78              
79             Import all of the intron methods, including L, L,
80             L, L, and L.
81              
82             =item :transcript
83              
84             Import the transcript related methods, including L,
85             L, and L.
86              
87             =item :cds
88              
89             Import the CDS pertaining methods, including L, L,
90             L, L, L, and L.
91              
92             =item :export
93              
94             Import all of the export methods, including L, L,
95             L, and L;
96              
97             =item :filter
98              
99             Import all of the transcript filter methods, including L,
100             L, and L.
101              
102             =back
103              
104             =head2 Exon Methods
105              
106             Functions to get a list of exons from a gene or transcript
107              
108             =over 4
109              
110             =item get_exons
111              
112             my @exons = get_exons($gene);
113             my @exons = get_exons($transcript);
114              
115             This will return an array or array reference of all the exon subfeatures in
116             the SeqFeature object, either gene or transcript. No discrimination whether
117             they are used once or more than once. Non-defined exons can be assembled from
118             CDS and/or UTR subfeatures. Exons are sorted by start coordinate.
119              
120             =item get_alt_exons
121              
122             my @alternate_exons = get_alt_exons($gene);
123              
124             This will return an array or array reference of all the exon subfeatures for
125             a multi-transcript gene that are used only once in all of the transcripts.
126              
127             =item get_common_exons
128              
129             my @common_exons = get_common_exons($gene);
130              
131             This will return an array or array reference of all the exon subfeatures for
132             a multi-transcript gene that are used in all of the transcripts.
133              
134             =item get_uncommon_exons
135              
136             my @uncommon_exons = get_uncommon_exons($gene);
137              
138             This will return an array or array reference of all the exon subfeatures for
139             a multi-transcript gene that are used in some of the transcripts, i.e. more
140             than one but not all.
141              
142             =item get_alt_common_exons
143              
144             my %exon_hash = get_alt_common_exons($gene);
145              
146             This will return a hash reference with several keys, including "common",
147             "uncommon", and each of the transcript IDs. Each key value is an array
148             reference with the exons for that category. The "common" will be all
149             common exons, "uncommon" will be uncommon exons (used more than once but
150             less than all), and each transcript ID will include their specific alternate
151             exons (used only once).
152              
153             For genes with only a single transcript, all exons will be marked as "common"
154             for simplicity, although technically they could all be considered "alternate"
155             since they're only used once.
156              
157             =back
158              
159             =head2 Intron Methods
160              
161             Functions to get a list of introns from a gene or transcript. Introns are
162             not usually defined in gene annotation files, but are inferred from the
163             exons and total gene or transcript length. In this case, new SeqFeature
164             elements are generated for each intron.
165              
166             =over 4
167              
168             =item get_introns
169              
170             my @introns = get_introns($gene);
171             my @introns = get_introns($transcript);
172              
173             This will return an array or array reference of all the intron subfeatures in
174             the SeqFeature object, either gene or transcript. No discrimination whether
175             they are used once or more than once. Non-defined introns can be assembled from
176             CDS andEor UTR subfeatures. Introns are sorted by start coordinate.
177              
178             =item get_alt_introns
179              
180             my @alternate_introns = get_alt_introns($gene);
181              
182             This will return an array or array reference of all the intron subfeatures for
183             a multi-transcript gene that are used only once in all of the transcripts.
184              
185             =item get_common_introns
186              
187             my @common_introns = get_common_introns($gene);
188              
189             This will return an array or array reference of all the intron subfeatures for
190             a multi-transcript gene that are used in all of the transcripts.
191              
192             =item get_uncommon_introns
193              
194             my @uncommon_introns = get_uncommon_introns($gene);
195              
196             This will return an array or array reference of all the intron subfeatures for
197             a multi-transcript gene that are used in some of the transcripts, i.e. more
198             than one but not all.
199              
200             =item get_alt_common_introns
201              
202             my %intron_hash = get_alt_common_introns($gene);
203              
204             This will return a hash reference with several keys, including "common",
205             "uncommon", and each of the transcript IDs. Each key value is an array
206             reference with the introns for that category. The "common" will be all
207             common introns, "uncommon" will be uncommon introns (used more than once but
208             less than all), and each transcript ID will include their specific alternate
209             introns (used only once).
210              
211             For genes with only a single transcript, all introns will be marked as "common"
212             for simplicity, although technically they could all be considered "alternate"
213             since they're only used once.
214              
215             =back
216              
217             =head2 Transcript Methods
218              
219             These methods work on transcripts, typically alternate transcripts from a
220             gene SeqFeature.
221              
222             =over 4
223              
224             =item get_transcripts
225              
226             my @transcripts = get_transcripts($gene);
227              
228             Returns an array or array reference of the transcripts associated with a
229             gene feature.
230              
231             =item collapse_transcripts
232              
233             my $new_transcript = collapse_transcripts($gene);
234             my $new_transcript = collapse_transcripts($transcript1, $transcript2);
235              
236             This method will collapse all of the transcripts associated with a gene
237             SeqFeature into a single artificial transcript, merging exons as necessary
238             to maximize exon length and minimize introns. This is useful when
239             performing, for example, RNASeq analysis on genes. A single SeqFeature
240             transcript object is returned containing the merged exon subfeatures.
241              
242             Pass either a gene or a list of transcripts to collapse.
243              
244             =item get_transcript_length
245              
246             my $length = get_transcript_length($transcript);
247              
248             Calculates and returns the transcribed length of a transcript, i.e
249             the sum of its exon lengths. B If you pass a gene object, you
250             will get the maximum of all transcript exon lengths, which may not be
251             what you anticipate!
252              
253             =back
254              
255             =head2 CDS methods
256              
257             These methods calculate values related to the coding sequence of the
258             mRNA transcripts.
259              
260             =over 4
261              
262             =item is_coding
263              
264             if( is_coding($transcript) ) {
265             # do something
266             }
267              
268             This method will return a boolean value (1 or 0) if the passed transcript object
269             appears to be a coding transcript. GFF and GTF files are not always immediately
270             clear about the type of transcript; there are (unfortunately) multiple ways
271             to encode the feature as a protein coding transcript: C,
272             C, GFF attribute, presence of CDS subfeatures, etc.
273             This method checks all of these possibilities. B: If you pass a
274             multi-transcript gene, only one transcript need to be coding to pass a true
275             value.
276              
277             =item get_cds
278              
279             my @cds = get_cds($transcript);
280              
281             Returns the CDS subfeatures of the given transcript, if they are
282             defined. Returns either an array or array reference.
283              
284             =item get_cdsStart
285              
286             my $start = get_cdsStart($trancript);
287              
288             Returns the start coordinate of the CDS for the given transcript.
289             Note that this is the leftmost (smallest) coordinate of the CDS
290             and not necessarily the coordinate of the start codon, similar to
291             what the UCSC gene tables report. Use the transcript strand to
292             determine the 5' end.
293              
294             =item get_cdsEnd
295              
296             my $end = get_cdsEnd($trancript);
297              
298             Returns the stop coordinate of the CDS for the given transcript.
299             B that this is the rightmost (largest) coordinate of the CDS
300             and not necessarily the coordinate of the stop codon, similar to
301             what the UCSC gene tables report. Use the transcript strand to
302             determine which is the C<3'> and C<5'> end.
303              
304             =item get_start_codon
305              
306             my $start_codon = get_start_codon($trancript);
307              
308             Returns a SeqFeature object representing the start codon. If one is
309             not explicitly defined in the hierarchy, then a new object is generated.
310              
311             =item get_stop_codon
312              
313             my $stop_codon = get_stop_codon($transcript);
314              
315             Returns a SeqFeature object representing the stop codon. If one is
316             not defined in the hierarchy, then a new object is created. B that
317             this assumes that the stop codon is inclusive to the defined CDS, which is
318             the case with GFF3 and UCSC gene table derived features. On the other hand,
319             features derived from GTF is defined with the stop codon exclusive to the CDS.
320             This shouldn't matter with GTF, however, since GTF usually explicitly includes
321             stop codon features, whereas the other two formats do not.
322              
323             =item get_transcript_cds_length
324              
325             my $length = get_transcript_cds_length($transcript);
326              
327             Calculates and returns the length of the coding sequence for a
328             transcript, i.e. the sum of the CDS lengths.
329              
330             =item get_utrs
331              
332             my @utrs = get_utrs($trancript);
333              
334             Returns both C<5'> and C<3'> untranslated regions of the transcript. If these are
335             not defined in the SeqFeature subfeature hierarchy, then the coordinates will be
336             determined from from the exon and CDS subfeatures, if available, and new SeqFeature
337             objects generated. Non-coding transcripts will not return anything.
338              
339             =item get_5p_utrs
340              
341             my @5p_utrs = get_5p_utrs($trancript);
342              
343             Returns only the C<5'> untranslated regions of the transcript.
344              
345             =item get_3p_utrs($transcript)
346              
347             my @3p_utrs = get_3p_utrs($trancript);
348              
349             Returns only the C<3'> untranslated regions of the transcript.
350              
351             =back
352              
353             =head2 Export methods
354              
355             These methods are used for exporting a gene andEor transcript model into
356             a text string based on the specified format.
357              
358             =over 4
359              
360             =item gff_string
361              
362             my $string .= gff_string($gene, 1);
363             my $string .= gff_string($transcript, 1);
364              
365             This is just a convenience method. SeqFeature objects based on
366             L, L, or L
367             have a C method, and this will simply call that method. SeqFeature
368             objects that do not have this method will, of course, cause the script to
369             terminate.
370              
371             Pass the seqfeature object and a boolean value to recursively append all
372             subfeatures (e.g. exons) to the string. In most cases, this will generate a
373             GFF3-style string.
374              
375             L also provides a simplified gff_string method.
376              
377             =item gtf_string
378              
379             my $string .= gtf_string($gene);
380             my $string .= gtf_string($transcript);
381              
382             This will export a gene or transcript model as a series of GTF formatted
383             text lines, following the defined Gene Transfer Format (also known as GFF
384             version 2.5). It will ensure that each feature is properly tagged with the
385             C and C attributes.
386              
387             This method will automatically recurse through all subfeatures.
388              
389             =item ucsc_string
390              
391             my $string = ucsc_string($gene);
392              
393             This will export a gene or transcript model as a refFlat formatted Gene
394             Prediction line (11 columns). See L
395             for details. Multiple transcript genes are exported as multiple text lines
396             concatenated together.
397              
398             =item bed_string
399              
400             my $string = bed_string($gene);
401              
402             This will export a gene or transcript model as a UCSC Bed formatted transcript
403             line (12 columns). See L
404             for details. Multiple transcript genes are exported as multiple text lines
405             concatenated together. Note that gene information is not preserved with Bed12
406             files; only the transcript name is used. The C value is set to 0.
407              
408             =back
409              
410             =head2 Filter methods
411              
412             These methods are used to filter genes.
413              
414             =over 4
415              
416             =item filter_transcript_support_level
417              
418             my $new_gene = filter_transcript_support_level($gene, 'best2');
419             my @good_transcripts = filter_transcript_support_level(\@transcripts);
420              
421             This will filter a gene object for transcripts that match or exceed the
422             provided transcript support level. This assumes that the transcripts
423             contain the attribute tag 'transcript_support_level', which are present in
424             Ensembl provided GFF3 and GTF annotation files. The values are a digit (1-5),
425             or 'NA', where 1 is experimentally supported and 5 is entirely predicted
426             with no experimental evidence. See
427             L
428             for details.
429              
430             Pass a gene SeqFeature object with one or more transcript subfeatures.
431             Alternatively, an array reference of transcripts could be passed as well.
432              
433             A level may be provided as a second argument. The default is 'best'.
434              
435             =over 4
436              
437             =item best
438              
439             Only the transcripts with the highest existing value will be retained.
440              
441             =item bestEdigitE
442              
443             All transcripts up to the indicated level are retained. For example,
444             'best3' would indicate that transcripts with support levels 1, 2, and 3
445             would be retained.
446              
447             =item EdigitE
448              
449             Only transcripts at the given level are retained.
450              
451             =item NA
452              
453             Only transcripts with 'NA' as the value are retained. These are typically
454             pseudogenes or single-exon transcripts.
455              
456             =back
457              
458             If none of the transcripts have the attribute, then all are returned
459             (nothing is filtered).
460              
461             If a gene object was provided, a new gene object will be returned with
462             only the retained transcripts as subfeatures. If an array reference of
463             transcripts was provided, then an array reference of the filtered
464             transcripts is returned.
465              
466             =item filter_transcript_gencode_basic
467              
468             my $new_gene = filter_transcript_gencode_basic($gene);
469             my @good_transcripts = filter_transcript_gencode_basic(\@transcripts);
470              
471             This will filter a gene object for transcripts for the Ensembl GENCODE
472             tag "basic", which indicates that a transcript is tagged as GENCODE Basic
473             transcript.
474              
475             If a gene object was provided, a new gene object will be returned with
476             only the retained transcripts as subfeatures. If an array reference of
477             transcripts was provided, then an array reference of the filtered
478             transcripts is returned.
479            
480             =item filter_transcript_biotype
481              
482             my $new_gene = filter_transcript_gencode_basic($gene, $biotype);
483             my @good_transcripts = filter_transcript_gencode_basic(\@transcripts, 'miRNA');
484              
485             This will filter a gene object for transcripts for specific biotype values
486             using the C or C attribute tags, commonly found
487             in Ensembl annotation.
488              
489             If a gene object was provided, a new gene object will be returned with
490             only the retained transcripts as subfeatures. If an array reference of
491             transcripts was provided, then an array reference of the filtered
492             transcripts is returned.
493              
494             =back
495              
496             =head1 SEE ALSO
497              
498             L, L,
499             L, L, L,
500             L, L, L
501              
502             =cut
503              
504 1     1   961 use strict;
  1         1  
  1         26  
505 1     1   3 use Carp qw(carp cluck croak confess);
  1         2  
  1         5886  
506             require Exporter;
507              
508             ### Export
509             our @ISA = qw(Exporter);
510             our @EXPORT = qw();
511             our @EXPORT_OK = qw(
512             get_exons
513             get_alt_exons
514             get_common_exons
515             get_uncommon_exons
516             get_alt_common_exons
517             get_introns
518             get_alt_introns
519             get_common_introns
520             get_uncommon_introns
521             get_alt_common_introns
522             get_transcripts
523             collapse_transcripts
524             get_transcript_length
525             is_coding
526             get_cds
527             get_cdsStart
528             get_cdsEnd
529             get_start_codon
530             get_stop_codon
531             get_transcript_cds_length
532             get_utrs
533             get_transcript_utr_length
534             get_5p_utrs
535             get_3p_utrs
536             get_transcript_5p_utr_length
537             get_transcript_3p_utr_length
538             gff_string
539             gtf_string
540             ucsc_string
541             bed_string
542             filter_transcript_support_level
543             filter_transcript_gencode_basic
544             filter_transcript_biotype
545             );
546             our %EXPORT_TAGS = (
547             all => \@EXPORT_OK,
548             exon => [ qw(
549             get_exons
550             get_alt_exons
551             get_common_exons
552             get_uncommon_exons
553             get_alt_common_exons
554             ) ],
555             intron => [ qw(
556             get_introns
557             get_alt_introns
558             get_common_introns
559             get_uncommon_introns
560             get_alt_common_introns
561             ) ],
562             transcript => [ qw(
563             get_transcripts
564             collapse_transcripts
565             get_transcript_length
566             ) ],
567             cds => [ qw(
568             is_coding
569             get_cds
570             get_cdsStart
571             get_cdsEnd
572             get_start_codon
573             get_stop_codon
574             get_transcript_cds_length
575             ) ],
576             utr => [ qw(
577             get_utrs
578             get_5p_utrs
579             get_3p_utrs
580             get_transcript_utr_length
581             get_transcript_5p_utr_length
582             get_transcript_3p_utr_length
583             ) ],
584             export => [ qw(
585             gff_string
586             gtf_string
587             ucsc_string
588             bed_string
589             ) ],
590             filter => [ qw(
591             filter_transcript_biotype
592             filter_transcript_gencode_basic
593             filter_transcript_support_level
594             )]
595             );
596              
597              
598             ### The True Statement
599             1;
600              
601              
602              
603             ######## Exon Methods
604              
605             sub get_exons {
606             # initialize
607 102     102 1 127 my $transcript = shift;
608 102 50       235 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
609 102         177 my @exons;
610             my @cdss;
611 102         0 my @transcripts;
612            
613             # go through the subfeatures
614 102         183 foreach my $subfeat ($transcript->get_SeqFeatures) {
615 1372         1824 my $type = $subfeat->primary_tag;
616 1372 100       2892 if ($type =~ /exon/i) {
    50          
    0          
617 682         810 push @exons, $subfeat;
618             }
619             elsif ($type =~ /cds|utr|untranslated/i) {
620 690         858 push @cdss, $subfeat;
621             }
622             elsif ($type =~ /rna|transcript/i) {
623 0         0 push @transcripts, $subfeat;
624             }
625             }
626            
627             # check which array we'll use
628             # prefer to use actual exon subfeatures, but those may not be defined
629 102         153 my @list;
630 102 50       140 if (@exons) {
    0          
    0          
631 682         717 @list = map { $_->[0] }
632 1184         1193 sort { $a->[1] <=> $b->[1] }
633 102         140 map { [$_, $_->start] }
  682         898  
634             @exons;
635             }
636             elsif (@cdss) {
637             # duplicate the CDSs as exons
638 0         0 foreach (@cdss) {
639 0         0 my $e = $_->duplicate;
640 0         0 $e->primary_tag('exon'); # reset tag
641 0         0 $e->phase('.'); # no phase
642 0         0 push @list, $e;
643             }
644            
645             # make sure to merge adjacent exons
646 0         0 @list = map { $_->[0] } # must sort first
647 0         0 sort { $a->[1] <=> $b->[1] }
648 0         0 map { [$_, $_->start] }
  0         0  
649             @list;
650 0         0 for (my $i = 0; $i < scalar @list; $i++) {
651 0 0       0 if (defined ($list[ $i + 1 ]) ) {
652 0 0       0 if ($list[$i+1]->start - $list[$i]->end <= 1) {
653             # need to merge
654 0         0 $list[$i]->end( $list[$i+1]->end );
655 0         0 splice(@list, $i + 1, 1); # remove the merged
656 0         0 $i--;
657             }
658             }
659             }
660             }
661             elsif (@transcripts) {
662 0         0 foreach my $t (@transcripts) {
663             # there are possibly duplicates in here if there are alternate transcripts
664             # should we remove them?
665 0         0 my @e = get_exons($t);
666 0         0 push @list, @e;
667             }
668 0         0 @list = map { $_->[0] }
669 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
670 0         0 map { [$_, $_->start, $_->end] }
  0         0  
671             @list;
672             }
673             else {
674             # nothing found!
675 0         0 return;
676             }
677            
678 102 50       304 return wantarray ? @list : \@list;
679             }
680              
681             sub get_alt_exons {
682 1     1 1 5 my $ac_exons = get_alt_common_exons(@_);
683 1         2 my @alts;
684 1         4 foreach my $k (keys %$ac_exons) {
685 15 100       20 next if $k eq 'common';
686 14 100       16 next if $k eq 'uncommon';
687 13         11 push @alts, @{ $ac_exons->{$k} };
  13         16  
688             }
689 1 50       2 if (@alts) {
690             # re-sort in genomic order
691 19         20 @alts = map {$_->[1]}
692 60         69 sort {$a->[0] <=> $b->[0]}
693 1         2 map { [$_->start, $_] } @alts;
  19         26  
694             }
695 1 50       16 return wantarray ? @alts : \@alts;
696             }
697              
698             sub get_common_exons {
699 1     1 1 508 my $ac_exons = get_alt_common_exons(@_);
700 1 50       5 return wantarray ? @{ $ac_exons->{common} } : $ac_exons->{common};
  1         27  
701             }
702              
703             sub get_uncommon_exons {
704 1     1 1 745 my $ac_exons = get_alt_common_exons(@_);
705 1 50       5 return wantarray ? @{ $ac_exons->{uncommon} } : $ac_exons->{uncommon};
  1         17  
706             }
707              
708             sub get_alt_common_exons {
709 3     3 1 10 return _get_alt_common_things(1, @_);
710             }
711              
712              
713             ######## Intron Methods
714              
715             sub get_introns {
716 42     42 1 1476 my $transcript = shift;
717 42 50       141 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
718 42         46 my @introns;
719            
720             # find the exons and/or CDSs
721 42         71 my @exons = get_exons($transcript);
722 42 50       77 return unless @exons;
723 42 50       78 return if (scalar(@exons) == 1);
724            
725             # identify the last exon index position
726 42         60 my $last = scalar(@exons) - 1;
727            
728             # forward strand
729 42 50       79 if ($transcript->strand >= 0) {
730             # each intron is created based on the previous exon
731 42         97 for (my $i = 0; $i < $last; $i++) {
732 234         243 my $e = $exons[$i];
733 234         365 my $i = $e->new(
734             -seq_id => $e->seq_id,
735             -start => $e->end + 1,
736             -end => $exons[$i + 1]->start - 1, # up to start of next exon
737             -strand => $transcript->strand,
738             -primary_tag => 'intron',
739             -source_tag => $transcript->source_tag,
740             -primary_id => $transcript->display_name . ".intron$i",
741             -display_name => $transcript->display_name . ".intron$i",
742             );
743 234         603 push @introns, $i;
744             }
745             }
746            
747             # reverse strand
748             else {
749             # each intron is created based on the previous exon
750             # ordering from 5' to 3' end direction for convenience in naming
751 0         0 for (my $i = $last; $i > 0; $i--) {
752 0         0 my $e = $exons[$i];
753 0         0 my $i = $e->new(
754             -seq_id => $e->seq_id,
755             -start => $exons[$i - 1]->end + 1, # end of next exon
756             -end => $e->start - 1,
757             -strand => $transcript->strand,
758             -primary_tag => 'intron',
759             -source_tag => $transcript->source_tag,
760             -primary_id => $transcript->display_name . ".intron$i",
761             -display_name => $transcript->display_name . ".intron$i",
762             );
763 0         0 push @introns, $i;
764             }
765            
766             # reorder the introns based on start position
767 0 0       0 if (@introns) {
768 0         0 @introns = map { $_->[0] }
769 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
770 0         0 map { [$_, $_->start, $_->end] }
  0         0  
771             @introns;
772             }
773             }
774            
775             # finished
776 42 50       121 return wantarray ? @introns : \@introns;
777             }
778              
779             sub get_alt_introns {
780 1     1 1 4 my $ac_introns = get_alt_common_introns(@_);
781 1         3 my @alts;
782 1         6 foreach my $k (keys %$ac_introns) {
783 15 100       20 next if $k eq 'common';
784 14 100       17 next if $k eq 'uncommon';
785 13         13 push @alts, @{ $ac_introns->{$k} };
  13         17  
786             }
787 1 50       3 if (@alts) {
788             # re-sort in genomic order
789 8         10 @alts = map {$_->[1]}
790 16         18 sort {$a->[0] <=> $b->[0]}
791 1         3 map { [$_->start, $_] } @alts;
  8         13  
792             }
793 1 50       16 return wantarray ? @alts : \@alts;
794             }
795              
796             sub get_common_introns {
797 1     1 1 4 my $ac_introns = get_alt_common_introns(@_);
798 1 50       4 return wantarray ? @{ $ac_introns->{common} } : $ac_introns->{common};
  1         18  
799             }
800              
801             sub get_uncommon_introns {
802 1     1 1 757 my $ac_introns = get_alt_common_introns(@_);
803 1 50       3 return wantarray ? @{ $ac_introns->{uncommon} } : $ac_introns->{uncommon};
  1         12  
804             }
805              
806             sub get_alt_common_introns {
807 3     3 1 10 return _get_alt_common_things(0, @_);
808             }
809              
810             sub _get_alt_common_things {
811             # internal subroutine to get either exons or introns
812 6     6   12 my $do_exon = shift; # true for exon, false for intron
813 6         11 my @transcripts;
814 6 50       17 return unless @_;
815 6 50       17 if (scalar @_ == 1) {
    0          
816             # someone passed a gene, get the transcripts
817 6 50       31 confess "not a SeqFeature object!" unless ref($_[0]) =~ /seqfeature/i;
818 6         15 @transcripts = get_transcripts($_[0]);
819             }
820             elsif (scalar @_ > 1) {
821             # presume these are transcripts?
822 0         0 @transcripts = @_;
823             }
824            
825             # hash of transcript to things
826 6         22 my %tx2things = (
827             common => [],
828             uncommon => []
829             );
830            
831             # no transcripts provided?
832 6 50       15 return \%tx2things unless @transcripts;
833            
834             # only one transcript provided?
835 6 50       15 if (scalar @transcripts == 1) {
836             # all exons are common by definition
837             # my $name = $transcripts[0]->display_name;
838 0 0       0 my @things = $do_exon ? get_exons($transcripts[0]) : get_introns($transcripts[0]);
839 0         0 $tx2things{common} = \@things;
840 0         0 return \%tx2things;
841             }
842            
843             # get things and put them in has based on coordinates
844 6         9 my %pos2things;
845 6         12 foreach my $t (@transcripts) {
846 78 100       158 my @things = $do_exon ? get_exons($t) : get_introns($t);
847 78         119 foreach my $e (@things) {
848 471         741 my $new_e = $e->duplicate;
849 471         728 $pos2things{$e->start}{$e->end}{$t->display_name} = $new_e;
850             }
851 78         138 $tx2things{ $t->display_name } = [];
852             }
853            
854             # put things into categories based on commonality
855             # associate things with unique transcripts, common, or uncommon sets
856 6         15 my $trx_number = scalar @transcripts;
857 6         43 foreach my $s (sort {$a <=> $b} keys %pos2things) { # sort on start
  416         410  
858 126         115 foreach my $e (sort {$a <=> $b} keys %{ $pos2things{$s} }) { # sort on stop
  47         78  
  126         248  
859 165         184 my @names = keys %{ $pos2things{$s}{$e} };
  165         322  
860 165 100       274 if (scalar @names == 1) {
    50          
861             # only 1 thing, must be an alternate
862 81         74 push @{ $tx2things{$names[0]} }, $pos2things{$s}{$e}{$names[0]};
  81         189  
863             }
864             elsif (scalar @names == $trx_number) {
865             # common to all transcripts, take the first one as example
866 0         0 push @{ $tx2things{common} }, $pos2things{$s}{$e}{$names[0]};
  0         0  
867             }
868             else {
869             # common to some but not all transcripts, so uncommon
870 84         72 push @{ $tx2things{uncommon} }, $pos2things{$s}{$e}{$names[0]};
  84         155  
871             }
872             }
873             }
874 6         299 return \%tx2things;
875             }
876              
877              
878              
879             ######## Transcript Methods
880              
881             sub get_transcripts {
882 9     9 1 364 my $gene = shift;
883 9 50       23 return unless $gene;
884 9 50       36 confess "not a SeqFeature object!" unless ref($gene) =~ /seqfeature/i;
885 9 50       25 return $gene if ($gene->primary_tag =~ /rna|transcript/i);
886 9         18 my @transcripts;
887             my @exons;
888 9         0 my @other;
889 9         25 foreach my $subf ($gene->get_SeqFeatures) {
890 106 50       140 if ($subf->primary_tag =~ /rna|transcript|\bprocessed/i) {
    0          
891 106         148 push @transcripts, $subf;
892             }
893             elsif ($subf->primary_tag =~ /^(?:cds|exon|\w+codon)$/i) {
894 0         0 push @exons, $subf;
895             }
896             else {
897             # wierdo subfeature types like unprocessed_pseudogene
898 0         0 push @other, $subf;
899             }
900             }
901 9 50 33     49 if (not @transcripts and @exons) {
    50 33        
      33        
902             # some weirdly formatted annotation files skip the transcript
903             # looking at you SGD
904 0         0 my $transcript = $gene->new(
905             -seq_id => $gene->seq_id,
906             -start => $gene->start,
907             -end => $gene->end,
908             -strand => $gene->strand,
909             -primary_tag => 'transcript',
910             -source => $gene->source_tag,
911             -name => $gene->display_name,
912             -segments => \@exons,
913             );
914 0         0 push @transcripts, $transcript;
915             }
916             elsif (not @transcripts and not @exons and @other) {
917             # well, what choice do we have?
918             # we can assume these are transcripts, because what else could they be?
919 0         0 @transcripts = @other;
920             }
921 106         111 @transcripts = map { $_->[0] }
922 225 50       282 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
923 9         21 map { [$_, $_->start, $_->length] }
  106         154  
924             @transcripts;
925 9 50       40 return wantarray ? @transcripts : \@transcripts;
926             }
927              
928              
929             sub collapse_transcripts {
930 1     1 1 2 my @transcripts;
931 1 50       4 return unless @_;
932 1         2 my $example = $_[0]; # parent transcript seqfeature to model new transcript on
933 1 50       5 if (scalar @_ == 1) {
    50          
934             # someone passed a gene, get the transcripts
935 0         0 @transcripts = get_transcripts($_[0]);
936 0 0       0 return unless @transcripts;
937 0 0       0 return $transcripts[0] if scalar @transcripts == 1;
938             }
939             elsif (scalar @_ > 1) {
940 1         4 @transcripts = @_;
941             }
942              
943             # get all useable exons to collapse
944 1         3 my @exons;
945 1         3 foreach my $t (@transcripts) {
946 13         17 my @e = get_exons($t);
947 13         30 push @exons, @e;
948             }
949            
950             # check that we have exons - weirdo files may just have CDS!!!????
951 1 50       5 unless (@exons) {
952 0         0 foreach my $t (@transcripts) {
953 0         0 my @e = get_cds($t);
954 0         0 push @exons, @e;
955             }
956             }
957 1 50       3 return unless (@exons);
958            
959             # sort all the exons
960 85         84 my @sorted = map { $_->[0] }
961 414 50       489 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
962 1         3 map { [$_, $_->start, $_->end] }
  85         114  
963             @exons;
964            
965             # build new exons from the original - don't keep cruft
966 1         10 my $next = shift @sorted;
967 1         2 my @new;
968 1         6 $new[0] = $next->new(
969             -seq_id => $next->seq_id,
970             -start => $next->start,
971             -end => $next->end,
972             -strand => $next->strand,
973             -primary_tag => 'exon',
974             );
975            
976             # work through remaining exons, adding and merging as necessary
977 1         4 while (@sorted) {
978 84         97 $next = shift @sorted;
979 84         113 my ($ns, $ne) = ($next->start, $next->end); # new start & end
980 84         115 my ($os, $oe) = ($new[-1]->start, $new[-1]->end); # old start & end
981 84 100 100     278 if ($ns == $os and $ne > $oe) {
    50 100        
    100 66        
982             # same beginning, further end
983 5         8 $new[-1]->end($ne);
984             }
985             elsif ($ns > $os and $ns < $oe and $ne > $oe) {
986             # overlapping start, further end
987 0         0 $new[-1]->end($ne);
988             }
989             elsif ($ns > $oe) {
990             # completely new exon
991 13         24 push @new, $next->new(
992             -seq_id => $next->seq_id,
993             -start => $ns,
994             -end => $ne,
995             -strand => $next->strand,
996             -primary_tag => 'exon',
997             );
998             }
999             # all other possibilities we can skip
1000             }
1001            
1002             # return the assembled transcript
1003 1         5 return $example->new(
1004             -seq_id => $example->seq_id,
1005             -start => $new[0]->start,
1006             -end => $new[-1]->end,
1007             -strand => $example->strand,
1008             -primary_tag => 'transcript',
1009             -source => $example->source_tag,
1010             -name => $example->display_name,
1011             -segments => \@new,
1012             );
1013             }
1014              
1015             sub get_transcript_length {
1016 4     4 1 508 my $transcript = shift;
1017 4 50       26 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1018 4 50       14 if ($transcript->primary_tag =~ /gene$/i) {
1019             # someone passed a gene object!!!!
1020 0         0 my @lengths;
1021 0         0 foreach my $t (get_transcripts($transcript)) {
1022 0         0 push @lengths, get_transcript_length($t);
1023             }
1024             # return the longest transcript length
1025 0         0 return (sort {$b <=> $a} @lengths)[0];
  0         0  
1026             }
1027 4         8 my $total = 0;
1028 4         15 foreach my $e (get_exons($transcript)) {
1029 33         53 $total += $e->length;
1030             }
1031 4         17 return $total;
1032             }
1033              
1034              
1035              
1036             ######## CDS Methods
1037              
1038             sub is_coding {
1039 3     3 1 461 my $transcript = shift;
1040 3 50       11 return unless $transcript;
1041 3 50       36 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1042 3 50       8 if ($transcript->primary_tag =~ /gene$/i) {
1043             # someone passed a gene, check its subfeatures
1044 0         0 my $code_potential = 0;
1045 0         0 foreach ($transcript->get_SeqFeatures) {
1046 0         0 $code_potential += is_coding($_);
1047             }
1048 0         0 return $code_potential;
1049             }
1050 3 100       8 return 1 if $transcript->primary_tag =~ /mrna/i; # assumption
1051 2 50       9 return 1 if $transcript->source =~ /protein.?coding/i;
1052 2 50       8 if ($transcript->has_tag('transcript_biotype')) {
    50          
    0          
1053             # ensembl type GTFs
1054 0         0 my ($biotype) = $transcript->get_tag_values('transcript_biotype');
1055 0 0       0 return $biotype =~ /protein.?coding/i ? 1 : 0;
1056             }
1057             elsif ($transcript->has_tag('biotype')) {
1058             # ensembl type GFFs
1059 2         7 my ($biotype) = $transcript->get_tag_values('biotype');
1060 2 50       12 return $biotype =~ /protein.?coding/i ? 1 : 0;
1061             }
1062             elsif ($transcript->has_tag('gene_biotype')) {
1063             # ensembl type GTFs
1064             # must be careful here, gene_biotype of course pertains to gene,
1065             # and not necessarily this particular transcript
1066 0         0 my ($biotype) = $transcript->get_tag_values('gene_biotype');
1067 0 0       0 return 1 if $biotype =~ /protein.?coding/i;
1068             }
1069 0         0 foreach ($transcript->get_SeqFeatures) {
1070             # old fashioned way
1071 0 0       0 return 1 if $_->primary_tag eq 'CDS';
1072             }
1073 0         0 return 0;
1074             }
1075              
1076             sub get_cds {
1077 13     13 1 1420 my $transcript = shift;
1078 13 50       30 return unless $transcript;
1079 13 50       39 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1080 13         16 my @cds;
1081 13         30 foreach my $subfeat ($transcript->get_SeqFeatures) {
1082 170 100       222 push @cds, $subfeat if $subfeat->primary_tag eq 'CDS';
1083             }
1084 13 100       32 return unless @cds;
1085 43         48 @cds = map { $_->[0] }
1086 70         73 sort { $a->[1] <=> $b->[1] }
1087 7         15 map { [$_, $_->start] }
  43         65  
1088             @cds;
1089 7 100       27 return wantarray ? @cds : \@cds;
1090             }
1091              
1092             sub get_cdsStart {
1093 4     4 1 2039 my $transcript = shift;
1094 4 50       23 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1095 4         11 my $cds = get_cds($transcript);
1096 4 100       16 return unless $cds;
1097 2 50       7 if ($transcript->strand >= 0) {
1098 2         5 return $cds->[0]->start;
1099             }
1100             else {
1101             # stop codons may or may not be not included
1102 0         0 my $codon = get_stop_codon($transcript);
1103 0 0       0 if ($codon) {
1104 0 0       0 return $codon->start < $cds->[0]->start ?
1105             $codon->start : $cds->[0]->start;
1106             }
1107             else {
1108 0         0 return $cds->[0]->start;
1109             }
1110             }
1111             }
1112              
1113             sub get_cdsEnd {
1114 4     4 1 9 my $transcript = shift;
1115 4 50       22 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1116 4         9 my $cds = get_cds($transcript);
1117 4 100       12 return unless $cds;
1118 2 50       7 if ($transcript->strand >= 0) {
1119 2         9 my $codon = get_stop_codon($transcript);
1120 2 50       6 if ($codon) {
1121 2 50       5 return $codon->end > $cds->[-1]->end ? $codon->end : $cds->[-1]->end;
1122             }
1123             else {
1124 0         0 return $cds->[-1]->end;
1125             }
1126             }
1127             else {
1128 0         0 return $cds->[-1]->end;
1129             }
1130             }
1131              
1132             sub get_transcript_cds_length {
1133 4     4 1 9 my $transcript = shift;
1134 4 50       22 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1135 4         6 my $total = 0;
1136 4         12 foreach my $subf ($transcript->get_SeqFeatures) {
1137 53 100       97 next unless $subf->primary_tag eq 'CDS';
1138 13         24 $total += $subf->length;
1139             }
1140 4         16 return $total;
1141             }
1142              
1143             sub get_start_codon {
1144 0     0 1 0 my $transcript = shift;
1145 0 0       0 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1146 0         0 my $start_codon;
1147            
1148             # look for existing one
1149 0         0 foreach my $subfeat ($transcript->get_SeqFeatures) {
1150 0 0       0 $start_codon = $subfeat if $subfeat->primary_tag =~ /start.?codon/i;
1151             }
1152 0 0       0 return $start_codon if $start_codon;
1153            
1154             # otherwise we have to build one
1155 0         0 my $cdss = get_cds($transcript);
1156 0 0       0 return unless $cdss;
1157 0 0       0 if ($transcript->strand >= 0) {
1158 0         0 $start_codon = $transcript->new(
1159             -seq_id => $transcript->seq_id,
1160             -source => $transcript->source,
1161             -primary_tag => 'start_codon',
1162             -start => $cdss->[0]->start,
1163             -end => $cdss->[0]->start + 2,
1164             -strand => 1,
1165             -phase => 0,
1166             -primary_id => $transcript->primary_id . '.start_codon',
1167             );
1168             }
1169             else {
1170 0         0 $start_codon = $transcript->new(
1171             -seq_id => $transcript->seq_id,
1172             -source => $transcript->source,
1173             -primary_tag => 'start_codon',
1174             -start => $cdss->[-1]->end - 2,
1175             -end => $cdss->[-1]->end,
1176             -strand => -1,
1177             -phase => 0,
1178             -primary_id => $transcript->primary_id . '.start_codon',
1179             );
1180             }
1181 0         0 return $start_codon;
1182             }
1183              
1184             sub get_stop_codon {
1185 2     2 1 3 my $transcript = shift;
1186 2 50       10 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1187 2         3 my $stop_codon;
1188            
1189             # look for existing one
1190 2         6 foreach my $subfeat ($transcript->get_SeqFeatures) {
1191 37 50       44 $stop_codon = $subfeat if $subfeat->primary_tag =~ /stop.?codon/i;
1192             }
1193 2 50       5 return $stop_codon if $stop_codon;
1194            
1195             # otherwise we have to build one
1196             # this entirely presumes that the stop codon is inclusive to the last cds
1197             # this is the case with GFF3 and UCSC tables, but not GTF
1198 2         5 my $cdss = get_cds($transcript);
1199 2 50       4 return unless $cdss;
1200 2 50       5 if ($transcript->strand >= 0) {
1201 2         7 $stop_codon = $transcript->new(
1202             -seq_id => $transcript->seq_id,
1203             -source => $transcript->source,
1204             -primary_tag => 'stop_codon',
1205             -start => $cdss->[-1]->end - 2,
1206             -end => $cdss->[-1]->end,
1207             -strand => 1,
1208             -phase => 0,
1209             -primary_id => $transcript->primary_id . '.stop_codon',
1210             );
1211             }
1212             else {
1213 0         0 $stop_codon = $transcript->new(
1214             -seq_id => $transcript->seq_id,
1215             -source => $transcript->source,
1216             -primary_tag => 'stop_codon',
1217             -start => $cdss->[0]->start,
1218             -end => $cdss->[0]->start + 2,
1219             -strand => -1,
1220             -phase => 0,
1221             -primary_id => $transcript->primary_id . '.stop_codon',
1222             );
1223             }
1224 2         6 return $stop_codon;
1225             }
1226              
1227             sub get_utrs {
1228 13     13 1 21 my $transcript = shift;
1229 13 50       36 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1230            
1231             # collect the various types of subfeatures
1232 13         39 my @exons;
1233             my @cdss;
1234 13         0 my @utrs;
1235 13         0 my @transcripts;
1236 13         30 foreach my $subfeat ($transcript->get_SeqFeatures) {
1237 224         309 my $type = $subfeat->primary_tag;
1238 224 100       477 if ($type =~ /exon/i) {
    100          
    50          
    0          
1239 104         128 push @exons, $subfeat;
1240             }
1241             elsif ($type =~ /cds/i) {
1242 78         96 push @cdss, $subfeat;
1243             }
1244             elsif ($type =~ /utr|untranslated/i) {
1245 42         57 push @utrs, $subfeat;
1246             }
1247             elsif ($type =~ /rna|transcript/i) {
1248 0         0 push @transcripts, $subfeat;
1249             }
1250             }
1251            
1252             # collect the utrs into final list
1253 13         18 my @list;
1254 13 100 33     31 if (@utrs) {
    50          
    50          
1255             # good, we don't have to do any work
1256 12         20 @list = @utrs;
1257             }
1258             elsif (@transcripts) {
1259             # someone must've passed us a gene
1260 0         0 foreach my $t (@transcripts) {
1261 0         0 my @u = get_utrs($t);
1262 0         0 push @list, @u;
1263             }
1264             }
1265             elsif (@exons and @cdss) {
1266             # calculate the utrs ourselves
1267            
1268 0         0 @exons = map { $_->[0] }
1269 0         0 sort { $a->[1] <=> $b->[1] }
1270 0         0 map { [$_, $_->start] }
  0         0  
1271             @exons;
1272 0         0 @cdss = map { $_->[0] }
1273 0         0 sort { $a->[1] <=> $b->[1] }
1274 0         0 map { [$_, $_->start] }
  0         0  
1275             @cdss;
1276 0         0 my $firstCDS = $cdss[0];
1277 0         0 my $lastCDS = $cdss[-1];
1278 0         0 while (@exons) {
1279 0         0 my $exon = shift @exons;
1280 0 0 0     0 if ($exon->end < $firstCDS->start) {
    0          
    0          
    0          
    0          
1281             # whole exon is UTR
1282 0         0 my $utr = $exon->duplicate;
1283 0 0       0 $utr->primary_tag(
1284             $transcript->strand >= 0 ? 'five_prime_UTR' : 'three_prime_UTR' );
1285 0         0 $utr->display_name( $exon->display_name . '.utr' );
1286 0         0 push @list, $utr;
1287             }
1288             elsif ($exon->overlaps($firstCDS)) {
1289             # partial UTR on left side
1290 0         0 my $pieces = $exon->subtract($firstCDS); # array ref of pieces
1291 0 0       0 next unless $pieces;
1292 0         0 my $utr = $pieces->[0]; # we will want the first one if there are two
1293 0 0       0 $utr->primary_tag(
1294             $transcript->strand >= 0 ? 'five_prime_UTR' : 'three_prime_UTR' );
1295 0         0 $utr->display_name($exon->display_name . '.utr');
1296 0         0 $utr->strand($exon->strand);
1297 0         0 $utr->source($exon->strand);
1298 0         0 push @list, $utr;
1299             }
1300             elsif ($exon->start > $firstCDS->end and $exon->end < $lastCDS->start) {
1301             # CDS exon
1302 0         0 next;
1303             }
1304             elsif ($exon->overlaps($lastCDS)) {
1305             # partial UTR
1306 0         0 my $pieces = $exon->subtract($lastCDS); # array ref of pieces
1307 0 0       0 next unless $pieces;
1308 0         0 my $utr = $pieces->[-1]; # we will want the second one if there are two
1309 0 0       0 $utr->primary_tag(
1310             $transcript->strand >= 0 ? 'three_prime_UTR' : 'five_prime_UTR' );
1311 0         0 $utr->display_name($exon->display_name . '.utr');
1312 0         0 $utr->strand($exon->strand);
1313 0         0 $utr->source($exon->strand);
1314 0         0 push @list, $utr;
1315             }
1316             elsif ($exon->start > $lastCDS->end) {
1317             # whole exon is UTR
1318 0         0 my $utr = $exon->duplicate;
1319 0 0       0 $utr->primary_tag(
1320             $transcript->strand >= 0 ? 'three_prime_UTR' : 'five_prime_UTR' );
1321 0         0 $utr->display_name( $exon->display_name . '.utr' );
1322 0         0 push @list, $utr;
1323             }
1324             else {
1325             # geometric error
1326 0         0 croak " programmer geometric error!";
1327             }
1328             }
1329             }
1330             else {
1331             # nothing usable found to identify UTRs
1332 1         4 return;
1333             }
1334            
1335             # we have our list
1336 12 100       33 return wantarray ? @list : \@list;
1337             }
1338              
1339             sub get_transcript_utr_length {
1340 2     2 0 4 my $transcript = shift;
1341 2 50       13 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1342 2         4 my $utrs = get_utrs($transcript);
1343 2         4 my $total = 0;
1344 2         4 foreach my $utr (@$utrs) {
1345 7         11 $total += $utr->length;
1346             }
1347 2         8 return $total;
1348             }
1349              
1350             sub get_5p_utrs {
1351 4     4 1 470 my $transcript = shift;
1352 4 50       23 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1353            
1354             # get all UTRs
1355 4         13 my $utrs = get_utrs($transcript);
1356 4 50       11 return unless scalar(@$utrs);
1357            
1358 4         10 my @fivers = grep { $_->primary_tag =~ /5|five/i } @$utrs;
  14         22  
1359 4 100       15 return wantarray ? @fivers : \@fivers;
1360             }
1361              
1362             sub get_3p_utrs {
1363 4     4 1 7 my $transcript = shift;
1364 4 50       21 confess "not a SeqFeature object!" unless ref($transcript) =~ /seqfeature/i;
1365            
1366             # get all UTRs
1367 4         8 my $utrs = get_utrs($transcript);
1368 4 50       10 return unless scalar(@$utrs);
1369            
1370 4         7 my @threes = grep { $_->primary_tag =~ /3|three/i } @$utrs;
  14         847  
1371 4 100       14 return wantarray ? @threes : \@threes;
1372             }
1373              
1374             sub get_transcript_5p_utr_length {
1375 2     2 0 502 my $transcript = shift;
1376 2         8 my $utrs = get_5p_utrs($transcript);
1377 2         4 my $total = 0;
1378 2         6 foreach my $utr (@$utrs) {
1379 3         9 $total += $utr->length;
1380             }
1381 2         8 return $total;
1382             }
1383              
1384             sub get_transcript_3p_utr_length {
1385 2     2 0 6 my $transcript = shift;
1386 2         4 my $utrs = get_3p_utrs($transcript);
1387 2         2 my $total = 0;
1388 2         5 foreach my $utr (@$utrs) {
1389 4         10 $total += $utr->length;
1390             }
1391 2         6 return $total;
1392             }
1393              
1394              
1395             #### Export methods
1396              
1397             sub gff_string {
1398             # Bio::ToolBox::SeqFeature and Bio::SeqFeature::Lite objects have this method
1399             # otherwise this will die
1400 0     0 1 0 return shift->gff_string(@_);
1401             }
1402              
1403             sub gtf_string {
1404 0     0 1 0 my $feature = shift;
1405 0   0     0 my $gene = shift || undef; # only present when recursing
1406 0 0       0 confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1407            
1408             # process a gene
1409 0 0 0     0 if ($feature->primary_tag =~ /gene$/i and not defined $gene) {
1410 0         0 my $string;
1411 0         0 foreach my $t (get_transcripts($feature)) {
1412 0         0 $string .= gtf_string($t, $feature);
1413             }
1414 0         0 return $string;
1415             }
1416            
1417             # check that we have transcribed feature with exons
1418 0         0 my @exons = get_exons($feature);
1419 0 0       0 return unless @exons; # no exon subfeatures? must not be a transcript....
1420            
1421             # mandatory identifiers
1422 0         0 my ($gene_id, $gene_name, $gene_biotype);
1423 0 0       0 if ($gene) {
1424 0   0     0 $gene_id = $gene->primary_id || $gene->display_name;
1425 0         0 $gene_name = $gene->display_name;
1426 0   0     0 ($gene_biotype) = $gene->get_tag_values('gene_biotype') ||
1427             $gene->get_tag_values('biotype') || undef;
1428             }
1429             else {
1430             # these attributes might still be present for transcripts
1431 0   0     0 ($gene_id) = $feature->get_tag_values('gene_id') || undef;
1432 0   0     0 ($gene_name) = $feature->get_tag_values('gene_name') || undef;
1433 0   0     0 ($gene_biotype) = $feature->get_tag_values('gene_biotype') || undef;
1434             }
1435 0   0     0 my $trx_id = $feature->primary_id || $feature->display_name;
1436 0         0 my $trx_name = $feature->display_name;
1437 0         0 my $group = sprintf(
1438             "gene_id \"%s\"; transcript_id \"%s\"; gene_name \"%s\"; transcript_name \"%s\";",
1439             $gene_id, $trx_id, $gene_name, $trx_name);
1440            
1441             # add additional transcript attributes that might be interesting to keep
1442 0 0       0 if ($gene_biotype) {
1443 0         0 $group .= " gene_biotype \"$gene_biotype\";";
1444             }
1445 0   0     0 my ($biotype) = $feature->get_tag_values('transcript_biotype') ||
1446             $feature->get_tag_values('biotype');
1447 0 0       0 if ($biotype) {
1448 0         0 $group .= " transcript_biotype \"$biotype\";" ;
1449             }
1450 0         0 my ($tsl) = $feature->get_tag_values('transcript_support_level');
1451 0 0       0 if ($tsl) {
1452 0         0 $group .= " transcript_support_level \"$tsl\";";
1453             }
1454            
1455             # skip transcript as it is technically not part of the GTF standard....
1456            
1457             # convert exon subfeatures collected above
1458 0         0 my $string;
1459 0         0 my @cds = get_cds($feature);
1460 0         0 push @cds, get_stop_codon($feature);
1461 0         0 push @cds, get_start_codon($feature);
1462 0         0 @exons = map { $_->[0] }
1463 0 0       0 sort { $a->[1] <=> $b->[1] or $a->[2] <=> $b->[2] }
1464 0         0 map { [$_, $_->start, $_->end] } ( @exons, @cds );
  0         0  
1465 0         0 foreach my $subf (@exons) {
1466 0 0 0     0 $string .= join("\t", (
    0 0        
    0          
1467             $subf->seq_id || '.',
1468             $subf->source_tag || '.',
1469             $subf->primary_tag,
1470             $subf->start,
1471             $subf->end,
1472             defined $subf->score ? $subf->score : '.',
1473             $subf->strand < 0 ? '-' : '+',
1474             defined $subf->phase ? $subf->phase : '.',
1475             "$group\n"
1476             ) );
1477             }
1478            
1479             # finished
1480 0         0 return $string;
1481             }
1482              
1483             sub ucsc_string {
1484 0     0 1 0 my $feature = shift;
1485 0 0       0 confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1486 0         0 my @ucsc_list;
1487            
1488             # process according to type
1489 0 0       0 if ($feature->primary_tag =~ /gene$/i) {
    0          
1490             # a gene object, we will need to process it's transcript subfeatures
1491 0         0 foreach my $transcript (get_transcripts($feature)) {
1492 0         0 my $ucsc = _process_ucsc_transcript($transcript, $feature);
1493 0 0       0 push @ucsc_list, $ucsc if $ucsc;
1494             }
1495             }
1496             elsif ($feature->primary_tag =~ /rna|transcript/i) {
1497             # some sort of RNA transcript
1498 0         0 my $ucsc = _process_ucsc_transcript($feature);
1499 0 0       0 push @ucsc_list, $ucsc if $ucsc;
1500             }
1501             else {
1502 0         0 return;
1503             }
1504            
1505             # return strings
1506 0         0 my $string;
1507 0         0 foreach my $ucsc (@ucsc_list) {
1508             $string .= sprintf("%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s\t%s\n",
1509             $ucsc->{'name2'},
1510             $ucsc->{'name'},
1511             $ucsc->{'chr'},
1512             $ucsc->{'strand'},
1513             $ucsc->{'txStart'},
1514             $ucsc->{'txEnd'},
1515             $ucsc->{'cdsStart'},
1516             $ucsc->{'cdsEnd'},
1517             $ucsc->{'exonCount'},
1518 0         0 join(",", @{$ucsc->{'exonStarts'}} ) . ',',
1519 0         0 join(",", @{$ucsc->{'exonEnds'}} ) . ',',
  0         0  
1520             );
1521             }
1522 0         0 return $string;
1523             }
1524              
1525             sub bed_string {
1526 0     0 1 0 my $feature = shift;
1527 0 0       0 confess "not a SeqFeature object!" unless ref($feature) =~ /seqfeature/i;
1528 0         0 my @ucsc_list;
1529            
1530             # process according to type
1531 0 0       0 if ($feature->primary_tag =~ /gene$/i) {
    0          
1532             # a gene object, we will need to process it's transcript subfeatures
1533 0         0 foreach my $transcript (get_transcripts($feature)) {
1534 0         0 my $ucsc = _process_ucsc_transcript($transcript, $feature);
1535 0 0       0 push @ucsc_list, $ucsc if $ucsc;
1536             }
1537             }
1538             elsif ($feature->primary_tag =~ /rna|transcript/i) {
1539             # some sort of RNA transcript
1540 0         0 my $ucsc = _process_ucsc_transcript($feature);
1541 0 0       0 push @ucsc_list, $ucsc if $ucsc;
1542             }
1543             else {
1544 0         0 return;
1545             }
1546            
1547             # return strings
1548 0         0 my $string;
1549 0         0 foreach my $ucsc (@ucsc_list) {
1550             # exon sizes
1551 0         0 my @sizes;
1552 0         0 for (my $i = 0; $i < scalar( @{$ucsc->{'exonStarts'}} ); $i++) {
  0         0  
1553 0         0 push @sizes, $ucsc->{'exonEnds'}->[$i] - $ucsc->{'exonStarts'}->[$i];
1554             }
1555             $string .= sprintf("%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
1556             $ucsc->{'chr'},
1557             $ucsc->{'txStart'},
1558             $ucsc->{'txEnd'},
1559             $ucsc->{'name'},
1560             $ucsc->{'score'},
1561             $ucsc->{'strand'},
1562             $ucsc->{'cdsStart'},
1563             $ucsc->{'cdsEnd'},
1564             0,
1565             $ucsc->{'exonCount'},
1566             join(",", @sizes),
1567 0         0 join(",", map {$_ - $ucsc->{'txStart'} } @{$ucsc->{'exonStarts'}} ),
  0         0  
  0         0  
1568             );
1569             }
1570 0         0 return $string;
1571             }
1572              
1573              
1574             #### filter methods
1575              
1576             sub filter_transcript_support_level {
1577 0     0 1 0 my $gene = shift;
1578 0 0       0 return unless $gene;
1579 0   0     0 my $min_tsl = shift || 'best';
1580 0         0 my @list = qw(1 2 3 4 5 NA Missing);
1581            
1582             # get transcripts
1583 0         0 my @transcripts;
1584 0 0       0 if (ref($gene) =~ /seqfeature/i) {
    0          
1585 0         0 @transcripts = get_transcripts($gene);
1586             }
1587             elsif (ref($gene) eq 'ARRAY') {
1588 0         0 @transcripts = @$gene;
1589             }
1590             else {
1591 0         0 return;
1592             }
1593 0 0       0 return unless @transcripts;
1594            
1595             # categorize transcripts
1596 0         0 my %results = map { $_ => [] } @list;
  0         0  
1597 0         0 foreach my $t (@transcripts) {
1598 0         0 my ($tsl) = $t->get_tag_values('transcript_support_level');
1599 0   0     0 $tsl ||= 'Missing'; # default in case nothing is present
1600 0         0 push @{ $results{$tsl} }, $t;
  0         0  
1601             }
1602            
1603             # take appropriate transcripts
1604 0         0 my @keepers;
1605 0 0       0 if ($min_tsl eq 'best') {
    0          
    0          
    0          
1606             # progress through all levels and keep only the highest set
1607 0         0 foreach my $tsl (@list) {
1608 0 0       0 if (scalar @{ $results{$tsl} }) {
  0         0  
1609 0         0 @keepers = @{ $results{$tsl} };
  0         0  
1610 0         0 last;
1611             }
1612             }
1613             }
1614             elsif ($min_tsl =~ /^best(\d)$/) {
1615             # progress through all levels and take all up to specified max
1616 0         0 my $max = $1;
1617 0         0 foreach my $tsl (1 .. 5) {
1618 0 0       0 if (scalar @{ $results{$tsl} }) {
  0         0  
1619 0 0 0     0 if ($tsl <= $max) {
    0          
1620 0         0 push @keepers, @{ $results{$tsl} };
  0         0  
1621             }
1622             elsif (not @keepers and $tsl > $max) {
1623             # go ahead and take it if we have no other option
1624 0         0 push @keepers, @{ $results{$tsl} };
  0         0  
1625             }
1626             }
1627             }
1628             # still take the NA and Missing if nothing else
1629 0 0 0     0 if (scalar @keepers == 0 and scalar @{ $results{'NA'} }) {
  0 0 0     0  
1630 0         0 @keepers = @{ $results{'NA'} };
  0         0  
1631             }
1632 0         0 elsif (scalar @keepers == 0 and scalar @{ $results{'Missing'} }) {
1633 0         0 @keepers = @{ $results{'Missing'} };
  0         0  
1634             }
1635             }
1636             elsif ($min_tsl =~ /^\d$/) {
1637             # take only the specified level
1638 0         0 @keepers = @{ $results{$min_tsl} };
  0         0  
1639             }
1640             elsif ($min_tsl eq 'NA') {
1641             # take only the NA guys
1642 0         0 @keepers = @{ $results{'NA'} };
  0         0  
1643             }
1644             else {
1645 0         0 confess "unrecognized minimum TSL value '$min_tsl' Check the documentation!";
1646             }
1647 0 0       0 @keepers = @{ $results{'Missing'} } unless @keepers;
  0         0  
1648            
1649             # return
1650 0         0 return _return_filtered_transcripts($gene, \@keepers);
1651             }
1652              
1653             sub filter_transcript_gencode_basic {
1654 0     0 1 0 my $gene = shift;
1655 0 0       0 return unless $gene;
1656              
1657             # get transcripts
1658 0         0 my @transcripts;
1659 0 0       0 if (ref($gene) =~ /seqfeature/i) {
    0          
1660 0         0 @transcripts = get_transcripts($gene);
1661             }
1662             elsif (ref($gene) eq 'ARRAY') {
1663 0         0 @transcripts = @$gene;
1664             }
1665             else {
1666 0         0 return;
1667             }
1668 0 0       0 return unless @transcripts;
1669            
1670             # take appropriate transcripts
1671 0         0 my @keepers;
1672 0         0 foreach my $t (@transcripts) {
1673 0         0 my ($basic) = $t->get_tag_values('tag');
1674 0 0 0     0 if ($basic and $basic eq 'basic') {
1675 0         0 push @keepers, $t;
1676             }
1677             }
1678            
1679             # return
1680 0         0 return _return_filtered_transcripts($gene, \@keepers);
1681             }
1682              
1683              
1684             sub filter_transcript_biotype {
1685 1     1 1 550 my $gene = shift;
1686 1 50       3 return unless $gene;
1687 1         2 my $check = shift;
1688 1 50       2 confess "no biotype value to check provided!" unless $check;
1689            
1690             # get transcripts
1691 1         2 my @transcripts;
1692 1 50       6 if (ref($gene) =~ /seqfeature/i) {
    0          
1693 1         4 @transcripts = get_transcripts($gene);
1694             }
1695             elsif (ref($gene) eq 'ARRAY') {
1696 0         0 @transcripts = @$gene;
1697             }
1698             else {
1699 0         0 return;
1700             }
1701 1 50       3 return unless @transcripts;
1702            
1703             # take appropriate transcripts
1704 1         2 my @keepers;
1705 1         3 foreach my $t (@transcripts) {
1706 13   33     23 my ($value) = $t->get_tag_values('transcript_biotype') ||
1707             $t->get_tag_values('biotype');
1708 13   33     22 $value ||= $t->primary_tag;
1709 13 100 66     44 if ($value and $value =~ /$check/i) {
1710 2         4 push @keepers, $t;
1711             }
1712             }
1713            
1714             # return
1715 1         4 return _return_filtered_transcripts($gene, \@keepers);
1716             }
1717              
1718              
1719              
1720              
1721             #### internal methods
1722              
1723             sub _process_ucsc_transcript {
1724 0     0   0 my $transcript = shift;
1725 0   0     0 my $gene = shift || undef;
1726            
1727             # initialize ucsc hash
1728 0 0 0     0 my $ucsc = {
      0        
1729             'name' => $transcript->display_name || $transcript->primary_id,
1730             'name2' => undef,
1731             'chr' => $transcript->seq_id,
1732             'strand' => $transcript->strand < 0 ? '-' : '+',
1733             'txStart' => $transcript->start - 1,
1734             'txEnd' => $transcript->end,
1735             'cdsStart' => undef,
1736             'cdsEnd' => undef,
1737             'exonCount' => 0,
1738             'exonStarts' => [],
1739             'exonEnds' => [],
1740             'score' => $transcript->score || 1000,
1741             };
1742            
1743             # determine gene name
1744 0 0       0 if ($gene) {
1745             # use provided gene name
1746 0   0     0 $ucsc->{'name2'} = $gene->display_name || $gene->primary_id;
1747             }
1748             else {
1749             # reuse the name
1750 0         0 $ucsc->{'name2'} = $ucsc->{'name'};
1751             }
1752            
1753             # record CDS points
1754 0 0       0 if (is_coding($transcript)) {
1755 0         0 my $start = get_cdsStart($transcript);
1756 0         0 my $stop = get_cdsEnd($transcript);
1757 0 0 0     0 if ($start and $stop) {
1758 0         0 $ucsc->{cdsStart} = $start - 1;
1759 0         0 $ucsc->{cdsEnd} = $stop;
1760             }
1761             else {
1762             # if we don't have cds start/stop, then assign to transcript end
1763             # as if it is a noncoding transcript, regardless of primary_tag
1764 0         0 $ucsc->{cdsStart} = $transcript->end;
1765 0         0 $ucsc->{cdsEnd} = $transcript->end;
1766             }
1767             }
1768             else {
1769             # non-coding transcript sets the cds points to the transcript end
1770 0         0 $ucsc->{cdsStart} = $transcript->end;
1771 0         0 $ucsc->{cdsEnd} = $transcript->end;
1772             }
1773            
1774             # record the exons
1775 0         0 foreach my $exon (get_exons($transcript)) {
1776 0         0 push @{ $ucsc->{'exonStarts'} }, $exon->start - 1;
  0         0  
1777 0         0 push @{ $ucsc->{'exonEnds'} }, $exon->end;
  0         0  
1778 0         0 $ucsc->{'exonCount'} += 1;
1779             }
1780            
1781 0         0 return $ucsc;
1782             }
1783              
1784             sub _return_filtered_transcripts {
1785 1     1   3 my ($gene, $keepers) = @_;
1786            
1787 1 50       4 if (ref($gene) =~ /seqfeature/i) {
1788             # first check if we were only given a transcript
1789 1 50       2 if ($gene->primary_tag =~ /transcript|rna/i) {
1790             # we must have been given a single transcript to check, so return it
1791 0   0     0 $keepers->[0] ||= undef;
1792 0         0 return $keepers->[0];
1793             }
1794            
1795             # we can't delete subfeatures, so we're forced to create a new
1796             # parent gene and reattach the filtered transcripts
1797 1         3 my %attributes = $gene->attributes;
1798 1         5 return $gene->new(
1799             -seq_id => $gene->seq_id,
1800             -start => $gene->start,
1801             -end => $gene->end,
1802             -strand => $gene->strand,
1803             -primary_tag => $gene->primary_tag,
1804             -source => $gene->source_tag,
1805             -name => $gene->display_name,
1806             -id => $gene->primary_id,
1807             -attributes => \%attributes,
1808             -segments => $keepers,
1809             );
1810             }
1811             else {
1812 0           return $keepers;
1813             }
1814             }
1815              
1816              
1817              
1818             __END__