File Coverage

Bio/SeqFeature/Tools/Unflattener.pm
Criterion Covered Total %
statement 546 725 75.3
branch 222 344 64.5
condition 39 60 65.0
subroutine 33 42 78.5
pod 16 22 72.7
total 856 1193 71.7


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::SeqFeature::Tools::Unflattener
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Mungall
7             #
8             # Copyright Chris Mungall
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqFeature::Tools::Unflattener - turns flat list of genbank-sourced features into a nested SeqFeatureI hierarchy
17              
18             =head1 SYNOPSIS
19              
20             # standard / generic use - unflatten a genbank record
21             use Bio::SeqIO;
22             use Bio::SeqFeature::Tools::Unflattener;
23              
24             # generate an Unflattener object
25             $unflattener = Bio::SeqFeature::Tools::Unflattener->new;
26              
27             # first fetch a genbank SeqI object
28             $seqio =
29             Bio::SeqIO->new(-file=>'AE003644.gbk',
30             -format=>'GenBank');
31             my $out =
32             Bio::SeqIO->new(-format=>'asciitree');
33             while ($seq = $seqio->next_seq()) {
34              
35             # get top level unflattended SeqFeatureI objects
36             $unflattener->unflatten_seq(-seq=>$seq,
37             -use_magic=>1);
38             $out->write_seq($seq);
39              
40             @top_sfs = $seq->get_SeqFeatures;
41             foreach my $sf (@top_sfs) {
42             # do something with top-level features (eg genes)
43             }
44             }
45              
46              
47             =head1 DESCRIPTION
48              
49             Most GenBank entries for annotated genomic DNA contain a B list
50             of features. These features can be parsed into an equivalent flat list
51             of L objects using the standard L
52             classes. However, it is often desirable to B this list into
53             something resembling actual B, in which genes, mRNAs and CDSs
54             are B according to the nature of the gene model.
55              
56             The BioPerl object model allows us to store these kind of associations
57             between SeqFeatures in B -- any SeqFeatureI
58             object can contain nested SeqFeatureI objects. The
59             Bio::SeqFeature::Tools::Unflattener object facilitates construction of
60             these hierarchies from the underlying GenBank flat-feature-list
61             representation.
62              
63             For example, if you were to look at a typical GenBank DNA entry, say,
64             B, you would see a flat list of features:
65              
66             source
67              
68             gene CG4491
69             mRNA CG4491-RA
70             CDS CG4491-PA
71              
72             gene tRNA-Pro
73             tRNA tRNA-Pro
74              
75             gene CG32954
76             mRNA CG32954-RA
77             mRNA CG32954-RC
78             mRNA CG32954-RB
79             CDS CG32954-PA
80             CDS CG32954-PB
81             CDS CG32954-PC
82              
83             These features have sequence locations, but it is not immediately
84             clear how to write code such that each mRNA is linked to the
85             appropriate CDS (other than relying on IDs which is very bad)
86              
87             We would like to convert the above list into the B
88             hierarchy>, shown below:
89              
90             source
91             gene
92             mRNA CG4491-RA
93             CDS CG4491-PA
94             exon
95             exon
96             gene
97             tRNA tRNA-Pro
98             exon
99             gene
100             mRNA CG32954-RA
101             CDS CG32954-PA
102             exon
103             exon
104             mRNA CG32954-RC
105             CDS CG32954-PC
106             exon
107             exon
108             mRNA CG32954-RB
109             CDS CG32954-PB
110             exon
111             exon
112              
113             Where each feature is nested underneath its container. Note that exons
114             have been automatically inferred (even for tRNA genes).
115              
116             We do this using a call on a L
117             object
118              
119             @sfs = $unflattener->unflatten_seq(-seq=>$seq);
120              
121             This would return a list of the B (i.e. container)
122             SeqFeatureI objects - in this case, genes. Other top level features
123             are possible; for instance, the B feature which is always
124             present, and other features such as B or B
125             types.
126              
127             The containment hierarchy can be accessed using the get_SeqFeature()
128             call on any feature object - see L.
129             The following code will traverse the containment hierarchy for a
130             feature:
131              
132             sub traverse {
133             $sf = shift; # $sf isa Bio::SeqfeatureI
134              
135             # ...do something with $sf!
136              
137             # depth first traversal of containment tree
138             @contained_sfs = $sf->get_SeqFeatures;
139             traverse($_) foreach @contained_sfs;
140             }
141              
142             Once you have built the hierarchy, you can do neat stuff like turn the
143             features into 'rich' feature objects (eg
144             L) or convert to a suitable
145             format such as GFF3 or chadoxml (after mapping to the Sequence
146             Ontology); this step is not described here.
147              
148             =head1 USING MAGIC
149              
150             Due to the quixotic nature of how features are stored in
151             GenBank/EMBL/DDBJ, there is no guarantee that the default behaviour of
152             this module will produce perfect results. Sometimes it is hard or
153             impossible to build a correct containment hierarchy if the information
154             provided is simply too lossy, as is often the case. If you care deeply
155             about your data, you should always manually inspect the resulting
156             containment hierarchy; you may have to customise the algorithm for
157             building the hierarchy, or even manually tweak the resulting
158             hierarchy. This is explained in more detail further on in the document.
159              
160             However, if you are satisfied with the default behaviour, then you do
161             not need to read any further. Just make sure you set the parameter
162             B - this will invoke incantations which will magically
163             produce good results no matter what the idiosyncracies of the
164             particular GenBank record in question.
165              
166             For example
167              
168             $unflattener->unflatten_seq(-seq=>$seq,
169             -use_magic=>1);
170              
171             The success of this depends on the phase of the moon at the time the
172             entry was submitted to GenBank. Note that the magical recipe is being
173             constantly improved, so the results of invoking magic may vary
174             depending on the bioperl release.
175              
176             If you are skeptical of magic, or you wish to exact fine grained
177             control over how the entry is unflattened, or you simply wish to
178             understand more about how this crazy stuff works, then read on!
179              
180             =head1 PROBLEMATIC DATA AND INCONSISTENCIES
181              
182             Occasionally the Unflattener will have problems with certain
183             records. For example, the record may contain inconsistent data - maybe
184             there is an B entry that has no corresponding B location.
185              
186             The default behaviour is to throw an exception reporting the problem,
187             if the problem is relatively serious - for example, inconsistent data.
188              
189             You can exert more fine grained control over this - perhaps you want
190             the Unflattener to do the best it can, and report any problems. This
191             can be done - refer to the methods.
192              
193             error_threshold()
194              
195             get_problems()
196              
197             report_problems()
198              
199             ignore_problems()
200              
201             =head1 ALGORITHM
202              
203             This is the default algorithm; you should be able to override any part
204             of it to customise.
205              
206             The core of the algorithm is in two parts
207              
208             =over
209              
210             =item Partitioning the flat feature list into groups
211              
212             =item Resolving the feature containment hierarchy for each group
213              
214             =back
215              
216             There are other optional steps after the completion of these two
217             steps, such as B; we now describe in more detail what
218             is going on.
219              
220             =head2 Partitioning into groups
221              
222             First of all the flat feature list is partitioned into Bs.
223              
224             The default way of doing this is to use the B attribute; if we
225             look at two features from GenBank accession AE003644.3:
226              
227             gene 20111..23268
228             /gene="noc"
229             /locus_tag="CG4491"
230             /note="last curated on Thu Dec 13 16:51:32 PST 2001"
231             /map="35B2-35B2"
232             /db_xref="FLYBASE:FBgn0005771"
233             mRNA join(20111..20584,20887..23268)
234             /gene="noc"
235             /locus_tag="CG4491"
236             /product="CG4491-RA"
237             /db_xref="FLYBASE:FBgn0005771"
238              
239             Both these features share the same /gene tag which is "noc", so they
240             correspond to the same gene model (the CDS feature is not shown, but
241             this also has a tag-value /gene="noc").
242              
243             Not all groups need to correspond to gene models, but this is the most
244             common use case; later on we shall describe how to customise the
245             grouping.
246              
247             Sometimes other tags have to be used; for instance, if you look at the
248             entire record for AE003644.3 you will see you actually need the use the
249             /locus_tag attribute. This attribute is actually B in
250             most records!
251              
252             You can override this:
253              
254             $collection->unflatten_seq(-seq=>$seq, -group_tag=>'locus_tag');
255              
256             Alternatively, if you B<-use_magic>, the object will try and make a
257             guess as to what the correct group_tag should be.
258              
259             At the end of this step, we should have a list of groups - there is no
260             structure within a group; the group just serves to partition the flat
261             features. For the example data above, we would have the following groups.
262              
263             [ source ]
264             [ gene mRNA CDS ]
265             [ gene mRNA CDS ]
266             [ gene mRNA CDS ]
267             [ gene mRNA mRNA mRNA CDS CDS CDS ]
268              
269             =head3 Multicopy Genes
270              
271             Multicopy genes are usually rRNAs or tRNAs that are duplicated across
272             the genome. Because they are functionally equivalent, and usually have
273             the same sequence, they usually have the same group_tag (ie gene
274             symbol); they often have a /note tag giving copy number. This means
275             they will end up in the same group. This is undesirable, because they
276             are spatially disconnected.
277              
278             There is another step, which involves splitting spatially disconnected
279             groups into distinct groups
280              
281             this would turn this
282              
283             [gene-rrn3 rRNA-rrn3 gene-rrn3 rRNA-rrn3]
284              
285             into this
286              
287             [gene-rrn3 rRNA-rrn3] [gene-rrn3 rRNA-rrn3]
288              
289             based on the coordinates
290              
291             =head3 What next?
292              
293             The next step is to add some structure to each group, by making
294             B, trees that represent how the features
295             interrelate
296              
297             =head2 Resolving the containment hierarchy
298              
299             After the grouping is done, we end up with a list of groups which
300             probably contain features of type 'gene', 'mRNA', 'CDS' and so on.
301              
302             Singleton groups (eg the 'source' feature) are ignored at this stage.
303              
304             Each group is itself flat; we need to add an extra level of
305             organisation. Usually this is because different spliceforms
306             (represented by the 'mRNA' feature) can give rise to different
307             protein products (indicated by the 'CDS' feature). We want to correctly
308             associate mRNAs to CDSs.
309              
310             We want to go from a group like this:
311              
312             [ gene mRNA mRNA mRNA CDS CDS CDS ]
313              
314             to a containment hierarchy like this:
315              
316             gene
317             mRNA
318             CDS
319             mRNA
320             CDS
321             mRNA
322             CDS
323              
324             In which each CDS is nested underneath the correct corresponding mRNA.
325              
326             For entries that contain no alternate splicing, this is simple; we
327             know that the group
328              
329             [ gene mRNA CDS ]
330              
331             Must resolve to the tree
332              
333             gene
334             mRNA
335             CDS
336              
337             How can we do this in entries with alternate splicing? The bad
338             news is that there is no guaranteed way of doing this correctly for
339             any GenBank entry. Occasionally the submission will have been done in
340             such a way as to reconstruct the containment hierarchy. However, this
341             is not consistent across databank entries, so no generic solution can
342             be provided by this object. This module does provide the framework
343             within which you can customise a solution for the particular dataset
344             you are interested in - see later.
345              
346             The good news is that there is an inference we can do that should
347             produce pretty good results the vast majority of the time. It uses
348             splice coordinate data - this is the default behaviour of this module,
349             and is described in detail below.
350              
351             =head2 Using splice site coordinates to infer containment
352              
353             If an mRNA is to be the container for a CDS, then the splice site
354             coordinates (or intron coordinates, depending on how you look at it)
355             of the CDS must fit inside the splice site coordinates of the mRNA.
356              
357             Ambiguities can still arise, but the results produced should still be
358             reasonable and consistent at the sequence level. Look at this fake
359             example:
360              
361             mRNA XXX---XX--XXXXXX--XXXX join(1..3,7..8,11..16,19..23)
362             mRNA XXX-------XXXXXX--XXXX join(1..3,11..16,19..23)
363             CDS XXXX--XX join(13..16,19..20)
364             CDS XXXX--XX join(13..16,19..20)
365              
366             [obviously the positions have been scaled down]
367              
368             We cannot unambiguously match mRNA with CDS based on splice sites,
369             since both CDS share the splice site locations 16^17 and
370             18^19. However, the consequences of making a wrong match are probably
371             not very severe. Any annotation data attached to the first CDS is
372             probably identical to the seconds CDS, other than identifiers.
373              
374             The default behaviour of this module is to make an arbitrary call
375             where it is ambiguous (the mapping will always be bijective; i.e. one
376             mRNA -E one CDS).
377              
378             [TODO: NOTE: not tested on EMBL data, which may not be bijective; ie two
379             mRNAs can share the same CDS??]
380              
381             This completes the building of the containment hierarchy; other
382             optional step follow
383              
384             =head1 POST-GROUPING STEPS
385              
386             =head2 Inferring exons from mRNAs
387              
388             This step always occurs if B<-use_magic> is invoked.
389              
390             In a typical GenBank entry, the exons are B. That is they
391             can be inferred from the mRNA location.
392              
393             For example:
394              
395             mRNA join(20111..20584,20887..23268)
396              
397             This tells us that this particular transcript has two exons. In
398             bioperl, the mRNA feature will have a 'split location'.
399              
400             If we call
401              
402             $unflattener->feature_from_splitloc(-seq=>$seq);
403              
404             This will generate the necessary exon features, and nest them under
405             the appropriate mRNAs. Note that the mRNAs will no longer have split
406             locations - they will have simple locations spanning the extent of the
407             exons. This is intentional, to avoid redundancy.
408              
409             Occasionally a GenBank entry will have both implicit exons (from the
410             mRNA location) B explicit exon features.
411              
412             In this case, exons will still be transferred. Tag-value data from the
413             explicit exon will be transferred to the implicit exon. If exons are
414             shared between mRNAs these will be represented by different
415             objects. Any inconsistencies between implicit and explicit will be
416             reported.
417              
418             =head3 tRNAs and other noncoding RNAs
419              
420             exons will also be generated from these features
421              
422             =head2 Inferring mRNAs from CDS
423              
424             Some GenBank entries represent gene models using features of type
425             gene, mRNA and CDS; some entries just use gene and CDS.
426              
427             If we only have gene and CDS, then the containment hierarchies will
428             look like this:
429              
430             gene
431             CDS
432              
433             If we want the containment hierarchies to be uniform, like this
434              
435             gene
436             mRNA
437             CDS
438              
439             Then we must create an mRNA feature. This will have identical
440             coordinates to the CDS. The assumption is that there is either no
441             untranslated region, or it is unknown.
442              
443             To do this, we can call
444              
445             $unflattener->infer_mRNA_from_CDS(-seq=>$seq);
446              
447             This is taken care of automatically, if B<-use_magic> is invoked.
448              
449             =head1 ADVANCED
450              
451             =head2 Customising the grouping of features
452              
453             The default behaviour is suited mostly to building models of protein
454             coding genes and noncoding genes from genbank genomic DNA submissions.
455              
456             You can change the tag used to partition the feature by passing in a
457             different group_tag argument - see the unflatten_seq() method
458              
459             Other behaviour may be desirable. For example, even though SNPs
460             (features of type 'variation' in GenBank) are not actually part of the
461             gene model, it may be desirable to group SNPs that overlap or are
462             nearby gene models.
463              
464             It should certainly be possible to extend this module to do
465             this. However, I have yet to code this part!!! If anyone would find
466             this useful let me know.
467              
468             In the meantime, you could write your own grouping subroutine, and
469             feed the results into unflatten_groups() [see the method documentation
470             below]
471              
472             =head2 Customising the resolution of the containment hierarchy
473              
474             Once the flat list of features has been partitioned into groups, the
475             method unflatten_group() is called on each group to build a tree.
476              
477             The algorithm for doing this is described above; ambiguities are
478             resolved by using splice coordinates. As discussed, this can be
479             ambiguous.
480              
481             Some submissions may contain information in tags/attributes that hint
482             as to the mapping that needs to be made between the features.
483              
484             For example, with the Drosophila Melanogaster release 3 submission, we
485             see that CDS features in alternately spliced mRNAs have a form like
486             this:
487              
488             CDS join(145588..145686,145752..146156,146227..146493)
489             /locus_tag="CG32954"
490             /note="CG32954 gene product from transcript CG32954-RA"
491             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
492             /codon_start=1
493             /product="CG32954-PA"
494             /protein_id="AAF53403.1"
495             /db_xref="GI:7298167"
496             /db_xref="FLYBASE:FBgn0052954"
497             /translation="MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENP..."
498              
499             Here the /note tag provides the clue we need to link CDS to mRNA
500             (highlighted with ^^^^). We just need to find the mRNA with the tag
501              
502             /product="CG32954-RA"
503              
504             I have no idea how consistent this practice is across submissions; it
505             is consistent for the fruitfly genome submission.
506              
507             We can customise the behaviour of unflatten_group() by providing our
508             own resolver method. This obviously requires a bit of extra
509             programming, but there is no way to get around this.
510              
511             Here is an example of how to pass in your own resolver; this example
512             basically checks the parent (container) /product tag to see if it
513             matches the required string in the child (contained) /note tag.
514              
515             $unflattener->unflatten_seq(-seq=>$seq,
516             -group_tag=>'locus_tag',
517             -resolver_method=>sub {
518             my $self = shift;
519             my ($sf, @candidate_container_sfs) = @_;
520             if ($sf->has_tag('note')) {
521             my @notes = $sf->get_tag_values('note');
522             my @trnames = map {/from transcript\s+(.*)/;
523             $1} @notes;
524             @trnames = grep {$_} @trnames;
525             my $trname;
526             if (@trnames == 0) {
527             $self->throw("UNRESOLVABLE");
528             }
529             elsif (@trnames == 1) {
530             $trname = $trnames[0];
531             }
532             else {
533             $self->throw("AMBIGUOUS: @trnames");
534             }
535             my @container_sfs =
536             grep {
537             my ($product) =
538             $_->has_tag('product') ?
539             $_->get_tag_values('product') :
540             ('');
541             $product eq $trname;
542             } @candidate_container_sfs;
543             if (@container_sfs == 0) {
544             $self->throw("UNRESOLVABLE");
545             }
546             elsif (@container_sfs == 1) {
547             # we got it!
548             return $container_sfs[0];
549             }
550             else {
551             $self->throw("AMBIGUOUS");
552             }
553             }
554             });
555              
556             the resolver method is only called when there is more than one spliceform.
557              
558             =head2 Parsing mRNA records
559              
560             Some of the entries in sequence databanks are for mRNA sequences as
561             well as genomic DNA. We may want to build models from these too.
562              
563             NOT YET DONE - IN PROGRESS!!!
564              
565             Open question - what would these look like?
566              
567             Ideally we would like a way of combining a mRNA record with the
568             corresponding SeFeature entry from the appropriate genomic DNA
569             record. This could be problemmatic in some cases - for example, the
570             mRNA sequences may not match 100% (due to differences in strain,
571             assembly problems, sequencing problems, etc). What then...?
572              
573             =head1 SEE ALSO
574              
575             Feature table description
576              
577             http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
578              
579             =head1 FEEDBACK
580              
581             =head2 Mailing Lists
582              
583             User feedback is an integral part of the evolution of this and other
584             Bioperl modules. Send your comments and suggestions preferably to the
585             Bioperl mailing lists Your participation is much appreciated.
586              
587             bioperl-l@bioperl.org - General discussion
588             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
589              
590             =head2 Support
591              
592             Please direct usage questions or support issues to the mailing list:
593              
594             I
595              
596             rather than to the module maintainer directly. Many experienced and
597             reponsive experts will be able look at the problem and quickly
598             address it. Please include a thorough description of the problem
599             with code and data examples if at all possible.
600              
601             =head2 Reporting Bugs
602              
603             report bugs to the Bioperl bug tracking system to help us keep track
604             the bugs and their resolution. Bug reports can be submitted via the
605             web:
606              
607             https://github.com/bioperl/bioperl-live/issues
608              
609             =head1 AUTHOR - Chris Mungall
610              
611             Email: cjm@fruitfly.org
612              
613             =head1 APPENDIX
614              
615             The rest of the documentation details each of the object
616             methods. Internal methods are usually preceded with a _
617              
618             =cut
619              
620              
621             # Let the code begin...
622              
623             package Bio::SeqFeature::Tools::Unflattener;
624 3     3   1181 use strict;
  3         6  
  3         73  
625              
626             # Object preamble - inherits from Bio::Root::Root
627 3     3   39 use Bio::Location::Simple;
  3         6  
  3         56  
628 3     3   306 use Bio::SeqFeature::Generic;
  3         10  
  3         87  
629 3     3   23 use Bio::Range;
  3         5  
  3         75  
630              
631              
632 3     3   19 use base qw(Bio::Root::Root);
  3         9  
  3         18274  
633              
634             =head2 new
635              
636             Title : new
637             Usage : $unflattener = Bio::SeqFeature::Tools::Unflattener->new();
638             $unflattener->unflatten_seq(-seq=>$seq);
639             Function: constructor
640             Example :
641             Returns : a new Bio::SeqFeature::Tools::Unflattener
642             Args : see below
643              
644             Arguments
645              
646             -seq : A L object (optional)
647             the sequence to unflatten; this can also be passed in
648             when we call unflatten_seq()
649              
650             -group_tag : a string representing the /tag used to partition flat features
651             (see discussion above)
652              
653             =cut
654              
655              
656             sub new {
657 1     1 1 16 my($class,@args) = @_;
658 1         12 my $self = $class->SUPER::new(@args);
659              
660 1         10 my($seq, $group_tag, $trust_grouptag) =
661             $self->_rearrange([qw(SEQ
662             GROUP_TAG
663             TRUST_GROUPTAG
664             )],
665             @args);
666              
667 1 50       4 $seq && $self->seq($seq);
668 1 50       4 $group_tag && $self->group_tag($group_tag);
669             # $self->{'trust_grouptag'}= $trust_grouptag if($trust_grouptag); #dgg suggestion
670 1         2 return $self; # success - we hope!
671             }
672              
673             sub DESTROY {
674 1     1   3 my $self = shift;
675 1 50       4 return if $self->{_reported_problems};
676 1 50       4 return if $self->{_ignore_problems};
677 1         3 my @probs = $self->get_problems;
678 1 50 50     6 if (!$self->{_problems_reported} &&
679             scalar(@probs)) {
680 0         0 $self->warn(
681             "WARNING: There are UNREPORTED PROBLEMS.\n".
682             "You may wish to use the method report_problems(), \n",
683             "or ignore_problems() on the Unflattener object\n");
684             }
685 1         24 return;
686             }
687              
688             =head2 seq
689              
690             Title : seq
691             Usage : $unflattener->seq($newval)
692             Function:
693             Example :
694             Returns : value of seq (a Bio::SeqI)
695             Args : on set, new value (a Bio::SeqI, optional)
696              
697             The Bio::SeqI object should hold a flat list of Bio::SeqFeatureI
698             objects; this is the list that will be unflattened.
699              
700             The sequence object can also be set when we call unflatten_seq()
701              
702             =cut
703              
704             sub seq{
705 12     12 1 20 my $self = shift;
706              
707 12 100       33 return $self->{'seq'} = shift if @_;
708 11         42 return $self->{'seq'};
709             }
710              
711             =head2 group_tag
712              
713             Title : group_tag
714             Usage : $unflattener->group_tag($newval)
715             Function:
716             Example :
717             Returns : value of group_tag (a scalar)
718             Args : on set, new value (a scalar or undef, optional)
719              
720             This is the tag that will be used to collect elements from the flat
721             feature list into groups; for instance, if we look at two typical
722             GenBank features:
723              
724             gene 20111..23268
725             /gene="noc"
726             /locus_tag="CG4491"
727             /note="last curated on Thu Dec 13 16:51:32 PST 2001"
728             /map="35B2-35B2"
729             /db_xref="FLYBASE:FBgn0005771"
730             mRNA join(20111..20584,20887..23268)
731             /gene="noc"
732             /locus_tag="CG4491"
733             /product="CG4491-RA"
734             /db_xref="FLYBASE:FBgn0005771"
735              
736             We can see that these comprise the same gene model because they share
737             the same /gene attribute; we want to collect these together in groups.
738              
739             Setting group_tag is optional. The default is to use 'gene'. In the
740             example above, we could also use /locus_tag
741              
742             =cut
743              
744             sub group_tag{
745 9     9 1 24 my $self = shift;
746              
747 9 50       25 return $self->{'group_tag'} = shift if @_;
748 9         30 return $self->{'group_tag'};
749             }
750              
751             =head2 partonomy
752              
753             Title : partonomy
754             Usage : $unflattener->partonomy({mRNA=>'gene', CDS=>'mRNA')
755             Function:
756             Example :
757             Returns : value of partonomy (a scalar)
758             Args : on set, new value (a scalar or undef, optional)
759              
760             A hash representing the containment structure that the seq_feature
761             nesting should conform to; each key represents the contained (child)
762             type; each value represents the container (parent) type.
763              
764             =cut
765              
766             sub partonomy{
767 28863     28863 1 28314 my $self = shift;
768              
769 28863 100       36842 return $self->{'partonomy'} = shift if @_;
770 28849 100       37991 if (!$self->{'partonomy'}) {
771 1         3 $self->{'partonomy'} = $self->_default_partonomy;
772             }
773 28849         31932 return $self->{'partonomy'};
774             }
775              
776             sub _default_partonomy{
777             return {
778 1     1   11 mRNA => 'gene',
779             tRNA => 'gene',
780             rRNA => 'gene',
781             scRNA => 'gene',
782             snRNA => 'gene',
783             snoRNA => 'gene',
784             misc_RNA => 'gene',
785             CDS => 'mRNA',
786             exon => 'mRNA',
787             intron => 'mRNA',
788              
789             pseudoexon => 'pseudogene',
790             pseudointron => 'pseudogene',
791             pseudotranscript => 'pseudogene',
792             };
793             }
794              
795             =head2 structure_type
796              
797             Title : structure_type
798             Usage : $unflattener->structure_type($newval)
799             Function:
800             Example :
801             Returns : value of structure_type (a scalar)
802             Args : on set, new value (an int or undef, optional)
803              
804             GenBank entries conform to different flavours, or B
805             types>. Some have mRNAs, some do not.
806              
807             Right now there are only two base structure types defined. If you set
808             the structure type, then appropriate unflattening action will be
809             taken. The presence or absence of explicit exons does not affect the
810             structure type.
811              
812             If you invoke B<-use_magic> then this will be set automatically, based
813             on the content of the record.
814              
815             =over
816              
817             =item Type 0 (DEFAULT)
818              
819             typically contains
820              
821             source
822             gene
823             mRNA
824             CDS
825              
826             with this structure type, we want the seq_features to be nested like this
827              
828             gene
829             mRNA
830             CDS
831             exon
832              
833             exons and introns are implicit from the mRNA 'join' location
834              
835             to get exons from the mRNAs, you will need this call (see below)
836              
837             $unflattener->feature_from_splitloc(-seq=>$seq);
838              
839             =item Type 1
840              
841             typically contains
842              
843             source
844             gene
845             CDS
846             exon [optional]
847             intron [optional]
848              
849             there are no mRNA features
850              
851             with this structure type, we want the seq_features to be nested like this
852              
853             gene
854             CDS
855             exon
856             intron
857              
858             exon and intron may or may not be present; they may be implicit from
859             the CDS 'join' location
860              
861             =back
862              
863             =cut
864              
865             sub structure_type{
866 22     22 1 60 my $self = shift;
867              
868 22 100       61 return $self->{'structure_type'} = shift if @_;
869 11         35 return $self->{'structure_type'};
870             }
871              
872             =head2 get_problems
873              
874             Title : get_problems
875             Usage : @probs = get_problems()
876             Function: Get the list of problem(s) for this object.
877             Example :
878             Returns : An array of [severity, description] pairs
879             Args :
880              
881             In the course of unflattening a record, problems may occur. Some of
882             these problems are non-fatal, and can be ignored.
883              
884             Problems are represented as arrayrefs containing a pair [severity,
885             description]
886              
887             severity is a number, the higher, the more severe the problem
888              
889             the description is a text string
890              
891             =cut
892              
893             sub get_problems{
894 2     2 1 460 my $self = shift;
895              
896 2 50       7 return @{$self->{'_problems'}} if exists($self->{'_problems'});
  2         8  
897 0         0 return ();
898             }
899              
900             =head2 clear_problems
901              
902             Title : clear_problems
903             Usage :
904             Function: resets the problem list to empty
905             Example :
906             Returns :
907             Args :
908              
909              
910             =cut
911              
912             sub clear_problems{
913 1     1 1 8 my ($self,@args) = @_;
914 1         2 $self->{'_problems'} = [];
915 1         3 return;
916             }
917              
918              
919             # PRIVATE
920             # see get_problems
921             sub add_problem{
922 6     6 0 9 my $self = shift;
923              
924 6 100       18 $self->{'_problems'} = [] unless exists($self->{'_problems'});
925 6 50       15 if ($self->verbose > 0) {
926 0         0 warn( "PROBLEM: $_\n") foreach @_;
927             }
928 6         9 push(@{$self->{'_problems'}}, @_);
  6         16  
929             }
930              
931             # PRIVATE
932             # see get_problems
933             sub problem {
934 6     6 0 12 my $self = shift;
935 6         14 my ($severity, $desc, @sfs) = @_;
936 6 50       15 if (@sfs) {
937 6         11 foreach my $sf (@sfs) {
938             $desc .=
939             sprintf("\nSF [$sf]: ". $sf->location->to_FTstring . "; %s\n",
940             join('; ',
941             $sf->primary_tag,
942             map {
943 6 100       24 $sf->has_tag($_) ?
  24         38  
944             $sf->get_tag_values($_) : ()
945             } qw(locus_tag gene product label)));
946             }
947             }
948 6         15 my $thresh = $self->error_threshold;
949 6 50       14 if ($severity > $thresh) {
950 0         0 $self->{_problems_reported} = 1;
951 0         0 $self->throw("PROBLEM, SEVERITY==$severity\n$desc");
952             }
953 6         26 $self->add_problem([$severity, $desc]);
954 6         15 return;
955             }
956              
957             =head2 report_problems
958              
959             Title : report_problems
960             Usage : $unflattener->report_problems(\*STDERR);
961             Function:
962             Example :
963             Returns :
964             Args : FileHandle (defaults to STDERR)
965              
966              
967             =cut
968              
969             sub report_problems{
970 0     0 1 0 my ($self, $fh) = @_;
971              
972 0 0       0 if (!$fh) {
973 0         0 $fh = \*STDERR;
974             }
975 0         0 foreach my $problem ($self->get_problems) {
976 0         0 my ($sev, $desc) = @$problem;
977 0         0 printf $fh "PROBLEM, SEVERITY==$sev\n$desc\n";
978             }
979 0         0 $self->{_problems_reported} = 1;
980 0         0 return;
981             }
982              
983             =head2 ignore_problems
984              
985             Title : ignore_problems
986             Usage : $obj->ignore_problems();
987             Function:
988             Example :
989             Returns :
990             Args :
991              
992             Unflattener is very particular about problems it finds along the
993             way. If you have set the error_threshold such that less severe
994             problems do not cause exceptions, Unflattener still expects you to
995             report_problems() at the end, so that the user of the module is aware
996             of any inconsistencies or problems with the data. In fact, a warning
997             will be produced if there are unreported problems. To silence, this
998             warning, call the ignore_problems() method before the Unflattener
999             object is destroyed.
1000              
1001             =cut
1002              
1003             sub ignore_problems{
1004 0     0 1 0 my ($self) = @_;
1005 0         0 $self->{_ignore_problems} = 1;
1006 0         0 return;
1007             }
1008              
1009              
1010             =head2 error_threshold
1011              
1012             Title : error_threshold
1013             Usage : $obj->error_threshold($severity)
1014             Function:
1015             Example :
1016             Returns : value of error_threshold (a scalar)
1017             Args : on set, new value (an integer)
1018              
1019             Sets the threshold above which errors cause this module to throw an
1020             exception. The default is 0; all problems with a severity E 0 will
1021             cause an exception.
1022              
1023             If you raise the threshold to 1, then the unflattening process will be
1024             more lax; problems of severity==1 are generally non-fatal, but may
1025             indicate that the results should be inspected, for example, to make
1026             sure there is no data loss.
1027              
1028             =cut
1029              
1030             sub error_threshold{
1031 7     7 1 24 my $self = shift;
1032              
1033 7 100       21 return $self->{'error_threshold'} = shift if @_;
1034 6   50     17 return $self->{'error_threshold'} || 0;
1035             }
1036              
1037              
1038              
1039             # PRIVATE
1040             #
1041             # given a type (eg mRNA), will return the container type (eg gene)
1042             sub get_container_type{
1043 15023     15023 0 18890 my ($self,$type) = @_;
1044 15023         18141 my @roots = $self->_get_partonomy_roots;
1045 15023 100       17481 if (grep {$_ eq $type} @roots) {
  221597         249662  
1046             # it is a root - no parents/containers
1047 2725         5602 return;
1048             }
1049 12298         15468 my $ch = $self->partonomy;
1050 12298         14206 my $ctype = $ch->{$type};
1051 12298 100       15398 if (!$ctype) {
1052             # asterix acts as a wild card
1053 192         214 $ctype = $ch->{'*'};
1054             }
1055 12298         20937 return $ctype;
1056             }
1057              
1058             # get root node of partonomy hierarchy (usually gene)
1059             sub _get_partonomy_roots {
1060 15023     15023   14730 my $self = shift;
1061 15023         17005 my $ch = $self->partonomy;
1062 15023         44672 my @parents = values %$ch;
1063             # find parents that do not have parents themselves
1064 15023         17804 return grep {!$ch->{$_}} @parents;
  309029         373848  
1065             }
1066              
1067              
1068              
1069             =head2 unflatten_seq
1070              
1071             Title : unflatten_seq
1072             Usage : @sfs = $unflattener->unflatten_seq($seq);
1073             Function: turns a flat list of features into a list of holder features
1074             Example :
1075             Returns : list of Bio::SeqFeatureI objects
1076             Args : see below
1077              
1078             partitions a list of features then arranges them in a nested tree; see
1079             above for full explanation.
1080              
1081             note - the Bio::SeqI object passed in will be modified
1082              
1083             Arguments
1084              
1085             -seq : a Bio::SeqI object; must contain Bio::SeqFeatureI objects
1086             (this is optional if seq has already been set)
1087              
1088             -use_magic: if TRUE (ie non-zero) then magic will be invoked;
1089             see discussion above.
1090              
1091             -resolver_method: a CODE reference
1092             see the documentation above for an example of
1093             a subroutine that can be used to resolve hierarchies
1094             within groups.
1095              
1096             this is optional - if nothing is supplied, a default
1097             subroutine will be used (see below)
1098              
1099             -group_tag: a string
1100             [ see the group_tag() method ]
1101             this overrides the default group_tag which is 'gene'
1102              
1103              
1104              
1105             =cut
1106              
1107             sub unflatten_seq{
1108 11     11 1 284 my ($self,@args) = @_;
1109              
1110 11         61 my($seq, $resolver_method, $group_tag, $partonomy,
1111             $structure_type, $resolver_tag, $use_magic, $noinfer) =
1112             $self->_rearrange([qw(SEQ
1113             RESOLVER_METHOD
1114             GROUP_TAG
1115             PARTONOMY
1116             STRUCTURE_TYPE
1117             RESOLVER_TAG
1118             USE_MAGIC
1119             NOINFER
1120             )],
1121             @args);
1122              
1123             # seq we want to unflatten
1124 11   33     46 $seq = $seq || $self->seq;
1125 11 100       45 if (!$self->seq) {
1126 1         3 $self->seq($seq);
1127             }
1128              
1129              
1130             # prevent bad argument combinations
1131 11 50 66     43 if ($partonomy &&
1132             defined($structure_type)) {
1133 0         0 $self->throw("You cannot set both -partonomy and -structure_type\n".
1134             "(the former is implied by the latter)");
1135             }
1136              
1137             # remember the current value of partonomy, to reset later
1138 11         44 my $old_partonomy = $self->partonomy;
1139 11 100       38 $self->partonomy($partonomy) if defined $partonomy;
1140              
1141             # remember old structure_type
1142 11         41 my $old_structure_type = $self->structure_type;
1143 11 50       30 $self->structure_type($structure_type) if defined $structure_type;
1144              
1145             # if we are sourcing our data from genbank, all the
1146             # features should be flat (eq no sub_SeqFeatures)
1147 11         35 my @flat_seq_features = $seq->get_SeqFeatures;
1148 11         78 my @all_seq_features = $seq->get_all_SeqFeatures;
1149              
1150             # sanity checks
1151 11 50       58 if (@all_seq_features > @flat_seq_features) {
1152 0         0 $self->throw("It looks as if this sequence has already been unflattened");
1153             }
1154 11 50       41 if (@all_seq_features < @flat_seq_features) {
1155 0         0 $self->throw("ASSERTION ERROR: something is seriously wrong with your features");
1156             }
1157              
1158             # tag for ungrouping; usually /gene or /locus_tag
1159             # for example: /gene="foo"
1160 11   66     53 $group_tag = $group_tag || $self->group_tag;
1161 11 100       35 if ($use_magic) {
1162             # use magic to guess the group tag
1163             my @sfs_with_locus_tag =
1164 7         28 grep {$_->has_tag("locus_tag")} @flat_seq_features;
  6020         7332  
1165             my @sfs_with_gene_tag =
1166 7         28 grep {$_->has_tag("gene")} @flat_seq_features;
  6020         7343  
1167             my @sfs_with_product_tag =
1168 7         31 grep {$_->has_tag("product")} @flat_seq_features;
  6020         7241  
1169            
1170             # if ($group_tag && $self->{'trust_grouptag'}) { # dgg suggestion
1171             #
1172             # }
1173             # elsif
1174 7 100       29 if (@sfs_with_locus_tag) {
1175             # dgg note: would like to -use_magic with -group_tag = 'gene' for ensembl genomes
1176             # where ensembl gene FT have both /locus_tag and /gene, but mRNA, CDS have /gene only
1177 1 50 33     5 if ($group_tag && $group_tag ne 'locus_tag') {
1178 0         0 $self->throw("You have explicitly set group_tag to be '$group_tag'\n".
1179             "However, I detect that some features use /locus_tag\n".
1180             "I believe that this is the correct group_tag to use\n".
1181             "You can resolve this by either NOT setting -group_tag\n".
1182             "OR you can unset -use_magic to regain control");
1183             }
1184              
1185             # use /locus_tag instead of /gene tag for grouping
1186             # see GenBank entry AE003677 (version 3) for an example
1187 1         3 $group_tag = 'locus_tag';
1188 1 50       3 if ($self->verbose > 0) {
1189 0         0 warn "Set group tag to: $group_tag\n";
1190             }
1191             }
1192              
1193             # on rare occasions, records will have no /gene or /locus_tag
1194             # but it WILL have /product tags. These serve the same purpose
1195             # for grouping. For an example, see AY763288 (also in t/data)
1196 7 50 100     419 if (@sfs_with_locus_tag==0 &&
      66        
      66        
1197             @sfs_with_gene_tag==0 &&
1198             @sfs_with_product_tag>0 &&
1199             !$group_tag) {
1200 1         3 $group_tag = 'product';
1201 1 50       4 if ($self->verbose > 0) {
1202 0         0 warn "Set group tag to: $group_tag\n";
1203             }
1204            
1205             }
1206             }
1207 11 100       34 if (!$group_tag) {
1208 7         18 $group_tag = 'gene';
1209             }
1210              
1211             # ------------------------------
1212             # GROUP FEATURES using $group_tag
1213             # collect features into unstructured groups
1214             # ------------------------------
1215              
1216             # -------------
1217             # we want to generate a list of groups;
1218             # each group is a list of SeqFeatures; this
1219             # group probably (but not necessarily)
1220             # corresponds to a gene model.
1221             #
1222             # this array will look something like this:
1223             # ([$f1], [$f2, $f3, $f4], ...., [$f97, $f98, $f99])
1224             #
1225             # there are also 'singleton' groups, with one member.
1226             # for instance, the 'source' feature is in a singleton group;
1227             # the same with others such as 'misc_feature'
1228 11         23 my @groups = ();
1229             # -------------
1230              
1231             # --------------------
1232             # we hope that the genbank record allows us to group by some grouping
1233             # tag.
1234             # for instance, most of the time a gene model can be grouped using
1235             # the gene tag - that is where you see
1236             # /gene="foo"
1237             # in a genbank record
1238             # --------------------
1239            
1240             # keep an index of groups by their
1241             # grouping tag
1242 11         23 my %group_by_tag = ();
1243            
1244              
1245             # iterate through all features, putting them into groups
1246 11         29 foreach my $sf (@flat_seq_features) {
1247 6333 100       8600 if (!$sf->has_tag($group_tag)) {
1248             # SINGLETON
1249             # this is an ungroupable feature;
1250             # add it to a group of its own
1251 64         127 push(@groups, [$sf]);
1252             }
1253             else {
1254             # NON-SINGLETON
1255 6269         7571 my @group_tagvals = $sf->get_tag_values($group_tag);
1256 6269 50       8367 if (@group_tagvals > 1) {
1257             # sanity check:
1258             # currently something can only belong to one group
1259 0         0 $self->problem(2,
1260             ">1 value for /$group_tag: @group_tagvals\n".
1261             "At this time this module is not equipped to handle this adequately", $sf);
1262             }
1263             # get value of group tag
1264 6269         6521 my $gtv = shift @group_tagvals;
1265 6269 50       7320 $gtv || $self->throw("Empty /$group_tag vals not allowed!");
1266              
1267             # is this a new group?
1268 6269         6409 my $group = $group_by_tag{$gtv};
1269 6269 100       7114 if ($group) {
1270             # this group has been encountered before - add current
1271             # sf to the end of the group
1272 4253         6303 push(@$group, $sf);
1273             }
1274             else {
1275             # new group; add to index and create new group
1276 2016         2307 $group = [$sf]; # currently one member; probably more to come
1277 2016         2737 $group_by_tag{$gtv} = $group;
1278 2016         2739 push(@groups, $group);
1279             }
1280             }
1281             }
1282            
1283             # as well as having the same group_tag, a group should be spatially
1284             # connected. if not, then the group should be split into subgroups.
1285             # this turns out to be necessary in the case of multicopy genes.
1286             # the standard way to represent these is as spatially disconnected
1287             # gene models (usually a 'gene' feature and some kind of RNA feature)
1288             # with the same group tag; the code below will split these into
1289             # seperate groups, one per copy.
1290 11         37 @groups = map { $self->_split_group_if_disconnected($_) } @groups;
  2080         3782  
1291              
1292             # remove any duplicates; most of the time the method below has
1293             # no effect. there are some unusual genbank records for which
1294             # duplicate removal is necessary. see the comments in the
1295             # _remove_duplicates_from_group() method if you want to know
1296             # the ugly details
1297 11         68 foreach my $group (@groups) {
1298 2083         2347 $self->_remove_duplicates_from_group($group);
1299             }
1300              
1301             # -
1302              
1303             # PSEUDOGENES, PSEUDOEXONS AND PSEUDOINTRONS
1304             # these are indicated with the /pseudo tag
1305             # these are mapped to a different type; they should NOT
1306             # be treated as normal genes
1307 11         32 foreach my $sf (@all_seq_features) {
1308 6333 100       8478 if ($sf->has_tag('pseudo')) {
1309 899         1122 my $type = $sf->primary_tag;
1310             # SO type is typically the same as the normal
1311             # type but preceeded by "pseudo"
1312 899 100 66     1802 if ($type eq 'misc_RNA' || $type eq 'mRNA') {
1313             # dgg: see TypeMapper; both pseudo mRNA,misc_RNA should be pseudogenic_transcript
1314 101         131 $sf->primary_tag("pseudotranscript");
1315             }
1316             else {
1317 798         1353 $sf->primary_tag("pseudo$type");
1318             }
1319             }
1320             }
1321             # now some of the post-processing that follows which applies to
1322             # genes will NOT be applied to pseudogenes; this is deliberate
1323             # for example, gene models are normalised to be gene-transcript-exon
1324             # for pseudogenes we leave them as pseudogene-pseudoexon
1325              
1326             # --- MAGIC ---
1327 11         26 my $need_to_infer_exons = 0;
1328 11         21 my $need_to_infer_mRNAs = 0;
1329 11         24 my @removed_exons = ();
1330 11 100       36 if ($use_magic) {
1331 7 50       27 if (defined($structure_type)) {
1332 0         0 $self->throw("Can't combine use_magic AND setting structure_type");
1333             }
1334             my $n_introns =
1335 7         30 scalar(grep {$_->primary_tag eq 'exon'} @flat_seq_features);
  6020         8727  
1336             my $n_exons =
1337 7         31 scalar(grep {$_->primary_tag eq 'exon'} @flat_seq_features);
  6020         8852  
1338             my $n_mrnas =
1339 7         35 scalar(grep {$_->primary_tag eq 'mRNA'} @flat_seq_features);
  6020         8715  
1340             my $n_mrnas_attached_to_gene =
1341 7 100       32 scalar(grep {$_->primary_tag eq 'mRNA' &&
  6020         8744  
1342             $_->has_tag($group_tag)} @flat_seq_features);
1343             my $n_cdss =
1344 7         32 scalar(grep {$_->primary_tag eq 'CDS'} @flat_seq_features);
  6020         8837  
1345             my $n_rnas =
1346 7         34 scalar(grep {$_->primary_tag =~ /RNA/} @flat_seq_features);
  6020         9398  
1347             # Are there any CDS features in the record?
1348 7 100       27 if ($n_cdss > 0) {
1349             # YES
1350            
1351             # - a pc gene model should contain at the least a CDS
1352              
1353             # Are there any mRNA features in the record?
1354 6 100       27 if ($n_mrnas == 0) {
    50          
1355             # NO mRNAs:
1356             # looks like structure_type == 1
1357 1         7 $structure_type = 1;
1358 1         5 $need_to_infer_mRNAs = 1;
1359             }
1360             elsif ($n_mrnas_attached_to_gene == 0) {
1361             # $n_mrnas > 0
1362             # $n_mrnas_attached_to_gene = 0
1363             #
1364             # The entries _do_ contain mRNA features,
1365             # but none of them are part of a group/gene, i.e. they
1366             # are 'floating'
1367              
1368             # this is an annoying weird file that has some floating
1369             # mRNA features;
1370             # eg ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/
1371            
1372 0 0       0 if ($self->verbose) {
1373             my @floating_mrnas =
1374 0 0       0 grep {$_->primary_tag eq 'mRNA' &&
  0         0  
1375             !$_->has_tag($group_tag)} @flat_seq_features;
1376 0         0 printf STDERR "Unattached mRNAs:\n";
1377 0         0 foreach my $mrna (@floating_mrnas) {
1378 0         0 $self->_write_sf_detail($mrna);
1379             }
1380 0         0 printf STDERR "Don't know how to deal with these; filter at source?\n";
1381             }
1382              
1383 0         0 foreach (@flat_seq_features) {
1384 0 0       0 if ($_->primary_tag eq 'mRNA') {
1385             # what should we do??
1386            
1387             # I think for pombe we just have to filter
1388             # out bogus mRNAs prior to starting
1389             }
1390             }
1391              
1392             # looks like structure_type == 2
1393 0         0 $structure_type = 2;
1394 0         0 $need_to_infer_mRNAs = 1;
1395             }
1396             else {
1397             }
1398              
1399             # we always infer exons in magic mode
1400 6         12 $need_to_infer_exons = 1;
1401             }
1402             else {
1403             # this doesn't seem to be any kind of protein coding gene model
1404 1 50       3 if ( $n_rnas > 0 ) {
1405 1         3 $need_to_infer_exons = 1;
1406             }
1407             }
1408              
1409 7 50       22 $need_to_infer_exons = 0 if $noinfer; #NML
1410              
1411 7 50       39 if ($need_to_infer_exons) {
1412             # remove exons and introns from group -
1413             # we will infer exons later, and we
1414             # can always infer introns from exons
1415 7         23 foreach my $group (@groups) {
1416             @$group =
1417             grep {
1418 1936         2460 my $type = $_->primary_tag();
  6009         8972  
1419 6009 100       8184 if ($type eq 'exon') {
1420             # keep track of all removed exons,
1421             # so we can do a sanity check later
1422 130         286 push(@removed_exons, $_);
1423             }
1424 6009 100       14600 $type ne 'exon' && $type ne 'intron'
1425             } @$group;
1426             }
1427             # get rid of any groups that have zero members
1428 7         29 @groups = grep {scalar(@$_)} @groups;
  1936         2235  
1429             }
1430             }
1431             # --- END OF MAGIC ---
1432            
1433             # LOGICAL ASSERTION
1434 11 50       31 if (grep {!scalar(@$_)} @groups) {
  2078         2237  
1435 0         0 $self->throw("ASSERTION ERROR: empty group");
1436             }
1437              
1438             # LOGGING
1439 11 50       56 if ($self->verbose > 0) {
1440 0         0 printf STDERR "GROUPS:\n";
1441 0         0 foreach my $group (@groups) {
1442 0         0 $self->_write_group($group, $group_tag);
1443             }
1444             }
1445             # -
1446              
1447             # --------- FINISHED GROUPING -------------
1448              
1449              
1450             # TYPE CONTAINMENT HIERARCHY (aka partonomy)
1451             # set the containment hierarchy if desired
1452             # see docs for structure_type() method
1453 11 100       38 if ($structure_type) {
1454 1 50       9 if ($structure_type == 1) {
1455 1         16 $self->partonomy(
1456             {CDS => 'gene',
1457             exon => 'CDS',
1458             intron => 'CDS',
1459             }
1460             );
1461             }
1462             else {
1463 0         0 $self->throw("structure_type $structure_type is currently unknown");
1464             }
1465             }
1466              
1467             # see if we have an obvious resolver_tag
1468 11 100       40 if ($use_magic) {
1469 7         24 foreach my $sf (@all_seq_features) {
1470 6020 100       8418 if ($sf->has_tag('derived_from')) {
1471 2         5 $resolver_tag = 'derived_from';
1472             }
1473             }
1474             }
1475              
1476 11 100       51 if ($use_magic) {
1477             # point all feature types without a container type to the root type.
1478             #
1479             # for example, if we have an unanticipated feature_type, say
1480             # 'aberration', this should by default point to the parent 'gene'
1481 7         20 foreach my $group (@groups) {
1482 1931         3342 my @sfs = @$group;
1483 1931 100       2640 if (@sfs > 1) {
1484 1384         1553 foreach my $sf (@sfs) {
1485 5227         7161 my $type = $sf->primary_tag;
1486 5227 100       7259 next if $type eq 'gene';
1487 3989         4462 my $container_type = $self->get_container_type($type);
1488 3989 100       6240 if (!$container_type) {
1489 9         16 $self->partonomy->{$type} = 'gene';
1490             }
1491             }
1492             }
1493             }
1494             }
1495              
1496             # we have done the first part of the unflattening.
1497             # we now have a list of groups; each group is a list of seqfeatures.
1498             # the actual group itself is flat; we may want to unflatten this further;
1499             # for instance, a gene model can contain multiple mRNAs and CDSs. We may want
1500             # to link the correct mRNA to the correct CDS via the bioperl sub_SeqFeature tree.
1501             #
1502             # what we would end up with would be
1503             # gene1
1504             # mRNA-a
1505             # CDS-a
1506             # mRNA-b
1507             # CDS-b
1508 11         72 my @top_sfs = $self->unflatten_groups(-groups=>\@groups,
1509             -resolver_method=>$resolver_method,
1510             -resolver_tag=>$resolver_tag);
1511            
1512             # restore settings
1513 11         83 $self->partonomy($old_partonomy);
1514              
1515             # restore settings
1516 11         44 $self->structure_type($old_structure_type);
1517              
1518             # modify the original Seq object - the top seqfeatures are now
1519             # the top features from each group
1520 11         74 $seq->remove_SeqFeatures;
1521 11         56 $seq->add_SeqFeature($_) foreach @top_sfs;
1522              
1523             # --------- FINISHED UNFLATTENING -------------
1524              
1525             # lets see if there are any post-unflattening tasks we need to do
1526              
1527            
1528              
1529             # INFERRING mRNAs
1530 11 100       38 if ($need_to_infer_mRNAs) {
1531 1 50       5 if ($self->verbose > 0) {
1532 0         0 printf STDERR "** INFERRING mRNA from CDS\n";
1533             }
1534 1         6 $self->infer_mRNA_from_CDS(-seq=>$seq, -noinfer=>$noinfer);
1535             }
1536              
1537             # INFERRING exons
1538 11 100       32 if ($need_to_infer_exons) {
1539              
1540             # infer exons, one group/gene at a time
1541 7         15 foreach my $sf (@top_sfs) {
1542 2034         4051 my @sub_sfs = ($sf, $sf->get_all_SeqFeatures);
1543 2034         4301 $self->feature_from_splitloc(-features=>\@sub_sfs);
1544             }
1545              
1546             # some exons are stated explicitly; ie there is an "exon" feature
1547             # most exons are inferred; ie there is a "mRNA" feature with
1548             # split locations
1549             #
1550             # if there were exons explicitly stated in the entry, we need to
1551             # do two things:
1552             #
1553             # make sure these exons are consistent with the inferred exons
1554             # (you never know)
1555             #
1556             # transfer annotation (tag-vals) from the explicit exon to the
1557             # new inferred exon
1558 7 100       33 if (@removed_exons) {
1559 2         12 my @allfeats = $seq->get_all_SeqFeatures;
1560              
1561             # find all the inferred exons that are children of mRNA
1562 2         9 my @mrnas = grep {$_->primary_tag eq 'mRNA'} @allfeats;
  213         293  
1563             my @exons =
1564 160         212 grep {$_->primary_tag eq 'exon'}
1565 2         5 map {$_->get_SeqFeatures} @mrnas;
  26         32  
1566              
1567 2         7 my %exon_h = (); # index of exons by location;
1568              
1569             # there CAN be >1 exon at a location; we can represent these redundantly
1570             # (ie as a tree, not a graph)
1571 2         7 push(@{$exon_h{$self->_locstr($_)}}, $_) foreach @exons;
  134         200  
1572 2         6 my @problems = (); # list of problems;
1573             # each problem is a
1574             # [$severity, $description] pair
1575 2         4 my $problem = '';
1576 2         10 my ($n_exons, $n_removed_exons) =
1577             (scalar(keys %exon_h), scalar(@removed_exons));
1578 2         6 foreach my $removed_exon (@removed_exons) {
1579 130         190 my $locstr = $self->_locstr($removed_exon);
1580 130         212 my $inferred_exons = $exon_h{$locstr};
1581 130         205 delete $exon_h{$locstr};
1582 130 50       176 if ($inferred_exons) {
1583 130         151 my %exons_done = ();
1584 130         177 foreach my $exon (@$inferred_exons) {
1585              
1586             # make sure we don't move stuff twice
1587 134 100       271 next if $exons_done{$exon};
1588 130         195 $exons_done{$exon} = 1;
1589              
1590             # we need to tranfer any tag-values from the explicit
1591             # exon to the implicit exon
1592 130         215 foreach my $tag ($removed_exon->get_all_tags) {
1593 284         445 my @vals = $removed_exon->get_tag_values($tag);
1594 284 50       629 if (!$exon->can("add_tag_value")) {
1595             # I'm puzzled as to what should be done here;
1596             # SeqFeatureIs are not necessarily mutable,
1597             # but we know that in practice the implementing
1598             # class is mutable
1599 0         0 $self->throw("The SeqFeature object does not ".
1600             "implement add_tag_value()");
1601             }
1602 284         427 $exon->add_tag_value($tag, @vals);
1603             }
1604             }
1605             }
1606             else {
1607             # no exons inferred at $locstr
1608 0         0 push(@problems,
1609             [1,
1610             "there is a conflict with exons; there was an explicitly ".
1611             "stated exon with location $locstr, yet I cannot generate ".
1612             "this exon from the supplied mRNA locations\n"]);
1613             }
1614             }
1615             # do we have any inferred exons left over, that were not
1616             # covered in the explicit exons?
1617 2 50       8 if (keys %exon_h) {
1618             # TODO - we ignore this problem for now
1619 0         0 push(@problems,
1620             [1,
1621             sprintf("There are some inferred exons that are not in the ".
1622             "explicit exon list; they are the exons at locations:\n".
1623             join("\n", keys %exon_h)."\n")]);
1624             }
1625              
1626             # report any problems
1627 2 50       43 if (@problems) {
1628 0         0 my $thresh = $self->error_threshold;
1629 0         0 my @bad_problems = grep {$_->[0] > $thresh} @problems;
  0         0  
1630 0 0       0 if (@bad_problems) {
1631 0         0 printf STDERR "PROBLEM:\n";
1632 0         0 $self->_write_hier(\@top_sfs);
1633             # TODO - allow more fine grained control over this
1634 0         0 $self->{_problems_reported} = 1;
1635             $self->throw(join("\n",
1636 0         0 map {"@$_"} @bad_problems));
  0         0  
1637             }
1638 0         0 $self->problem(@$_) foreach @problems;
1639             }
1640             }
1641             }
1642             # --- end of inferring exons --
1643              
1644             # return new top level features; this can also
1645             # be retrieved via
1646             # $seq->get_SeqFeatures();
1647             # return @top_sfs;
1648 11         172 return $seq->get_SeqFeatures;
1649             }
1650              
1651             # _split_group_if_disconnected([@sfs])
1652             #
1653             # as well as having the same group_tag, a group should be spatially
1654             # connected. if not, then the group should be split into subgroups.
1655             # this turns out to be necessary in the case of multicopy genes.
1656             # the standard way to represent these is as spatially disconnected
1657             # gene models (usually a 'gene' feature and some kind of RNA feature)
1658             # with the same group tag; the code below will split these into
1659             # seperate groups, one per copy.
1660              
1661             sub _split_group_if_disconnected {
1662 2080     2080   2430 my $self = shift;
1663 2080         2450 my $group = shift;
1664 2080         4173 my @sfs = @$group;
1665 2080         4335 my @ranges =
1666             Bio::Range->disconnected_ranges(@sfs);
1667 2080         2464 my @groups;
1668 2080 50       4394 if (@ranges == 0) {
    100          
1669 0         0 $self->throw("ASSERTION ERROR");
1670             }
1671             elsif (@ranges == 1) {
1672             # no need to split the group
1673 2077         2984 @groups = ($group);
1674             }
1675             else {
1676             # @ranges > 1
1677             # split the group into disconnected ranges
1678 3 50       10 if ($self->verbose > 0) {
1679 0         0 printf STDERR "GROUP PRE-SPLIT:\n";
1680 0         0 $self->_write_group($group, $self->group_tag);
1681             }
1682             @groups =
1683             map {
1684 3         8 my $range = $_;
  6         10  
1685             [grep {
1686 6         11 $_->intersection($range);
  136         243  
1687             } @sfs]
1688             } @ranges;
1689 3 50       14 if ($self->verbose > 0) {
1690 0         0 printf STDERR "SPLIT GROUPS:\n";
1691 0         0 $self->_write_group($_, $self->group_tag) foreach @groups;
1692             }
1693             }
1694 2080         4182 return @groups;
1695             }
1696              
1697             sub _remove_duplicates_from_group {
1698 2083     2083   1977 my $self = shift;
1699 2083         1944 my $group = shift;
1700              
1701             # ::: WEIRD BOUNDARY CASE CODE :::
1702             # for some reason, there are some gb records with two gene
1703             # features for one gene; for example, see ATF14F8.gbk
1704             # in the t/data directory
1705             #
1706             # in this case, we get rid of one of the genes
1707              
1708 2083         2464 my @genes = grep {$_->primary_tag eq 'gene'} @$group;
  6333         9013  
1709 2083 100       2906 if (@genes > 1) {
1710             # OK, if we look at ATF14F8.gbk we see that some genes
1711             # just exist as a single location, some exist as a multisplit location;
1712             #
1713             # eg
1714              
1715             # gene 16790..26395
1716             # /gene="F14F8_60"
1717             # ...
1718             # gene complement(join(16790..19855,20136..20912,21378..21497,
1719             # 21654..21876,22204..22400,22527..23158,23335..23448,
1720             # 23538..23938,24175..24536,24604..24715,24889..24984,
1721             # 25114..25171,25257..25329,25544..25589,25900..26018,
1722             # 26300..26395))
1723             # /gene="F14F8_60"
1724              
1725             # the former is the 'standard' way of representing the gene in genbank;
1726             # the latter is redundant with the CDS entry. So we shall get rid of
1727             # the latter with the following filter
1728              
1729 11 50       25 if ($self->verbose > 0) {
1730 0         0 printf STDERR "REMOVING DUPLICATES:\n";
1731             }
1732              
1733             @genes =
1734             grep {
1735 11         14 my $loc = $_->location;
  22         42  
1736 22 100       65 if ($loc->isa("Bio::Location::SplitLocationI")) {
1737 10         19 my @locs = $loc->each_Location;
1738 10 50       57 if (@locs > 1) {
1739 10         49 0;
1740             }
1741             else {
1742 0         0 1;
1743             }
1744             }
1745             else {
1746 12         25 1;
1747             }
1748             } @genes;
1749              
1750 11 100       23 if (@genes > 1) {
1751             # OK, that didn't work. Our only resort is to just pick one at random
1752 1         4 @genes = ($genes[0]);
1753             }
1754 11 50       24 if (@genes) {
1755 11 50       21 @genes == 1 || $self->throw("ASSERTION ERROR");
1756             @$group =
1757 11         19 ($genes[0], grep {$_->primary_tag ne 'gene'} @$group);
  170         239  
1758             }
1759             }
1760             # its a dirty job but someone's gotta do it
1761 2083         2668 return;
1762             }
1763              
1764              
1765             =head2 unflatten_groups
1766              
1767             Title : unflatten_groups
1768             Usage :
1769             Function: iterates over groups, calling unflatten_group() [see below]
1770             Example :
1771             Returns : list of Bio::SeqFeatureI objects that are holders
1772             Args : see below
1773              
1774             Arguments
1775              
1776             -groups: list of list references; inner list is of Bio::SeqFeatureI objects
1777             e.g. ( [$sf1], [$sf2, $sf3, $sf4], [$sf5, ...], ...)
1778              
1779             -resolver_method: a CODE reference
1780             see the documentation above for an example of
1781             a subroutine that can be used to resolve hierarchies
1782             within groups.
1783              
1784             this is optional - a default subroutine will be used
1785              
1786              
1787             NOTE: You should not need to call this method, unless you want fine
1788             grained control over how the unflattening process.
1789              
1790             =cut
1791              
1792             sub unflatten_groups{
1793 11     11 1 50 my ($self,@args) = @_;
1794 11         83 my($groups, $resolver_method, $resolver_tag) =
1795             $self->_rearrange([qw(GROUPS
1796             RESOLVER_METHOD
1797             RESOLVER_TAG
1798             )],
1799             @args);
1800              
1801             # this is just a simple wrapper for unflatten_group()
1802             return
1803             map {
1804 11         44 $self->unflatten_group(-group=>$_,
  2078         5341  
1805             -resolver_method=>$resolver_method,
1806             -resolver_tag=>$resolver_tag)
1807             } @$groups;
1808             }
1809              
1810             =head2 unflatten_group
1811              
1812             Title : unflatten_group
1813             Usage :
1814             Function: nests a group of features into a feature containment hierarchy
1815             Example :
1816             Returns : Bio::SeqFeatureI objects that holds other features
1817             Args : see below
1818              
1819             Arguments
1820              
1821             -group: reference to list of Bio::SeqFeatureI objects
1822              
1823             -resolver_method: a CODE reference
1824             see the documentation above for an example of
1825             a subroutine that can be used to resolve hierarchies
1826             within groups
1827              
1828             this is optional - a default subroutine will be used
1829              
1830              
1831             NOTE: You should not need to call this method, unless you want fine
1832             grained control over how the unflattening process.
1833              
1834             =cut
1835              
1836             sub unflatten_group{
1837 2078     2078 1 4258 my ($self,@args) = @_;
1838              
1839 2078         4951 my($group, $resolver_method, $resolver_tag) =
1840             $self->_rearrange([qw(GROUP
1841             RESOLVER_METHOD
1842             RESOLVER_TAG
1843             )],
1844             @args);
1845              
1846 2078 50       4949 if ($self->verbose > 0) {
1847 0         0 printf STDERR "UNFLATTENING GROUP:\n";
1848 0         0 $self->_write_group($group, $self->group_tag);
1849             }
1850              
1851 2078         4233 my @sfs = @$group;
1852              
1853             # we can safely ignore singletons (e.g. [source])
1854 2078 100       4743 return $sfs[0] if @sfs == 1;
1855              
1856 1508         2501 my $partonomy = $self->partonomy;
1857              
1858             # $resolver_method is a reference to a SUB that will resolve
1859             # ambiguous parent/child containment; for example, determining
1860             # which mRNAs go with which CDSs
1861 1508   100     4562 $resolver_method = $resolver_method || \&_resolve_container_for_sf;
1862              
1863             # TAG BASED RESOLVING OF HIERARCHIES
1864             #
1865             # if the user specifies $resolver_tag, then we use this tag
1866             # to pair up ambiguous parents and children;
1867             #
1868             # for example, the CDS feature may have a resolver tag of /derives_from
1869             # which is a 'foreign key' into the /label tag of the mRNA feature
1870             #
1871             # this kind of tag-based resolution is possible for a certain subset
1872             # of genbank records
1873             #
1874             # if no resolver tag is specified, we revert to the normal
1875             # resolver_method
1876 1508 100       2795 if ($resolver_tag) {
1877 1         4 my $backup_resolver_method = $resolver_method;
1878             # closure: $resolver_tag is remembered by this sub
1879             my $sub =
1880             sub {
1881 2     2   7 my ($self, $sf, @possible_container_sfs) = @_;
1882 2         7 my @container_sfs = ();
1883 2 50       7 if ($sf->has_tag($resolver_tag)) {
1884 2         9 my ($resolver_tagval) = $sf->get_tag_values($resolver_tag);
1885             # if a feature has a resolver_tag (e.g. /derives_from)
1886             # this specifies the /product, /symbol or /label for the
1887             # parent feature
1888             @container_sfs =
1889             grep {
1890 2         4 my $match = 0;
  4         5  
1891 4 50       10 $self->_write_sf($_) if $self->verbose > 0;
1892 4         11 foreach my $tag (qw(product symbol label)) {
1893 10 100       19 if ($_->has_tag($tag)) {
1894 6         13 my @vals =
1895             $_->get_tag_values($tag);
1896 6 100       13 if (grep {$_ eq $resolver_tagval} @vals) {
  6         23  
1897 2         7 $match = 1;
1898 2         7 last;
1899             }
1900             }
1901             }
1902 4         11 $match;
1903             } @possible_container_sfs;
1904             }
1905             else {
1906 0         0 return $backup_resolver_method->($sf, @possible_container_sfs);
1907             }
1908 2         6 return map {$_=>0} @container_sfs;
  2         11  
1909 1         25 };
1910 1         3 $resolver_method = $sub;
1911             }
1912             else {
1913             # CONDITION: $resolver_tag is NOT set
1914 1507 50       2646 $self->throw("assertion error") if $resolver_tag;
1915             }
1916             # we have now set $resolver_method to a subroutine for
1917             # disambiguatimng parent/child relationships. we will
1918             # now build the whole containment hierarchy for this group
1919              
1920              
1921             # FIND TOP/ROOT SEQFEATURES
1922             #
1923             # find all the features for which there is no
1924             # containing feature type (eg genes)
1925             my @top_sfs =
1926             grep {
1927 1508         2065 !$self->get_container_type($_->primary_tag);
  5517         8985  
1928             } @sfs;
1929              
1930             # CONDITION: there must be at most one root
1931 1508 50       3105 if (@top_sfs > 1) {
1932 0         0 $self->_write_group($group, $self->group_tag);
1933 0         0 printf STDERR "TOP SFS:\n";
1934 0         0 $self->_write_sf($_) foreach @top_sfs;
1935 0         0 $self->throw("multiple top-sfs in group");
1936             }
1937 1508         1952 my $top_sf = $top_sfs[0];
1938              
1939             # CREATE INDEX OF SEQFEATURES BY TYPE
1940 1508         1840 my %sfs_by_type = ();
1941 1508         2286 foreach my $sf (@sfs) {
1942 5517         5818 push(@{$sfs_by_type{$sf->primary_tag}}, $sf);
  5517         7770  
1943             }
1944              
1945             # containment index; keyed by child; lookup parent
1946             # note: this index uses the stringified object reference of
1947             # the object as a surrogate lookup key
1948              
1949 1508         1934 my %container = (); # child -> parent
1950              
1951             # ALGORITHM: build containment graph
1952             #
1953             # find all possible containers for each SF;
1954             # for instance, for a CDS, the possible containers are all
1955             # the mRNAs in the same group. For a mRNA, the possible
1956             # containers are any SFs of type 'gene' (should only be 1).
1957             # (these container-type mappings can be overridden)
1958             #
1959             # contention is resolved by checking coordinates of splice sites
1960             # (this is the default, but can be overridden)
1961             #
1962             # most of the time, there is no problem identifying a unique
1963             # parent for every child; this can be ambiguous when constructing
1964             # CDS to mRNA relationships with lots of alternate splicing
1965             #
1966             # a hash of child->parent relationships is constructed (%container)
1967             # any mappings that need further resolution (eg CDS to mRNA) are
1968             # placed in %unresolved
1969              
1970             # %unresolved index
1971             # (keyed by stringified object reference of child seqfeature)
1972 1508         1700 my %unresolved = (); # child -> [parent,score] to be resolved
1973            
1974             # index of seqfeatures by their stringified object reference;
1975             # this is essentially a way of 'reviving' an object from its stringified
1976             # reference
1977             # (see NOTE ON USING OBJECTS AS KEYS IN HASHES, below)
1978 1508         1996 my %idxsf = map {$_=>$_} @sfs;
  5517         9947  
1979              
1980 1508         2569 foreach my $sf (@sfs) {
1981 5517         8434 my $type = $sf->primary_tag;
1982              
1983             # container type (e.g. the container type for CDS is usually mRNA)
1984 5517         8012 my $container_type =
1985             $self->get_container_type($type);
1986 5517 100       8441 if ($container_type) {
1987              
1988             my @possible_container_sfs =
1989 4155 100       4163 @{$sfs_by_type{$container_type} || []};
  4155         8319  
1990             # we now have a list of possible containers
1991             # (eg for a CDS in an alternately spliced gene, this
1992             # would be a list of all the mRNAs for this gene)
1993              
1994 4155 100       6374 if (!@possible_container_sfs) {
1995             # root of hierarchy
1996             }
1997             else {
1998 3906 100       5749 if (@possible_container_sfs == 1) {
1999             # this is the easy situation, whereby the containment
2000             # hierarchy is unambiguous. this will probably be the
2001             # case if the genbank record has no alternate splicing
2002             # within it
2003              
2004             # ONE OPTION ONLY - resolved!
2005 2934         6502 $container{$sf} = $possible_container_sfs[0];
2006              
2007             }
2008             else {
2009             # MULTIPLE CONTAINER CHOICES
2010 972 50       1870 $self->throw("ASSERTION ERROR") unless @possible_container_sfs > 1;
2011              
2012             # push this onto the %unresolved graph, and deal with it
2013             # later
2014              
2015             # for now we hardcode things such that the only type
2016             # with ambiguous parents is a CDS; if this is violated,
2017             # it has a weak problem class of '1' so the API user
2018             # can easily set things to ignore these
2019 972 50       1719 if ($sf->primary_tag ne 'CDS') {
2020 0         0 $self->problem(1,
2021             "multiple container choice for non-CDS; ".
2022             "CDS to mRNA should be the only ".
2023             "relationships requiring resolving",
2024             $sf);
2025             }
2026              
2027             # previously we set the SUB $resolver_method
2028 972 50       1944 $self->throw("ASSERTION ERROR")
2029             unless $resolver_method;
2030              
2031             # $resolver_method will assign scores to
2032             # parent/child combinations; later on we
2033             # will use these scores to find the optimal
2034             # parent/child pairings
2035              
2036             # the default $resolver_method uses splice sites to
2037             # score possible parent/child matches
2038              
2039 972         1872 my %container_sfh =
2040             $resolver_method->($self, $sf, @possible_container_sfs);
2041 972 100       2316 if (!%container_sfh) {
2042 6         36 $self->problem(2,
2043             "no containers possible for SeqFeature of ".
2044             "type: $type; this SF is being placed at ".
2045             "root level",
2046             $sf);
2047             # RESOLVED! (sort of - placed at root/gene level)
2048 6         24 $container{$sf} = $top_sf;
2049              
2050             # this sort of thing happens if the record is
2051             # badly messed up and there is absolutely no indication
2052             # of where to put the CDS. Perhaps we should just
2053             # place it with a random mRNA?
2054             }
2055 972         2253 foreach my $jsf (keys %container_sfh) {
2056              
2057             # add [score, parent] pairs to the %unresolved
2058             # lookup table/graph
2059 1748         8080 push(@{$unresolved{$sf}},
2060 1748   100     1933 [$idxsf{$jsf}, $container_sfh{$jsf} || 0]);
2061             }
2062             }
2063             }
2064             }
2065             else {
2066             # CONDITION:
2067             # not container type for $sf->primary_tag
2068            
2069             # CONDITION:
2070             # $sf must be a root/top node (eg gene)
2071             }
2072             }
2073              
2074 1508         1689 if (0) {
2075              
2076             # CODE CURRENTLY DISABLED
2077              
2078             # we require a 1:1 mapping between mRNAs and CDSs;
2079             # create artificial duplicates if we can't do this...
2080             if (%unresolved) {
2081             my %childh = map {$_=>1} keys %unresolved;
2082             my %parenth = map {$_->[0]=>1} map {@$_} values %unresolved;
2083             if ($self->verbose > 0) {
2084             printf STDERR "MATCHING %d CHILDREN TO %d PARENTS\n",
2085             scalar(keys %childh), scalar(keys %parenth);
2086             }
2087             # 99.99% of the time in genbank genomic record of structure type 0, we
2088             # see one CDS for every mRNA; one exception is the S Pombe
2089             # genome, which is all CDS, bar a few spurious mRNAs; we have to
2090             # filter out the spurious mRNAs in this case
2091             #
2092             # another strange case is in the mouse genome, NT_078847.1
2093             # for Pcdh13 you will notice there is 4 mRNAs and 5 CDSs.
2094             # most unusual!
2095             # I'm at a loss for a really clever thing to do here. I think the
2096             # best thing is to create duplicate features to preserve the 1:1 mapping
2097             # my $suffix_id = 1;
2098             # while (keys %childh > keys %parenth) {
2099             #
2100             # }
2101             }
2102             }
2103              
2104             # DEBUGGING CODE
2105 1508 0 50     2643 if ($self->verbose > 0 && scalar(keys %unresolved)) {
2106 0         0 printf STDERR "UNRESOLVED PAIRS:\n";
2107 0         0 foreach my $childsf (keys %unresolved) {
2108 0         0 my @poss = @{$unresolved{$childsf}};
  0         0  
2109 0         0 foreach my $p (@poss) {
2110 0         0 my $parentsf = $p->[0];
2111 0         0 $childsf = $idxsf{$childsf};
2112 0         0 my @clabels = ($childsf->get_tagset_values(qw(protein_id label product)), "?");
2113 0         0 my @plabels = ($parentsf->get_tagset_values(qw(transcript_id label product)), "?");
2114 0         0 printf STDERR
2115             (" PAIR: $clabels[0] => $plabels[0] (of %d)\n",
2116             scalar(@poss));
2117             }
2118             }
2119             } # -- end of verbose
2120              
2121             # Now we have to fully resolve the containment hierarchy; remember,
2122             # the graph %container has the fully resolved child->parent links;
2123             #
2124             # the graph %unresolved is keyed by children missing parents; we
2125             # need to put all these orphans in the %container graph
2126             #
2127             # we do this using the scores in %unresolved, with the
2128             # find_best_matches() algorithm
2129 1508         1999 my $unresolved_problem_reported = 0;
2130 1508 100       2604 if (%unresolved) {
2131 322         1119 my $new_pairs =
2132             $self->find_best_matches(\%unresolved, []);
2133 322 50       931 if (!$new_pairs) {
2134 0   0     0 my ($g) = $sfs[0]->get_tagset_values($self->group_tag || 'gene');
2135 0         0 $self->problem(2,
2136             "Could not resolve hierarchy for $g");
2137 0         0 $new_pairs = [];
2138 0         0 $unresolved_problem_reported = 1;
2139             }
2140 322         614 foreach my $pair (@$new_pairs) {
2141 966 50       1635 if ($self->verbose > 0) {
2142 0         0 printf STDERR " resolved pair @$pair\n";
2143             }
2144 966         1389 $container{$pair->[0]} = $pair->[1];
2145 966         2127 delete $unresolved{$pair->[0]};
2146             }
2147             }
2148              
2149             # CONDITION: containment hierarchy resolved
2150 1508 50       2469 if (%unresolved) {
2151 0 0       0 $self->throw("UNRESOLVED: %unresolved")
2152             unless $unresolved_problem_reported;
2153             }
2154              
2155             # make nested SeqFeature hierarchy from @containment_pairs
2156             # ie put child SeqFeatures into parent SeqFeatures
2157 1508         1870 my @top = ();
2158 1508         2043 foreach my $sf (@sfs) {
2159 5517         8766 my $container_sf = $container{$sf};
2160 5517 100       8230 if ($container_sf) {
2161             # make $sf nested inside $container_sf
2162              
2163             # first check if the container spatially contains the containee
2164 3906 50       7026 if ($container_sf->contains($sf)) {
2165             # add containee
2166 3906         7190 $container_sf->add_SeqFeature($sf);
2167             }
2168             else {
2169             # weird case - the container does NOT spatially
2170             # contain the containee;
2171             # we expand and throw a warning
2172             #
2173             # for an example of this see ZFP91-CNTF dicistronic gene
2174             # in NCBI chrom 11 build 34.3
2175 0         0 $self->problem(1,
2176             "Container feature does not spatially contain ".
2177             "subfeature. Perhaps this is a dicistronic gene? ".
2178             "I am expanding the parent feature",
2179             $container_sf,
2180             $sf);
2181 0         0 $container_sf->add_SeqFeature($sf, 'EXPAND');
2182             }
2183             }
2184             else {
2185 1611         2474 push(@top, $sf);
2186             }
2187             }
2188 1508         10823 return @top;
2189             } # -- end of unflatten_group
2190              
2191             # -------
2192             # A NOTE ON USING OBJECTS AS KEYS IN HASHES (stringified objects)
2193             #
2194             # Often we with to use seqfeatures as keys in a hashtable; because seqfeatures
2195             # in bioperl have no unique ID, we use a surrogate ID in the form of the
2196             # stringified object references - this is just what you get if you say
2197             #
2198             # print "$sf\n";
2199             #
2200             # this is guaranteed to be unique (within a particular perl execution)
2201             #
2202             # often we want to 'revive' the objects used as keys in a hash - once the
2203             # objects are used as keys, remember it is the *strings* used as keys and
2204             # not the object itself, so the object needs to be revived using another
2205             # hashtable that looks like this
2206             #
2207             # %sfidx = map { $_ => $_ } @sfs
2208             #
2209             # -------
2210              
2211              
2212             # recursively finds the best set of pairings from a matrix of possible pairings
2213             #
2214             # tries to make sure nothing is unpaired
2215             #
2216             # given a matrix of POSSIBLE matches
2217             # (matrix expressed as hash/lookup; keyed by child object; val = [parent, score]
2218             #
2219             #
2220             sub find_best_matches {
2221 1288     1288 0 1455 my $self = shift;
2222 1288         1385 my $matrix = shift;
2223 1288         1412 my $pairs = shift; # [child,parent] pairs already selected
2224              
2225 1288         1893 my $verbose = $self->verbose;
2226             #################################print "I";
2227 1288 50       2433 if ($verbose > 0) {
2228 0         0 printf STDERR "find_best_matches: (/%d)\n", scalar(@$pairs);
2229             }
2230              
2231 1288         1849 my %selected_children = map {($_->[0]=>1)} @$pairs;
  2302         4015  
2232 1288         1938 my %selected_parents = map {($_->[1]=>1)} @$pairs;
  2302         3862  
2233            
2234             # make a copy of the matrix with the portions still to be
2235             # resolved
2236 1288         1818 my %unresolved_parents = ();
2237             my %unresolved =
2238             map {
2239 1288 50       2510 if ($verbose > 0) {
  4604         6140  
2240 0         0 printf STDERR " $_ : %s\n", join("; ", map {"[@$_]"} @{$matrix->{$_}});
  0         0  
  0         0  
2241             }
2242 4604 100       5679 if ($selected_children{$_}) {
2243 2302         2532 ();
2244             }
2245             else {
2246             my @parents =
2247             grep {
2248 5130         10003 !$selected_parents{$_->[0]}
2249 2302         2352 } @{$matrix->{$_}};
  2302         3208  
2250 2302         6220 $unresolved_parents{$_} = 1 foreach @parents;
2251             # new parents
2252 2302         5166 ($_ => [@parents]);
2253             }
2254             } keys %$matrix;
2255            
2256 1288         2657 my @I = keys %unresolved;
2257              
2258 1288 100       3033 return $pairs if !scalar(keys %unresolved_parents);
2259             # NECESSARY CONDITION:
2260             # all possible parents have a child match
2261              
2262 966 50       1737 return $pairs if !scalar(@I);
2263             # NECESSARY CONDITION:
2264             # all possible children have a parent match
2265              
2266             # give those with fewest choices highest priority
2267             @I = sort {
2268             # n possible parents
2269 966         2680 scalar(@{$unresolved{$a}})
  2063         2236  
2270             <=>
2271 2063         2268 scalar(@{$unresolved{$b}}) ;
  2063         3035  
2272             } @I;
2273            
2274 966         1415 my $csf = shift @I;
2275              
2276 966         1149 my @J = @{$unresolved{$csf}}; # array of [parent, score]
  966         1491  
2277              
2278             # sort by score, highest first
2279             @J =
2280             sort {
2281 966         1385 $b->[1] <=> $a->[1]
  364         592  
2282             } @J;
2283              
2284             # select pair(s) from remaining matrix of possible pairs
2285             # by iterating through possible parents
2286              
2287 966         1098 my $successful_pairs;
2288 966         1349 foreach my $j (@J) {
2289 966         1571 my ($psf, $score) = @$j;
2290             # would selecting $csf, $psf as a pair
2291             # remove all choices from another?
2292 966         1303 my $bad = 0;
2293 966         1247 foreach my $sf (@I) {
2294 1336 50       1404 if (!grep {$_->[0] ne $psf} @{$unresolved{$sf}}) {
  2843         6346  
  1336         1767  
2295             # $psf was the only parent choice for $sf
2296 0         0 $bad = 1;
2297 0         0 last;
2298             }
2299             }
2300 966 50       1709 if (!$bad) {
2301 966         1598 my $pair = [$csf, $psf];
2302 966         1720 my $new_pairs = [@$pairs, $pair];
2303 966         2095 my $set = $self->find_best_matches($matrix, $new_pairs);
2304 966 50       1417 if ($set) {
2305 966         965 $successful_pairs = $set;
2306 966         1581 last;
2307             }
2308             }
2309             }
2310             # success
2311 966 50       3456 return $successful_pairs if $successful_pairs;
2312             # fail
2313 0         0 return 0;
2314             }
2315              
2316             # ----------------------------------------------
2317             # writes a group to stdout
2318             #
2319             # mostly for logging/debugging
2320             # ----------------------------------------------
2321             sub _write_group {
2322 0     0   0 my $self = shift;
2323 0         0 my $group = shift;
2324 0   0     0 my $group_tag = shift || 'gene';
2325              
2326 0         0 my $f = $group->[0];
2327 0         0 my $label = '?';
2328 0 0       0 if ($f->has_tag($group_tag)) {
2329 0         0 ($label) = $f->get_tag_values($group_tag);
2330             }
2331 0 0       0 if( $self->verbose > 0 ) {
2332             printf STDERR (" GROUP [%s]:%s\n",
2333             $label,
2334             join(' ',
2335 0         0 map { $_->primary_tag } @$group));
  0         0  
2336             }
2337              
2338             }
2339              
2340             sub _write_sf {
2341 0     0   0 my $self = shift;
2342 0         0 my $sf = shift;
2343 0         0 printf STDERR "TYPE:%s\n", $sf->primary_tag;
2344 0         0 return;
2345             }
2346              
2347             sub _write_sf_detail {
2348 0     0   0 my $self = shift;
2349 0         0 my $sf = shift;
2350 0         0 printf STDERR "TYPE:%s\n", $sf->primary_tag;
2351 0         0 my @locs = $sf->location->each_Location;
2352 0         0 printf STDERR " %s,%s [%s]\n", $_->start, $_->end, $_->strand foreach @locs;
2353 0         0 return;
2354             }
2355              
2356             sub _write_hier {
2357 0     0   0 my $self = shift;
2358 0 0       0 my @sfs = @{shift || []};
  0         0  
2359 0   0     0 my $indent = shift || 0;
2360 0 0       0 if( $self->verbose > 0 ) {
2361 0         0 foreach my $sf (@sfs) {
2362 0         0 my $label = '?';
2363 0 0       0 if ($sf->has_tag('product')) {
2364 0         0 ($label) = $sf->get_tag_values('product');
2365             }
2366 0         0 printf STDERR "%s%s $label\n", ' ' x $indent, $sf->primary_tag;
2367 0         0 my @sub_sfs = $sf->sub_SeqFeature;
2368 0         0 $self->_write_hier(\@sub_sfs, $indent+1);
2369             }
2370             }
2371             }
2372              
2373             # -----------------------------------------------
2374             #
2375             # returns all possible containers for an SF based
2376             # on splice site coordinates; splice site coords
2377             # must be contained
2378             # -----------------------------------------------
2379             sub _resolve_container_for_sf{
2380 962     962   1789 my ($self, $sf, @possible_container_sfs) = @_;
2381              
2382 962         1758 my @coords = $self->_get_splice_coords_for_sf($sf);
2383 962         1917 my $start = $sf->start;
2384 962         1821 my $end = $sf->end;
2385 962         3214 my $splice_uniq_str = "@coords";
2386            
2387 962         1301 my @sf_score_pairs = ();
2388             # a CDS is contained by a mRNA if the locations of the splice
2389             # coordinates are identical
2390 962         1550 foreach (@possible_container_sfs) {
2391 3606         5650 my @container_coords = $self->_get_splice_coords_for_sf($_);
2392 3606   100     19945 my $inside =
2393             !$splice_uniq_str ||
2394             index("@container_coords", $splice_uniq_str) > -1;
2395 3606 100       6286 if ($inside) {
2396             # the container cannot be smaller than the thing contained
2397 1742 100 66     3093 if ($_->start > $start || $_->end < $end) {
2398 20         27 $inside = 0;
2399             }
2400             }
2401              
2402              
2403             # SPECIAL CASE FOR /ribosomal_slippage
2404             # See: https://www.ncbi.nlm.nih.gov/collab/FT/
2405 3606 100 100     8440 if (!$inside && $sf->has_tag('ribosomal_slippage')) {
2406 22 50       42 if ($self->verbose > 0) {
2407 0         0 printf STDERR " Checking for ribosomal_slippage\n";
2408             }
2409              
2410             # TODO: rewrite this to match introns;
2411             # each slippage will be a "fake" small CDS exon
2412 22         42 my @transcript_splice_sites = @container_coords;
2413 22         51 my @cds_splice_sites = @coords;
2414             ##printf STDERR "xxTR SSs: @transcript_splice_sites :: %s\n", $_->get_tag_values('product');
2415             ##printf STDERR "xxCD SSs: @cds_splice_sites :: %s\n\n", $sf->get_tag_values('product');
2416              
2417             # find the the first splice site within the CDS
2418 22   100     72 while (scalar(@transcript_splice_sites) &&
2419             $transcript_splice_sites[0] < $cds_splice_sites[0]) {
2420 29         64 shift @transcript_splice_sites;
2421             }
2422              
2423             ##print STDERR "TR SSs: @transcript_splice_sites\n";
2424             ##print STDERR "CD SSs: @cds_splice_sites\n\n";
2425              
2426 22 100 100     58 if (!(scalar(@transcript_splice_sites)) ||
2427             $transcript_splice_sites[0] == $cds_splice_sites[0]) {
2428              
2429             # we will now try and align all splice remaining sites in the transcript and CDS;
2430             # any splice site that can't be aligned is assumed to be a ribosomal slippage
2431              
2432 16         20 my @slips = ();
2433 16         24 my $in_exon = 1;
2434 16         18 $inside = 1; # innocent until proven guilty..
2435 16         25 while (@cds_splice_sites) {
2436 36 100       61 if (!@transcript_splice_sites) {
    100          
2437              
2438             # ribosomal slippage is after the last transcript splice site
2439             # Example: (NC_00007, isoform 3 of PEG10)
2440             # mRNA join(85682..85903,92646..99007)
2441             # mRNA join(85682..85903,92646..99007)
2442             # CDS join(85899..85903,92646..93825,93825..94994)
2443              
2444             # OR: None of the splice sites align;
2445             # may be a single CDS exon with one slippage inside it.
2446             # Example: (NC_00007, isoform 4 of PEG10)
2447             # mRNA join(85637..85892,92646..99007)
2448             # CDS join(92767..93825,93825..94994)
2449            
2450             # Yes, this code is repeated below...
2451 14         17 my $p1 = shift @cds_splice_sites;
2452 14         15 my $p2 = shift @cds_splice_sites;
2453 14 50       18 if ($self->verbose > 0) {
2454 0         0 printf STDERR " Found the ribosomal_slippage: $p1..$p2\n";
2455             }
2456 14         34 push(@slips, ($p2-$p1)-1);
2457             }
2458             elsif ($cds_splice_sites[0] == $transcript_splice_sites[0]) {
2459             # splice sites align: this is not the slippage
2460 20         18 shift @cds_splice_sites;
2461 20         31 shift @transcript_splice_sites;
2462             ##print STDERR "MATCH\n";
2463             }
2464             else {
2465             # mismatch
2466 2 50       21 if ($cds_splice_sites[0] < $transcript_splice_sites[0]) {
2467             # potential slippage
2468             # v
2469             # ---TTTTTTTTTT----
2470             # ---CCCC--CCCC----
2471             # ^
2472              
2473 2         5 my $p1 = shift @cds_splice_sites;
2474 2         4 my $p2 = shift @cds_splice_sites;
2475 2 50       6 if ($self->verbose > 0) {
2476 0         0 printf STDERR " Found the ribosomal_slippage: $p1..$p2\n";
2477             }
2478 2         8 push(@slips, ($p2-$p1)-1);
2479             }
2480             else {
2481             # not a potential ribosomal slippage
2482 0         0 $inside = 0; # guilty!
2483             ##print STDERR "FAIL\n";
2484 0         0 last;
2485             }
2486             }
2487             }
2488 16 50       23 if ($inside) {
2489             # TODO: this is currently completely arbitrary. How many ribosomal slippages do we allow?
2490             # perhaps we need some mini-statistical model here....?
2491 16 50       24 if (@slips > 1) {
2492 0         0 $inside = 0;
2493             }
2494             # TODO: this is currently completely arbitrary. What is the maximum size of a ribosomal slippage?
2495             # perhaps we need some mini-statistical model here....?
2496 16 50       21 if (grep {$_ > 2} @slips) {
  16         43  
2497 0         0 $inside = 0;
2498             }
2499             }
2500             }
2501             else {
2502             # not a ribosomal_slippage, sorry
2503             }
2504             }
2505 3606 50       6607 if ($self->verbose > 0) {
2506 0         0 printf STDERR " Checking containment:[$inside] (@container_coords) IN ($splice_uniq_str)\n";
2507             }
2508 3606 100       7451 if ($inside) {
2509             # SCORE: matching (ss-scoords+2)/(n-container-ss-coords+2)
2510 1738         3391 my $score =
2511             (scalar(@coords)+2)/(scalar(@container_coords)+2);
2512 1738         4309 push(@sf_score_pairs,
2513             $_=>$score);
2514             }
2515             }
2516 962         4238 return @sf_score_pairs;
2517             }
2518              
2519             sub _get_splice_coords_for_sf {
2520 4568     4568   5142 my $self = shift;
2521 4568         4621 my $sf = shift;
2522              
2523 4568         6878 my @locs = $sf->location;
2524 4568 100       6446 if ($sf->location->isa("Bio::Location::SplitLocationI")) {
2525 4467         7090 @locs = $sf->location->each_Location;
2526             }
2527              
2528             # get an ordered list of (start, end) positions
2529              
2530             # my @coords =
2531             # map {
2532             # $_->strand > 0 ? ($_->start, $_->end) : ($_->end, $_->start)
2533             # } @locs;
2534              
2535 4568         6717 my @coords = map {($_->start, $_->end)} @locs;
  50783         67476  
2536              
2537             # remove first and last leaving only splice sites
2538 4568         5898 pop @coords;
2539 4568         5306 shift @coords;
2540 4568         18115 return @coords;
2541             }
2542              
2543             =head2 feature_from_splitloc
2544              
2545             Title : feature_from_splitloc
2546             Usage : $unflattener->feature_from_splitloc(-features=>$sfs);
2547             Function:
2548             Example :
2549             Returns :
2550             Args : see below
2551              
2552             At this time all this method does is generate exons for mRNA or other RNA features
2553              
2554             Arguments:
2555              
2556             -feature: a Bio::SeqFeatureI object (that conforms to Bio::FeatureHolderI)
2557             -seq: a Bio::SeqI object that contains Bio::SeqFeatureI objects
2558             -features: an arrayref of Bio::SeqFeatureI object
2559              
2560              
2561             =cut
2562              
2563             sub feature_from_splitloc{
2564 2035     2035 1 3231 my ($self,@args) = @_;
2565              
2566 2035         4462 my($sf, $seq, $sfs) =
2567             $self->_rearrange([qw(FEATURE
2568             SEQ
2569             FEATURES
2570             )],
2571             @args);
2572 2035 100       3183 my @sfs = (@{$sfs || []});
  2035         4137  
2573 2035 50       3131 push(@sfs, $sf) if $sf;
2574 2035 100       2691 if ($seq) {
2575 1 50       7 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
2576 1         4 @sfs = $seq->get_all_SeqFeatures;
2577             }
2578 2035         2365 my @exons = grep {$_->primary_tag eq 'exon'} @sfs;
  5856         7906  
2579 2035 50       3184 if (@exons) {
2580 0         0 $self->problem(2,
2581             "There are already exons, so I will not infer exons");
2582             }
2583              
2584             # index of features by type+location
2585 2035         2572 my %loc_h = ();
2586              
2587             # infer for every feature
2588 2035         2584 foreach my $sf (@sfs) {
2589              
2590 5856 50       11995 $sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
2591 5856 50       9501 $sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
2592              
2593 5856         7941 my $type = $sf->primary_tag;
2594 5856 100 100     18287 next unless $type eq 'mRNA' or $type =~ /RNA/;
2595              
2596             # an mRNA from genbank will have a discontinuous location,
2597             # with each sub-location being equivalent to an exon
2598 1991         2837 my @locs = $sf->location;
2599              
2600 1991 100       2746 if ($sf->location->isa("Bio::Location::SplitLocationI")) {
2601 1762         2492 @locs = $sf->location->each_Location;
2602             }
2603              
2604 1991 50       3691 if (!@locs) {
2605 3     3   45 use Data::Dumper;
  3         8  
  3         4053  
2606 0         0 print Dumper $sf;
2607 0         0 $self->throw("ASSERTION ERROR: sf has no location objects");
2608             }
2609              
2610             # make exons from locations
2611             my @subsfs =
2612             map {
2613 1991         2711 my $subsf = Bio::SeqFeature::Generic->new(-location=>$_,
  18260         39393  
2614             -primary_tag=>'exon');
2615             ## Provide seq_id to new feature:
2616 18260 50       31205 $subsf->seq_id($sf->seq_id) if $sf->seq_id;
2617 18260 50       29310 $subsf->source_tag($sf->source_tag) if $sf->source_tag;
2618             ## Transfer /locus_tag and /gene tag values to inferred
2619             ## features. TODO: Perhaps? this should not be done
2620             ## indiscriminantly but rather by virtue of the setting
2621             ## of group_tag.
2622 18260         30055 foreach my $tag (grep /gene|locus_tag/, $sf->get_all_tags) {
2623 34683         56352 my @vals = $sf->get_tag_values($tag);
2624 34683         51336 $subsf->add_tag_value($tag, @vals);
2625             }
2626              
2627 18260         35217 my $locstr = 'exon::'.$self->_locstr($subsf);
2628              
2629             # re-use feature if type and location the same
2630 18260 100       32937 if ($loc_h{$locstr}) {
2631 7266         15806 $subsf = $loc_h{$locstr};
2632             }
2633             else {
2634 10994         17369 $loc_h{$locstr} = $subsf;
2635             }
2636 18260         28637 $subsf;
2637             } @locs;
2638            
2639             # PARANOID CHECK
2640 1991         3451 $self->_check_order_is_consistent($sf->location->strand,@subsfs);
2641             #----
2642              
2643 1991         4119 $sf->location(Bio::Location::Simple->new());
2644              
2645             # we allow the exons to define the boundaries of the transcript
2646 1991         4608 $sf->add_SeqFeature($_, 'EXPAND') foreach @subsfs;
2647              
2648              
2649 1991 50       3062 if (!$sf->location->strand) {
2650             # correct weird bioperl bug in previous versions;
2651             # strand was not being set correctly
2652 0         0 $sf->location->strand($subsfs[0]->location->strand);
2653             }
2654              
2655            
2656             }
2657 2035         7854 return;
2658             }
2659              
2660             #sub merge_features_with_same_loc {
2661             # my ($self,@args) = @_;
2662              
2663             # my($sfs, $seq) =
2664             # $self->_rearrange([qw(FEATURES
2665             # SEQ
2666             # )],
2667             # @args);
2668             # my @sfs = (@$sfs);
2669             # if ($seq) {
2670             # $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
2671             # @sfs = $seq->get_all_SeqFeatures;
2672             # }
2673              
2674            
2675             # my %loc_h = ();
2676             # foreach my $sf (@sfs) {
2677             # my $type = $sf->primary_tag;
2678             # my $locstr = $self->_locstr($sf);
2679             ## $loc_h{$type.$locstr}
2680             # push(@{$exon_h{$self->_locstr($_)}}, $_) foreach @exons;
2681             # }
2682             #}
2683              
2684             =head2 infer_mRNA_from_CDS
2685              
2686             Title : infer_mRNA_from_CDS
2687             Usage :
2688             Function:
2689             Example :
2690             Returns :
2691             Args :
2692              
2693             given a "type 1" containment hierarchy
2694              
2695             gene
2696             CDS
2697             exon
2698              
2699             this will infer the uniform "type 0" containment hierarchy
2700              
2701             gene
2702             mRNA
2703             CDS
2704             exon
2705              
2706             all the children of the CDS will be moved to the mRNA
2707              
2708             a "type 2" containment hierarchy is mixed type "0" and "1" (for
2709             example, see ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/)
2710              
2711             =cut
2712              
2713             sub infer_mRNA_from_CDS{
2714 1     1 1 4 my ($self,@args) = @_;
2715              
2716 1         5 my($sf, $seq, $noinfer) =
2717             $self->_rearrange([qw(FEATURE
2718             SEQ
2719             NOINFER
2720             )],
2721             @args);
2722 1         4 my @sfs = ($sf);
2723 1 50       3 if ($seq) {
2724 1 50       6 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
2725 1         7 @sfs = $seq->get_all_SeqFeatures;
2726             }
2727              
2728 1         3 foreach my $sf (@sfs) {
2729              
2730 52 50       152 $sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
2731 52 50       98 $sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
2732 52 50       107 if ($self->verbose > 0) {
2733 0         0 printf STDERR " Checking $sf %s\n", $sf->primary_tag;
2734             }
2735            
2736 52 50       83 if ($sf->primary_tag eq 'mRNA') {
2737 0         0 $self->problem(2,
2738             "Inferring mRNAs when there are already mRNAs present");
2739             }
2740              
2741 52         118 my @cdsl = grep {$_->primary_tag eq 'CDS' } $sf->get_SeqFeatures;
  24         46  
2742 52 100       99 if (@cdsl) {
2743 24         50 my @children = grep {$_->primary_tag ne 'CDS'} $sf->get_SeqFeatures;
  24         47  
2744 24         30 my @mrnas = ();
2745              
2746              
2747 24         43 foreach my $cds (@cdsl) {
2748            
2749 24 50       50 if ($self->verbose > 0) {
2750 0         0 print " Inferring mRNA from CDS $cds\n";
2751             }
2752 24         49 $self->_check_order_is_consistent($cds->location->strand,$cds->location->each_Location);
2753            
2754 24         69 my $loc = Bio::Location::Split->new;
2755 24         54 foreach my $cdsexonloc ($cds->location->each_Location) {
2756 124         206 my $subloc =
2757             Bio::Location::Simple->new(-start=>$cdsexonloc->start,
2758             -end=>$cdsexonloc->end,
2759             -strand=>$cdsexonloc->strand);
2760 124         268 $loc->add_sub_Location($subloc);
2761             }
2762 24 50       55 if ($noinfer) {
2763 0         0 push(@mrnas, $cds);
2764             }
2765             else {
2766             # share the same location
2767 24         90 my $mrna =
2768             Bio::SeqFeature::Generic->new(-location=>$loc,
2769             -primary_tag=>'mRNA');
2770              
2771             ## Provide seq_id to new feature:
2772 24 50       61 $mrna->seq_id($cds->seq_id) if $cds->seq_id;
2773 24 50       57 $mrna->source_tag($cds->source_tag) if $cds->source_tag;
2774              
2775 24         53 $self->_check_order_is_consistent($mrna->location->strand,$mrna->location->each_Location);
2776              
2777             # make the mRNA hold the CDS; no EXPAND option,
2778             # the CDS cannot be wider than the mRNA
2779 24         69 $mrna->add_SeqFeature($cds);
2780              
2781             # mRNA steals children of CDS
2782 24         50 foreach my $subsf ($cds->get_SeqFeatures) {
2783 0         0 $mrna->add_SeqFeature($subsf);
2784             }
2785 24         62 $cds->remove_SeqFeatures;
2786 24         51 push(@mrnas, $mrna);
2787             }
2788             }
2789             # change gene/CDS to gene/mRNA
2790 24         56 $sf->remove_SeqFeatures;
2791 24         62 $sf->add_SeqFeature($_) foreach (@mrnas, @children);
2792             }
2793             }
2794 1         12 return;
2795            
2796              
2797             }
2798              
2799             =head2 remove_types
2800              
2801             Title : remove_types
2802             Usage : $unf->remove_types(-seq=>$seq, -types=>["mRNA"]);
2803             Function:
2804             Example :
2805             Returns :
2806             Args :
2807              
2808             removes features of a set type
2809              
2810             useful for pre-filtering a genbank record; eg to get rid of STSs
2811              
2812             also, there is no way to unflatten
2813             ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/ UNLESS the bogus
2814             mRNAs in these records are removed (or changed to a different type) -
2815             they just confuse things too much
2816              
2817             =cut
2818              
2819             sub remove_types{
2820 0     0 1 0 my ($self,@args) = @_;
2821              
2822 0         0 my($seq, $types) =
2823             $self->_rearrange([qw(
2824             SEQ
2825             TYPES
2826             )],
2827             @args);
2828 0 0       0 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
2829 0         0 my @sfs = $seq->get_all_SeqFeatures;
2830 0         0 my %rh = map {$_=>1} @$types;
  0         0  
2831 0         0 @sfs = grep {!$rh{$_->primary_tag}} @sfs;
  0         0  
2832 0         0 $seq->remove_SeqFeatures;
2833 0         0 $seq->add_SeqFeature($_) foreach @sfs;
2834 0         0 return;
2835             }
2836              
2837              
2838             # _check_order_is_consistent($strand,$ranges) RETURNS BOOL
2839             #
2840             # note: the value of this test is moot - there are many valid,
2841             # if unusual cases where it would flag an anomaly. for example
2842             # transpliced genes such as mod(mdg4) in dmel on AE003744, and
2843             # the following spliced gene on NC_001284:
2844             #
2845             # mRNA complement(join(20571..20717,21692..22086,190740..190761,
2846             # 140724..141939,142769..142998))
2847             # /gene="nad5"
2848             # /note="trans-splicing, RNA editing"
2849             # /db_xref="GeneID:814567"
2850             #
2851             # note how the exons are not in order
2852             # this will flag a level-3 warning, the user of this module
2853             # can ignore this and deal appropriately with the resulting
2854             # unordered exons
2855             sub _check_order_is_consistent {
2856 2039     2039   2217 my $self = shift;
2857              
2858 2039         2407 my $parent_strand = shift; # this does nothing..?
2859 2039         3400 my @ranges = @_;
2860 2039 50       3161 return unless @ranges;
2861             my $rangestr =
2862 2039         2858 join(" ",map{sprintf("[%s,%s]",$_->start,$_->end)} @ranges);
  18508         27904  
2863 2039         4849 my $strand = $ranges[0]->strand;
2864 2039         4294 for (my $i=1; $i<@ranges;$i++) {
2865 16469 50       23058 if ($ranges[$i]->strand != $strand) {
2866 0         0 $self->problem(1,"inconsistent strands. Trans-spliced gene? Range: $rangestr");
2867 0         0 return 1;
2868             # mixed ranges - autopass
2869             # some mRNAs have exons on both strands; for
2870             # example, the dmel mod(mdg4) gene which is
2871             # trans-spliced (in actual fact two mRNAs)
2872             }
2873             }
2874 2039         2544 my $pass = 1;
2875 2039         3389 for (my $i=1; $i<@ranges;$i++) {
2876 16469         19294 my $rangeP = $ranges[$i-1];
2877 16469         16002 my $range = $ranges[$i];
2878 16469 50       22011 if ($rangeP->start > $range->end) {
2879 0 0       0 if ($self->seq->is_circular) {
2880             # see for example NC_006578.gbk
2881             # we make exceptions for circular genomes here.
2882             # see Re: [Gmod-ajax] flatfile-to-json.pl error with GFF
2883             # 2010-07-26
2884             }
2885             else {
2886             # failed - but still get one more chance..
2887 0         0 $pass = 0;
2888 0         0 $self->problem(2,"Ranges not in correct order. Strange ensembl genbank entry? Range: $rangestr");
2889 0         0 last;
2890             }
2891             }
2892             }
2893            
2894 2039 50       3343 if (!$pass) {
2895             # sometimes (eg ensembl flavour genbank files)
2896             # exons on reverse strand listed in reverse order
2897             # eg join(complement(R1),...,complement(Rn))
2898             # where R1 > R2
2899 0         0 for (my $i=1; $i<@ranges;$i++) {
2900 0         0 my $rangeP = $ranges[$i-1];
2901 0         0 my $range = $ranges[$i];
2902 0 0       0 if ($rangeP->end < $range->start) {
2903 0         0 $self->problem(3,"inconsistent order. Range: $rangestr");
2904 0         0 return 0;
2905             }
2906             }
2907             }
2908 2039         3029 return 1; # pass
2909             }
2910              
2911             # PRIVATE METHOD: _locstr($sf)
2912             #
2913             # returns a location string for a feature; just the outer boundaries
2914             sub _locstr {
2915 18524     18524   20528 my $self = shift;
2916 18524         17198 my $sf = shift;
2917             return
2918 18524         26501 sprintf("%d..%d", $sf->start, $sf->end);
2919             }
2920              
2921             sub iterate_containment_tree {
2922 0     0 0   my $self = shift;
2923 0           my $feature_holder = shift;
2924 0           my $sub = shift;
2925 0           $sub->($feature_holder);
2926 0           my @sfs = $feature_holder->get_SeqFeatures;
2927 0           $self->iterate_containment_tree($_) foreach @sfs;
2928             }
2929              
2930             sub find_best_pairs {
2931 0     0 0   my $matrix = shift;
2932 0           my $size = shift;
2933 0   0       my $i = shift || 0;
2934              
2935 0           for (my $j=0; $j < $size; $j++) {
2936 0           my $score = $matrix->[$i][$j];
2937 0 0         if (!defined($score)) {
2938 0           next;
2939             }
2940            
2941             }
2942            
2943             }
2944              
2945             1;