File Coverage

blib/lib/CracTools/Annotator.pm
Criterion Covered Total %
statement 171 221 77.3
branch 33 86 38.3
condition 3 30 10.0
subroutine 23 24 95.8
pod 12 12 100.0
total 242 373 64.8


line stmt bran cond sub pod time code
1             package CracTools::Annotator;
2              
3             {
4             $CracTools::Annotator::DIST = 'CracTools';
5             }
6             # ABSTRACT: Generic annotation base on CracTools::GFF::Query::File
7             $CracTools::Annotator::VERSION = '1.251';
8 1     1   13272 use strict;
  1         3  
  1         25  
9 1     1   4 use warnings;
  1         1  
  1         25  
10              
11 1     1   5 use Carp;
  1         2  
  1         65  
12 1     1   5 use List::Util qw[min max];
  1         2  
  1         83  
13              
14 1     1   323 use CracTools::Const;
  1         2  
  1         36  
15 1     1   326 use CracTools::GFF::Annotation;
  1         3  
  1         26  
16 1     1   362 use CracTools::Interval::Query;
  1         3  
  1         31  
17 1     1   368 use CracTools::Interval::Query::File;
  1         3  
  1         1459  
18              
19              
20             sub new {
21 1     1 1 1759 my $class = shift;
22 1         3 my $gff_file = shift;
23 1         2 my $mode = shift;
24              
25 1 50       5 $mode = 'light' if !defined $mode;
26              
27 1 50       3 if(!defined $gff_file) {
28 0         0 croak "Missing GFF file argument in CracTools::Annotator constructor";
29             }
30              
31 1         5 my $self = bless {
32             gff_file => $gff_file,
33             mode => $mode,
34             }, $class;
35              
36 1         4 $self->_init();
37              
38 1         4 return $self;
39             }
40              
41              
42             sub mode {
43 48     48 1 84 my $self = shift;
44 48         112 return $self->{mode};
45             }
46              
47              
48             sub foundAnnotation {
49 1     1 1 789 my $self = shift;
50             #my ($chr,$pos_start,$pos_end,$strand) = @_;
51 1         3 my @candidates = @{ $self->getAnnotationCandidates(@_)};
  1         3  
52 1         9 return (scalar @candidates > 0);
53             }
54              
55              
56             sub foundGene {
57 1     1 1 2 my $self = shift;
58 1         4 my ($chr,$pos_start,$pos_end,$strand) = @_;
59 1         3 my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
  1         4  
60 1         3 foreach my $candidate (@candidates) {
61 1 50       9 return 1 if defined $candidate->{gene};
62             }
63 0         0 return 0;
64             }
65              
66              
67             sub foundSameGene {
68 3     3 1 416 my $self = shift;
69 3         9 my ($chr,$pos_start1,$pos_end1,$pos_start2,$pos_end2,$strand) = @_;
70 3         5 my @candidates1 = @{ $self->getAnnotationCandidates($chr,$pos_start1,$pos_end1,$strand)};
  3         8  
71 3         4 my @candidates2 = @{ $self->getAnnotationCandidates($chr,$pos_start2,$pos_end2,$strand)};
  3         10  
72 3         6 my $found_same_gene = 0;
73 3         5 my @genes1;
74             my @genes2;
75 3         6 foreach my $candi1 (@candidates1) {
76 5 50       13 if(defined $candi1->{gene}) {
77 5         15 push @genes1,$candi1->{gene}->attribute('ID');
78             }
79             }
80 3         4 foreach my $candi2 (@candidates2) {
81 5 50       13 if(defined $candi2->{gene}) {
82 5         14 push @genes2,$candi2->{gene}->attribute('ID');
83             }
84             }
85 3         6 foreach my $gene_id (@genes1) {
86 3         7 foreach (@genes2) {
87 2 50       6 if($gene_id eq $_) {
88 2         3 $found_same_gene = 1;
89 2         4 last;
90             }
91             }
92 3 100       7 last if $found_same_gene == 1;
93             }
94 3         29 return $found_same_gene;
95             }
96              
97              
98             sub getBestAnnotationCandidate {
99 1     1 1 426 my $self = shift;
100 1         4 my ($best_candidates,$best_priority,$best_type) = $self->getBestAnnotationCandidates(@_);
101 1 50       2 if(@{$best_candidates}) {
  1         6  
102 1         5 return $best_candidates->[0],$best_priority,$best_type;
103             } else {
104 0         0 return undef,undef,undef;
105             }
106             }
107              
108              
109             sub getBestAnnotationCandidates {
110 1     1 1 2 my $self = shift;
111 1         3 my ($chr,$pos_start,$pos_end,$strand,$prioritySub,$compareSub) = @_;
112              
113 1 50 33     8 if(!defined $prioritySub && !defined $compareSub) {
114 1 50       4 $prioritySub = \&getCandidatePriorityDefault unless defined $prioritySub;
115 1 50       4 $compareSub = \&compareTwoCandidatesDefault unless defined $compareSub;
116             }
117              
118 1         2 my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
  1         4  
119 1         2 my @best_candidates;
120 1         3 my ($best_priority,$best_type);
121 1         2 foreach my $candi (@candidates) {
122 1         2 my ($priority,$type);
123 1 50       8 ($priority,$type) = $prioritySub->($pos_start,$pos_end,$candi) if defined $prioritySub;
124 1 50 33     7 if(defined $priority && $priority != -1) {
125 1 50 0     8 if(!defined $best_priority) {
    0          
    0          
126 1         2 $best_priority = $priority;
127 1         2 push @best_candidates, $candi;
128 1         3 $best_type = $type;
129             } elsif($priority < $best_priority) {
130 0         0 @best_candidates = ($candi);
131 0         0 $best_priority = $priority;
132 0         0 $best_type = $type;
133             }
134             #we should compare two candidates with equal priority to always choose the one
135             elsif (!defined $priority || $priority == $best_priority){
136 0         0 my $candidate_chosen;
137 0         0 my $found_better_candidate = 0;
138 0         0 foreach my $best_candidate (@best_candidates) {
139 0 0       0 $candidate_chosen = $compareSub->($best_candidate,$candi,$pos_start,$pos_end) if defined $compareSub;
140             # They are both equal
141 0 0       0 if (!defined $candidate_chosen) {
    0          
142             # We cannnot say if this candidate is better
143 0         0 next;
144             } elsif ($candidate_chosen == $candi) {
145             # We have found a better candidate that previously register ones
146             # we save it and remove the others
147 0         0 @best_candidates = ($candi);
148 0         0 $found_better_candidate = 1;
149 0         0 last;
150             } else {
151             # The better candidate is not "candi", so this candidates
152             # does not belong the the best_candidate array.
153             # We can stop looping
154 0         0 $found_better_candidate = 1;
155 0         0 last;
156             }
157             }
158 0 0       0 push @best_candidates, $candi if !$found_better_candidate;
159             }
160             }
161             }
162             # TODO We should not return variable in that order,
163             # it is not easy to only retrieve the best candidatse...
164 1         3 return \@best_candidates,$best_priority,$best_type;
165             }
166              
167              
168             sub getAnnotationCandidates {
169 10     10 1 13 my $self = shift;
170 10         20 my ($chr,$pos_start,$pos_end,$strand) = @_;
171             # TODO if no strand is provided we should return annotations from both strands
172              
173             # get GFF annotations that overlap the region to annotate
174 10         33 my $annotations = $self->{gff_query}->fetchByRegion($chr,$pos_start,$pos_end,$strand);
175             # get a ref of an array of hash of candidates
176 10         23 my $candidatates = $self->_constructCandidatesFromAnnotation($annotations);
177 10         29 return $candidatates;
178             }
179              
180              
181             sub getAnnotationNearestDownCandidates {
182 1     1 1 416 my $self = shift;
183 1         4 my ($chr,$pos_start,$strand) = @_;
184              
185             # get GFF annotations that overlap the pos_start to annotate
186 1         5 my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_start,$strand);
187             # get GFF annotations of nearest down intervals that not overlaped [pos_start,pos_end] pos
188 1         2 my @annotations_down;
189              
190 1         2 push @annotations_down, @{$self->{gff_query}->fetchAllNearestDown($chr,$pos_start,$strand)};
  1         6  
191              
192             # get a ref of an array of hash of candidates
193 1         4 my @annotations = (@$annotations_overlap,@annotations_down);
194 1         5 my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
195 1         4 return $candidatates;
196             }
197              
198              
199             sub getAnnotationNearestUpCandidates {
200 1     1 1 400 my $self = shift;
201 1         3 my ($chr,$pos_end,$strand) = @_;
202              
203             # get GFF annotations that overlap the pos_end to annotate
204 1         4 my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_end,$strand);
205             # get GFF annotations of nearest up intervals that not overlaped [pos_start,pos_end] pos
206 1         3 my @annotations_up;
207              
208 1         2 push @annotations_up, @{$self->{gff_query}->fetchAllNearestUp($chr,$pos_end,$strand)};
  1         5  
209              
210             # get a ref of an array of hash of candidates
211 1         3 my @annotations = (@$annotations_overlap,@annotations_up);
212 1         5 my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
213 1         4 return $candidatates;
214             }
215              
216              
217             sub getCandidatePriorityDefault {
218 2     2 1 5 my ($pos_start,$pos_end,$candidate) = @_;
219 2         5 my ($priority,$type) = (-1,'');
220 2         6 my ($mRNA,$exon) = ($candidate->{mRNA},$candidate->{exon});
221 2 50       6 if(defined $mRNA) {
222 2 50 33     6 if(defined $mRNA->attribute('type') && $mRNA->attribute('type') =~ /protein_coding/i) {
223 2 50       7 if(defined $exon) {
224 0 0 0     0 if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
225 0         0 $priority = 1;
226 0 0       0 if(defined $candidate->{three}) {
    0          
227 0         0 $type = '3PRIM_UTR';
228             } elsif(defined $candidate->{five}) {
229 0         0 $type = '5PRIM_UTR';
230             # } elsif(defined $candidate->{cds}) {
231             # $type = 'CDS';
232             } else {
233 0         0 $type = 'EXON';
234             }
235             } else {
236 0         0 $priority = 2;
237 0         0 $type = 'INXON';
238             }
239             } else {
240 2         4 $priority = 4;
241 2         4 $type = 'INTRON';
242             }
243             } else {
244 0 0       0 if(defined $exon) {
245 0 0 0     0 if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
246 0         0 $priority = 3;
247 0         0 $type = 'NON_CODING';
248             }
249             }
250             }
251             }
252 2         6 return ($priority,$type);
253             }
254              
255             sub compareTwoCandidatesDefault{
256 0     0 1 0 my ($candidate1,$candidate2,$pos_start) = @_;
257             # If both candidates are exons we try to find wich one is closer to the pos_start of the region to annotate
258 0 0 0     0 if ($candidate1->{exon} && $candidate2->{exon}) {
259 0         0 my $dist1= min(abs($candidate1->{exon}->end - $pos_start),abs($candidate1->{exon}->start - $pos_start));
260 0         0 my $dist2= min(abs($candidate2->{exon}->end - $pos_start),abs($candidate2->{exon}->start - $pos_start));
261 0 0       0 if ($dist1 > $dist2) {
    0          
262 0         0 return $candidate2;
263             } elsif ($dist1 < $dist2) {
264 0         0 return $candidate1;
265             }
266             }
267             # If we have not found a better candidate, we use the lexicographic order of the mRNA ID
268 0         0 my ($mRNA1,$mRNA2) = ($candidate1->{mRNA},$candidate2->{mRNA});
269 0 0 0     0 if(defined $mRNA1 && defined $mRNA1->attribute('ID') && defined $mRNA2 && defined $mRNA2->attribute('ID')) {
      0        
      0        
270 0 0       0 if($mRNA1->attribute('ID') lt $mRNA2->attribute('ID')) {
271 0         0 return $candidate1;
272             } else {
273 0         0 return $candidate2;
274             }
275             }
276             # If nothing has worked we return "undef"
277 0         0 return undef;
278             }
279              
280              
281             sub _init {
282 1     1   3 my $self = shift;
283 1         2 my $gff_query;
284              
285             # Create a GFF file to query exons
286 1 50       4 if($self->mode eq "fast") {
287 1         8 $gff_query = CracTools::Interval::Query->new();
288             my $gff_it = CracTools::Utils::getFileIterator(file => $self->{gff_file},
289 31     31   86 parsing_method => sub { CracTools::GFF::Annotation->new(@_) },
290 1         8 header_regex => "^#",
291             );
292 1         4 while(my $gff_annot = $gff_it->()) {
293 31         70 $gff_query->addInterval($gff_annot->chr,
294             $gff_annot->start+1,
295             $gff_annot->end+1,
296             $gff_annot->strand,
297             $gff_annot,
298             );
299             }
300             } else {
301 0         0 $gff_query = CracTools::Interval::Query::File->new(file => $self->{gff_file}, type => 'gff');
302             }
303              
304 1         4 $self->{gff_query} = $gff_query;
305             }
306              
307              
308             sub _constructCandidates {
309 66     66   118 my ($annot_id,$candidate,$annot_hash) = @_;
310              
311             # We init the "leaf_feature" value if this is the first recursion step
312 66 100       166 $candidate->{leaf_feature} = $annot_hash->{$annot_id}->feature if !defined $candidate->{leaf_feature};
313              
314 66         100 my @candidates;
315 66 50       138 if (!defined $annot_hash->{$annot_id}){
316 0         0 carp("Missing feature for $annot_id in the gff file");
317             }
318 66         179 $candidate->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$annot_id};
319 66         163 my $parents = $annot_hash->{$annot_id}->parents;
320 66 100       128 if(@$parents) {
321 42         60 foreach my $parent (@{$parents}) {
  42         75  
322            
323             #Test to avoid a deep recursion
324 47 50       183 if($parent eq $annot_id) {
    50          
    100          
325 0         0 carp("Parent could not be the candidat itself, please check your gff file for $annot_id");
326 0         0 next;
327             # If there is already a parent with this feature type we duplicated
328             # the candidate since we are branching in the annotation tree
329             }elsif(!defined $annot_hash->{$parent}) {
330 0         0 carp("Parent not found, please check your gff file for $annot_id (Parent: $parent)");
331            
332             }elsif(defined $candidate->{$annot_hash->{$parent}->feature}) {
333 10         15 my %copy_candidate = %{$candidate};
  10         38  
334 10         19 my %copy_parent_feature = %{$candidate->{parent_feature}};
  10         34  
335 10         19 $copy_candidate{parent_feature} = \%copy_parent_feature;
336             # We register in parent_feature links
337 10         28 $copy_candidate{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
338 10         16 my $copy_ref = \%copy_candidate;
339 10         19 push(@candidates,@{_constructCandidates($parent,$copy_ref,$annot_hash)});
  10         18  
340             # If not we only go up to the parent node in order to continue candidate
341             # construction
342             } else {
343             # We register in parent_feature links
344 37         91 $candidate->{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
345 37         63 push(@candidates,@{_constructCandidates($parent,$candidate,$annot_hash)});
  37         81  
346             }
347             }
348 42         112 return \@candidates;
349             } else {
350 24         71 return [$candidate];
351             }
352             }
353              
354              
355             sub _constructCandidatesFromAnnotation {
356 12     12   20 my $self = shift;
357 12         18 my $annotations = shift;
358 12         21 my %annot_hash = ();
359 12         21 my @candidates = ();
360              
361             # Construct annotation hash with annot ID as key
362 12         20 foreach my $annot_line (@{$annotations}) {
  12         23  
363 46 50       89 if($self->mode eq "fast") {
364 46         117 $annot_hash{$annot_line->attribute('ID')} = $annot_line;
365             } else {
366 0         0 my $annot = CracTools::GFF::Annotation->new($annot_line,'gff3');
367 0         0 $annot_hash{$annot->attribute('ID')} = $annot;
368             }
369             }
370              
371             # Find leaves in annotation tree
372 12         20 my %hash_leaves;
373 12         31 foreach my $annot_id (keys %annot_hash) {
374             #my @parents = $annot_hash{$annot_id}->parents;
375 46         64 foreach my $parent (@{$annot_hash{$annot_id}->parents}){
  46         108  
376 40 100       114 $hash_leaves{$parent} = 1 unless (defined $hash_leaves{$parent});
377             }
378             }
379 12         27 foreach my $annot_id (keys %annot_hash) {
380             # check if annot_id is a leaf
381 46 100       104 if (!defined $hash_leaves{$annot_id}){
382             # Get all possible path from this leaf to the root
383 19         29 push @candidates, @{_constructCandidates($annot_id,my $new_candidate,\%annot_hash)};
  19         38  
384             }
385             }
386              
387 12         36 return \@candidates;
388             }
389              
390             1;
391              
392             __END__