File Coverage

blib/lib/Bio/MUST/Apps/Roles/OrgProcable.pm
Criterion Covered Total %
statement 30 92 32.6
branch 0 22 0.0
condition 0 9 0.0
subroutine 10 13 76.9
pod 0 1 0.0
total 40 137 29.2


line stmt bran cond sub pod time code
1             package Bio::MUST::Apps::Roles::OrgProcable;
2             # ABSTRACT: Attributes and methods common to OrgProcessor objects
3             $Bio::MUST::Apps::Roles::OrgProcable::VERSION = '0.213470';
4 1     1   998 use Moose::Role;
  1         3  
  1         12  
5              
6 1     1   5561 use autodie;
  1         4  
  1         10  
7 1     1   4950 use feature qw(say);
  1         4  
  1         114  
8              
9 1     1   8 use Smart::Comments -ENV;
  1         56  
  1         15  
10              
11 1     1   1501 use List::AllUtils qw(each_array);
  1         3  
  1         85  
12 1     1   759 use Number::Interval;
  1         2875  
  1         54  
13              
14 1     1   14 use Bio::MUST::Core;
  1         3  
  1         27  
15 1     1   6 use aliased 'Bio::MUST::Core::SeqId';
  1         2  
  1         11  
16 1     1   269 use aliased 'Bio::MUST::Core::SeqMask';
  1         2  
  1         5  
17              
18              
19             has 'org' => (
20             is => 'ro',
21             isa => 'Str',
22             required => 1,
23             );
24              
25              
26             has 'banks' => (
27             traits => ['Array'],
28             is => 'ro',
29             isa => 'ArrayRef[Str]',
30             required => 1,
31             handles => {
32             all_banks => 'elements',
33             },
34             );
35              
36              
37             has 'code' => (
38             is => 'ro',
39             isa => 'Str',
40             default => '1',
41             );
42              
43              
44             sub _collect_hsps { ## no critic (RequireArgUnpacking)
45 0     0     my $self = shift;
46 0           my %args = @_;
47              
48             my ($parser, $protein, $transcript, $mask_for, $hit_for, $strand_for)
49 0           = @args{ qw(parser protein transcript mask_for hit_for strand_for) };
50              
51             # Note: this method should work both for
52             # - XML HSPs within a single hit (1331)
53             # - multi-query/multi-hit tabular HSPs (42)
54 0   0       $parser //= $transcript;
55              
56 0           while (my $hsp = $parser->next_hsp) {
57              
58             # get mask key from query protein (1331) or hit hsp (42)
59 0 0         my $key = $transcript ? $protein->query_def : $hsp->hit_id;
60              
61             # update coverage on corresponding hit
62 0           push @{ $mask_for->{$key} }, Number::Interval->new(
  0            
63             min => $hsp->hit_start, max => $hsp->hit_end,
64             incmin => 1, incmax => 1,
65             );
66              
67             # record protein => best transcript pair (only for 1331)
68             # Note: we do this here rather than in leel OrgProcessor for symmetry
69 0 0 0       $hit_for->{$key} //= $transcript->id
70             if defined $hit_for;
71              
72             # record strand (only for 42)
73 0 0         $strand_for->{$key} += $hsp->hit_strand
74             if defined $strand_for;
75             }
76              
77 0           return;
78             }
79              
80              
81             sub _fetch_and_trim_hits { ## no critic (RequireArgUnpacking)
82 0     0     my $self = shift;
83 0           my %args = @_;
84              
85             my ($blastdb, $mask_for, $hit_for, $strand_for)
86 0           = @args{ qw(blastdb mask_for hit_for strand_for) };
87              
88 0           my $rp = $self->ali_proc->run_proc;
89              
90             # compute seq chunks to extract from all hits (default behavior)
91 0           my @entries;
92 0 0         if ($rp->trim_homologues eq 'on') {
93              
94             # tie might help making app completely deterministic
95 0           tie my %chunks_for, 'Tie::IxHash';
96              
97             # loop through masks to consolidate neighboring ranges into chunks
98             # Note: while ranges always deal with hit seqs...
99             # ... the key is either a protein abbr_id (1331) or a hit id (42)
100 0           while ( my ($key, $mask) = each %{$mask_for} ) {
  0            
101              
102             # first fetch seed range (corresponding to the best HSP)
103             # this will be useful for 1331 only (see below)
104 0           my $seed = $mask->[0]->min;
105              
106             # then sort ranges in ascending order
107 0           my @ranges = sort { $a->min <=> $b->min } @{$mask};
  0            
  0            
108              
109             # merge successive ranges not farther away than hit_max_shift
110 0           my $range1 = shift @ranges;
111 0           while (my $range2 = shift @ranges) {
112              
113             # too distant: store current chunk and start new one
114 0 0         if ( $range2->min - $range1->max > $rp->trim_max_shift) {
115 0           push @{ $chunks_for{$key} }, $range1;
  0            
116 0           $range1 = $range2;
117             }
118              
119             # close enough: extend current chunk
120             else {
121 0           $range1->max(
122             List::AllUtils::max( $range1->max, $range2->max )
123             ); # Note: this is needed as BLAST can return HSPs
124             } # that are completely included within better HSPs...
125             }
126              
127             # store last chunk
128 0           push @{ $chunks_for{$key} }, $range1;
  0            
129              
130             # only keep the (extended) range containing seed (only for 1331)
131             # this allows extracting the correct seq in case of paralogy
132 0 0         if (defined $hit_for) {
133             $chunks_for{$key} = [
134 0           grep { $_->contains($seed) } @{ $chunks_for{$key} }
  0            
  0            
135             ];
136             }
137             }
138              
139             # format chunks for retrieval with blastdbcmd...
140             # ... allowing for hit_extra_margin (no risk of out-of-bound max)
141             # keys are converted to hit ids on the fly (only for 1331)
142 0           while ( my ($key, $chunks) = each %chunks_for ) {
143             push @entries, map {
144 0 0         (defined $hit_for ? $hit_for->{$key} : $key) . q{ }
145             . List::AllUtils::max(1, $_->min - $rp->trim_extra_margin)
146             . q{-} . ($_->max + $rp->trim_extra_margin)
147 0           } @{$chunks};
  0            
148             }
149             }
150              
151             # otherwise simply extract whole hits
152             else {
153             @entries = map { # again on the fly (see above)
154 0 0         defined $hit_for ? $hit_for->{$_} : $_
155 0           } keys %{$mask_for};
  0            
156             }
157              
158             # fetch hits from BLAST database
159             # in any case extracted seqs are hits (for both 1331 and 42)
160             # trimming is left to blastdbcmd (should be very efficient)
161 0           my @hits = $blastdb->blastdbcmd( \@entries )->all_seqs;
162              
163             # rename hits after (long) id of corresponding protein (only for 1331)
164 0 0         if (defined $hit_for) {
165 0           my @abbr_ids = keys %{$mask_for};
  0            
166 0           my $ea = each_array(@abbr_ids, @hits);
167 0           while (my ($abbr_id, $hit) = $ea->() ) {
168 0           my $new_id = $self->long_id_for($abbr_id);
169 0           $hit->set_seq_id( SeqId->new( full_id => $new_id ) );
170             }
171             }
172              
173             # rename hits to fill in tax-reports (only for 42)
174             # this is only needed when trim_homologues is on (default behavior)
175 0 0 0       if (defined $strand_for && $rp->trim_homologues eq 'on') {
176 0           my %count_for;
177 0           my $ea = each_array(@entries, @hits);
178 0           while (my ($entry, $hit) = $ea->() ) {
179              
180             # extract details from entry
181 0           my ($hit_id, $start, $end) = $entry =~ m/(\S+)\s(\d+)\-(\d+)/xms;
182              
183             # compute hit strand (through HSPs votes)
184             # Note: strand might be inaccurate in case of multiple chunks
185 0 0         my $strand = $strand_for->{$hit_id} >= 0 ? 1 : -1;
186              
187             # embed range and strand in hit_id
188 0           my $new_id = join ':::', $hit_id, $start, $end, $strand;
189 0           $hit->set_seq_id( SeqId->new( full_id => $new_id ) );
190             }
191             }
192              
193 0           return @hits;
194             }
195              
196              
197             sub display { ## no critic (RequireArgUnpacking)
198 0     0 0   return join "\n--- ", q{}, @_
199             }
200              
201              
202 1     1   1328 no Moose::Role;
  1         3  
  1         13  
203             1;
204              
205             __END__
206              
207             =pod
208              
209             =head1 NAME
210              
211             Bio::MUST::Apps::Roles::OrgProcable - Attributes and methods common to OrgProcessor objects
212              
213             =head1 VERSION
214              
215             version 0.213470
216              
217             =head1 SYNOPSIS
218              
219             # TODO
220              
221             =head1 DESCRIPTION
222              
223             # TODO
224              
225             =head1 AUTHOR
226              
227             Denis BAURAIN <denis.baurain@uliege.be>
228              
229             =head1 COPYRIGHT AND LICENSE
230              
231             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
232              
233             This is free software; you can redistribute it and/or modify it under
234             the same terms as the Perl 5 programming language system itself.
235              
236             =cut