File Coverage

blib/lib/Bio/Polloc/GroupCriteria.pm
Criterion Covered Total %
statement 75 356 21.0
branch 21 192 10.9
condition 10 105 9.5
subroutine 16 34 47.0
pod 14 14 100.0
total 136 701 19.4


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Bio::Polloc::GroupCriteria - Rules to group loci
4              
5             =head1 DESCRIPTION
6              
7             Takes loci and returns groups of loci based on certain
8             rules. If created via .bme (.cfg) files, it is defined
9             in the C<[ RuleGroup ]> and C<[ GroupExtension ]>
10             namespaces.
11              
12             =head1 AUTHOR - Luis M. Rodriguez-R
13              
14             Email lmrodriguezr at gmail dot com
15              
16             =head1 LICENSE
17              
18             This package is licensed under the Artistic License - see LICENSE.txt
19              
20             =head1 IMPLEMENTS OR EXTENDS
21              
22             =over
23              
24             =item *
25              
26             L<Bio::Polloc::Polloc::Root>
27              
28             =back
29              
30             =cut
31              
32             package Bio::Polloc::GroupCriteria;
33 3     3   928 use strict;
  3         5  
  3         126  
34 3     3   15 use base qw(Bio::Polloc::Polloc::Root);
  3         6  
  3         839  
35 3     3   18 use List::Util qw(min max first);
  3         6  
  3         305  
36 3     3   15 use Bio::Polloc::Polloc::IO;
  3         6  
  3         63  
37 3     3   1858 use Bio::Polloc::LociGroup;
  3         8  
  3         80  
38 3     3   1984 use Bio::Polloc::GroupCriteria::operator;
  3         9  
  3         74  
39 3     3   1871 use Bio::Polloc::GroupCriteria::operator::cons;
  3         8  
  3         124  
40 3     3   3129 use Bio::Seq;
  3         175189  
  3         110  
41 3     3   60 use Error qw(:try);
  3         6  
  3         29  
42             our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version
43              
44              
45             #
46              
47             =head1 APPENDIX - Methods
48              
49             Methods provided by the package
50              
51             =cut
52              
53             =head2 new
54              
55             =over
56              
57             =item
58              
59             Generic initialization method
60              
61             =item Arguments
62              
63             =over
64              
65             =item -souce I<str>
66              
67             See L<source>
68              
69             =item -target I<str>
70              
71             See L<target>
72              
73             =item -features I<Bio::Polloc::LociGroup>
74              
75             Alias of C<-loci>
76              
77             =item -loci I<Bio::Polloc::LociGroup>
78              
79             See L<locigroup>
80              
81             =back
82              
83             =item Returns
84              
85             The C<Bio::Polloc::GroupCriteria> object
86              
87             =back
88              
89             =cut
90              
91             sub new {
92 3     3 1 10 my($caller,@args) = @_;
93 3         34 my $self = $caller->SUPER::new(@args);
94 3         19 $self->_initialize(@args);
95 3         10 return $self;
96             }
97              
98             =head2 source
99              
100             =over
101              
102             =item
103              
104             Sets/gets the type of source loci (see L<Bio::Polloc::LocusI-E<gt>family>
105              
106             =back
107              
108             =cut
109              
110             sub source {
111 4     4 1 409 my($self, $value) = @_;
112 4 100       20 $self->{'_source'} = $value if defined $value;
113 4         12 return $self->{'_source'};
114             }
115              
116             =head2 target
117              
118             =over
119              
120             =item
121              
122             Sets/gets the type of target loci (see L<Bio::Polloc::LocusI-E<gt>family>
123              
124             =back
125              
126             =cut
127              
128             sub target {
129 4     4 1 12 my($self, $value) = @_;
130 4 100       27 $self->{'_target'} = $value if defined $value;
131 4         11 return $self->{'_target'};
132             }
133              
134             =head2 locigroup
135              
136             =over
137              
138             =item
139              
140             Gets/sets the input L<Bio::Polloc::LociGroup> object containing
141             all the loci to evaluate.
142              
143             =back
144              
145             =cut
146              
147             sub locigroup {
148 3     3 1 7 my($self, $value) = @_;
149 3 50       12 if(defined $value){
150 0         0 $self->{'_locigroup'} = $value;
151 0         0 $self->{'_reorder'} = 1;
152             }
153 3         10 return $self->{'_locigroup'};
154             }
155              
156             =head2 condition
157              
158             =over
159              
160             =item
161              
162             Sets/gets the conditions set to evaluate.
163              
164             =back
165              
166             =cut
167              
168             sub condition {
169 4     4 1 9 my($self, $value) = @_;
170 4 100       38 if(defined $value){
171 3 50 33     51 $self->throw('Unexpected type of condition', $value)
172             unless UNIVERSAL::can($value, 'isa') and $value->isa('Bio::Polloc::GroupCriteria::operator');
173 3         9 $self->{'_condition'} = $value;
174             }
175 4         16 return $self->{'_condition'};
176             }
177              
178             =head2 evaluate
179              
180             =over
181              
182             =item
183              
184             Compares two loci based on the defined conditions
185              
186             =item Arguments
187              
188             =over
189              
190             =item *
191              
192             The first locus (a L<Bio::Polloc::LocusI> object)
193              
194             =item *
195              
196             The second locus (a L<Bio::Polloc::LocusI> object)
197              
198             =back
199              
200             =item Returns
201              
202             Boolean
203              
204             =item Throws
205              
206             L<Bio::Polloc::Polloc::Error> if unexpected input or undefined condition, source or
207             target
208              
209             =back
210              
211             =cut
212              
213             sub evaluate {
214 0     0 1 0 my($self, $feat1, $feat2) = @_;
215             # Test the input
216 0 0       0 $feat1->isa('Bio::Polloc::LocusI') or
217             $self->throw("First feature of illegal class", $feat1);
218            
219 0 0       0 $feat2->isa('Bio::Polloc::LocusI') or
220             $self->throw("Second feature of illegal class", $feat2);
221            
222 0 0       0 defined $self->condition or
223             $self->throw("Undefined condition, impossible to group");
224            
225 0 0       0 $self->condition->type eq 'bool' or
226             $self->throw("Unexpected type of condition", $self->condition);
227            
228 0 0       0 $self->throw("Undefined source features") unless defined $self->source;
229 0 0       0 $self->throw("Undefined target features") unless defined $self->target;
230            
231             # Run
232 0 0       0 return 0 unless $feat1->family eq $self->source;
233 0 0       0 return 0 unless $feat2->family eq $self->target;
234 0         0 $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT1'} = $feat1;
235 0         0 $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT2'} = $feat2;
236 0         0 my $o = $self->condition->operate;
237 0         0 $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT1'} = undef;
238 0         0 $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT2'} = undef;
239 0         0 return $o;
240             }
241              
242             =head2 get_loci
243              
244             =over
245              
246             =item
247              
248             Gets the stored loci
249              
250             =item Note
251              
252             The stored loci can also be obtained with C<$object-E<gt>locigroup-E<gt>loci>,
253             but this function ensures a consistent order in the loci for its evaluation.
254              
255             =back
256              
257             =cut
258              
259             sub get_loci {
260 0     0 1 0 my($self,@args) = @_;
261 0 0 0     0 $self->{'_features'} = $self->locigroup->loci
262             if defined $self->locigroup and not defined $self->{'_features'};
263 0 0       0 $self->{'_features'} = [] unless defined $self->{'_features'};
264 0 0 0     0 if($self->{'_reorder'} && $self->source ne $self->target){
265 0         0 my @src = ();
266 0         0 my @tgt = ();
267 0         0 my @oth = ();
268 0         0 for my $ft (@{$self->locigroup->loci}){
  0         0  
269 0 0       0 if($ft->family eq $self->source){ push (@src, $ft) }
  0 0       0  
270 0         0 elsif($ft->family eq $self->target){ push (@tgt, $ft) }
271 0         0 else{ push @oth, $ft }
272             }
273 0         0 $self->{'_features'} = [];
274 0         0 push @{$self->{'_features'}}, @tgt, @src, @oth;
  0         0  
275 0         0 $self->{'_reorder'} = 0;
276             }
277 0         0 return $self->{'_features'};
278             }
279              
280              
281             =head2 get_locus
282              
283             =over
284              
285             =item
286              
287             Get the locus with the specified index.
288              
289             =item Arguments
290              
291             The index (int, mandatory).
292              
293             =item Returns
294              
295             A L<Bio::Polloc::LocusI> object or undef.
296              
297             =item Note
298              
299             This is a lazzy method, and should be used B<ONLY> after C<get_loci()>
300             were called at least once. Otherwise, the order might not be the expected,
301             and weird results would appear.
302              
303             =back
304              
305             =cut
306              
307             sub get_locus {
308 0     0 1 0 my($self, $index) = @_;
309 0 0       0 return unless defined $index;
310 0 0       0 return unless defined $self->{'_features'};
311 0         0 return $self->{'_features'}->[$index];
312             }
313              
314             =head2 extension
315              
316             =over
317              
318             =item
319              
320             Sets the conditions for group extensions.
321              
322             =item Arguments
323              
324             Array, hash or string with C<-key =E<gt> value> pairs. Supported values are:
325              
326             =over
327              
328             =item -function I<str>
329              
330             =over
331              
332             =item C<context>
333              
334             Searches the flanking regions in the target sequence.
335              
336             =back
337              
338             =item -upstream I<int>
339              
340             Extension in number of residues upstream the feature.
341              
342             =item -downstream I<int>
343              
344             Extension in number of residues downstream the feature.
345              
346             =item -detectstrand I<bool (int)>
347              
348             Should I detect the proper strand? Otherwise, the stored strand
349             is trusted. This is useful for non-directed features like repeats,
350             which context is actually directed.
351              
352             =item -alldetected I<bool (int)>
353              
354             Include all detected features (even these overlapping with input features).
355              
356             =item -feature I<bool (int)>
357              
358             Should I include the feature region in the search? 0 by default.
359              
360             =item -lensd I<float>
361              
362             Number of Standar Deviations (SD) tolerated as half of the range of lengths
363             for a feature. The average (Avg) and the standard deviation of the length
364             are calculated based on all the stored features, and the Avg+(SD*lensd) is
365             considered as the largest possible new feature. No minimum length constraint
366             is given, unless explicitly set with -minlen. This argument is ignored if
367             C<-maxlen> is explicitly set. Default is 1.5.
368              
369             =item -maxlen I<int>
370              
371             Maximum length of a new feature in number of residues. If zero (0) evaluates
372             C<-lensd> instead. Default is 0.
373              
374             =item -minlen I<int>
375              
376             Minimum length of a new feature in number of residues. Default is 0.
377              
378             =item -similarity I<float>
379              
380             Minimum fraction of similarity to include a found region. 0.8 by default.
381              
382             =item -oneside I<bool (int)>
383              
384             Should I consider features with only one of the sides? Takes effect only if
385             both -upstream and -downstream are defined. 0 by default.
386              
387             =item -algorithm I<str>
388              
389             =over
390              
391             =item C<blast>
392              
393             Use BLAST to search (after multiple alignment and consensus calculation of
394             queries). Default algorithm.
395              
396             =item C<hmmer>
397              
398             Use HMMer to search (after multiple alignment and C<hmmbuild> of query
399             sequences).
400              
401             =back
402              
403             =item -score I<int>
404              
405             Minimum score for either algorithms B<blast> and B<hmmer>. 20 by default.
406              
407             =item -consensusperc I<float>
408              
409             Minimum percentage a residue must appear in order to include it in the
410             consensus used as query. 60 by default. Only if -algorithm blast.
411              
412             =item -e I<float>
413              
414             If C<-algorithm> B<blast>, maximum e-value. 0.1 by default.
415              
416             =item -p I<str>
417              
418             If C<-algorithm> B<blast>, program used (C<[t]blast[npx]>). B<blastn> by
419             default.
420              
421             =back
422              
423             =item Throws
424              
425             L<Bio::Polloc::Polloc::Error> if unexpected input,
426              
427             =back
428              
429             =cut
430              
431             sub extension {
432 20     20 1 46 my ($self, @args) = @_;
433 20 100       125 return $self->{'_groupextension'} unless $#args>=0;
434 3 50       68 @args = split /\s+/, $args[0] if $#args == 0;
435 3 50       19 $self->throw("Odd number of elements, impossible to build key-value pairs", \@args)
436             unless $#args%2;
437 3         35 my %f = @args;
438 3   50     15 $f{'-function'} ||= 'context';
439 3   50     11 $f{'-algorithm'} ||= 'blast';
440 3   50     22 ($f{'-feature'} ||= 0) += 0;
441 3   50     12 ($f{'-downstream'} ||= 0) += 0;
442 3   50     34 ($f{'-upstream'} ||= 0) += 0;
443 3   50     18 ($f{'-detectstrand'} ||= 0) += 0;
444 3   50     19 ($f{'-alldetected'} ||= 0) += 0;
445 3   50     31 ($f{'-oneside'} ||= 0) += 0;
446 3 50       16 $f{'-lensd'} = defined $f{'-lensd'} ? $f{'-lensd'}+0 : 1.5;
447 3 50       18 $f{'-maxlen'} = defined $f{'-maxlen'} ? $f{'-maxlen'}+0 : 0;
448 3 50       22 $f{'-minlen'} = defined $f{'-minlen'} ? $f{'-minlen'}+0 : 0;
449 3 50       21 $f{'-similarity'} = defined $f{'-similarity'} ? $f{'-similarity'}+0 : 0.8;
450 3 50       15 $f{'-score'} = defined $f{'-score'} ? $f{'-score'}+0 : 20;
451 3 50       14 $f{'-consensusperc'} = defined $f{'-consensusperc'} ? $f{'-consensusperc'}+0 : 60;
452 3 50       24 $f{'-e'} = defined $f{'-e'} ? $f{'-e'}+0 : 0.1;
453 3 50       20 $f{'-p'} = 'blastn' unless defined $f{'-p'};
454 3         9 $self->{'_groupextension'} = \%f;
455 3         24 return $self->{'_groupextension'};
456             }
457              
458              
459             =head2 extend
460              
461             =over
462              
463             =item
464              
465             Extends a group based on the arguments provided by L<Bio::Polloc::GroupCriteria->extension>.
466              
467             =item Arguments
468              
469             =over
470              
471             =item -loci I<Bio::Polloc::LociGroup>
472              
473             The L<Bio::Polloc::LociGroup> containing the loci in the group to extend.
474              
475             =back
476              
477             =item Returns
478              
479             A L<Bio::Polloc::LociGroup> object containing the updated group, i.e. the
480             original group PLUS the extended features.
481              
482             =item Throws
483              
484             L<Bio::Polloc::Polloc::Error> if unexpected input or weird extension definition.
485              
486             =back
487              
488             =cut
489              
490             sub extend {
491 0     0 1 0 my ($self, @args) = @_;
492 0         0 my ($loci) = $self->_rearrange([qw(LOCI)], @args);
493             # Check input
494 0         0 my $ext = $self->{'_groupextension'};
495 0 0       0 return unless defined $ext;
496 0 0 0     0 $self->throw("The loci are not into an object", $loci)
      0        
497             unless defined $loci and ref($loci) and UNIVERSAL::can($loci,'isa');
498 0 0       0 $self->throw("Unexpected type for the group of loci", $loci)
499             unless $loci->isa('Bio::Polloc::LociGroup');
500 0 0       0 return unless $#{$loci->loci}>=0;
  0         0  
501             # Set ID base
502 0         0 my $group_id = $self->_next_group_id;
503             # Run
504 0         0 my @new = ();
505 0         0 $self->debug("--- Extending group (based on ".($#{$loci->loci}+1)." loci) ---");
  0         0  
506 0 0       0 if(lc($ext->{'-function'}) eq 'context'){
507 0         0 my ($up_pos, $down_pos, $in_pos);
508 0 0 0     0 $loci->fix_strands(max($ext->{'-downstream'}, $ext->{'-upstream'}))
      0        
509             if $ext->{'-detectstrand'} and ($ext->{'-upstream'} or $ext->{'-downstream'});
510             # Search
511 0 0       0 my $eval_feature = $ext->{'-feature'} ? 1 : 0;
512 0 0       0 $up_pos = $self->_search_aln_seqs($loci->align_context(-1, $ext->{'-upstream'}, 0))
513             if $ext->{'-upstream'};
514 0 0       0 $down_pos = $self->_search_aln_seqs($loci->align_context(1, $ext->{'-downstream'},0))
515             if $ext->{'-downstream'};
516 0 0       0 $in_pos = $self->_search_aln_seqs($loci->align_context(0, 0, 0)) if $ext->{'-feature'};
517             # Determine maximum size
518 0         0 my $max_len = $ext->{'-maxlen'};
519 0 0       0 unless($max_len){
520 0         0 my($len_avg, $len_sd) = $self->locigroup->avg_length;
521 0         0 $self->warn("Building size constrains based in one sequence only")
522 0 0       0 if $#{$self->locigroup->loci}<1;
523 0         0 $max_len = $len_avg + $len_sd*$ext->{'-lensd'};
524             }
525 0         0 $self->debug("Comparing results with maximum feature's length of $max_len");
526             # Evaluate/pair
527 0 0 0     0 if($ext->{'-upstream'} and $ext->{'-downstream'}){
    0          
528             # Detect border pairs
529 0         0 push @new, $self->_detect_border_pairs($up_pos, $down_pos, $max_len);
530 0 0       0 if($eval_feature){
531 0         0 $self->debug("Filtering results with in-feature sequences");
532 0         0 my @prefilter = @new;
533 0         0 @new = ();
534 0         0 BORDER: for my $br (@prefilter){
535             push @new,
536 0 0 0 0   0 first { $br->[0] ne $_->[0] # != ctg
      0        
537             and $br->[3] == $_->[3] # upstream's strand
538             and $br->[3]*$_->[1] < $br->[3]*$br->[2] # no overlap
539             and $br->[3]*$_->[2] > $br->[3]*$br->[1] # no overlap
540 0         0 } @$in_pos;
541             #WITHIN: for my $in (@$in_pos){
542             # $self->throw("Unexpected array structure (in-feature)", $in)
543             # unless defined $in->[0] and defined $in->[4];
544             # next WITHIN if $br->[0] eq $in->[0] # == ctg
545             # or $br->[3] != $in->[3] # upstream's strand
546             # or $br->[3]*$in->[1] >= $br->[3]*$br->[2] # overlap
547             # or $br->[3]*$in->[2] <= $br->[3]*$br->[1]; # overlap
548             # # Good!
549             # $br->[4] = (2*$br->[4] + $in->[4])/3;
550             # push @new, $br;
551             # next BORDER;
552             # }
553             }
554             }
555 0         0 }elsif($eval_feature){ @new = @$in_pos }else{
556 0         0 $self->throw('Nothing to evaluate! '.
557             'I need either the two borders or the middle sequence (or both)');
558             }
559             }else{
560 0         0 $self->throw('Unsupported function for group extension', $ext->{'-function'});
561             }
562              
563             # And finally, create the detected features, discarding loci overlapping input loci
564 0         0 $self->debug("Found ".($#new+1)." loci, creating extend features");
565 0         0 my $comments = "Based on group $group_id: ";
566 0 0       0 for my $locus (@{$loci->loci}) { $comments.= $locus->id . ", " if defined $locus->id }
  0         0  
  0         0  
567 0         0 $comments = substr $comments, 0, -2;
568            
569 0         0 my $newloci = Bio::Polloc::LociGroup->new();
570 0 0       0 $newloci->name($loci->name."-ext") if defined $loci->name;
571 0 0       0 $newloci->featurename($loci->featurename) if defined $loci->featurename;
572 0 0       0 $newloci->genomes($loci->genomes) if defined $loci->genomes;
573 0         0 NEW: for my $itemk (0 .. $#new){
574 0         0 my $item = $new[$itemk];
575 0         0 ($item->[1], $item->[2]) = (min($item->[1], $item->[2]), max($item->[1], $item->[2]));
576 0 0       0 unless($ext->{'-alldetected'}){
577 0         0 OLD: for my $locus (@{$loci->loci}){
  0         0  
578             # Not new! :
579 0 0 0     0 next NEW if $item->[1]<$locus->to and $item->[2]>$locus->from;
580             }
581             }
582 0         0 my $seq;
583 0         0 my($Gk, $acc) = split /:/, $item->[0], 2;
584 0         0 $Gk+=0;
585 0         0 for my $ck (0 .. $#{$self->genomes->[$Gk]->get_sequences}){
  0         0  
586 0         0 my $id = $self->genomes->[$Gk]->get_sequences->[$ck]->display_id;
587 0 0 0     0 if($id eq $acc or $id =~ m/\|$acc(\.\d+)?(\||\s*$)/){
588 0         0 $seq = [$Gk,$ck]; last;
  0         0  
589             }
590             }
591 0 0       0 $self->warn('I can not find the sequence', $acc) unless defined $seq;
592 0 0       0 $self->throw('Undefined genome-contig pair', $acc, 'UnexpectedException')
593             unless defined $self->genomes->[$seq->[0]]->get_sequences->[$seq->[1]];
594 0         0 my $id = $self->source . "-ext:".($Gk+1).".$group_id.".($#{$newloci->loci}+2);
  0         0  
595 0 0       0 $newloci->add_loci(Bio::Polloc::LocusI->new(
    0          
596             -type=>'extend',
597             -from=>$item->[1],
598             -to=>$item->[2],
599             -id=>(defined $id ? $id : ''),
600             -strand=>($item->[3]==-1 ? '+' : '-'),
601             # Gk:Genome ck:Contig
602             -seq=>$self->genomes->[$seq->[0]]->get_sequences->[$seq->[1]],
603             -score=>$item->[4],
604             -basefeature=>$loci->loci->[0],
605             -comments=>$comments,
606             -genome=>$self->genomes->[$Gk]
607             ));
608             }
609 0         0 return $newloci;
610             }
611              
612             =head2 build_bin
613              
614             =over
615              
616             =item
617              
618             Compares all the included loci and returns the identity matrix
619              
620             =item Arguments
621              
622             =over
623              
624             =item -complete I<bool (int)>
625              
626             If true, calculates the complete matrix instead of only the bottom-left triangle.
627              
628             =back
629              
630             =item Returns
631              
632             A reference to a boolean 2-dimensional array (only left-down triangle)
633              
634             =item Note
635              
636             B<WARNING!> The order of the output is not allways the same of the input.
637             Please use C<get_loci()> instead, as source features B<MUST> be after
638             target features in the array. Otherwise, it is not possible to have the
639             full picture without building the full matrix (instead of half).
640              
641             =back
642              
643             =cut
644              
645             sub build_bin {
646 0     0 1 0 my($self,@args) = @_;
647 0         0 my $bin = [];
648 0         0 my($complete) = $self->_rearrange([qw(COMPLETE)], @args);
649 0         0 for my $i (0 .. $#{$self->get_loci}){
  0         0  
650 0         0 $bin->[$i] = [];
651 0 0       0 my $lim = $complete ? $#{$self->get_loci} : $i;
  0         0  
652 0         0 for my $j (0 .. $lim){
653 0         0 $bin->[$i]->[$j] = $self->evaluate(
654             $self->get_loci->[$i],
655             $self->get_loci->[$j]
656             );
657             }
658             }
659 0         0 return $bin;
660             }
661              
662              
663             =head2 bin_build_groups
664              
665             =over
666              
667             =item
668              
669             Builds groups of loci based on a binary matrix
670              
671             =item Arguments
672              
673             A matrix as returned by L<Bio::Polloc::GroupCriteria-E<gt>build_bin>
674              
675             =item Returns
676              
677             A 2-D arrayref.
678              
679             =item Note
680              
681             This method is intended to build groups providing information on all-vs-all
682             comparisons. If you do not need this information, use the much more
683             efficient L<Bio::Polloc::GroupCriteria-E<gt>build_groups> method, that relies on
684             transitive property of groups to avoid unnecessary comparisons. Please note
685             that this function also relies on transitivity, but gives you the option to
686             examine all the paired comparisons and even write your own grouping function.
687              
688             =back
689              
690             =cut
691              
692             sub bin_build_groups {
693 0     0 1 0 my($self,$bin) = @_;
694 0         0 my $groups = [];
695 0         0 FEAT: for my $f (0 .. $#{$self->get_loci}){
  0         0  
696 0         0 GROUP: for my $g (0 .. $#{$groups}){
  0         0  
697 0         0 MEMBER: for my $m (0 .. $#{$groups->[$g]}){
  0         0  
698 0 0       0 if($bin->[$f]->[$groups->[$g]->[$m]] ){
699 0         0 push @{$groups->[$g]}, $f;
  0         0  
700 0         0 next FEAT;
701             }
702             }
703             }
704 0         0 push @{$groups}, [$f]; # If not found in previous groups
  0         0  
705             }
706             # Change indexes by Bio::Polloc::LocusI objects
707 0         0 return $self->_feat_index2obj($groups);
708             }
709              
710              
711             =head2 build_groups
712              
713             =over
714              
715             =item
716              
717             This is the main method, creates groups of loci.
718              
719             =item Arguments
720              
721             =over
722              
723             =item -cpus I<int>
724              
725             If defined, attempts to distribute the work among the specified number of
726             cores. B<Warning>: This parameter is experimental, and relies on
727             C<Parallel::ForkManager>. It can be used in production with certain
728             confidence, but it is highly probable to B<NOT> work in parallel (to avoid
729             errors, this method ignores the command at ANY possible error).
730              
731             B<Unimplemented>: This argument is currently ignored. Some algorithmic
732             considerations must be addressed before using it. B<TODO>.
733              
734             =item -advance I<coderef>
735              
736             A reference to a function to call at every new pair. The function is called
737             with three arguments, the first is the index of the first locus, the second
738             is the index of the second locus and the third is the total number of loci.
739             Note that this function is called B<BEFORE> running the comparison.
740              
741             =back
742              
743             =item Returns
744              
745             An arrayref of L<Bio::Polloc::LociGroup> objects, each containing one consistent
746             group of loci.
747              
748             =item Note
749              
750             This method is faster than combining C<build_bin()> and C<build_groups_bin()>,
751             and it should be used whenever transitivity can be freely assumed and you do
752             not need the all-vs-all matrix for further evaluation (for example, manual
753             inspection).
754              
755             =back
756              
757             =cut
758              
759             sub build_groups {
760 0     0 1 0 my($self,@args) = @_;
761 0         0 my ($cpus, $advance) = $self->_rearrange([qw(CPUS ADVANCE)], @args);
762            
763 0         0 my $groups = [[0]]; #<- this is bcs first feature is ignored in FEAT1
764 0         0 my $loci = $self->get_loci;
765 0         0 my $l_max = $#$loci;
766 0         0 $self->debug("Building groups for ".($l_max+1)." loci");
767 0 0       0 $self->warn('Nothing to do, any stored loci') unless $l_max>=0;
768 0         0 FEAT1: for my $i (1 .. $l_max){
769 0         0 FEAT2: for my $j (0 .. $i-1){
770 0         0 $self->debug("Evaluate [$i vs $j]");
771 0 0       0 &$advance($i, $j, $l_max+1) if defined $advance;
772 0 0       0 next FEAT2 unless $self->evaluate(
773             $loci->[$i],
774             $loci->[$j]
775             );
776             # --> If I am here, FEAT1 ~ FEAT2 <--
777 0         0 GROUP: for my $g (0 .. $#{$groups}){
  0         0  
778 0         0 MEMBER: for my $m (0 .. $#{$groups->[$g]}){
  0         0  
779 0 0       0 if($j == $groups->[$g]->[$m]){
780             # I.e., if FEAT2 is member of GROUP
781 0         0 push @{$groups->[$g]}, $i;
  0         0  
782 0         0 next FEAT1; #<- This is why the current method is way more efficient
783             }
784             }#MEMBER
785             }#GROUP
786             }#FEAT2
787             # --> If I am here, FEAT1 belongs to a new group <--
788 0         0 push @{$groups}, [$i];
  0         0  
789             }#FEAT1
790 0         0 my $out = [];
791 0         0 for my $gk (0 .. $#$groups){
792 0         0 my $group = Bio::Polloc::LociGroup->new(-name=>sprintf("%04s", $gk+1)); #+++ ToDo: Is ID ok?
793 0         0 $group->genomes($self->genomes);
794 0         0 for my $lk (0 .. $#{$groups->[$gk]}){
  0         0  
795 0         0 my $locus = $loci->[ $groups->[$gk]->[$lk] ];
796             # Paranoid bugbuster:
797 0 0       0 $self->throw('Impossible to gather the locus back:'.
798             ' $groups->['.$gk.']->['.$lk.']: '.$groups->[$gk]->[$lk],
799             $loci, 'Bio::Polloc::Polloc::UnexpectedException')
800             unless defined $locus;
801 0         0 $group->add_loci($locus);
802             }
803 0         0 push @$out, $group;
804             }
805 0         0 return $out;
806             }
807              
808             =head2 genomes
809              
810             =over
811              
812             =item
813              
814             Gets the genomes of the base group of loci. This function is similar
815             to calling C<locigroup()-E<gt>genomes()>, but is read-only.
816              
817             =back
818              
819             =cut
820              
821             sub genomes {
822 0     0 1 0 my ($self, $value) = @_;
823 0 0       0 $self->warn("Attempting to set the genomes from a read-only function")
824             if defined $value;
825 0 0       0 return unless defined $self->locigroup;
826 0         0 return $self->locigroup->genomes;
827             }
828              
829             =head1 INTERNAL METHODS
830              
831             Methods intended to be used only within the scope of Bio::Polloc::*
832              
833             =head2 _detect_border_pairs
834              
835             =cut
836              
837             sub _detect_border_pairs {
838 0     0   0 my($self, $up_pos, $down_pos, $max_len) = @_;
839 0 0 0     0 return unless $up_pos and $down_pos;
840 0         0 my $ext = $self->{'_groupextension'};
841 0         0 my @out = ();
842 0         0 US: for my $us (@$up_pos){
843 0 0 0     0 $self->throw("Unexpected array structure (upstream): ", $us)
844             unless defined $us->[0] and defined $us->[4];
845 0         0 $self->debug(" US: ", join(':', @$us));
846 0         0 my $found;
847 0         0 my $pair = [];
848 0         0 DS: for my $ds (@$down_pos){
849 0 0 0     0 $self->throw("Unexpected array structure (downstream): ", $ds)
850             unless defined $ds->[0] and defined $ds->[4];
851 0         0 $self->debug(" DS: ", join(':', @$ds));
852 0 0 0     0 next DS if $us->[0] ne $ds->[0] # != ctg
      0        
      0        
      0        
      0        
853             or $us->[3] == $ds->[3] # == strand
854             or abs($ds->[2]-$us->[2]) > $max_len # too large
855             or abs($ds->[2]-$us->[2]) < $ext->{'-minlen'} # too short
856             or defined $found and abs($us->[2]-$ds->[2]) > $found; # prev better
857             # Good!
858 0         0 $self->debug("Saving pair ".$us->[1]."..".$us->[2]."/".$ds->[1]."..".$ds->[2]);
859 0         0 $found = abs($us->[2]-$ds->[2]);
860 0         0 $pair = [$us->[0], $us->[2], $ds->[2], $us->[3], ($us->[4]+$ds->[4])/2];
861             }
862 0 0       0 push @out, $pair if $#$pair>1;
863             }
864 0         0 return @out;
865             }
866              
867             =head2 _next_group_id
868              
869             =over
870              
871             =item
872              
873             Returns an incremental ID that attempts to identify the group used as basis
874             of extension. Please note that this method DOES NOT check if the group's ID
875             is the right one, and it is basically intended to keep track of how many
876             times the C<extend> function has been called.
877              
878             =back
879              
880             =cut
881              
882             sub _next_group_id {
883 0     0   0 my $self = shift;
884 0   0     0 $self->{'_next_group_id'}||= 0;
885 0         0 return ++$self->{'_next_group_id'};
886             }
887              
888             =head2 _build_subseq
889              
890             =over
891              
892             =item Arguments
893              
894             All the following arguments are mandatory and must be passed in that order.
895             The strand will be determined by the relative position of from/to:
896              
897             =over
898              
899             =item *
900              
901             The sequence (L<Bio::Seq> object).
902              
903             =item *
904              
905             The B<from> position (I<int>).
906              
907             =item *
908              
909             The B<to> position (I<int>).
910              
911             =back
912              
913             =item Returns
914              
915             A L<Bio::Seq> object.
916              
917             =item Comments
918              
919             This method should be located at a higher hierarchy module (Root?).
920              
921             This method is static.
922              
923             =back
924              
925             =cut
926              
927             sub _build_subseq {
928 0     0   0 my($self, $seq, $from, $to) = @_;
929 0 0 0     0 $self->throw("No main sequence", $seq)
      0        
930             unless defined $seq and UNIVERSAL::can($seq, 'isa') and $seq->isa('Bio::Seq');
931 0         0 my ($start, $end) = (min($to, $from), max($to, $from));
932 0         0 $start = max($start, 1);
933 0         0 $end = min($end, $seq->length);
934 0 0       0 return unless $start != $end;
935 0         0 my $seqstr = $seq->subseq($start, $end);
936 0         0 my $cleanstr = $seqstr;
937 0         0 $cleanstr =~ s/^N*//;
938 0         0 $cleanstr =~ s/N*$//;
939 0 0       0 return unless length $cleanstr > 0; # See issue BME#5
940 0         0 my $subseq = Bio::Seq->new(-seq=>$seqstr);
941 0 0       0 $subseq = $subseq->revcom if $from < $to;
942 0         0 return $subseq;
943             }
944              
945             =head2 _search_aln_seqs
946              
947             =over
948              
949             =item
950              
951             Uses an alignment to search in the sequences of the collection of genomes
952              
953             =item Arguments
954              
955             A Bio::SimpleAlign object
956              
957             =item Returns
958              
959             A 2D arrayref, where first key is an incremental and second key preserves the
960             orrder in the structure: C<["genome-key:acc", from, to, strand, score]>
961              
962             =back
963              
964             =cut
965              
966             sub _search_aln_seqs {
967 0     0   0 my ($self, $aln) = @_;
968 0         0 my $ext = $self->{'_groupextension'};
969 0 0       0 return unless defined $ext;
970 0 0       0 return unless defined $self->genomes;
971 0         0 my $pos = [];
972 0 0       0 return $pos unless defined $aln; #<- For example, if zero sequences. To gracefully exit.
973 0         0 my $alg = lc $ext->{'-algorithm'};
974 0 0 0     0 if($alg eq 'blast' or $alg eq 'hmmer'){ # ------------------------------- BLAST & HMMer
975             # -------------------------------------------------------------------- Setup DB
976 0 0       0 unless(defined $self->{'_seqsdb'}){
977 0         0 $self->{'_seqsdb'} = Bio::Polloc::Polloc::IO->tempdir();
978 0         0 $self->debug("Creating DB at ".$self->{'_seqsdb'});
979 0         0 for my $genomek (0 .. $#{$self->genomes}){
  0         0  
980 0         0 my $file = $self->{'_seqsdb'}."/$genomek";
981 0         0 my $fasta = Bio::SeqIO->new(-file=>">$file", -format=>'Fasta');
982 0         0 for my $ctg (@{$self->genomes->[$genomek]->get_sequences}){ $fasta->write_seq($ctg) }
  0         0  
  0         0  
983             # BLAST requires a formatdb (not only the fasta)
984 0 0       0 if($alg eq 'blast'){
985 0         0 my $run = Bio::Polloc::Polloc::IO->new(-file=>"formatdb -p F -i '$file' 2>&1 |");
986 0         0 while($run->_readline) {} # just run ;o)
987 0         0 $run->close;
988             }
989             }
990             }
991             # -------------------------------------------------------------------- Predefine vars
992 0         0 my $factory;
993             my $query;
994 0 0       0 if($alg eq 'blast'){
    0          
995 0         0 $self->_load_module('Bio::Tools::Run::StandAloneBlast');
996 0         0 my $cons_seq = $aln->consensus_string($ext->{'-consensusperc'});
997 0         0 $cons_seq =~ s/\?/N/g;
998 0         0 $query = Bio::Seq->new(-seq=>$cons_seq);
999             }elsif($alg eq 'hmmer'){
1000 0         0 $self->_load_module('Bio::Tools::Run::Hmmer');
1001 0         0 my $tmpio = Bio::Polloc::Polloc::IO->new();
1002             # The following lines should be addressed with a three-lines code,
1003             # but the buggy AUTOLOAD of Bio::Tools::Run::Hmmer let us no option
1004             # -lrr
1005 0         0 $factory = Bio::Tools::Run::Hmmer->new();
1006 0         0 $factory->hmm($tmpio->tempfile);
1007 0         0 $factory->program('hmmbuild');
1008 0         0 $factory->run($aln);
1009             #$factory->calibrate();
1010             }
1011             # -------------------------------------------------------------------- Search
1012 0         0 $self->debug("Searching... alg:$alg, sim:".$ext->{'-similarity'}." score:".$ext->{'-score'}." e:".$ext->{'-e'});
1013 0         0 GENOME: for my $Gk (0 .. $#{$self->genomes}){
  0         0  
1014 0         0 my $report;
1015 0 0       0 if($alg eq 'blast'){
    0          
1016 0 0       0 next GENOME if ($query->seq =~ tr/N//) > 0.25*$query->length; # issue#14
1017 0         0 $factory = Bio::Tools::Run::StandAloneBlast->new(
1018             '-e'=>$ext->{'-e'}, '-program'=>$ext->{'-p'},
1019             '-db'=>$self->{'_seqsdb'}."/$Gk" );
1020             # Try to handle issue#14 and possible undocumented related issues:
1021             # (still causing some problems in the STDERR output)
1022 0     0   0 try { $report = $factory->blastall($query); }
1023             catch Error with {
1024 0     0   0 $self->debug("Launch BLAST with query: ".$query->seq());
1025 0         0 $self->warn("BLAST failed, skipping query and attempting to continue");
1026 0         0 next GENOME;
1027             }
1028             otherwise {
1029 0     0   0 $self->throw("BLAST failed", $_, 'Bio::Polloc::Polloc::UnexpectedException');
1030 0         0 };
1031             }elsif($alg eq 'hmmer'){
1032 0         0 $factory->program('hmmsearch');
1033 0         0 $report = $factory->run($self->{'_seqsdb'}."/$Gk");
1034             }
1035             # ----------------------------------------------------------------- Parse search
1036 0         0 RESULT: while(my $res = $report->next_result){
1037 0         0 HIT: while(my $hit = $res->next_hit){
1038 0         0 HSP: while(my $hsp = $hit->next_hsp){
1039             # -------------------------------------------------------- Eval criteria
1040 0 0 0     0 if( ($alg eq 'blast'
      0        
      0        
      0        
      0        
1041             and $hsp->frac_identical('query') >= $ext->{'-similarity'}
1042             and $hsp->score >= $ext->{'-score'})
1043             or
1044             ($alg eq 'hmmer'
1045             and $hsp->score >= $ext->{'-score'}
1046             and $hsp->evalue <= $ext->{'-e'})
1047             ){
1048             # -------------------------------------------------- Save result
1049 0         0 $self->debug("Found: sim:".$hsp->frac_identical('query').", score:".
1050             $hsp->score.", e:".$hsp->evalue);
1051 0 0       0 my $r_pos = ["$Gk:".$hit->accession,
    0          
    0          
1052             $hsp->strand('hit')!=$hsp->strand('query')?
1053             $hsp->start('hit'):$hsp->end('hit'),
1054             $hsp->strand('hit')!=$hsp->strand('query')?
1055             $hsp->end('hit'):$hsp->start('hit'),
1056             $hsp->strand('hit')!=$hsp->strand('query')?
1057             -1 : 1,
1058             $hsp->bits];
1059 0         0 push @$pos, $r_pos;
1060             }
1061             } # HSP
1062             } # HIT
1063             } # RESULT
1064             } # GENOME
1065             }else{ # ---------------------------------------------------------------- UNSUPPORTED
1066 0         0 $self->throw('Unsupported search algorithm', $ext->{'-algorithm'});
1067             }
1068 0         0 return $pos;
1069             }
1070              
1071             =head2 _feat_index2obj
1072              
1073             =over
1074              
1075             =item
1076              
1077             Takes an index 2D matrix and returns it as the equivalent L<Bio::Polloc::LocusI> objects
1078              
1079             =item Arguments
1080              
1081             2D matrix of integers (arrayref)
1082              
1083             =item Returns
1084              
1085             2D matrix of L<Bio::Polloc::LocusI> objects (ref)
1086              
1087             =back
1088              
1089             =cut
1090              
1091             sub _feat_index2obj{
1092 0     0   0 my($self,$groups) = @_;
1093 0         0 for my $g (0 .. $#{$groups}){
  0         0  
1094 0         0 for my $m (0 .. $#{$groups->[$g]}){
  0         0  
1095 0         0 $groups->[$g]->[$m] = $self->get_locus($groups->[$g]->[$m]);
1096             }
1097             }
1098 0         0 return $groups;
1099             }
1100              
1101              
1102             =head2 _grouprules_cleanup
1103              
1104             =cut
1105              
1106             # Issue #7
1107             sub _grouprules_cleanup {
1108 0     0   0 my $self = shift;
1109 0 0       0 if(defined $self->{'_seqsdb'}) {
1110 0         0 my $tmp = $self->{'_seqsdb'};
1111 0 0       0 for my $k (0 .. $#{$self->genomes || []}){
  0         0  
1112 0         0 while(<$tmp/$k.*>){
1113 0 0       0 unlink $_ or $self->throw("Impossible to delete '$_'", $!);
1114             }
1115 0 0       0 unlink "$tmp/$k" or $self->throw("Impossible to delete '$tmp/$k'", $!);
1116             }
1117 0         0 rmdir $tmp;
1118             }
1119             }
1120              
1121             =head2 _initialize
1122              
1123             =cut
1124              
1125             sub _initialize {
1126 3     3   11 my($self, @args) = @_;
1127 3         28 $self->_register_cleanup_method(\&_grouprules_cleanup);
1128 3         29 my($source, $target, $features, $loci) =
1129             $self->_rearrange([qw(SOURCE TARGET FEATURES LOCI)], @args);
1130             # $self->throw('Discouraged use of -features flag, use -loci instead');
1131 3         21 $self->source($source);
1132 3         12 $self->target($target);
1133 3 50 33     16 $loci = $features if defined $features and not defined $loci;
1134 3         14 $self->locigroup($loci);
1135             }
1136              
1137              
1138             1;
1139              
1140