File Coverage

Bio/Tree/DistanceFactory.pm
Criterion Covered Total %
statement 89 227 39.2
branch 23 84 27.3
condition 6 39 15.3
subroutine 9 12 75.0
pod 4 4 100.0
total 131 366 35.7


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tree::DistanceFactory
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tree::DistanceFactory - Construct a tree using distance based methods
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tree::DistanceFactory;
21             use Bio::AlignIO;
22             use Bio::Align::DNAStatistics;
23             my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
24             my $stats = Bio::Align::DNAStatistics->new();
25              
26             my $alnin = Bio::AlignIO->new(-format => 'clustalw',
27             -file => 'file.aln');
28             my $aln = $alnin->next_aln;
29             # Of course matrix can come from a different place
30             # like PHYLIP if you prefer, Bio::Matrix::IO should be able
31             # to parse many things
32             my $jcmatrix = $stats->distance(-align => $aln,
33             -method => 'Jukes-Cantor');
34             my $tree = $tfactory->make_tree($jcmatrix);
35              
36              
37             =head1 DESCRIPTION
38              
39             This is a factory which will construct a phylogenetic tree based on
40             the pairwise sequence distances for a set of sequences. Currently
41             UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
42             construction methods are implemented.
43              
44             =head1 REFERENCES
45              
46             Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
47             Cambridge Univ Press, Cambridge, UK.
48              
49             Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
50             Neighbour-Joining trees of protein sequences." Bioinformatics
51             18(11):1546-1547.
52              
53             Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
54             for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
55              
56             =head1 FEEDBACK
57              
58             =head2 Mailing Lists
59              
60             User feedback is an integral part of the evolution of this and other
61             Bioperl modules. Send your comments and suggestions preferably to
62             the Bioperl mailing list. Your participation is much appreciated.
63              
64             bioperl-l@bioperl.org - General discussion
65             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66              
67             =head2 Support
68              
69             Please direct usage questions or support issues to the mailing list:
70              
71             I
72              
73             rather than to the module maintainer directly. Many experienced and
74             reponsive experts will be able look at the problem and quickly
75             address it. Please include a thorough description of the problem
76             with code and data examples if at all possible.
77              
78             =head2 Reporting Bugs
79              
80             Report bugs to the Bioperl bug tracking system to help us keep track
81             of the bugs and their resolution. Bug reports can be submitted the web:
82              
83             https://github.com/bioperl/bioperl-live/issues
84              
85             =head1 AUTHOR - Jason Stajich
86              
87             Email jason-at-bioperl.org
88              
89             =head1 APPENDIX
90              
91             The rest of the documentation details each of the object methods.
92             Internal methods are usually preceded with a _
93              
94             =cut
95              
96             package Bio::Tree::DistanceFactory;
97 1     1   871 use vars qw($DefaultMethod $Precision);
  1         2  
  1         41  
98 1     1   5 use strict;
  1         2  
  1         28  
99              
100             # some defaults
101             $DefaultMethod = 'UPGMA';
102             $Precision = 5;
103              
104 1     1   276 use Bio::Tree::Node;
  1         2  
  1         28  
105 1     1   265 use Bio::Tree::Tree;
  1         2  
  1         27  
106              
107 1     1   5 use base qw(Bio::Root::Root);
  1         3  
  1         1673  
108              
109             =head2 new
110              
111             Title : new
112             Usage : my $obj = Bio::Tree::DistanceFactory->new();
113             Function: Builds a new Bio::Tree::DistanceFactory object
114             Returns : an instance of Bio::Tree::DistanceFactory
115             Args : -method => 'NJ' or 'UPGMA'
116              
117              
118             =cut
119              
120             sub new {
121 1     1 1 740 my($class,@args) = @_;
122 1         23 my $self = $class->SUPER::new(@args);
123              
124 1         8 my ($method) = $self->_rearrange([qw(METHOD)],
125             @args);
126 1   33     7 $self->method($method || $DefaultMethod);
127 1         2 return $self;
128             }
129              
130             =head2 make_tree
131              
132             Title : make_tree
133             Usage : my $tree = $disttreefact->make_tree($matrix);
134             Function: Build a Tree based on a distance matrix
135             Returns : L
136             Args : L object
137              
138              
139             =cut
140              
141             sub make_tree{
142 1     1 1 5 my ($self,$matrix) = @_;
143 1 50 33     9 if( ! defined $matrix || !ref($matrix) ||
      33        
144             ! $matrix->isa('Bio::Matrix::MatrixI') ) {
145 0         0 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
146 0         0 return;
147             }
148              
149 1         3 my $method = uc ($self->method);
150 1 50       6 if( $method =~ /NJ/i ) {
    0          
151 1         3 return $self->_nj($matrix);
152             } elsif( $method =~ /UPGMA/i ) {
153 0         0 return $self->_upgma($matrix);
154             } else {
155 0         0 $self->warn("Unknown tree construction method '$method'. Cannot run.");
156 0         0 return;
157             }
158            
159             }
160              
161              
162             =head2 _nj
163              
164             Title : _nj
165             Usage : my $tree = $disttreefact->_nj($matrix);
166             Function: Construct a tree based on distance matrix using the
167             Neighbor Joining algorithm (Saitou and Nei, 1987)
168             Implementation based on Kevin Howe's Quicktree implementation
169             and uses his tricks (some based on Bill Bruno's work) to eliminate
170             negative branch lengths
171             Returns : L
172             Args : L object
173              
174             =cut
175              
176             sub _nj {
177 1     1   2 my ($self,$distmat) = @_;
178              
179             # we assume type checking of $aln has already been done
180             # client shouldn't be calling this directly anyways, using the
181             # make_tree method is preferred
182            
183             # so that we can trim the number of digits shown as the branch length
184 1         4 my $precisionstr = "%.$Precision"."f";
185              
186 1         5 my @names = $distmat->column_names;
187 1         2 my $N = scalar @names;
188 1         3 my ($i,$j,$m,@nodes,$mat,@r);
189 1         2 my $L = $N;
190              
191 1 50       5 if( $N < 2 ) {
    50          
192 0         0 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
193 0         0 return;
194             } elsif( $N == 2 ) {
195 0         0 $i = 0;
196 0         0 my $d = sprintf($precisionstr,
197             $distmat->get_entry($names[0],$names[1]) / 2);
198 0         0 my $root = Bio::Tree::Node->new();
199 0         0 for my $nm ( @names ) {
200 0         0 $root->add_Descendents( Bio::Tree::Node->new(-id => $nm,
201             -branch_length => $d));
202             }
203 0         0 return Bio::Tree::Tree(-root => $root);
204             }
205 1         2 my $c = 0;
206            
207 1         4 for ( $i = 0; $i < $N; $i++ ) {
208 14         40 push @nodes, Bio::Tree::Node->new(-id => $names[$i]);
209 14         17 my $ri = 0;
210 14         31 for( $j = 0; $j < $N; $j++ ) {
211 196         277 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
212 196         413 $ri += $mat->[$i][$j];
213             }
214 14         37 $r[$i] = $ri / ($L -2);
215             }
216            
217 1         4 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
218 11         14 my ($mini,$minj,$min);
219 11         17 for($i = 0; $i < $N; $i++ ) {
220 154 100       215 next unless defined $nodes[$i];
221 99         122 for( $j = 0; $j < $i; $j++ ) {
222 516 100       618 next unless defined $nodes[$j];
223 451         425 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
224 451 100 100     1001 if( ! defined $min ||
225             $dist <= $min) {
226 32         56 ($mini,$minj,$min) = ($i,$j,$dist);
227             }
228             }
229             }
230 11         17 my $dij = $mat->[$mini][$minj];
231 11         15 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
232 11         12 my $dist_j = $dij - $dist_i;
233            
234             # deal with negative branch lengths
235             # per code in K.Howe's quicktree
236 11 50       24 if( $dist_i < 0 ) {
    50          
237 0         0 $dist_i = 0;
238 0         0 $dist_j = $dij;
239 0 0       0 $dist_j = 0 if( $dist_j < 0 );
240             } elsif( $dist_j < 0 ) {
241 0         0 $dist_j = 0;
242 0         0 $dist_i = $dij;
243 0 0       0 $dist_i = 0 if( $dist_i < 0 );
244             }
245            
246 11         71 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
247 11         43 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
248            
249 11         31 my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini],
250             $nodes[$minj] ]);
251              
252 11         16 $nodes[$mini] = $newnode;
253 11         12 delete $nodes[$minj];
254            
255             # update the distance matrix
256 11         13 $r[$mini] = 0;
257 11         12 my ($dmi,$dmj);
258 11         20 for( $m = 0; $m < $N; $m++ ) {
259 154 100       211 next unless defined $nodes[$m];
260 88 100       108 if( $m != $mini ) {
261 77         79 $dmj = $mat->[$m][$minj];
262            
263 77         74 my ($row,$col);
264 77         76 ($row,$col) = ($m,$mini);
265 77         74 $dmi = $mat->[$row][$col];
266            
267             # from K.Howe's notes in quicktree
268             # we can actually adjust r[m] here, by using the form:
269             # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
270              
271             # Note: in Bill Bruno's method for negative branch
272             # elimination, then if either dist_i is positive and
273             # dist_j is 0, or dist_i is zero and dist_j is positive
274             # (after adjustment) then the matrix entry is formed
275             # from the distance to the node in question (m) to the
276             # node with the zero branch length (whichever it was).
277             # I think my code already has the same effect; this is
278             # certainly true if dij is equal to dist_i + dist_j,
279             # which it should have been fixed to
280              
281 77         101 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
282             ($dmi + $dmj - $dij) / 2;
283            
284             # If we don't want to try and correct negative brlens
285             # this is essentially what is in Edddy et al, BSA book.
286             # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
287             #
288 77         105 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
289             $mat->[$row][$col]) / ( $L - 3);
290 77         115 $r[$mini] += $dmk;
291             }
292             }
293 11         12 $L--;
294 11         21 $r[$mini] /= $L - 2;
295             }
296            
297             # should be 3 nodes left
298 1         2 my (@leftovernodes,@leftovers);
299 1         4 for( my $k = 0; $k < $N; $k++ ) {
300 14 100       22 if( defined $nodes[$k] ) {
301 3         4 push @leftovers, $k;
302 3         11 push @leftovernodes, $nodes[$k];
303             }
304             }
305 1         2 my ($l_0,$l_1,$l_2) = @leftovers;
306            
307 1         4 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
308             $mat->[$l_2][$l_1] ) / 2;
309            
310 1         3 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
311 1         1 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
312              
313             # This is Kev's code to get rid of negative branch lengths
314 1 50       7 if( $dist_i < 0 ) {
    50          
    50          
315 0         0 $dist_i = 0;
316 0         0 $dist_j = $mat->[$l_1][$l_0];
317 0         0 $dist_k = $mat->[$l_2][$l_0];
318 0 0       0 if( $dist_j < 0 ) {
    0          
319 0         0 $dist_j = 0;
320 0         0 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
321 0 0       0 $dist_k = 0 if( $dist_k < 0 );
322             } elsif( $dist_k < 0 ) {
323 0         0 $dist_k = 0;
324 0         0 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
325 0 0       0 $dist_j = 0 if( $dist_j < 0 );
326             }
327             } elsif( $dist_j < 0 ) {
328 0         0 $dist_j = 0;
329 0         0 $dist_i = $mat->[$l_1][$l_0];
330 0         0 $dist_k = $mat->[$l_2][$l_1];
331 0 0       0 if( $dist_i < 0 ) {
    0          
332 0         0 $dist_i = 0;
333 0         0 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
334 0 0       0 $dist_k = 0 if( $dist_k < 0 );
335             } elsif( $dist_k < 0 ) {
336 0         0 $dist_k = 0;
337 0         0 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
338 0 0       0 $dist_i = 0 if( $dist_i < 0 );
339             }
340             } elsif( $dist_k < 0 ) {
341 0         0 $dist_k = 0;
342 0         0 $dist_i = $mat->[$l_2][$l_0];
343 0         0 $dist_j = $mat->[$l_2][$l_1];
344 0 0       0 if( $dist_i < 0 ) {
    0          
345 0         0 $dist_i = 0;
346 0         0 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
347 0 0       0 $dist_j = 0 if $dist_j < 0;
348             } elsif( $dist_j < 0 ) {
349 0         0 $dist_j = 0;
350 0         0 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
351 0 0       0 $dist_i = 0 if $dist_i < 0;
352             }
353             }
354 1         8 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
355 1         6 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
356 1         4 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
357              
358 1         4 Bio::Tree::Tree->new(-root => Bio::Tree::Node->new
359             (-descendents => \@leftovernodes));
360             }
361              
362             =head2 _upgma
363              
364             Title : _upgma
365             Usage : my $tree = $disttreefact->_upgma($matrix);
366             Function: Construct a tree based on alignment using UPGMA
367             Returns : L
368             Args : L object
369              
370              
371             =cut
372              
373             sub _upgma{
374 0     0   0 my ($self,$distmat) = @_;
375             # we assume type checking of $matrix has already been done
376             # client shouldn't be calling this directly anyways, using the
377             # make_tree method is preferred
378            
379             # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
380             # originally by Sokal and Michener 1956
381              
382 0         0 my $precisionstr = "%.$Precision"."f";
383            
384 0         0 my ($i,$j,$x,$y,@dmat,@orig,@nodes);
385              
386 0         0 my @names = $distmat->column_names;
387 0         0 my $c = 0;
388             my @clusters = map {
389 0         0 my $r = { 'id' => $c,
  0         0  
390             'height' => 0,
391             'contains' => [$c],
392             };
393 0         0 $c++;
394 0         0 $r;
395             } @names;
396              
397 0         0 my $K = scalar @clusters;
398 0         0 my (@mins,$min);
399 0         0 for ( $i = 0; $i < $K; $i++ ) {
400 0         0 for( $j = $i+1; $j < $K; $j++ ) {
401 0         0 my $d = $distmat->get_entry($names[$i],$names[$j]);
402             # get Min here on first time around, save 1 cycle
403 0         0 $dmat[$j][$i] = $dmat[$i][$j] = $d;
404 0         0 $orig[$i][$j] = $orig[$j][$i] = $d;
405 0 0 0     0 if ( ! defined $min || $d <= $min ) {
406 0 0 0     0 if( defined $min && $min == $d ) {
407 0         0 push @mins, [$i,$j];
408             } else {
409 0         0 @mins = [$i,$j];
410 0         0 $min = $d;
411             }
412             }
413             }
414             }
415             # distance between each cluster is avg distance
416             # between pairs of sequences from each cluster
417 0         0 while( $K > 1 ) {
418             # fencepost - we already have found the $min
419             # so very first time loop is executed we can skip checking
420 0 0       0 unless( defined $min ) {
421 0         0 for($i = 0; $i < $K; $i++ ) {
422 0         0 for( $j = $i+1; $j < $K; $j++ ) {
423 0         0 my $dij = $dmat[$i][$j];
424 0 0 0     0 if( ! defined $min ||
425             $dij <= $min) {
426 0 0 0     0 if( defined $min &&
427             $min == $dij ) {
428 0         0 push @mins, [$i,$j];
429             } else {
430 0         0 @mins = [ $i,$j ];
431 0         0 $min = $dij;
432             }
433             }
434             }
435             }
436             }
437             # randomly break ties
438 0         0 ($x,$y) = @{ $mins[int(rand(scalar @mins))] };
  0         0  
439              
440             # now we are going to join clusters x and y, make a new cluster
441              
442 0         0 my $node = Bio::Tree::Node->new();
443 0         0 my @subids;
444 0         0 for my $cid ( $x,$y ) {
445 0         0 my $nid = $clusters[$cid]->{'id'};
446 0 0       0 if( ! defined $nodes[$nid] ) {
447 0         0 $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]);
448             }
449             $nodes[$nid]->branch_length
450 0         0 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
451 0         0 $node->add_Descendent($nodes[$nid]);
452 0         0 push @subids, @{ $clusters[$cid]->{'contains'} };
  0         0  
453             }
454 0         0 my $cluster = { 'id' => $c++,
455             'height' => $min / 2,
456             'contains' => [@subids],
457             };
458              
459 0         0 $K--; # we are going to drop the last node so go ahead and decrement K
460 0         0 $nodes[$cluster->{'id'}] = $node;
461 0 0       0 if ( $y != $K ) {
462 0         0 $clusters[$y] = $clusters[$K];
463 0         0 $dmat[$y] = $dmat[$K];
464 0         0 for ( $i = 0; $i < $K; $i++ ) {
465 0         0 $dmat[$i][$y] = $dmat[$y][$i];
466             }
467             }
468 0         0 delete $clusters[$K];
469 0         0 $clusters[$x] = $cluster;
470             # now recalculate @dmat
471 0         0 for( $i = 0; $i < $K; $i++ ) {
472 0 0       0 if( $i != $x) {
473 0         0 $dmat[$i][$x] = $dmat[$x][$i] =
474             &_upgma_distance($clusters[$i],$clusters[$x],\@orig);
475             } else {
476 0         0 $dmat[$i][$i] = 0;
477             }
478             }
479             # reset so next loop iteration
480             # we will find minimum distance
481 0         0 @mins = ();
482 0         0 $min = undef;
483             }
484 0         0 Bio::Tree::Tree->new(-root => $nodes[-1]);
485             }
486              
487             # calculate avg distance between clusters - be they
488             # single sequences or the combination of multiple seqences
489             # $cluster_i and $cluster_j are the clusters to operate on
490             # and $distances is a matrix (arrayref of arrayrefs) of pairwise
491             # differences indexed on the sequence ids -
492             # so $distances->[0][1] is the distance between sequences 0 and 1
493              
494             sub _upgma_distance {
495 0     0   0 my ($cluster_i, $cluster_j, $distances) = @_;
496 0         0 my $ilen = scalar @{ $cluster_i->{'contains'} };
  0         0  
497 0         0 my $jlen = scalar @{ $cluster_j->{'contains'} };
  0         0  
498 0         0 my ($d,$count);
499 0         0 for( my $i = 0; $i < $ilen; $i++ ) {
500 0         0 my $i_id = $cluster_i->{'contains'}->[$i];
501 0         0 for( my $j = 0; $j < $jlen; $j++) {
502 0         0 my $j_id = $cluster_j->{'contains'}->[$j];
503 0 0       0 if( ! defined $distances->[$i_id][$j_id] ) {
504 0         0 warn("no value for $i_id $j_id\n");
505             } else {
506 0         0 $d += $distances->[$i_id][$j_id];
507             }
508 0         0 $count++;
509             }
510             }
511 0         0 return $d / $count;
512             }
513              
514             =head2 method
515              
516             Title : method
517             Usage : $obj->method($newval)
518             Function:
519             Example :
520             Returns : value of method (a scalar)
521             Args : on set, new value (a scalar or undef, optional)
522              
523              
524             =cut
525              
526             sub method{
527 2     2 1 3 my $self = shift;
528 2 100       6 return $self->{'_method'} = shift if @_;
529 1         3 return $self->{'_method'};
530             }
531              
532              
533             =head2 check_additivity
534              
535             Title : check_additivity
536             Usage : if( $distance->check_additivity($matrix) ) {
537             }
538             Function : See if matrix obeys additivity principal
539             Returns : boolean
540             Args : Bio::Matrix::MatrixI
541             References: Based on a Java implementation by
542             Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
543             http://www.dina.kvl.dk/~sestoft/bsa.html
544             which in turn is based on algorithms described in
545             R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
546             Biological Sequence Analysis CUP 1998, Chapter 7.
547              
548             =cut
549              
550             sub check_additivity{
551 0     0 1   my ($self,$matrix) = @_;
552 0           my @names = $matrix->column_names;
553 0           my $len = scalar @names;
554 0 0         return unless $len >= 4;
555             # look at all sets of 4
556 0           for( my $i = 0; $i < $len; $i++ ) {
557 0           for( my $j = $i+1; $j< $len; $j++) {
558 0           for( my $k = $j+1; $k < $len; $k ++ ) {
559 0           for( my $m = $k +1; $m < $len; $m++ ) {
560 0           my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
561             $matrix->get_entry($names[$k],$names[$m]);
562 0           my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
563             $matrix->get_entry($names[$j],$names[$m]);
564 0           my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
565             $matrix->get_entry($names[$j],$names[$k]);
566 0 0 0       if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
      0        
      0        
      0        
      0        
567             || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
568             || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {
569 0           return 0;
570             }
571             }
572             }
573             }
574             }
575 0           return 1;
576             }
577              
578             1;