File Coverage

blib/lib/Bio/Tools/Run/Phylo/Phylip/Neighbor.pm
Criterion Covered Total %
statement 63 168 37.5
branch 9 68 13.2
condition 0 17 0.0
subroutine 16 23 69.5
pod 7 7 100.0
total 95 283 33.5


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::Tools::Run::Phylo::Phylip::Neighbor
2             #
3             # Created by
4             #
5             # Shawn Hoon
6             #
7             # You may distribute this module under the same terms as perl itself
8              
9             # POD documentation - main docs before the code
10              
11             =head1 NAME
12              
13             Bio::Tools::Run::Phylo::Phylip::Neighbor - Wrapper for the phylip
14             program neighbor by Joseph Felsenstein for creating a phylogenetic
15             tree(either through Neighbor or UPGMA) based on protein distances
16             based on amino substitution rate.
17              
18             14 Nov 2002 Shawn
19             Works with Phylip version 3.6
20              
21             =head1 SYNOPSIS
22              
23             #Create a SimpleAlign object
24             @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
25             $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
26             $inputfilename = 't/data/cysprot.fa';
27             $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object.
28              
29             # Create the Distance Matrix
30             # using a default PAM matrix and id name lengths limit of 30 note to
31             # use id name length greater than the standard 10 in neighbor, you
32             # will need to modify the neighbor source code
33              
34             $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
35             my $matrix = $protdist_factory->run($aln);
36              
37             #Create the tree passing in the distance matrix
38             @params = ('type'=>'NJ','outgroup'=>2,'lowtri'=>1,
39             'upptri'=>1,'subrep'=>1);
40              
41             my $neighbor_factory =
42             Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
43              
44             #you can set your outgroup using either a number specifying
45             #the rank in the matrix or you can just use the name of the
46             #species
47              
48             $neighbor_factory->outgroup('ENSP00001');
49             #or
50             $neighbor_factory->outgroup(1);
51              
52             my ($tree) = $neighbor_factory->run($matrix);
53              
54             # Alternatively, one can create the tree by passing in a file name
55             # containing a phylip formatted distance matrix(using protdist)
56             my $neighbor_factory =
57             Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
58             my ($tree) = $neighbor_factory->run('/home/shawnh/prot.dist');
59              
60             # To prevent PHYLIP from truncating sequence names:
61             # Step 1. Shelf the original names:
62             my ($aln_safe, $ref_name)= # $aln_safe has serial names
63             $aln->set_displayname_safe(); # $ref_name holds original names
64             # Step 2. Run ProtDist and Neighbor:
65             $matrix = $protdist_factory->
66             creat_distance_matrix($aln_safe); # Use $aln_safe instead of $aln
67             $tree = $neighbor_factory->run($matrix);
68             # Step 3. Retrieve orgininal OTU names:
69             use Bio::Tree::Tree;
70             my @nodes=$tree->get_nodes();
71             foreach my $nd (@nodes){
72             $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf;
73             }
74              
75              
76             =head1 PARAMTERS FOR NEIGHBOR COMPUTATION
77              
78             =cut
79              
80             =head2 TYPE
81              
82             Title : TYPE
83             Description : (optional)
84             This sets the type of tree to construct, using
85             neighbor joining or UPGMA.
86              
87             NJ Neighbor Joining
88             UPGMA UPGMA
89              
90             Usage : @params = ('type'=>'X');#where X is one of the values above
91             Defaults to NJ
92              
93             For more information on the usage of the different
94             models, please refer to the documentation found in
95             the phylip package.
96              
97             =head2 OUTGROUP (*ONLY AVAILABLE FOR NEIGHBOR JOINING)
98              
99             Title : OUTGROUP
100             Description : (optional)
101             This option selects the species to be used as the outgroup
102             Acceptable Values: integer
103             Usage : @params = ('outgroup'=>'X');
104             where X is an positive integer not more than the
105             number of sequences
106             Defaults to 1
107              
108             =head2 LOWTRI
109              
110             Title : LOWTRI
111             Description : (optional)
112             This indicates that the distance matrix is
113             input in Lower-triangular form (the lower-left
114             half of the distance matrix only, without the zero
115             diagonal elements)
116             Usage : @params = ('lowtri'=>'X'); where X is either 1 or 0
117             Defaults to 0
118              
119             =head2 UPPTRI
120              
121             Title : UPPTRI
122             Description : (optional)
123             This indicates that the distance matrix is input in
124             upper-triangular form (the upper-right half of the
125             distance matrix only, without the zero diagonal elements.)
126             Usage : @params = ('upptri'=>'X'); where X is either 1 or 0
127             Defaults to 0
128              
129             =head2 SUBREP
130              
131             Title : SUBREP
132             Description : (optional)
133             This is the Subreplication option.
134              
135             It informs the program that after each distance will
136             be provided an integer indicating that the distance
137             is a mean of that many replicates.
138              
139             Usage : @params = ('subrep'=>'X'); where X is either 1 or 0
140             Defaults to 0
141              
142             =head2 JUMBLE
143              
144             Title : JUMBLE
145             Description : (optional)
146              
147             This enables you to tell the program to use a random
148             number generator to choose the input order of
149             species. seed: an integer between 1 and 32767 and of
150             the form 4n+1 which means that it must give a
151             remainder of 1 when divided by 4. Each different
152             seed leads to a different sequence of addition of
153             species. By simply changing the random number seed
154             and re-running programs one can look for other, and
155             better trees. iterations:
156              
157             Usage : @params = ('jumble'=>'17); where 17 is the random seed
158             Defaults to no jumble
159              
160             =head1 FEEDBACK
161              
162             =head2 Mailing Lists
163              
164             User feedback is an integral part of the evolution of this and other
165             Bioperl modules. Send your comments and suggestions preferably to one
166             of the Bioperl mailing lists. Your participation is much appreciated.
167              
168             bioperl-l@bioperl.org - General discussion
169             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
170              
171             =head2 Support
172              
173             Please direct usage questions or support issues to the mailing list:
174              
175             I
176              
177             rather than to the module maintainer directly. Many experienced and
178             reponsive experts will be able look at the problem and quickly
179             address it. Please include a thorough description of the problem
180             with code and data examples if at all possible.
181              
182             =head2 Reporting Bugs
183              
184             Report bugs to the Bioperl bug tracking system to help us keep track
185             the bugs and their resolution. Bug reports can be submitted via the
186             web:
187              
188             http://redmine.open-bio.org/projects/bioperl/
189              
190             =head1 AUTHOR - Shawn Hoon
191              
192             Email shawnh@fugu-sg.org
193              
194             =head1 CONTRIBUTORS
195              
196             Email:jason-at-bioperl.org
197              
198             =head1 APPENDIX
199              
200             The rest of the documentation details each of the object
201             methods. Internal methods are usually preceded with a _
202              
203             =cut
204              
205             #'
206              
207            
208             package Bio::Tools::Run::Phylo::Phylip::Neighbor;
209              
210 1         163 use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
211             @NEIGHBOR_PARAMS @OTHER_SWITCHES
212 1     1   1215 %OK_FIELD);
  1         2  
213 1     1   21 use strict;
  1         3  
  1         48  
214 1     1   10 use Bio::SimpleAlign;
  1         3  
  1         37  
215 1     1   9 use Bio::AlignIO;
  1         3  
  1         39  
216 1     1   8 use Bio::TreeIO;
  1         2  
  1         22  
217 1     1   6 use Bio::Root::Root;
  1         3  
  1         35  
218 1     1   9 use Bio::Root::IO;
  1         3  
  1         43  
219 1     1   9 use Bio::Tools::Run::Phylo::Phylip::Base;
  1         3  
  1         36  
220 1     1   9 use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu);
  1         3  
  1         102  
221              
222 1     1   6 use Cwd;
  1         6  
  1         160  
223            
224             # inherit from Phylip::Base which has some methods for dealing with
225             # Phylip specifics
226             @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
227              
228             # You will need to enable the neighbor program. This
229             # can be done in (at least) 3 ways:
230             #
231             # 1. define an environmental variable PHYLIPDIR:
232             # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
233             #
234             # 2. include a definition of an environmental variable PHYLIPDIR in
235             # every script that will use Neighbor.pm.
236             # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
237             #
238             # 3. You can set the path to the program through doing:
239             # my @params('program'=>'/usr/local/bin/neighbor');
240             # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
241             #
242              
243              
244             BEGIN {
245              
246 1     1   3 $PROGRAMNAME="neighbor";
247 1 50       4 if (defined $ENV{PHYLIPDIR}) {
248 0   0     0 $PROGRAMDIR = $ENV{PHYLIPDIR} || '';
249 0 0       0 $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,
250             $PROGRAMNAME.($^O =~ /mswin/i ?'.exe':''));
251             }
252             else {
253 1         2 $PROGRAM = $PROGRAMNAME;
254             }
255 1         3 @NEIGHBOR_PARAMS = qw(TYPE OUTGROUP LOWTRI UPPTRI SUBREP JUMBLE MULTIPLE);
256 1         2 @OTHER_SWITCHES = qw(QUIET);
257 1         2 foreach my $attr(@NEIGHBOR_PARAMS,@OTHER_SWITCHES) {
258 8         1288 $OK_FIELD{$attr}++;
259             }
260             }
261              
262             =head2 program_name
263              
264             Title : program_name
265             Usage : >program_name()
266             Function: holds the program name
267             Returns: string
268             Args : None
269              
270             =cut
271              
272             sub program_name {
273 6     6 1 29 return 'neighbor';
274             }
275              
276             =head2 program_dir
277              
278             Title : program_dir
279             Usage : ->program_dir()
280             Function: returns the program directory, obtained from ENV variable.
281             Returns: string
282             Args :
283              
284             =cut
285              
286             sub program_dir {
287 3 50   3 1 39 return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR};
288             }
289              
290             sub new {
291 1     1 1 99 my ($class,@args) = @_;
292 1         14 my $self = $class->SUPER::new(@args);
293              
294 1         59 my ($attr, $value);
295 1         5 while (@args) {
296 5         14 $attr = shift @args;
297 5         10 $value = shift @args;
298 5 50       16 next if( $attr =~ /^-/ ); # don't want named parameters
299 5 50       14 if ($attr =~ /IDLENGTH/i){
300 0         0 $self->idlength($value);
301 0         0 next;
302             }
303 5         49 $self->$attr($value);
304             }
305 1 50       4 if (! defined $self->idlength){
306 1         3 $self->idlength(10);
307             }
308 1         3 return $self;
309             }
310              
311             sub AUTOLOAD {
312 5     5   12 my $self = shift;
313 5         10 my $attr = $AUTOLOAD;
314 5         25 $attr =~ s/.*:://;
315 5         14 $attr = uc $attr;
316 5 50       19 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
317 5 50       17 $self->{$attr} = shift if @_;
318 5         16 return $self->{$attr};
319             }
320              
321             =head2 idlength
322              
323             Title : idlength
324             Usage : $obj->idlength ($newval)
325             Function:
326             Returns : value of idlength
327             Args : newvalue (optional)
328              
329              
330             =cut
331              
332             sub idlength{
333 2     2 1 5 my $self = shift;
334 2 100       5 if( @_ ) {
335 1         2 my $value = shift;
336 1         2 $self->{'idlength'} = $value;
337             }
338 2         6 return $self->{'idlength'};
339              
340             }
341              
342              
343             =head2 run
344              
345             Title : run
346             Usage :
347             $inputfilename = 't/data/prot.dist';
348             $tree = $neighborfactory->run($inputfilename);
349             or
350             $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
351             $matrix = $protdist_factory->create_distance_matrix($aln);
352             $tree= $neighborfactory->run($matrix);
353              
354             Function: a Bio:Tree from a protein distance matrix created by protidst
355             Example :
356             Returns : Bio::Tree
357             Args : Name of a file containing a protein distance matrix in Phylip format
358             or a hash ref to a matrix
359              
360             Throws an exception if argument is not either a string (eg a
361             filename) or a Hash. If argument is string, throws exception
362             if file corresponding to string name can not be found.
363              
364             =cut
365              
366             sub run{
367              
368 0     0 1   my ($self,$input) = @_;
369 0           my ($temp,$infilename, $seq);
370 0           my ($attr, $value, $switch);
371             # Create input file pointer
372 0           $infilename = $self->_setinput($input);
373 0 0         if (!$infilename) {$self->throw("Problems setting up for neighbor. Probably bad input data in $input !");}
  0            
374              
375             # Create parameter string to pass to neighbor program
376 0           my $param_string = $self->_setparams();
377              
378             # run neighbor
379 0           my @tree = $self->_run($infilename,$param_string);
380 0 0         return wantarray ? @tree: \@tree;
381             }
382              
383             =head2 create_tree
384              
385             Title : create_tree
386             Usage : my $file = $app->create_tree($treefile);
387             Function: This method is deprecated. Please use run method.
388             Returns : File containing the rendered tree
389             Args : either a Bio::Tree::TreeI
390             OR
391             filename of a tree in newick format
392              
393             =cut
394              
395             sub create_tree{
396 0     0 1   return shift->run(@_);
397             }
398              
399              
400             #################################################
401              
402             =head2 _run
403              
404             Title : _run
405             Usage : Internal function, not to be called directly
406             Function: makes actual system call to neighbor program
407             Example :
408             Returns : Bio::Tree object
409             Args : Name of a file containing protein distances in Phylip format
410             and a parameter string to be passed to neighbor
411              
412             =cut
413              
414             sub _run {
415 0     0     my ($self,$infile,$param_string) = @_;
416 0           my $instring;
417 0           my $curpath = cwd;
418 0 0         unless( File::Spec->file_name_is_absolute($infile) ) {
419 0           $infile = $self->io->catfile($curpath,$infile);
420             }
421 0           $instring = $infile."\n$param_string";
422 0           $self->debug( "Program ".$self->executable."\n");
423 0           chdir($self->tempdir);
424             #open a pipe to run neighbor to bypass interactive menus
425 0 0 0       if ($self->quiet() || $self->verbose() < 0) {
426 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
427 0           open(NEIGHBOR,"|".$self->executable.">$null");
428             }
429             else {
430 0           open(NEIGHBOR,"|".$self->executable);
431             }
432 0           print NEIGHBOR $instring;
433 0           close(NEIGHBOR);
434 0           chdir($curpath);
435             #get the results
436 0           my $outfile = $self->io->catfile($self->tempdir,$self->outfile);
437 0           my $treefile = $self->io->catfile($self->tempdir,$self->treefile);
438            
439 0 0         $self->throw("neighbor did not create tree correctly (expected $treefile) ") unless (-e $treefile);
440 0           my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick');
441 0           my @tree;
442 0           while (my $tree = $in->next_tree){
443 0           push @tree, $tree;
444             }
445              
446             # Clean up the temporary files created along the way...
447 0 0         unless ( $self->save_tempfiles ) {
448 0           unlink $outfile;
449 0           unlink $treefile;
450             }
451 0           return @tree;
452             }
453              
454             =head2 _setinput()
455              
456             Title : _setinput
457             Usage : Internal function, not to be called directly
458             Function: Create input file for neighbor program
459             Example :
460             Returns : name of file containing the protein distance matrix in Phylip format
461             Args : name of file created by protdist or ref to hash created by
462             Bio::Tools:Run::Phylo::Phylip::ProtDist
463              
464              
465             =cut
466              
467             sub _setinput {
468 0     0     my ($self, $input) = @_;
469 0           my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh);
470              
471             #If $input is not a filename it better be a HASF reference
472              
473             # a phy formatted alignment file created by protdist
474 0 0         unless (ref $input) {
475             # check that file exists or throw
476 0           $alnfilename= $input;
477 0 0         unless (-e $input) {return 0;}
  0            
478 0           return $alnfilename;
479             }
480              
481 0 0         my @input = ref($input) eq "ARRAY" ? @{$input} : ($input);
  0            
482 0           ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir);
483 0           my $input_count = 0;
484 0           foreach my $input(@input){
485 0 0         if ($input->isa("Bio::Matrix::PhylipDist")){
486             # Open temporary file for both reading & writing of distance matrix
487 0           print $tfh $input->print_matrix;
488 0           $input_count++;
489             }
490             }
491 0           $self->_input_nbr($input_count);
492 0           close($tfh);
493             #get names from the first matrix, to be used in outgroup ordering
494 0           my %names;
495 0           $input = shift @input;
496             #set the species names
497 0           my @names = @{$input->names};
  0            
498 0           for(my $i=0; $i<= $#names; $i++){
499 0           $names{$names[$i]} = $i+1;
500             }
501 0           $self->names(\%names);
502 0           return $alnfilename;
503             }
504              
505             sub _input_nbr {
506 0     0     my ($self,$val) = @_;
507 0 0         if($val){
508 0           $self->{'_input_nbr'} = $val;
509 0           } return $self->{'_input_nbr'};
510             }
511              
512             =head2 names()
513              
514             Title : names
515             Usage : $tree->names(\%names)
516             Function: get/set for a hash ref for storing names in matrix
517             with rank as values.
518             Example :
519             Returns : hash reference
520             Args : hash reference
521              
522             =cut
523              
524             sub names {
525 0     0 1   my ($self,$name) = @_;
526 0 0         if($name){
527 0           $self->{'_names'} = $name;
528             }
529 0           return $self->{'_names'};
530             }
531              
532             =head2 _setparams()
533              
534             Title : _setparams
535             Usage : Internal function, not to be called directly
536             Function: Create parameter inputs for neighbor program
537             Example :
538             Returns : parameter string to be passed to neighbor
539             Args : name of calling object
540              
541             =cut
542              
543             sub _setparams {
544 0     0     my ($attr, $value, $self);
545              
546             #do nothing for now
547 0           $self = shift;
548 0           my $param_string = "";
549 0           my $type ="";
550 0           my $version = $self->version;
551 0           my %menu = %{$Menu{$version}->{'NEIGHBOR'}};
  0            
552              
553 0           foreach my $attr ( @NEIGHBOR_PARAMS) {
554 0           $value = $self->$attr();
555 0 0 0       next unless (defined $value && $value);
556 0 0         if ($attr =~/TYPE/i){
    0          
    0          
    0          
557 0 0         if ($value=~/UPGMA/i){
558 0           $type = "UPGMA";
559 0           $param_string .= $menu{'TYPE'}{'UPGMA'};
560             }
561             }
562             elsif($attr =~ /OUTGROUP/i){
563 0 0         if ($type ne "UPGMA"){
564 0 0         if($value !~/^\d+$/){ # is a name so find the rank
565 0           my %names = %{$self->names};
  0            
566 0 0         $names{$value} || $self->throw("Outgroup $value not found");
567 0           $value = $names{$value};
568             }
569 0           $param_string .= $menu{'OUTGROUP'}."$value\n";
570             }
571             else {
572 0           $self->throw("Can't set outgroup using UPGMA. Use Neighbor-Joining instead");
573             }
574             }
575             elsif ($attr =~ /JUMBLE/i){
576 0 0 0       $self->throw("Unallowed value for random seed, need odd number") unless ($value =~ /\d+/ && ($value % 2 == 1));
577 0           $param_string .=$menu{'JUMBLE'}."$value\n";
578             }
579             elsif($attr=~/MULTIPLE/i){
580 0           $param_string.=$menu{'MULTIPLE'}."$value\n";
581             #version 3.6 needs a random seed
582 0 0         if($version eq "3.6"){
583 0           $param_string .= (2 * int(rand(10000)) + 1)."\n";
584             }
585             }
586             else{
587 0           $param_string .= $menu{uc $attr};
588             }
589             }
590 0 0 0       if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){
      0        
591 0           $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n";
592             }
593            
594 0           $param_string .=$menu{'SUBMIT'};
595              
596 0           return $param_string;
597             }
598              
599             =head2 outfile
600              
601             Title : outfile
602             Usage : $obj->outfile($newval)
603             Function: Get/Set default PHYLIP outfile name ('outfile' usually)
604             Returns : value of outfile
605             Args : newvalue (optional)
606              
607              
608             =cut
609              
610              
611             =head2 treefile
612              
613             Title : treefile
614             Usage : $obj->treefile($newval)
615             Function: Get/Set the default PHYLIP treefile name ('treefile' usually)
616             Returns : value of treefile
617             Args : newvalue (optional)
618              
619              
620             =cut
621              
622              
623             1; # Needed to keep compiler happy