File Coverage

blib/lib/Bio/Tools/Run/Alignment/Clustalw.pm
Criterion Covered Total %
statement 30 219 13.7
branch 0 90 0.0
condition 0 35 0.0
subroutine 11 22 50.0
pod 10 10 100.0
total 51 376 13.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Run::Alignment::Clustalw
3             #
4             # You may distribute this module under the same terms as perl itself
5             # POD documentation - main docs before the code
6              
7             =head1 NAME
8              
9             Bio::Tools::Run::Alignment::Clustalw - Object for the calculation of a
10             multiple sequence alignment from a set of unaligned sequences or
11             alignments using the Clustalw program
12              
13             =head1 SYNOPSIS
14              
15             # Build a clustalw alignment factory
16             @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
17             $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
18              
19             # Pass the factory a list of sequences to be aligned.
20             $inputfilename = 't/data/cysprot.fa';
21             $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object.
22             # or
23             $seq_array_ref = \@seq_array;
24             # where @seq_array is an array of Bio::Seq objects
25             $aln = $factory->align($seq_array_ref);
26              
27             # Or one can pass the factory a pair of (sub)alignments
28             #to be aligned against each other, e.g.:
29             $aln = $factory->profile_align($aln1,$aln2);
30             # where $aln1 and $aln2 are Bio::SimpleAlign objects.
31              
32             # Or one can pass the factory an alignment and one or more unaligned
33             # sequences to be added to the alignment. For example:
34             $aln = $factory->profile_align($aln1,$seq); # $seq is a Bio::Seq object.
35              
36             # Get a tree of the sequences
37             $tree = $factory->tree(\@seq_array);
38              
39             # Get both an alignment and a tree
40             ($aln, $tree) = $factory->run(\@seq_array);
41              
42             # Do a footprinting analysis on the supplied sequences, getting back the
43             # most conserved sub-alignments
44             my @results = $factory->footprint(\@seq_array);
45             foreach my $result (@results) {
46             print $result->consensus_string, "\n";
47             }
48              
49             # There are various additional options and input formats available.
50             # See the DESCRIPTION section that follows for additional details.
51              
52             =head1 DESCRIPTION
53              
54             Note: this DESCRIPTION only documents the Bioperl interface to
55             Clustalw. Clustalw, itself, is a large & complex program - for more
56             information regarding clustalw, please see the clustalw documentation
57             which accompanies the clustalw distribution. Clustalw is available
58             from (among others) ftp://ftp.ebi.ac.uk/pub/software/. Clustalw.pm has
59             only been tested using version 1.8 of clustalw. Compatibility with
60             earlier versions of the clustalw program is currently unknown. Before
61             running Clustalw successfully it will be necessary: to install clustalw
62             on your system, and to ensure that users have execute privileges for
63             the clustalw program.
64              
65             =head2 Helping the module find your executable
66              
67             You will need to enable Clustalw to find the clustalw program. This
68             can be done in (at least) three ways:
69              
70             1. Make sure the clustalw executable is in your path so that
71             which clustalw
72             returns a clustalw executable on your system.
73              
74             2. Define an environmental variable CLUSTALDIR which is a
75             directory which contains the 'clustalw' application:
76             In bash:
77              
78             export CLUSTALDIR=/home/username/clustalw1.8
79              
80             In csh/tcsh:
81              
82             setenv CLUSTALDIR /home/username/clustalw1.8
83              
84             3. Include a definition of an environmental variable CLUSTALDIR in
85             every script that will use this Clustalw wrapper module, e.g.:
86              
87             BEGIN { $ENV{CLUSTALDIR} = '/home/username/clustalw1.8/' }
88             use Bio::Tools::Run::Alignment::Clustalw;
89              
90             If you are running an application on a webserver make sure the
91             webserver environment has the proper PATH set or use the options 2 or
92             3 to set the variables.
93              
94             =head2 How it works
95              
96             Bio::Tools::Run::Alignment::Clustalw is an object for performing a
97             multiple sequence alignment from a set of unaligned sequences and/or
98             sub-alignments by means of the clustalw program.
99              
100             Initially, a clustalw "factory object" is created. Optionally, the
101             factory may be passed most of the parameters or switches of the
102             clustalw program, e.g.:
103              
104             @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
105             $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
106              
107             Any parameters not explicitly set will remain as the defaults of the
108             clustalw program. Additional parameters and switches (not available
109             in clustalw) may also be set. Currently, the only such parameter is
110             "quiet", which when set to a non-zero value, suppresses clustalw
111             terminal output. Not all clustalw parameters are supported at this
112             stage.
113              
114             By default, Clustalw output is returned solely in a the form of a
115             Bio::SimpleAlign object which can then be printed and/or saved
116             in multiple formats using the AlignIO.pm module. Optionally the raw
117             clustalw output file can be saved if the calling script specifies an
118             output file (with the clustalw parameter OUTFILE). Currently only the
119             GCG-MSF output file formats is supported.
120              
121             Not all parameters and features have been implemented yet in Perl format.
122              
123             Alignment parameters can be changed and/or examined at any time after
124             the factory has been created. The program checks that any
125             parameter/switch being set/read is valid. However, currently no
126             additional checks are included to check that parameters are of the
127             proper type (eg string or numeric) or that their values are within the
128             proper range. As an example, to change the value of the clustalw
129             parameter ktuple to 3 and subsequently to check its value one would
130             write:
131              
132             $ktuple = 3;
133             $factory->ktuple($ktuple);
134             $get_ktuple = $factory->ktuple();
135              
136             Once the factory has been created and the appropriate parameters set,
137             one can call the method align() to align a set of unaligned sequences,
138             or call profile_align() to add one or more sequences or a second
139             alignment to an initial alignment.
140              
141             Input to align() may consist of a set of unaligned sequences in the
142             form of the name of file containing the sequences. For example,
143              
144             $inputfilename = 't/data/cysprot.fa';
145             $aln = $factory-Ealign($inputfilename);
146              
147             Alternately one can create an array of Bio::Seq objects somehow
148              
149             $str = Bio::SeqIO->new(-file=> 't/data/cysprot.fa', -format => 'Fasta');
150             @seq_array =();
151             while ( my $seq = $str->next_seq() ) {push (@seq_array, $seq) ;}
152              
153             and pass the factory a reference to that array
154              
155             $seq_array_ref = \@seq_array;
156             $aln = $factory->align($seq_array_ref);
157              
158             In either case, align() returns a reference to a SimpleAlign object
159             which can then used (see L).
160              
161             Once an initial alignment exists, one can pass the factory additional
162             sequence(s) to be added (ie aligned) to the original alignment. The
163             alignment can be passed as either an alignment file or a
164             Bio:SimpleAlign object. The unaligned sequence(s) can be passed as a
165             filename or as an array of BioPerl sequence objects or as a single
166             BioPerl Seq object. For example (to add a single sequence to an
167             alignment),
168              
169             $str = Bio::AlignIO->new(-file=> 't/data/cysprot1a.msf');
170             $aln = $str->next_aln();
171             $str1 = Bio::SeqIO->new(-file=> 't/data/cysprot1b.fa');
172             $seq = $str1->next_seq();
173             $aln = $factory->profile_align($aln,$seq);
174              
175             In either case, profile_align() returns a reference to a SimpleAlign
176             object containing a new SimpleAlign object of the alignment with the
177             additional sequence(s) added in.
178              
179             Finally one can pass the factory a pair of (sub)alignments to be
180             aligned against each other. The alignments can be passed in the form
181             of either a pair of alignment files or a pair of Bio:SimpleAlign
182             objects. For example,
183              
184             $profile1 = 't/data/cysprot1a.msf';
185             $profile2 = 't/data/cysprot1b.msf';
186             $aln = $factory->profile_align($profile1,$profile2);
187              
188             or
189              
190             $str1 = Bio::AlignIO->new(-file=> 't/data/cysprot1a.msf');
191             $aln1 = $str1->next_aln();
192             $str2 = Bio::AlignIO->new(-file=> 't/data/cysprot1b.msf');
193             $aln2 = $str2->next_aln();
194             $aln = $factory->profile_align($aln1,$aln2);
195              
196             In either case, profile_align() returns a reference to a SimpleAlign
197             object containing an (super)alignment of the two input alignments.
198              
199             For more examples of syntax and use of Clustalw, the user is
200             encouraged to look at the script Clustalw.t in the t/ directory.
201              
202             Note: Clustalw is still under development. Various features of the
203             clustalw program have not yet been implemented. If you would like
204             that a specific clustalw feature be added to this perl contact
205             bioperl-l@bioperl.org.
206              
207             These can be specified as parameters when instantiating a new Clustalw
208             object, or through get/set methods of the same name (lowercase).
209              
210             =head1 PARAMETER FOR ALIGNMENT COMPUTATION
211              
212             =head2 KTUPLE
213              
214             Title : KTUPLE
215             Description : (optional) set the word size to be used in the alignment
216             This is the size of exactly matching fragment that is used.
217             INCREASE for speed (max= 2 for proteins; 4 for DNA),
218             DECREASE for sensitivity.
219             For longer sequences (e.g. >1000 residues) you may
220             need to increase the default
221              
222             =head2 TOPDIAGS
223              
224             Title : TOPDIAGS
225             Description : (optional) number of best diagonals to use
226             The number of k-tuple matches on each diagonal
227             (in an imaginary dot-matrix plot) is calculated.
228             Only the best ones (with most matches) are used in
229             the alignment. This parameter specifies how many.
230             Decrease for speed; increase for sensitivity.
231              
232             =head2 WINDOW
233              
234             Title : WINDOW
235             Description : (optional) window size
236             This is the number of diagonals around each of the 'best'
237             diagonals that will be used. Decrease for speed;
238             increase for sensitivity.
239              
240             =head2 PAIRGAP
241              
242             Title : PAIRGAP
243             Description : (optional) gap penalty for pairwise alignments
244             This is a penalty for each gap in the fast alignments.
245             It has little affect on the speed or sensitivity except
246             for extreme values.
247              
248             =head2 FIXEDGAP
249              
250             Title : FIXEDGAP
251             Description : (optional) fixed length gap penalty
252              
253             =head2 FLOATGAP
254              
255             Title : FLOATGAP
256             Description : (optional) variable length gap penalty
257              
258             =head2 MATRIX
259              
260             Title : MATRIX
261             Default : PAM100 for DNA - PAM250 for protein alignment
262             Description : (optional) substitution matrix used in the multiple
263             alignments. Depends on the version of clustalw as to
264             what default matrix will be used
265              
266             PROTEIN WEIGHT MATRIX leads to a new menu where you are
267             offered a choice of weight matrices. The default for
268             proteins in version 1.8 is the PAM series derived by
269             Gonnet and colleagues. Note, a series is used! The
270             actual matrix that is used depends on how similar the
271             sequences to be aligned at this alignment step
272             are. Different matrices work differently at each
273             evolutionary distance.
274              
275             DNA WEIGHT MATRIX leads to a new menu where a single
276             matrix (not a series) can be selected. The default is
277             the matrix used by BESTFIT for comparison of nucleic
278             acid sequences.
279              
280             =head2 TYPE
281              
282             Title : TYPE
283             Description : (optional) sequence type: protein or DNA. This allows
284             you to explicitly overide the programs attempt at
285             guessing the type of the sequence. It is only useful
286             if you are using sequences with a VERY strange
287             composition.
288              
289             =head2 OUTPUT
290              
291             Title : OUTPUT
292             Description : (optional) clustalw supports GCG or PHYLIP or PIR or
293             Clustal format. See the Bio::AlignIO modules for
294             which formats are supported by bioperl.
295              
296             =head2 OUTFILE
297              
298             Title : OUTFILE
299             Description : (optional) Name of clustalw output file. If not set
300             module will erase output file. In any case alignment will
301             be returned in the form of SimpleAlign objects
302              
303             =head2 TRANSMIT
304              
305             Title : TRANSMIT
306             Description : (optional) transitions not weighted. The default is to
307             weight transitions as more favourable than other
308             mismatches in DNA alignments. This switch makes all
309             nucleotide mismatches equally weighted.
310              
311             =head1 FEEDBACK
312              
313             =head2 Mailing Lists
314              
315             User feedback is an integral part of the evolution of this and other
316             Bioperl modules. Send your comments and suggestions preferably to one
317             of the Bioperl mailing lists. Your participation is much appreciated.
318              
319             bioperl-l@bioperl.org - General discussion
320             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
321              
322             =head2 Support
323              
324             Please direct usage questions or support issues to the mailing list:
325              
326             I
327              
328             rather than to the module maintainer directly. Many experienced and
329             reponsive experts will be able look at the problem and quickly
330             address it. Please include a thorough description of the problem
331             with code and data examples if at all possible.
332              
333             =head2 Reporting Bugs
334              
335             Report bugs to the Bioperl bug tracking system to help us keep track
336             the bugs and their resolution. Bug reports can be submitted via the
337             web:
338              
339             http://redmine.open-bio.org/projects/bioperl/
340              
341             =head1 AUTHOR - Peter Schattner
342              
343             Email schattner@alum.mit.edu
344              
345             =head1 CONTRIBUTORS
346              
347             Jason Stajich jason-AT-bioperl_DOT_org
348             Sendu Bala bix@sendu.me.uk
349              
350             =head1 APPENDIX
351              
352             The rest of the documentation details each of the object
353             methods. Internal methods are usually preceded with a _
354              
355             =cut
356              
357             #'
358              
359             package Bio::Tools::Run::Alignment::Clustalw;
360              
361 3     3   105882 use strict;
  3         5  
  3         74  
362 3     3   544 use Bio::Seq;
  3         43972  
  3         62  
363 3     3   1421 use Bio::SeqIO;
  3         31647  
  3         72  
364 3     3   668 use Bio::SimpleAlign;
  3         27019  
  3         60  
365 3     3   479 use Bio::AlignIO;
  3         1058  
  3         47  
366 3     3   421 use Bio::TreeIO;
  3         13732  
  3         46  
367 3     3   10 use Bio::Root::IO;
  3         1  
  3         49  
368              
369 3     3   8 use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
  3         3  
  3         3985  
370              
371             our @CLUSTALW_PARAMS = qw(output ktuple topdiags window pairgap fixedgap
372             floatgap matrix type transit dnamatrix outfile
373             gapopen gapext maxdiv gapdist hgapresidues pwmatrix
374             pwdnamatrix pwgapopen pwgapext score transweight
375             seed helixgap outorder strandgap loopgap terminalgap
376             helixendin helixendout strandendin strandendout program
377             reps outputtree seed bootlabels bootstrap);
378              
379             our @CLUSTALW_SWITCHES = qw(help check options negative noweights endgaps
380             nopgap nohgap novgap kimura tossgaps
381             kimura tossgaps njtree);
382             our @OTHER_SWITCHES = qw(quiet);
383             our $PROGRAM_NAME = 'clustalw';
384             our $PROGRAM_DIR = $ENV{'CLUSTALDIR'} || $ENV{'CLUSTALWDIR'};
385              
386              
387             =head2 program_name
388              
389             Title : program_name
390             Usage : $factory>program_name()
391             Function: holds the program name
392             Returns: string
393             Args : None
394              
395             =cut
396              
397             sub program_name {
398 7     7 1 329 return $PROGRAM_NAME;
399             }
400              
401             =head2 program_dir
402              
403             Title : program_dir
404             Usage : $factory->program_dir(@params)
405             Function: returns the program directory, obtained from ENV variable.
406             Returns: string
407             Args :
408              
409             =cut
410              
411             sub program_dir {
412 4     4 1 413 return $PROGRAM_DIR;
413             }
414              
415             sub new {
416 1     1 1 1758 my ($class,@args) = @_;
417 1         8 my $self = $class->SUPER::new(@args);
418              
419 1         44 $self->_set_from_args(\@args, -methods => [@CLUSTALW_PARAMS,
420             @CLUSTALW_SWITCHES,
421             @OTHER_SWITCHES],
422             -create => 1);
423            
424 1         12 return $self;
425             }
426              
427             =head2 version
428              
429             Title : version
430             Usage : exit if $prog->version() < 1.8
431             Function: Determine the version number of the program
432             Example :
433             Returns : float or undef
434             Args : none
435              
436             =cut
437              
438             sub version {
439 0     0 1   my ($self) = @_;
440              
441 0 0         return undef unless $self->executable;
442 0           my $prog = $self->executable;
443 0           my $string = `$prog -- 2>&1` ;
444 0           $string =~ /\(?([\d.]+)\)?/xms;
445 0   0       return $1 || undef;
446             }
447              
448             =head2 run
449              
450             Title : run
451             Usage : ($aln, $tree) = $factory->run($inputfilename);
452             ($aln, $tree) = $factory->run($seq_array_ref);
453             Function: Perform a multiple sequence alignment, generating a tree at the same
454             time. (Like align() and tree() combined.)
455             Returns : A SimpleAlign object containing the sequence alignment and a
456             Bio::Tree::Tree object with the tree relating the sequences.
457             Args : Name of a file containing a set of unaligned fasta sequences
458             or else an array of references to Bio::Seq objects.
459              
460             =cut
461              
462             sub run {
463 0     0 1   my ($self,$input) = @_;
464 0           my ($temp,$infilename, $seq);
465 0           my ($attr, $value, $switch);
466            
467 0           $self->io->_io_cleanup();
468             # Create input file pointer
469 0           $infilename = $self->_setinput($input);
470 0 0         $self->throw("Bad input data (sequences need an id) or less than 2 sequences in $input!") unless $infilename;
471            
472             # Create parameter string to pass to clustalw program
473 0           my $param_string = $self->_setparams();
474            
475             # run clustalw
476 0           return $self->_run('both', $infilename, $param_string);
477             }
478              
479             =head2 align
480              
481             Title : align
482             Usage : $inputfilename = 't/data/cysprot.fa';
483             $aln = $factory->align($inputfilename);
484             or
485             $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs
486             $aln = $factory->align($seq_array_ref);
487             Function: Perform a multiple sequence alignment
488             Returns : Reference to a SimpleAlign object containing the
489             sequence alignment.
490             Args : Name of a file containing a set of unaligned fasta sequences
491             or else an array of references to Bio::Seq objects.
492              
493             Throws an exception if argument is not either a string (eg a
494             filename) or a reference to an array of Bio::Seq objects. If
495             argument is string, throws exception if file corresponding to string
496             name can not be found. If argument is Bio::Seq array, throws
497             exception if less than two sequence objects are in array.
498              
499             =cut
500              
501             sub align {
502 0     0 1   my ($self,$input) = @_;
503            
504 0           $self->io->_io_cleanup();
505            
506             # Create input file pointer
507 0           my $infilename = $self->_setinput($input);
508 0 0         $self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !") unless $infilename;
509            
510             # Create parameter string to pass to clustalw program
511 0           my $param_string = $self->_setparams();
512            
513             # run clustalw
514 0           my $aln = $self->_run('align', $infilename, $param_string);
515             }
516              
517              
518             =head2 profile_align
519              
520             Title : profile_align
521             Usage : $aln = $factory->profile_align(@simple_aligns);
522             or
523             $aln = $factory->profile_align(@subalignment_filenames);
524             Function: Perform an alignment of 2 (sub)alignments
525             Returns : Reference to a SimpleAlign object containing the (super)alignment.
526             Args : Names of 2 files containing the subalignments
527             or references to 2 Bio::SimpleAlign objects.
528              
529             Throws an exception if arguments are not either strings (eg filenames)
530             or references to SimpleAlign objects.
531              
532             =cut
533              
534             sub profile_align {
535 0     0 1   my ($self,$input1,$input2) = @_;
536            
537 0           $self->io->_io_cleanup();
538            
539             # Create input file pointer
540 0           my $infilename1 = $self->_setinput($input1, 1);
541 0           my $infilename2 = $self->_setinput($input2, 2);
542 0 0 0       if (!$infilename1 || !$infilename2) {$self->throw("Bad input data: $input1 or $input2 !");}
  0            
543 0 0 0       unless ( -e $infilename1 and -e $infilename2) {$self->throw("Bad input file: $input1 or $input2 !");}
  0            
544            
545             # Create parameter string to pass to clustalw program
546 0           my $param_string = $self->_setparams();
547            
548             # run clustalw
549 0           my $aln = $self->_run('profile-aln', $infilename1, $infilename2, $param_string);
550             }
551              
552             =head2 add_sequences
553              
554             Title : add_sequences
555             Usage :
556             Function: Align and add sequences into an alignment
557             Example :
558             Returns : Reference to a SimpleAlign object containing the (super)alignment.
559             Args : Names of 2 files, the first one containing an alignment and the second one containing sequences to be added
560             or references to 2 Bio::SimpleAlign objects.
561              
562             Throws an exception if arguments are not either strings (eg filenames)
563             or references to SimpleAlign objects.
564              
565              
566             =cut
567              
568             sub add_sequences {
569              
570 0     0 1   my ($self,$input1,$input2) = @_;
571 0           my ($temp,$infilename1,$infilename2,$input,$seq);
572            
573 0           $self->io->_io_cleanup();
574             # Create input file pointer
575 0           $infilename1 = $self->_setinput($input1,1);
576 0           $infilename2 = $self->_setinput($input2,2);
577 0 0 0       if (!$infilename1 || !$infilename2) {$self->throw("Bad input data: $input1 or $input2 !");}
  0            
578 0 0 0       unless ( -e $infilename1 and -e $infilename2) {$self->throw("Bad input file: $input1 or $input2 !");}
  0            
579            
580            
581             # Create parameter string to pass to clustalw program
582 0           my $param_string = $self->_setparams();
583             # run clustalw
584 0           my $aln = $self->_run('add_sequences', $infilename1,
585             $infilename2, $param_string);
586              
587             }
588              
589             =head2 tree
590              
591             Title : tree
592             Usage : @params = ('bootstrap' => 1000,
593             'tossgaps' => 1,
594             'kimura' => 1,
595             'seed' => 121,
596             'bootlabels'=> 'nodes',
597             'quiet' => 1);
598             $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
599             $tree_obj = $factory->tree($aln_obj);
600             or
601             $tree_obj = $factory->tree($treefilename);
602             Function: Retrieve a tree corresponding to the input
603             Returns : Bio::TreeIO object
604             Args : Bio::SimpleAlign or filename of a tree
605              
606             =cut
607              
608             sub tree {
609 0     0 1   my ($self,$input) = @_;
610            
611 0           $self->io->_io_cleanup();
612            
613             # Create input file pointer
614 0           my $infilename = $self->_setinput($input);
615            
616 0 0         if (!$infilename) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");}
  0            
617            
618             # Create parameter string to pass to clustalw program
619 0           my $param_string = $self->_setparams();
620              
621             # run clustalw
622 0           my $tree = $self->_run('tree', $infilename, $param_string);
623             }
624              
625             =head2 footprint
626              
627             Title : footprint
628             Usage : @alns = $factory->footprint($treefilename, $window_size, $diff);
629             @alns = $factory->footprint($seqs_array_ref);
630             Function: Aligns all the supplied sequences and slices out of the alignment
631             those regions along a sliding window who's tree length differs
632             significantly from the total average tree length.
633             Returns : list of Bio::SimpleAlign objects
634             Args : first argument as per run(), optional second argument to specify
635             the size of the sliding window (default 5 bp) and optional third
636             argument to specify the % difference from the total tree length
637             needed for a window to be considered a footprint (default 33%).
638              
639             =cut
640              
641             sub footprint {
642 0     0 1   my ($self, $in, $slice_size, $deviate) = @_;
643            
644 0           my ($simplealn, $tree) = $self->run($in);
645            
646             # total tree length?
647 0           my $total_length = $tree->total_branch_length;
648            
649             # tree length along sliding window, picking regions that significantly
650             # deviate from the average tree length
651 0   0       $slice_size ||= 5;
652 0   0       $deviate ||= 33;
653 0           my $threshold = $total_length - (($total_length / 100) * $deviate);
654 0           my $length = $simplealn->length;
655 0           my $below = 0;
656 0           my $found_minima = 0;
657 0           my $minima = [$threshold, ''];
658 0           my @results;
659 0           for my $i (1..($length - $slice_size + 1)) {
660 0           my $slice = $simplealn->slice($i, ($i + $slice_size - 1), 1);
661 0           my $tree = $self->tree($slice);
662 0 0         $self->throw("No tree returned") unless defined $tree;
663 0           my $slice_length = $tree->total_branch_length;
664            
665 0 0         $slice_length <= $threshold ? ($below = 1) : ($below = 0);
666 0 0         if ($below) {
667 0 0         unless ($found_minima) {
668 0 0         if ($slice_length < ${$minima}[0]) {
  0            
669 0           $minima = [$slice_length, $slice];
670             }
671             else {
672 0           push(@results, ${$minima}[1]);
  0            
673 0           $minima = [$threshold, ''];
674 0           $found_minima = 1;
675             }
676             }
677             }
678             else {
679 0           $found_minima = 0;
680             }
681             }
682            
683 0           return @results;
684             }
685              
686             =head2 _run
687              
688             Title : _run
689             Usage : Internal function, not to be called directly
690             Function: makes actual system call to clustalw program
691             Returns : nothing; clustalw output is written to a
692             temporary file
693             Args : Name of a file containing a set of unaligned fasta sequences
694             and hash of parameters to be passed to clustalw
695              
696             =cut
697              
698             sub _run {
699 0     0     my ($self, $command, $infile1, $infile2, $param_string) = @_;
700            
701 0           my ($instring, $tree);
702 0   0       my $quiet = $self->quiet() || $self->verbose() < 0;
703            
704 0 0         if ($command =~ /align|both/) {
705 0 0         if ($^O eq 'dec_osf') {
706 0           $instring = $infile1;
707 0           $command = '';
708             }
709             else {
710 0           $instring = " -infile=". '"' . $infile1 . '"';
711             }
712 0           $param_string .= " $infile2";
713             }
714            
715 0 0         if ($command =~ /profile/) {
716 0           $instring = "-profile1=$infile1 -profile2=$infile2";
717 0           chmod 0777, $infile1, $infile2;
718 0           $command = '-profile';
719             }
720            
721 0 0         if ($command =~ /add_sequences/) {
722 0           $instring = "-profile1=$infile1 -profile2=$infile2";
723 0           chmod 0777, $infile1,$infile2;
724 0           $command = '-sequences';
725             }
726              
727 0 0         if ($command =~ /tree/) {
728 0 0         if( $^O eq 'dec_osf' ) {
729 0           $instring = $infile1;
730 0           $command = '';
731             }
732             else {
733 0           $instring = " -infile=". '"' . $infile1 . '"';
734             }
735 0           $param_string .= " $infile2";
736            
737 0           $self->debug( "Program ".$self->executable."\n");
738 0           my $commandstring = $self->executable."$instring"."$param_string";
739 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
740 0 0         $commandstring .= " 1>$null" if $quiet;
741 0           $self->debug( "clustal command = $commandstring");
742            
743 0           my $status = system($commandstring);
744 0 0         unless( $status == 0 ) {
745 0           $self->warn( "Clustalw call ($commandstring) crashed: $? \n");
746 0           return undef;
747             }
748            
749 0           return $self->_get_tree($infile1, $param_string);
750             }
751            
752 0   0       my $output = $self->output || 'gcg';
753 0           $self->debug( "Program ".$self->executable."\n");
754 0           my $commandstring = $self->executable." $command"." $instring"." -output=$output". " $param_string";
755 0           $self->debug( "clustal command = $commandstring\n");
756            
757 0 0         open(my $pipe, "$commandstring |") || $self->throw("ClustalW call ($commandstring) failed to start: $? | $!");
758 0           my $score;
759 0           while (<$pipe>) {
760 0 0         print unless $quiet;
761             # Kevin Brown suggested the following regex, though it matches multiple
762             # times: we pick up the last one
763 0 0         $score = $1 if ($_ =~ /Score:(\d+)/);
764             # This one is printed at the end and seems the most appropriate to pick
765             # up; we include the above regex incase 'Alignment Score' isn't given
766 0 0         $score = $1 if ($_ =~ /Alignment Score (-?\d+)/);
767             }
768 0 0         close($pipe) || ($self->throw("ClustalW call ($commandstring) crashed: $?"));
769            
770 0           my $outfile = $self->outfile();
771            
772             # retrieve alignment (Note: MSF format for AlignIO = GCG format of clustalw)
773 0 0         my $format = $output =~ /gcg/i ? 'msf' : $output;
774 0 0         if ($format =~ /clustal/i) {
775 0           $format = 'clustalw'; # force clustalw incase 'clustal' is requested
776             }
777 0           my $in = Bio::AlignIO->new(-file => $outfile, -format=> $format);
778 0           my $aln = $in->next_aln();
779 0           $in->close;
780 0           $aln->score($score);
781            
782 0 0         if ($command eq 'both') {
783 0           $tree = $self->_get_tree($infile1, $param_string);
784             }
785            
786             # Clean up the temporary files created along the way...
787             # Replace file suffix with dnd to find name of dendrogram file(s) to delete
788 0 0         unless ( $self->save_tempfiles ) {
789 0           foreach my $f ($infile1, $infile2) {
790 0           $f =~ s/\.[^\.]*$// ;
791 0 0         unlink $f .'.dnd' if ($f ne '');
792             }
793             }
794            
795 0 0         if ($command eq 'both') {
796 0           return ($aln, $tree);
797             }
798 0           return $aln;
799             }
800              
801             sub _get_tree {
802 0     0     my ($self, $treefile, $param_string) = @_;
803            
804 0           $treefile =~ s/\.[^\.]*$// ;
805            
806 0 0         if ($param_string =~ /-bootstrap/) {
    0          
807 0           $treefile .= '.phb';
808             }
809             elsif ($param_string =~ /-tree/) {
810 0           $treefile .= '.ph';
811             }
812             else {
813 0           $treefile .= '.dnd';
814             }
815            
816 0           my $in = Bio::TreeIO->new('-file' => $treefile,
817             '-format'=> 'newick');
818            
819 0           my $tree = $in->next_tree;
820 0 0         unless ( $self->save_tempfiles ) {
821 0           foreach my $f ( $treefile ) {
822 0 0         unlink $f if( $f ne '' );
823             }
824             }
825            
826 0           return $tree;
827             }
828              
829             =head2 _setinput()
830              
831             Title : _setinput
832             Usage : Internal function, not to be called directly
833             Function: Create input file for clustalw program
834             Returns : name of file containing clustalw data input
835             Args : Seq or Align object reference or input file name
836              
837             =cut
838              
839             sub _setinput {
840 0     0     my ($self, $input, $suffix) = @_;
841 0           my ($infilename, $seq, $temp, $tfh);
842            
843             # suffix is used to distinguish alignment files If $input is not a
844             # reference it better be the name of a file with the sequence/
845            
846             # alignment data...
847 0 0         unless (ref $input) {
848             # check that file exists or throw
849 0           $infilename = $input;
850 0 0         return unless -e $input;
851 0           return $infilename;
852             }
853            
854             # $input may be an array of BioSeq objects...
855 0 0 0       if (ref($input) eq "ARRAY") {
    0 0        
    0          
856             # Open temporary file for both reading & writing of BioSeq array
857 0           ($tfh,$infilename) = $self->io->tempfile(-dir=>$self->tempdir);
858 0           $temp = Bio::SeqIO->new('-fh'=>$tfh, '-format' =>'Fasta');
859            
860             # Need at least 2 seqs for alignment
861 0 0         return unless (scalar(@$input) > 1);
862            
863 0           foreach $seq (@$input) {
864 0 0 0       return unless (defined $seq && $seq->isa("Bio::PrimarySeqI") and $seq->id());
      0        
865 0           $temp->write_seq($seq);
866             }
867 0           $temp->close();
868 0           close($tfh);
869 0           undef $tfh;
870 0           return $infilename;
871             }
872            
873             # $input may be a SimpleAlign object.
874             elsif (ref($input) eq "Bio::SimpleAlign") {
875             # Open temporary file for both reading & writing of SimpleAlign object
876 0           ($tfh,$infilename) = $self->io->tempfile(-dir=>$self->tempdir);
877 0           $temp = Bio::AlignIO->new('-fh'=> $tfh, '-format' => 'fasta');
878 0           $temp->write_aln($input);
879 0           close($tfh);
880 0           undef $tfh;
881 0           return $infilename;
882             }
883            
884             # or $input may be a single BioSeq object (to be added to a previous alignment)
885             elsif (ref($input) && $input->isa("Bio::PrimarySeqI") && $suffix==2) {
886             # Open temporary file for both reading & writing of BioSeq object
887 0           ($tfh,$infilename) = $self->io->tempfile();
888 0           $temp = Bio::SeqIO->new(-fh=> $tfh, '-format' =>'Fasta');
889 0           $temp->write_seq($input);
890 0           close($tfh);
891 0           undef $tfh;
892 0           return $infilename;
893             }
894            
895 0           return;
896             }
897              
898             =head2 _setparams()
899              
900             Title : _setparams
901             Usage : Internal function, not to be called directly
902             Function: Create parameter inputs for clustalw program
903             Returns : parameter string to be passed to clustalw
904             during align or profile_align
905             Args : name of calling object
906              
907             =cut
908              
909             sub _setparams {
910 0     0     my $self = shift;
911            
912 0           my $param_string = $self->SUPER::_setparams(-params => \@CLUSTALW_PARAMS,
913             -switches => \@CLUSTALW_SWITCHES,
914             -dash => 1,
915             -lc => 1,
916             -join => '=');
917            
918             # Set default output file if no explicit output file selected
919 0 0         unless ($param_string =~ /outfile/) {
920 0           my ($tfh, $outfile) = $self->io->tempfile(-dir => $self->tempdir());
921 0           close($tfh);
922 0           undef $tfh;
923 0           $self->outfile($outfile);
924 0           $param_string .= " -outfile=\"$outfile\"" ;
925             }
926            
927 0           $param_string .= ' 2>&1';
928            
929 0           return $param_string;
930             }
931              
932             1;