File Coverage

blib/lib/Bio/Tools/Run/Alignment/Muscle.pm
Criterion Covered Total %
statement 36 172 20.9
branch 4 82 4.8
condition 2 19 10.5
subroutine 11 19 57.8
pod 9 9 100.0
total 62 301 20.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Run::Alignment::Muscle
3             #
4             # Please direct questions and support issues to
5             #
6             # Copyright Jason Stajich
7             #
8             # You may distribute this module under the same terms as perl itself
9             #
10             # POD documentation - main docs before the code
11              
12             =head1 NAME
13              
14             Bio::Tools::Run::Alignment::Muscle - Object for the calculation of an
15             iterative multiple sequence alignment from a set of unaligned
16             sequences or alignments using the MUSCLE program
17              
18             =head1 SYNOPSIS
19              
20             # Build a muscle alignment factory
21             $factory = Bio::Tools::Run::Alignment::Muscle->new(@params);
22              
23             # Pass the factory a list of sequences to be aligned.
24             $inputfilename = 't/cysprot.fa';
25             # $aln is a SimpleAlign object.
26             $aln = $factory->align($inputfilename);
27              
28             # or where @seq_array is an array of Bio::Seq objects
29             $seq_array_ref = \@seq_array;
30             $aln = $factory->align($seq_array_ref);
31              
32             # Or one can pass the factory a pair of (sub)alignments
33             #to be aligned against each other, e.g.:
34              
35             #There are various additional options and input formats available.
36             #See the DESCRIPTION section that follows for additional details.
37              
38             =head1 DESCRIPTION
39              
40             Muscle is Robert Edgar's progressive alignment program. You can get
41             it and see information about it at this URL
42             http://www.drive5.com/muscle
43              
44             It is recommended you use at least version 3.6. Behaviour with earlier versions
45             is questionable.
46              
47             =head2 Helping the module find your executable
48              
49             You will need to enable Muscle to find the muscle program. This can be
50             done in (at least) three ways:
51              
52             1. Make sure the muscle executable is in your path (i.e.
53             'which muscle' returns a valid program
54             2. define an environmental variable MUSCLEDIR which points to a
55             directory containing the 'muscle' app:
56             In bash
57             export MUSCLEDIR=/home/progs/muscle or
58             In csh/tcsh
59             setenv MUSCLEDIR /home/progs/muscle
60              
61             3. include a definition of an environmental variable MUSCLEDIR
62             in every script that will
63             BEGIN {$ENV{MUSCLEDIR} = '/home/progs/muscle'; }
64             use Bio::Tools::Run::Alignment::Muscle;
65              
66             =head1 FEEDBACK
67              
68             =head2 Mailing Lists
69              
70             User feedback is an integral part of the evolution of this and other
71             Bioperl modules. Send your comments and suggestions preferably to one
72             of the Bioperl mailing lists. Your participation is much appreciated.
73              
74             bioperl-l@bioperl.org - General discussion
75             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76              
77             =head2 Support
78              
79             Please direct usage questions or support issues to the mailing list:
80              
81             I
82              
83             rather than to the module maintainer directly. Many experienced and
84             reponsive experts will be able look at the problem and quickly
85             address it. Please include a thorough description of the problem
86             with code and data examples if at all possible.
87              
88             =head2 Reporting Bugs
89              
90             Report bugs to the Bioperl bug tracking system to help us keep track
91             the bugs and their resolution. Bug reports can be submitted via the web:
92              
93             http://redmine.open-bio.org/projects/bioperl/
94              
95             =head1 AUTHOR - Jason Stajich
96              
97             Email jason-at-bioperl-dot-org
98              
99             =head1 APPENDIX
100              
101             The rest of the documentation details each of the object
102             methods. Internal methods are usually preceded with a _
103              
104             =cut
105              
106             package Bio::Tools::Run::Alignment::Muscle;
107              
108 1     1   104152 use strict;
  1         2  
  1         21  
109 1     1   548 use Bio::Seq;
  1         44573  
  1         26  
110 1     1   512 use Bio::SeqIO;
  1         19505  
  1         44  
111 1     1   661 use Bio::SimpleAlign;
  1         27570  
  1         50  
112 1     1   453 use Bio::AlignIO;
  1         1052  
  1         22  
113 1     1   5 use Bio::Root::IO;
  1         1  
  1         17  
114              
115 1     1   3 use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
  1         1  
  1         458  
116              
117             our %DEFAULTS = ( 'AFORMAT' => 'fasta' );
118             our @MUSCLE_PARAMS = qw(in out tree1 log loga scorefile gapopen seqtype
119             maxmb maxhours maxiters kband in1 in2 usetree usetree_nowarn
120             weight1 weight2 smoothwindow SUEFF smoothscoreceil root1 root2
121             refinewindow physout phyiout objscore minsmoothscore minbestcolscore
122             hydrofactor hydro anchorspacing center cluster1 cluster2 clwout diagbreak
123             diaglength diagmargin distance1 distance2);
124             our @MUSCLE_SWITCHES = qw(quiet verbose diags refine stable group
125             clw clwstrict msf brenner cluster dimer fasta html le anchors noanchors
126             phyi phys profile refinew sp spscore spn sv);
127             our $PROGRAM_NAME = 'muscle';
128             our $PROGRAM_DIR = Bio::Root::IO->catfile($ENV{MUSCLEDIR}) if $ENV{MUSCLEDIR};
129              
130             =head2 new
131              
132             Title : new
133             Usage : my $muscle = Bio::Tools::Run::Alignment::Muscle->new();
134             Function: Constructor
135             Returns : Bio::Tools::Run::Alignment::Muscle
136             Args : -outfile_name => $outname
137              
138             =cut
139              
140             sub new {
141 1     1 1 76 my ( $class, @args ) = @_;
142 1         10 my $self = $class->SUPER::new(@args);
143              
144 1         46 $self->aformat( $DEFAULTS{'AFORMAT'} );
145              
146 1         25 $self->_set_from_args(
147             \@args,
148             -methods => [ @MUSCLE_PARAMS, @MUSCLE_SWITCHES ],
149             -create => 1
150             );
151              
152 1         17 my ($out) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME)], @args );
153              
154 1   50     11 $self->outfile_name( $out || '' );
155              
156 1 50       18 $self->aformat('msf') if $self->msf;
157 1 50 33     31 $self->aformat('clustalw') if $self->clw || $self->clwstrict;
158              
159 1 50       46 if ( defined $self->out ) {
160 0         0 $self->outfile_name( $self->out );
161             }
162              
163 1         9 return $self;
164             }
165              
166             =head2 program_name
167              
168             Title : program_name
169             Usage : $factory->program_name()
170             Function: holds the program name
171             Returns: string
172             Args : None
173              
174             =cut
175              
176             sub program_name {
177 6     6 1 23 return $PROGRAM_NAME;
178             }
179              
180             =head2 program_dir
181              
182             Title : program_dir
183             Usage : $factory->program_dir(@params)
184             Function: returns the program directory, obtained from ENV variable.
185             Returns: string
186             Args :
187              
188             =cut
189              
190             sub program_dir {
191 3     3 1 10 return $PROGRAM_DIR;
192             }
193              
194             =head2 error_string
195              
196             Title : error_string
197             Usage : $obj->error_string($newval)
198             Function: Where the output from the last analysus run is stored.
199             Returns : value of error_string
200             Args : newvalue (optional)
201              
202             =cut
203              
204             sub error_string {
205 0     0 1 0 my ( $self, $value ) = @_;
206 0 0       0 if ( defined $value ) {
207 0         0 $self->{'error_string'} = $value;
208             }
209 0         0 return $self->{'error_string'};
210             }
211              
212             =head2 version
213              
214             Title : version
215             Usage : exit if $prog->version() < 1.8
216             Function: Determine the version number of the program
217             Example :
218             Returns : float or undef
219             Args : none
220              
221             =cut
222              
223             sub version {
224 0     0 1 0 my ($self) = @_;
225 0         0 my $exe;
226 0 0       0 return undef unless $exe = $self->executable;
227 0         0 my $string = `$exe 2>&1`;
228              
229 0         0 $string =~ /MUSCLE\s+v(\d+\.\d+)/m;
230 0   0     0 return $1 || undef;
231             }
232              
233             =head2 run
234              
235             Title : run
236             Usage : my $output = $application->run(\@seqs);
237             Function: Generic run of an application
238             Returns : Bio::SimpleAlign object
239             Args : Arrayref of Bio::PrimarySeqI objects or
240             a filename to run on
241              
242             =cut
243              
244             sub run {
245 0     0 1 0 my $self = shift;
246 0         0 return $self->align(shift);
247             }
248              
249             =head2 align
250              
251             Title : align
252             Usage : $inputfilename = 't/data/cysprot.fa';
253             $aln = $factory->align($inputfilename);
254             or
255             $seq_array_ref = \@seq_array;
256             $aln = $factory->align($seq_array_ref);
257             Function: Perform a multiple sequence alignment
258             Returns : Reference to a SimpleAlign object containing the
259             sequence alignment.
260             Args : Name of a file containing a set of unaligned fasta sequences
261             or else an array of references to Bio::Seq objects.
262              
263             Throws an exception if argument is not either a string (e.g. a
264             filename) or a reference to an array of Bio::Seq objects. If
265             argument is string, throws exception if file corresponding to string
266             name can not be found. If argument is Bio::Seq array, throws
267             exception if less than two sequence objects are in array.
268              
269             =cut
270              
271             sub align {
272 0     0 1 0 my ( $self, $input ) = @_;
273              
274             # Create input file pointer
275 0         0 $self->io->_io_cleanup();
276 0         0 my $infilename;
277 0 0       0 if ( defined $input ) {
    0          
278 0         0 $infilename = $self->_setinput($input);
279             }
280             elsif ( defined $self->in ) {
281 0         0 $infilename = $self->_setinput( $self->in );
282             }
283             else {
284 0         0 $self->throw("No inputdata provided\n");
285             }
286 0 0       0 if ( !$infilename ) {
287 0         0 $self->throw("Bad input data or less than 2 sequences in $input !");
288             }
289              
290 0         0 my $param_string = $self->_setparams();
291              
292             # run muscle
293 0         0 return &_run( $self, $infilename, $param_string );
294             }
295              
296             =head2 profile
297              
298             Title : profile
299             Usage : $alnfilename = /t/data/cysprot.msa';
300             $seqsfilename = 't/data/cysprot.fa';
301             $aln = $factory->profile($alnfilename,$seqsfilename);
302              
303             Function: Perform a profile alignment on a MSA to include more seqs
304             Returns : Reference to a SimpleAlign object containing the
305             sequence alignment.
306             Args : Name of a file containing the fasta MSA and name of a file
307             containing a set of unaligned fasta sequences
308             Comments: This only works for muscle version 3.52.
309             Some early versions of the 3.6 sources had a bug that
310             caused a segfault with -profile. The attached should fix
311             it, if not let Bob Edgar know.
312              
313             =cut
314              
315             sub profile {
316 0     0 1 0 my ( $self, $alnfilename, $seqsfilename ) = @_;
317              
318             # Create input file pointer
319 0         0 $self->io->_io_cleanup();
320 0 0       0 if ( $self->version ne '3.52' ) {
321 0         0 $self->throw("profile does not work for this version of muscle\n");
322             }
323 0         0 my $infilename;
324 0 0       0 if ( defined $alnfilename ) {
325 0 0       0 if ( !ref $alnfilename ) {
326              
327             # check that file exists or throw
328 0         0 $infilename = $alnfilename;
329 0 0       0 unless ( -e $infilename ) { return 0; }
  0         0  
330              
331             # let's peek and guess
332 0 0       0 open( IN, $infilename ) || $self->throw("Cannot open $infilename");
333 0         0 my $header;
334 0         0 while ( defined( $header = ) ) {
335 0 0       0 last if $header !~ /^\s+$/;
336             }
337 0         0 close(IN);
338 0 0       0 if ( $header !~ /^>\s*\S+/ ) {
339 0         0 $self->throw(
340             "Need to provide a FASTA format file to muscle profile!");
341             }
342             }
343             }
344             else {
345 0         0 $self->throw("No inputdata provided\n");
346             }
347 0 0       0 if ( !$infilename ) {
348 0         0 $self->throw(
349             "Bad input data or less than 2 sequences in $infilename !");
350             }
351 0 0       0 if ( defined $seqsfilename ) {
352 0 0       0 if ( !ref $seqsfilename ) {
353              
354             # check that file exists or throw
355 0         0 $infilename = $seqsfilename;
356 0 0       0 unless ( -e $infilename ) { return 0; }
  0         0  
357              
358             # let's peek and guess
359 0 0       0 open( IN, $infilename ) || $self->throw("Cannot open $infilename");
360 0         0 my $header;
361 0         0 while ( defined( $header = ) ) {
362 0 0       0 last if $header !~ /^\s+$/;
363             }
364 0         0 close(IN);
365 0 0       0 if ( $header !~ /^>\s*\S+/ ) {
366 0         0 $self->throw(
367             "Need to provide a FASTA format file to muscle profile!");
368             }
369             }
370             }
371             else {
372 0         0 $self->throw("No inputdata provided\n");
373             }
374 0 0       0 if ( !$infilename ) {
375 0         0 $self->throw(
376             "Bad input data or less than 2 sequences in $infilename !");
377             }
378              
379 0         0 my $param_string = $self->_setparams();
380              
381             # run muscle
382 0         0 $self->{_profile} = 1;
383 0         0 return &_run( $self, "$alnfilename -in2 $seqsfilename", $param_string );
384             }
385              
386             =head2 aformat
387              
388             Title : aformat
389             Usage : my $alignmentformat = $self->aformat();
390             Function: Get/Set alignment format
391             Returns : string
392             Args : string
393              
394             =cut
395              
396             sub aformat {
397 1     1 1 2 my $self = shift;
398 1 50       6 $self->{'_aformat'} = shift if @_;
399 1         3 return $self->{'_aformat'};
400             }
401              
402             =head2 _run
403              
404             Title : _run
405             Usage : Internal function, not to be called directly
406             Function: makes actual system call to muscle program
407             Example :
408             Returns : nothing; muscle output is written to a
409             temporary file OR specified output file
410             Args : Name of a file containing a set of unaligned fasta sequences
411             and hash of parameters to be passed to muscle
412              
413              
414             =cut
415              
416             sub _run {
417 0     0     my ( $self, $infilename, $params ) = @_;
418 0           my $commandstring;
419 0 0         if ( $self->{_profile} ) {
420 0           $commandstring =
421             $self->executable . " -profile -in1 $infilename $params";
422 0           $self->{_profile} = 0;
423             }
424             else {
425 0           $commandstring = $self->executable . " -in $infilename $params";
426             }
427              
428 0           $self->debug("muscle command = $commandstring \n");
429              
430 0           my $status = system($commandstring);
431 0           my $outfile = $self->outfile_name();
432 0 0 0       if ( !-e $outfile || -z $outfile ) {
433 0           $self->warn("Muscle call crashed: $? [command $commandstring]\n");
434 0           return undef;
435             }
436              
437 0           my $in = Bio::AlignIO->new(
438             '-file' => $outfile,
439             '-format' => $self->aformat
440             );
441 0           my $aln = $in->next_aln();
442 0           return $aln;
443             }
444              
445             =head2 _setinput
446              
447             Title : _setinput
448             Usage : Internal function, not to be called directly
449             Function: Create input file for muscle program
450             Example :
451             Returns : name of file containing muscle data input AND
452             Args : Arrayref of Seqs or input file name
453              
454             =cut
455              
456             sub _setinput {
457 0     0     my ( $self, $input ) = @_;
458 0           my ( $infilename, $seq, $temp, $tfh );
459 0 0         if ( !ref $input ) {
    0          
460              
461             # check that file exists or throw
462 0           $infilename = $input;
463 0 0         unless ( -e $input ) { return 0; }
  0            
464              
465             # let's peek and guess
466 0 0         open( IN, $infilename ) || $self->throw("Cannot open $infilename");
467 0           my $header;
468 0           while ( defined( $header = ) ) {
469 0 0         last if $header !~ /^\s+$/;
470             }
471 0           close(IN);
472 0 0         if ( $header !~ /^>\s*\S+/ ) {
473 0           $self->throw("Need to provide a FASTA format file to muscle!");
474             }
475 0           return ($infilename);
476             }
477             elsif ( ref($input) =~ /ARRAY/i ) {
478              
479             # $input may be an array of BioSeq objects...
480             # Open temporary file for both reading & writing of array
481 0           ( $tfh, $infilename ) = $self->io->tempfile();
482 0 0         if ( !ref( $input->[0] ) ) {
    0          
483 0           $self->warn(
484             "passed an array ref which did not contain objects to _setinput"
485             );
486 0           return undef;
487             }
488             elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) {
489 0           $temp = Bio::SeqIO->new(
490             '-fh' => $tfh,
491             '-format' => 'fasta'
492             );
493 0           my $ct = 1;
494 0           foreach $seq (@$input) {
495 0 0 0       return 0 unless ( ref($seq)
496             && $seq->isa("Bio::PrimarySeqI") );
497 0 0 0       if ( !defined $seq->display_id
498             || $seq->display_id =~ /^\s+$/ )
499             {
500 0           $seq->display_id( "Seq" . $ct++ );
501             }
502 0           $temp->write_seq($seq);
503             }
504 0           $temp->close();
505 0           undef $temp;
506 0           close($tfh);
507 0           $tfh = undef;
508             }
509             else {
510 0           $self->warn( "got an array ref with 1st entry "
511             . $input->[0]
512             . " and don't know what to do with it\n" );
513             }
514 0           return ($infilename);
515             }
516             else {
517 0           $self->warn("Got $input and don't know what to do with it\n");
518             }
519 0           return 0;
520             }
521              
522             =head2 _setparams
523              
524             Title : _setparams
525             Usage : Internal function, not to be called directly
526             Function: Create parameter inputs for muscle program
527             Example :
528             Returns : parameter string to be passed to muscle
529             during align or profile_align
530             Args : name of calling object
531              
532             =cut
533              
534             sub _setparams {
535 0     0     my ($self) = @_;
536 0           my ( $attr, $value, $param_string );
537 0           $param_string = '';
538 0           my $laststr;
539              
540 0           for $attr (@MUSCLE_PARAMS) {
541 0           $value = $self->$attr();
542 0 0         next unless ( defined $value );
543 0           my $attr_key = lc $attr;
544 0           $attr_key = ' -' . $attr_key;
545 0           $param_string .= $attr_key . ' ' . $value;
546             }
547 0           for $attr (@MUSCLE_SWITCHES) {
548 0           $value = $self->$attr();
549 0 0         next unless ($value);
550 0           my $attr_key = lc $attr; # put switches in format expected by tcoffee
551 0           $attr_key = ' -' . $attr_key;
552 0           $param_string .= $attr_key;
553             }
554              
555             # Set default output file if no explicit output file selected
556 0 0         unless ( $self->outfile_name ) {
557 0           my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $self->tempdir() );
558 0           close($tfh);
559 0           undef $tfh;
560 0           $self->outfile_name($outfile);
561             }
562 0           $param_string .= " -out " . $self->outfile_name;
563              
564 0 0 0       if ( $self->quiet() || $self->verbose < 0 ) {
565 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
566 0           $param_string .= " 2> $null";
567             }
568 0           return $param_string;
569             }
570              
571             =head1 Bio::Tools::Run::BaseWrapper methods
572              
573             =cut
574              
575             =head2 no_param_checks
576              
577             Title : no_param_checks
578             Usage : $obj->no_param_checks($newval)
579             Function: Boolean flag as to whether or not we should
580             trust the sanity checks for parameter values
581             Returns : value of no_param_checks
582             Args : newvalue (optional)
583              
584             =cut
585              
586             =head2 save_tempfiles
587              
588             Title : save_tempfiles
589             Usage : $obj->save_tempfiles($newval)
590             Function:
591             Returns : value of save_tempfiles
592             Args : newvalue (optional)
593              
594             =cut
595              
596             =head2 outfile_name
597              
598             Title : outfile_name
599             Usage : my $outfile = $muscle->outfile_name();
600             Function: Get/Set the name of the output file for this run
601             (if you wanted to do something special)
602             Returns : string
603             Args : [optional] string to set value to
604              
605             =cut
606              
607             =head2 tempdir
608              
609             Title : tempdir
610             Usage : my $tmpdir = $self->tempdir();
611             Function: Retrieve a temporary directory name (which is created)
612             Returns : string which is the name of the temporary directory
613             Args : none
614              
615             =cut
616              
617             =head2 cleanup
618              
619             Title : cleanup
620             Usage : $muscle->cleanup();
621             Function: Will cleanup the tempdir directory
622             Returns : none
623             Args : none
624              
625             =cut
626              
627             =head2 io
628              
629             Title : io
630             Usage : $obj->io($newval)
631             Function: Gets a L object
632             Returns : L
633             Args : none
634              
635             =cut
636              
637             1; # Needed to keep compiler happy
638              
639             __END__