File Coverage

blib/lib/Bio/Tools/Run/Alignment/TCoffee.pm
Criterion Covered Total %
statement 62 252 24.6
branch 12 102 11.7
condition 0 41 0.0
subroutine 16 25 64.0
pod 10 10 100.0
total 100 430 23.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Run::Alignment::TCoffee
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich, Peter Schattner
7             #
8             # Copyright Jason Stajich, Peter Schattner
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Run::Alignment::TCoffee - Object for the calculation of a
17             multiple sequence alignment from a set of unaligned sequences or
18             alignments using the TCoffee program
19              
20             =head1 SYNOPSIS
21              
22             # Build a tcoffee alignment factory
23             @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
24             $factory = Bio::Tools::Run::Alignment::TCoffee->new(@params);
25              
26             # Pass the factory a list of sequences to be aligned.
27             $inputfilename = 't/cysprot.fa';
28             # $aln is a SimpleAlign object.
29             $aln = $factory->align($inputfilename);
30              
31             # or where @seq_array is an array of Bio::Seq objects
32             $seq_array_ref = \@seq_array;
33             $aln = $factory->align($seq_array_ref);
34              
35             # Or one can pass the factory a pair of (sub)alignments
36             #to be aligned against each other, e.g.:
37              
38             # where $aln1 and $aln2 are Bio::SimpleAlign objects.
39             $aln = $factory->profile_align($aln1,$aln2);
40              
41             # Or one can pass the factory an alignment and one or more
42             # unaligned sequences to be added to the alignment. For example:
43              
44             # $seq is a Bio::Seq object.
45             $aln = $factory->profile_align($aln1,$seq);
46              
47             #There are various additional options and input formats available.
48             #See the DESCRIPTION section that follows for additional details.
49              
50             =head1 DESCRIPTION
51              
52             Note: this DESCRIPTION only documents the (Bio)perl interface to
53             TCoffee.
54              
55             =head2 Helping the module find your executable
56              
57             You will need to enable TCoffee to find the t_coffee program. This
58             can be done in (at least) three ways:
59              
60             1. Make sure the t_coffee executable is in your path so that
61             which t_coffee returns a t_coffee executable on your system.
62              
63             2. Define an environmental variable TCOFFEEDIR which is a dir
64             which contains the 't_coffee' app:
65             In bash
66             export TCOFFEEDIR=/home/username/progs/T-COFFEE_distribution_Version_1.37/bin
67             In csh/tcsh
68             setenv TCOFFEEDIR /home/username/progs/T-COFFEE_distribution_Version_1.37/bin
69              
70             3. Include a definition of an environmental variable TCOFFEEDIR in
71             every script that will use this TCoffee wrapper module.
72             BEGIN { $ENV{TCOFFEDIR} = '/home/username/progs/T-COFFEE_distribution_Version_1.37/bin' }
73             use Bio::Tools::Run::Alignment::TCoffee;
74              
75             If you are running an application on a webserver make sure the
76             webserver environment has the proper PATH set or use the options 2 or
77             3 to set the variables.
78              
79             =head1 PARAMETERS FOR ALIGNMENT COMPUTATION
80              
81             There are a number of possible parameters one can pass in TCoffee.
82             One should really read the online manual for the best explanation of
83             all the features. See
84             http://igs-server.cnrs-mrs.fr/~cnotred/Documentation/t_coffee/t_coffee_doc.html
85              
86             These can be specified as parameters when instantiating a new TCoffee
87             object, or through get/set methods of the same name (lowercase).
88              
89             =head2 IN
90              
91             Title : IN
92             Description : (optional) input filename, this is specified when
93             align so should not use this directly unless one
94             understand TCoffee program very well.
95              
96             =head2 TYPE
97              
98             Title : TYPE
99             Args : [string] DNA, PROTEIN
100             Description : (optional) set the sequence type, guessed automatically
101             so should not use this directly
102              
103             =head2 PARAMETERS
104              
105             Title : PARAMETERS
106             Description : (optional) Indicates a file containing extra parameters
107              
108             =head2 EXTEND
109              
110             Title : EXTEND
111             Args : 0, 1, or positive value
112             Default : 1
113             Description : Flag indicating that library extension should be
114             carried out when performing multiple alignments, if set
115             to 0 then extension is not made, if set to 1 extension
116             is made on all pairs in the library. If extension is
117             set to another positive value, the extension is only
118             carried out on pairs having a weigth value superior to
119             the specified limit.
120              
121             =head2 DP_NORMALISE
122              
123             Title : DP_NORMALISE
124             Args : 0 or positive value
125             Default : 1000
126             Description : When using a value different from 0, this flag sets the
127             score of the highest scoring pair to 1000.
128              
129             =head2 DP_MODE
130              
131             Title : DP_MODE
132             Args : [string] gotoh_pair_wise, myers_miller_pair_wise,
133             fasta_pair_wise cfasta_pair_wise
134             Default : cfast_fair_wise
135             Description : Indicates the type of dynamic programming used by
136             the program
137              
138             gotoh_pair_wise : implementation of the gotoh algorithm
139             (quadratic in memory and time)
140              
141             myers_miller_pair_wise : implementation of the Myers and Miller
142             dynamic programming algorithm ( quadratic in time and linear in
143             space). This algorithm is recommended for very long sequences. It
144             is about 2 time slower than gotoh. It only accepts tg_mode=1.
145              
146             fasta_pair_wise: implementation of the fasta algorithm. The
147             sequence is hashed, looking for ktuples words. Dynamic programming
148             is only carried out on the ndiag best scoring diagonals. This is
149             much faster but less accurate than the two previous.
150              
151             cfasta_pair_wise : c stands for checked. It is the same
152             algorithm. The dynamic programming is made on the ndiag best
153             diagonals, and then on the 2*ndiags, and so on until the scores
154             converge. Complexity will depend on the level of divergence of the
155             sequences, but will usually be L*log(L), with an accuracy
156             comparable to the two first mode ( this was checked on BaliBase).
157              
158             =head2 KTUPLE
159              
160             Title : KTUPLE
161             Args : numeric value
162             Default : 1 or 2 (1 for protein, 2 for DNA )
163              
164             Description : Indicates the ktuple size for cfasta_pair_wise dp_mode
165             and fasta_pair_wise. It is set to 1 for proteins, and 2
166             for DNA. The alphabet used for protein is not the 20
167             letter code, but a mildly degenerated version, where
168             some residues are grouped under one letter, based on
169             physicochemical properties:
170             rk, de, qh, vilm, fy (the other residues are
171             not degenerated).
172              
173             =head2 NDIAGS
174              
175             Title : NDIAGS
176             Args : numeric value
177             Default : 0
178             Description : Indicates the number of diagonals used by the
179             fasta_pair_wise algorithm. When set to 0,
180             n_diag=Log (length of the smallest sequence)
181              
182             =head2 DIAG_MODE
183              
184             Title : DIAG_MODE
185             Args : numeric value
186             Default : 0
187              
188              
189             Description : Indicates the manner in which diagonals are scored
190             during the fasta hashing.
191              
192             0 indicates that the score of a diagonal is equal to the
193             sum of the scores of the exact matches it contains.
194              
195              
196             1 indicates that this score is set equal to the score of
197             the best uninterrupted segment
198              
199             1 can be useful when dealing with fragments of sequences.
200              
201             =head2 SIM_MATRIX
202              
203             Title : SIM_MATRIX
204             Args : string
205             Default : vasiliky
206             Description : Indicates the manner in which the amino acid is being
207             degenerated when hashing. All the substitution matrix
208             are acceptable. Categories will be defined as sub-group
209             of residues all having a positive substitution score
210             (they can overlap).
211              
212             If you wish to keep the non degenerated amino acid
213             alphabet, use 'idmat'
214              
215             =head2 MATRIX
216              
217             Title : MATRIX
218             Args :
219             Default :
220             Description : This flag is provided for compatibility with
221             ClustalW. Setting matrix = 'blosum' is equivalent to
222             -in=Xblosum62mt , -matrix=pam is equivalent to
223             in=Xpam250mt . Apart from this, the rules are similar
224             to those applying when declaring a matrix with the
225             -in=X fl
226              
227             =head2 GAPOPEN
228              
229             Title : GAPOPEN
230             Args : numeric
231             Default : 0
232             Description : Indicates the penalty applied for opening a gap. The
233             penalty must be negative. If you provide a positive
234             value, it will automatically be turned into a negative
235             number. We recommend a value of 10 with pam matrices,
236             and a value of 0 when a library is used.
237              
238             =head2 GAPEXT
239              
240             Title : GAPEXT
241             Args : numeric
242             Default : 0
243             Description : Indicates the penalty applied for extending a gap.
244              
245              
246             =head2 COSMETIC_PENALTY
247              
248             Title : COSMETIC_PENALTY
249             Args : numeric
250             Default : 100
251             Description : Indicates the penalty applied for opening a gap. This
252             penalty is set to a very low value. It will only have
253             an influence on the portions of the alignment that are
254             unalignable. It will not make them more correct, but
255             only more pleasing to the eye ( i.e. Avoid stretches of
256             lonely residues).
257              
258             The cosmetic penalty is automatically turned off if a
259             substitution matrix is used rather than a library.
260              
261             =head2 TG_MODE
262              
263             Title : TG_MODE
264             Args : 0,1,2
265             Default : 1
266             Description : (Terminal Gaps)
267             0: indicates that terminal gaps must be panelized with
268             a gapopen and a gapext penalty.
269             1: indicates that terminal gaps must be penalized only
270             with a gapext penalty
271             2: indicates that terminal gaps must not be penalized.
272              
273             =head2 WEIGHT
274              
275             Title : WEIGHT
276             Args : sim or sim_ or integer value
277             Default : sim
278              
279              
280             Description : Weight defines the way alignments are weighted when
281             turned into a library.
282              
283             sim indicates that the weight equals the average
284             identity within the match residues.
285              
286             sim_matrix_name indicates the average identity with two
287             residues regarded as identical when their
288             substitution value is positive. The valid matrices
289             names are in matrices.h (pam250mt) . Matrices not
290             found in this header are considered to be
291             filenames. See the format section for matrices. For
292             instance, -weight=sim_pam250mt indicates that the
293             grouping used for similarity will be the set of
294             classes with positive substitutions. Other groups
295             include
296              
297             sim_clustalw_col ( categories of clustalw
298             marked with :)
299              
300             sim_clustalw_dot ( categories of clustalw
301             marked with .)
302              
303              
304             Value indicates that all the pairs found in the
305             alignments must be given the same weight equal to
306             value. This is useful when the alignment one wishes to
307             turn into a library must be given a pre-specified score
308             (for instance if they come from a structure
309             super-imposition program). Value is an integer:
310              
311             -weight=1000
312              
313             Note : Weight only affects methods that return an alignment to
314             T-Coffee, such as ClustalW. On the contrary, the
315             version of Lalign we use here returns a library where
316             weights have already been applied and are therefore
317             insensitive to the -weight flag.
318              
319             =head2 SEQ_TO_ALIGN
320              
321             Title : SEQ_TO_ALIGN
322             Args : filename
323             Default : no file - align all the sequences
324              
325             Description : You may not wish to align all the sequences brought in
326             by the -in flag. Supplying the seq_to_align flag allows
327             for this, the file is simply a list of names in Fasta
328             format.
329              
330             However, note that library extension will be carried out
331             on all the sequences.
332              
333             =head1 PARAMETERS FOR TREE COMPUTATION AND OUTPUT
334              
335             =head2 NEWTREE
336              
337             Title : NEWTREE
338             Args : treefile
339             Default : no file
340             Description : Indicates the name of the new tree to compute. The
341             default will be .dnd, or .
342             Format is Phylip/Newick tree format
343              
344             =head2 USETREE
345              
346             Title : USETREE
347             Args : treefile
348             Default : no file specified
349             Description : This flag indicates that rather than computing a new
350             dendrogram, t_coffee can use a pre-computed one. The
351             tree files are in phylips format and compatible with
352             ClustalW. In most cases, using a pre-computed tree will
353             halve the computation time required by t_coffee. It is
354             also possible to use trees output by ClustalW or
355             Phylips. Format is Phylips tree format
356              
357             =head2 TREE_MODE
358              
359             Title : TREE_MODE
360             Args : slow, fast, very_fast
361             Default : very_fast
362             Description : This flag indicates the method used for computing the
363             dendrogram.
364             slow : the chosen dp_mode using the extended library,
365             fast : The fasta dp_mode using the extended library.
366             very_fast: The fasta dp_mode using pam250mt.
367              
368             =head2 QUICKTREE
369              
370             Title : QUICKTREE
371             Args :
372             Default :
373             Description : This flag is kept for compatibility with ClustalW.
374             It indicates that: -tree_mode=very_fast
375              
376             =head1 PARAMETERS FOR ALIGNMENT OUTPUT
377              
378             =head2 OUTFILE
379              
380             Title : OUTFILE
381             Args : out_aln file, default, no
382             Default : default ( yourseqfile.aln)
383             Description : indicates name of output alignment file
384              
385             =head2 OUTPUT
386              
387             Title : OUTPUT
388             Args : format1, format2
389             Default : clustalw
390             Description : Indicated format for outputting outputfile
391             Supported formats are:
392              
393             clustalw_aln, clustalw: ClustalW format.
394             gcg, msf_aln : Msf alignment.
395             pir_aln : pir alignment.
396             fasta_aln : fasta alignment.
397             phylip : Phylip format.
398             pir_seq : pir sequences (no gap).
399             fasta_seq : fasta sequences (no gap).
400             As well as:
401             score_html : causes the output to be a reliability
402             plot in HTML
403             score_pdf : idem in PDF.
404             score_ps : idem in postscript.
405              
406             More than one format can be indicated:
407             -output=clustalw,gcg, score_html
408              
409             =head2 CASE
410              
411             Title : CASE
412             Args : upper, lower
413             Default : upper
414             Description : triggers choice of the case for output
415              
416             =head2 CPU
417              
418             Title : CPU
419             Args : value
420             Default : 0
421             Description : Indicates the cpu time (micro seconds) that must be
422             added to the t_coffee computation time.
423              
424             =head2 OUT_LIB
425              
426             Title : OUT_LIB
427             Args : name of library, default, no
428             Default : default
429             Description : Sets the name of the library output. Default implies
430             .tc_lib
431              
432             =head2 OUTORDER
433              
434             Title : OUTORDER
435             Args : input or aligned
436             Default : input
437             Description : Sets the name of the library output. Default implies
438             .tc_lib
439              
440             =head2 SEQNOS
441              
442             Title : SEQNOS
443             Args : on or off
444             Default : off
445             Description : Causes the output alignment to contain residue numbers
446             at the end of each line:
447              
448             =head1 PARAMETERS FOR GENERIC OUTPUT
449              
450             =head2 RUN_NAME
451              
452             Title : RUN_NAME
453             Args : your run name
454             Default :
455             Description : This flag causes the prefix to be
456             replaced by when renaming the default
457             files.
458              
459             =head2 ALIGN
460              
461             Title : ALIGN
462             Args :
463             Default :
464             Description : Indicates that the program must produce the
465             alignment. This flag is here for compatibility with
466             ClustalW
467              
468             =head2 QUIET
469              
470             Title : QUIET
471             Args : stderr, stdout, or filename, or nothing
472             Default : stderr
473             Description : Redirects the standard output to either a file.
474             -quiet on its own redirect the output to /dev/null.
475              
476             =head2 CONVERT
477              
478             Title : CONVERT
479             Args :
480             Default :
481             Description : Indicates that the program must not compute the
482             alignment but simply convert all the sequences,
483             alignments and libraries into the format indicated with
484             -output. This flag can also be used if you simply want
485             to compute a library ( i.e. You have an alignment and
486             you want to turn it into a library).
487              
488             =head1 FEEDBACK
489              
490             =head2 Mailing Lists
491              
492             User feedback is an integral part of the evolution of this and other
493             Bioperl modules. Send your comments and suggestions preferably to one
494             of the Bioperl mailing lists. Your participation is much appreciated.
495              
496             bioperl-l@bioperl.org - General discussion
497             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
498              
499             =head2 Support
500              
501             Please direct usage questions or support issues to the mailing list:
502              
503             I
504              
505             rather than to the module maintainer directly. Many experienced and
506             reponsive experts will be able look at the problem and quickly
507             address it. Please include a thorough description of the problem
508             with code and data examples if at all possible.
509              
510             =head2 Reporting Bugs
511              
512             Report bugs to the Bioperl bug tracking system to help us keep track
513             the bugs and their resolution. Bug reports can be submitted via the web:
514              
515             http://redmine.open-bio.org/projects/bioperl/
516              
517             =head1 AUTHOR - Jason Stajich, Peter Schattner
518              
519             Email jason-at-bioperl-dot-org, schattner@alum.mit.edu
520              
521             =head1 APPENDIX
522              
523             The rest of the documentation details each of the object
524             methods. Internal methods are usually preceded with a _
525              
526             =cut
527              
528             package Bio::Tools::Run::Alignment::TCoffee;
529              
530 1         72 use vars qw($AUTOLOAD @ISA $PROGRAM_NAME $PROGRAM_DIR %DEFAULTS
531             @TCOFFEE_PARAMS @TCOFFEE_SWITCHES @OTHER_SWITCHES %OK_FIELD
532 1     1   101561 );
  1         1  
533 1     1   3 use strict;
  1         1  
  1         15  
534 1     1   3 use Cwd;
  1         1  
  1         43  
535 1     1   538 use Bio::Seq;
  1         44113  
  1         23  
536 1     1   462 use Bio::SeqIO;
  1         19348  
  1         27  
537 1     1   615 use Bio::SimpleAlign;
  1         27076  
  1         35  
538 1     1   446 use Bio::AlignIO;
  1         1039  
  1         24  
539 1     1   5 use Bio::Root::IO;
  1         1  
  1         16  
540 1     1   374 use Bio::Factory::ApplicationFactoryI;
  1         112  
  1         30  
541 1     1   425 use Bio::Tools::Run::WrapperBase;
  1         2  
  1         102  
542             @ISA = qw(Bio::Tools::Run::WrapperBase
543             Bio::Factory::ApplicationFactoryI);
544              
545             # You will need to enable TCoffee to find the tcoffee program. This can be done
546             # in (at least) twp ways:
547             # 1. define an environmental variable TCOFFEE:
548             # export TCOFFEEDIR=/home/progs/tcoffee or
549             # 2. include a definition of an environmental variable TCOFFEEDIR
550             # in every script that will
551             # use Bio::Tools::Run::Alignment::TCoffee.pm.
552             # BEGIN {$ENV{TCOFFEEDIR} = '/home/progs/tcoffee'; }
553              
554             BEGIN {
555 1     1   1 $PROGRAM_NAME = 't_coffee';
556 1         2 $PROGRAM_DIR = $ENV{'TCOFFEEDIR'};
557 1         4 %DEFAULTS = ( 'MATRIX' => 'blosum',
558             'OUTPUT' => 'clustalw',
559             'AFORMAT'=> 'msf',
560             'METHODS' => [qw(lalign_id_pair clustalw_pair)]
561             );
562              
563              
564 1         6 @TCOFFEE_PARAMS = qw(IN TYPE PARAMETERS DO_NORMALISE EXTEND
565             DP_MODE KTUPLE NDIAGS DIAG_MODE SIM_MATRIX
566             MATRIX GAPOPEN GAPEXT COSMETIC_PENALTY TG_MODE
567             WEIGHT SEQ_TO_ALIGN NEWTREE USETREE TREE_MODE
568             OUTFILE OUTPUT CASE CPU OUT_LIB OUTORDER SEQNOS
569             RUN_NAME CONVERT
570             );
571              
572 1         2 @TCOFFEE_SWITCHES = qw(QUICKTREE);
573              
574 1         1 @OTHER_SWITCHES = qw(QUIET ALIGN KEEPDND);
575              
576             # Authorize attribute fields
577 1         2 foreach my $attr ( @TCOFFEE_PARAMS, @TCOFFEE_SWITCHES, @OTHER_SWITCHES ) {
578 33         1810 $OK_FIELD{$attr}++; }
579             }
580              
581             =head2 program_name
582              
583             Title : program_name
584             Usage : $factory->program_name()
585             Function: holds the program name
586             Returns: string
587             Args : None
588              
589             =cut
590              
591             sub program_name {
592 6     6 1 22 return $PROGRAM_NAME;
593             }
594              
595             =head2 program_dir
596              
597             Title : program_dir
598             Usage : $factory->program_dir(@params)
599             Function: returns the program directory, obtained from ENV variable.
600             Returns: string
601             Args :
602              
603             =cut
604              
605             sub program_dir {
606 3     3 1 8 return $PROGRAM_DIR;
607             }
608              
609             sub new {
610 1     1 1 88 my ($class,@args) = @_;
611 1         10 my $self = $class->SUPER::new(@args);
612 1         9 my ($attr, $value);
613              
614 1         4 while (@args) {
615 0         0 $attr = shift @args;
616 0         0 $value = shift @args;
617 0 0       0 next if( $attr =~ /^-/); # don't want named parameters
618 0         0 $self->$attr($value);
619             }
620 1 50       11 $self->matrix($DEFAULTS{'MATRIX'}) unless( $self->matrix );
621 1 50       6 $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output );
622 1 50       4 $self->methods($DEFAULTS{'METHODS'}) unless $self->methods;
623             # $self->aformat($DEFAULTS{'AFORMAT'}) unless $self->aformat;
624              
625 1         2 return $self;
626             }
627              
628             sub AUTOLOAD {
629 7     7   1084 my $self = shift;
630 7         17 my $attr = $AUTOLOAD;
631 7         19 $attr =~ s/.*:://;
632 7         9 $attr = uc $attr;
633             # aliasing
634 7 50       10 $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME';
635 7 50       15 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
636              
637 7 100       13 $self->{$attr} = shift if @_;
638 7         20 return $self->{$attr};
639             }
640              
641             =head2 error_string
642              
643             Title : error_string
644             Usage : $obj->error_string($newval)
645             Function: Where the output from the last analysus run is stored.
646             Returns : value of error_string
647             Args : newvalue (optional)
648              
649              
650             =cut
651              
652             sub error_string{
653 0     0 1 0 my ($self,$value) = @_;
654 0 0       0 if( defined $value) {
655 0         0 $self->{'error_string'} = $value;
656             }
657 0         0 return $self->{'error_string'};
658              
659             }
660              
661             =head2 version
662              
663             Title : version
664             Usage : exit if $prog->version() < 1.8
665             Function: Determine the version number of the program
666             Example :
667             Returns : float or undef
668             Args : none
669              
670             =cut
671              
672             sub version {
673 0     0 1 0 my ($self) = @_;
674 0         0 my $exe;
675 0 0       0 return undef unless $exe = $self->executable;
676 0         0 my $string = `$exe -quiet=stdout 2>&1` ;
677 0         0 $string =~ /Version_([\d.]+)/;
678 0   0     0 return $1 || undef;
679              
680             }
681              
682             =head2 run
683              
684             Title : run
685             Usage : my $output = $application->run(-seq => $seq,
686             -profile => $profile,
687             -type => 'profile-aln');
688             Function: Generic run of an application
689             Returns : Bio::SimpleAlign object
690             Args : key-value parameters allowed for TCoffee runs AND
691             -type => profile-aln or alignment for profile alignments or
692             just multiple sequence alignment
693             -seq => either Bio::PrimarySeqI object OR
694             array ref of Bio::PrimarySeqI objects OR
695             filename of sequences to run with
696             -profile => profile to align to, if this is an array ref
697             will specify the first two entries as the two
698             profiles to align to each other
699              
700             =cut
701              
702             sub run{
703 0     0 1 0 my ($self,@args) = @_;
704 0         0 my ($type,$seq,$profile) = $self->_rearrange([qw(TYPE
705             SEQ
706             PROFILE)],
707             @args);
708 0 0       0 if( $type =~ /align/i ) {
    0          
709 0         0 return $self->align($seq);
710             } elsif( $type =~ /profile/i ) {
711 0         0 return $self->profile_align($profile,$seq);
712             } else {
713 0         0 $self->warn("unrecognized alignment type $type\n");
714             }
715 0         0 return undef;
716             }
717              
718             =head2 align
719              
720             Title : align
721             Usage :
722             $inputfilename = 't/data/cysprot.fa';
723             $aln = $factory->align($inputfilename);
724             or
725             $seq_array_ref = \@seq_array;
726             # @seq_array is array of Seq objs
727             $aln = $factory->align($seq_array_ref);
728             Function: Perform a multiple sequence alignment
729             Returns : Reference to a SimpleAlign object containing the
730             sequence alignment.
731             Args : Name of a file containing a set of unaligned fasta sequences
732             or else an array of references to Bio::Seq objects.
733              
734             Throws an exception if argument is not either a string (eg a
735             filename) or a reference to an array of Bio::Seq objects. If
736             argument is string, throws exception if file corresponding to string
737             name can not be found. If argument is Bio::Seq array, throws
738             exception if less than two sequence objects are in array.
739              
740             =cut
741              
742             sub align {
743 0     0 1 0 my ($self,$input) = @_;
744             # Create input file pointer
745 0         0 $self->io->_io_cleanup();
746 0         0 my ($infilename,$type) = $self->_setinput($input);
747 0 0       0 if (!$infilename) {
748 0         0 $self->throw("Bad input data or less than 2 sequences in $input !");
749             }
750              
751 0         0 my $param_string = $self->_setparams();
752              
753             # run tcoffee
754 0         0 return $self->_run('align', [$infilename,$type], $param_string);
755             }
756              
757             #################################################
758              
759             =head2 profile_align
760              
761             Title : profile_align
762             Usage :
763             Function: Perform an alignment of 2 (sub)alignments
764             Example :
765             Returns : Reference to a SimpleAlign object containing the (super)alignment.
766             Args : Names of 2 files containing the subalignments
767             or references to 2 Bio::SimpleAlign objects.
768             Note : Needs to be updated to run with newer TCoffee code, which
769             allows more than two profile alignments.
770              
771             Throws an exception if arguments are not either strings (eg filenames)
772             or references to SimpleAlign objects.
773              
774             =cut
775              
776             sub profile_align {
777 0     0 1 0 my $self = shift;
778 0         0 my $input1 = shift;
779 0         0 my $input2 = shift;
780            
781 0         0 my ($temp,$infilename1,$infilename2,$type1,$type2,$input,$seq);
782              
783 0         0 $self->io->_io_cleanup();
784             # Create input file pointers
785 0         0 ($infilename1,$type1) = $self->_setinput($input1);
786 0         0 ($infilename2,$type2) = $self->_setinput($input2);
787 0 0       0 unless ($type1) {
788 0         0 $self->throw("Unknown type for first argument");
789             }
790 0 0       0 unless ($type2) {
791 0         0 $self->throw("Unknown type for second argument")
792             }
793            
794 0 0 0     0 if (!$infilename1 || !$infilename2) {
795 0         0 $self->throw("Bad input data: $input1 or $input2 !");
796             }
797              
798 0         0 my $param_string = $self->_setparams();
799             # run tcoffee
800 0         0 my $aln = $self->_run('profile-aln',
801             [$infilename1,$type1],
802             [$infilename2,$type2],
803             $param_string)
804             ;
805             }
806             #################################################
807              
808             =head2 _run
809              
810             Title : _run
811             Usage : Internal function, not to be called directly
812             Function: makes actual system call to tcoffee program
813             Example :
814             Returns : nothing; tcoffee output is written to a
815             temporary file OR specified output file
816             Args : Name of a file containing a set of unaligned fasta sequences
817             and hash of parameters to be passed to tcoffee
818              
819             =cut
820              
821             sub _run {
822 0     0   0 my ($infilename, $infile1,$infile2) = ('','','');
823 0         0 my $self = shift;
824 0         0 my $command = shift;
825 0         0 my $instring;
826 0 0       0 if ($command =~ /align/) {
827 0         0 my $infile = shift ;
828 0         0 my $type;
829 0         0 ($infilename,$type) = @$infile;
830 0         0 $instring = '-in='.join(',',($infilename, 'X'.$self->matrix,
831             $self->methods));
832             }
833 0 0       0 if ($command =~ /profile/) {
834 0         0 my $in1 = shift ;
835 0         0 my $in2 = shift ;
836 0         0 my ($type1,$type2);
837 0         0 ($infile1,$type1) = @$in1;
838 0         0 ($infile2,$type2) = @$in2;
839             # in later versions (tested on 5.72 and 7.54) the API for profile
840             # alignment changed. This attempts to do the right thing for older
841             # versions but corrects for newer ones
842 0 0 0     0 if ($self->version && $self->version < 5) {
843             # this breaks severely on newer TCoffee (>= v5)
844 0 0 0     0 unless (($self->matrix =~ /none/i) || ($self->matrix =~ /null/i) ) {
845             $instring = '-in='.join(',',
846             ($type2.$infile2),
847             'X'.$self->matrix,
848 0         0 (map {'M'.$_} $self->methods)
  0         0  
849             );
850 0         0 $instring .= ' -profile='.$infile1;
851             } else {
852             $instring = '-in='.join(',',(
853             $type1.$infile1,
854             $type2.$infile2,
855 0         0 (map {'M'.$_} $self->methods)
  0         0  
856             )
857             );
858             }
859             } else {
860 0 0       0 if ($type2 eq 'S') {
861             # second infile is a sequence, not an alignment
862 0         0 $instring .= ' -profile='.join(',',$infile1);
863 0         0 $instring .= ' -seq='.join(',',$infile2);
864             } else {
865 0         0 $instring .= ' -profile='.join(',',$infile1,$infile2);
866             }
867 0 0 0     0 $instring .= ' -matrix='.$self->matrix unless (($self->matrix =~ /none/i) || ($self->matrix =~ /null/i)) ;
868 0 0       0 $instring .= ' -method='.join(',',$self->methods) if ($self->methods) ;
869             }
870             }
871 0         0 my $param_string = shift;
872             # my ($paramfh,$parameterFile) = $self->io->tempfile;
873             # print $paramfh ( "$instring\n-output=gcg$param_string") ;
874             # close($paramfh);
875             # my $commandstring = "t_coffee -output=gcg -parameters $parameterFile" ; ##MJL
876              
877 0         0 my $commandstring = $self->executable." $instring $param_string";
878            
879             #$self->debug( "tcoffee command = $commandstring \n");
880              
881 0         0 my $status = system($commandstring);
882 0         0 my $outfile = $self->outfile();
883              
884 0 0 0     0 if( !-e $outfile || -z $outfile ) {
885 0         0 $self->warn( "TCoffee call crashed: $? [command $commandstring]\n");
886 0         0 return undef;
887             }
888              
889             # retrieve alignment (Note: MSF format for AlignIO = GCG format of
890             # tcoffee)
891              
892 0         0 my $in = Bio::AlignIO->new('-file' => $outfile, '-format' =>
893             $self->output);
894 0         0 my $aln = $in->next_aln();
895              
896             # Replace file suffix with dnd to find name of dendrogram file(s) to delete
897 0 0       0 if( ! $self->keepdnd ) {
898 0         0 foreach my $f ( $infilename, $infile1, $infile2 ) {
899 0 0 0     0 next if( !defined $f || $f eq '');
900 0         0 $f =~ s/\.[^\.]*$// ;
901             # because TCoffee writes these files to the CWD
902 0 0       0 if( $Bio::Root::IO::PATHSEP ) {
903 0         0 my @line = split(/$Bio::Root::IO::PATHSEP/, $f);
904 0         0 $f = pop @line;
905             } else {
906 0         0 (undef, undef, $f) = File::Spec->splitpath($f);
907             }
908 0 0       0 unlink $f .'.dnd' if( $f ne '' );
909             }
910             }
911 0         0 return $aln;
912             }
913              
914              
915             =head2 _setinput
916              
917             Title : _setinput
918             Usage : Internal function, not to be called directly
919             Function: Create input file for tcoffee program
920             Example :
921             Returns : name of file containing tcoffee data input AND
922             type of file (if known, S for sequence, L for sequence library,
923             A for sequence alignment)
924             Args : Seq or Align object reference or input file name
925              
926              
927             =cut
928              
929             sub _setinput {
930 0     0   0 my ($self,$input) = @_;
931 0         0 my ($infilename, $seq, $temp, $tfh);
932             # If $input is not a reference it better be the name of a
933             # file with the sequence/ alignment data...
934 0         0 my $type = '';
935 0 0       0 if (! ref $input) {
    0          
    0          
    0          
936             # check that file exists or throw
937 0         0 $infilename = $input;
938 0 0       0 unless (-e $input) {return 0;}
  0         0  
939             # let's peek and guess
940 0 0       0 open(my $IN,$infilename) || $self->throw("Cannot open $infilename");
941 0         0 my $header = <$IN>;
942 0 0 0     0 if( $header =~ /^\s+\d+\s+\d+/ ||
      0        
943             $header =~ /Pileup/i ||
944             $header =~ /clustal/i ) { # phylip
945 0         0 $type = 'A';
946             }
947            
948             # On some systems, having filenames with / in them (ie. a file in a
949             # directory) causes t-coffee to completely fail. It warns on all systems.
950             # The -no_warning option solves this, but there is still some strange
951             # bug when doing certain profile-related things. This is magically solved
952             # by copying the profile file to a temp file in the current directory, so
953             # it its filename supplied to t-coffee contains no /
954             # (It's messy here - I just do this to /all/ input files to most easily
955             # catch all variants of providing a profile - it may only be the last
956             # form (isa("Bio::PrimarySeqI")) that causes a problem?)
957            
958 0         0 my (undef, undef, $adjustedfilename) = File::Spec->splitpath($infilename);
959 0 0       0 if ($adjustedfilename ne $infilename) {
960 0         0 my ($fh, $tempfile) = $self->io->tempfile(-dir => cwd());
961 0         0 seek($IN, 0, 0);
962 0         0 while (<$IN>) {
963 0         0 print $fh $_;
964             }
965 0         0 close($fh);
966 0         0 (undef, undef, $tempfile) = File::Spec->splitpath($tempfile);
967 0         0 $infilename = $tempfile;
968 0         0 $type = 'S';
969             }
970            
971 0         0 close($IN);
972 0         0 return ($infilename,$type);
973             } elsif (ref($input) =~ /ARRAY/i ) {
974             # $input may be an array of BioSeq objects...
975             # Open temporary file for both reading & writing of array
976 0         0 ($tfh,$infilename) = $self->io->tempfile(-dir => cwd());
977 0         0 (undef, undef, $infilename) = File::Spec->splitpath($infilename);
978 0 0       0 if( ! ref($input->[0]) ) {
    0          
    0          
979 0         0 $self->warn("passed an array ref which did not contain objects to _setinput");
980 0         0 return undef;
981             } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) {
982 0         0 $temp = Bio::SeqIO->new('-fh' => $tfh,
983             '-format' => 'fasta');
984 0         0 my $ct = 1;
985 0         0 foreach $seq (@$input) {
986 0 0 0     0 return 0 unless ( ref($seq) &&
987             $seq->isa("Bio::PrimarySeqI") );
988 0 0 0     0 if( ! defined $seq->display_id ||
989             $seq->display_id =~ /^\s+$/) {
990 0         0 $seq->display_id( "Seq".$ct++);
991             }
992 0         0 $temp->write_seq($seq);
993             }
994 0         0 $temp->close();
995 0         0 undef $temp;
996 0         0 close($tfh);
997 0         0 $tfh = undef;
998 0         0 $type = 'S';
999             } elsif( $input->[0]->isa('Bio::Align::AlignI' ) ) {
1000 0         0 $temp = Bio::AlignIO->new('-fh' => $tfh,
1001             '-format' => $self->aformat);
1002 0         0 foreach my $aln (@$input) {
1003 0 0 0     0 next unless ( ref($aln) &&
1004             $aln->isa("Bio::Align::AlignI") );
1005 0         0 $temp->write_aln($aln);
1006             }
1007 0         0 $temp->close();
1008 0         0 undef $temp;
1009 0         0 close($tfh);
1010 0         0 $tfh = undef;
1011 0         0 $type = 'A';
1012             } else {
1013 0         0 $self->warn( "got an array ref with 1st entry ".
1014             $input->[0].
1015             " and don't know what to do with it\n");
1016             }
1017 0         0 return ($infilename,$type);
1018             # $input may be a SimpleAlign object.
1019             } elsif ( $input->isa("Bio::Align::AlignI") ) {
1020             # Open temporary file for both reading & writing of SimpleAlign object
1021 0         0 ($tfh, $infilename) = $self->io->tempfile(-dir => cwd());
1022 0         0 (undef, undef, $infilename) = File::Spec->splitpath($infilename);
1023 0         0 $temp = Bio::AlignIO->new(-fh=>$tfh,
1024             '-format' => 'clustalw');
1025 0         0 $temp->write_aln($input);
1026 0         0 close($tfh);
1027 0         0 undef $tfh;
1028 0         0 return ($infilename,'A');
1029             }
1030            
1031             # or $input may be a single BioSeq object (to be added to
1032             # a previous alignment)
1033             elsif ( $input->isa("Bio::PrimarySeqI")) {
1034             # Open temporary file for both reading & writing of BioSeq object
1035 0         0 ($tfh,$infilename) = $self->io->tempfile(-dir => cwd());
1036 0         0 (undef, undef, $infilename) = File::Spec->splitpath($infilename);
1037 0         0 $temp = Bio::SeqIO->new(-fh=> $tfh, '-format' =>'Fasta');
1038 0         0 $temp->write_seq($input);
1039 0         0 $temp->close();
1040 0         0 close($tfh);
1041 0         0 undef $tfh;
1042 0         0 return ($infilename,'S');
1043             } else {
1044 0         0 $self->warn("Got $input and don't know what to do with it\n");
1045             }
1046 0         0 return 0;
1047             }
1048              
1049              
1050             =head2 _setparams
1051              
1052             Title : _setparams
1053             Usage : Internal function, not to be called directly
1054             Function: Create parameter inputs for tcoffee program
1055             Example :
1056             Returns : parameter string to be passed to tcoffee
1057             during align or profile_align
1058             Args : name of calling object
1059              
1060             =cut
1061              
1062             sub _setparams {
1063 0     0   0 my ($self) = @_;
1064 0         0 my ($attr, $value,$param_string);
1065 0         0 $param_string = '';
1066 0         0 my $laststr;
1067 0         0 for $attr ( @TCOFFEE_PARAMS ) {
1068 0         0 $value = $self->$attr();
1069 0 0       0 next unless (defined $value);
1070 0         0 my $attr_key = lc $attr;
1071 0 0       0 if( $attr_key =~ /matrix/ ) {
1072 0         0 $self->{'_in'} = [ "X".lc($value) ];
1073             } else {
1074 0         0 $attr_key = ' -'.$attr_key;
1075 0         0 $param_string .= $attr_key .'='.$value;
1076             }
1077             }
1078 0         0 for $attr ( @TCOFFEE_SWITCHES) {
1079 0         0 $value = $self->$attr();
1080 0 0       0 next unless ($value);
1081 0         0 my $attr_key = lc $attr; #put switches in format expected by tcoffee
1082 0         0 $attr_key = ' -'.$attr_key;
1083 0         0 $param_string .= $attr_key ;
1084             }
1085              
1086             # Set default output file if no explicit output file selected
1087 0 0       0 unless ($self->outfile ) {
1088 0         0 my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir());
1089 0         0 close($tfh);
1090 0         0 undef $tfh;
1091 0         0 $self->outfile($outfile);
1092 0         0 $param_string .= " -outfile=$outfile" ;
1093             }
1094              
1095 0 0 0     0 if ($self->quiet() || $self->verbose < 0) { $param_string .= ' -quiet';}
  0         0  
1096            
1097             # -no_warning is required on some systems with certain versions or failure
1098             # is guaranteed
1099 0 0 0     0 if ($self->version >= 4 && $self->version < 4.7) {
1100 0         0 $param_string .= ' -no_warning';
1101             }
1102            
1103 0         0 return $param_string;
1104             }
1105              
1106             =head2 aformat
1107              
1108             Title : aformat
1109             Usage : my $alignmentformat = $self->aformat();
1110             Function: Get/Set alignment format
1111             Returns : string
1112             Args : string
1113              
1114              
1115             =cut
1116              
1117             sub aformat{
1118 0     0 1 0 my $self = shift;
1119 0 0       0 $self->{'_aformat'} = shift if @_;
1120 0         0 return $self->{'_aformat'};
1121             }
1122              
1123              
1124             =head2 methods
1125              
1126             Title : methods
1127             Usage : my @methods = $self->methods()
1128             Function: Get/Set Alignment methods - NOT VALIDATED
1129             Returns : array of strings
1130             Args : arrayref of strings
1131              
1132              
1133             =cut
1134              
1135             sub methods{
1136 2     2 1 3 my ($self) = shift;
1137              
1138 2 50       4 @{$self->{'_methods'}} = @{shift || []} if @_;
  1 100       3  
  1         3  
1139 2 100       2 return @{$self->{'_methods'} || []};
  2         10  
1140             }
1141              
1142              
1143             =head1 Bio::Tools::Run::BaseWrapper methods
1144              
1145             =cut
1146              
1147             =head2 no_param_checks
1148              
1149             Title : no_param_checks
1150             Usage : $obj->no_param_checks($newval)
1151             Function: Boolean flag as to whether or not we should
1152             trust the sanity checks for parameter values
1153             Returns : value of no_param_checks
1154             Args : newvalue (optional)
1155              
1156              
1157             =cut
1158              
1159             =head2 save_tempfiles
1160              
1161             Title : save_tempfiles
1162             Usage : $obj->save_tempfiles($newval)
1163             Function:
1164             Returns : value of save_tempfiles
1165             Args : newvalue (optional)
1166              
1167              
1168             =cut
1169              
1170             =head2 outfile_name
1171              
1172             Title : outfile_name
1173             Usage : my $outfile = $tcoffee->outfile_name();
1174             Function: Get/Set the name of the output file for this run
1175             (if you wanted to do something special)
1176             Returns : string
1177             Args : [optional] string to set value to
1178              
1179              
1180             =cut
1181              
1182              
1183             =head2 tempdir
1184              
1185             Title : tempdir
1186             Usage : my $tmpdir = $self->tempdir();
1187             Function: Retrieve a temporary directory name (which is created)
1188             Returns : string which is the name of the temporary directory
1189             Args : none
1190              
1191              
1192             =cut
1193              
1194             =head2 cleanup
1195              
1196             Title : cleanup
1197             Usage : $tcoffee->cleanup();
1198             Function: Will cleanup the tempdir directory
1199             Returns : none
1200             Args : none
1201              
1202              
1203             =cut
1204              
1205             =head2 io
1206              
1207             Title : io
1208             Usage : $obj->io($newval)
1209             Function: Gets a L object
1210             Returns : L
1211             Args : none
1212              
1213              
1214             =cut
1215              
1216             1; # Needed to keep compiler happy