File Coverage

blib/lib/Bio/Tools/Run/Phylo/Phast/PhastCons.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             # $Id$
2             #
3             # BioPerl module for Bio::Tools::Run::Phylo::Phast::PhastCons
4             #
5             # Please direct questions and support issues to
6             #
7             # Cared for by Sendu Bala
8             #
9             # Copyright Sendu Bala
10             #
11             # You may distribute this module under the same terms as perl itself
12              
13             # POD documentation - main docs before the code
14              
15             =head1 NAME
16              
17             Bio::Tools::Run::Phylo::Phast::PhastCons - Wrapper for footprinting using
18             phastCons
19              
20             =head1 SYNOPSIS
21              
22             use Bio::Tools::Run::Phylo::Phast::PhastCons;
23              
24             # Make a PhastCons factory
25             $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new();
26              
27             # Pass the factory an alignment and the corresponding species tree
28             $align_filename = 't/data/apes.multi_fasta';
29             $species_tree_filename = 't/data/apes.newick';
30             @features = $factory->run($align_filename, $species_tree_filename);
31              
32             # or get a Bio::Align::AlignI (SimpleAlign) object from somewhere, and
33             # generate the species tree automatically using a Bio::DB::Taxonomy database
34             $tdb = Bio::DB::Taxonomy->new(-source => 'entrez');
35             @features = $factory->run($aln_obj, $tdb);
36              
37             # @features is an array of Bio::SeqFeature::Annotated, one feature per
38             # alignment sequence and prediction
39              
40             =head1 DESCRIPTION
41              
42             This is a wrapper for running the phastCons application by Adam Siepel. You
43             can get details here: http://compgen.bscb.cornell.edu/~acs/software.html
44             phastCons is used for phylogenetic footprinting/ shadowing.
45              
46             Currently the interface is extremely simplified, allowing only one
47             analysis method. The focus here is on ease of use, allowing phastCons
48             to estimate as many parameters as possible and having it output just
49             the 'most conserved' blocks it detects. You can, however, try
50             supplying normal phastCons arguments to new(), or calling arg-named
51             methods (excluding initial hyphens and converting others to
52             underscores, eg. $factory-Eindels_only(1) to set the --indels-only
53             arg).
54              
55             The particular analysis carried out here is to:
56              
57             1. Use phyloFit to generate a tree model for initialization of the nonconserved
58             model from the supplied alignment (all data) and species tree
59             2. Run phastCons in 'training' mode for parameter estimation using all the
60             alignment data and the model from step 1
61             3. Run phastCons with the trees from step 2 to discover the most conserved
62             regions
63              
64             See the 'HowTo' at http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html
65             for details on how to improve results.
66              
67             WARNING: the API is likely to change in the future to allow for alternative
68             analysis types.
69              
70             You will need to enable this phastCons wrapper to find the phast programs (at
71             least phastCons and phyloFit).
72             This can be done in (at least) three ways:
73              
74             1. Make sure the phastCons and phyloFit executables are in your path.
75             2. Define an environmental variable PHASTDIR which is a
76             directory which contains the phastCons and phyloFit applications:
77             In bash:
78              
79             export PHASTDIR=/home/username/phast/bin
80              
81             In csh/tcsh:
82              
83             setenv PHASTDIR /home/username/phast/bin
84              
85             3. Include a definition of an environmental variable PHASTDIR in
86             every script that will use this PhastCons wrapper module, e.g.:
87              
88             BEGIN { $ENV{PHASTDIR} = '/home/username/phast/bin' }
89             use Bio::Tools::Run::Phylo::Phast::PhastCons;
90              
91             =head1 FEEDBACK
92              
93             =head2 Mailing Lists
94              
95             User feedback is an integral part of the evolution of this and other
96             Bioperl modules. Send your comments and suggestions preferably to
97             the Bioperl mailing list. Your participation is much appreciated.
98              
99             bioperl-l@bioperl.org - General discussion
100             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101              
102             =head2 Support
103              
104             Please direct usage questions or support issues to the mailing list:
105              
106             I
107              
108             rather than to the module maintainer directly. Many experienced and
109             reponsive experts will be able look at the problem and quickly
110             address it. Please include a thorough description of the problem
111             with code and data examples if at all possible.
112              
113             =head2 Reporting Bugs
114              
115             Report bugs to the Bioperl bug tracking system to help us keep track
116             of the bugs and their resolution. Bug reports can be submitted via
117             the web:
118              
119             http://redmine.open-bio.org/projects/bioperl/
120              
121             =head1 AUTHOR - Sendu Bala
122              
123             Email bix@sendu.me.uk
124              
125             =head1 APPENDIX
126              
127             The rest of the documentation details each of the object methods.
128             Internal methods are usually preceded with a _
129              
130             =cut
131              
132             package Bio::Tools::Run::Phylo::Phast::PhastCons;
133 1     1   105954 use strict;
  1         2  
  1         23  
134              
135 1     1   3 use Cwd;
  1         1  
  1         46  
136 1     1   3 use File::Basename;
  1         2  
  1         55  
137 1     1   4 use Clone qw(clone);
  1         1  
  1         41  
138 1     1   488 use Bio::AlignIO;
  1         83164  
  1         28  
139 1     1   421 use Bio::Tools::Run::Phylo::Phast::PhyloFit;
  1         2  
  1         31  
140 1     1   173 use Bio::FeatureIO;
  0            
  0            
141             use Bio::Annotation::SimpleValue;
142              
143             use base qw(Bio::Tools::Run::Phylo::PhyloBase);
144              
145             our $PROGRAM_NAME = 'phastCons';
146             our $PROGRAM_DIR = $ENV{'PHASTDIR'};
147              
148             # methods and their synonyms from the phastCons args we support
149             our %PARAMS = (rho => 'R',
150             nrates => 'k',
151             transitions => 't',
152             target_coverage => 'C',
153             expected_length => ['E', 'expected_lengths'],
154             lnl => 'L',
155             log => 'g',
156             max_micro_indel => 'Y',
157             indel_params => 'D',
158             lambda => 'l',
159             extrapolate => 'e',
160             hmm => 'H',
161             catmap => 'c',
162             states => 'S',
163             reflect_strand => 'U',
164             require_informative => 'M',
165             not_informative => 'F');
166              
167             our %SWITCHES = (quiet => 'q',
168             indels => 'I',
169             indels_only => 'J',
170             FC => 'X',
171             coding_potential => 'p',
172             ignore_missing => 'z');
173              
174             # just to be explicit, args we don't support (yet) or we handle ourselves
175             our %UNSUPPORTED = (estimate_trees => 'T',
176             estimate_rho => 'O',
177             gc => 'G',
178             msa_format => 'i',
179             score => 's',
180             no_post_probs => 'n',
181             seqname => 'N',
182             refidx => 'r',
183             idpref => 'P',
184             help => 'h',
185             alias => 'A',
186             most_conserved => ['V', 'viterbi']);
187              
188              
189             =head2 program_name
190              
191             Title : program_name
192             Usage : $factory>program_name()
193             Function: holds the program name
194             Returns : string
195             Args : None
196              
197             =cut
198              
199             sub program_name {
200             return $PROGRAM_NAME;
201             }
202              
203             =head2 program_dir
204              
205             Title : program_dir
206             Usage : $factory->program_dir(@params)
207             Function: returns the program directory, obtained from ENV variable.
208             Returns : string
209             Args : None
210              
211             =cut
212              
213             sub program_dir {
214             return $PROGRAM_DIR;
215             }
216              
217             =head2 new
218              
219             Title : new
220             Usage : $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(@params)
221             Function: Creates a new PhastCons factory
222             Returns : Bio::Tools::Run::Phylo::Phast::PhastCons
223             Args : Optionally, provide any of the following (defaults are not to use,
224             see the same-named methods for information on what each option does):
225             {
226             -target_coverage => number between 0 and 1
227             AND
228             -expected_length => int
229             }
230             -rho => number between 0 and 1
231             -quiet => boolean (turn on or off program output to console)
232              
233             Most other options understood by phastCons can be supplied as key =>
234             value pairs in this way. Options that don't normally take a value
235             should be given a value of 1. You can type the keys as you would on
236             the command line (eg. '--indels-only' => 1) or with only a single
237             hyphen to start and internal hyphens converted to underscores (eg.
238             -indels_only => 1) to avoid having to quote the key.
239              
240             These options can NOT be used with this wrapper currently:
241             estimate_trees / T
242             estimate_rho / O
243             gc / G
244             msa_format / i
245             score / s
246             no_post_probs / n
247             seqname / N
248             idpref / P
249             help / h
250             alias / A
251             most_conserved / V / viterbi
252             refidx / r
253              
254             =cut
255              
256             sub new {
257             my ($class, @args) = @_;
258             my $self = $class->SUPER::new(@args);
259            
260             $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS),
261             (map { $_ => $SWITCHES{$_} } keys %SWITCHES)},
262             -create => 1);
263            
264             return $self;
265             }
266              
267             =head2 target_coverage
268              
269             Title : target_coverage
270             Usage : $factory->target_coverage(0.25);
271             Function: Constrain transition parameters such that the expected fraction of
272             sites in conserved elements is the supplied value.
273             Returns : number (default undef)
274             Args : None to get, number (between 0 and 1) to set
275              
276             =cut
277              
278             sub target_coverage {
279             my ($self, $num) = @_;
280             if (defined ($num)) {
281             ($num > 0 && $num < 1) || $self->throw("target_coverage value must be between 0 and 1, exclusive");
282             $self->{coverage} = $num;
283             }
284             return $self->{coverage} || return;
285             }
286              
287             =head2 expected_length
288              
289             Title : expected_length
290             Usage : $factory->expected_length(5);
291             Function: Set transition probabilities such that the expected length of a
292             conserved element is the supplied value. target_coverage() must also
293             be set.
294             Returns : int (default undef)
295             Args : None to get, int to set
296              
297             =cut
298              
299             # created automatically
300              
301             =head2 rho
302              
303             Title : rho
304             Usage : $factory->rho(0.3);
305             Function: Set the *scale* (overall evolutionary rate) of the model for the
306             conserved state to be the supplied number times that of the model for
307             the non-conserved state (default 0.3).
308             Returns : number (default undef)
309             Args : None to get, number (between 0 and 1) to set
310              
311             =cut
312              
313             sub rho {
314             my ($self, $num) = @_;
315             if (defined ($num)) {
316             ($num > 0 && $num < 1) || $self->throw("rho value must be between 0 and 1, exclusive");
317             $self->{rho} = $num;
318             }
319             return $self->{rho} || return;
320             }
321              
322             =head2 run
323              
324             Title : run
325             Usage : $result = $factory->run($fasta_align_file, $newick_tree_file);
326             -or-
327             $result = $factory->run($align_object, $tree_object);
328             -or-
329             $result = $factory->run($align_object, $db_taxonomy_object);
330             Function: Runs phastCons on an alignment to find the most conserved regions
331             ('footprinting').
332             Returns : array of Bio::SeqFeature::Annotated (one feature per alignment
333             sequence and prediction)
334             Args : The first argument represents an alignment, the second argument
335             a species tree.
336             The alignment can be provided as a multi-fasta format alignment
337             filename, or a Bio::Align::AlignI compliant object (eg. a
338             Bio::SimpleAlign).
339             The species tree can be provided as a newick format tree filename
340             or a Bio::Tree::TreeI compliant object. Alternatively a
341             Bio::DB::Taxonomy object can be supplied, in which case the species
342             tree will be generated by using the alignment sequence names as
343             species names and looking for those in the supplied database.
344              
345             In all cases, the alignment sequence names must correspond to node
346             ids in the species tree. Multi-word species names should be joined
347             with underscores to form the sequence names, eg. Homo_sapiens
348              
349             =cut
350              
351             sub run {
352             my ($self, $aln, $tree) = @_;
353            
354             ($aln && $tree) || $self->throw("alignment and tree must be supplied");
355             my $aln_obj = $self->_alignment($aln);
356             $tree = $self->_tree($tree);
357            
358             # if aln was a file, set the alignment id to match file name
359             if (-e $aln) {
360             my $aln_id = basename($aln);
361             ($aln_id) = $aln_id =~ /^([^\.]+)/;
362             $aln_obj->id($aln_id);
363             }
364            
365             return $self->_run;
366             }
367              
368             sub _run {
369             my $self = shift;
370            
371             my $exe = $self->executable || return;
372            
373             # use phyloFit to generate tree model initialization (?) using species tree
374             # and alignment
375             my $pf = Bio::Tools::Run::Phylo::Phast::PhyloFit->new(-verbose => $self->verbose, -quiet => $self->quiet);
376             my $init_mod = $pf->run($self->_alignment, $self->_tree) || $self->throw("phyloFit failed to work as expected, is it installed?");
377            
378             # cd to a temp dir
379             my $temp_dir = $self->tempdir;
380             my $cwd = Cwd->cwd();
381             chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'");
382            
383             my $aln_file = $self->_write_alignment;
384            
385             # do training for parameter estimation
386             my $command = $exe.$self->_setparams($aln_file, $init_mod);
387             $self->debug("phastCons training command = $command\n");
388             system($command) && $self->throw("phastCons training call ($command) crashed: $?");
389            
390             # do the final analysis
391             $command = $exe.$self->_setparams($aln_file);
392             $self->debug("phastCons command = $command\n");
393             system($command) && $self->throw("phastCons call ($command) crashed: $?");
394            
395             # read in most_cons.bed as the result
396             my $bedin = Bio::FeatureIO->new(-format => 'bed', -file => 'most_cons.bed');
397            
398             # cd back to orig dir
399             chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'");
400            
401             my @feats = ();
402             my $aln = $self->_alignment;
403             while (my $feat = $bedin->next_feature) {
404             $feat->source_tag('phastCons');
405             my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'predicted', -value => 1);
406             $feat->annotation->add_Annotation($sv);
407             # $feat->type('TF_binding_site'); causes seg fault in subsequent clone()
408            
409             # features are in zero-based alignment coords; make a feature for each
410             # alignment sequence
411             foreach my $seq ($aln->each_seq) {
412             my $clone = clone($feat);
413             # $clone->type('TF_binding_site'); causes massive slowdown if you later store/retrieve these features from Bio::DB::SeqFeature database
414            
415             # give it the correct id
416             $clone->seq_id($seq->id);
417            
418             # check and correct the coords (sequence may not have the feature)
419             my $sloc = $seq->location_from_column($feat->start + 1) || next;
420             my $eloc = $seq->location_from_column($feat->end + 1) || next;
421             $clone->start($sloc->start - 1);
422             $clone->end($eloc->end - 1);
423            
424             push(@feats, $clone);
425             }
426             }
427             return @feats;
428             }
429              
430             =head2 _setparams
431              
432             Title : _setparams
433             Usage : Internal function, not to be called directly
434             Function: Creates a string of params to be used in the command string
435             Returns : string of params
436             Args : alignment file name for result production, AND filename of phyloFit
437             generated init.mod file to estimate trees
438              
439             =cut
440              
441             sub _setparams {
442             my ($self, $aln_file, $init_mod) = @_;
443            
444             my $param_string = $self->SUPER::_setparams(-params => [keys %PARAMS],
445             -switches => [keys %SWITCHES],
446             -double_dash => 1,
447             -underscore_to_dash => 1);
448            
449             $param_string .= ' --no-post-probs';
450             my $aln_id = $self->_alignment->id;
451             $param_string .= " --seqname $aln_id --idpref $aln_id" if $aln_id;
452             $param_string .= ' --refidx 0';
453            
454             my $input = ' --msa-format FASTA '.$aln_file;
455             if ($init_mod) {
456             $param_string .= ' --estimate-trees mytrees '.$input.' '.$init_mod;
457             }
458             else {
459             $param_string .= $input.' --most-conserved most_cons.bed --score mytrees.cons.mod,mytrees.noncons.mod';
460             }
461            
462             return $param_string;
463             }
464              
465             1;