File Coverage

blib/lib/Bio/Tools/Run/TigrAssembler.pm
Criterion Covered Total %
statement 17 65 26.1
branch 2 20 10.0
condition 0 3 0.0
subroutine 4 6 66.6
pod 1 1 100.0
total 24 95 25.2


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::Tools::Run::TigrAssembler
2             #
3             # Copyright Florent E Angly
4             #
5             # You may distribute this module under the same terms as perl itself
6             #
7             # POD documentation - main docs before the code
8              
9             =head1 NAME
10              
11             Bio::Tools::Run::TigrAssembler - Wrapper for local execution of TIGR Assembler
12             v2
13              
14             =head1 SYNOPSIS
15              
16             use Bio::Tools::Run::TigrAssembler;
17             # Run TIGR Assembler using an input FASTA file
18             my $factory = Bio::Tools::Run::TigrAssembler->new( -minimum_overlap_length => 35 );
19             my $asm_obj = $factory->run($fasta_file, $qual_file);
20             # An assembly object is returned by default
21             for my $contig ($assembly->all_contigs) {
22             ... do something ...
23             }
24              
25             # Read some sequences
26             use Bio::SeqIO;
27             my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
28             my @seqs;
29             while (my $seq = $sio->next_seq()) {
30             push @seqs,$seq;
31             }
32              
33             # Run TIGR Assembler with input sequence objects and return an assembly file
34             my $asm_file = 'results.tigr';
35             $factory->out_type($asm_file);
36             $factory->run(\@seqs);
37              
38             # Use LIGR Assembler instead
39             my $ligr = Bio::Tools::Run::TigrAssembler->new(
40             -program_name => 'LIGR_Assembler',
41             -trimmed_seq => 1
42             );
43             $ligr->run(\@seqs);
44              
45             =head1 DESCRIPTION
46              
47             Wrapper module for the local execution of the DNA assembly program TIGR
48             Assembler v2.0. TIGR Assembler is open source software under The Artistic
49             License and available at: http://www.tigr.org/software/assembler/
50              
51             This module runs TIGR Assembler by feeding it a FASTA file or sequence objects
52             and returning an assembly file or assembly and IO objects. When the input is
53             Bioperl object, sequences less than 39 bp long are filtered out since they are
54             not supported by TIGR Assembler.
55              
56             If provided in the following way, TIGR Assembler will use additional
57             information present in the sequence descriptions for assembly:
58             >seq_name minimum_clone_length maximum_clone_length median_clone_length
59             clear_end5 clear_end3
60             or
61             >db|seq_name minimum_clone_length maximum_clone_length median_clone_length
62             clear_end5 clear_end3
63             e.g.
64             >GHIBF57F 500 3000 1750 33 587
65              
66             This module also supports LIGR Assembler, a variant of TIGR Assembler:
67             http://sourceforge.net/projects/ligr-assembler/
68              
69             =head1 FEEDBACK
70              
71             =head2 Mailing Lists
72              
73             User feedback is an integral part of the evolution of this and other Bioperl
74             modules. Send your comments and suggestions preferably to one of the Bioperl
75             mailing lists. Your participation is much appreciated.
76              
77             bioperl-l@bioperl.org - General discussion
78             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79              
80             =head2 Support
81              
82             Please direct usage questions or support issues to the mailing list:
83              
84             I
85              
86             rather than to the module maintainer directly. Many experienced and
87             reponsive experts will be able look at the problem and quickly
88             address it. Please include a thorough description of the problem
89             with code and data examples if at all possible.
90              
91             =head2 Reporting Bugs
92              
93             Report bugs to the Bioperl bug tracking system to help us keep track the bugs
94             and their resolution. Bug reports can be submitted via the web:
95              
96             http://redmine.open-bio.org/projects/bioperl/
97              
98             =head1 AUTHOR - Florent E Angly
99              
100             Email: florent-dot-angly-at-gmail-dot-com
101              
102             =head1 APPENDIX
103              
104             The rest of the documentation details each of the object methods. Internal
105             methods are usually preceded with a _
106              
107             =cut
108              
109              
110             package Bio::Tools::Run::TigrAssembler;
111              
112 1     1   132957 use strict;
  1         1  
  1         24  
113 1     1   4 use IPC::Run;
  1         1  
  1         32  
114              
115 1     1   3 use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase );
  1         1  
  1         456  
116              
117             our $program_name = 'TIGR_Assembler'; # name of the executable
118             our @program_params = (qw( minimum_percent minimum_length max_err_32 quality_file
119             maximum_end resort_after ));
120             our @program_switches = (qw( include_singlets consider_low_scores safe_merging_stop
121             ignore_tandem_32mers use_tandem_32mers not_random incl_bad_seq trimmed_seq ));
122             our %param_translation = (
123             'quality_file' => 'q',
124             'minimum_percent' => 'p',
125             'minimum_length' => 'l',
126             'include_singlets' => 's',
127             'max_err_32' => 'g',
128             'consider_low_scores' => 'L',
129             'maximum_end' => 'e',
130             'ignore_tandem_32mers' => 't',
131             'use_tandem_32mers' => 'u',
132             'safe_merging_stop' => 'X',
133             'not_random' => 'N',
134             'resort_after' => 'r',
135             'incl_bad_seq' => 'b',
136             'trimmed_seq' => 'i'
137             );
138             our $qual_param = 'quality_file';
139             our $use_dash = 1;
140             our $join = ' ';
141             our $asm_format = 'tigr';
142             our $min_len = 39;
143              
144              
145             =head2 new
146              
147             Title : new
148             Usage : $factory->new( -minimum_percent => 95,
149             -minimum_length => 50,
150             -include_singlets => 1 );
151             Function: Create a TIGR Assembler factory
152             Returns : A Bio::Tools::Run::TigrAssembler object
153             Args :
154              
155             TIGR Assembler options available in this module:
156              
157             minimum_percent / minimum_overlap_similarity: the minimum percent identity
158             that two DNA fragments must achieve over their entire region of overlap in
159             order to be considered as a possible assembly. Adjustments are made by the
160             program to take into account that the ends of sequences are lower quality
161             and doubled base calls are the most frequent sequencing error.
162             minimum_length / minimum_overlap_length: the minimum length two DNA fragments
163             must overlap to be considered as a possible assembly (warning: this option
164             is not strictly respected by TIGR Assembler...)
165             include_singlets: a flag which indicates that singletons (assemblies made up
166             of a single DNA fragment) should be included in the lassie output_file - the
167             default is to not include singletons.
168             max_err_32: the maximum number + 1 of alignment errors (mismatches or gaps)
169             allowed within any contiguous 32 base pairs in the overlap region between
170             two DNA fragments in the same assembly. This is meant to split apart splice
171             variants which have short splice differences and would not be disqualified
172             by the -p minimum_percent parameter.
173             consider_low_scores: a flag which causes even very LOW pairwise scores to be
174             considered - caution using this flag may cause longer run time and a worse
175             assembly.
176             maximum_end: the maximum length at the end of a DNA fragment that does not
177             match another overlapping DNA fragment (sometimes referred to as overhang)
178             that will not disqualify a DNA fragment from becoming part of an assembly.
179             ignore_tandem_32mers: a flag which causes tandem 32mers (a tandem 32mer is a
180             32mer which occurs more than once in at least one sequence read) to be
181             ignored (this is now the default behavior and this flag is for backward
182             compatibility)
183             use_tandem_32mers: a flag which causes tandem 32mers to be used for pairwise
184             comparison opposite of the -t flag which is now the default).
185             safe_merging_stop: a flag which causes merging to stop when only sequences
186             which appear to be repeats are left and these cannot be merged based on
187             clone length constraints.
188             not_random: a flag which indicates that the DNA fragments in the input_file
189             should not be treated as random genomic fragments for the purpose of
190             determining repeat regions.
191             resort_after: specifies how many sequences should be merged before resorting
192             the possible merges based on clone constraints.
193              
194             LIGR Assembler has the same options as TIGR Assembler, and the following:
195              
196             incl_bad_seq: keep all sequences including potential chimeras and splice variants
197             trimmed_seq: indicates that the sequences are trimmed. High quality scores will be
198             given on the whole sequence length instead of just in the middle)
199              
200             =cut
201              
202             sub new {
203 2     2 1 10403 my ($class,@args) = @_;
204 2         16 my $self = $class->SUPER::new(@args);
205 2         52 $self->_set_program_options(\@args, \@program_params, \@program_switches,
206             \%param_translation, $qual_param, $use_dash, $join);
207 2         6 *minimum_overlap_length = \&minimum_length;
208 2         4 *minimum_overlap_similarity = \&minimum_percent;
209 2 100       6 $self->program_name($program_name) if not defined $self->program_name();
210 2         9 $self->_assembly_format($asm_format);
211 2         9 return $self;
212             }
213              
214              
215             =head2 out_type
216              
217             Title : out_type
218             Usage : $factory->out_type('Bio::Assembly::ScaffoldI')
219             Function: Get/set the desired type of output
220             Returns : The type of results to return
221             Args : Desired type of results to return (optional):
222             'Bio::Assembly::IO' object
223             'Bio::Assembly::ScaffoldI' object (default)
224             The name of a file to save the results in
225              
226             =cut
227              
228              
229             =head2 run
230              
231             Title : run
232             Usage : $factory->run($fasta_file);
233             Function: Run TIGR Assembler
234             Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO
235             object, a filename, or undef if all sequences were too small to
236             be usable
237             Returns : Assembly results (file, IO object or assembly object)
238             Args : - sequence input (FASTA file or sequence object arrayref)
239             - optional quality score input (QUAL file or quality score object
240             arrayref)
241             =cut
242              
243              
244             =head2 _run
245              
246             Title : _run
247             Usage : $assembler->_run()
248             Function: Make a system call and run Newbler
249             Returns : An assembly file
250             Args : - FASTA file, SFF file and MID, or analysis dir and MID
251             - optional QUAL file
252              
253             =cut
254              
255             sub _run {
256 0     0     my ($self, $fasta_file, $qual_file) = @_;
257              
258             # Setup needed files and filehandles first
259 0           my ($output_fh, $output_file ) = $self->_prepare_output_file( );
260 0           my ($scratch_fh, $scratch_file) = $self->io->tempfile( -dir => $self->tempdir() );
261 0           my ($stderr_fh, $stderr_file ) = $self->io->tempfile( -dir => $self->tempdir() );
262              
263             # Get program executable
264 0           my $exe = $self->executable;
265              
266             # Get command-line options
267 0           my $options = $self->_translate_params();
268              
269             # Usage: TIGR_Assembler [options] scratch_file < input_file > output_file
270 0           my @program_args = ( $exe, @$options, $scratch_file );
271 0           my $stdin = $fasta_file;
272 0           my $stdout = $output_file;
273 0           my $stderr = $stderr_file;
274              
275 0           my @ipc_args = ( \@program_args,
276             '<', $fasta_file,
277             '>', $output_file,
278             '2>', $stderr_file );
279              
280             # Print command for debugging
281 0 0         if ($self->verbose() >= 0) {
282 0           my $cmd = '';
283 0           $cmd .= join ( ' ', @program_args );
284 0           for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) {
285 0           my $element = $ipc_args[$i];
286 0           my $ref = ref($element);
287 0           my $value;
288 0 0 0       if ( $ref && $ref eq 'SCALAR') {
289 0           $value = $$element;
290             } else {
291 0           $value = $element;
292             }
293 0           $cmd .= " $value";
294             }
295 0           $self->debug( "$exe command = $cmd\n" );
296             }
297              
298             # Execute command
299 0           eval {
300 0 0         IPC::Run::run(@ipc_args) || die("There was a problem running $exe: $!");
301             };
302 0 0         if ($@) {
303 0           $self->throw("$exe call crashed: $@");
304             }
305 0 0         $self->debug(join("\n", "$exe STDERR", $stderr_file)) if $stderr_file;
306             # TIGR Assembler's stderr reports a lot more than just errors
307            
308             # Close filehandles
309 0           close($scratch_fh);
310 0           close($output_fh);
311 0           close($stderr_fh);
312            
313             # Import assembly
314 0           return $output_file;
315             }
316              
317              
318             =head2 _remove_small_sequences
319              
320             Title : _remove_small_sequences
321             Usage : $assembler->_remove_small_sequences(\@seqs, \@quals)
322             Function: Remove sequences below a threshold length
323             Returns : a new sequence object array reference
324             a new quality score object array reference
325             Args : sequence object array reference
326             quality score object array reference (optional)
327              
328             =cut
329              
330             # Aliasing function _prepare_input_sequences to _remove_small_sequences
331             *_prepare_input_sequences = \&_remove_small_sequences;
332              
333             sub _remove_small_sequences {
334 0     0     my ($self, $seqs, $quals) = @_;
335             # The threshold length, $min_len, has been registered as a global variable
336 0           my @new_seqs;
337             my @new_quals;
338              
339 0 0         if (ref($seqs) =~ m/ARRAY/i) {
340 0           my @removed;
341 0           my $nof_seqs = scalar @$seqs;
342 0           for my $i (1 .. $nof_seqs) {
343 0           my $seq = $$seqs[$i-1];
344 0 0         if ($seq->length >= $min_len) {
345 0           push @new_seqs, $seq;
346 0 0         if ($quals) {
347 0           my $qual = $$quals[$i-1];
348 0           push @new_quals, $qual;
349             }
350             } else {
351 0           push @removed, $seq->id;
352             }
353             }
354 0 0         if (scalar @removed > 0) {
355 0           $self->warn("The following sequences were removed because they are smaller".
356             " than $min_len bp: @removed\n");
357             }
358             }
359 0           return \@new_seqs, \@new_quals;
360             }
361              
362             1;