File Coverage

blib/lib/Bio/Tools/Run/Mdust.pm
Criterion Covered Total %
statement 59 119 49.5
branch 16 64 25.0
condition 3 8 37.5
subroutine 14 19 73.6
pod 6 6 100.0
total 98 216 45.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Mdust
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Donald Jackson, donald.jackson@bms.com
7             #
8             # Copyright Donald Jackson
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Mdust - Perl extension for Mdust nucleotide filtering
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Run::Mdust;
21             my $mdust = Bio::Tools::Run::Mdust->new();
22              
23             $mdust->run($bio_seq_object);
24              
25             =head1 DESCRIPTION
26              
27             Perl wrapper for the nucleic acid complexity filtering program
28             B as available from TIGR via
29             L. Takes a Bio::SeqI or
30             Bio::PrimarySeqI object of type DNA as input.
31              
32             If a Bio::Seq::RichSeqI is passed then the low-complexity regions will
33             be added to the feature table of the target object as
34             Bio::SeqFeature::Generic items with primary tag = 'Excluded' .
35             Otherwise a new target object will be returned with low-complexity
36             regions masked (by N's or other character as specified by maskchar()).
37              
38             The mdust executable must be in a directory specified with either the
39             PATH or MDUSTDIR environment variable.
40              
41             =head1 SEE ALSO
42              
43             L,
44             L,
45             L,
46             L
47              
48             =head1 FEEDBACK
49              
50             =head2 Mailing Lists
51              
52             User feedback is an integral part of the evolution of this and other
53             Bioperl modules. Send your comments and suggestions preferably to
54             the Bioperl mailing list. Your participation is much appreciated.
55              
56             bioperl-l@bioperl.org - General discussion
57             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58              
59             =head2 Support
60              
61             Please direct usage questions or support issues to the mailing list:
62              
63             I
64              
65             rather than to the module maintainer directly. Many experienced and
66             reponsive experts will be able look at the problem and quickly
67             address it. Please include a thorough description of the problem
68             with code and data examples if at all possible.
69              
70             =head2 Reporting Bugs
71              
72             Report bugs to the Bioperl bug tracking system to help us keep track
73             of the bugs and their resolution. Bug reports can be submitted via
74             the web:
75              
76             http://redmine.open-bio.org/projects/bioperl/
77              
78             =head1 AUTHOR
79              
80             Donald Jackson (donald.jackson@bms.com)
81              
82             =head1 APPENDIX
83              
84             The rest of the documentation details each of the object methods.
85             Internal methods are usually preceded with a _
86              
87             =cut
88              
89              
90             package Bio::Tools::Run::Mdust;
91              
92             require 5.005_62;
93 1     1   104339 use strict;
  1         3  
  1         24  
94              
95 1     1   297 use Bio::SeqIO;
  1         36827  
  1         31  
96 1     1   323 use Bio::SeqFeature::Generic;
  1         38052  
  1         28  
97 1     1   8 use Bio::Root::Root;
  1         1  
  1         14  
98 1     1   4 use Bio::Root::IO;
  1         2  
  1         17  
99 1     1   301 use Bio::Tools::Run::WrapperBase;
  1         2  
  1         27  
100              
101 1     1   5 use vars qw($AUTOLOAD $PROGRAMNAME @ARGNAMES @MASKCHARS $VERSION @ISA);
  1         2  
  1         1127  
102              
103             @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
104             @ARGNAMES = qw(TARGET WSIZE CUTOFF MASKCHAR COORDS TMPDIR DEBUG);
105             $PROGRAMNAME = 'mdust';
106              
107             @MASKCHARS = qw(N X L);
108              
109             =head2 new
110              
111             Title : new
112             Usage : my $mdust = Bio::Tools::Run::Mdust->new( -target => $target_bioseq)
113             Purpose : Create a new mdust object
114             Returns : A Bio::Seq object
115             Args : target - Bio::Seq object for masking - alphabet MUST be DNA.
116             wsize - word size for masking (default = 3)
117             cutoff - cutoff score for masking (default = 28)
118             maskchar - character for replacing masked regions (default = N)
119             coords - boolean - indicate low-complexity regions as
120             Bio::SeqFeature::Generic
121             objects with primary tag 'Excluded',
122             do not change sequence (default 0)
123             tmpdir - directory for storing temporary files
124             debug - boolean - toggle debugging output,
125             do not remove temporary files
126             Notes : All of the arguments can also be get/set with their own accessors, such as:
127             my $wsize = $mdust->wsize();
128              
129             When processing multiple sequences, call Bio::Tools::Run::Mdust->new() once
130             then pass each sequence as an argument to the target() or run() methods.
131             =cut
132              
133             sub new {
134 1     1 1 59223 my ($proto, @args) = @_;
135 1   33     7 my $pkg = ref($proto) || $proto;
136 1         2 my %args;
137              
138 1         5 my $self = { wsize => undef,
139             cutoff => undef,
140             maskchar => undef,
141             coords => 0,
142             };
143              
144 1         3 bless ($self, $pkg);
145              
146 1         7 @args{@ARGNAMES} = $self->_rearrange(\@ARGNAMES, @args);
147              
148             # load target first since it requires special handling
149 1 50       28 $self->target($args{'TARGET'}) if ($args{'TARGET'});
150              
151             # package settings
152 1 50       3 $self->{'coords'} = $args{'COORDS'} if (defined $args{'COORDS'});
153 1   50     10 $self->{'tmpdir'} = $args{'TMPDIR'} || $ENV{'TMPDIR'} || $ENV{'TMP'} || '.';
154              
155             # mdust options
156 1 50       3 $self->{'wsize'} = $args{'WSIZE'} if (defined $args{'WSIZE'});
157 1 50       3 $self->{'cutoff'} = $args{'CUTOFF'} if (defined $args{'CUTOFF'});
158 1 50       4 $self->{'maskchar'} = $args{'MASKCHAR'} if (defined $args{'CUTOFF'});
159              
160              
161             # set debugging
162 1         21 $self->verbose($args{'DEBUG'});
163 1         20 return $self;
164             }
165              
166             =head2 run
167              
168             Title : run
169             Usage : $mdust->run();
170             Purpose : Run mdust on the target sequence
171             Args : target (optional) - Bio::Seq object of alphabet DNA for masking
172             Returns : Bio::Seq object with masked sequence or low-complexity regions added to feature table.
173              
174             =cut
175              
176             sub run {
177 0     0 1 0 my ($self, $target) = @_;
178              
179 0 0       0 if ($target) {
180 0         0 $self->target($target);
181             }
182              
183 0         0 return $self->_run_mdust;
184             }
185              
186             sub program_dir {
187 3 50   3 1 10 return Bio::Root::IO->catfile($ENV{MDUSTDIR}) if $ENV{MDUSTDIR};
188             }
189              
190              
191             sub program_name {
192 6     6 1 44 return $PROGRAMNAME;
193             }
194              
195             sub _run_mdust {
196             # open a pipe to the mdust command. Pass in sequence(s?) as fasta
197             # files on STDIN, recover filtered seqs on STDOUT
198 0     0   0 my ($self) = @_;
199            
200 0 0       0 my $target = $self->target or warn "No target sequence specified\n" && return undef;
201              
202             # make sure program is available - doesn't seem to check
203             #my $executable = $self->executable('mdust', 1);
204              
205             # add options
206 0         0 my $mdust_cmd = $self->program_path;
207 0 0       0 $mdust_cmd .= " -w " . $self->wsize if (defined $self->wsize);
208 0 0       0 $mdust_cmd .= " -v " . $self->cutoff if (defined $self->cutoff);
209 0 0       0 $mdust_cmd .= " -m " . $self->maskchar if (defined $self->maskchar);
210 0 0       0 $mdust_cmd .= " -c" if ($self->coords);
211 0 0       0 print STDERR "Running mdust: $mdust_cmd\n" if ($self->debug);
212 0         0 my $maskedfile = $self->_maskedfile;
213 0         0 eval {
214 0         0 my $pid = open (MDUST, "| $mdust_cmd > $maskedfile"); # bind STDIN of mdust to filehandle
215              
216 0         0 local $| = 1;
217 0         0 my $seqout = Bio::SeqIO->new(-fh => \*MDUST, -format => 'Fasta');
218 0         0 $seqout->write_seq($target);
219 0         0 close MDUST; # need to do this to get output to flush!
220             };
221              
222 0 0       0 $self->throw($@) if ($@);
223              
224 0         0 my $rval;
225              
226 0 0       0 if ($self->coords) {
227 0         0 $self->_parse_coords($maskedfile);
228 0         0 $rval = $self->target;
229             }
230             else { # replace original seq w/ masked seq
231 0         0 my $seqin = Bio::SeqIO->new(-file=>$maskedfile, -format => 'Fasta');
232 0         0 $rval = $seqin->next_seq
233             }
234              
235 0 0       0 unlink $maskedfile unless $self->save_tempfiles;
236              
237 0         0 return $rval;
238              
239             }
240              
241             =head2 target
242              
243             Title : target
244             Usage : $mdust->target($bio_seq)
245             Purpose : Set/get the target (sequence to be filtered).
246             Returns : Target Bio::Seq object
247             Args : Bio::SeqI or Bio::PrimarySeqI object using the DNA alphabet (optional)
248             Note : If coordinate parsing is selected ($mdust->coords = 1) then target
249             MUST be a Bio::Seq::RichSeqI object. Passing a RichSeqI object automatically
250             turns on coordinate parsing.
251              
252             =cut
253              
254             sub target {
255 1     1 1 3 my ($self, $targobj) = @_;
256              
257 1 50       3 if ($targobj) {
258 1         5 return $self->_set_target($targobj);
259             }
260             else {
261 0         0 return $self->{'target'};
262             }
263              
264             }
265              
266              
267             sub _set_target {
268 1     1   3 my ($self, $targobj) = @_;
269              
270 1 50 33     6 unless ($targobj->isa('Bio::SeqI') or ($targobj->isa('Bio::PrimarySeqI'))) {
271 0         0 $self->throw( -text => "Target must be passed as a Bio::SeqI or Bio::PrimarySeqI object",
272             -class => 'Bio::Root::BadParameter',
273             -value => $targobj );
274             }
275              
276              
277 1 50       9 if ($self->coords) {
    50          
278 0 0       0 unless ($targobj->isa('Bio::Seq::RichSeqI')) {
279 0         0 $self->throw( -text => "Target must be passed as a Bio::Seq::RichSeqSeqI object when coords == 1",
280             -class => 'Bio::Root::BadParameter',
281             -value => $targobj );
282             }
283             }
284             elsif ($targobj->isa('Bio::Seq::RichSeqI')) {
285 1         4 $self->coords(1);
286             }
287              
288              
289 1 50       11 unless ($targobj->alphabet eq 'dna') {
290 0         0 $self->throw( -text => "Target must be a DNA sequence",
291             -class => 'Bio::Root::BadParameter',
292             -value => $targobj );
293             }
294              
295 1         17 $self->{'target'} = $targobj;
296 1         3 return 1;
297              
298             }
299              
300             sub _maskedfile {
301 0     0   0 my ($self, $file) = @_;
302 0         0 my $tmpdir = $self->tempdir;
303              
304 0 0       0 if ($file) {
    0          
305 0         0 $self->{'maskedfile'} = $file;
306             # add some sanity chex for writability?
307             }
308             elsif (!$self->{'maskedfile'}) {
309 0         0 ($self->{'maskedfh'},$self->{'maskedfile'}) =
310             $self->io->tempfile(-dir=>$self->tempdir());
311             }
312 0         0 return $self->{'maskedfile'};
313              
314             }
315              
316             sub _parse_coords {
317 0     0   0 my ($self, $file) = @_;
318 0         0 my $target = $self->target;
319 0 0       0 open(FILE, $file) or die "Unable to open $file: $!";
320 0         0 while () {
321 0         0 chomp;
322 0         0 s/\r//;
323 0         0 my ($seq, $length, $mstart, $mstop) = split(/\t/);
324              
325             # add masked region as a SeqFeature in target
326 0         0 my $masked = Bio::SeqFeature::Generic->new( -start => $mstart,
327             -end => $mstop,
328             );
329 0         0 $masked->primary_tag('Excluded');
330 0         0 $masked->source_tag('mdust');
331              
332 0         0 $target->add_SeqFeature($masked);
333             }
334 0         0 return 1;
335             }
336              
337             =head2 maskchar
338              
339             Title : maskchar
340             Usage : $mdust->maskchar('N')
341             Purpose : Set/get the character for masking low-complexity regions
342             Returns : True on success
343             Args : Either N (default), X or L (lower case)
344              
345             =cut
346              
347             sub maskchar {
348 0     0 1 0 my ($self, $maskchar) = @_;
349              
350 0 0       0 return $self->{'maskchar'} unless (defined $maskchar);
351              
352 0 0       0 unless ( grep {$maskchar eq $_} @MASKCHARS ) {
  0         0  
353 0         0 $self->throw( -text => "maskchar must be one of N, X or L",
354             -class => 'Bio::Root::BadParameter',
355             -value => $maskchar );
356             }
357 0         0 $self->{'maskchar'} = $maskchar;
358              
359 0         0 1;
360             }
361              
362             sub DESTROY {
363 1     1   1901 my $self= shift;
364 1 50       24 unless ( $self->save_tempfiles ) {
365 1         17 $self->cleanup();
366             }
367 1         5 $self->SUPER::DESTROY();
368             }
369              
370              
371             sub AUTOLOAD {
372 2     2   5 my ($self, $value) = @_;
373 2         3 my $name = $AUTOLOAD;
374 2         9 $name =~ s/.+:://;
375              
376 2 50       7 return if ($name eq 'DESTROY');
377              
378              
379 2 100       4 if (defined $value) {
380 1         3 $self->{$name} = $value;
381             }
382              
383 2 50       25 unless (exists $self->{$name}) {
384 0 0       0 warn "Attribute $name not defined for ", ref($self), "\n" if ($self->debug);
385 0         0 return undef;
386             }
387              
388 2         8 return $self->{$name};
389             }
390              
391             1;