File Coverage

blib/lib/Bio/Polloc/Rule/tandemrepeat.pm
Criterion Covered Total %
statement 50 122 40.9
branch 6 62 9.6
condition 0 12 0.0
subroutine 13 15 86.6
pod 3 3 100.0
total 72 214 33.6


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Bio::Polloc::Rule::tandemrepeat - A rule of type tandemrepeat
4              
5             =head1 DESCRIPTION
6              
7             This rule is similar to Bio::Polloc::Rule::repeat, but employes TRF (Benson 1999, NAR 27(2):573-580)
8             for the repeats calculation.
9              
10             =head1 AUTHOR - Luis M. Rodriguez-R
11              
12             Email lmrodriguezr at gmail dot com
13              
14             =cut
15              
16             package Bio::Polloc::Rule::tandemrepeat;
17 4     4   27512 use base qw(Bio::Polloc::RuleI);
  4         10  
  4         1416  
18 4     4   26 use strict;
  4         7  
  4         134  
19 4     4   21 use Bio::Polloc::Polloc::IO;
  4         8  
  4         76  
20 4     4   2535 use Bio::Polloc::LocusI;
  4         10  
  4         126  
21 4     4   4482 use Bio::SeqIO;
  4         222918  
  4         178  
22             # For TRF:
23 4     4   47 use File::Spec;
  4         10  
  4         93  
24 4     4   24 use File::Basename;
  4         7  
  4         514  
25 4     4   24 use Cwd;
  4         8  
  4         399  
26 4     4   29 use Config;
  4         9  
  4         7360  
27             our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version
28              
29              
30             =head1 APPENDIX
31              
32             Methods provided by the package
33              
34             =head2 new
35              
36             Generic initialization method.
37              
38             =cut
39              
40             sub new {
41 0     0 1 0 my($caller,@args) = @_;
42 0         0 my $self = $caller->SUPER::new(@args);
43 0         0 $self->_initialize(@args);
44 0         0 return $self;
45             }
46              
47             =head2 execute
48              
49             This is where magic happens. Translates the parameters of the object into a call to
50             B<TRF>, and scans the sequence for repeats.
51              
52             =head3 Arguments
53              
54             The sequence (C<-seq>) as a L<Bio::Seq> or a L<Bio::SeqIO> object.
55              
56             =head3 Returns
57              
58             An array reference populated with L<Bio::Polloc::Locus::tandemrepeat> objects.
59              
60             =cut
61              
62             sub execute {
63 0     0 1 0 my($self,@args) = @_;
64 0         0 my($seq) = $self->_rearrange([qw(SEQ)], @args);
65            
66 0 0       0 $self->throw("You must provide a sequence to evaluate the rule", $seq) unless $seq;
67 0 0       0 $self->throw("You must provide an object as sequence", $seq)
68             unless UNIVERSAL::can($seq,'isa');
69            
70             # For Bio::SeqIO objects
71 0 0       0 if($seq->isa('Bio::SeqIO')){
72 0         0 my @feats = ();
73 0         0 while(my $s = $seq->next_seq){
74 0         0 push(@feats, @{$self->execute(-seq=>$s)})
  0         0  
75             }
76 0 0       0 return wantarray ? @feats : \@feats;
77             }
78            
79 0 0       0 $self->throw("Illegal class of sequence '".ref($seq)."'", $seq) unless $seq->isa('Bio::Seq');
80              
81             # Include safe_value parameters
82             # MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE PM PI
83 0         0 my %c_v = (
84             "minsize" => 0, "maxsize" => 1e20,
85             "minperiod" => 0, "maxperiod" => 500,
86             "exp" => 0,
87             "match" => 2, "mismatch"=> 3, "indels" => 5,
88             "minscore" => 50, "maxscore" => 0,
89             "minsim" => 0, "maxsim" => 100,
90             "pm" => 80, "pi" => 20
91             );
92 0         0 for my $k ( keys %c_v){
93 0         0 my $p = $self->_search_value($k);
94 0 0       0 $c_v{$k} = $p if defined $p; # override defaults
95             }
96              
97             # Create the IO master
98 0         0 my $io = Bio::Polloc::Polloc::IO->new();
99              
100             # Search for TRF
101 0         0 $self->source('trf'); # For GFF
102 0 0       0 my $trf = $self->_executable(defined $self->ruleset ? $self->ruleset->value("path") : undef)
    0          
103             or $self->throw("Could not find the trf binary");
104            
105             # Next line is because of the horrible practice of creating files at CWD out
106             # of thin air (with no possibility to define another way!)
107 0 0       0 $trf = File::Spec->rel2abs($trf) unless File::Spec->file_name_is_absolute($trf);
108            
109             # Write the sequence
110 0         0 my($seq_fh, $seq_file) = $io->tempfile;
111 0         0 close $seq_fh;
112 0         0 my $seqO = Bio::SeqIO->new(-file=>">$seq_file", -format=>'Fasta');
113 0         0 $seqO->write_seq($seq);
114            
115             # Run it
116 0         0 my @run = ($trf);
117 0         0 push @run, $seq_file;
118 0         0 push @run, $c_v{'match'}, $c_v{'mismatch'}, $c_v{'indels'};
119 0         0 push @run, $c_v{'pm'}, $c_v{'pi'}, $c_v{'minscore'}, $c_v{'maxperiod'};
120 0         0 push @run, "-h"; #<- Simplifies output, and produces one file instead of tons!
121 0         0 push @run, "2>&1";
122 0         0 push @run, "|";
123 0         0 my $cwd = cwd();
124             # Finger crossed
125 0         0 my $tmpdir = Bio::Polloc::Polloc::IO->tempdir();
126 0 0       0 chdir $tmpdir or $self->throw("I can not move myself to the temporal directory: $!", $tmpdir);
127 0         0 $self->debug("Hello from ".cwd());
128 0         0 $self->debug("Running: ".join(" ",@run)." [CWD: ".cwd()."]");
129 0         0 my $run = Bio::Polloc::Polloc::IO->new(-file=>join(" ",@run));
130 0         0 my $runout = '';
131 0 0       0 while($run->_readline){$runout.=$_ if defined $_} # Do nothing, this is truly unuseful output, just save it in case of errors (for debugging)
  0         0  
132 0         0 $run->close();
133             # Finger crossed (yes, again)
134 0 0       0 chdir $cwd or $self->throw("I can not move myself to the original directory: $!", $cwd);
135 0         0 $self->debug("Hello from ".cwd());
136            
137             # Try to locate the output (belive it or not)...
138 0         0 my $outfile = Bio::Polloc::Polloc::IO->catfile($tmpdir, basename($seq_file) . "." .
139             $c_v{'match'} . "." . $c_v{'mismatch'} . "." . $c_v{'indels'} . "." .
140             $c_v{'pm'} . "." . $c_v{'pi'} . "." . $c_v{'minscore'} . "." .
141             $c_v{'maxperiod'} . ".dat");
142 0 0       0 $self->throw("Impossible to locate the output file of TRF: '$outfile', (".join(" ", @run).") showing full output", $runout) unless -e $outfile;
143            
144             # And finally parse it
145 0         0 my $ontable = 0;
146 0         0 my @feats = ();
147 0         0 $run = Bio::Polloc::Polloc::IO->new(-file=>$outfile);
148 0         0 while(my $line = $run->_readline){
149 0 0       0 if($line =~ m/^Parameters:\s/){
    0          
150 0         0 $ontable = 1;
151             }elsif($ontable){
152 0         0 chomp $line;
153 0 0       0 next if $line =~ /^\s*$/;
154             #from to period-size copy-number consensus-size percent-matches percent-indels score A T C G entropy consensus sequence
155             #269 316 18 2.6 19 56 3 51 8 45 35 10 1.68 GTCGCGGCCACGTGCACCC GTCGCGTCCACGTGCGCCCGAGCCGGC...
156 0         0 my @v = split /\s+/, $line;
157 0 0       0 $#v==14 or $self->throw("Unexpected line $.",$line,"Bio::Polloc::Polloc::ParsingException");
158             # MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE
159 0         0 $self->debug("Checking additional parameters");
160 0 0 0     0 next if length($v[14]) > $c_v{'maxsize'} or length($v[14]) < $c_v{'minsize'};
161 0 0 0     0 next if $v[2] > $c_v{'maxperiod'} or $v[2] < $c_v{'minperiod'};
162 0 0       0 next if $v[3] < $c_v{'exp'};
163 0 0       0 next if $v[7] < $c_v{'minscore'};
164 0 0 0     0 next if $c_v{'maxscore'} and $v[7] > $c_v{'maxscore'};
165 0 0 0     0 next if $v[5] < $c_v{'minsim'} or $v[5] > $c_v{'maxsim'};
166 0         0 $self->debug("Cool!");
167            
168 0         0 my $id = $self->_next_child_id;
169 0 0       0 push @feats, Bio::Polloc::LocusI->new(
170             -type=>'repeat', # Be careful, usually $self->type
171             -rule=>$self, -seq=>$seq,
172             -from=>$v[0]+0, -to=>$v[1]+0, -strand=>"+",
173             -name=>$self->name,
174             -id=>(defined $id ? $id : ""),
175             -period=>$v[2]+0, -exponent=>$v[3]+0,
176             -consensus=>$v[13],
177             -error=>100-$v[5],
178             -score=>$v[7],
179             -repeats=>$v[14]);
180             }
181             }
182 0         0 $run->close();
183 0         0 unlink $outfile;
184 0         0 rmdir $tmpdir;
185 0 0       0 return wantarray ? @feats : \@feats;
186             }
187              
188             =head2 stringify_value
189              
190             Produces a string with the value of the rule.
191              
192             =cut
193              
194             sub stringify_value {
195 2     2 1 5 my ($self,@args) = @_;
196 2         3 my $out = "";
197 2         3 for my $k (keys %{$self->value}){
  2         6  
198 28 50       67 $out.= "$k=>".(defined $self->value->{$k} ? $self->value->{$k} : "")." ";
199             }
200 2         9 return $out;
201             }
202              
203             =head2 value
204              
205             =head3 Arguments
206              
207             Value (I<str> or I<hashref> or I<arrayref>). The supported keys are:
208              
209             =over
210              
211             =item -minsize I<int>
212              
213             Minimum size of the repeat
214              
215             =item -maxsize I<int>
216              
217             Maximum size of the repeat
218              
219             =item -minperiod I<float>
220              
221             Minimum period of the repeat
222              
223             =item -maxperiod I<float>
224              
225             Maximum period of the repeat
226              
227             =item -exp I<float>
228              
229             Minimum exponent (number of repeats)
230              
231             =item -match I<int>
232              
233             Matching weight
234              
235             =item -mismatch I<int>
236              
237             Mismatching penalty
238              
239             =item -indels I<int>
240              
241             Indel penalty
242              
243             =item -minscore I<float>
244              
245             Minimum score
246              
247             =item -maxscore I<float>
248              
249             Maximum score
250              
251             =item -minsim I<float>
252              
253             Minimum similarity percent
254              
255             =item -maxsim I<float>
256              
257             Maximum similarity percent
258              
259             =item -pm I<float>
260              
261             Match probability
262              
263             =item -pi I<float>
264              
265             Indel probability
266              
267             =back
268              
269             =head3 Return
270              
271             Value (I<hashref> or C<undef>).
272              
273             =head1 INTERNAL METHODS
274              
275             Methods intended to be used only within the scope of Bio::Polloc::*
276              
277             =head2 _parameters
278              
279             =cut
280              
281 3     3   20 sub _parameters { return [qw(MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP MATCH MISMATCH INDELS MINSCORE MAXSCORE MINSIM MAXSIM PM PI)] }
282              
283             =head2 _executable
284              
285             Attempts to find the TRF executable.
286              
287             =cut
288              
289             sub _executable {
290 2     2   53452 my($self, $path) = @_;
291 2         5 my $exe;
292             my $bin;
293 2         6 my $name = 'trf';
294 2         5 my $io = "Bio::Polloc::Polloc::IO";
295 2         28 $self->debug("Searching the $name binary for $^O");
296             # Note the darwin support. This is because darwin is unable to execute
297             # the linux binary (despite its origin)
298 2 50       1915 $bin=$^O =~ /(macos|darwin)/i ? "$name.macosx.bin" :
    50          
    50          
299             $^O =~ /mswin/i ? "$name.exe" :
300             $Config{'archname64'} ? "$name.linux64.bin" :
301             "$name.linux32.bin";
302 2         12604 my @pre = ('');
303 2 50       11 unshift @pre, $path if $path;
304 2         6 for my $p (@pre) {
305             # Try first WITH version, to avoid v3 series
306 2         9 for my $v ((404, 403, 402, 401, 400, '')){
307 12         20 for my $e (('', '.bin', '.exe')){
308 36         49 for my $n (($bin, $name)){
309 72         310 $exe = $io->exists_exe($p . $n . $v . $e);
310 72 50       227 return $exe if $exe;
311             }
312             }
313             }
314             }
315             # Other awful naming systems can be used by this package, but here we stop!
316             }
317              
318             =head2 _initialize
319              
320             =cut
321              
322             sub _initialize {
323 3     3   12 my($self,@args) = @_;
324 3         25 $self->type('tandemrepeat');
325             }
326              
327             1;