| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
=head1 NAME |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Bio::Polloc::Rule::repeat - A rule of type repeat |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 AUTHOR - Luis M. Rodriguez-R |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Email lmrodriguezr at gmail dot com |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=cut |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
package Bio::Polloc::Rule::repeat; |
|
12
|
1
|
|
|
1
|
|
709
|
use base qw(Bio::Polloc::RuleI); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
508
|
|
|
13
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
14
|
1
|
|
|
1
|
|
5
|
use Bio::Polloc::Polloc::IO; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
25
|
|
|
15
|
1
|
|
|
1
|
|
640
|
use Bio::Polloc::LocusI; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
16
|
1
|
|
|
1
|
|
892
|
use Bio::SeqIO; |
|
|
1
|
|
|
|
|
54755
|
|
|
|
1
|
|
|
|
|
1422
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = 1.0503; # [a-version] from Bio::Polloc::Polloc::Version |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 APPENDIX |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Methods provided by the package |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head2 new |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Generic initialization method |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=cut |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub new { |
|
31
|
0
|
|
|
0
|
1
|
0
|
my($caller,@args) = @_; |
|
32
|
0
|
|
|
|
|
0
|
my $self = $caller->SUPER::new(@args); |
|
33
|
0
|
|
|
|
|
0
|
$self->_initialize(@args); |
|
34
|
0
|
|
|
|
|
0
|
return $self; |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head2 execute |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This is where magic happens. Translates the parameters of the object into a call to |
|
40
|
|
|
|
|
|
|
B<mreps>, and scans the sequence for repeats. |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 Arguments |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The sequence (C<-seq>) as a L<Bio::Seq> or a L<Bio::SeqIO> object |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head3 Returns |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
An array reference populated with L<Bio::Polloc::Locus::repeat> objects |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub execute { |
|
53
|
0
|
|
|
0
|
1
|
0
|
my($self,@args) = @_; |
|
54
|
0
|
|
|
|
|
0
|
my($seq) = $self->_rearrange([qw(SEQ)], @args); |
|
55
|
|
|
|
|
|
|
|
|
56
|
0
|
0
|
|
|
|
0
|
$self->throw("You must provide a sequence to evaluate the rule", $seq) unless $seq; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# For Bio::SeqIO objects |
|
59
|
0
|
0
|
|
|
|
0
|
if($seq->isa('Bio::SeqIO')){ |
|
60
|
0
|
|
|
|
|
0
|
my @feats = (); |
|
61
|
0
|
|
|
|
|
0
|
while(my $s = $seq->next_seq){ |
|
62
|
0
|
|
|
|
|
0
|
push(@feats, @{$self->execute(-seq=>$s)}) |
|
|
0
|
|
|
|
|
0
|
|
|
63
|
|
|
|
|
|
|
} |
|
64
|
0
|
0
|
|
|
|
0
|
return wantarray ? @feats : \@feats; |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
0
|
$self->throw("Illegal class of sequence '".ref($seq)."'", $seq) unless $seq->isa('Bio::Seq'); |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# Include safe_value parameters |
|
70
|
0
|
|
|
|
|
0
|
my $cmd_vars = {}; |
|
71
|
0
|
|
|
|
|
0
|
for my $p ( qw(RES MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP ALLOWSMALL WIN) ){ |
|
72
|
0
|
|
|
|
|
0
|
my $tv = $self->_search_value($p); |
|
73
|
0
|
0
|
|
|
|
0
|
$cmd_vars->{"-" . lc $p} = $tv if defined $tv; |
|
74
|
|
|
|
|
|
|
} |
|
75
|
0
|
|
0
|
|
|
0
|
my $minsim = ($self->_search_value("minsim") || 0)+0; |
|
76
|
0
|
|
0
|
|
|
0
|
my $maxsim = ($self->_search_value("maxsim") || 0)+0; |
|
77
|
0
|
|
0
|
|
|
0
|
$maxsim ||= 100; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Create the IO master |
|
80
|
0
|
|
|
|
|
0
|
my $io = Bio::Polloc::Polloc::IO->new(); |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Search for mreps |
|
83
|
0
|
|
|
|
|
0
|
$self->source('mreps'); |
|
84
|
0
|
0
|
|
|
|
0
|
my $mreps = $self->_executable(defined $self->ruleset ? $self->ruleset->value("path") : undef) |
|
|
|
0
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
or $self->throw("Could not find the mreps binary"); |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Write the sequence |
|
88
|
0
|
|
|
|
|
0
|
my($seq_fh, $seq_file) = $io->tempfile; |
|
89
|
0
|
|
|
|
|
0
|
close $seq_fh; |
|
90
|
0
|
|
|
|
|
0
|
my $seqO = Bio::SeqIO->new(-file=>">$seq_file", -format=>'Fasta'); |
|
91
|
0
|
|
|
|
|
0
|
$seqO->write_seq($seq); |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Run it |
|
94
|
0
|
|
|
|
|
0
|
my @run = ($mreps); |
|
95
|
0
|
|
|
|
|
0
|
push @run, %{$cmd_vars}; |
|
|
0
|
|
|
|
|
0
|
|
|
96
|
0
|
|
|
|
|
0
|
push @run, "-fasta", $seq_file; |
|
97
|
0
|
|
|
|
|
0
|
push @run, "2>&1"; |
|
98
|
0
|
|
|
|
|
0
|
push @run, "|"; |
|
99
|
0
|
|
|
|
|
0
|
$self->debug("Running: ".join(" ",@run)); |
|
100
|
0
|
|
|
|
|
0
|
my $run = Bio::Polloc::Polloc::IO->new(-file=>join(" ",@run)); |
|
101
|
0
|
|
|
|
|
0
|
my $ontable = 0; |
|
102
|
0
|
|
|
|
|
0
|
my @feats = (); |
|
103
|
0
|
|
|
|
|
0
|
while(my $line = $run->_readline){ |
|
104
|
0
|
0
|
|
|
|
0
|
if($line =~ m/^ -----------------------------/){ |
|
|
|
0
|
|
|
|
|
|
|
105
|
0
|
|
|
|
|
0
|
$ontable = !$ontable; |
|
106
|
|
|
|
|
|
|
}elsif($ontable){ |
|
107
|
0
|
|
|
|
|
0
|
chomp $line; |
|
108
|
|
|
|
|
|
|
# from -> to : size <per.> [exp.] err-rate sequence |
|
109
|
0
|
0
|
|
|
|
0
|
$line =~ m/^\s+(\d+)\s+->\s+(\d+)\s+:\s+(\d+)\s+<(\d+)>\s+\[([\d\.]+)\]\s+([\d\.]+)\s+([\w\s]+)$/ |
|
110
|
|
|
|
|
|
|
or $self->throw("Unexpected line $.",$line,"Bio::Polloc::Polloc::ParsingException"); |
|
111
|
0
|
|
|
|
|
0
|
my $score = 100 - $6*100; |
|
112
|
0
|
0
|
0
|
|
|
0
|
next if $score > $maxsim or $score < $minsim; |
|
113
|
0
|
|
|
|
|
0
|
my $id = $self->_next_child_id; |
|
114
|
0
|
|
|
|
|
0
|
my $cons = $self->_calculate_consensus($7); |
|
115
|
0
|
0
|
|
|
|
0
|
push @feats, Bio::Polloc::LocusI->new( |
|
116
|
|
|
|
|
|
|
-type=>$self->type, -rule=>$self, -seq=>$seq, |
|
117
|
|
|
|
|
|
|
-from=>$1+0, -to=>$2+0, -strand=>"+", |
|
118
|
|
|
|
|
|
|
-name=>$self->name, |
|
119
|
|
|
|
|
|
|
-id=>(defined $id ? $id : ""), |
|
120
|
|
|
|
|
|
|
-period=>$4+0, -exponent=>$5+0, |
|
121
|
|
|
|
|
|
|
-error=>$6*100, |
|
122
|
|
|
|
|
|
|
-score=>$score, |
|
123
|
|
|
|
|
|
|
-consensus=>$cons, |
|
124
|
|
|
|
|
|
|
-repeats=>$7); |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
} |
|
127
|
0
|
|
|
|
|
0
|
$run->close(); |
|
128
|
0
|
0
|
|
|
|
0
|
return wantarray ? @feats : \@feats; |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 stringify_value |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Produces a readable string containing the value of the rule. |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=cut |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub stringify_value { |
|
138
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
|
139
|
0
|
|
|
|
|
0
|
my $out = ""; |
|
140
|
0
|
|
|
|
|
0
|
for my $k (keys %{$self->value}){ |
|
|
0
|
|
|
|
|
0
|
|
|
141
|
0
|
0
|
|
|
|
0
|
$out.= "$k=>".(defined $self->value->{$k} ? $self->value->{$k} : "")." "; |
|
142
|
|
|
|
|
|
|
} |
|
143
|
0
|
|
|
|
|
0
|
return $out; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 value |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Implements the C<_qualify_value()> from the L<Bio::Polloc::RuleI> interface |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head3 Arguments |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Value (str or ref-to-hash or ref-to-array). The supported keys are: |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=over |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item -res I<float> |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Resolution (allowed error) |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item -minsize I<int> |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Minimum size of the repeat |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item -maxsize I<int> |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Maximum size of the repeat |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=item -minperiod I<float> |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Minimum period of the repeat |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=item -maxperiod I<float> |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Maximum period of the repeat |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=item -exp I<float> |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Minimum exponent (number of repeats) |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=item -allowsmall I<bool (int)> |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
If true, allows spurious results |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item -win I<float> |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Process by sliding windows of size C<2*n> overlaping by C<n> |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item -minsim I<float> |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Minimum similarity percent |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=item -maxsim I<float> |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
Maximum similarity percent |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=back |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head3 Return |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Value (I<hashref> or C<undef>). |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=head1 INTERNAL METHODS |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Methods intended to be used only within the scope of Bio::Polloc::* |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head2 _calculate_consensus |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Attempts to calculate the consensus of the repeat units. |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head3 Arguments |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
The repetitive sequence with repeat units separated by spaces (I<str>). |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head3 Returns |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
The consensus sequence (I<str>) |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub _calculate_consensus { |
|
221
|
0
|
|
|
0
|
|
0
|
my($self,$seq) = @_; |
|
222
|
0
|
0
|
|
|
|
0
|
return unless $seq; |
|
223
|
0
|
|
|
|
|
0
|
my $io = Bio::Polloc::Polloc::IO->new(); |
|
224
|
0
|
|
|
|
|
0
|
my $emma = $io->exists_exe("emma"); |
|
225
|
0
|
|
|
|
|
0
|
my $cons = $io->exists_exe("cons"); |
|
226
|
0
|
0
|
|
|
|
0
|
return "no-emma" unless $emma; |
|
227
|
0
|
0
|
|
|
|
0
|
return "no-cons" unless $cons; |
|
228
|
0
|
|
|
|
|
0
|
my ($outseq_fh, $outseq) = $io->tempfile; |
|
229
|
0
|
|
|
|
|
0
|
my $i=0; |
|
230
|
0
|
|
|
|
|
0
|
print $outseq_fh ">".(++$i)."\n$_\n" for split /\s+/, $seq; |
|
231
|
0
|
|
|
|
|
0
|
close $outseq_fh; |
|
232
|
0
|
0
|
|
|
|
0
|
return "err-seq" unless -s $outseq; |
|
233
|
0
|
|
|
|
|
0
|
my $outaln = "$outseq.aln"; |
|
234
|
0
|
|
|
|
|
0
|
my $emmarun = Bio::Polloc::Polloc::IO->new(-file=>"$emma '$outseq' '$outaln' '/dev/null' -auto >/dev/null |"); |
|
235
|
0
|
|
|
|
|
0
|
while($emmarun->_readline){ print STDERR $_ } |
|
|
0
|
|
|
|
|
0
|
|
|
236
|
0
|
|
|
|
|
0
|
$emmarun->close(); |
|
237
|
0
|
0
|
|
|
|
0
|
unless(-s $outaln){ |
|
238
|
0
|
0
|
|
|
|
0
|
unlink $outaln if -e $outaln; |
|
239
|
0
|
|
|
|
|
0
|
return "err-aln"; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
0
|
|
|
|
|
0
|
my $consout = ""; |
|
242
|
0
|
|
|
|
|
0
|
my $consrun = Bio::Polloc::Polloc::IO->new(-file=>"$cons '$outaln' stdout -auto |"); |
|
243
|
0
|
|
|
|
|
0
|
while(my $ln = $consrun->_readline){ |
|
244
|
0
|
|
|
|
|
0
|
chomp $ln; |
|
245
|
0
|
0
|
|
|
|
0
|
next if $ln =~ /^>/; |
|
246
|
0
|
|
|
|
|
0
|
$consout .= $ln; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
0
|
|
|
|
|
0
|
unlink $outaln; |
|
249
|
0
|
|
|
|
|
0
|
$consrun->close(); |
|
250
|
0
|
|
|
|
|
0
|
$io->close(); |
|
251
|
0
|
|
|
|
|
0
|
return $consout; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 _parameters |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub _parameters { |
|
259
|
0
|
|
|
0
|
|
0
|
return [qw(RES MINSIZE MAXSIZE MINPERIOD MAXPERIOD EXP ALLOWSMALL WIN MINSIM MAXSIM)]; |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=head2 _executable |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
Attempts to get the mreps executable. |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub _executable { |
|
269
|
1
|
|
|
1
|
|
48079
|
my($self, $path) = @_; |
|
270
|
1
|
|
|
|
|
2
|
my $name = 'mreps'; |
|
271
|
1
|
|
|
|
|
3
|
my $io = "Bio::Polloc::Polloc::IO"; |
|
272
|
1
|
|
|
|
|
17
|
$self->debug("Searching the $name binary for $^O"); |
|
273
|
|
|
|
|
|
|
# Note the darwin support. This is because darwin is unable to execute |
|
274
|
|
|
|
|
|
|
#Â the linux binary (despite its origin) |
|
275
|
1
|
50
|
|
|
|
12
|
my $bin = $^O =~ /(macos|darwin)/i ? "mreps.macosx.bin" : |
|
|
|
50
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
$^O =~ /mswin/i ? "mreps.exe" : |
|
277
|
|
|
|
|
|
|
"mreps.linux.bin"; |
|
278
|
1
|
|
|
|
|
2
|
my @where = (''); |
|
279
|
1
|
50
|
|
|
|
3
|
unshift @where, $path if $path; |
|
280
|
1
|
|
|
|
|
4
|
for my $p (@where){ |
|
281
|
1
|
|
|
|
|
2
|
for my $n (($bin, $name, "$name.bin")){ |
|
282
|
3
|
|
|
|
|
17
|
my $exe = $io->exists_exe($p . $n); |
|
283
|
3
|
50
|
|
|
|
12
|
return $exe if $exe; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
} |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=head2 _initialize |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _initialize { |
|
293
|
0
|
|
|
0
|
|
|
my($self,@args) = @_; |
|
294
|
0
|
|
|
|
|
|
$self->type('repeat'); |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |