| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Tools::Prepeat; |
|
2
|
1
|
|
|
1
|
|
12381
|
use 5.006; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
3
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
41
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
|
5
|
1
|
|
|
1
|
|
5
|
use XSLoader; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
34
|
|
|
6
|
|
|
|
|
|
|
XSLoader::load 'Bio::Tools::Prepeat'; |
|
7
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
67
|
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw(random_sequence); |
|
10
|
1
|
|
|
1
|
|
5
|
use Cwd qw(abs_path); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
47
|
|
|
11
|
1
|
|
|
1
|
|
899
|
use String::Random qw(random_string); |
|
|
1
|
|
|
|
|
3611
|
|
|
|
1
|
|
|
|
|
92
|
|
|
12
|
1
|
|
|
1
|
|
1032
|
use IO::File; |
|
|
1
|
|
|
|
|
18956
|
|
|
|
1
|
|
|
|
|
185
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
1
|
|
|
1
|
|
26039
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
15118
|
|
|
|
1
|
|
|
|
|
1884
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @amino = qw/A C D E F G H I K L M N P Q R S T V W Y/; |
|
17
|
|
|
|
|
|
|
our @extamino = 'A'..'Z'; # extended amino acid |
|
18
|
2
|
100
|
|
2
|
1
|
3853819
|
sub random_sequence { ref($_[0]) ? shift : undef;random_string('0'x(shift), \@amino) } |
|
|
2
|
|
|
|
|
1302
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub new { |
|
21
|
1
|
|
|
1
|
1
|
142
|
my $pkg = shift; |
|
22
|
1
|
50
|
|
|
|
7
|
my $arg = ref($_[0]) ? $_[0] : {@_}; |
|
23
|
1
|
50
|
33
|
|
|
25
|
die "It exists as a non-directory\n" if -e $arg->{wd} && !-d $arg->{wd}; |
|
24
|
1
|
|
|
|
|
210
|
mkdir abs_path($arg->{wd}); |
|
25
|
1
|
|
|
|
|
36
|
bless { |
|
26
|
|
|
|
|
|
|
wd => abs_path($arg->{wd}), |
|
27
|
|
|
|
|
|
|
seqarr => undef, |
|
28
|
|
|
|
|
|
|
}, $pkg; |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub reset { |
|
32
|
1
|
|
|
1
|
1
|
205
|
my $pkg = shift; |
|
33
|
1
|
|
|
|
|
2
|
@{$pkg->{seqarr}} = (); |
|
|
1
|
|
|
|
|
12
|
|
|
34
|
1
|
|
|
|
|
2
|
@{$pkg->{acclen}} = (); |
|
|
1
|
|
|
|
|
4
|
|
|
35
|
1
|
|
|
|
|
4
|
$pkg->{built} = 0; |
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
1
|
321
|
sub feed { push @{(shift)->{seqarr}}, map{uc$_}@_ } |
|
|
1
|
|
|
|
|
10
|
|
|
|
13
|
|
|
|
|
45
|
|
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# cartesian product over @extamino X @extamino |
|
41
|
|
|
|
|
|
|
sub bigram_set { |
|
42
|
2
|
|
|
2
|
0
|
3
|
my @ret; |
|
43
|
2
|
|
|
|
|
7
|
foreach my $f (@extamino){ |
|
44
|
52
|
|
|
|
|
85
|
foreach my $s (@extamino){ |
|
45
|
1352
|
|
|
|
|
3240
|
push @ret, $f.$s; |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
} |
|
48
|
2
|
|
|
|
|
478
|
@ret; |
|
49
|
|
|
|
|
|
|
} |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub loadseq { |
|
52
|
1
|
|
|
1
|
0
|
2
|
my $pkg = shift; |
|
53
|
1
|
50
|
|
|
|
94
|
open F, $pkg->{wd}."/seqs" or die "Cannot load sequence file\n"; |
|
54
|
1
|
|
|
|
|
113
|
while(chomp($_=)){ |
|
55
|
13
|
|
|
|
|
15
|
push @{$pkg->{seqarr}}, $_; |
|
|
13
|
|
|
|
|
354
|
|
|
56
|
|
|
|
|
|
|
} |
|
57
|
1
|
|
|
|
|
26
|
close F; |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# accumulating lengths |
|
61
|
|
|
|
|
|
|
sub acclen { |
|
62
|
2
|
|
|
2
|
0
|
4
|
my $pkg = shift; |
|
63
|
2
|
|
|
|
|
4
|
my $sum; |
|
64
|
2
|
|
|
|
|
5
|
push @{$pkg->{acclen}}, 0; |
|
|
2
|
|
|
|
|
8
|
|
|
65
|
2
|
|
|
|
|
3
|
for (1..$#{$pkg->{seqarr}}){ |
|
|
2
|
|
|
|
|
13
|
|
|
66
|
24
|
|
|
|
|
35
|
$sum += length $pkg->{seqarr}->[$_-1]; |
|
67
|
24
|
|
|
|
|
27
|
push @{$pkg->{acclen}}, $sum; |
|
|
24
|
|
|
|
|
50
|
|
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub loadidx { |
|
72
|
1
|
|
|
1
|
1
|
83
|
my $pkg = shift; |
|
73
|
1
|
|
|
|
|
5
|
$pkg->{seqarr} = $pkg->{acclen} = undef; |
|
74
|
1
|
|
|
|
|
7
|
$pkg->loadseq; |
|
75
|
1
|
|
|
|
|
5
|
$pkg->acclen; |
|
76
|
1
|
|
|
|
|
4
|
$pkg->{built} = 1; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub buildidx { |
|
80
|
1
|
|
|
1
|
1
|
100
|
my $pkg = shift; |
|
81
|
1
|
|
|
|
|
2
|
my %fh; |
|
82
|
|
|
|
|
|
|
|
|
83
|
1
|
|
|
|
|
7
|
$pkg->acclen; |
|
84
|
|
|
|
|
|
|
|
|
85
|
1
|
|
|
|
|
6
|
foreach (bigram_set){ |
|
86
|
676
|
|
|
|
|
4356
|
$fh{$_} = new IO::File $pkg->{wd}."/idx.$_", O_CREAT|O_WRONLY|O_TRUNC; |
|
87
|
676
|
50
|
|
|
|
125804
|
die "Cannot open index file $_ for writing\n" unless defined $fh{$_}; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
1
|
|
|
|
|
121
|
open F, '>', $pkg->{wd}."/seqs"; |
|
91
|
1
|
|
|
|
|
2
|
my $cnt = 0; |
|
92
|
1
|
|
|
|
|
2
|
foreach my $seq (@{$pkg->{seqarr}}){ |
|
|
1
|
|
|
|
|
6
|
|
|
93
|
13
|
|
|
|
|
57
|
print F "$seq\n"; |
|
94
|
13
|
|
|
|
|
34
|
for(my $i = 0; $i < length($seq)-1; $i++){ |
|
95
|
4122
|
|
|
|
|
6233
|
my $bigram = substr($seq, $i, 2); |
|
96
|
4122
|
|
|
|
|
6995
|
my $h = $fh{$bigram}; |
|
97
|
4122
|
|
|
|
|
21149
|
print $h ($pkg->{acclen}->[$cnt] + $i)."\n"; |
|
98
|
|
|
|
|
|
|
} |
|
99
|
13
|
|
|
|
|
25
|
$cnt++; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
1
|
|
|
|
|
123
|
close F; |
|
102
|
|
|
|
|
|
|
|
|
103
|
1
|
|
|
|
|
65
|
$_->close foreach (values %fh); |
|
104
|
1
|
|
|
|
|
26720
|
$pkg->{built} = 1; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# relative position |
|
108
|
|
|
|
|
|
|
sub relpos { |
|
109
|
75266
|
|
|
75266
|
0
|
100894
|
my ($pkg, $abspos) = @_; |
|
110
|
75266
|
|
|
|
|
104804
|
my ($seqid, $pos) = (0, 0); |
|
111
|
75266
|
|
|
|
|
87782
|
for( my $i=$#{$pkg->{acclen}} ; $i>=0 ; $i--){ |
|
|
75266
|
|
|
|
|
223058
|
|
|
112
|
481073
|
100
|
|
|
|
1282159
|
if( $abspos >= $pkg->{acclen}->[$i] ){ |
|
113
|
75266
|
|
|
|
|
108709
|
$pos = $abspos - $pkg->{acclen}->[$i]; |
|
114
|
75266
|
|
|
|
|
86196
|
$seqid = $i; |
|
115
|
75266
|
|
|
|
|
92245
|
last; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
} |
|
118
|
75266
|
|
|
|
|
177990
|
return ($seqid, $pos); |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# absolute position |
|
122
|
|
|
|
|
|
|
sub abspos { |
|
123
|
9175
|
|
|
9175
|
0
|
13245
|
my ($pkg, $seqid, $pos) = @_; |
|
124
|
9175
|
|
|
|
|
30501
|
return $pkg->{acclen}->[$seqid] + $pos; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub coincidence_length { |
|
128
|
37633
|
|
|
37633
|
0
|
57841
|
my ($pkg, $prev, $this) = @_; |
|
129
|
37633
|
|
|
|
|
66832
|
my @prevrel = $pkg->relpos($prev); |
|
130
|
37633
|
|
|
|
|
76644
|
my @thisrel = $pkg->relpos($this); |
|
131
|
37633
|
|
|
|
|
49392
|
my @range = @{$pkg->{length}}; |
|
|
37633
|
|
|
|
|
81274
|
|
|
132
|
37633
|
|
|
|
|
50467
|
my @ret; |
|
133
|
|
|
|
|
|
|
|
|
134
|
37633
|
|
|
|
|
53518
|
foreach my $len (@range){ |
|
135
|
44045
|
|
|
|
|
81885
|
my $str_a = substr($pkg->{seqarr}->[$prevrel[0]], $prevrel[1], $len); |
|
136
|
44045
|
|
|
|
|
69424
|
my $str_b = substr($pkg->{seqarr}->[$thisrel[0]], $thisrel[1], $len); |
|
137
|
|
|
|
|
|
|
# print "$len $str_a $str_b$/" if $str_a =~/^ACL/o || $str_b =~ /^ACL/o; |
|
138
|
|
|
|
|
|
|
|
|
139
|
44045
|
100
|
100
|
|
|
176368
|
last if length $str_a != $len || length $str_b != $len; |
|
140
|
43410
|
100
|
|
|
|
97404
|
last if $str_a ne $str_b; |
|
141
|
|
|
|
|
|
|
# print "$prev, $this => ", substr($pkg->{seqarr}->[$prevrel[0]], $prevrel[1], $pkg->{length}), '/', substr($pkg->{seqarr}->[$thisrel[0]], $thisrel[1], $pkg->{length}), $/; |
|
142
|
9175
|
|
|
|
|
32600
|
push @ret, [ \@prevrel, \@thisrel, $len ]; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
37633
|
|
|
|
|
94429
|
\@ret; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
|
|
147
|
1
|
|
|
1
|
|
13
|
use List::Util qw/min/; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
1236
|
|
|
148
|
|
|
|
|
|
|
sub query { |
|
149
|
1
|
|
|
1
|
1
|
104
|
my ($pkg, $length) = @_; |
|
150
|
1
|
50
|
|
|
|
7
|
die "Index files are not built or loaded. Please use 'buildidx' or 'loadidx' first\n" unless $pkg->{built}; |
|
151
|
1
|
50
|
|
|
|
6
|
die "Length of a repeat sequence must exceed 3\n" unless $length >= 3; |
|
152
|
1
|
50
|
|
|
|
13
|
$pkg->{length} = ref($length) ? [sort{$a<=>$b}@$length] : [ $length ]; |
|
|
3
|
|
|
|
|
10
|
|
|
153
|
1
|
|
|
|
|
168
|
open R, '>', $pkg->{wd}."/result"; |
|
154
|
1
|
|
|
|
|
2
|
my ($prev, $this); |
|
155
|
1
|
|
|
|
|
5
|
foreach (bigram_set){ |
|
156
|
676
|
100
|
|
|
|
18505
|
next unless -s $pkg->{wd}."/idx.$_"; |
|
157
|
368
|
50
|
|
|
|
15720
|
open F, $pkg->{wd}."/idx.$_" or die "Cannot open index file $_ for query\n"; |
|
158
|
368
|
|
|
|
|
610
|
my @posarr; |
|
159
|
368
|
|
|
|
|
5761
|
while ( chomp ($_ = ) ){ push @posarr, $_; } |
|
|
4122
|
|
|
|
|
11643
|
|
|
160
|
|
|
|
|
|
|
|
|
161
|
368
|
|
|
|
|
567
|
my %checked; |
|
162
|
368
|
|
|
|
|
855
|
foreach $prev (@posarr){ |
|
163
|
4122
|
50
|
|
|
|
9305
|
next if $checked{$prev}; |
|
164
|
4122
|
|
|
|
|
5696
|
foreach $this (@posarr){ |
|
165
|
79724
|
100
|
|
|
|
101596
|
if($this - $prev > min( @{$pkg->{length}})){ |
|
|
79724
|
|
|
|
|
250625
|
|
|
166
|
37633
|
|
|
|
|
73817
|
my $occs = $pkg->coincidence_length($prev, $this); |
|
167
|
37633
|
50
|
|
|
|
86909
|
if(ref $occs){ |
|
168
|
37633
|
|
|
|
|
41895
|
foreach my $occ (@{$occs}){ |
|
|
37633
|
|
|
|
|
99814
|
|
|
169
|
9175
|
|
|
|
|
23255
|
$checked{$pkg->abspos($occ->[0]->[0], $occ->[0]->[1])} = 1; |
|
170
|
|
|
|
|
|
|
# print substr($pkg->{seqarr}->[$occ->[0]->[0]],$occ->[0]->[1], $occ->[2]),$/; |
|
171
|
|
|
|
|
|
|
|
|
172
|
9175
|
|
|
|
|
25061
|
print R |
|
173
|
|
|
|
|
|
|
join ' ', |
|
174
|
|
|
|
|
|
|
substr($pkg->{seqarr}->[$occ->[0]->[0]], |
|
175
|
|
|
|
|
|
|
$occ->[0]->[1], $occ->[2]), |
|
176
|
9175
|
|
|
|
|
21432
|
"@{$occ->[0]} @{$occ->[1]}\n"; |
|
|
9175
|
|
|
|
|
39057
|
|
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
} |
|
182
|
368
|
|
|
|
|
24115
|
close F; |
|
183
|
|
|
|
|
|
|
} |
|
184
|
1
|
|
|
|
|
177
|
close R; |
|
185
|
|
|
|
|
|
|
|
|
186
|
1
|
50
|
|
|
|
51
|
open R, $pkg->{wd}."/result" or die "Cannot open result file\n"; |
|
187
|
1
|
|
|
|
|
3
|
my $ret; |
|
188
|
1
|
|
|
|
|
43
|
while(chomp($_=)){ |
|
189
|
9175
|
|
|
|
|
35704
|
my @e = split /\s/, $_; |
|
190
|
9175
|
|
|
|
|
22907
|
$ret->{$e[0]}->{join q/ /,@e[1..2]} = 1; |
|
191
|
9175
|
|
|
|
|
32349
|
$ret->{$e[0]}->{join q/ /,@e[3..4]} = 1; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
1
|
|
|
|
|
35
|
close R; |
|
194
|
1
|
|
|
|
|
2
|
foreach (keys %{$ret}){ |
|
|
1
|
|
|
|
|
930
|
|
|
195
|
7062
|
|
|
|
|
15695
|
$ret->{$_} = [ |
|
196
|
6559
|
|
|
|
|
10994
|
sort{ $a->[0] <=> $b->[0] } |
|
197
|
7582
|
|
|
|
|
22850
|
sort{ $a->[1] <=> $b->[1] } |
|
198
|
2692
|
|
|
|
|
7406
|
map { [split/ /] } |
|
199
|
2692
|
|
|
|
|
2898
|
keys %{$ret->{$_}} |
|
200
|
|
|
|
|
|
|
]; |
|
201
|
|
|
|
|
|
|
} |
|
202
|
1
|
|
|
|
|
266
|
$ret; |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub cleanidx { |
|
207
|
1
|
|
|
1
|
0
|
15500
|
map{unlink $_} glob($_[0]->{wd}."/*"); |
|
|
678
|
|
|
|
|
722987
|
|
|
208
|
1
|
|
|
|
|
559
|
rmdir $_[0]->{wd}; |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
1; |
|
214
|
|
|
|
|
|
|
__END__ |