| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::Fasops::Common; |
|
2
|
26
|
|
|
26
|
|
403532
|
use strict; |
|
|
26
|
|
|
|
|
122
|
|
|
|
26
|
|
|
|
|
802
|
|
|
3
|
26
|
|
|
26
|
|
131
|
use warnings; |
|
|
26
|
|
|
|
|
57
|
|
|
|
26
|
|
|
|
|
689
|
|
|
4
|
26
|
|
|
26
|
|
3567
|
use autodie; |
|
|
26
|
|
|
|
|
108957
|
|
|
|
26
|
|
|
|
|
234
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
26
|
|
|
26
|
|
150671
|
use 5.018001; |
|
|
26
|
|
|
|
|
420
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
26
|
|
|
26
|
|
173
|
use Carp qw(); |
|
|
26
|
|
|
|
|
48
|
|
|
|
26
|
|
|
|
|
543
|
|
|
9
|
26
|
|
|
26
|
|
13644
|
use File::ShareDir qw(); |
|
|
26
|
|
|
|
|
556903
|
|
|
|
26
|
|
|
|
|
734
|
|
|
10
|
26
|
|
|
26
|
|
3729
|
use IO::Zlib; |
|
|
26
|
|
|
|
|
471868
|
|
|
|
26
|
|
|
|
|
214
|
|
|
11
|
26
|
|
|
26
|
|
17957
|
use IPC::Cmd qw(); |
|
|
26
|
|
|
|
|
1258267
|
|
|
|
26
|
|
|
|
|
838
|
|
|
12
|
26
|
|
|
26
|
|
236
|
use List::Util; |
|
|
26
|
|
|
|
|
55
|
|
|
|
26
|
|
|
|
|
1712
|
|
|
13
|
26
|
|
|
26
|
|
5925
|
use Path::Tiny qw(); |
|
|
26
|
|
|
|
|
76902
|
|
|
|
26
|
|
|
|
|
556
|
|
|
14
|
26
|
|
|
26
|
|
3600
|
use Tie::IxHash; |
|
|
26
|
|
|
|
|
20183
|
|
|
|
26
|
|
|
|
|
726
|
|
|
15
|
26
|
|
|
26
|
|
3346
|
use YAML::Syck qw(); |
|
|
26
|
|
|
|
|
13433
|
|
|
|
26
|
|
|
|
|
491
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
26
|
|
|
26
|
|
3910
|
use AlignDB::IntSpan; |
|
|
26
|
|
|
|
|
50261
|
|
|
|
26
|
|
|
|
|
602
|
|
|
18
|
26
|
|
|
26
|
|
3354
|
use App::RL::Common; |
|
|
26
|
|
|
|
|
142078
|
|
|
|
26
|
|
|
|
|
227553
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub mean (@) { |
|
21
|
24
|
|
|
24
|
0
|
47
|
@_ = grep { defined $_ } @_; |
|
|
59
|
|
|
|
|
153
|
|
|
22
|
24
|
50
|
|
|
|
63
|
return 0 unless @_; |
|
23
|
24
|
100
|
|
|
|
62
|
return $_[0] unless @_ > 1; |
|
24
|
12
|
|
|
|
|
58
|
return List::Util::sum(@_) / scalar(@_); |
|
25
|
|
|
|
|
|
|
} |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub any (&@) { |
|
28
|
39914
|
|
|
39914
|
0
|
55957
|
my $f = shift; |
|
29
|
39914
|
|
|
|
|
58283
|
for (@_) { |
|
30
|
376683
|
100
|
|
|
|
502101
|
return 1 if $f->(); |
|
31
|
|
|
|
|
|
|
} |
|
32
|
38277
|
|
|
|
|
122199
|
return 0; |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub all (&@) { |
|
36
|
42656
|
|
|
42656
|
0
|
58793
|
my $f = shift; |
|
37
|
42656
|
|
|
|
|
61443
|
for (@_) { |
|
38
|
385793
|
100
|
|
|
|
557204
|
return 0 unless $f->(); |
|
39
|
|
|
|
|
|
|
} |
|
40
|
41127
|
|
|
|
|
75021
|
return 1; |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub uniq (@) { |
|
44
|
2426
|
|
|
2426
|
0
|
3319
|
my %seen = (); |
|
45
|
2426
|
|
|
|
|
3250
|
my $k; |
|
46
|
|
|
|
|
|
|
my $seen_undef; |
|
47
|
2426
|
50
|
|
|
|
3450
|
return grep { defined $_ ? not $seen{ $k = $_ }++ : not $seen_undef++ } @_; |
|
|
13816
|
|
|
|
|
31957
|
|
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub firstidx (&@) { |
|
51
|
12
|
|
|
12
|
0
|
17
|
my $f = shift; |
|
52
|
12
|
|
|
|
|
19
|
for my $i ( 0 .. $#_ ) { |
|
53
|
19
|
|
|
|
|
28
|
local *_ = \$_[$i]; |
|
54
|
19
|
100
|
|
|
|
36
|
return $i if $f->(); |
|
55
|
|
|
|
|
|
|
} |
|
56
|
0
|
|
|
|
|
0
|
return -1; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub read_replaces { |
|
60
|
3
|
|
|
3
|
0
|
5
|
my $file = shift; |
|
61
|
|
|
|
|
|
|
|
|
62
|
3
|
|
|
|
|
20
|
tie my %replace, "Tie::IxHash"; |
|
63
|
3
|
|
|
|
|
43
|
my @lines = Path::Tiny::path($file)->lines( { chomp => 1 } ); |
|
64
|
3
|
|
|
|
|
590
|
for (@lines) { |
|
65
|
5
|
|
|
|
|
51
|
my @fields = split /\t/; |
|
66
|
5
|
50
|
|
|
|
14
|
if ( @fields >= 1 ) { |
|
67
|
5
|
|
|
|
|
7
|
my $ori = shift @fields; |
|
68
|
5
|
|
|
|
|
25
|
$replace{$ori} = [@fields]; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
3
|
|
|
|
|
43
|
return \%replace; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub parse_block { |
|
76
|
115
|
|
|
115
|
0
|
706
|
my $block = shift; |
|
77
|
115
|
|
|
|
|
204
|
my $want_full = shift; |
|
78
|
|
|
|
|
|
|
|
|
79
|
115
|
|
|
|
|
2739
|
my @lines = grep {/\S/} split /\n/, $block; |
|
|
1098
|
|
|
|
|
2548
|
|
|
80
|
115
|
50
|
|
|
|
389
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
|
81
|
|
|
|
|
|
|
|
|
82
|
115
|
|
|
|
|
773
|
tie my %info_of, "Tie::IxHash"; |
|
83
|
115
|
|
|
|
|
2062
|
while (@lines) { |
|
84
|
549
|
|
|
|
|
7143
|
my $header = shift @lines; |
|
85
|
549
|
|
|
|
|
2141
|
$header =~ s/^\>//; |
|
86
|
549
|
|
|
|
|
1030
|
chomp $header; |
|
87
|
|
|
|
|
|
|
|
|
88
|
549
|
|
|
|
|
819
|
my $seq = shift @lines; |
|
89
|
549
|
|
|
|
|
762
|
chomp $seq; |
|
90
|
|
|
|
|
|
|
|
|
91
|
549
|
|
|
|
|
1296
|
my $info_ref = App::RL::Common::decode_header($header); |
|
92
|
549
|
|
|
|
|
84509
|
$info_ref->{seq} = $seq; |
|
93
|
549
|
100
|
66
|
|
|
7881
|
if ( $want_full or !defined $info_ref->{name} ) { |
|
94
|
376
|
|
|
|
|
853
|
my $ess_header = App::RL::Common::encode_header( $info_ref, 1 ); |
|
95
|
376
|
|
|
|
|
28660
|
$info_of{$ess_header} = $info_ref; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
else { |
|
98
|
173
|
|
|
|
|
1383
|
$info_of{ $info_ref->{name} } = $info_ref; |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
|
|
102
|
115
|
|
|
|
|
2182
|
return \%info_of; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub parse_block_array { |
|
106
|
10
|
|
|
10
|
0
|
19
|
my $block = shift; |
|
107
|
|
|
|
|
|
|
|
|
108
|
10
|
|
|
|
|
64
|
my @lines = grep {/\S/} split /\n/, $block; |
|
|
78
|
|
|
|
|
172
|
|
|
109
|
10
|
50
|
|
|
|
33
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
|
110
|
|
|
|
|
|
|
|
|
111
|
10
|
|
|
|
|
19
|
my $info_ary = []; |
|
112
|
10
|
|
|
|
|
23
|
while (@lines) { |
|
113
|
39
|
|
|
|
|
75
|
my $header = shift @lines; |
|
114
|
39
|
|
|
|
|
126
|
$header =~ s/^\>//; |
|
115
|
39
|
|
|
|
|
60
|
chomp $header; |
|
116
|
|
|
|
|
|
|
|
|
117
|
39
|
|
|
|
|
51
|
my $seq = shift @lines; |
|
118
|
39
|
|
|
|
|
44
|
chomp $seq; |
|
119
|
|
|
|
|
|
|
|
|
120
|
39
|
|
|
|
|
85
|
my $info_ref = App::RL::Common::decode_header($header); |
|
121
|
39
|
|
|
|
|
5561
|
$info_ref->{seq} = $seq; |
|
122
|
|
|
|
|
|
|
|
|
123
|
39
|
|
|
|
|
415
|
push @{$info_ary}, $info_ref; |
|
|
39
|
|
|
|
|
105
|
|
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
10
|
|
|
|
|
25
|
return $info_ary; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub parse_block_header { |
|
130
|
9
|
|
|
9
|
0
|
17
|
my $block = shift; |
|
131
|
|
|
|
|
|
|
|
|
132
|
9
|
|
|
|
|
40
|
my @lines = grep {/\S/} split /\n/, $block; |
|
|
72
|
|
|
|
|
135
|
|
|
133
|
9
|
50
|
|
|
|
23
|
Carp::croak "Numbers of headers not equal to seqs\n" if @lines % 2; |
|
134
|
|
|
|
|
|
|
|
|
135
|
9
|
|
|
|
|
36
|
tie my %info_of, "Tie::IxHash"; |
|
136
|
9
|
|
|
|
|
113
|
while (@lines) { |
|
137
|
36
|
|
|
|
|
319
|
my $header = shift @lines; |
|
138
|
36
|
|
|
|
|
111
|
$header =~ s/^\>//; |
|
139
|
36
|
|
|
|
|
95
|
chomp $header; |
|
140
|
|
|
|
|
|
|
|
|
141
|
36
|
|
|
|
|
46
|
my $seq = shift @lines; |
|
142
|
36
|
|
|
|
|
37
|
chomp $seq; |
|
143
|
|
|
|
|
|
|
|
|
144
|
36
|
|
|
|
|
72
|
my $info_ref = App::RL::Common::decode_header($header); |
|
145
|
36
|
|
|
|
|
4820
|
my $ess_header = App::RL::Common::encode_header( $info_ref, 1 ); |
|
146
|
36
|
|
|
|
|
2231
|
$info_ref->{seq} = $seq; |
|
147
|
36
|
|
|
|
|
489
|
$info_of{$ess_header} = $info_ref; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
9
|
|
|
|
|
111
|
return \%info_of; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub parse_axt_block { |
|
154
|
6
|
|
|
6
|
0
|
14
|
my $block = shift; |
|
155
|
6
|
|
|
|
|
9
|
my $length_of = shift; |
|
156
|
|
|
|
|
|
|
|
|
157
|
6
|
|
|
|
|
33
|
my @lines = grep {/\S/} split /\n/, $block; |
|
|
18
|
|
|
|
|
58
|
|
|
158
|
6
|
50
|
|
|
|
16
|
Carp::croak "A block of axt should contain three lines\n" if @lines != 3; |
|
159
|
|
|
|
|
|
|
|
|
160
|
6
|
|
|
|
|
43
|
my (undef, $first_chr, $first_start, $first_end, $second_chr, |
|
161
|
|
|
|
|
|
|
$second_start, $second_end, $query_strand, undef, |
|
162
|
|
|
|
|
|
|
) = split /\s+/, $lines[0]; |
|
163
|
|
|
|
|
|
|
|
|
164
|
6
|
100
|
|
|
|
20
|
if ( $query_strand eq "-" ) { |
|
165
|
3
|
100
|
66
|
|
|
15
|
if ( defined $length_of and ref $length_of eq "HASH" ) { |
|
166
|
1
|
50
|
|
|
|
8
|
if ( exists $length_of->{$second_chr} ) { |
|
167
|
1
|
|
|
|
|
4
|
$second_start = $length_of->{$second_chr} - $second_start + 1; |
|
168
|
1
|
|
|
|
|
4
|
$second_end = $length_of->{$second_chr} - $second_end + 1; |
|
169
|
1
|
|
|
|
|
3
|
( $second_start, $second_end ) = ( $second_end, $second_start ); |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
|
|
174
|
6
|
|
|
|
|
59
|
my $info_refs = [ |
|
175
|
|
|
|
|
|
|
{ name => "target", |
|
176
|
|
|
|
|
|
|
chr => $first_chr, |
|
177
|
|
|
|
|
|
|
start => $first_start, |
|
178
|
|
|
|
|
|
|
end => $first_end, |
|
179
|
|
|
|
|
|
|
strand => "+", |
|
180
|
|
|
|
|
|
|
seq => $lines[1], |
|
181
|
|
|
|
|
|
|
}, |
|
182
|
|
|
|
|
|
|
{ name => "query", |
|
183
|
|
|
|
|
|
|
chr => $second_chr, |
|
184
|
|
|
|
|
|
|
start => $second_start, |
|
185
|
|
|
|
|
|
|
end => $second_end, |
|
186
|
|
|
|
|
|
|
strand => $query_strand, |
|
187
|
|
|
|
|
|
|
seq => $lines[2], |
|
188
|
|
|
|
|
|
|
}, |
|
189
|
|
|
|
|
|
|
]; |
|
190
|
|
|
|
|
|
|
|
|
191
|
6
|
|
|
|
|
21
|
return $info_refs; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub parse_maf_block { |
|
195
|
4
|
|
|
4
|
0
|
18
|
my $block = shift; |
|
196
|
|
|
|
|
|
|
|
|
197
|
4
|
|
|
|
|
22
|
my @lines = grep {/\S/} split /\n/, $block; |
|
|
16
|
|
|
|
|
49
|
|
|
198
|
4
|
50
|
|
|
|
12
|
Carp::croak "A block of maf should contain s lines\n" unless @lines > 0; |
|
199
|
|
|
|
|
|
|
|
|
200
|
4
|
|
|
|
|
26
|
tie my %info_of, "Tie::IxHash"; |
|
201
|
|
|
|
|
|
|
|
|
202
|
4
|
|
|
|
|
67
|
for my $sline (@lines) { |
|
203
|
16
|
|
|
|
|
274
|
my ( undef, $src, $start, $size, $strand, $srcSize, $text ) = split /\s+/, $sline; |
|
204
|
|
|
|
|
|
|
|
|
205
|
16
|
|
|
|
|
41
|
my ( $species, $chr_name ) = split /\./, $src; |
|
206
|
16
|
50
|
|
|
|
37
|
$chr_name = $species if !defined $chr_name; |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# adjust coordinates to be one-based inclusive |
|
209
|
16
|
|
|
|
|
30
|
$start = $start + 1; |
|
210
|
|
|
|
|
|
|
|
|
211
|
16
|
|
|
|
|
27
|
my $end = $start + $size - 1; |
|
212
|
|
|
|
|
|
|
|
|
213
|
16
|
100
|
|
|
|
32
|
if ( $strand eq "-" ) { |
|
214
|
4
|
|
|
|
|
12
|
$start = $srcSize - $start + 1; |
|
215
|
4
|
|
|
|
|
6
|
$end = $srcSize - $end + 1; |
|
216
|
4
|
|
|
|
|
16
|
( $start, $end ) = ( $end, $start ); |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
|
|
219
|
16
|
|
|
|
|
97
|
$info_of{$species} = { |
|
220
|
|
|
|
|
|
|
name => $species, |
|
221
|
|
|
|
|
|
|
chr => $chr_name, |
|
222
|
|
|
|
|
|
|
start => $start, |
|
223
|
|
|
|
|
|
|
end => $end, |
|
224
|
|
|
|
|
|
|
strand => $strand, |
|
225
|
|
|
|
|
|
|
seq => $text, |
|
226
|
|
|
|
|
|
|
}; |
|
227
|
|
|
|
|
|
|
} |
|
228
|
|
|
|
|
|
|
|
|
229
|
4
|
|
|
|
|
63
|
return \%info_of; |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub revcom { |
|
233
|
10
|
|
|
10
|
0
|
2168
|
my $seq = shift; |
|
234
|
|
|
|
|
|
|
|
|
235
|
10
|
|
|
|
|
39
|
$seq =~ tr/ACGTMRWSYKVHDBNacgtmrwsykvhdbn-/TGCAKYWSRMBDHVNtgcakyswrmbdhvn-/; |
|
236
|
10
|
|
|
|
|
19
|
my $seq_rc = reverse $seq; |
|
237
|
|
|
|
|
|
|
|
|
238
|
10
|
|
|
|
|
28
|
return $seq_rc; |
|
239
|
|
|
|
|
|
|
} |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
sub seq_length { |
|
242
|
15
|
|
|
15
|
0
|
2125
|
my $seq = shift; |
|
243
|
|
|
|
|
|
|
|
|
244
|
15
|
|
|
|
|
36
|
my $gaps = $seq =~ tr/-/-/; |
|
245
|
|
|
|
|
|
|
|
|
246
|
15
|
|
|
|
|
68
|
return length($seq) - $gaps; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
#@returns AlignDB::IntSpan |
|
250
|
|
|
|
|
|
|
sub indel_intspan { |
|
251
|
208
|
|
|
208
|
0
|
4867
|
my $seq = shift; |
|
252
|
|
|
|
|
|
|
|
|
253
|
208
|
|
|
|
|
566
|
my $intspan = AlignDB::IntSpan->new(); |
|
254
|
208
|
|
|
|
|
2028
|
my $length = length($seq); |
|
255
|
|
|
|
|
|
|
|
|
256
|
208
|
|
|
|
|
299
|
my $offset = 0; |
|
257
|
208
|
|
|
|
|
282
|
my $start = 0; |
|
258
|
208
|
|
|
|
|
317
|
my $end = 0; |
|
259
|
208
|
|
|
|
|
407
|
for my $pos ( 1 .. $length ) { |
|
260
|
78177
|
|
|
|
|
103587
|
my $base = substr( $seq, $pos - 1, 1 ); |
|
261
|
78177
|
100
|
|
|
|
113751
|
if ( $base eq '-' ) { |
|
262
|
2271
|
100
|
|
|
|
3672
|
if ( $offset == 0 ) { |
|
263
|
536
|
|
|
|
|
689
|
$start = $pos; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
2271
|
|
|
|
|
2988
|
$offset++; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
else { |
|
268
|
75906
|
100
|
|
|
|
115090
|
if ( $offset != 0 ) { |
|
269
|
524
|
|
|
|
|
687
|
$end = $pos - 1; |
|
270
|
524
|
|
|
|
|
1171
|
$intspan->add_pair( $start, $end ); |
|
271
|
|
|
|
|
|
|
} |
|
272
|
75906
|
|
|
|
|
135229
|
$offset = 0; |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
} |
|
275
|
208
|
100
|
|
|
|
423
|
if ( $offset != 0 ) { |
|
276
|
12
|
|
|
|
|
37
|
$end = $length; |
|
277
|
12
|
|
|
|
|
39
|
$intspan->add_pair( $start, $end ); |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
|
|
280
|
208
|
|
|
|
|
1189
|
return $intspan; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
#@returns AlignDB::IntSpan |
|
284
|
|
|
|
|
|
|
sub seq_intspan { |
|
285
|
24
|
|
|
24
|
0
|
11074
|
my $seq = shift; |
|
286
|
|
|
|
|
|
|
|
|
287
|
24
|
|
|
|
|
103
|
my $intspan = AlignDB::IntSpan->new(); |
|
288
|
24
|
|
|
|
|
297
|
my $length = length($seq); |
|
289
|
24
|
|
|
|
|
99
|
$intspan->add_pair( 1, $length ); |
|
290
|
|
|
|
|
|
|
|
|
291
|
24
|
|
|
|
|
1460
|
$intspan->subtract( indel_intspan($seq) ); |
|
292
|
|
|
|
|
|
|
|
|
293
|
24
|
|
|
|
|
20487
|
return $intspan; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub align_seqs { |
|
297
|
0
|
|
|
0
|
0
|
0
|
my $seq_refs = shift; |
|
298
|
0
|
|
0
|
|
|
0
|
my $aln_bin = shift // "mafft"; |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# get executable |
|
301
|
0
|
|
|
|
|
0
|
my $bin; |
|
302
|
|
|
|
|
|
|
|
|
303
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin =~ /clus/i ) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
304
|
0
|
|
|
|
|
0
|
$aln_bin = 'clustalw'; |
|
305
|
0
|
|
|
|
|
0
|
for my $e (qw{clustalw clustal-w clustalw2}) { |
|
306
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
|
307
|
0
|
|
|
|
|
0
|
$bin = $e; |
|
308
|
0
|
|
|
|
|
0
|
last; |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
} |
|
311
|
|
|
|
|
|
|
} |
|
312
|
|
|
|
|
|
|
elsif ( $aln_bin =~ /musc/i ) { |
|
313
|
0
|
|
|
|
|
0
|
$aln_bin = 'muscle'; |
|
314
|
0
|
|
|
|
|
0
|
for my $e (qw{muscle}) { |
|
315
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
|
316
|
0
|
|
|
|
|
0
|
$bin = $e; |
|
317
|
0
|
|
|
|
|
0
|
last; |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
elsif ( $aln_bin =~ /maff/i ) { |
|
322
|
0
|
|
|
|
|
0
|
$aln_bin = 'mafft'; |
|
323
|
0
|
|
|
|
|
0
|
for my $e (qw{mafft}) { |
|
324
|
0
|
0
|
|
|
|
0
|
if ( IPC::Cmd::can_run($e) ) { |
|
325
|
0
|
|
|
|
|
0
|
$bin = $e; |
|
326
|
0
|
|
|
|
|
0
|
last; |
|
327
|
|
|
|
|
|
|
} |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
|
|
331
|
0
|
0
|
|
|
|
0
|
if ( !defined $bin ) { |
|
332
|
0
|
|
|
|
|
0
|
Carp::confess "Could not find the executable for $aln_bin\n"; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# temp in and out |
|
336
|
|
|
|
|
|
|
#@type Path::Tiny |
|
337
|
0
|
|
|
|
|
0
|
my $temp_in = Path::Tiny->tempfile("seq_in_XXXXXXXX"); |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
#@type Path::Tiny |
|
340
|
0
|
|
|
|
|
0
|
my $temp_out = Path::Tiny->tempfile("seq_out_XXXXXXXX"); |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# msa may change the order of sequences |
|
343
|
0
|
|
|
|
|
0
|
my @indexes = 0 .. scalar( @{$seq_refs} - 1 ); |
|
|
0
|
|
|
|
|
0
|
|
|
344
|
|
|
|
|
|
|
{ |
|
345
|
0
|
|
|
|
|
0
|
my $fh = $temp_in->openw(); |
|
|
0
|
|
|
|
|
0
|
|
|
346
|
0
|
|
|
|
|
0
|
for my $i (@indexes) { |
|
347
|
0
|
|
|
|
|
0
|
printf {$fh} ">seq_%d\n", $i; |
|
|
0
|
|
|
|
|
0
|
|
|
348
|
0
|
|
|
|
|
0
|
printf {$fh} "%s\n", $seq_refs->[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
349
|
|
|
|
|
|
|
} |
|
350
|
0
|
|
|
|
|
0
|
close $fh; |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
|
|
353
|
0
|
|
|
|
|
0
|
my @args; |
|
354
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin eq "clustalw" ) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
355
|
0
|
|
|
|
|
0
|
push @args, "-align -type=dna -output=fasta -outorder=input -quiet"; |
|
356
|
0
|
|
|
|
|
0
|
push @args, "-infile=" . $temp_in->absolute->stringify; |
|
357
|
0
|
|
|
|
|
0
|
push @args, "-outfile=" . $temp_out->absolute->stringify; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
elsif ( $aln_bin eq "muscle" ) { |
|
360
|
0
|
|
|
|
|
0
|
push @args, "-quiet"; |
|
361
|
0
|
|
|
|
|
0
|
push @args, "-in " . $temp_in->absolute->stringify; |
|
362
|
0
|
|
|
|
|
0
|
push @args, "-out " . $temp_out->absolute->stringify; |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
elsif ( $aln_bin eq "mafft" ) { |
|
365
|
0
|
|
|
|
|
0
|
push @args, "--quiet"; |
|
366
|
0
|
|
|
|
|
0
|
push @args, "--auto"; |
|
367
|
0
|
|
|
|
|
0
|
push @args, $temp_in->absolute->stringify; |
|
368
|
0
|
|
|
|
|
0
|
push @args, "> " . $temp_out->absolute->stringify; |
|
369
|
|
|
|
|
|
|
} |
|
370
|
|
|
|
|
|
|
|
|
371
|
0
|
|
|
|
|
0
|
my $cmd_line = join " ", ( $bin, @args ); |
|
372
|
0
|
|
|
|
|
0
|
my $ok = IPC::Cmd::run( command => $cmd_line ); |
|
373
|
|
|
|
|
|
|
|
|
374
|
0
|
0
|
|
|
|
0
|
if ( !$ok ) { |
|
375
|
0
|
|
|
|
|
0
|
Carp::confess("$aln_bin call failed\n"); |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
|
|
378
|
0
|
|
|
|
|
0
|
my @aligned; |
|
379
|
0
|
|
|
|
|
0
|
my $seq_of = read_fasta( $temp_out->absolute->stringify ); |
|
380
|
0
|
|
|
|
|
0
|
for my $i (@indexes) { |
|
381
|
0
|
|
|
|
|
0
|
push @aligned, $seq_of->{ "seq_" . $i }; |
|
382
|
|
|
|
|
|
|
} |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# delete .dnd files created by clustalw |
|
385
|
|
|
|
|
|
|
#printf STDERR "%s\n", $temp_in->absolute->stringify; |
|
386
|
0
|
0
|
|
|
|
0
|
if ( $aln_bin eq "clustalw" ) { |
|
387
|
0
|
|
|
|
|
0
|
my $dnd = $temp_in->absolute->stringify . ".dnd"; |
|
388
|
0
|
|
|
|
|
0
|
path($dnd)->remove; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
0
|
undef $temp_in; |
|
392
|
0
|
|
|
|
|
0
|
undef $temp_out; |
|
393
|
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
0
|
return \@aligned; |
|
395
|
|
|
|
|
|
|
} |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
sub align_seqs_quick { |
|
398
|
0
|
|
|
0
|
0
|
0
|
my $seq_refs = shift; |
|
399
|
0
|
|
|
|
|
0
|
my $aln_prog = shift; |
|
400
|
0
|
|
|
|
|
0
|
my $indel_pad = shift; |
|
401
|
0
|
|
|
|
|
0
|
my $indel_fill = shift; |
|
402
|
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
0
|
if ( !defined $aln_prog ) { |
|
404
|
0
|
|
|
|
|
0
|
$aln_prog = "mafft"; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
0
|
0
|
|
|
|
0
|
if ( !defined $indel_pad ) { |
|
407
|
0
|
|
|
|
|
0
|
$indel_pad = 50; |
|
408
|
|
|
|
|
|
|
} |
|
409
|
0
|
0
|
|
|
|
0
|
if ( !defined $indel_fill ) { |
|
410
|
0
|
|
|
|
|
0
|
$indel_fill = 50; |
|
411
|
|
|
|
|
|
|
} |
|
412
|
|
|
|
|
|
|
|
|
413
|
0
|
|
|
|
|
0
|
my @aligned = @{$seq_refs}; |
|
|
0
|
|
|
|
|
0
|
|
|
414
|
0
|
|
|
|
|
0
|
my $seq_count = scalar @aligned; |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# all indel regions |
|
417
|
0
|
|
|
|
|
0
|
my $realign_region = AlignDB::IntSpan->new(); |
|
418
|
0
|
|
|
|
|
0
|
for my $seq (@aligned) { |
|
419
|
0
|
|
|
|
|
0
|
my $indel_intspan = indel_intspan($seq); |
|
420
|
0
|
|
|
|
|
0
|
$indel_intspan = $indel_intspan->pad($indel_pad); |
|
421
|
0
|
|
|
|
|
0
|
$realign_region->merge($indel_intspan); |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# join adjacent realign regions |
|
425
|
0
|
|
|
|
|
0
|
$realign_region = $realign_region->fill($indel_fill); |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# realign all segments in realign_region |
|
428
|
0
|
|
|
|
|
0
|
for my $span ( reverse $realign_region->spans ) { |
|
429
|
0
|
|
|
|
|
0
|
my $seg_start = $span->[0]; |
|
430
|
0
|
|
|
|
|
0
|
my $seg_end = $span->[1]; |
|
431
|
|
|
|
|
|
|
|
|
432
|
0
|
|
|
|
|
0
|
my @segments; |
|
433
|
0
|
|
|
|
|
0
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
434
|
0
|
|
|
|
|
0
|
my $seg = substr( $aligned[$i], $seg_start - 1, $seg_end - $seg_start + 1 ); |
|
435
|
0
|
|
|
|
|
0
|
push @segments, $seg; |
|
436
|
|
|
|
|
|
|
} |
|
437
|
0
|
|
|
|
|
0
|
my $realigned_segments = align_seqs( \@segments ); |
|
438
|
|
|
|
|
|
|
|
|
439
|
0
|
|
|
|
|
0
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
440
|
0
|
|
|
|
|
0
|
my $seg = $realigned_segments->[$i]; |
|
441
|
0
|
|
|
|
|
0
|
$seg = uc $seg; |
|
442
|
0
|
|
|
|
|
0
|
substr( $aligned[$i], $seg_start - 1, $seg_end - $seg_start + 1, $seg ); |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
|
|
446
|
0
|
|
|
|
|
0
|
return \@aligned; |
|
447
|
|
|
|
|
|
|
} |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
#----------------------------# |
|
450
|
|
|
|
|
|
|
# trim pure dash regions |
|
451
|
|
|
|
|
|
|
#----------------------------# |
|
452
|
|
|
|
|
|
|
sub trim_pure_dash { |
|
453
|
10
|
|
|
10
|
0
|
1774
|
my $seq_refs = shift; |
|
454
|
|
|
|
|
|
|
|
|
455
|
10
|
|
|
|
|
17
|
my $seq_count = @{$seq_refs}; |
|
|
10
|
|
|
|
|
20
|
|
|
456
|
10
|
|
|
|
|
21
|
my $seq_length = length $seq_refs->[0]; |
|
457
|
|
|
|
|
|
|
|
|
458
|
10
|
50
|
|
|
|
23
|
return unless $seq_length; |
|
459
|
|
|
|
|
|
|
|
|
460
|
10
|
|
|
|
|
61
|
my $trim_region = AlignDB::IntSpan->new(); |
|
461
|
|
|
|
|
|
|
|
|
462
|
10
|
|
|
|
|
124
|
for my $pos ( 1 .. $seq_length ) { |
|
463
|
528
|
|
|
|
|
886
|
my @bases; |
|
464
|
528
|
|
|
|
|
753
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
465
|
2062
|
|
|
|
|
2473
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
|
466
|
2062
|
|
|
|
|
2893
|
push @bases, $base; |
|
467
|
|
|
|
|
|
|
} |
|
468
|
|
|
|
|
|
|
|
|
469
|
528
|
100
|
|
544
|
|
1114
|
if ( all { $_ eq '-' } @bases ) { |
|
|
544
|
|
|
|
|
1673
|
|
|
470
|
4
|
|
|
|
|
12
|
$trim_region->add($pos); |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
|
|
474
|
10
|
|
|
|
|
47
|
for my $span ( reverse $trim_region->spans ) { |
|
475
|
3
|
|
|
|
|
53
|
my $seg_start = $span->[0]; |
|
476
|
3
|
|
|
|
|
4
|
my $seg_end = $span->[1]; |
|
477
|
|
|
|
|
|
|
|
|
478
|
3
|
|
|
|
|
6
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
479
|
9
|
|
|
|
|
20
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
|
|
483
|
10
|
|
|
|
|
237
|
return; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
#----------------------------# |
|
487
|
|
|
|
|
|
|
# trim outgroup only sequence |
|
488
|
|
|
|
|
|
|
#----------------------------# |
|
489
|
|
|
|
|
|
|
# if intersect is superset of union |
|
490
|
|
|
|
|
|
|
# T G----C |
|
491
|
|
|
|
|
|
|
# Q G----C |
|
492
|
|
|
|
|
|
|
# O GAAAAC |
|
493
|
|
|
|
|
|
|
sub trim_outgroup { |
|
494
|
11
|
|
|
11
|
0
|
11411
|
my $seq_refs = shift; |
|
495
|
|
|
|
|
|
|
|
|
496
|
11
|
|
|
|
|
20
|
my $seq_count = scalar @{$seq_refs}; |
|
|
11
|
|
|
|
|
21
|
|
|
497
|
|
|
|
|
|
|
|
|
498
|
11
|
50
|
|
|
|
29
|
Carp::confess "Need three or more sequences\n" if $seq_count < 3; |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
# Don't expand indel set here. Last seq is outgroup |
|
501
|
11
|
|
|
|
|
20
|
my @indel_intspans; |
|
502
|
11
|
|
|
|
|
25
|
for my $i ( 0 .. $seq_count - 2 ) { |
|
503
|
22
|
|
|
|
|
49
|
my $indel_intspan = indel_intspan( $seq_refs->[$i] ); |
|
504
|
22
|
|
|
|
|
39
|
push @indel_intspans, $indel_intspan; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# find trim_region |
|
508
|
11
|
|
|
|
|
27
|
my $union_set = AlignDB::IntSpan::union(@indel_intspans); |
|
509
|
11
|
|
|
|
|
2074
|
my $intersect_set = AlignDB::IntSpan::intersect(@indel_intspans); |
|
510
|
|
|
|
|
|
|
|
|
511
|
11
|
|
|
|
|
3850
|
my $trim_region = AlignDB::IntSpan->new(); |
|
512
|
11
|
|
|
|
|
114
|
for my $span ( $union_set->runlists ) { |
|
513
|
23
|
100
|
|
|
|
2952
|
if ( $intersect_set->superset($span) ) { |
|
514
|
15
|
|
|
|
|
5351
|
$trim_region->add($span); |
|
515
|
|
|
|
|
|
|
} |
|
516
|
|
|
|
|
|
|
} |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# trim all segments in trim_region |
|
519
|
11
|
|
|
|
|
1796
|
for my $span ( reverse $trim_region->spans ) { |
|
520
|
13
|
|
|
|
|
207
|
my $seg_start = $span->[0]; |
|
521
|
13
|
|
|
|
|
20
|
my $seg_end = $span->[1]; |
|
522
|
|
|
|
|
|
|
|
|
523
|
13
|
|
|
|
|
25
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
524
|
39
|
|
|
|
|
85
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
} |
|
527
|
|
|
|
|
|
|
|
|
528
|
11
|
|
|
|
|
128
|
return; |
|
529
|
|
|
|
|
|
|
} |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
#----------------------------# |
|
532
|
|
|
|
|
|
|
# record complex indels and ingroup indels |
|
533
|
|
|
|
|
|
|
#----------------------------# |
|
534
|
|
|
|
|
|
|
# All ingroup intersect sets are parts of complex indels after trim_outgroup() |
|
535
|
|
|
|
|
|
|
# intersect 4-6 |
|
536
|
|
|
|
|
|
|
# T GGA--C |
|
537
|
|
|
|
|
|
|
# Q G----C |
|
538
|
|
|
|
|
|
|
# O GGAGAC |
|
539
|
|
|
|
|
|
|
# result, complex_region 2-3 |
|
540
|
|
|
|
|
|
|
# T GGAC |
|
541
|
|
|
|
|
|
|
# Q G--C |
|
542
|
|
|
|
|
|
|
# O GGAC |
|
543
|
|
|
|
|
|
|
sub trim_complex_indel { |
|
544
|
6
|
|
|
6
|
0
|
21
|
my $seq_refs = shift; |
|
545
|
|
|
|
|
|
|
|
|
546
|
6
|
|
|
|
|
10
|
my $seq_count = scalar @{$seq_refs}; |
|
|
6
|
|
|
|
|
9
|
|
|
547
|
6
|
|
|
|
|
16
|
my @ingroup_idx = ( 0 .. $seq_count - 2 ); |
|
548
|
|
|
|
|
|
|
|
|
549
|
6
|
50
|
|
|
|
15
|
Carp::confess "Need three or more sequences\n" if $seq_count < 3; |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
# Don't expand indel set here. Last seq is outgroup |
|
552
|
6
|
|
|
|
|
10
|
my @indel_intspans; |
|
553
|
6
|
|
|
|
|
8
|
for my $i (@ingroup_idx) { |
|
554
|
12
|
|
|
|
|
21
|
my $indel_intspan = indel_intspan( $seq_refs->[$i] ); |
|
555
|
12
|
|
|
|
|
22
|
push @indel_intspans, $indel_intspan; |
|
556
|
|
|
|
|
|
|
} |
|
557
|
6
|
|
|
|
|
15
|
my $outgroup_indel_intspan = indel_intspan( $seq_refs->[ $seq_count - 1 ] ); |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# find trim_region |
|
560
|
6
|
|
|
|
|
13
|
my $union_set = AlignDB::IntSpan::union(@indel_intspans); |
|
561
|
6
|
|
|
|
|
619
|
my $intersect_set = AlignDB::IntSpan::intersect(@indel_intspans); |
|
562
|
|
|
|
|
|
|
|
|
563
|
6
|
|
|
|
|
1287
|
my $complex_region = AlignDB::IntSpan->new(); |
|
564
|
6
|
|
|
|
|
59
|
for ( reverse $intersect_set->spans ) { |
|
565
|
3
|
|
|
|
|
60
|
my $seg_start = $_->[0]; |
|
566
|
3
|
|
|
|
|
4
|
my $seg_end = $_->[1]; |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
# trim sequence, including outgroup |
|
569
|
3
|
|
|
|
|
8
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
570
|
9
|
|
|
|
|
20
|
substr( $seq_refs->[$i], $seg_start - 1, $seg_end - $seg_start + 1, "" ); |
|
571
|
|
|
|
|
|
|
} |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# add to complex_region |
|
574
|
3
|
|
|
|
|
7
|
for my $span ( $union_set->runlists ) { |
|
575
|
4
|
|
|
|
|
203
|
my $sub_union_set = AlignDB::IntSpan->new($span); |
|
576
|
4
|
100
|
|
|
|
395
|
if ( $sub_union_set->superset("$seg_start-$seg_end") ) { |
|
577
|
3
|
|
|
|
|
1010
|
$complex_region->merge($sub_union_set); |
|
578
|
|
|
|
|
|
|
} |
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
# modify all related set |
|
582
|
3
|
|
|
|
|
518
|
$union_set = $union_set->banish_span( $seg_start, $seg_end ); |
|
583
|
3
|
|
|
|
|
488
|
for (@ingroup_idx) { |
|
584
|
6
|
|
|
|
|
350
|
$indel_intspans[$_] |
|
585
|
|
|
|
|
|
|
= $indel_intspans[$_]->banish_span( $seg_start, $seg_end ); |
|
586
|
|
|
|
|
|
|
} |
|
587
|
3
|
|
|
|
|
212
|
$outgroup_indel_intspan->banish_span( $seg_start, $seg_end ); |
|
588
|
3
|
|
|
|
|
338
|
$complex_region = $complex_region->banish_span( $seg_start, $seg_end ); |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
# add ingroup-outgroup complex indels to complex_region |
|
592
|
6
|
|
|
|
|
385
|
for my $i (@ingroup_idx) { |
|
593
|
12
|
|
|
|
|
1261
|
my $outgroup_intersect_intspan = $outgroup_indel_intspan->intersect( $indel_intspans[$i] ); |
|
594
|
12
|
|
|
|
|
2507
|
for my $sub_out_set ( $outgroup_intersect_intspan->sets ) { |
|
595
|
4
|
|
|
|
|
510
|
for my $sub_union_set ( $union_set->sets ) { |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# union_set > intersect_set |
|
598
|
5
|
50
|
|
|
|
921
|
if ( $sub_union_set->larger_than($sub_out_set) ) { |
|
599
|
0
|
|
|
|
|
0
|
$complex_region->merge($sub_union_set); |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
} |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
|
|
605
|
6
|
|
|
|
|
97
|
return $complex_region->runlist; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
# read normal fasta files |
|
609
|
|
|
|
|
|
|
sub read_fasta { |
|
610
|
1
|
|
|
1
|
0
|
565
|
my $infile = shift; |
|
611
|
|
|
|
|
|
|
|
|
612
|
1
|
|
|
|
|
4
|
my ( @names, %seqs ); |
|
613
|
|
|
|
|
|
|
|
|
614
|
1
|
|
|
|
|
11
|
my $in_fh = IO::Zlib->new( $infile, "rb" ); |
|
615
|
|
|
|
|
|
|
|
|
616
|
1
|
|
|
|
|
2003
|
my $cur_name; |
|
617
|
1
|
|
|
|
|
6
|
while ( my $line = $in_fh->getline ) { |
|
618
|
8
|
|
|
|
|
1205
|
chomp $line; |
|
619
|
8
|
50
|
33
|
|
|
42
|
if ( $line eq '' or substr( $line, 0, 1 ) eq " " ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
620
|
0
|
|
|
|
|
0
|
next; |
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
elsif ( substr( $line, 0, 1 ) eq "#" ) { |
|
623
|
0
|
|
|
|
|
0
|
next; |
|
624
|
|
|
|
|
|
|
} |
|
625
|
|
|
|
|
|
|
elsif ( substr( $line, 0, 1 ) eq ">" ) { |
|
626
|
4
|
|
|
|
|
12
|
($cur_name) = split /\s+/, $line; |
|
627
|
4
|
|
|
|
|
16
|
$cur_name =~ s/^>//; |
|
628
|
4
|
|
|
|
|
10
|
push @names, $cur_name; |
|
629
|
4
|
|
|
|
|
15
|
$seqs{$cur_name} = ''; |
|
630
|
|
|
|
|
|
|
} |
|
631
|
|
|
|
|
|
|
else { |
|
632
|
4
|
|
|
|
|
16
|
$seqs{$cur_name} .= $line; |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
} |
|
635
|
|
|
|
|
|
|
|
|
636
|
1
|
|
|
|
|
301
|
$in_fh->close; |
|
637
|
|
|
|
|
|
|
|
|
638
|
1
|
|
|
|
|
194
|
tie my %seq_of, "Tie::IxHash"; |
|
639
|
1
|
|
|
|
|
30
|
for my $name (@names) { |
|
640
|
4
|
|
|
|
|
62
|
$seq_of{$name} = $seqs{$name}; |
|
641
|
|
|
|
|
|
|
} |
|
642
|
1
|
|
|
|
|
21
|
return \%seq_of; |
|
643
|
|
|
|
|
|
|
} |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
sub calc_gc_ratio { |
|
646
|
13
|
|
|
13
|
0
|
5515
|
my $seq_refs = shift; |
|
647
|
|
|
|
|
|
|
|
|
648
|
13
|
|
|
|
|
20
|
my $seq_count = scalar @{$seq_refs}; |
|
|
13
|
|
|
|
|
27
|
|
|
649
|
|
|
|
|
|
|
|
|
650
|
13
|
|
|
|
|
20
|
my @ratios; |
|
651
|
13
|
|
|
|
|
47
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# Count all four bases |
|
654
|
17
|
|
|
|
|
32
|
my $a_count = $seq_refs->[$i] =~ tr/Aa/Aa/; |
|
655
|
17
|
|
|
|
|
23
|
my $g_count = $seq_refs->[$i] =~ tr/Gg/Gg/; |
|
656
|
17
|
|
|
|
|
30
|
my $c_count = $seq_refs->[$i] =~ tr/Cc/Cc/; |
|
657
|
17
|
|
|
|
|
21
|
my $t_count = $seq_refs->[$i] =~ tr/Tt/Tt/; |
|
658
|
|
|
|
|
|
|
|
|
659
|
17
|
|
|
|
|
27
|
my $four_count = $a_count + $g_count + $c_count + $t_count; |
|
660
|
17
|
|
|
|
|
23
|
my $gc_count = $g_count + $c_count; |
|
661
|
|
|
|
|
|
|
|
|
662
|
17
|
50
|
|
|
|
33
|
if ( $four_count == 0 ) { |
|
663
|
0
|
|
|
|
|
0
|
next; |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
else { |
|
666
|
17
|
|
|
|
|
27
|
my $gc_ratio = $gc_count / $four_count; |
|
667
|
17
|
|
|
|
|
39
|
push @ratios, $gc_ratio; |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
} |
|
670
|
|
|
|
|
|
|
|
|
671
|
13
|
|
|
|
|
33
|
return mean(@ratios); |
|
672
|
|
|
|
|
|
|
} |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
sub pair_D { |
|
675
|
60
|
|
|
60
|
0
|
361
|
my $seq_refs = shift; |
|
676
|
|
|
|
|
|
|
|
|
677
|
60
|
|
|
|
|
68
|
my $seq_count = scalar @{$seq_refs}; |
|
|
60
|
|
|
|
|
86
|
|
|
678
|
60
|
50
|
|
|
|
118
|
if ( $seq_count != 2 ) { |
|
679
|
0
|
|
|
|
|
0
|
Carp::confess "Need two sequences\n"; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
60
|
|
|
|
|
95
|
my $legnth = length $seq_refs->[0]; |
|
683
|
|
|
|
|
|
|
|
|
684
|
60
|
|
|
|
|
108
|
my ( $comparable_bases, $differences ) = (0) x 2; |
|
685
|
|
|
|
|
|
|
|
|
686
|
60
|
|
|
|
|
109
|
for my $pos ( 1 .. $legnth ) { |
|
687
|
4943
|
|
|
|
|
7113
|
my $base0 = substr $seq_refs->[0], $pos - 1, 1; |
|
688
|
4943
|
|
|
|
|
6671
|
my $base1 = substr $seq_refs->[1], $pos - 1, 1; |
|
689
|
|
|
|
|
|
|
|
|
690
|
4943
|
100
|
100
|
|
|
15460
|
if ( $base0 =~ /[atcg]/i |
|
691
|
|
|
|
|
|
|
&& $base1 =~ /[atcg]/i ) |
|
692
|
|
|
|
|
|
|
{ |
|
693
|
4858
|
|
|
|
|
5808
|
$comparable_bases++; |
|
694
|
4858
|
100
|
|
|
|
8038
|
if ( $base0 ne $base1 ) { |
|
695
|
915
|
|
|
|
|
1279
|
$differences++; |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
} |
|
698
|
|
|
|
|
|
|
} |
|
699
|
|
|
|
|
|
|
|
|
700
|
60
|
50
|
|
|
|
108
|
if ( $comparable_bases == 0 ) { |
|
701
|
0
|
|
|
|
|
0
|
Carp::carp "comparable_bases == 0\n"; |
|
702
|
0
|
|
|
|
|
0
|
return 0; |
|
703
|
|
|
|
|
|
|
} |
|
704
|
|
|
|
|
|
|
else { |
|
705
|
60
|
|
|
|
|
153
|
return $differences / $comparable_bases; |
|
706
|
|
|
|
|
|
|
} |
|
707
|
|
|
|
|
|
|
} |
|
708
|
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# Split D value to D1 (substitutions in first_seq), D2( substitutions in second_seq) and Dcomplex |
|
710
|
|
|
|
|
|
|
# (substitutions can't be referred) |
|
711
|
|
|
|
|
|
|
sub ref_pair_D { |
|
712
|
7
|
|
|
7
|
0
|
2742
|
my $seq_refs = shift; # first, second, outgroup |
|
713
|
|
|
|
|
|
|
|
|
714
|
7
|
|
|
|
|
25
|
my $seq_count = scalar @{$seq_refs}; |
|
|
7
|
|
|
|
|
16
|
|
|
715
|
7
|
50
|
|
|
|
18
|
if ( $seq_count != 3 ) { |
|
716
|
0
|
|
|
|
|
0
|
Carp::confess "Need three sequences\n"; |
|
717
|
|
|
|
|
|
|
} |
|
718
|
|
|
|
|
|
|
|
|
719
|
7
|
|
|
|
|
15
|
my ( $d1, $d2, $dc ) = (0) x 3; |
|
720
|
7
|
|
|
|
|
12
|
my $length = length $seq_refs->[0]; |
|
721
|
|
|
|
|
|
|
|
|
722
|
7
|
50
|
|
|
|
16
|
return ( $d1, $d2, $dc ) if $length == 0; |
|
723
|
|
|
|
|
|
|
|
|
724
|
7
|
|
|
|
|
15
|
for my $pos ( 1 .. $length ) { |
|
725
|
70
|
|
|
|
|
106
|
my $base0 = substr $seq_refs->[0], $pos - 1, 1; |
|
726
|
70
|
|
|
|
|
91
|
my $base1 = substr $seq_refs->[1], $pos - 1, 1; |
|
727
|
70
|
|
|
|
|
106
|
my $base_og = substr $seq_refs->[2], $pos - 1, 1; |
|
728
|
70
|
100
|
|
|
|
128
|
if ( $base0 ne $base1 ) { |
|
729
|
8
|
100
|
33
|
|
|
59
|
if ( $base0 =~ /[atcg]/i |
|
|
|
|
66
|
|
|
|
|
|
730
|
|
|
|
|
|
|
&& $base1 =~ /[atcg]/i |
|
731
|
|
|
|
|
|
|
&& $base_og =~ /[atcg]/i ) |
|
732
|
|
|
|
|
|
|
{ |
|
733
|
7
|
100
|
|
|
|
17
|
if ( $base1 eq $base_og ) { |
|
|
|
100
|
|
|
|
|
|
|
734
|
3
|
|
|
|
|
7
|
$d1++; |
|
735
|
|
|
|
|
|
|
} |
|
736
|
|
|
|
|
|
|
elsif ( $base0 eq $base_og ) { |
|
737
|
3
|
|
|
|
|
5
|
$d2++; |
|
738
|
|
|
|
|
|
|
} |
|
739
|
|
|
|
|
|
|
else { |
|
740
|
1
|
|
|
|
|
3
|
$dc++; |
|
741
|
|
|
|
|
|
|
} |
|
742
|
|
|
|
|
|
|
} |
|
743
|
|
|
|
|
|
|
else { |
|
744
|
1
|
|
|
|
|
2
|
$dc++; |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
} |
|
747
|
|
|
|
|
|
|
} |
|
748
|
|
|
|
|
|
|
|
|
749
|
7
|
|
|
|
|
14
|
for ( $d1, $d2, $dc ) { |
|
750
|
21
|
|
|
|
|
30
|
$_ /= $length; |
|
751
|
|
|
|
|
|
|
} |
|
752
|
|
|
|
|
|
|
|
|
753
|
7
|
|
|
|
|
22
|
return ( $d1, $d2, $dc ); |
|
754
|
|
|
|
|
|
|
} |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
sub multi_seq_stat { |
|
757
|
11
|
|
|
11
|
0
|
1290
|
my $seq_refs = shift; |
|
758
|
|
|
|
|
|
|
|
|
759
|
11
|
|
|
|
|
17
|
my $seq_count = scalar @{$seq_refs}; |
|
|
11
|
|
|
|
|
18
|
|
|
760
|
11
|
50
|
|
|
|
28
|
if ( $seq_count < 2 ) { |
|
761
|
0
|
|
|
|
|
0
|
Carp::confess "Need two or more sequences\n"; |
|
762
|
|
|
|
|
|
|
} |
|
763
|
|
|
|
|
|
|
|
|
764
|
11
|
|
|
|
|
25
|
my $legnth = length $seq_refs->[0]; |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
# For every positions, search for polymorphism_site |
|
767
|
11
|
|
|
|
|
30
|
my ( $comparable_bases, $identities, $differences, $gaps, $ns, $align_errors, ) = (0) x 6; |
|
768
|
11
|
|
|
|
|
28
|
for my $pos ( 1 .. $legnth ) { |
|
769
|
746
|
|
|
|
|
1083
|
my @bases = (); |
|
770
|
746
|
|
|
|
|
1238
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
771
|
2645
|
|
|
|
|
3815
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
|
772
|
2645
|
|
|
|
|
4159
|
push @bases, $base; |
|
773
|
|
|
|
|
|
|
} |
|
774
|
746
|
|
|
|
|
1126
|
@bases = uniq(@bases); |
|
775
|
|
|
|
|
|
|
|
|
776
|
746
|
100
|
|
991
|
|
2055
|
if ( all { $_ =~ /[agct]/i } @bases ) { |
|
|
991
|
100
|
|
|
|
3180
|
|
|
777
|
725
|
|
|
|
|
932
|
$comparable_bases++; |
|
778
|
725
|
100
|
|
936
|
|
1778
|
if ( all { $_ eq $bases[0] } @bases ) { |
|
|
936
|
|
|
|
|
2326
|
|
|
779
|
514
|
|
|
|
|
1278
|
$identities++; |
|
780
|
|
|
|
|
|
|
} |
|
781
|
|
|
|
|
|
|
else { |
|
782
|
211
|
|
|
|
|
557
|
$differences++; |
|
783
|
|
|
|
|
|
|
} |
|
784
|
|
|
|
|
|
|
} |
|
785
|
41
|
|
|
41
|
|
113
|
elsif ( any { $_ eq '-' } @bases ) { |
|
786
|
20
|
|
|
|
|
53
|
$gaps++; |
|
787
|
|
|
|
|
|
|
} |
|
788
|
|
|
|
|
|
|
else { |
|
789
|
1
|
|
|
|
|
3
|
$ns++; |
|
790
|
|
|
|
|
|
|
} |
|
791
|
|
|
|
|
|
|
} |
|
792
|
11
|
50
|
|
|
|
28
|
if ( $comparable_bases == 0 ) { |
|
793
|
0
|
|
|
|
|
0
|
print YAML::Syck::Dump { seqs => $seq_refs, }; |
|
794
|
0
|
|
|
|
|
0
|
Carp::carp "number_of_comparable_bases == 0!!\n"; |
|
795
|
0
|
|
|
|
|
0
|
return [ $legnth, $comparable_bases, $identities, $differences, |
|
796
|
|
|
|
|
|
|
$gaps, $ns, $legnth, undef, ]; |
|
797
|
|
|
|
|
|
|
} |
|
798
|
|
|
|
|
|
|
|
|
799
|
11
|
|
|
|
|
28
|
my @all_Ds; |
|
800
|
11
|
|
|
|
|
32
|
for ( my $i = 0; $i < $seq_count; $i++ ) { |
|
801
|
35
|
|
|
|
|
90
|
for ( my $j = $i + 1; $j < $seq_count; $j++ ) { |
|
802
|
42
|
|
|
|
|
114
|
my $D = pair_D( [ $seq_refs->[$i], $seq_refs->[$j] ] ); |
|
803
|
42
|
|
|
|
|
133
|
push @all_Ds, $D; |
|
804
|
|
|
|
|
|
|
} |
|
805
|
|
|
|
|
|
|
} |
|
806
|
|
|
|
|
|
|
|
|
807
|
11
|
|
|
|
|
29
|
my $D = mean(@all_Ds); |
|
808
|
|
|
|
|
|
|
|
|
809
|
11
|
|
|
|
|
49
|
return [ $legnth, $comparable_bases, $identities, $differences, |
|
810
|
|
|
|
|
|
|
$gaps, $ns, $align_errors, $D, ]; |
|
811
|
|
|
|
|
|
|
} |
|
812
|
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
sub get_snps { |
|
814
|
40
|
|
|
40
|
0
|
3042
|
my $seq_refs = shift; |
|
815
|
|
|
|
|
|
|
|
|
816
|
40
|
|
|
|
|
83
|
my $align_length = length $seq_refs->[0]; |
|
817
|
40
|
|
|
|
|
64
|
my $seq_count = scalar @{$seq_refs}; |
|
|
40
|
|
|
|
|
70
|
|
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
# SNPs |
|
820
|
40
|
|
|
|
|
73
|
my $snp_bases_of = {}; |
|
821
|
40
|
|
|
|
|
89
|
for my $pos ( 1 .. $align_length ) { |
|
822
|
40657
|
|
|
|
|
52216
|
my @bases; |
|
823
|
40657
|
|
|
|
|
67806
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
824
|
388137
|
|
|
|
|
530368
|
my $base = substr( $seq_refs->[$i], $pos - 1, 1 ); |
|
825
|
388137
|
|
|
|
|
574835
|
push @bases, $base; |
|
826
|
|
|
|
|
|
|
} |
|
827
|
|
|
|
|
|
|
|
|
828
|
40657
|
100
|
|
383322
|
|
99280
|
if ( all { $_ =~ /[agct]/i } @bases ) { |
|
|
383322
|
|
|
|
|
981508
|
|
|
829
|
39884
|
100
|
|
376606
|
|
98363
|
if ( any { $_ ne $bases[0] } @bases ) { |
|
|
376606
|
|
|
|
|
762196
|
|
|
830
|
1617
|
|
|
|
|
8068
|
$snp_bases_of->{$pos} = \@bases; |
|
831
|
|
|
|
|
|
|
} |
|
832
|
|
|
|
|
|
|
} |
|
833
|
|
|
|
|
|
|
} |
|
834
|
|
|
|
|
|
|
|
|
835
|
40
|
|
|
|
|
68
|
my @sites; |
|
836
|
40
|
|
|
|
|
77
|
for my $pos ( sort { $a <=> $b } keys %{$snp_bases_of} ) { |
|
|
9823
|
|
|
|
|
12115
|
|
|
|
40
|
|
|
|
|
674
|
|
|
837
|
|
|
|
|
|
|
|
|
838
|
1617
|
|
|
|
|
2250
|
my @bases = @{ $snp_bases_of->{$pos} }; |
|
|
1617
|
|
|
|
|
4148
|
|
|
839
|
|
|
|
|
|
|
|
|
840
|
1617
|
|
|
|
|
2225
|
my $target_base = $bases[0]; |
|
841
|
1617
|
|
|
|
|
2941
|
my $all_bases = join '', @bases; |
|
842
|
|
|
|
|
|
|
|
|
843
|
1617
|
|
|
|
|
2207
|
my $query_base; |
|
844
|
|
|
|
|
|
|
my $mutant_to; |
|
845
|
1617
|
|
|
|
|
1942
|
my $snp_freq = 0; |
|
846
|
1617
|
|
|
|
|
1933
|
my $snp_occured; |
|
847
|
1617
|
|
|
|
|
3877
|
my @class = uniq( sort @bases ); |
|
848
|
1617
|
50
|
|
|
|
3513
|
if ( scalar @class < 2 ) { |
|
|
|
100
|
|
|
|
|
|
|
849
|
0
|
|
|
|
|
0
|
Carp::confess "no snp\n"; |
|
850
|
|
|
|
|
|
|
} |
|
851
|
|
|
|
|
|
|
elsif ( scalar @class > 2 ) { |
|
852
|
84
|
|
|
|
|
122
|
$snp_freq = -1; |
|
853
|
84
|
|
|
|
|
111
|
$snp_occured = 'unknown'; |
|
854
|
84
|
|
|
|
|
118
|
$query_base = ""; |
|
855
|
84
|
|
|
|
|
115
|
$mutant_to = ""; |
|
856
|
|
|
|
|
|
|
} |
|
857
|
|
|
|
|
|
|
else { |
|
858
|
1533
|
|
|
|
|
2420
|
for (@bases) { |
|
859
|
10462
|
100
|
|
|
|
15292
|
if ( $target_base ne $_ ) { |
|
860
|
3619
|
|
|
|
|
4350
|
$snp_freq++; |
|
861
|
3619
|
|
|
|
|
4861
|
$snp_occured .= '0'; |
|
862
|
|
|
|
|
|
|
} |
|
863
|
|
|
|
|
|
|
else { |
|
864
|
6843
|
|
|
|
|
9358
|
$snp_occured .= '1'; |
|
865
|
|
|
|
|
|
|
} |
|
866
|
|
|
|
|
|
|
} |
|
867
|
1533
|
|
|
|
|
2149
|
($query_base) = grep { $_ ne $target_base } @bases; |
|
|
10462
|
|
|
|
|
16272
|
|
|
868
|
1533
|
|
|
|
|
2577
|
$mutant_to = $target_base . '<->' . $query_base; |
|
869
|
|
|
|
|
|
|
} |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# here freq is the minor allele freq |
|
872
|
1617
|
|
|
|
|
3022
|
$snp_freq = List::Util::min( $snp_freq, $seq_count - $snp_freq ); |
|
873
|
|
|
|
|
|
|
|
|
874
|
1617
|
|
|
|
|
8704
|
push @sites, |
|
875
|
|
|
|
|
|
|
{ |
|
876
|
|
|
|
|
|
|
snp_pos => $pos, |
|
877
|
|
|
|
|
|
|
snp_target_base => $target_base, |
|
878
|
|
|
|
|
|
|
snp_query_base => $query_base, |
|
879
|
|
|
|
|
|
|
snp_all_bases => $all_bases, |
|
880
|
|
|
|
|
|
|
snp_mutant_to => $mutant_to, |
|
881
|
|
|
|
|
|
|
snp_freq => $snp_freq, |
|
882
|
|
|
|
|
|
|
snp_occured => $snp_occured, |
|
883
|
|
|
|
|
|
|
}; |
|
884
|
|
|
|
|
|
|
} |
|
885
|
|
|
|
|
|
|
|
|
886
|
40
|
|
|
|
|
1119
|
return \@sites; |
|
887
|
|
|
|
|
|
|
} |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
sub polarize_snp { |
|
890
|
13
|
|
|
13
|
0
|
67
|
my $sites = shift; |
|
891
|
13
|
|
|
|
|
25
|
my $outgroup_seq = shift; |
|
892
|
|
|
|
|
|
|
|
|
893
|
13
|
|
|
|
|
24
|
for my $site ( @{$sites} ) { |
|
|
13
|
|
|
|
|
25
|
|
|
894
|
195
|
|
|
|
|
360
|
my $outgroup_base = substr $outgroup_seq, $site->{snp_pos} - 1, 1; |
|
895
|
|
|
|
|
|
|
|
|
896
|
195
|
|
|
|
|
400
|
my @nts = split '', $site->{snp_all_bases}; |
|
897
|
195
|
|
|
|
|
254
|
my @class; |
|
898
|
195
|
|
|
|
|
278
|
for my $nt (@nts) { |
|
899
|
582
|
|
|
|
|
737
|
my $class_bool = 0; |
|
900
|
582
|
|
|
|
|
821
|
for (@class) { |
|
901
|
571
|
100
|
|
|
|
1070
|
if ( $_ eq $nt ) { $class_bool = 1; } |
|
|
188
|
|
|
|
|
275
|
|
|
902
|
|
|
|
|
|
|
} |
|
903
|
582
|
100
|
|
|
|
969
|
unless ($class_bool) { |
|
904
|
394
|
|
|
|
|
694
|
push @class, $nt; |
|
905
|
|
|
|
|
|
|
} |
|
906
|
|
|
|
|
|
|
} |
|
907
|
|
|
|
|
|
|
|
|
908
|
195
|
|
|
|
|
286
|
my ( $mutant_to, $snp_freq, $snp_occured ); |
|
909
|
|
|
|
|
|
|
|
|
910
|
195
|
50
|
|
|
|
391
|
if ( scalar @class < 2 ) { |
|
|
|
100
|
|
|
|
|
|
|
911
|
0
|
|
|
|
|
0
|
Carp::confess "Not a real SNP\n"; |
|
912
|
|
|
|
|
|
|
} |
|
913
|
|
|
|
|
|
|
elsif ( scalar @class == 2 ) { |
|
914
|
191
|
|
|
|
|
271
|
for my $nt (@nts) { |
|
915
|
570
|
100
|
|
|
|
914
|
if ( $nt eq $outgroup_base ) { |
|
916
|
255
|
|
|
|
|
372
|
$snp_occured .= '0'; |
|
917
|
|
|
|
|
|
|
} |
|
918
|
|
|
|
|
|
|
else { |
|
919
|
315
|
|
|
|
|
419
|
$snp_occured .= '1'; |
|
920
|
315
|
|
|
|
|
387
|
$snp_freq++; |
|
921
|
315
|
|
|
|
|
506
|
$mutant_to = "$outgroup_base->$nt"; |
|
922
|
|
|
|
|
|
|
} |
|
923
|
|
|
|
|
|
|
} |
|
924
|
|
|
|
|
|
|
} |
|
925
|
|
|
|
|
|
|
else { |
|
926
|
4
|
|
|
|
|
7
|
$snp_freq = -1; |
|
927
|
4
|
|
|
|
|
7
|
$mutant_to = 'Complex'; |
|
928
|
4
|
|
|
|
|
8
|
$snp_occured = 'unknown'; |
|
929
|
|
|
|
|
|
|
} |
|
930
|
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
# outgroup_base is not equal to any nts |
|
932
|
195
|
100
|
|
|
|
416
|
if ( $snp_occured eq ( '1' x ( length $snp_occured ) ) ) { |
|
933
|
28
|
|
|
|
|
41
|
$snp_freq = -1; |
|
934
|
28
|
|
|
|
|
40
|
$mutant_to = 'Complex'; |
|
935
|
28
|
|
|
|
|
35
|
$snp_occured = 'unknown'; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
|
|
|
|
|
|
|
|
938
|
195
|
|
|
|
|
455
|
$site->{snp_outgroup_base} = $outgroup_base; |
|
939
|
195
|
|
|
|
|
278
|
$site->{snp_mutant_to} = $mutant_to; |
|
940
|
195
|
|
|
|
|
241
|
$site->{snp_freq} = $snp_freq; |
|
941
|
195
|
|
|
|
|
397
|
$site->{snp_occured} = $snp_occured; |
|
942
|
|
|
|
|
|
|
} |
|
943
|
|
|
|
|
|
|
|
|
944
|
13
|
|
|
|
|
33
|
return $sites; |
|
945
|
|
|
|
|
|
|
} |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
sub get_indels { |
|
948
|
27
|
|
|
27
|
0
|
2076
|
my $seq_refs = shift; |
|
949
|
|
|
|
|
|
|
|
|
950
|
27
|
|
|
|
|
35
|
my $seq_count = scalar @{$seq_refs}; |
|
|
27
|
|
|
|
|
55
|
|
|
951
|
|
|
|
|
|
|
|
|
952
|
27
|
|
|
|
|
165
|
my $indel_set = AlignDB::IntSpan->new(); |
|
953
|
27
|
|
|
|
|
389
|
for my $i ( 0 .. $seq_count - 1 ) { |
|
954
|
99
|
|
|
|
|
4789
|
my $seq_indel_set = indel_intspan( $seq_refs->[$i] ); |
|
955
|
99
|
|
|
|
|
258
|
$indel_set->merge($seq_indel_set); |
|
956
|
|
|
|
|
|
|
} |
|
957
|
|
|
|
|
|
|
|
|
958
|
27
|
|
|
|
|
3862
|
my @sites; |
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# indels |
|
961
|
27
|
|
|
|
|
76
|
for my $cur_indel ( $indel_set->spans ) { |
|
962
|
52
|
|
|
|
|
462
|
my ( $indel_start, $indel_end ) = @{$cur_indel}; |
|
|
52
|
|
|
|
|
101
|
|
|
963
|
52
|
|
|
|
|
84
|
my $indel_length = $indel_end - $indel_start + 1; |
|
964
|
|
|
|
|
|
|
|
|
965
|
52
|
|
|
|
|
75
|
my @indel_seqs; |
|
966
|
52
|
|
|
|
|
67
|
for my $seq ( @{$seq_refs} ) { |
|
|
52
|
|
|
|
|
89
|
|
|
967
|
194
|
|
|
|
|
375
|
push @indel_seqs, ( substr $seq, $indel_start - 1, $indel_length ); |
|
968
|
|
|
|
|
|
|
} |
|
969
|
52
|
|
|
|
|
120
|
my $indel_all_seqs = join "|", @indel_seqs; |
|
970
|
|
|
|
|
|
|
|
|
971
|
52
|
|
|
|
|
78
|
my $indel_type; |
|
972
|
52
|
|
|
|
|
106
|
my @uniq_indel_seqs = uniq(@indel_seqs); |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# seqs with least '-' char wins |
|
975
|
118
|
|
|
|
|
237
|
my ($indel_seq) = map { $_->[0] } |
|
976
|
80
|
|
|
|
|
183
|
sort { $a->[1] <=> $b->[1] } |
|
977
|
52
|
|
|
|
|
101
|
map { [ $_, tr/-/-/ ] } @uniq_indel_seqs; |
|
|
118
|
|
|
|
|
332
|
|
|
978
|
|
|
|
|
|
|
|
|
979
|
52
|
50
|
|
|
|
205
|
if ( scalar @uniq_indel_seqs < 2 ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
980
|
0
|
|
|
|
|
0
|
Carp::confess "no indel!\n"; |
|
981
|
0
|
|
|
|
|
0
|
next; |
|
982
|
|
|
|
|
|
|
} |
|
983
|
|
|
|
|
|
|
elsif ( scalar @uniq_indel_seqs > 2 ) { |
|
984
|
14
|
|
|
|
|
30
|
$indel_type = 'C'; |
|
985
|
|
|
|
|
|
|
} |
|
986
|
|
|
|
|
|
|
elsif ( $indel_seq =~ /-/ ) { |
|
987
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
|
988
|
|
|
|
|
|
|
} |
|
989
|
|
|
|
|
|
|
else { |
|
990
|
|
|
|
|
|
|
# 'D': means deletion relative to target/first seq |
|
991
|
|
|
|
|
|
|
# target is ---- |
|
992
|
|
|
|
|
|
|
# 'I': means insertion relative to target/first seq |
|
993
|
|
|
|
|
|
|
# target is AAAA |
|
994
|
38
|
100
|
|
|
|
84
|
if ( $indel_seqs[0] eq $indel_seq ) { |
|
995
|
27
|
|
|
|
|
46
|
$indel_type = 'I'; |
|
996
|
|
|
|
|
|
|
} |
|
997
|
|
|
|
|
|
|
else { |
|
998
|
11
|
|
|
|
|
26
|
$indel_type = 'D'; |
|
999
|
|
|
|
|
|
|
} |
|
1000
|
|
|
|
|
|
|
} |
|
1001
|
|
|
|
|
|
|
|
|
1002
|
52
|
|
|
|
|
136
|
my $indel_freq = 0; |
|
1003
|
52
|
|
|
|
|
71
|
my $indel_occured; |
|
1004
|
52
|
100
|
|
|
|
99
|
if ( $indel_type eq 'C' ) { |
|
1005
|
14
|
|
|
|
|
22
|
$indel_freq = -1; |
|
1006
|
14
|
|
|
|
|
26
|
$indel_occured = 'unknown'; |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
else { |
|
1009
|
38
|
|
|
|
|
65
|
for (@indel_seqs) { |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
# same as target '1', not '0' |
|
1012
|
138
|
100
|
|
|
|
231
|
if ( $indel_seqs[0] eq $_ ) { |
|
1013
|
84
|
|
|
|
|
111
|
$indel_freq++; |
|
1014
|
84
|
|
|
|
|
134
|
$indel_occured .= '1'; |
|
1015
|
|
|
|
|
|
|
} |
|
1016
|
|
|
|
|
|
|
else { |
|
1017
|
54
|
|
|
|
|
90
|
$indel_occured .= '0'; |
|
1018
|
|
|
|
|
|
|
} |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
} |
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# here freq is the minor allele freq |
|
1023
|
52
|
|
|
|
|
114
|
$indel_freq = List::Util::min( $indel_freq, $seq_count - $indel_freq ); |
|
1024
|
|
|
|
|
|
|
|
|
1025
|
52
|
|
|
|
|
368
|
push @sites, |
|
1026
|
|
|
|
|
|
|
{ |
|
1027
|
|
|
|
|
|
|
indel_start => $indel_start, |
|
1028
|
|
|
|
|
|
|
indel_end => $indel_end, |
|
1029
|
|
|
|
|
|
|
indel_length => $indel_length, |
|
1030
|
|
|
|
|
|
|
indel_seq => $indel_seq, |
|
1031
|
|
|
|
|
|
|
indel_all_seqs => $indel_all_seqs, |
|
1032
|
|
|
|
|
|
|
indel_freq => $indel_freq, |
|
1033
|
|
|
|
|
|
|
indel_occured => $indel_occured, |
|
1034
|
|
|
|
|
|
|
indel_type => $indel_type, |
|
1035
|
|
|
|
|
|
|
}; |
|
1036
|
|
|
|
|
|
|
} |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
27
|
|
|
|
|
347
|
return \@sites; |
|
1039
|
|
|
|
|
|
|
} |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
sub polarize_indel { |
|
1042
|
3
|
|
|
3
|
0
|
9
|
my $sites = shift; |
|
1043
|
3
|
|
|
|
|
7
|
my $outgroup_seq = shift; |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
3
|
|
|
|
|
9
|
my $outgroup_indel_set = indel_intspan($outgroup_seq); |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
3
|
|
|
|
|
7
|
for my $site ( @{$sites} ) { |
|
|
3
|
|
|
|
|
8
|
|
|
1048
|
5
|
|
|
|
|
18
|
my @indel_seqs = split /\|/, $site->{indel_all_seqs}; |
|
1049
|
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
my $indel_outgroup_seq = substr $outgroup_seq, $site->{indel_start} - 1, |
|
1051
|
5
|
|
|
|
|
27
|
$site->{indel_length}; |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
5
|
|
|
|
|
12
|
my ( $indel_type, $indel_occured, $indel_freq ); |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
my $indel_set |
|
1056
|
5
|
|
|
|
|
14
|
= AlignDB::IntSpan->new()->add_pair( $site->{indel_start}, $site->{indel_end} ); |
|
1057
|
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# this line is different to previous subroutines |
|
1059
|
5
|
|
|
|
|
326
|
my @uniq_indel_seqs = uniq( @indel_seqs, $indel_outgroup_seq ); |
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
# seqs with least '-' char wins |
|
1062
|
11
|
|
|
|
|
21
|
my ($indel_seq) = map { $_->[0] } |
|
1063
|
7
|
|
|
|
|
21
|
sort { $a->[1] <=> $b->[1] } |
|
1064
|
5
|
|
|
|
|
14
|
map { [ $_, tr/-/-/ ] } @uniq_indel_seqs; |
|
|
11
|
|
|
|
|
33
|
|
|
1065
|
|
|
|
|
|
|
|
|
1066
|
5
|
50
|
|
|
|
31
|
if ( scalar @uniq_indel_seqs < 2 ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1067
|
0
|
|
|
|
|
0
|
Carp::confess "no indel!\n"; |
|
1068
|
|
|
|
|
|
|
} |
|
1069
|
|
|
|
|
|
|
elsif ( scalar @uniq_indel_seqs > 2 ) { |
|
1070
|
1
|
|
|
|
|
3
|
$indel_type = 'C'; |
|
1071
|
|
|
|
|
|
|
} |
|
1072
|
|
|
|
|
|
|
elsif ( $indel_seq =~ /-/ ) { |
|
1073
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
|
1074
|
|
|
|
|
|
|
} |
|
1075
|
|
|
|
|
|
|
else { |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
4
|
50
|
66
|
|
|
54
|
if ( ( $indel_outgroup_seq !~ /\-/ ) |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
and ( $indel_seq ne $indel_outgroup_seq ) ) |
|
1079
|
|
|
|
|
|
|
{ |
|
1080
|
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# this section should already be judged in previes |
|
1082
|
|
|
|
|
|
|
# uniq_indel_seqs section, but I keep it here for safe |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# reference indel content does not contain '-' and is not equal |
|
1085
|
|
|
|
|
|
|
# to the one of alignment |
|
1086
|
|
|
|
|
|
|
# AAA |
|
1087
|
|
|
|
|
|
|
# A-A |
|
1088
|
|
|
|
|
|
|
# ref ACA |
|
1089
|
0
|
|
|
|
|
0
|
$indel_type = 'C'; |
|
1090
|
|
|
|
|
|
|
} |
|
1091
|
|
|
|
|
|
|
elsif ( $outgroup_indel_set->intersect($indel_set)->is_not_empty ) { |
|
1092
|
3
|
|
|
|
|
1101
|
my $island = $outgroup_indel_set->find_islands($indel_set); |
|
1093
|
3
|
100
|
|
|
|
6759
|
if ( $island->equal($indel_set) ) { |
|
1094
|
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
# NNNN |
|
1096
|
|
|
|
|
|
|
# N--N |
|
1097
|
|
|
|
|
|
|
# ref N--N |
|
1098
|
2
|
|
|
|
|
159
|
$indel_type = 'I'; |
|
1099
|
|
|
|
|
|
|
} |
|
1100
|
|
|
|
|
|
|
else { |
|
1101
|
|
|
|
|
|
|
# reference indel island is larger or smaller |
|
1102
|
|
|
|
|
|
|
# NNNN |
|
1103
|
|
|
|
|
|
|
# N-NN |
|
1104
|
|
|
|
|
|
|
# ref N--N |
|
1105
|
|
|
|
|
|
|
# or |
|
1106
|
|
|
|
|
|
|
# NNNN |
|
1107
|
|
|
|
|
|
|
# N--N |
|
1108
|
|
|
|
|
|
|
# ref N-NN |
|
1109
|
1
|
|
|
|
|
70
|
$indel_type = 'C'; |
|
1110
|
|
|
|
|
|
|
} |
|
1111
|
|
|
|
|
|
|
} |
|
1112
|
|
|
|
|
|
|
elsif ( $outgroup_indel_set->intersect($indel_set)->is_empty ) { |
|
1113
|
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
# NNNN |
|
1115
|
|
|
|
|
|
|
# N--N |
|
1116
|
|
|
|
|
|
|
# ref NNNN |
|
1117
|
1
|
|
|
|
|
774
|
$indel_type = 'D'; |
|
1118
|
|
|
|
|
|
|
} |
|
1119
|
|
|
|
|
|
|
else { |
|
1120
|
0
|
|
|
|
|
0
|
Carp::confess "Errors when polarizing indel!\n"; |
|
1121
|
|
|
|
|
|
|
} |
|
1122
|
|
|
|
|
|
|
} |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
5
|
100
|
|
|
|
19
|
if ( $indel_type eq 'C' ) { |
|
1125
|
2
|
|
|
|
|
5
|
$indel_occured = 'unknown'; |
|
1126
|
2
|
|
|
|
|
4
|
$indel_freq = -1; |
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
|
|
|
|
|
|
else { |
|
1129
|
3
|
|
|
|
|
9
|
for my $seq (@indel_seqs) { |
|
1130
|
8
|
100
|
|
|
|
19
|
if ( $seq eq $indel_outgroup_seq ) { |
|
1131
|
3
|
|
|
|
|
9
|
$indel_occured .= '0'; |
|
1132
|
|
|
|
|
|
|
} |
|
1133
|
|
|
|
|
|
|
else { |
|
1134
|
5
|
|
|
|
|
11
|
$indel_occured .= '1'; |
|
1135
|
5
|
|
|
|
|
11
|
$indel_freq++; |
|
1136
|
|
|
|
|
|
|
} |
|
1137
|
|
|
|
|
|
|
} |
|
1138
|
|
|
|
|
|
|
} |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
5
|
|
|
|
|
15
|
$site->{indel_outgroup_seq} = $indel_outgroup_seq; |
|
1141
|
5
|
|
|
|
|
11
|
$site->{indel_type} = $indel_type; |
|
1142
|
5
|
|
|
|
|
9
|
$site->{indel_occured} = $indel_occured; |
|
1143
|
5
|
|
|
|
|
18
|
$site->{indel_freq} = $indel_freq; |
|
1144
|
|
|
|
|
|
|
} |
|
1145
|
|
|
|
|
|
|
|
|
1146
|
3
|
|
|
|
|
13
|
return $sites; |
|
1147
|
|
|
|
|
|
|
} |
|
1148
|
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
sub chr_to_align { |
|
1150
|
8
|
|
|
8
|
1
|
75
|
my AlignDB::IntSpan $intspan = shift; |
|
1151
|
8
|
|
|
|
|
15
|
my $pos = shift; |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
8
|
|
50
|
|
|
21
|
my $chr_start = shift || 1; |
|
1154
|
8
|
|
50
|
|
|
20
|
my $chr_strand = shift || "+"; |
|
1155
|
|
|
|
|
|
|
|
|
1156
|
8
|
|
|
|
|
25
|
my $chr_end = $chr_start + $intspan->size - 1; |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
8
|
50
|
33
|
|
|
627
|
if ( $pos < $chr_start || $pos > $chr_end ) { |
|
1159
|
0
|
|
|
|
|
0
|
Carp::confess "[$pos] out of ranges [$chr_start,$chr_end]\n"; |
|
1160
|
|
|
|
|
|
|
} |
|
1161
|
|
|
|
|
|
|
|
|
1162
|
8
|
|
|
|
|
11
|
my $align_pos; |
|
1163
|
8
|
100
|
|
|
|
21
|
if ( $chr_strand eq "+" ) { |
|
1164
|
5
|
|
|
|
|
9
|
$align_pos = $pos - $chr_start + 1; |
|
1165
|
5
|
|
|
|
|
18
|
$align_pos = $intspan->at($align_pos); |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
|
|
|
|
|
|
else { |
|
1168
|
3
|
|
|
|
|
9
|
$align_pos = $pos - $chr_start + 1; |
|
1169
|
3
|
|
|
|
|
8
|
$align_pos = $intspan->at( -$align_pos ); |
|
1170
|
|
|
|
|
|
|
} |
|
1171
|
|
|
|
|
|
|
|
|
1172
|
8
|
|
|
|
|
816
|
return $align_pos; |
|
1173
|
|
|
|
|
|
|
} |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
sub align_to_chr { |
|
1176
|
734
|
|
|
734
|
1
|
7773
|
my AlignDB::IntSpan $intspan = shift; |
|
1177
|
734
|
|
|
|
|
974
|
my $pos = shift; |
|
1178
|
|
|
|
|
|
|
|
|
1179
|
734
|
|
50
|
|
|
1586
|
my $chr_start = shift || 1; |
|
1180
|
734
|
|
50
|
|
|
4719
|
my $chr_strand = shift || "+"; |
|
1181
|
|
|
|
|
|
|
|
|
1182
|
734
|
|
|
|
|
4958
|
my $chr_end = $chr_start + $intspan->size - 1; |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
734
|
50
|
|
|
|
101997
|
if ( $pos < 1 ) { |
|
1185
|
0
|
|
|
|
|
0
|
Carp::confess "align pos out of ranges\n"; |
|
1186
|
|
|
|
|
|
|
} |
|
1187
|
|
|
|
|
|
|
|
|
1188
|
734
|
|
|
|
|
960
|
my $chr_pos; |
|
1189
|
734
|
100
|
|
|
|
1561
|
if ( $intspan->contains($pos) ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1190
|
728
|
|
|
|
|
34618
|
$chr_pos = $intspan->index($pos); |
|
1191
|
|
|
|
|
|
|
} |
|
1192
|
|
|
|
|
|
|
elsif ( $pos < $intspan->min ) { |
|
1193
|
2
|
|
|
|
|
117
|
$chr_pos = 1; |
|
1194
|
|
|
|
|
|
|
} |
|
1195
|
|
|
|
|
|
|
elsif ( $pos > $intspan->max ) { |
|
1196
|
2
|
|
|
|
|
167
|
$chr_pos = $intspan->size; |
|
1197
|
|
|
|
|
|
|
} |
|
1198
|
|
|
|
|
|
|
else { |
|
1199
|
|
|
|
|
|
|
# pin to left base |
|
1200
|
2
|
|
|
|
|
145
|
my @spans = $intspan->spans; |
|
1201
|
2
|
|
|
|
|
54
|
for my $i ( 0 .. $#spans ) { |
|
1202
|
4
|
100
|
|
|
|
10
|
if ( $spans[$i]->[1] < $pos ) { |
|
1203
|
2
|
|
|
|
|
4
|
next; |
|
1204
|
|
|
|
|
|
|
} |
|
1205
|
|
|
|
|
|
|
else { |
|
1206
|
2
|
|
|
|
|
6
|
$pos = $spans[ $i - 1 ]->[1]; |
|
1207
|
2
|
|
|
|
|
4
|
last; |
|
1208
|
|
|
|
|
|
|
} |
|
1209
|
|
|
|
|
|
|
} |
|
1210
|
2
|
|
|
|
|
6
|
$chr_pos = $intspan->index($pos); |
|
1211
|
|
|
|
|
|
|
} |
|
1212
|
|
|
|
|
|
|
|
|
1213
|
734
|
100
|
|
|
|
81440
|
if ( $chr_strand eq "+" ) { |
|
1214
|
727
|
|
|
|
|
1081
|
$chr_pos = $chr_pos + $chr_start - 1; |
|
1215
|
|
|
|
|
|
|
} |
|
1216
|
|
|
|
|
|
|
else { |
|
1217
|
7
|
|
|
|
|
12
|
$chr_pos = $chr_end - $chr_pos + 1; |
|
1218
|
|
|
|
|
|
|
} |
|
1219
|
|
|
|
|
|
|
|
|
1220
|
734
|
|
|
|
|
1443
|
return $chr_pos; |
|
1221
|
|
|
|
|
|
|
} |
|
1222
|
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
sub calc_ld { |
|
1224
|
9
|
|
|
9
|
1
|
5275
|
my $strA = shift; |
|
1225
|
9
|
|
|
|
|
16
|
my $strB = shift; |
|
1226
|
|
|
|
|
|
|
|
|
1227
|
9
|
|
|
|
|
15
|
my $size = length $strA; |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
9
|
50
|
|
|
|
21
|
if ( $size != length $strB ) { |
|
1230
|
0
|
|
|
|
|
0
|
Carp::confess "Lengths not equal for [$strA] and [$strA]\n"; |
|
1231
|
0
|
|
|
|
|
0
|
return; |
|
1232
|
|
|
|
|
|
|
} |
|
1233
|
|
|
|
|
|
|
|
|
1234
|
9
|
|
|
|
|
18
|
for ( $strA, $strB ) { |
|
1235
|
18
|
50
|
|
|
|
54
|
if (/[^10]/) { |
|
1236
|
0
|
|
|
|
|
0
|
Carp::confess "[$_] contains illegal chars\n"; |
|
1237
|
0
|
|
|
|
|
0
|
return; |
|
1238
|
|
|
|
|
|
|
} |
|
1239
|
|
|
|
|
|
|
} |
|
1240
|
|
|
|
|
|
|
|
|
1241
|
9
|
|
|
|
|
17
|
my $A_count = $strA =~ tr/1/1/; |
|
1242
|
9
|
|
|
|
|
20
|
my $fA = $A_count / $size; |
|
1243
|
9
|
|
|
|
|
15
|
my $fa = 1 - $fA; |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
9
|
|
|
|
|
13
|
my $B_count = $strB =~ tr/1/1/; |
|
1246
|
9
|
|
|
|
|
15
|
my $fB = $B_count / $size; |
|
1247
|
9
|
|
|
|
|
12
|
my $fb = 1 - $fB; |
|
1248
|
|
|
|
|
|
|
|
|
1249
|
9
|
50
|
|
36
|
|
48
|
if ( any { $_ == 0 } ( $fA, $fa, $fB, $fb ) ) { |
|
|
36
|
|
|
|
|
82
|
|
|
1250
|
0
|
|
|
|
|
0
|
return ( undef, undef ); |
|
1251
|
|
|
|
|
|
|
} |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
# '1' in strA and '1' in strB as fAB |
|
1254
|
9
|
|
|
|
|
28
|
my ( $AB_count, $fAB ) = ( 0, 0 ); |
|
1255
|
9
|
|
|
|
|
18
|
for my $i ( 1 .. $size ) { |
|
1256
|
70
|
|
|
|
|
99
|
my $ichar = substr $strA, $i - 1, 1; |
|
1257
|
70
|
|
|
|
|
94
|
my $schar = substr $strB, $i - 1, 1; |
|
1258
|
70
|
100
|
100
|
|
|
181
|
if ( $ichar eq '1' and $schar eq '1' ) { |
|
1259
|
23
|
|
|
|
|
33
|
$AB_count++; |
|
1260
|
|
|
|
|
|
|
} |
|
1261
|
|
|
|
|
|
|
} |
|
1262
|
9
|
|
|
|
|
14
|
$fAB = $AB_count / $size; |
|
1263
|
|
|
|
|
|
|
|
|
1264
|
9
|
|
|
|
|
16
|
my $DAB = $fAB - $fA * $fB; |
|
1265
|
|
|
|
|
|
|
|
|
1266
|
9
|
|
|
|
|
13
|
my ( $r, $dprime ); |
|
1267
|
9
|
|
|
|
|
18
|
$r = $DAB / sqrt( $fA * $fa * $fB * $fb ); |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
9
|
100
|
|
|
|
17
|
if ( $DAB < 0 ) { |
|
1270
|
1
|
|
|
|
|
4
|
$dprime = $DAB / List::Util::min( $fA * $fB, $fa * $fb ); |
|
1271
|
|
|
|
|
|
|
} |
|
1272
|
|
|
|
|
|
|
else { |
|
1273
|
8
|
|
|
|
|
24
|
$dprime = $DAB / List::Util::min( $fA * $fb, $fa * $fB ); |
|
1274
|
|
|
|
|
|
|
} |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
9
|
|
|
|
|
28
|
return ( $r, $dprime ); |
|
1277
|
|
|
|
|
|
|
} |
|
1278
|
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
sub poa_consensus { |
|
1280
|
0
|
|
|
0
|
0
|
|
my $seq_refs = shift; |
|
1281
|
|
|
|
|
|
|
|
|
1282
|
0
|
|
|
|
|
|
my $aln_prog = "poa"; |
|
1283
|
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
# temp in and out |
|
1285
|
0
|
|
|
|
|
|
my $temp_in = Path::Tiny->tempfile("seq_in_XXXXXXXX"); |
|
1286
|
0
|
|
|
|
|
|
my $temp_out = Path::Tiny->tempfile("seq_out_XXXXXXXX"); |
|
1287
|
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
# msa may change the order of sequences |
|
1289
|
0
|
|
|
|
|
|
my @indexes = 0 .. scalar( @{$seq_refs} - 1 ); |
|
|
0
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
{ |
|
1291
|
0
|
|
|
|
|
|
my $fh = $temp_in->openw(); |
|
|
0
|
|
|
|
|
|
|
|
1292
|
0
|
|
|
|
|
|
for my $i (@indexes) { |
|
1293
|
0
|
|
|
|
|
|
printf {$fh} ">seq_%d\n", $i; |
|
|
0
|
|
|
|
|
|
|
|
1294
|
0
|
|
|
|
|
|
printf {$fh} "%s\n", $seq_refs->[$i]; |
|
|
0
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
} |
|
1296
|
0
|
|
|
|
|
|
close $fh; |
|
1297
|
|
|
|
|
|
|
} |
|
1298
|
|
|
|
|
|
|
|
|
1299
|
0
|
|
|
|
|
|
my @args; |
|
1300
|
|
|
|
|
|
|
{ |
|
1301
|
0
|
|
|
|
|
|
push @args, "-hb"; |
|
|
0
|
|
|
|
|
|
|
|
1302
|
0
|
|
|
|
|
|
push @args, "-read_fasta " . $temp_in->absolute->stringify; |
|
1303
|
0
|
|
|
|
|
|
push @args, "-clustal " . $temp_out->absolute->stringify; |
|
1304
|
0
|
|
|
|
|
|
push @args, File::ShareDir::dist_file( 'App-Fasops', 'poa-blosum80.mat' ); |
|
1305
|
|
|
|
|
|
|
} |
|
1306
|
|
|
|
|
|
|
|
|
1307
|
0
|
|
|
|
|
|
my $cmd_line = join " ", ( $aln_prog, @args ); |
|
1308
|
0
|
|
|
|
|
|
my $ok = IPC::Cmd::run( command => $cmd_line ); |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
0
|
0
|
|
|
|
|
if ( !$ok ) { |
|
1311
|
0
|
|
|
|
|
|
Carp::confess("Calling [$aln_prog] failed\n"); |
|
1312
|
|
|
|
|
|
|
} |
|
1313
|
|
|
|
|
|
|
|
|
1314
|
0
|
|
|
|
|
|
my $consensus = join "", grep {/^CONSENS0/} $temp_out->lines( { chomp => 1, } ); |
|
|
0
|
|
|
|
|
|
|
|
1315
|
0
|
|
|
|
|
|
$consensus =~ s/CONSENS0//g; |
|
1316
|
0
|
|
|
|
|
|
$consensus =~ s/\s//g; |
|
1317
|
0
|
|
|
|
|
|
$consensus =~ s/-//g; |
|
1318
|
|
|
|
|
|
|
|
|
1319
|
0
|
|
|
|
|
|
return $consensus; |
|
1320
|
|
|
|
|
|
|
} |
|
1321
|
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
1; |
|
1323
|
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
__END__ |