| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Pheno::Ranker::Align; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
22
|
use strict; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
103
|
|
|
4
|
3
|
|
|
3
|
|
20
|
use warnings; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
73
|
|
|
5
|
3
|
|
|
3
|
|
15
|
use autodie; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
33
|
|
|
6
|
3
|
|
|
3
|
|
17045
|
use feature qw(say); |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
258
|
|
|
7
|
3
|
|
|
3
|
|
18
|
use List::Util qw(any shuffle); |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
198
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#use List::MoreUtils qw(duplicates); |
|
10
|
3
|
|
|
3
|
|
18
|
use Data::Dumper; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
145
|
|
|
11
|
3
|
|
|
3
|
|
2494
|
use Sort::Naturally qw(nsort); |
|
|
3
|
|
|
|
|
13760
|
|
|
|
3
|
|
|
|
|
209
|
|
|
12
|
3
|
|
|
3
|
|
2187
|
use Hash::Fold fold => { array_delimiter => ':' }; |
|
|
3
|
|
|
|
|
114535
|
|
|
|
3
|
|
|
|
|
19
|
|
|
13
|
3
|
|
|
3
|
|
2847
|
use Pheno::Ranker::Stats; |
|
|
3
|
|
|
|
|
9
|
|
|
|
3
|
|
|
|
|
255
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
3
|
|
|
3
|
|
22
|
use Exporter 'import'; |
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
145
|
|
|
16
|
|
|
|
|
|
|
our @EXPORT = |
|
17
|
|
|
|
|
|
|
qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json); |
|
18
|
|
|
|
|
|
|
|
|
19
|
3
|
|
|
3
|
|
16
|
use constant DEVEL_MODE => 0; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
13595
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our %nomenclature = (); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub check_format { |
|
24
|
|
|
|
|
|
|
|
|
25
|
3
|
|
|
3
|
0
|
17
|
my $data = shift; |
|
26
|
3
|
50
|
|
|
|
36
|
return exists $data->[0]{subject} ? 'PXF' : 'BFF'; |
|
27
|
|
|
|
|
|
|
} |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub cohort_comparison { |
|
30
|
|
|
|
|
|
|
|
|
31
|
5
|
|
|
5
|
0
|
15
|
my ( $ref_binary_hash, $self ) = @_; |
|
32
|
5
|
|
|
|
|
15
|
my $out_file = $self->{out_file}; |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
say "Performing COHORT comparison" |
|
35
|
5
|
50
|
33
|
|
|
44
|
if ( $self->{debug} || $self->{verbose} ); |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
38
|
|
|
|
|
|
|
# The ids/cohorts are naturally sorted (it won't match --append-prefixes!!!) |
|
39
|
5
|
|
|
|
|
12
|
my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} ); |
|
|
5
|
|
|
|
|
200
|
|
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Print to $out_file |
|
42
|
|
|
|
|
|
|
# |
|
43
|
5
|
|
|
|
|
23219
|
open( my $fh, ">", $out_file ); |
|
44
|
5
|
|
|
|
|
7032
|
say $fh "\t", join "\t", @sorted_keys_ref_binary_hash; |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# NB: It's a symmetric matrix so we could just compute |
|
47
|
|
|
|
|
|
|
# triangle. However, R needs the whole matrix |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Initialize the 2D matrix |
|
50
|
5
|
|
|
|
|
20
|
my @matrix; |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# I elements |
|
53
|
5
|
|
|
|
|
35
|
for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) { |
|
54
|
|
|
|
|
|
|
say "Calculating <" |
|
55
|
|
|
|
|
|
|
. $sorted_keys_ref_binary_hash[$i] |
|
56
|
|
|
|
|
|
|
. "> against the cohort..." |
|
57
|
163
|
50
|
|
|
|
295
|
if $self->{verbose}; |
|
58
|
|
|
|
|
|
|
my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] } |
|
59
|
163
|
|
|
|
|
223
|
{binary_digit_string_weighted}; |
|
60
|
163
|
|
|
|
|
228
|
print $fh $sorted_keys_ref_binary_hash[$i] . "\t"; |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# J elements |
|
63
|
163
|
|
|
|
|
271
|
for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) { |
|
64
|
|
|
|
|
|
|
my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] } |
|
65
|
5875
|
|
|
|
|
7104
|
{binary_digit_string_weighted}; |
|
66
|
5875
|
|
|
|
|
5072
|
my $distance; |
|
67
|
|
|
|
|
|
|
|
|
68
|
5875
|
100
|
|
|
|
7481
|
if ( $j < $i ) { |
|
|
|
100
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Use the distance from the lower triangle |
|
71
|
2856
|
|
|
|
|
2881
|
$distance = $matrix[$j][$i]; |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
elsif ( $j == $i ) { |
|
74
|
163
|
|
|
|
|
174
|
$distance = 0; # Diagonal element |
|
75
|
|
|
|
|
|
|
} |
|
76
|
|
|
|
|
|
|
else { |
|
77
|
|
|
|
|
|
|
# Compute the distance and store it in the matrix |
|
78
|
2856
|
|
|
|
|
3845
|
$distance = hd_fast( $str1, $str2 ); |
|
79
|
2856
|
|
|
|
|
4045
|
$matrix[$i][$j] = $distance; |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Fill out the other triangle |
|
83
|
5875
|
|
|
|
|
6678
|
$matrix[$j][$i] = $distance; |
|
84
|
|
|
|
|
|
|
|
|
85
|
5875
|
|
|
|
|
8345
|
print $fh $distance . "\t"; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
163
|
|
|
|
|
226
|
print $fh "\n"; |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
|
|
91
|
5
|
|
|
|
|
35
|
close $fh; |
|
92
|
5
|
50
|
33
|
|
|
3099
|
say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} ); |
|
93
|
5
|
|
|
|
|
156
|
return 1; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub compare_and_rank { |
|
97
|
|
|
|
|
|
|
|
|
98
|
1
|
|
|
1
|
0
|
4
|
my $arg = shift; |
|
99
|
1
|
|
|
|
|
3
|
my $glob_hash = $arg->{glob_hash}; |
|
100
|
1
|
|
|
|
|
3
|
my $ref_binary_hash = $arg->{ref_binary_hash}; |
|
101
|
1
|
|
|
|
|
2
|
my $tar_binary_hash = $arg->{tar_binary_hash}; |
|
102
|
1
|
|
|
|
|
2
|
my $weight = $arg->{weight}; |
|
103
|
1
|
|
|
|
|
4
|
my $self = $arg->{self}; |
|
104
|
1
|
|
|
|
|
3
|
my $sort_by = $self->{sort_by}; |
|
105
|
1
|
|
|
|
|
3
|
my $align = $self->{align}; |
|
106
|
1
|
|
|
|
|
2
|
my $max_out = $self->{max_out}; |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
say "Performing COHORT(REF)-PATIENT(TAR) comparison" |
|
109
|
1
|
50
|
33
|
|
|
78
|
if ( $self->{debug} || $self->{verbose} ); |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Hash for compiling distances |
|
112
|
1
|
|
|
|
|
3
|
my $score; |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Hash for stats |
|
115
|
|
|
|
|
|
|
my $stat; |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Load TAR binary string |
|
118
|
1
|
|
|
|
|
3
|
my ($tar) = keys %{$tar_binary_hash}; |
|
|
1
|
|
|
|
|
3
|
|
|
119
|
|
|
|
|
|
|
my $tar_str_weighted = |
|
120
|
1
|
|
|
|
|
4
|
$tar_binary_hash->{$tar}{binary_digit_string_weighted}; |
|
121
|
|
|
|
|
|
|
|
|
122
|
1
|
|
|
|
|
2
|
for my $key ( keys %{$ref_binary_hash} ) { # No need to sort |
|
|
1
|
|
|
|
|
8
|
|
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# Load REF binary string |
|
125
|
|
|
|
|
|
|
my $ref_str_weighted = |
|
126
|
36
|
|
|
|
|
65
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}; |
|
127
|
36
|
50
|
|
|
|
57
|
say "Comparing <id:$key> --- <id:$tar>" if $self->{verbose}; |
|
128
|
|
|
|
|
|
|
say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n" |
|
129
|
36
|
50
|
33
|
|
|
70
|
if ( defined $self->{debug} && $self->{debug} > 1 ); |
|
130
|
|
|
|
|
|
|
$score->{$key}{hamming} = |
|
131
|
36
|
|
|
|
|
76
|
hd_fast( $ref_str_weighted, $tar_str_weighted ); |
|
132
|
|
|
|
|
|
|
$score->{$key}{jaccard} = |
|
133
|
36
|
|
|
|
|
63
|
jaccard_similarity( $ref_str_weighted, $tar_str_weighted ); |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Add values |
|
136
|
36
|
|
|
|
|
47
|
push @{ $stat->{hamming_data} }, $score->{$key}{hamming}; |
|
|
36
|
|
|
|
|
80
|
|
|
137
|
36
|
|
|
|
|
37
|
push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard}; |
|
|
36
|
|
|
|
|
63
|
|
|
138
|
|
|
|
|
|
|
} |
|
139
|
1
|
|
|
|
|
8
|
$stat->{hamming_stats} = add_stats( $stat->{hamming_data} ); |
|
140
|
1
|
|
|
|
|
6
|
$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} ); |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# Initialize a few variables |
|
143
|
1
|
|
|
|
|
6
|
my @headers = ( |
|
144
|
|
|
|
|
|
|
'RANK', 'REFERENCE(ID)', |
|
145
|
|
|
|
|
|
|
'TARGET(ID)', 'FORMAT', |
|
146
|
|
|
|
|
|
|
'LENGTH', 'WEIGHTED', |
|
147
|
|
|
|
|
|
|
'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE', |
|
148
|
|
|
|
|
|
|
'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)', |
|
149
|
|
|
|
|
|
|
'JACCARD-INDEX', 'JACCARD-Z-SCORE', |
|
150
|
|
|
|
|
|
|
'JACCARD-P-VALUE' |
|
151
|
|
|
|
|
|
|
); |
|
152
|
1
|
|
|
|
|
5
|
my $header = join "\t", @headers; |
|
153
|
1
|
|
|
|
|
2
|
my @results = $header; |
|
154
|
1
|
|
|
|
|
2
|
my %info; |
|
155
|
1
|
|
|
|
|
2
|
my $length_align = length($tar_str_weighted); |
|
156
|
1
|
50
|
|
|
|
4
|
my $weight_bool = $weight ? 'True' : 'False'; |
|
157
|
1
|
|
|
|
|
2
|
my @alignments_ascii; |
|
158
|
|
|
|
|
|
|
my $alignment_str_csv; |
|
159
|
1
|
|
|
|
|
2
|
my @alignments_csv = join ';', |
|
160
|
|
|
|
|
|
|
qw/id ref indicator tar weight hamming-distance json-path label/; |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# The dataframe will have two header lines |
|
163
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
164
|
|
|
|
|
|
|
# nsort does not yield same results as canonical from JSON::XS |
|
165
|
|
|
|
|
|
|
# NB: we're sorting here and in create_binary_digit_string() |
|
166
|
1
|
|
|
|
|
2
|
my @sort_keys_glob_hash = sort keys %{$glob_hash}; |
|
|
1
|
|
|
|
|
90
|
|
|
167
|
1
|
100
|
|
|
|
8
|
my @labels = map { exists $nomenclature{$_} ? $nomenclature{$_} : $_ } |
|
|
159
|
|
|
|
|
233
|
|
|
168
|
|
|
|
|
|
|
@sort_keys_glob_hash; |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Die if #elements in arrays differ |
|
171
|
1
|
50
|
|
|
|
5
|
die "Mismatch between variables and labels" |
|
172
|
|
|
|
|
|
|
if @sort_keys_glob_hash != @labels; |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Labels |
|
175
|
|
|
|
|
|
|
# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE' |
|
176
|
1
|
|
|
|
|
22
|
my @dataframe = join ';', 'Id', @labels; |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Variables (json path) |
|
179
|
1
|
|
|
|
|
22
|
push @dataframe, join ';', 'Id', @sort_keys_glob_hash; |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# 0/1 |
|
182
|
|
|
|
|
|
|
push @dataframe, join ';', qq/T|$tar/, |
|
183
|
1
|
|
|
|
|
38
|
( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Sort %score by value and load results |
|
186
|
1
|
|
|
|
|
9
|
my $count = 1; |
|
187
|
1
|
|
|
|
|
2
|
$max_out++; # to be able to start w/ ONE |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Start loop |
|
190
|
1
|
|
|
|
|
11
|
for my $key ( |
|
191
|
|
|
|
|
|
|
sort { |
|
192
|
|
|
|
|
|
|
$sort_by eq 'jaccard' # |
|
193
|
|
|
|
|
|
|
? $score->{$b}{$sort_by} |
|
194
|
|
|
|
|
|
|
<=> $score->{$a}{$sort_by} # 1 to 0 (similarity) |
|
195
|
|
|
|
|
|
|
: $score->{$a}{$sort_by} |
|
196
|
141
|
50
|
|
|
|
209
|
<=> $score->{$b}{$sort_by} # 0 to N (distance) |
|
197
|
|
|
|
|
|
|
} keys %$score |
|
198
|
|
|
|
|
|
|
) |
|
199
|
|
|
|
|
|
|
{ |
|
200
|
|
|
|
|
|
|
|
|
201
|
36
|
50
|
|
|
|
87
|
say "$count: Creating alignment <id:$key>" if $self->{verbose}; |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Create ASCII alignment |
|
204
|
|
|
|
|
|
|
# NB: We need it here to get $n_00 |
|
205
|
|
|
|
|
|
|
my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) = |
|
206
|
|
|
|
|
|
|
create_alignment( |
|
207
|
|
|
|
|
|
|
{ |
|
208
|
|
|
|
|
|
|
ref_key => $key, |
|
209
|
|
|
|
|
|
|
ref_str_weighted => |
|
210
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
|
211
|
36
|
|
|
|
|
230
|
tar_str_weighted => $tar_str_weighted, |
|
212
|
|
|
|
|
|
|
glob_hash => $glob_hash, |
|
213
|
|
|
|
|
|
|
sorted_keys_glob_hash => \@sort_keys_glob_hash, |
|
214
|
|
|
|
|
|
|
labels => \@labels |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
); |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
219
|
|
|
|
|
|
|
# The LENGTH of the alignment is based on the #variables in the REF-COHORT |
|
220
|
|
|
|
|
|
|
# Compute estimated av and dev for binary_string of L = length_align - n_00 |
|
221
|
|
|
|
|
|
|
# Corrected length_align L = length_align - n_00 |
|
222
|
36
|
|
|
|
|
102
|
my $length_align_corrected = $length_align - $n_00; |
|
223
|
36
|
|
|
|
|
116
|
( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) = |
|
224
|
|
|
|
|
|
|
estimate_hamming_stats($length_align_corrected); |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Compute a few stats |
|
227
|
|
|
|
|
|
|
my $hamming_z_score = z_score( |
|
228
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
|
229
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean}, |
|
230
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd} |
|
231
|
36
|
|
|
|
|
108
|
); |
|
232
|
|
|
|
|
|
|
my $hamming_z_score_from_random = z_score( |
|
233
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
|
234
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean_rnd}, |
|
235
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd_rnd} |
|
236
|
36
|
|
|
|
|
79
|
); |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
#my $hamming_p_value = |
|
239
|
|
|
|
|
|
|
# p_value( $score->{$key}{hamming}, $length_align_corrected ); |
|
240
|
36
|
|
|
|
|
60
|
my $hamming_p_value_from_z_score = |
|
241
|
|
|
|
|
|
|
p_value_from_z_score($hamming_z_score); |
|
242
|
|
|
|
|
|
|
my $jaccard_z_score = z_score( |
|
243
|
|
|
|
|
|
|
$score->{$key}{jaccard}, |
|
244
|
|
|
|
|
|
|
$stat->{jaccard_stats}{mean}, |
|
245
|
|
|
|
|
|
|
$stat->{jaccard_stats}{sd} |
|
246
|
36
|
|
|
|
|
88
|
); |
|
247
|
36
|
|
|
|
|
63
|
my $jaccard_p_value_from_z_score = |
|
248
|
|
|
|
|
|
|
p_value_from_z_score( 1 - $jaccard_z_score ); |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Create a hash with formats |
|
251
|
|
|
|
|
|
|
my $format = { |
|
252
|
|
|
|
|
|
|
'RANK' => { value => $count, format => undef }, |
|
253
|
|
|
|
|
|
|
'REFERENCE(ID)' => { value => $key, format => undef }, |
|
254
|
|
|
|
|
|
|
'TARGET(ID)' => { value => $tar, format => undef }, |
|
255
|
|
|
|
|
|
|
'FORMAT' => { value => $self->{format}, format => undef }, |
|
256
|
|
|
|
|
|
|
'WEIGHTED' => { value => $weight_bool, format => undef }, |
|
257
|
|
|
|
|
|
|
'LENGTH' => { value => $length_align_corrected, format => '%6d' }, |
|
258
|
|
|
|
|
|
|
'HAMMING-DISTANCE' => |
|
259
|
|
|
|
|
|
|
{ value => $score->{$key}{hamming}, format => '%4d' }, |
|
260
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE' => |
|
261
|
|
|
|
|
|
|
{ value => $hamming_z_score, format => '%7.3f' }, |
|
262
|
|
|
|
|
|
|
'DISTANCE-P-VALUE' => |
|
263
|
|
|
|
|
|
|
{ value => $hamming_p_value_from_z_score, format => '%12.7f' }, |
|
264
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE(RAND)' => |
|
265
|
|
|
|
|
|
|
{ value => $hamming_z_score_from_random, format => '%8.4f' }, |
|
266
|
|
|
|
|
|
|
'JACCARD-INDEX' => |
|
267
|
36
|
|
|
|
|
688
|
{ value => $score->{$key}{jaccard}, format => '%7.3f' }, |
|
268
|
|
|
|
|
|
|
'JACCARD-Z-SCORE' => |
|
269
|
|
|
|
|
|
|
{ value => $jaccard_z_score, format => '%7.3f' }, |
|
270
|
|
|
|
|
|
|
'JACCARD-P-VALUE' => |
|
271
|
|
|
|
|
|
|
{ value => $jaccard_p_value_from_z_score, format => '%12.7f' }, |
|
272
|
|
|
|
|
|
|
}; |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Serialize results |
|
275
|
|
|
|
|
|
|
my $tmp_str = join "\t", map { |
|
276
|
36
|
|
|
|
|
94
|
defined $format->{$_}{format} |
|
277
|
|
|
|
|
|
|
? sprintf( $format->{$_}{format}, $format->{$_}{value} ) |
|
278
|
|
|
|
|
|
|
: $format->{$_}{value} |
|
279
|
468
|
100
|
|
|
|
1607
|
} @headers; |
|
280
|
36
|
|
|
|
|
106
|
push @results, $tmp_str; |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# To save memory only load if --align |
|
283
|
36
|
50
|
|
|
|
59
|
if ( defined $align ) { |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Add all of the above to @alignments_ascii |
|
286
|
36
|
|
|
|
|
66
|
my $sep = ('-') x 80; |
|
287
|
36
|
|
|
|
|
1153
|
push @alignments_ascii, |
|
288
|
|
|
|
|
|
|
qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/; |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# Add all of the above to @alignments_csv |
|
291
|
36
|
|
|
|
|
1364
|
push @alignments_csv, @$alignment_arr_csv; |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Add data to dataframe |
|
294
|
|
|
|
|
|
|
push @dataframe, join ';', qq/R|$key/, |
|
295
|
36
|
|
|
|
|
1527
|
( split //, $ref_binary_hash->{$key}{binary_digit_string} ); |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Add values to info |
|
298
|
|
|
|
|
|
|
$info{$key} = { |
|
299
|
|
|
|
|
|
|
weighted => $weight_bool eq 'True' |
|
300
|
|
|
|
|
|
|
? JSON::XS::true |
|
301
|
|
|
|
|
|
|
: JSON::XS::false, |
|
302
|
|
|
|
|
|
|
reference_id => $key, |
|
303
|
|
|
|
|
|
|
target_id => $tar, |
|
304
|
|
|
|
|
|
|
reference_binary_string => |
|
305
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string}, |
|
306
|
|
|
|
|
|
|
target_binary_string => |
|
307
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string}, |
|
308
|
|
|
|
|
|
|
reference_binary_string_weighted => |
|
309
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
|
310
|
|
|
|
|
|
|
target_binary_string_weighted => |
|
311
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string_weighted}, |
|
312
|
|
|
|
|
|
|
alignment_length => $length_align_corrected, |
|
313
|
|
|
|
|
|
|
hamming_distance => $score->{$key}{hamming}, |
|
314
|
|
|
|
|
|
|
hamming_z_score => $hamming_z_score, |
|
315
|
|
|
|
|
|
|
hamming_p_value => $hamming_p_value_from_z_score, |
|
316
|
|
|
|
|
|
|
jaccard_similarity => $score->{$key}{jaccard}, |
|
317
|
|
|
|
|
|
|
jaccard_z_score => $jaccard_z_score, |
|
318
|
|
|
|
|
|
|
jaccard_p_value => $jaccard_p_value_from_z_score, |
|
319
|
|
|
|
|
|
|
jaccard_distance => 1 - $score->{$key}{jaccard}, |
|
320
|
|
|
|
|
|
|
format => $self->{format}, |
|
321
|
36
|
50
|
|
|
|
430
|
alignment => $$alignment_str_ascii, |
|
322
|
|
|
|
|
|
|
}; |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
36
|
|
|
|
|
1524
|
$count++; |
|
327
|
36
|
100
|
|
|
|
474
|
last if $count == $max_out; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
1
|
|
|
|
|
45
|
return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv; |
|
331
|
|
|
|
|
|
|
} |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub create_alignment { |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# NB: The alignment will use the weighted string |
|
336
|
36
|
|
|
36
|
0
|
54
|
my $arg = shift; |
|
337
|
36
|
|
|
|
|
76
|
my $ref_key = $arg->{ref_key}; |
|
338
|
36
|
|
|
|
|
47
|
my $binary_string1 = $arg->{ref_str_weighted}; |
|
339
|
36
|
|
|
|
|
42
|
my $binary_string2 = $arg->{tar_str_weighted}; |
|
340
|
36
|
|
|
|
|
43
|
my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash}; |
|
341
|
36
|
|
|
|
|
64
|
my $labels = $arg->{labels}; |
|
342
|
36
|
|
|
|
|
45
|
my $glob_hash = $arg->{glob_hash}; |
|
343
|
36
|
|
|
|
|
44
|
my $length1 = length($binary_string1); |
|
344
|
36
|
|
|
|
|
43
|
my $length2 = length($binary_string2); |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# Check that l1 = l2 |
|
347
|
36
|
50
|
|
|
|
64
|
die "The binary strings must have the same length" |
|
348
|
|
|
|
|
|
|
if ( $length1 != $length2 ); |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# Expand array to have weights as N-elements |
|
351
|
36
|
|
|
|
|
62
|
my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash ); |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# Initialize some variables |
|
354
|
36
|
|
|
|
|
52
|
my $out_ascii = "REF -- TAR\n"; |
|
355
|
36
|
|
|
|
|
36
|
my @out_csv; |
|
356
|
36
|
|
|
|
|
35
|
my $cum_distance = 0; |
|
357
|
36
|
|
|
|
|
40
|
my $n_00 = 0; |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# For loop with 2 variables |
|
360
|
|
|
|
|
|
|
# i index for weighted |
|
361
|
|
|
|
|
|
|
# j the number of variables |
|
362
|
36
|
|
|
|
|
60
|
my ( $i, $j ) = ( 0, 0 ); |
|
363
|
36
|
|
|
|
|
66
|
for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) { |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# Load key and value |
|
366
|
5724
|
|
|
|
|
7499
|
my $key = $recreated_array->[$i]; |
|
367
|
5724
|
|
|
|
|
9385
|
my $val = sprintf( "%3d", $glob_hash->{$key} ); |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# We have to keep track with $j |
|
370
|
5724
|
|
|
|
|
5877
|
my $sorted_key = $sorted_keys_glob_hash->[$j]; |
|
371
|
5724
|
|
|
|
|
5769
|
my $label = $labels->[$j]; |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# Load chars |
|
374
|
5724
|
|
|
|
|
6874
|
my $char1 = substr( $binary_string1, $i, 1 ); |
|
375
|
5724
|
|
|
|
|
6188
|
my $char2 = substr( $binary_string2, $i, 1 ); |
|
376
|
5724
|
100
|
100
|
|
|
12080
|
$n_00++ if ( $char1 == 0 && $char2 == 0 ); |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# Correct $i index by adding weights |
|
379
|
5724
|
|
|
|
|
6384
|
$i = $i + $glob_hash->{$key} - 1; |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Load metrics |
|
382
|
5724
|
100
|
|
|
|
7810
|
$cum_distance += $glob_hash->{$key} if $char1 ne $char2; |
|
383
|
5724
|
|
|
|
|
8061
|
my $cum_distance_pretty = sprintf( "%3d", $cum_distance ); |
|
384
|
5724
|
100
|
|
|
|
7550
|
my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key}; |
|
385
|
5724
|
|
|
|
|
6879
|
my $distance_pretty = sprintf( "%3d", $distance ); |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# w = weight, d = distance, cd = cumul distance |
|
388
|
5724
|
|
|
|
|
11041
|
my %format = ( |
|
389
|
|
|
|
|
|
|
'11' => '-----', |
|
390
|
|
|
|
|
|
|
'10' => 'xxx--', |
|
391
|
|
|
|
|
|
|
'01' => '--xxx', |
|
392
|
|
|
|
|
|
|
'00' => ' ' |
|
393
|
|
|
|
|
|
|
); |
|
394
|
5724
|
|
|
|
|
15105
|
$out_ascii .= |
|
395
|
|
|
|
|
|
|
qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/; |
|
396
|
5724
|
|
|
|
|
25838
|
push @out_csv, |
|
397
|
|
|
|
|
|
|
qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/; |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path |
|
400
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y |
|
401
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y |
|
402
|
|
|
|
|
|
|
#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y |
|
403
|
|
|
|
|
|
|
} |
|
404
|
36
|
|
|
|
|
362
|
return $n_00, \$out_ascii, \@out_csv; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub recreate_array { |
|
408
|
|
|
|
|
|
|
|
|
409
|
36
|
|
|
36
|
0
|
68
|
my ( $glob_hash, $sorted_keys_glob_hash ) = @_; |
|
410
|
36
|
|
|
|
|
45
|
my @recreated_array; |
|
411
|
36
|
|
|
|
|
62
|
foreach my $key (@$sorted_keys_glob_hash) { |
|
412
|
5724
|
|
|
|
|
8410
|
for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) { |
|
413
|
6012
|
|
|
|
|
10273
|
push @recreated_array, $key; |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
} |
|
416
|
36
|
|
|
|
|
70
|
return \@recreated_array; |
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub create_glob_and_ref_hashes { |
|
420
|
|
|
|
|
|
|
|
|
421
|
6
|
|
|
6
|
0
|
23
|
my ( $array, $weight, $self ) = @_; |
|
422
|
6
|
|
|
|
|
16
|
my $primary_key = $self->{primary_key}; |
|
423
|
6
|
|
|
|
|
13
|
my $glob_hash = {}; |
|
424
|
6
|
|
|
|
|
20
|
my $ref_hash_flattened; |
|
425
|
|
|
|
|
|
|
|
|
426
|
6
|
|
|
|
|
12
|
for my $i ( @{$array} ) { |
|
|
6
|
|
|
|
|
16
|
|
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# For consistency, we obtain the primary_key for both BFF/PXF |
|
429
|
|
|
|
|
|
|
# from $_->{id} (not from subject.id) |
|
430
|
199
|
|
|
|
|
520
|
my $id = $i->{$primary_key}; |
|
431
|
199
|
50
|
|
|
|
462
|
say "Flattening and remapping <id:$id> ..." if $self->{verbose}; |
|
432
|
199
|
|
|
|
|
661
|
my $ref_hash = remap_hash( |
|
433
|
|
|
|
|
|
|
{ |
|
434
|
|
|
|
|
|
|
hash => $i, |
|
435
|
|
|
|
|
|
|
weight => $weight, |
|
436
|
|
|
|
|
|
|
self => $self |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
); |
|
439
|
199
|
|
|
|
|
749
|
$ref_hash_flattened->{$id} = $ref_hash; |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# The idea is to create a $glob_hash with unique key-values |
|
442
|
|
|
|
|
|
|
# Duplicates will be automatically merged |
|
443
|
199
|
|
|
|
|
8021
|
$glob_hash = { %$glob_hash, %$ref_hash }; |
|
444
|
|
|
|
|
|
|
} |
|
445
|
6
|
|
|
|
|
39
|
return ( $glob_hash, $ref_hash_flattened ); |
|
446
|
|
|
|
|
|
|
} |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub randomize_variables { |
|
449
|
|
|
|
|
|
|
|
|
450
|
0
|
|
|
0
|
0
|
0
|
my ( $glob_hash, $self ) = @_; |
|
451
|
0
|
|
|
|
|
0
|
my $max = $self->{max_number_var}; |
|
452
|
0
|
|
|
|
|
0
|
my $seed = $self->{seed}; |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# set random seed |
|
455
|
0
|
|
|
|
|
0
|
srand($seed); |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Randomize |
|
458
|
|
|
|
|
|
|
# NB:keys have to be sorted for srand to work!!!! |
|
459
|
|
|
|
|
|
|
# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5' |
|
460
|
0
|
|
|
|
|
0
|
my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ]; |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Create a new hash with a hash slice |
|
463
|
0
|
|
|
|
|
0
|
my %new_glob_hash; |
|
464
|
0
|
|
|
|
|
0
|
@new_glob_hash{@items} = @{$glob_hash}{@items}; |
|
|
0
|
|
|
|
|
0
|
|
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# return reference |
|
467
|
0
|
|
|
|
|
0
|
return \%new_glob_hash; |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub prune_excluded_included { |
|
471
|
|
|
|
|
|
|
|
|
472
|
200
|
|
|
200
|
0
|
346
|
my ( $hash, $self ) = @_; |
|
473
|
200
|
|
|
|
|
231
|
my @included = @{ $self->{include_terms} }; |
|
|
200
|
|
|
|
|
384
|
|
|
474
|
200
|
|
|
|
|
247
|
my @excluded = @{ $self->{exclude_terms} }; |
|
|
200
|
|
|
|
|
287
|
|
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
# Die if we have both options at the same time |
|
477
|
200
|
50
|
66
|
|
|
448
|
die "Sorry, <--include> and <--exclude> are mutually exclusive\n" |
|
478
|
|
|
|
|
|
|
if ( @included && @excluded ); |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
481
|
|
|
|
|
|
|
# Original $hash is modified |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# INCLUDED |
|
484
|
200
|
100
|
|
|
|
334
|
if (@included) { |
|
485
|
25
|
|
|
|
|
52
|
for my $key ( keys %$hash ) { |
|
486
|
125
|
100
|
|
225
|
|
322
|
delete $hash->{$key} unless any { $_ eq $key } @included; |
|
|
225
|
|
|
|
|
437
|
|
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# EXCLUDED |
|
491
|
200
|
50
|
|
|
|
339
|
if (@excluded) { |
|
492
|
0
|
|
|
|
|
0
|
for my $key (@excluded) { |
|
493
|
0
|
0
|
|
|
|
0
|
delete $hash->{$key} if exists $hash->{$key}; |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# We will do nothing if @included = @excluded = [] (DEFAULT) |
|
498
|
200
|
|
|
|
|
292
|
return 1; |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
sub undef_excluded_phenotypicFeatures { |
|
502
|
|
|
|
|
|
|
|
|
503
|
200
|
|
|
200
|
0
|
235
|
my $hash = shift; |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Setting the property to undef (it will be discarded later) |
|
506
|
200
|
100
|
|
|
|
467
|
if ( exists $hash->{phenotypicFeatures} ) { |
|
507
|
71
|
|
|
|
|
226
|
@{ $hash->{phenotypicFeatures} } = |
|
508
|
71
|
100
|
|
|
|
92
|
map { $_->{excluded} ? undef : $_ } @{ $hash->{phenotypicFeatures} }; |
|
|
179
|
|
|
|
|
1289
|
|
|
|
71
|
|
|
|
|
286
|
|
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
512
|
|
|
|
|
|
|
# Because of setting to undef the excluded properties, it can happen that in |
|
513
|
|
|
|
|
|
|
# the stats file we have phenotypicFeatures = 100% but then it turns out |
|
514
|
|
|
|
|
|
|
# that some individuals have phenotypicFeatures = {} (all excluded) |
|
515
|
200
|
|
|
|
|
598
|
return $hash; |
|
516
|
|
|
|
|
|
|
} |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
sub remap_hash { |
|
519
|
|
|
|
|
|
|
|
|
520
|
200
|
|
|
200
|
0
|
301
|
my $arg = shift; |
|
521
|
200
|
|
|
|
|
278
|
my $hash = $arg->{hash}; |
|
522
|
200
|
|
|
|
|
228
|
my $weight = $arg->{weight}; |
|
523
|
200
|
|
|
|
|
239
|
my $self = $arg->{self}; |
|
524
|
200
|
|
|
|
|
251
|
my $nodes = $self->{nodes}; |
|
525
|
200
|
|
|
|
|
279
|
my $edges = $self->{edges}; |
|
526
|
200
|
|
|
|
|
192
|
my $out_hash; |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# Do some pruning excluded / included |
|
529
|
200
|
|
|
|
|
427
|
prune_excluded_included( $hash, $self ); |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
532
|
|
|
|
|
|
|
# The user may include a term that: |
|
533
|
|
|
|
|
|
|
# A - may not exist in any individual |
|
534
|
|
|
|
|
|
|
# B - does not exist in some individuals |
|
535
|
|
|
|
|
|
|
# If the term does not exist in an individual |
|
536
|
|
|
|
|
|
|
# - a) -include-terms contains ANOTHER TERM THAT EXISTS |
|
537
|
|
|
|
|
|
|
# %$hash will contain keys => OK |
|
538
|
|
|
|
|
|
|
# - b) -include-terms only includes the term/terms not present |
|
539
|
|
|
|
|
|
|
# %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm |
|
540
|
|
|
|
|
|
|
#print Dumper $hash; |
|
541
|
200
|
50
|
|
|
|
366
|
return {} unless %$hash; |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# A bit more pruning plus collapsing |
|
544
|
200
|
|
|
|
|
327
|
$hash = fold( undef_excluded_phenotypicFeatures($hash) ); |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# Load the hash that points to the hierarchy for ontology |
|
547
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
548
|
|
|
|
|
|
|
# - phenotypicFeatures.featureType.id => BFF |
|
549
|
|
|
|
|
|
|
# - phenotypicFeatures.type.id => PXF |
|
550
|
200
|
|
|
|
|
893536
|
my $id_correspondence = $self->{id_correspondence}; |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# Now we proceed for each key |
|
553
|
200
|
|
|
|
|
282
|
for my $key ( keys %{$hash} ) { |
|
|
200
|
|
|
|
|
3534
|
|
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
# To see which ones were discarded |
|
556
|
|
|
|
|
|
|
#say $key if !defined $hash->{$key}; |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# Discard undefined |
|
559
|
20603
|
100
|
|
|
|
35040
|
next unless defined $hash->{$key}; |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time) |
|
562
|
|
|
|
|
|
|
# Some can be "rescued" by adding the ontology as ($1) |
|
563
|
|
|
|
|
|
|
# NB1: We discard _labels too!! |
|
564
|
|
|
|
|
|
|
# NB2: info|metaData are always discarded |
|
565
|
|
|
|
|
|
|
|
|
566
|
19847
|
|
|
|
|
23046
|
my $regex = $self->{exclude_properties_regex}; |
|
567
|
|
|
|
|
|
|
next |
|
568
|
19847
|
100
|
100
|
|
|
94501
|
if ( $regex && $key =~ m/$regex/ ) |
|
569
|
|
|
|
|
|
|
; # $regex has to be defined and be != '' |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# The user can turn on age related values |
|
572
|
7976
|
100
|
66
|
|
|
28010
|
next if ( $key =~ m/age|onset/i && !$self->{age} ); # $self->{age} [0|1] |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# Load values |
|
575
|
7563
|
|
|
|
|
10016
|
my $val = $hash->{$key}; |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
# Discarding lines with val (Time profiled with :NYTProf: ms time) |
|
578
|
|
|
|
|
|
|
next |
|
579
|
7563
|
100
|
33
|
|
|
39187
|
if ( $val eq 'NA' |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
580
|
|
|
|
|
|
|
|| $val eq 'Fake' |
|
581
|
|
|
|
|
|
|
|| $val eq 'None:No matching concept' |
|
582
|
|
|
|
|
|
|
|| $val =~ m/1900-01-01|NA0000|P999Y|P9999Y|ARRAY|phenopacket_id/ ); |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# Add IDs to key |
|
585
|
5715
|
|
|
|
|
9479
|
my $id_key = add_id2key( $key, $hash, $self ); |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# Finally add value to id_key |
|
588
|
5715
|
|
|
|
|
11387
|
my $tmp_key = $id_key . '.' . $val; |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
# Add HPO ascendants |
|
591
|
5715
|
50
|
33
|
|
|
9276
|
if ( defined $edges && $val =~ /^HP:/ ) { |
|
592
|
0
|
|
|
|
|
0
|
my $ascendants = add_hpo_ascendants( $tmp_key, $nodes, $edges ); |
|
593
|
0
|
|
|
|
|
0
|
$out_hash->{$_} = 1 for @$ascendants; # weight 1 for now |
|
594
|
|
|
|
|
|
|
} |
|
595
|
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
################## |
|
597
|
|
|
|
|
|
|
# Assign weights # |
|
598
|
|
|
|
|
|
|
################## |
|
599
|
|
|
|
|
|
|
# NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION! |
|
600
|
|
|
|
|
|
|
# NB: We don't warn if it does not exist, just assign 1 |
|
601
|
|
|
|
|
|
|
# *** IMPORTANT *** 07-26-2023 |
|
602
|
|
|
|
|
|
|
# We allow for assigning weights by TERM (e.g., 1D) |
|
603
|
|
|
|
|
|
|
# but VARIABLE level takes precedence to TERM |
|
604
|
|
|
|
|
|
|
|
|
605
|
5715
|
|
|
|
|
6221
|
my $tmp_key_at_term_level = $tmp_key; |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
# If variable has . then capture $1 |
|
608
|
5715
|
50
|
|
|
|
10407
|
if ( $tmp_key_at_term_level =~ m/\./ ) { |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# NB: For long str regex is faster than (split /\./, $foo)[0] |
|
611
|
5715
|
|
|
|
|
10573
|
$tmp_key_at_term_level =~ m/^(\w+)\./; |
|
612
|
5715
|
|
|
|
|
9084
|
$tmp_key_at_term_level = $1; |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# ORDER MATTERS !!!! |
|
616
|
|
|
|
|
|
|
$out_hash->{$tmp_key} = |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
# VARIABLE LEVEL |
|
619
|
|
|
|
|
|
|
exists $weight->{$tmp_key} |
|
620
|
|
|
|
|
|
|
? $weight->{$tmp_key} |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# TERM LEVEL |
|
623
|
|
|
|
|
|
|
: exists $weight->{$tmp_key_at_term_level} |
|
624
|
5715
|
50
|
|
|
|
14904
|
? $weight->{$tmp_key_at_term_level} |
|
|
|
100
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
# NO WEIGHT |
|
627
|
|
|
|
|
|
|
: 1; |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# Finally we load the Nomenclature hash |
|
630
|
5715
|
|
|
|
|
6260
|
my $label = $key; |
|
631
|
5715
|
|
|
|
|
13175
|
$label =~ s/id/label/; |
|
632
|
5715
|
100
|
|
|
|
17732
|
$nomenclature{$tmp_key} = $hash->{$label} if defined $hash->{$label}; |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
636
|
|
|
|
|
|
|
# We have to return an object {} when undef |
|
637
|
200
|
|
50
|
|
|
4034
|
return $out_hash // {}; |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
sub add_hpo_ascendants {

    my ( $key, $nodes, $edges ) = @_;

    # Extract the ontology id (e.g., 0000539) from a key ending in HP:0000539.
    # BUGFIX: bind the capture in the match expression instead of reading $1
    # after an unchecked match — a failed match would silently reuse a stale
    # capture from a previous regex and build garbage keys.
    my ($ontology) = $key =~ m/HP:(\w+)$/;
    return [] unless defined $ontology;    # no HP term in key => no ascendants

    # We'll use it to build a string equivalent to a key from $edges
    my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
    my $hpo_key = $hpo_url . $ontology;

    # We will include all ascendants in an array
    my @ascendants;
    for my $parent_id ( @{ $edges->{$hpo_key} } ) {

        # Work on a copy so the original $parent_id is not modified
        # (it can appear in multiple individuals). Guard the capture here too.
        my ($copy_parent_id) = $parent_id =~ m/\/(\w+)$/;
        next unless defined $copy_parent_id;    # skip malformed edge entries
        $copy_parent_id =~ tr/_/:/;             # HP_0000539 -> HP:0000539

        # *** IMPORTANT ***
        # We cannot add any label to the ascendants, otherwise they will
        # not be matched by an indv down the tree:
        #   Myopia / Mild Myopia
        # We want 'Mild Myopia' to match 'Myopia', thus no label from
        # 'Mild Myopia'. Labels are used only for debug.
        my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
        $asc_key =~ s/HP:$ontology/$copy_parent_id/g;
        push @ascendants, $asc_key;

        # Finally register the parent's label in the shared %nomenclature hash
        my $hpo_asc_str = $hpo_url
          . $copy_parent_id;       # http://purl.obolibrary.org/obo/HP_HP:0000539
        $hpo_asc_str =~ s/HP://;   # http://purl.obolibrary.org/obo/HP_0000539
        $nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
    }
    return \@ascendants;
}
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
sub add_id2key {

    my ( $key, $hash, $self ) = @_;

    # Per-format map: array term name -> sub-path that holds the element id
    my $term2id_path      = $self->{id_correspondence}{ $self->{format} };
    my $terms_alternation = join '|', @{ $self->{array_terms} };
    my $array_regex       = $self->{array_regex};

    # Only keys that belong to one of the configured array terms are rewritten
    if ( $key =~ /$terms_alternation/ ) {

        # Populates $1 (term), $2 (array index) and, when present, $3 (rest)
        $key =~ m/$array_regex/;

        if ( defined $3 ) {

            # Normal behaviour: swap the positional array index for the
            # element's own id value looked up in the flattened hash
            my $lookup = join '', $1, ':', $2, '.', $term2id_path->{$1};
            $key = join '.', $1, $hash->{$lookup}, $3;
        }
        else {

            # Only two capture groups: keep just the bare term
            $key = $1;
        }
    }
    return $key;
}
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
sub create_binary_digit_string {

    my ( $glob_hash, $cmp_hash ) = @_;
    my $out_hash;    # stays undef when $cmp_hash has no individuals

    # *** IMPORTANT ***
    # Sort the global keys once so every individual's vector follows the
    # same column order as the serialized (sorted) representation.
    my @columns = sort keys %{$glob_hash};

    # One-hot encoding = representing categorical data as numerical,
    # one vector per individual (no need to sort the ids)
    for my $individual_id ( keys %{$cmp_hash} ) {

        my $bits          = '';
        my $weighted_bits = '';
        for my $column (@columns) {
            my $present = exists $cmp_hash->{$individual_id}{$column} ? 1 : 0;
            $bits .= $present;

            # The weighted string repeats the bit <weight> times
            # (string repetition, exactly like the original "(1) x $w")
            $weighted_bits .= ( $present ? '1' : '0' ) x $glob_hash->{$column};
        }
        $out_hash->{$individual_id}{binary_digit_string}          = $bits;
        $out_hash->{$individual_id}{binary_digit_string_weighted} =
          $weighted_bits;
    }
    return $out_hash;
}
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
sub parse_hpo_json {

    my $data = shift;

    # <hp.json> is the Human Phenotype Ontology serialized as a directed
    # acyclic graph (obographs JSON). It holds a single graph with:
    #   - nodes: one object per HPO term ({id, lbl, meta, type})
    #   - edges: one object per relationship ({sub => child, obj => parent,
    #            pred => predicate, usually "is_a"})
    my $graph = $data->{graphs}[0];

    # Index the term objects by their id for O(1) lookup
    my %node_by_id;
    $node_by_id{ $_->{id} } = $_ for @{ $graph->{nodes} };

    # Map each child term to the list of its parent term ids
    my %parents_of;
    for my $edge ( @{ $graph->{edges} } ) {
        push @{ $parents_of{ $edge->{sub} } }, $edge->{obj};
    }

    return \%node_by_id, \%parents_of;
}
|
771
|
|
|
|
|
|
|
1; |