File Coverage

lib/Pheno/Ranker/Align.pm

Criterion	Covered	Total	%
statement	262	307	85.3
branch	65	88	73.8
condition	20	38	52.6
subroutine	23	26	88.4
pod	0	14	0.0
total	370	473	78.2

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::Align;
2
3	3			3		22	use strict;
	3					6
	3					100
4	3			3		15	use warnings;
	3					10
	3					79
5	3			3		15	use autodie;
	3					5
	3					17
6	3			3		17234	use feature qw(say);
	3					8
	3					229
7	3			3		20	use List::Util qw(any shuffle);
	3					4
	3					210
8
9							#use List::MoreUtils qw(duplicates);
10	3			3		17	use Data::Dumper;
	3					6
	3					162
11	3			3		2770	use Sort::Naturally qw(nsort);
	3					13999
	3					202
12	3			3		2482	use Hash::Fold fold => { array_delimiter => ':' };
	3					117109
	3					17
13	3			3		2968	use Pheno::Ranker::Stats;
	3					9
	3					240
14
15	3			3		22	use Exporter 'import';
	3					6
	3					151
16							our @EXPORT =
17							qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
18
19	3			3		15	use constant DEVEL_MODE => 0;
	3					7
	3					13721
20
21							our %nomenclature = ();
22
23							sub check_format {
24
25	3			3	0	17	my $data = shift;
26	3	50				25	return exists $data->[0]{subject} ? 'PXF' : 'BFF';
27							}
28
29							sub cohort_comparison {
30
31	5			5	0	11	my ( $ref_binary_hash, $self ) = @_;
32	5					16	my $out_file = $self->{out_file};
33
34							say "Performing COHORT comparison"
35	5	50	33			34	if ( $self->{debug} \|\| $self->{verbose} );
36
37							# * IMPORTANT *
38							# The ids/cohorts are naturally sorted (it won't match --append-prefixes!!!)
39	5					12	my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
	5					230
40
41							# Print to $out_file
42							#
43	5					23678	open( my $fh, ">", $out_file );
44	5					7232	say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
45
46							# NB: It's a symmetric matrix so we could just compute
47							# triangle. However, R needs the whole matrix
48
49							# Initialize the 2D matrix
50	5					15	my @matrix;
51
52							# I elements
53	5					36	for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
54							say "Calculating <"
55							. $sorted_keys_ref_binary_hash[$i]
56							. "> against the cohort..."
57	163	50				293	if $self->{verbose};
58							my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
59	163					251	{binary_digit_string_weighted};
60	163					218	print $fh $sorted_keys_ref_binary_hash[$i] . "\t";
61
62							# J elements
63	163					277	for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
64							my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
65	5875					7375	{binary_digit_string_weighted};
66	5875					5276	my $distance;
67
68	5875	100				7707	if ( $j < $i ) {
		100
69
70							# Use the distance from the lower triangle
71	2856					2981	$distance = $matrix[$j][$i];
72							}
73							elsif ( $j == $i ) {
74	163					168	$distance = 0; # Diagonal element
75							}
76							else {
77							# Compute the distance and store it in the matrix
78	2856					3991	$distance = hd_fast( $str1, $str2 );
79	2856					4150	$matrix[$i][$j] = $distance;
80							}
81
82							# Fill out the other triangle
83	5875					6817	$matrix[$j][$i] = $distance;
84
85	5875					8668	print $fh $distance . "\t";
86							}
87
88	163					244	print $fh "\n";
89							}
90
91	5					58	close $fh;
92	5	50	33			3269	say "Matrix saved to <$out_file>" if ( $self->{debug} \|\| $self->{verbose} );
93	5					166	return 1;
94							}
95
96							sub compare_and_rank {
97
98	1			1	0	3	my $arg = shift;
99	1					2	my $glob_hash = $arg->{glob_hash};
100	1					3	my $ref_binary_hash = $arg->{ref_binary_hash};
101	1					2	my $tar_binary_hash = $arg->{tar_binary_hash};
102	1					3	my $weight = $arg->{weight};
103	1					3	my $self = $arg->{self};
104	1					3	my $sort_by = $self->{sort_by};
105	1					4	my $align = $self->{align};
106	1					2	my $max_out = $self->{max_out};
107
108							say "Performing COHORT(REF)-PATIENT(TAR) comparison"
109	1	50	33			82	if ( $self->{debug} \|\| $self->{verbose} );
110
111							# Hash for compiling distances
112	1					4	my $score;
113
114							# Hash for stats
115							my $stat;
116
117							# Load TAR binary string
118	1					2	my ($tar) = keys %{$tar_binary_hash};
	1					5
119							my $tar_str_weighted =
120	1					4	$tar_binary_hash->{$tar}{binary_digit_string_weighted};
121
122	1					2	for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
	1					11
123
124							# Load REF binary string
125							my $ref_str_weighted =
126	36					79	$ref_binary_hash->{$key}{binary_digit_string_weighted};
127	36	50				59	say "Comparing <id:$key> --- <id:$tar>" if $self->{verbose};
128							say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
129	36	50	33			66	if ( defined $self->{debug} && $self->{debug} > 1 );
130							$score->{$key}{hamming} =
131	36					78	hd_fast( $ref_str_weighted, $tar_str_weighted );
132							$score->{$key}{jaccard} =
133	36					56	jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
134
135							# Add values
136	36					46	push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
	36					77
137	36					36	push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
	36					61
138							}
139	1					7	$stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
140	1					5	$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
141
142							# Initialize a few variables
143	1					6	my @headers = (
144							'RANK', 'REFERENCE(ID)',
145							'TARGET(ID)', 'FORMAT',
146							'LENGTH', 'WEIGHTED',
147							'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
148							'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
149							'JACCARD-INDEX', 'JACCARD-Z-SCORE',
150							'JACCARD-P-VALUE'
151							);
152	1					7	my $header = join "\t", @headers;
153	1					2	my @results = $header;
154	1					3	my %info;
155	1					3	my $length_align = length($tar_str_weighted);
156	1	50				4	my $weight_bool = $weight ? 'True' : 'False';
157	1					2	my @alignments_ascii;
158							my $alignment_str_csv;
159	1					3	my @alignments_csv = join ';',
160							qw/id ref indicator tar weight hamming-distance json-path label/;
161
162							# The dataframe will have two header lines
163							# * IMPORTANT *
164							# nsort does not yield same results as canonical from JSON::XS
165							# NB: we're sorting here and in create_binary_digit_string()
166	1					2	my @sort_keys_glob_hash = sort keys %{$glob_hash};
	1					136
167	1	100				10	my @labels = map { exists $nomenclature{$_} ? $nomenclature{$_} : $_ }
	159					262
168							@sort_keys_glob_hash;
169
170							# Die if #elements in arrays differ
171	1	50				6	die "Mismatch between variables and labels"
172							if @sort_keys_glob_hash != @labels;
173
174							# Labels
175							# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
176	1					22	my @dataframe = join ';', 'Id', @labels;
177
178							# Variables (json path)
179	1					21	push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
180
181							# 0/1
182							push @dataframe, join ';', qq/T\|$tar/,
183	1					43	( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
184
185							# Sort %score by value and load results
186	1					9	my $count = 1;
187	1					3	$max_out++; # to be able to start w/ ONE
188
189							# Start loop
190	1					13	for my $key (
191							sort {
192							$sort_by eq 'jaccard' #
193							? $score->{$b}{$sort_by}
194							<=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
195							: $score->{$a}{$sort_by}
196	139	50				211	<=> $score->{$b}{$sort_by} # 0 to N (distance)
197							} keys %$score
198							)
199							{
200
201	36	50				95	say "$count: Creating alignment <id:$key>" if $self->{verbose};
202
203							# Create ASCII alignment
204							# NB: We need it here to get $n_00
205							my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
206							create_alignment(
207							{
208							ref_key => $key,
209							ref_str_weighted =>
210							$ref_binary_hash->{$key}{binary_digit_string_weighted},
211	36					259	tar_str_weighted => $tar_str_weighted,
212							glob_hash => $glob_hash,
213							sorted_keys_glob_hash => \@sort_keys_glob_hash,
214							labels => \@labels
215							}
216							);
217
218							# * IMPORTANT *
219							# The LENGTH of the alignment is based on the #variables in the REF-COHORT
220							# Compute estimated av and dev for binary_string of L = length_align - n_00
221							# Corrected length_align L = length_align - n_00
222	36					106	my $length_align_corrected = $length_align - $n_00;
223	36					116	( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
224							estimate_hamming_stats($length_align_corrected);
225
226							# Compute a few stats
227							my $hamming_z_score = z_score(
228							$score->{$key}{hamming},
229							$stat->{hamming_stats}{mean},
230							$stat->{hamming_stats}{sd}
231	36					128	);
232							my $hamming_z_score_from_random = z_score(
233							$score->{$key}{hamming},
234							$stat->{hamming_stats}{mean_rnd},
235							$stat->{hamming_stats}{sd_rnd}
236	36					85	);
237
238							#my $hamming_p_value =
239							# p_value( $score->{$key}{hamming}, $length_align_corrected );
240	36					62	my $hamming_p_value_from_z_score =
241							p_value_from_z_score($hamming_z_score);
242							my $jaccard_z_score = z_score(
243							$score->{$key}{jaccard},
244							$stat->{jaccard_stats}{mean},
245							$stat->{jaccard_stats}{sd}
246	36					103	);
247	36					66	my $jaccard_p_value_from_z_score =
248							p_value_from_z_score( 1 - $jaccard_z_score );
249
250							# Create a hash with formats
251							my $format = {
252							'RANK' => { value => $count, format => undef },
253							'REFERENCE(ID)' => { value => $key, format => undef },
254							'TARGET(ID)' => { value => $tar, format => undef },
255							'FORMAT' => { value => $self->{format}, format => undef },
256							'WEIGHTED' => { value => $weight_bool, format => undef },
257							'LENGTH' => { value => $length_align_corrected, format => '%6d' },
258							'HAMMING-DISTANCE' =>
259							{ value => $score->{$key}{hamming}, format => '%4d' },
260							'DISTANCE-Z-SCORE' =>
261							{ value => $hamming_z_score, format => '%7.3f' },
262							'DISTANCE-P-VALUE' =>
263							{ value => $hamming_p_value_from_z_score, format => '%12.7f' },
264							'DISTANCE-Z-SCORE(RAND)' =>
265							{ value => $hamming_z_score_from_random, format => '%8.4f' },
266							'JACCARD-INDEX' =>
267	36					658	{ value => $score->{$key}{jaccard}, format => '%7.3f' },
268							'JACCARD-Z-SCORE' =>
269							{ value => $jaccard_z_score, format => '%7.3f' },
270							'JACCARD-P-VALUE' =>
271							{ value => $jaccard_p_value_from_z_score, format => '%12.7f' },
272							};
273
274							# Serialize results
275							my $tmp_str = join "\t", map {
276	36					88	defined $format->{$_}{format}
277							? sprintf( $format->{$_}{format}, $format->{$_}{value} )
278							: $format->{$_}{value}
279	468	100				1627	} @headers;
280	36					102	push @results, $tmp_str;
281
282							# To save memory only load if --align
283	36	50				68	if ( defined $align ) {
284
285							# Add all of the above to @alignments_ascii
286	36					121	my $sep = ('-') x 80;
287	36					1161	push @alignments_ascii,
288							qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
289
290							# Add all of the above to @alignments_csv
291	36					980	push @alignments_csv, @$alignment_arr_csv;
292
293							# Add data to dataframe
294							push @dataframe, join ';', qq/R\|$key/,
295	36					1527	( split //, $ref_binary_hash->{$key}{binary_digit_string} );
296
297							# Add values to info
298							$info{$key} = {
299							weighted => $weight_bool eq 'True'
300							? JSON::XS::true
301							: JSON::XS::false,
302							reference_id => $key,
303							target_id => $tar,
304							reference_binary_string =>
305							$ref_binary_hash->{$key}{binary_digit_string},
306							target_binary_string =>
307							$tar_binary_hash->{$key}{binary_digit_string},
308							reference_binary_string_weighted =>
309							$ref_binary_hash->{$key}{binary_digit_string_weighted},
310							target_binary_string_weighted =>
311							$tar_binary_hash->{$key}{binary_digit_string_weighted},
312							alignment_length => $length_align_corrected,
313							hamming_distance => $score->{$key}{hamming},
314							hamming_z_score => $hamming_z_score,
315							hamming_p_value => $hamming_p_value_from_z_score,
316							jaccard_similarity => $score->{$key}{jaccard},
317							jaccard_z_score => $jaccard_z_score,
318							jaccard_p_value => $jaccard_p_value_from_z_score,
319							jaccard_distance => 1 - $score->{$key}{jaccard},
320							format => $self->{format},
321	36	50				419	alignment => $$alignment_str_ascii,
322							};
323
324							}
325
326	36					1556	$count++;
327	36	100				496	last if $count == $max_out;
328							}
329
330	1					57	return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
331							}
332
333							sub create_alignment {
334
335							# NB: The alignment will use the weighted string
336	36			36	0	59	my $arg = shift;
337	36					46	my $ref_key = $arg->{ref_key};
338	36					42	my $binary_string1 = $arg->{ref_str_weighted};
339	36					77	my $binary_string2 = $arg->{tar_str_weighted};
340	36					42	my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
341	36					43	my $labels = $arg->{labels};
342	36					41	my $glob_hash = $arg->{glob_hash};
343	36					43	my $length1 = length($binary_string1);
344	36					45	my $length2 = length($binary_string2);
345
346							# Check that l1 = l2
347	36	50				67	die "The binary strings must have the same length"
348							if ( $length1 != $length2 );
349
350							# Expand array to have weights as N-elements
351	36					62	my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
352
353							# Initialize some variables
354	36					55	my $out_ascii = "REF -- TAR\n";
355	36					41	my @out_csv;
356	36					44	my $cum_distance = 0;
357	36					39	my $n_00 = 0;
358
359							# For loop with 2 variables
360							# i index for weighted
361							# j the number of variables
362	36					50	my ( $i, $j ) = ( 0, 0 );
363	36					73	for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
364
365							# Load key and value
366	5724					7147	my $key = $recreated_array->[$i];
367	5724					9513	my $val = sprintf( "%3d", $glob_hash->{$key} );
368
369							# We have to keep track with $j
370	5724					6095	my $sorted_key = $sorted_keys_glob_hash->[$j];
371	5724					5545	my $label = $labels->[$j];
372
373							# Load chars
374	5724					6947	my $char1 = substr( $binary_string1, $i, 1 );
375	5724					5953	my $char2 = substr( $binary_string2, $i, 1 );
376	5724	100	100			11893	$n_00++ if ( $char1 == 0 && $char2 == 0 );
377
378							# Correct $i index by adding weights
379	5724					6387	$i = $i + $glob_hash->{$key} - 1;
380
381							# Load metrics
382	5724	100				7734	$cum_distance += $glob_hash->{$key} if $char1 ne $char2;
383	5724					7549	my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
384	5724	100				7803	my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
385	5724					6693	my $distance_pretty = sprintf( "%3d", $distance );
386
387							# w = weight, d = distance, cd = cumul distance
388	5724					11168	my %format = (
389							'11' => '-----',
390							'10' => 'xxx--',
391							'01' => '--xxx',
392							'00' => ' '
393							);
394	5724					14784	$out_ascii .=
395							qq/$char1 $format{ $char1 . $char2 } $char2 \| (w:$val\|d:$distance_pretty\|cd:$cum_distance_pretty\|) $sorted_key ($label)\n/;
396	5724					25522	push @out_csv,
397							qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
398
399							#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
400							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
401							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
402							#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
403							}
404	36					392	return $n_00, \$out_ascii, \@out_csv;
405							}
406
407							sub recreate_array {
408
409	36			36	0	63	my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
410	36					48	my @recreated_array;
411	36					63	foreach my $key (@$sorted_keys_glob_hash) {
412	5724					8205	for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
413	6012					10235	push @recreated_array, $key;
414							}
415							}
416	36					74	return \@recreated_array;
417							}
418
419							sub create_glob_and_ref_hashes {
420
421	6			6	0	16	my ( $array, $weight, $self ) = @_;
422	6					14	my $primary_key = $self->{primary_key};
423	6					25	my $glob_hash = {};
424	6					15	my $ref_hash_flattened;
425
426	6					11	for my $i ( @{$array} ) {
	6					16
427
428							# For consistency, we obtain the primary_key for both BFF/PXF
429							# from $_->{id} (not from subject.id)
430	199					489	my $id = $i->{$primary_key};
431	199	50				557	say "Flattening and remapping <id:$id> ..." if $self->{verbose};
432	199					801	my $ref_hash = remap_hash(
433							{
434							hash => $i,
435							weight => $weight,
436							self => $self
437							}
438							);
439	199					842	$ref_hash_flattened->{$id} = $ref_hash;
440
441							# The idea is to create a $glob_hash with unique key-values
442							# Duplicates will be automatically merged
443	199					8509	$glob_hash = { %$glob_hash, %$ref_hash };
444							}
445	6					39	return ( $glob_hash, $ref_hash_flattened );
446							}
447
448							sub randomize_variables {
449
450	0			0	0	0	my ( $glob_hash, $self ) = @_;
451	0					0	my $max = $self->{max_number_var};
452	0					0	my $seed = $self->{seed};
453
454							# set random seed
455	0					0	srand($seed);
456
457							# Randomize
458							# NB:keys have to be sorted for srand to work!!!!
459							# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
460	0					0	my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
461
462							# Create a new hash with a hash slice
463	0					0	my %new_glob_hash;
464	0					0	@new_glob_hash{@items} = @{$glob_hash}{@items};
	0					0
465
466							# return reference
467	0					0	return \%new_glob_hash;
468							}
469
470							sub prune_excluded_included {
471
472	200			200	0	384	my ( $hash, $self ) = @_;
473	200					220	my @included = @{ $self->{include_terms} };
	200					409
474	200					305	my @excluded = @{ $self->{exclude_terms} };
	200					292
475
476							# Die if we have both options at the same time
477	200	50	66			466	die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
478							if ( @included && @excluded );
479
480							# * IMPORTANT *
481							# Original $hash is modified
482
483							# INCLUDED
484	200	100				359	if (@included) {
485	25					52	for my $key ( keys %$hash ) {
486	125	100		225		322	delete $hash->{$key} unless any { $_ eq $key } @included;
	225					444
487							}
488							}
489
490							# EXCLUDED
491	200	50				367	if (@excluded) {
492	0					0	for my $key (@excluded) {
493	0	0				0	delete $hash->{$key} if exists $hash->{$key};
494							}
495							}
496
497							# We will do nothing if @included = @excluded = [] (DEFAULT)
498	200					298	return 1;
499							}
500
501							sub undef_excluded_phenotypicFeatures {
502
503	200			200	0	290	my $hash = shift;
504
505							# Setting the property to undef (it will be discarded later)
506	200	100				398	if ( exists $hash->{phenotypicFeatures} ) {
507	71					298	@{ $hash->{phenotypicFeatures} } =
508	71	100				100	map { $_->{excluded} ? undef : $_ } @{ $hash->{phenotypicFeatures} };
	179					1448
	71					260
509							}
510
511							# * IMPORTANT *
512							# Because of setting to undef the excludd properties, it can happen that in
513							# the stats file we have phenotypicFeatures = 100% but t hen it turns out
514							# that some individuals have phenotypicFeatures = {} (all excluded)
515	200					682	return $hash;
516							}
517
518							sub remap_hash {
519
520	200			200	0	342	my $arg = shift;
521	200					280	my $hash = $arg->{hash};
522	200					349	my $weight = $arg->{weight};
523	200					275	my $self = $arg->{self};
524	200					297	my $nodes = $self->{nodes};
525	200					267	my $edges = $self->{edges};
526	200					207	my $out_hash;
527
528							# Do some pruning excluded / included
529	200					450	prune_excluded_included( $hash, $self );
530
531							# * IMPORTANT *
532							# The user may include a term that:
533							# A - may not exist in any individual
534							# B - does not exist in some individuals
535							# If the term does not exist in a invidual
536							# - a) -include-terms contains ANOTHER TERM THAT EXISTS
537							# %$hash will contain keys => OK
538							# - b) -include-terms only includes the term/terms not present
539							# %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
540							#print Dumper $hash;
541	200	50				340	return {} unless %$hash;
542
543							# A bit more pruning plus collapsing
544	200					391	$hash = fold( undef_excluded_phenotypicFeatures($hash) );
545
546							# Load the hash that points to the hierarchy for ontology
547							# * IMPORTANT *
548							# - phenotypicFeatures.featureType.id => BFF
549							# - phenotypicFeatures.type.id => PXF
550	200					891820	my $id_correspondence = $self->{id_correspondence};
551
552							# Now we proceed for each key
553	200					286	for my $key ( keys %{$hash} ) {
	200					4157
554
555							# To see which ones were discarded
556							#say $key if !defined $hash->{$key};
557
558							# Discard undefined
559	20603	100				35199	next unless defined $hash->{$key};
560
561							# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
562							# Some can be "rescued" by adding the ontology as ($1)
563							# NB1: We discard _labels too!!
564							# NB2: info\|metaData are always discarded
565
566	19847					21295	my $regex = $self->{exclude_properties_regex};
567							next
568	19847	100	100			92427	if ( $regex && $key =~ m/$regex/ )
569							; # $regex has to be defined and be != ''
570
571							# The user can turn on age related values
572	7976	100	66			28226	next if ( $key =~ m/age\|onset/i && !$self->{age} ); # $self->{age} [0\|1]
573
574							# Load values
575	7563					9694	my $val = $hash->{$key};
576
577							# Discarding lines with val (Time profiled with :NYTProf: ms time)
578							next
579	7563	100	33			39521	if ( $val eq 'NA'
			33
			66
580							\|\| $val eq 'Fake'
581							\|\| $val eq 'None:No matching concept'
582							\|\| $val =~ m/1900-01-01\|NA0000\|P999Y\|P9999Y\|ARRAY\|phenopacket_id/ );
583
584							# Add IDs to key
585	5715					9082	my $id_key = add_id2key( $key, $hash, $self );
586
587							# Finally add value to id_key
588	5715					11345	my $tmp_key = $id_key . '.' . $val;
589
590							# Add HPO ascendants
591	5715	50	33			9678	if ( defined $edges && $val =~ /^HP:/ ) {
592	0					0	my $ascendants = add_hpo_ascendants( $tmp_key, $nodes, $edges );
593	0					0	$out_hash->{$_} = 1 for @$ascendants; # weight 1 for now
594							}
595
596							##################
597							# Assign weights #
598							##################
599							# NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
600							# NB: We don't warn if it does not exist, just assign 1
601							# * IMPORTANT * 07-26-2023
602							# We allow for assigning weights by TERM (e.g., 1D)
603							# but VARIABLE level takes precedence to TERM
604
605	5715					5900	my $tmp_key_at_term_level = $tmp_key;
606
607							# If variable has . then capture $1
608	5715	50				10308	if ( $tmp_key_at_term_level =~ m/\./ ) {
609
610							# NB: For long str regex is faster than (split /\./, $foo)[0]
611	5715					10130	$tmp_key_at_term_level =~ m/^(\w+)\./;
612	5715					8829	$tmp_key_at_term_level = $1;
613							}
614
615							# ORDER MATTERS !!!!
616							$out_hash->{$tmp_key} =
617
618							# VARIABLE LEVEL
619							exists $weight->{$tmp_key}
620							? $weight->{$tmp_key}
621
622							# TERM LEVEL
623							: exists $weight->{$tmp_key_at_term_level}
624	5715	50				15221	? $weight->{$tmp_key_at_term_level}
		100
625
626							# NO WEIGHT
627							: 1;
628
629							# Finally we load the Nomenclature hash
630	5715					6206	my $label = $key;
631	5715					12850	$label =~ s/id/label/;
632	5715	100				18748	$nomenclature{$tmp_key} = $hash->{$label} if defined $hash->{$label};
633							}
634
635							# * IMPORTANT *
636							# We have to return an object {} when undef
637	200		50			5433	return $out_hash // {};
638							}
639
640							sub add_hpo_ascendants {
641
642	0			0	0	0	my ( $key, $nodes, $edges ) = @_;
643
644							# First we obtain the ontology (0000539) from HP:0000539
645	0					0	$key =~ m/HP:(\w+)$/;
646	0					0	my $ontology = $1;
647
648							# We'll use it to build a string equivalent to a key from $edges
649	0					0	my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
650	0					0	my $hpo_key = $hpo_url . $ontology;
651
652							# We will include all ascendants in an array
653	0					0	my @ascendants;
654	0					0	for my $parent_id ( @{ $edges->{$hpo_key} } ) {
	0					0
655
656							# We have to create a copy to not modify the original $parent_id
657							# as it can appear in multiple individuals
658	0					0	my $copy_parent_id = $parent_id;
659	0					0	$copy_parent_id =~ m/\/(\w+)$/;
660	0					0	$copy_parent_id = $1;
661	0					0	$copy_parent_id =~ tr/_/:/;
662
663							# * IMPORTANT *
664							# We cannot add any label to the ascendants, otherwise they will
665							# not be matched by an indv down the tree
666							# Myopia
667							# Mild Myopia
668							# We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
669							# Use the labels only for debug
670	0					0	my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
671	0					0	$asc_key =~ s/HP:$ontology/$copy_parent_id/g;
672	0					0	push @ascendants, $asc_key;
673
674							# We finally add the label to %nomenclature
675	0					0	my $hpo_asc_str = $hpo_url
676							. $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
677	0					0	$hpo_asc_str =~ s/HP://; # 0000539
678	0					0	$nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
679							}
680	0					0	return \@ascendants;
681							}
682
683							sub add_id2key {
684
685	5715			5715	0	8146	my ( $key, $hash, $self ) = @_;
686	5715					8410	my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
687	5715					5847	my $array_terms_str = join '\|', @{ $self->{array_terms} };
	5715					12193
688	5715					7214	my $array_regex = $self->{array_regex};
689
690	5715	100				19650	if ( $key =~ /$array_terms_str/ ) {
691	5094					15270	$key =~ m/$array_regex/;
692	5094					6558	my ( $tmp_key, $val );
693
694							# Normal behaviour
695	5094	100				9091	if ( defined $3 ) {
696	4964					11751	$tmp_key = $1 . ':' . $2 . '.' . $id_correspondence->{$1};
697	4964					6612	$val = $hash->{$tmp_key};
698	4964					9617	$key = $1 . '.' . $val . '.' . $3;
699							}
700
701							# Only two
702							else {
703
704	130					245	$tmp_key = $1 . ':' . $2;
705	130					170	$val = $hash->{$tmp_key};
706	130					213	$key = $1;
707							}
708							}
709	5715					9435	return $key;
710							}
711
712							sub create_binary_digit_string {
713
714	7			7	0	19	my ( $glob_hash, $cmp_hash ) = @_;
715	7					11	my $out_hash;
716
717							# * IMPORTANT *
718							# Being a nested for, keys %{$glob_hash} does not need sorting
719							# BUT, we sort to follow the same order as serialized (sorted)
720	7					15	my @sorted_keys_glob_hash = sort keys %{$glob_hash};
	7					424
721
722							# IDs of each indidividual
723	7					40	for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
	7					60
724
725							# One-hot encoding = Representing categorical data as numerical
726	200					292	my ( $binary_str, $binary_str_weighted ) = ('') x 2;
727	200					243	for my $key (@sorted_keys_glob_hash) {
728	21837					23431	my $ones = (1) x $glob_hash->{$key};
729	21837					23325	my $zeros = (0) x $glob_hash->{$key};
730	21837	100				28582	$binary_str .= exists $cmp_hash->{$individual_id}{$key} ? 1 : 0;
731							$binary_str_weighted .=
732	21837	100				30534	exists $cmp_hash->{$individual_id}{$key} ? $ones : $zeros;
733							}
734	200					486	$out_hash->{$individual_id}{binary_digit_string} = $binary_str;
735							$out_hash->{$individual_id}{binary_digit_string_weighted} =
736	200					308	$binary_str_weighted;
737							}
738	7					74	return $out_hash;
739							}
740
741							sub parse_hpo_json {
742
743	0			0	0		my $data = shift;
744
745							# The <hp.json> file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
746							# The HPO is structured into a directed acyclic graph (DAG)
747							# Here's a brief overview of the structure of the hpo.json file:
748							# - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
749							# - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
750							# - id: The identifier of the term (e.g., "HP:0000118").
751							# - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
752							# - meta: Metadata associated with the term, including definition, synonyms, and other information.
753							# - type: The type of the term, usually "CLASS".
754							# - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
755							# - sub: The subject (child) term ID (e.g., "HP:0000924").
756							# - obj: The object (parent) term ID (e.g., "HP:0000118").
757							# - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
758							# - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
759
760	0						my $graph = $data->{graphs}->[0];
761	0						my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
	0
	0
762	0						my %edges = ();
763
764	0						for my $edge ( @{ $graph->{edges} } ) {
	0
765	0						my $child_id = $edge->{sub};
766	0						my $parent_id = $edge->{obj};
767	0						push @{ $edges{$child_id} }, $parent_id;
	0
768							}
769	0						return \%nodes, \%edges;
770							}
771							1;