File Coverage

blib/lib/UMLS/Association/Measures/MWA.pm

Criterion	Covered	Total	%
statement	139	163	85.2
branch	24	30	80.0
condition	3	6	50.0
subroutine	7	8	87.5
pod	0	1	0.0
total	173	208	83.1

line	stmt	bran	cond	sub	pod	time	code
1							#UMLS::Association::Measures::MWA
2							# Computes the Minimum Weight Association between two sets of terms
3							#
4							# MWA works by first finding the sets of linking terms for the A terms
5							# and C terms to form sets B_A and B_C. It then uses these sets to
6							# compute N1P - the count of co-occurrences with A (same as direct
7							# association), NP1 - the count of co-occurrences with C (same as direct
8							# association), NPP - the total count of co-occurrences in the dataset
9							# (same as direct association), and N11 - the average minimum of A to B
10							# and B to C co-occurrences for each A to B to C connection. In other words,
11							# to find N11, we find sum A_i to B_j to form ABj and sum B_j to C_k to form
12							# BjC. We then take the minimum between ABj and BjC for each Bj and average
13							# over all BjC. This imitates the average of minimum information flow between
14							# A and C between each shared linking term, Bj.
15	1			1		4	use strict;
	1					1
	1					18
16	1			1		3	use warnings;
	1					1
	1					762
17
18							package UMLS::Association::Measures::MWA;
19
20							# Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList
21							# using minimum weight association (MWA)
22							# Input:
23							# $pairHashListRef - ref to an array of pairHashes
24							# $matrixFileName - the fileName of the co-occurrence matrix
25							# $noOrder - 1 if order is ignored (co-occurrences are counted in both directions), 0 if order is enforced
26							# Output:
27							# \@statsList - ref to an array of \@stats, refs to arrays
28							# containing the ordered values: n11, n1p, np1, npp
29							# for each of the pair hashes. The index of the
30							# \@statsList corresponds to the index of the pairHash
31							# in the input $pairHashListRef
32							sub getStats {
33	4			4	0	4	my $pairHashListRef = shift;
34	4					5	my $matrixFileName = shift;
35	4					5	my $noOrder = shift;
36
37							#Read in all stats
38	4					6	my ($n1pRef, $np1Ref, $npp, $matrixRef, $linkingPairHashListRef) = &UMLS::Association::StatFinder::getLinkingTermsPairHashList($pairHashListRef, $matrixFileName, $noOrder, 1, 0);
39
40							#compute stats for each pairHash
41	4					4	my @statsList = ();
42	4					5	for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
	10					15
43	6					3	my $pairHashRef = ${$pairHashListRef}[$i];
	6					7
44	6					5	my $linkingPairHashRef = ${$linkingPairHashListRef}[$i];
	6					7
45
46							#get the stats for this pair hash
47	6					10	push @statsList, &_statsFromAllLinkingInfo($pairHashRef, $linkingPairHashRef, $n1pRef, $np1Ref, $npp, $matrixRef, $noOrder);
48							}
49
50	4					22	return \@statsList;
51							}
52
53
54							# Gets stats (n11,n1p,np1,npp) for a single pairHash using the
55							# precomputed linkingPairHash (from StatFinder::getLinkingTermsPairHashList)
56							# Input:
57							# $pairHashRef - ref to a pairHash
58							# $linkingPairHashRef - ref to the linking terms pair hash for this pairHash
59							# $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced
60							# $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced
61							# $npp - npp for the subGraphRef
62							# $subGraphRef - ref to the subgraph or matrix read in
63							# $noOrder - 1 if order is ignored (co-occurrences are counted in both directions), 0 if order is enforced
64							# Output:
65							# \@stats - ref to an array of (n11,n1p,np1,npp)
66							sub _statsFromAllLinkingInfo {
67	6			6		6	my $pairHashRef = shift;
68	6					6	my $linkingPairHashRef = shift;
69	6					6	my $n1pRef = shift;
70	6					5	my $np1Ref = shift;
71	6					5	my $npp = shift;
72	6					5	my $subGraphRef = shift;
73	6					5	my $noOrder = shift;
74
75							###############################
76							# Find Shared B Terms
77							###
78							# Find the overlapping (shared) Co-occurrences
79							#grab terms from set1
80	6					7	my %set1Terms = ();
81	6					4	foreach my $cui (@{${$linkingPairHashRef}{'set1'}}) {
	6					13
	6					10
82	16					15	$set1Terms{$cui} = 1;
83							}
84
85							#find the overlapping B terms and save as an array
86	6					7	my %sharedBTerms = ();
87	6					4	foreach my $cui (@{${$linkingPairHashRef}{'set2'}}) {
	6					4
	6					9
88	20	100				37	if (exists $set1Terms{$cui}) {
89	10					10	$sharedBTerms{$cui} = 1;
90							}
91							}
92
93							###############################
94							# Calculate Stats
95							###
96	6					8	my $n11 = &_calculateN11($subGraphRef, $pairHashRef, \%sharedBTerms, $noOrder);
97	6					9	my $n1p = &_calculateN1P($subGraphRef, $pairHashRef, $n1pRef, $noOrder);
98	6					9	my $np1 = &_calculateNP1($subGraphRef, $pairHashRef, $np1Ref, $noOrder);
99
100							#pack and save the stats for this pair hash
101	6					7	my @stats = ($n11, $n1p, $np1, $npp);
102	6					13	return \@stats;
103							}
104
105
106							# Calculates N11 for a pairHash
107							# Input:
108							# $subGraphRef - ref to the subgraph or matrix read in
109							# $pairHashRef - ref to a pairHash
110							# $sharedCoocRef - ref to hash{cui} = 1 of all shared B terms
111							# $noOrder - 1 if order is ignored (co-occurrences are counted in both directions), 0 if order is enforced
112							# Output:
113							# $n11 - n11 for this pairHash
114							sub _calculateN11 {
115							#grab params
116	6			6		7	my $subGraphRef = shift;
117	6					3	my $pairHashRef = shift;
118	6					18	my $sharedCoocRef = shift;
119	6					6	my $noOrder = shift;
120
121							#calculate n11 as the minimum average weight
122	6					3	my $n11 = 0;
123							#my $count = 0;
124	6					6	foreach my $bNode (keys %{$sharedCoocRef}) {
	6					8
125							#get the a to b value, which is the sum of all a_i to b
126	10					10	my $abVal = 0;
127	10					9	my $counted = 0;
128	10					7	foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
	10					8
	10					13
129	14					10	my $counted = 0;
130	14	100				11	if (exists ${${$subGraphRef}{$aNode}}{$bNode}) {
	14					14
	14					31
131	12					11	$abVal += ${${$subGraphRef}{$aNode}}{$bNode};
	12					12
	12					12
132							}
133	14	100				21	if ($noOrder) {
134							#avoid double counting either self references
135							# or overlapping set references
136	7	50				8	if ($counted == 0) {
137							#increment for noorder
138	7	50				7	if (exists ${${$subGraphRef}{$bNode}}{$aNode}) {
	7					4
	7					26
139	0					0	$abVal += ${${$subGraphRef}{$bNode}}{$aNode};
	0					0
	0					0
140							}
141							}
142							}
143							}
144
145							#get the b to C value, which is the sum of all b to c_i
146	10					11	my $bcVal = 0;
147	10					9	foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
	10					10
	10					11
148	16					12	my $counted = 0;
149							#get the c to b value
150	16	100				23	if (exists ${${$subGraphRef}{$bNode}}{$cNode}) {
	16					13
	16					26
151	12					10	$bcVal += ${${$subGraphRef}{$bNode}}{$cNode};
	12					8
	12					12
152	12					10	$counted = 1;
153							}
154	16	100				22	if ($noOrder) {
155							#avoid double counting either self references
156							# or overlapping set references
157	8	100				11	if ($counted == 0) {
158	2	50				2	if (exists ${${$subGraphRef}{$cNode}}{$bNode}) {
	2					2
	2					4
159	0					0	$bcVal += ${${$subGraphRef}{$cNode}}{$bNode};
	0					0
	0					0
160							}
161							}
162							}
163							}
164
165							#get the minimum value and increment n11
166							#find the min
167	10					11	my $min = $abVal;
168	10	50				11	if ($bcVal < $min) {
169	0					0	$min = $bcVal;
170							}
171							#increment n11
172	10					11	$n11 += $min;
173							#$count++;
174							}
175
176							#NOTE - count can be deleted completely from this, but
177							# re-enable the divide by count below if you want to compute AMW (then just return n11)
178							#if ($count > 0) {
179							# $n11 /= $count;
180							# }
181
182	6					9	return $n11;
183							}
184
185
186							#calculates N1P for a pairHash
187							# Input:
188							# $subGraphRef - ref to the subgraph or matrix read in
189							# $pairHashRef - ref to a pairHash
190							# $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced
191							# $noOrder - 1 if order is ignored (co-occurrences are counted in both directions), 0 if order is enforced
192							# Output:
193							# $n1p - n1p for this pairHash
194							sub _calculateN1P {
195	6			6		6	my $subGraphRef = shift;
196	6					4	my $pairHashRef = shift;
197	6					5	my $n1pRef = shift;
198	6					6	my $noOrder = shift;
199
200							#NOTE - two methods, one if we record n1p, one if we dont
201							#calculate $n1p as the sum of all set1 cooc
202							=comment
203							my $n1p = 0;
204							#find all a to b co-occurrences
205							foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
206							foreach my $bNode (keys @{$linkingTermsRef}) {
207							$n1p += ${${$subGraphRef}{$aNode}}{$bNode};
208							}
209							}
210							=cut
211	6					4	my $n1p = 0;
212							#find all a to b co-occurrences
213	6					6	foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
	6					5
	6					8
214	8					30	$n1p += ${$n1pRef}{$aNode};
	8					8
215							}
216	6	100				13	if ($noOrder) {
217							#convert the pair hash array to a hash
218	3					4	my %set1 = ();
219	3					2	foreach my $key (@{${$pairHashRef}{'set1'}}) {
	3					2
	3					4
220	4					9	$set1{$key} = 1;
221							}
222
223							#find all b to a co-occurrences
224	3					4	foreach my $bNode (keys %{$subGraphRef}) {
	3					6
225	25					17	foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
	25					22
	25					25
226							#avoid double counting self co-occurrences
227	35	100	66			63	if (exists $set1{$aNode} && exists $set1{$bNode}) {
228	6					7	next;
229							}
230
231							#increment n1p
232	29	50				19	if (defined ${${$subGraphRef}{$bNode}}{$aNode}) {
	29					25
	29					44
233	0					0	$n1p += ${${$subGraphRef}{$bNode}}{$aNode};
	0					0
	0					0
234							}
235							}
236							}
237							}
238
239	6					7	return $n1p;
240							}
241
242							# Calculates NP1 for a pair hash
243							# Input:
244							# $subGraphRef - ref to the subgraph or matrix read in
245							# $pairHashRef - ref to a pairHash
246							# $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced
247							# $noOrder - 1 if order is ignored (co-occurrences are counted in both directions), 0 if order is enforced
248							# Output:
249							# $np1 - np1 for this pairHash
250							sub _calculateNP1 {
251	6			6		5	my $subGraphRef = shift;
252	6					6	my $pairHashRef = shift;
253	6					4	my $np1Ref = shift;
254	6					6	my $noOrder = shift;
255
256							#NOTE - two methods, one if we record np1, one if we dont
257							#calculate $np1 as the sum of all set2 cooc
258							=comment
259							my $np1 = 0;
260							#find all b to c co-occurrences
261							foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
262							foreach my $bNode (keys @{$linkingTermsRef}) {
263							$np1 += ${${$subGraphRef}{$bNode}}{$cNode};
264							}
265							}
266							=cut
267	6					4	my $np1 = 0;
268							#find all b to c co-occurrences
269	6					5	foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
	6					6
	6					7
270	10					10	$np1 += ${$np1Ref}{$cNode};
	10					10
271							}
272	6	100				7	if ($noOrder) {
273							#convert the pair hash array to a hash
274	3					4	my %set2 = ();
275	3					3	foreach my $key (@{${$pairHashRef}{'set2'}}) {
	3					3
	3					4
276	5					5	$set2{$key} = 1;
277							}
278
279							#find all c to b co-occurrences
280	3					8	foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
	3					3
	3					6
281	5					4	foreach my $bNode (keys %{${$subGraphRef}{$cNode}}) {
	5					4
	5					11
282
283							#avoid double counting pointing to self
284	3	50	33			15	if (exists $set2{$bNode} && exists $set2{$cNode}) {
285	3					4	next;
286							}
287
288							#increment $np1
289	0					0	$np1 += ${${$subGraphRef}{$cNode}}{$bNode};
	0					0
	0					0
290							}
291							}
292							}
293
294	6					8	return $np1;
295							}
296
297
298							# Calculates NPP for a subGraph (dataset)
299							# Input:
300							# $subGraphRef - ref to the subgraph or matrix read in
301							# Output:
302							# $npp - npp for this dataset
303							sub _calculateNPP {
304	0			0			my $subGraphRef = shift;
305
306							#calculate npp as the total number of cooccurrences
307	0						my $npp = 0;
308	0						foreach my $key1 (keys %{$subGraphRef}) {
	0
309	0						foreach my $key2 (keys %{${$subGraphRef}{$key1}}) {
	0
	0
310	0						$npp += ${${$subGraphRef}{$key1}}{$key2};
	0
	0
311							}
312							}
313	0						return $npp;
314							}
315
316							1;