File Coverage

blib/lib/UMLS/Association/Measures/MWA.pm
Criterion Covered Total %
statement 139 163 85.2
branch 24 30 80.0
condition 3 6 50.0
subroutine 7 8 87.5
pod 0 1 0.0
total 173 208 83.1


line stmt bran cond sub pod time code
1             #UMLS::Association::Measures::MWA
2             # Computes the Minimum Weight Association between two sets of terms
3             #
4             # MWA works by first finding the sets of linking terms for the A terms
5             # and C terms to form sets B_A and B_C. It then uses these sets to
6             # compute N1P - the count of co-occurrences with A (same as direct
7             # association), NP1 - the count of co-occurrences with C (same as direct
8             # association), NPP - the total count of co-occurrences in the dataset
9             # (same as direct association), and N11 - the summed minimum of A to B
10             # and B to C co-occurrences for each A to B to C connection. In other words,
11             # to find N11, we find sum A_i to B_j to form ABj and sum B_j to C_k to form
12             # BjC. We then take the minimum between ABj and BjC for each Bj and sum
13             # over all Bj (not averaged; the divide by count is disabled). This imitates the
14             # minimum information flow between A and C through each shared linking term, Bj.
15 1     1   4 use strict;
  1         1  
  1         18  
16 1     1   3 use warnings;
  1         1  
  1         762  
17              
18             package UMLS::Association::Measures::MWA;
19              
20             # Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList
21             # using minimum weight association (MWA)
22             # Input:
23             # $pairHashListRef - ref to an array of pairHashes
24             # $matrixFileName - the fileName of the co-occurrence matrix
25             # $noOrder - 1 if order is ignored (both directions counted), 0 if order is enforced
26             # Output:
27             # \@statsList - ref to an array of \@stats, refs to arrays
28             # containing the ordered values: n11, n1p, np1, npp
29             # for each of the pair hashes. The index of the
30             # \@statsList corresponds to the index of the pairHash
31             # in the input $pairHashListRef
32             sub getStats {
33 4     4 0 4 my $pairHashListRef = shift;
34 4         5 my $matrixFileName = shift;
35 4         5 my $noOrder = shift;
36              
37             #Read in all stats
38 4         6 my ($n1pRef, $np1Ref, $npp, $matrixRef, $linkingPairHashListRef) = &UMLS::Association::StatFinder::getLinkingTermsPairHashList($pairHashListRef, $matrixFileName, $noOrder, 1, 0);
39              
40             #compute stats for each pairHash
41 4         4 my @statsList = ();
42 4         5 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  10         15  
43 6         3 my $pairHashRef = ${$pairHashListRef}[$i];
  6         7  
44 6         5 my $linkingPairHashRef = ${$linkingPairHashListRef}[$i];
  6         7  
45              
46             #get the stats for this pair hash
47 6         10 push @statsList, &_statsFromAllLinkingInfo($pairHashRef, $linkingPairHashRef, $n1pRef, $np1Ref, $npp, $matrixRef, $noOrder);
48             }
49              
50 4         22 return \@statsList;
51             }
52              
53              
54             # Gets stats (n11,n1p,np1,npp) for a single pairHash using the
55             # precomputed linkingPairHash (from StatFinder::getLinkingTermsPairHashList)
56             # Input:
57             # $pairHashRef - ref to a pairHash
58             # $linkingPairHashRef - ref to the linking terms pair hash for this pairHash
59             # $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced
60             # $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced
61             # $npp - npp for the subGraphRef
62             # $subGraphRef - ref to the subgraph or matrix read in
63             # $noOrder - 1 if order is ignored (both directions counted), 0 if order is enforced
64             # Output:
65             # \@stats - ref to an array of (n11,n1p,np1,npp)
66             sub _statsFromAllLinkingInfo {
67 6     6   6 my $pairHashRef = shift;
68 6         6 my $linkingPairHashRef = shift;
69 6         6 my $n1pRef = shift;
70 6         5 my $np1Ref = shift;
71 6         5 my $npp = shift;
72 6         5 my $subGraphRef = shift;
73 6         5 my $noOrder = shift;
74              
75             ###############################
76             # Find Shared B Terms
77             ###
78             # Find the overlapping (shared) Co-occurrences
79             #grab terms from set1
80 6         7 my %set1Terms = ();
81 6         4 foreach my $cui (@{${$linkingPairHashRef}{'set1'}}) {
  6         13  
  6         10  
82 16         15 $set1Terms{$cui} = 1;
83             }
84              
85             #find the overlapping B terms and save as an array
86 6         7 my %sharedBTerms = ();
87 6         4 foreach my $cui (@{${$linkingPairHashRef}{'set2'}}) {
  6         4  
  6         9  
88 20 100       37 if (exists $set1Terms{$cui}) {
89 10         10 $sharedBTerms{$cui} = 1;
90             }
91             }
92              
93             ###############################
94             # Calculate Stats
95             ###
96 6         8 my $n11 = &_calculateN11($subGraphRef, $pairHashRef, \%sharedBTerms, $noOrder);
97 6         9 my $n1p = &_calculateN1P($subGraphRef, $pairHashRef, $n1pRef, $noOrder);
98 6         9 my $np1 = &_calculateNP1($subGraphRef, $pairHashRef, $np1Ref, $noOrder);
99              
100             #pack and save the stats for this pair hash
101 6         7 my @stats = ($n11, $n1p, $np1, $npp);
102 6         13 return \@stats;
103             }
104              
105              
106             # Calculates N11 for a pairHash
107             # Input:
108             # $subGraphRef - ref to the subgraph or matrix read in
109             # $pairHashRef - ref to a pairHash
110             # $sharedCoocRef - ref to hash{cui} = 1 of all shared B terms
111             # $noOrder - 1 if order is ignored (both directions counted), 0 if order is enforced
112             # Output:
113             # $n11 - n11 for this pairHash
114             sub _calculateN11 {
115             #grab params
116 6     6   7 my $subGraphRef = shift;
117 6         3 my $pairHashRef = shift;
118 6         18 my $sharedCoocRef = shift;
119 6         6 my $noOrder = shift;
120            
121             #calculate n11 as the sum, over all shared B terms, of the minimum of the A-to-B and B-to-C weights
122 6         3 my $n11 = 0;
123             #my $count = 0;
124 6         6 foreach my $bNode (keys %{$sharedCoocRef}) {
  6         8  
125             #get the a to b value, which is the sum of all a_i to b
126 10         10 my $abVal = 0;
127 10         9 my $counted = 0;
128 10         7 foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
  10         8  
  10         13  
129 14         10 my $counted = 0;
130 14 100       11 if (exists ${${$subGraphRef}{$aNode}}{$bNode}) {
  14         14  
  14         31  
131 12         11 $abVal += ${${$subGraphRef}{$aNode}}{$bNode};
  12         12  
  12         12  
132             }
133 14 100       21 if ($noOrder) {
134             #avoid double counting either self references
135             # or overlapping set references
136 7 50       8 if ($counted == 0) {
137             #increment for noorder
138 7 50       7 if (exists ${${$subGraphRef}{$bNode}}{$aNode}) {
  7         4  
  7         26  
139 0         0 $abVal += ${${$subGraphRef}{$bNode}}{$aNode};
  0         0  
  0         0  
140             }
141             }
142             }
143             }
144            
145             #get the b to C value, which is the sum of all b to c_i
146 10         11 my $bcVal = 0;
147 10         9 foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
  10         10  
  10         11  
148 16         12 my $counted = 0;
149             #get the b to c value
150 16 100       23 if (exists ${${$subGraphRef}{$bNode}}{$cNode}) {
  16         13  
  16         26  
151 12         10 $bcVal += ${${$subGraphRef}{$bNode}}{$cNode};
  12         8  
  12         12  
152 12         10 $counted = 1;
153             }
154 16 100       22 if ($noOrder) {
155             #avoid double counting either self references
156             # or overlapping set references
157 8 100       11 if ($counted == 0) {
158 2 50       2 if (exists ${${$subGraphRef}{$cNode}}{$bNode}) {
  2         2  
  2         4  
159 0         0 $bcVal += ${${$subGraphRef}{$cNode}}{$bNode};
  0         0  
  0         0  
160             }
161             }
162             }
163             }
164              
165             #get the minimum value and increment n11
166             #find the min
167 10         11 my $min = $abVal;
168 10 50       11 if ($bcVal < $min) {
169 0         0 $min = $bcVal;
170             }
171             #increment n11
172 10         11 $n11 += $min;
173             #$count++;
174             }
175            
176             #NOTE - count can be deleted completely from this, but
177             # re-enable the divide by count if you want to compute AMW (then just return n11)
178             #if ($count > 0) {
179             # $n11 /= $count;
180             # }
181            
182 6         9 return $n11;
183             }
184              
185              
186             #calculates N1P for a pairHash
187             # Input:
188             # $subGraphRef - ref to the subgraph or matrix read in
189             # $pairHashRef - ref to a pairHash
190             # $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced
191             # $noOrder - 1 if order is ignored (both directions counted), 0 if order is enforced
192             # Output:
193             # $n1p - n1p for this pairHash
194             sub _calculateN1P {
195 6     6   6 my $subGraphRef = shift;
196 6         4 my $pairHashRef = shift;
197 6         5 my $n1pRef = shift;
198 6         6 my $noOrder = shift;
199            
200             #NOTE - two methods, one if we record n1p, one if we don't
201             #calculate $n1p as the sum of all set1 cooc
202             =comment
203             my $n1p = 0;
204             #find all a to b co-occurrences
205             foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
206             foreach my $bNode (keys @{$linkingTermsRef}) {
207             $n1p += ${${$subGraphRef}{$aNode}}{$bNode};
208             }
209             }
210             =cut
211 6         4 my $n1p = 0;
212             #find all a to b co-occurrences
213 6         6 foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
  6         5  
  6         8  
214 8         30 $n1p += ${$n1pRef}{$aNode};
  8         8  
215             }
216 6 100       13 if ($noOrder) {
217             #convert the pair hash array to a hash
218 3         4 my %set1 = ();
219 3         2 foreach my $key (@{${$pairHashRef}{'set1'}}) {
  3         2  
  3         4  
220 4         9 $set1{$key} = 1;
221             }
222              
223             #find all b to a co-occurrences
224 3         4 foreach my $bNode (keys %{$subGraphRef}) {
  3         6  
225 25         17 foreach my $aNode (@{${$pairHashRef}{'set1'}}) {
  25         22  
  25         25  
226             #avoid double counting self co-occurrences
227 35 100 66     63 if (exists $set1{$aNode} && exists $set1{$bNode}) {
228 6         7 next;
229             }
230              
231             #increment n1p
232 29 50       19 if (defined ${${$subGraphRef}{$bNode}}{$aNode}) {
  29         25  
  29         44  
233 0         0 $n1p += ${${$subGraphRef}{$bNode}}{$aNode};
  0         0  
  0         0  
234             }
235             }
236             }
237             }
238              
239 6         7 return $n1p;
240             }
241              
242             # Calculates NP1 for a pair hash
243             # Input:
244             # $subGraphRef - ref to the subgraph or matrix read in
245             # $pairHashRef - ref to a pairHash
246             # $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced
247             # $noOrder - 1 if order is ignored (both directions counted), 0 if order is enforced
248             # Output:
249             # $np1 - np1 for this pairHash
250             sub _calculateNP1 {
251 6     6   5 my $subGraphRef = shift;
252 6         6 my $pairHashRef = shift;
253 6         4 my $np1Ref = shift;
254 6         6 my $noOrder = shift;
255            
256             #NOTE - two methods, one if we record np1, one if we don't
257             #calculate $np1 as the sum of all set2 cooc
258             =comment
259             my $np1 = 0;
260             #find all b to c co-occurrences
261             foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
262             foreach my $bNode (keys @{$linkingTermsRef}) {
263             $np1 += ${${$subGraphRef}{$bNode}}{$cNode};
264             }
265             }
266             =cut
267 6         4 my $np1 = 0;
268             #find all b to c co-occurrences
269 6         5 foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
  6         6  
  6         7  
270 10         10 $np1 += ${$np1Ref}{$cNode};
  10         10  
271             }
272 6 100       7 if ($noOrder) {
273             #convert the pair hash array to a hash
274 3         4 my %set2 = ();
275 3         3 foreach my $key (@{${$pairHashRef}{'set2'}}) {
  3         3  
  3         4  
276 5         5 $set2{$key} = 1;
277             }
278              
279             #find all c to b co-occurrences
280 3         8 foreach my $cNode (@{${$pairHashRef}{'set2'}}) {
  3         3  
  3         6  
281 5         4 foreach my $bNode (keys %{${$subGraphRef}{$cNode}}) {
  5         4  
  5         11  
282              
283             #avoid double counting pointing to self
284 3 50 33     15 if (exists $set2{$bNode} && exists $set2{$cNode}) {
285 3         4 next;
286             }
287            
288             #increment $np1
289 0         0 $np1 += ${${$subGraphRef}{$cNode}}{$bNode};
  0         0  
  0         0  
290             }
291             }
292             }
293              
294 6         8 return $np1;
295             }
296              
297              
298             # Calculates NPP for a subGraph (dataset)
299             # Input:
300             # $subGraphRef - ref to the subgraph or matrix read in
301             # Output:
302             # $npp - npp for this dataset
303             sub _calculateNPP {
304 0     0     my $subGraphRef = shift;
305              
306             #calculate npp as the total number of cooccurrences
307 0           my $npp = 0;
308 0           foreach my $key1 (keys %{$subGraphRef}) {
  0            
309 0           foreach my $key2 (keys %{${$subGraphRef}{$key1}}) {
  0            
  0            
310 0           $npp += ${${$subGraphRef}{$key1}}{$key2};
  0            
  0            
311             }
312             }
313 0           return $npp;
314             }
315              
316             1;