File Coverage

blib/lib/UMLS/Association/Measures/WSA.pm
Criterion Covered Total %
statement 161 161 100.0
branch 23 24 95.8
condition 8 9 88.8
subroutine 6 6 100.0
pod 0 1 0.0
total 198 201 98.5


line stmt bran cond sub pod time code
1             #UMLS::Association::Measures::WSA
2             # Computes the Weighted Set Association (WSA) between two sets of terms.
3             # WSA finds the shared linking terms between A and C and weights those
4             # terms based on their association to A. Each B term therefore has a
5             # weight relative to its association with A, which is multiplied by
6             # its n11,n1p,np1 to make more associated terms more or less important.
7             # The shared B to C set associaiton is then found using the weighted B
8             # terms to produce the final association score.
9 1     1   4 use strict;
  1         1  
  1         18  
10 1     1   3 use warnings;
  1         1  
  1         808  
11              
12             package UMLS::Association::Measures::WSA;
13              
14              
15             # Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList
16             # using linking set association (LSA)
17             # Input:
18             # $pairHashListRef - ref to an array of pairHashes
19             # $matrixFileName - the fileName of the co-occurrence matrix
20             # $noOrder - 1 if order is enforced, 0 if not
21             # $paramsRef - the params used to create UMLS::Association which
22             # are used when finding the A to B weights
23             # Output:
24             # \@statsList - ref to an array of \@stats, refs to arrays
25             # containing the ordered values: n11, n1p, np1, npp
26             # for each of the pair hashes. The index of the
27             # \@statsList corresponds to the index of the pairHash
28             # in the input $pairHashListRef
29             sub getStats {
30 2     2 0 2 my $pairHashListRef = shift;
31 2         2 my $matrixFileName = shift;
32 2         3 my $noOrder = shift;
33 2         2 my $paramsRef = shift;
34              
35             ################## STEP 1 #######################
36             ######### Find the linking (B) terms ###########
37             #################################################
38             #read in the matrix - FILE READ ONE
39 2         3 my ($matrixRef, $vocabSize) = &UMLS::Association::StatFinder::readInMatrix($pairHashListRef, $matrixFileName);
40            
41             #construct A to shared B pair hashListRef
42 2         3 my @newPairHashList = ();
43 2         2 my @sharedCoocs = ();
44 2         2 foreach my $pairHashRef (@{$pairHashListRef}) {
  2         2  
45             #get the linking terms and shared linking terms
46             #MATRIX PASS 1
47 2         4 my ($set1CoocRef, $set2CoocRef) = &_getLinkingTermSets(
48             $pairHashRef, $matrixRef, $noOrder);
49 2         3 my $sharedCoocRef = &_getSharedLinkingTerms(
50             $set1CoocRef, $set2CoocRef);
51 2         2 my @sharedTerms = keys %{$sharedCoocRef};
  2         3  
52 2         3 push @sharedCoocs, \@sharedTerms;
53              
54             #construct pair hashes
55 2         2 foreach my $term (keys %{$sharedCoocRef}) {
  2         2  
56 5         5 my %newPairHash = ();
57 5         5 $newPairHash{'set1'} = ${$pairHashRef}{'set1'};
  5         7  
58              
59 5         4 my @set2 = ();
60 5         5 push @set2, $term;
61 5         6 $newPairHash{'set2'} = \@set2;
62 5         7 push @newPairHashList, \%newPairHash;
63             }
64             }
65             #Now we have a new pair hash ref which we will get associations
66             # for. This is setA to each B linking term. It is arranged
67             # such that you iterate over the shared terms of each pair hash
68             # to get A to each B for that term (e.g. pairHash1 has 10 linking
69             # terms, the first 10 pairHashes are A a single B. The B terms
70             # are ordered in the sharedCoocs Array of Arrays
71             #Doing it in this manner allows for WSA to be calculated in 3
72             # file reads and number pair hashes + 2 passes of the matrix
73              
74             ################## STEP 2 #######################
75             ####### Find the Weight of each B term ########
76             #################################################
77             #get A to shared B associations for all possible linking B terms
78             # FILE READ 2 - MATRIX PASS 2 (calculateAssociation_pairHashList)
79 2         2 my %optionsHash = ();
80 2 100 66     8 if (defined $noOrder && $noOrder > 0) { $optionsHash{'noorder'} = 1; }
  1         1  
81 2         2 $optionsHash{'measure'} = ${$paramsRef}{'measure'};
  2         3  
82 2         2 $optionsHash{'matrix'} = ${$paramsRef}{'matrix'};
  2         3  
83 2         11 my $assoc = UMLS::Association->new(\%optionsHash);
84 2         3 my $aToBScoresRef = $assoc->_calculateAssociation_pairHashList(\@newPairHashList, ${$paramsRef}{'measure'});
  2         5  
85              
86             #Normalize the weights unless told not to
87 2         2 my $weightIterator = 0;
88 2         3 my $reweightIterator = 0;
89 2 50       4 if (!$paramsRef->{'nonorm'}) {
90             #normalize the weights for each pairhash
91 2         2 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  4         5  
92             #get all set B term weights as a hash{term}=weight
93 2         3 my %pairWeights = ();
94 2         2 foreach my $term (@{$sharedCoocs[$i]}) {
  2         2  
95 5         4 $pairWeights{$term} = ${$aToBScoresRef}[$weightIterator];
  5         5  
96 5         5 $weightIterator++;
97             }
98            
99             #scale the weights between 0 and 1 (weight/sum), so weight
100             # becomes a percentage of the total weights
101             # I need to keep weights <= 1 to maintain correctness
102             # of stats (npp in particular, but others as well?)
103 2         3 my $sum = 0;
104 2         3 foreach my $cui (keys %pairWeights) {
105 5         6 $sum += $pairWeights{$cui};
106             }
107 2         3 foreach my $cui (keys %pairWeights) {
108 5         5 ${$aToBScoresRef}[$reweightIterator] /= $sum;
  5         4  
109 5         7 $reweightIterator++;
110             }
111             }
112             ##### Now we have the normalized weights
113             }
114              
115             #So now we have the weights for all B terms and for each pair hash. Next
116             # step is to weight the subgraph using these weights for each
117             # pairhash and then calculate the B to C direct assocition
118             # using each of those re-weighted sub graphs
119              
120             ################## STEP 3 #######################
121             ####### Find the WSA between B and C ##########
122             #################################################
123             # Create the B to C pair hash and read in the matrix of B to C terms
124             # MATRIX READ 3 - reqiured because of links between the linking set terms
125             # (e.g. edge 3->4 in sample4. This becomes a source sink if matrix isnt
126             # read in again
127 2         3 my @bToCPairHashList = ();
128 2         2 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  4         5  
129             #construct the B to C pair Hash
130 2         2 my %pairHash = ();
131 2         2 $pairHash{'set1'} = $sharedCoocs[$i];
132 2         2 $pairHash{'set2'} = ${${$pairHashListRef}[$i]}{'set2'};
  2         2  
  2         2  
133 2         4 push @bToCPairHashList, \%pairHash;
134             }
135 2         3 ($matrixRef, $vocabSize) = &UMLS::Association::StatFinder::readInMatrix(\@bToCPairHashList, $matrixFileName);
136              
137             # MATRIX PASS +numPairHashes - to calculate WSA we need to
138             # reweight the matrix differently for each pairHash
139             #get WSA Stats (n11,n1p,np1,npp) for each pairHash
140 2         6 $weightIterator = 0;
141 2         3 my @statsList = ();
142 2         2 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  4         7  
143             #get all set B term weights as a hash{term}=weight
144 2         2 my %weights = ();
145 2         2 foreach my $term (@{$sharedCoocs[$i]}) {
  2         4  
146 5         3 $weights{$term} = ${$aToBScoresRef}[$weightIterator];
  5         7  
147 5         5 $weightIterator++;
148             }
149              
150             #get the weighted subgraph
151 2         4 my $weightedSubGraphRef = &_constructWeightedSubGraph($matrixRef, $bToCPairHashList[$i], \%weights);
152              
153             #calculate n11, n1p, np1, npp, using the weights specific to
154             # this pair hash, and save the results
155 2         4 my ($n1pRef, $np1Ref, $npp) = &UMLS::Association::Measures::Direct::_getAllCounts($weightedSubGraphRef);
156 2         5 push @statsList, &UMLS::Association::Measures::Direct::_statsFromAllCounts(
157             $weightedSubGraphRef, $n1pRef, $np1Ref, $npp, $noOrder, $bToCPairHashList[$i], \%weights);
158             }
159            
160             #return the stats list, an array of array refs
161             # each array ref conatins four values:
162             # n11, n1p, np1, and npp for the pair hash at
163             # the corresponding index in the pairHashList
164 2         10 return \@statsList;
165             }
166              
167              
168              
169             ##################################################
170             # Sub Graph Construction
171             ##################################################
172             #builds a subgraph relevant to this pair hash this includes adding
173             # cuis in other pair hashes to the universal source/sink, and collapsing
174             # edges to create set-nodes rather than cui nodes This also takes care of
175             # noOrder weights contains - hash{term} = weight
176             # Input:
177             # $matrixRef - ref to a matrix from which we construct a subgraph
178             # $pairHashRef - ref to a pairHash
179             # $weightsRef - ref to a hash{cui} = weight of that cui
180             # Output:
181             # \%subGraph - a weighted subgraph for this pairHash
182             sub _constructWeightedSubGraph {
183 2     2   3 my $matrixRef = shift;
184 2         2 my $pairHashRef = shift;
185 2         2 my $weightsRef = shift;
186              
187             #convert the pair hash to two hashes of cuis
188 2         2 my %set1 = ();
189 2         2 foreach my $key (@{${$pairHashRef}{'set1'}}) {
  2         2  
  2         4  
190 5         5 $set1{$key} = 1;
191             }
192 2         3 my %set2 = ();
193 2         2 foreach my $key (@{${$pairHashRef}{'set2'}}) {
  2         2  
  2         4  
194 4         5 $set2{$key} = 1;
195             }
196            
197             # Restrict graph to nodes in this pairhash. That is,
198             # set any nodes outside of sets1 and 2 to be the
199             # universal source and sink
200             #initalize the sub graph
201 2         2 my %subGraph = ();
202 2         3 my %emptyHash = ();
203 2         3 $subGraph{'source'} = \%emptyHash;
204              
205             #loop through all source and targets, and if not in
206             # either of the sets, replace with the universal
207             # sink or source
208 2         3 foreach my $source (keys %{$matrixRef}) {
  2         5  
209             #convert source to the universal source
210             # node if it is not in this pair hash
211 15         15 my $newSource = $source;
212 15 100 100     28 if (!exists $set1{$source} && !exists $set2{$source}) {
213 6         6 $newSource = 'source';
214             }
215            
216             #go through all targets for this source
217 15         16 foreach my $target (keys %{${$matrixRef}{$source}}) {
  15         10  
  15         23  
218             #convert to universal sink if node is
219             # not in this pair hash
220 18         17 my $newTarget = $target;
221 18 100 100     28 if (!exists $set1{$target} && !exists $set2{$target}) {
222 5         11 $newTarget = 'sink';
223             }
224            
225             #weights the value (if both source and target
226             # have weights, then weight is the their product)
227 18         17 my $value = ${${$matrixRef}{$source}}{$target};
  18         13  
  18         20  
228 18 100       16 if (defined ${$weightsRef}{$source}){
  18         20  
229 7         8 $value *= ${$weightsRef}{$source};
  7         7  
230             }
231 18 100       16 if (defined ${$weightsRef}{$target}) {
  18         20  
232 9         7 $value *= ${$weightsRef}{$target};
  9         10  
233             }
234            
235             #add the value to the subgraph
236 18         18 ${$subGraph{$newSource}}{$newTarget} += $value;
  18         23  
237             }
238             }
239             #At this point, the sub graph has been converted, such that
240             # it contains only the nodes in this pair hash. All other nodes
241             # have been converted to the universal source and univerals sink
242              
243             #return the subgraph
244 2         5 return \%subGraph;
245             }
246              
247              
248             ##################################################
249             # Linking Set Acquisition
250             ##################################################
251             # Find the linking terms (direct co-occurrences) between sets 1 and
252             # sets 2 and outputs them as co-occurrence hashes (hash{cui}=1)
253             # Input:
254             # $pairHashRef - ref to a pairHash
255             # $matrixRef - ref to the read in co-occurrence matrix
256             # $noOrder - 1 if order is enforced, 0 if not
257             # Output:
258             # \%set1Cooc - a hash{cui}=1 of all of set 1's direct co-occurrences
259             # (order/noOrder is accounted for)
260             # \%set2Cooc - a hash{cui}=1 of all of set 2's direct co-occurrences
261             # (order/noOrder is accounted for)
262             sub _getLinkingTermSets {
263 2     2   3 my $pairHashRef = shift;
264 2         2 my $matrixRef = shift;
265 2         2 my $noOrder = shift;
266              
267             #convert pair hash to sets 1 and 2 hashes
268 2         3 my %set1 = ();
269 2         2 foreach my $node (@{${$pairHashRef}{'set1'}}) {
  2         1  
  2         4  
270 4         6 $set1{$node} = 1;
271             }
272 2         2 my %set2 = ();
273 2         2 foreach my $node (@{${$pairHashRef}{'set2'}}) {
  2         1  
  2         3  
274 4         4 $set2{$node} = 1;
275             }
276              
277             #get all co-occurring terms with set1 and set2
278 2         3 my %set1Cooc = ();
279 2         3 my %set2Cooc = ();
280             #check all nodes in the dataset
281 2         2 foreach my $source (keys %{$matrixRef}) {
  2         4  
282 16         16 foreach my $target (keys %{${$matrixRef}{$source}}) {
  16         12  
  16         23  
283             #add co-occurrences to set1 and set2
284 18 100       21 if (exists $set1{$source}) {
285 6         7 $set1Cooc{$target} = 1;
286             }
287 18 100       20 if (exists $set2{$target}) {
288 4         4 $set2Cooc{$source} = 1;
289             }
290              
291             #if noorder, add co-occurrences
292             # to set1 and set2
293 18 100       21 if ($noOrder) {
294 9 100       11 if (exists $set1{$target}) {
295 1         1 $set1Cooc{$source} = 1;
296             }
297 9 100       11 if (exists $set2{$source}) {
298 2         2 $set2Cooc{$target} = 1;
299             }
300             }
301             }
302             }
303              
304             #return the two co-occurring sets
305 2         4 return (\%set1Cooc, \%set2Cooc);
306             }
307              
308             # Finds the shared co-occurrences between the two input co-occurrence hashes
309             # Input:
310             # \%set1Cooc - a hash{cui}=1 of all of set 1's direct co-occurrences
311             # (order/noOrder is accounted for)
312             # \%set2Cooc - a hash{cui}=1 of all of set 2's direct co-occurrences
313             # (order/noOrder is accounted for)
314             # Output:
315             # \%sharedCooc - a hash{cui}=1 of the shared co-occurrences between
316             # the input co-occurrence hashes
317             sub _getSharedLinkingTerms {
318 2     2   2 my $set1CoocRef = shift;
319 2         2 my $set2CoocRef = shift;
320            
321             #get the shared linking terms between
322             # set1 and set2 co-occurrences
323 2         2 my %sharedCooc = ();
324 2         2 foreach my $node (keys %{$set1CoocRef}) {
  2         3  
325 6 100       6 if (defined ${$set2CoocRef}{$node}) {
  6         7  
326 5         6 $sharedCooc{$node} = 1;
327             }
328             }
329 2         3 return \%sharedCooc;
330             }
331              
332              
333             1;