File Coverage

blib/lib/UMLS/Association/StatFinder.pm
Criterion Covered Total %
statement 241 247 97.5
branch 48 58 82.7
condition 10 15 66.6
subroutine 17 17 100.0
pod 0 4 0.0
total 316 341 92.6


line stmt bran cond sub pod time code
1             #UMLS::Association
2             #
3             # Perl module for scoring the semantic association of terms in the Unified
4             # Medical Language System (UMLS).
5             #
6             # Copyright (c) 2015
7             #
8             # Sam Henry, Virginia Commonwealth University
9             # henryst at vcu.edu
10             #
11             # Bridget McInnes, Virginia Commonwealth University
12             # btmcinnes at vcu.edu
13             #
14             # Keith Herbert, Virginia Commonwealth University
15             # herbertkb at vcu.edu
16             #
17             # Alexander D. McQuilkin, Virginia Commonwealth University
18             # alexmcq99 at yahoo.com
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36             package UMLS::Association::StatFinder;
37              
38 1     1   4 use Fcntl;
  1         1  
  1         120  
39 1     1   4 use strict;
  1         1  
  1         13  
40 1     1   3 use warnings;
  1         1  
  1         22  
41 1     1   3 use bytes;
  1         1  
  1         8  
42 1     1   15 use File::Spec;
  1         1  
  1         25  
43              
44 1     1   299 use UMLS::Association::Measures::Direct;
  1         2  
  1         19  
45 1     1   280 use UMLS::Association::Measures::LTA;
  1         2  
  1         18  
46 1     1   308 use UMLS::Association::Measures::MWA;
  1         2  
  1         18  
47 1     1   283 use UMLS::Association::Measures::SBC;
  1         2  
  1         17  
48 1     1   272 use UMLS::Association::Measures::LSA;
  1         1  
  1         17  
49 1     1   292 use UMLS::Association::Measures::WSA;
  1         2  
  1         1237  
50              
51             # error handling variables
52             my $errorhandler = "";
53             my $pkg = "UMLS::Association::StatFinder";
54              
55             #NOTE: every global variable is followed by a _G with the
56             # exception of debug error handler, and constants which are all caps
57             # global variables
58             my $debug = 0; #in debug mode or not
59              
60             #global options variables
61             my $lta_G = 0; #1 or 0 is using lta or not
62             my $mwa_G = 0; #1 or 0 if using mwa or not
63             my $lsa_G = 0; #1 or 0 if using lsa or not
64             my $sbc_G = 0; #1 or 0 if using sbc or not
65             my $wsa_G = 0; #1 or 0 if using wsa or not
66             my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not
67             my $matrix_G = 0; #matrix file name is using a matrix file rather than DB
68             my $params_G; #stores all params (if needed)
69              
70             ######################################################################
71             # Initialization Functions
72             ######################################################################
73             # method to create a new UMLS::Association::StatFinder object
74             # input : $params <- reference to hash of database parameters
75             # output: $self
76             sub new {
77             #grab params and create self
78 24     24 0 24 my $self = {};
79 24         24 my $className = shift;
80 24         23 my $params = shift;
81              
82             #bless the object.
83 24         40 bless($self, $className);
84              
85             #initialize error handler
86 24         33 $errorhandler = UMLS::Association::ErrorHandler->new();
87 24 50       34 if(! defined $errorhandler) {
88 0         0 print STDERR "The error handler did not get passed properly.\n";
89 0         0 exit;
90             }
91              
92             # initialize the object.
93 24         24 $debug = 0;
94 24         39 $self->_initialize($params);
95 24         33 return $self;
96             }
97              
98             # method to initialize the UMLS::Association::StatFinder object.
99             # input : $parameters <- reference to a hash of database parameters
100             # output: none, but $self is initialized
101             sub _initialize {
102             #grab parameters
103 24     24   34 my $self = shift;
104 24         20 my $paramsRef = shift;
105 24         31 my %params = %{$paramsRef};
  24         81  
106              
107             #set global variables using option hash
108 24         39 $lta_G = $params{'lta'};
109 24         20 $mwa_G = $params{'mwa'};
110 24         19 $lsa_G = $params{'lsa'};
111 24         21 $sbc_G = $params{'sbc'};
112 24         23 $wsa_G = $params{'wsa'};
113 24         22 $noOrder_G = $params{'noorder'};
114 24         22 $matrix_G = $params{'matrix'};
115 24         22 $params_G = $paramsRef;
116            
117             #error checking
118 24         21 my $function = "_initialize";
119 24         34 &_debug($function);
120 24 50 33     54 if(!defined $self || !ref $self) {
121 0         0 $errorhandler->_error($pkg, $function, "", 2);
122             }
123 24 50       46 if (!defined $matrix_G) {
124 0         0 die ("ERROR: co-occurrence matrix must be defined\n");
125             }
126             }
127              
128             sub _debug {
129 24     24   21 my $function = shift;
130 24 50       31 if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; }
  0         0  
131             }
132              
133             ######################################################################
134             # public interface to get observed counts
135             ######################################################################
136              
137             # Gets observed counts (n11, n1p, np1, npp) of the cui sets
138             # input: $pairHashListRef - a ref to an array of pairHashes
139             # output: \@allStatsRef - a ref to an array of observed counts 4-tuples
140             # each 4-tuple consists of in order:
141             # $n11, $n1p, $np1, and $npp
142             # and they correspond to the observed counts of
143             # each of the pairHashes passed in
144             sub getObservedCounts {
145             #grab parameters
146 24     24 0 24 my $self = shift;
147 24         19 my $pairHashListRef = shift;
148              
149             #error checking
150 24         22 my $function = "getObservedCounts";
151 24 50 33     53 if(!defined $self || !ref $self) {
152 0         0 $errorhandler->_error($pkg, $function, "", 2);
153             }
154              
155             #calculate n11, n1p, np1, npp
156 24         23 my $allStatsRef = -1;
157 24 100       41 if ($lta_G) {
    100          
    100          
    100          
    100          
158 4         5 $allStatsRef = &UMLS::Association::Measures::LTA::getStats($pairHashListRef, $matrix_G, $noOrder_G);
159             }
160             elsif ($mwa_G) {
161 4         6 $allStatsRef = &UMLS::Association::Measures::MWA::getStats($pairHashListRef, $matrix_G, $noOrder_G);
162             }
163             elsif ($lsa_G) {
164 5         8 $allStatsRef = &UMLS::Association::Measures::LSA::getStats($pairHashListRef, $matrix_G, $noOrder_G);
165             }
166             elsif ($sbc_G) {
167 4         8 $allStatsRef = &UMLS::Association::Measures::SBC::getStats($pairHashListRef, $matrix_G, $noOrder_G);
168             }
169             elsif ($wsa_G) {
170 2         3 $allStatsRef = &UMLS::Association::Measures::WSA::getStats($pairHashListRef, $matrix_G, $noOrder_G, $params_G);
171             }
172             else {
173 5         8 $allStatsRef = &UMLS::Association::Measures::Direct::getStats($pairHashListRef, $matrix_G, $noOrder_G);
174             }
175              
176             #return a reference to a list of stats for each pairHash
177 24         45 return $allStatsRef;
178             }
179              
180             ####NOTE: fine for direct, LTA, and MWA
181             #NOT for SBC and LSA
182             #Reads in the matrix values that are needed
183             # anything not in the pairHashListRef is counted
184             # towards a universal source and/or universal sink node
185             # Input: $pairHashListRef - reference to the pairHashList
186             # $matrixFileName - fileName of the matrix
187             # Output: \%matrix - matrix as read in. Matrix is stored
188             # as a hash of hashes, where hash{node}
189             # contains all a hash of all outgoing edges
190             # from that node, such that:
191             # ${$hash{$source}}{$target} = $weight
192             sub readInMatrix {
193 18     18 0 15 my $pairHashListRef = shift;
194 18         27 my $matrixFileName = shift;
195              
196             #convert the pairhash list to a list of leading and
197             # trailing cuis
198 18         17 my %leadingCuis = ();
199 18         18 my %trailingCuis = ();
200 18         18 my %allCuis = ();
201 18         16 foreach my $pairHashRef(@{$pairHashListRef}) {
  18         43  
202 25         22 foreach my $cui(@{${$pairHashRef}{'set1'}}) {
  25         49  
  25         33  
203 52         49 $leadingCuis{$cui} = 1;
204 52         56 $allCuis{$cui} = 1;
205             }
206 25         23 foreach my $cui(@{${$pairHashRef}{'set2'}}) {
  25         21  
  25         25  
207 48         47 $trailingCuis{$cui} = 1;
208 48         51 $allCuis{$cui} = 1;
209             }
210             }
211            
212             #Initalize the matrix, and ensure there is a source node
213 18         17 my %matrix = ();
214 18         18 my %sourceHash = ();
215 18         22 $matrix{'source'} = \%sourceHash;
216 18         19 ${$matrix{'source'}}{'sink'} = 0;
  18         22  
217            
218             #also get a count of vocabulary when reading
219 18         29 my %vocabulary = ();
220              
221             #### Read in matrix, longest execution time, so keep this fast
222             #read in all matrix values associated with all
223             # cuis needed. For co-occurrences outside of the
224             # needed set, created a universal source and sink
225 18 50       450 open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n";
226 18         199 while (my $line = ) {
227             #get cuis and value from the line
228 179         192 chomp $line;
229 179         346 my ($cui1, $cui2, $num) = split /\t/, $line;
230            
231             #update the vocabulary
232 179         212 $vocabulary{$cui1} = 1;
233 179         178 $vocabulary{$cui2} = 1;
234              
235             #if cui1 or cui2 are not cuis of interest, replace
236             # them with universal source/sink
237 179 100 100     275 if (!exists $allCuis{$cui1} && !exists $allCuis{$cui2}) {
238 33         33 $cui1 = 'source';
239 33         27 $cui2 = 'sink';
240             }
241              
242             #record the co-occurrence as a directed weighted
243             # weighted edge list ${hash{n1}}{n2}=$val;
244 179 100       205 if (!exists $matrix{$cui1}) {
245 103         96 my %emptyHash = ();
246 103         130 $matrix{$cui1} = \%emptyHash;
247             }
248 179         157 ${$matrix{$cui1}}{$cui2} += $num;
  179         423  
249             }
250 18         111 close IN;
251             ##### END reading in matrix
252              
253 18         106 return \%matrix, (scalar keys %vocabulary);
254             }
255              
256              
257             #gets a new pair hash list, that is the linking terms of the
258             # original pair hashes
259             # Input:
260             # $pairHashListRef - ref to an array of pairHashes
261             # $matrixFileName - the fileName of the co-occurrence matrix
262             # $noOrder - 1 if order is enforced, 0 if not
263             # $recordStats - 1 if stats are to be recorded (n1p for all, np1 for all, npp)
264             # $recordCounts - 1 if unique counts are to be used rather than co-occurrence
265             # counts (i.e. npp = vocab size, n1p = count of co-occurring
266             # terms rather than npp = all co-occurrences, n1p = sum of
267             # co-occurrences for a term).
268             # Output:
269             # output varies whether or not stats are recorded.
270             # If stats are not recorded:
271             # \@newPairHashList - an array of pair hashes, where each pairHash is
272             # the linking terms, B_A and B_C of the original pair hash at that
273             # index (order/NoOrder is accounted for in this list)
274             # If stats are recorded:
275             # \%n1p - n1p{cui} = n1p for that term (order enforced
276             # \%np1 - np1{cui} = np1 for that term (order enforced
277             # $npp - npp for this dataset (term counts or co-occurrence counts)
278             # \%matrix - the co-occurrence matrix for all A, B_A, B_C, and C terms
279             # this is essentially all possibly N11's (order enforced)
280             # \@newPairHashList - same as above
281             #NOTE: LSA and SBC do not need anything but newPairHashList
282             # LTA needs npp (with counts), newPairHashList
283             # MWA needs n1p, np1, npp, matrix, newPairHashList
284             # ...I kept everything in this method because calculating the newPairHashList
285             # of linking terms is fairly complicated and I didn't want to replicate code
286             # to make bug fixes/updates easier. For minor speed iprovements, and
287             # understandability, we could create 3 seperate methods, one for each of the
288             # required sets of info needed
289             sub getLinkingTermsPairHashList {
290 17     17 0 19 my $pairHashListRef = shift;
291 17         16 my $matrixFileName = shift;
292 17         15 my $noOrder = shift;
293 17         17 my $recordStats = shift;
294 17         16 my $recordCounts = shift;
295              
296             #####################################
297             #create data structures to make line reading fast
298             #####################################
299 17         15 my %allCuis = (); #a hash of all cuis we need to grab hash{cui}=1
300 17         13 my %set1Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set1
301 17         14 my %set2Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set2
302 17         17 my @newPairHashList = (); #the linking term pairHashList
303 17         16 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  42         65  
304 25         21 my $pairHashRef = ${$pairHashListRef}[$i];
  25         25  
305              
306             #initialize set1 index
307 25         25 my %set1Hash = ();
308 25         22 foreach my $cui(@{${$pairHashRef}{'set1'}}) {
  25         29  
  25         36  
309 33 50       53 if (!exists $set1Cuis{$cui}) {
310 33         29 my @emptyArray = ();
311 33         42 $set1Cuis{$cui} = \@emptyArray;
312 33         57 $allCuis{$cui} = 1;
313             }
314 33         32 push @{$set1Cuis{$cui}}, $i;
  33         44  
315             }
316             #initialize set2 index
317 25         25 my %set2Hash = ();
318 25         19 foreach my $cui(@{${$pairHashRef}{'set2'}}) {
  25         51  
  25         30  
319 41 50       54 if (!exists $set2Cuis{$cui}) {
320 41         37 my @emptyArray = ();
321 41         44 $set2Cuis{$cui} = \@emptyArray;
322 41         42 $allCuis{$cui}=1;
323             }
324 41         34 push @{$set2Cuis{$cui}}, $i;
  41         46  
325             }
326              
327             #initialize the parallel linking set pairHash
328 25         24 my %newPairHash = ();
329 25         22 my @set1Array = ();
330 25         51 $newPairHash{'set1'} = \@set1Array;
331 25         22 my @set2Array = ();
332 25         24 $newPairHash{'set2'} = \@set2Array;
333 25         38 push @newPairHashList, \%newPairHash;
334             }
335              
336             ##################################
337             ##### Initialize values for recording stats while reading the matrix
338             ##################################
339 17         27 my %n1p = ();
340 17         15 my %np1 = ();
341 17         16 my $npp = 0;
342 17         14 my %vocab = ();
343 17         17 my %matrix = ();
344             # We want to make sure that n1p and np1 have a value for
345             # all set1 and set2 terms, so initialize to 0
346             # ...and we want to do this here, so that it is only done once
347 17         17 for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  42         47  
348 25         25 my $pairHashRef = ${$pairHashListRef}[$i];
  25         22  
349 25         32 foreach my $cui(@{${$pairHashRef}{'set1'}}) {
  25         19  
  25         26  
350 33         36 $n1p{$cui}=0
351             }
352 25         30 foreach my $cui(@{${$pairHashRef}{'set2'}}) {
  25         20  
  25         30  
353 41         44 $np1{$cui}=0;
354             }
355             }
356              
357             ####################################
358             #### File Read in, where most execution time is spent, keep this fast
359             ####################################
360             #read in the linking sets from the matrix
361 17 50       464 open IN, $matrixFileName or die "Cannot open $matrixFileName for input: $!\n";
362 17         49 my @vals;
363 17         186 while (my $line = ) {
364             #get cuis and value from the line
365 200         371 @vals = (split /\t/, $line);
366             #$cui1 = $vals[0]
367             #$cui2 = $vals[1]
368             #$num = $vals[2]
369 200         217 chomp $vals[2];
370              
371             #update npp and vocab if needed
372 200 100       211 if ($recordStats) {
373 92 100       102 if ($recordCounts) {
374 46         41 $vals[2] = 1;
375 46         48 $vocab{$vals[0]} = 1;
376 46         44 $vocab{$vals[1]} = 1;
377             }
378 92         90 $npp += $vals[2];
379             }
380              
381             #If either of the CUIs are in the pairHashList, then record
382             # their co-occurrences, and additional stats if needed
383 200 100 100     381 if (exists $allCuis{$vals[0]} || exists $allCuis{$vals[1]}) {
384             #add any cui1 linking terms if needed
385 164 100       189 if (exists $set1Cuis{$vals[0]}) {
386             #cui1 is in one more or more set1s
387             # add to the linking term (cui2) to each pairhash
388             # that cui1 is in
389 74         63 foreach my $index (@{$set1Cuis{$vals[0]}}) {
  74         92  
390 74         68 push @{${$newPairHashList[$index]}{'set1'}}, $vals[1];
  74         58  
  74         129  
391             }
392             }
393              
394             #add any cui2 linking terms if needed
395 164 100       189 if (exists $set2Cuis{$vals[1]}) {
396             #cui2 is in one more or more set2s
397             # add to the linking term (cui1) to each pairhash
398             # that cui2 is in
399 96         73 foreach my $index (@{$set2Cuis{$vals[1]}}) {
  96         124  
400 96         69 push @{${$newPairHashList[$index]}{'set2'}}, $vals[0];
  96         77  
  96         135  
401             }
402             }
403            
404             #record n1p and np1 and npp if needed
405 164 100       180 if ($recordStats) {
406             #n1p and np1 are recorded for MWA only
407 80         91 $n1p{$vals[0]} += $vals[2];
408 80         79 $np1{$vals[1]} += $vals[2];
409              
410             #The matrix must be recorded to calculate MWA
411 80 50 66     128 if (exists $set1Cuis{$vals[0]} || exists $set2Cuis{$vals[1]}) {
412 80 100       93 if (!defined $matrix{$vals[0]}) {
413 52         49 my %emptyHash = ();
414 52         65 $matrix{$vals[0]} = \%emptyHash;
415             }
416 80         68 ${$matrix{$vals[0]}}{$vals[1]} = $vals[2];
  80         109  
417             }
418             }
419              
420             #add cuis if order doesnt matter
421 164 100       262 if ($noOrder) {
422             #NOTE: this is the same as above, but with cui1
423             # and cui2 swapped. I didn't make this a seperate
424             # method becuase method calls are slow, and
425             # iterating thru lines should be fast
426 84         77 my $temp = $vals[0];
427 84         71 $vals[0] = $vals[1];
428 84         74 $vals[1] = $temp;
429             # ...literally copy and paste below
430            
431             #add any cui1 linking terms if needed
432 84 100       95 if (exists $set1Cuis{$vals[0]}) {
433             #cui1 is in one more or more set1s
434             # add to the linking term (cui2) to each pairhash
435             # that cui1 is in
436 3         3 foreach my $index (@{$set1Cuis{$vals[0]}}) {
  3         5  
437 3         3 push @{${$newPairHashList[$index]}{'set1'}}, $vals[1];
  3         3  
  3         6  
438             }
439             }
440              
441             #add any cui2 linking terms if needed
442 84 100       188 if (exists $set2Cuis{$vals[1]}) {
443             #cui2 is in one more or more set2s
444             # add to the linking term (cui1) to each pairhash
445             # that cui2 is in
446 13         12 foreach my $index (@{$set2Cuis{$vals[1]}}) {
  13         13  
447 13         14 push @{${$newPairHashList[$index]}{'set2'}}, $vals[0];
  13         11  
  13         46  
448             }
449             }
450             }
451             }
452             }
453             ###### DONE reading in the file and constructing linking term sets
454              
455             #################################
456             # Remove Duplicates
457             ##################################
458             #due to how cuis are added to the pair hashes
459             # it is possible to have duplicate cuis in each
460             # remove any duplicates
461 17         28 for (my $i = 0; $i < scalar @newPairHashList; $i++) {
462 25         17 my $set1Ref = ${$newPairHashList[$i]}{'set1'};
  25         34  
463 25         23 my $set2Ref = ${$newPairHashList[$i]}{'set2'};
  25         21  
464              
465             #remove duplicates from set1
466 25         27 my %set1 = ();
467 25         21 foreach my $term (@{$set1Ref}) {
  25         29  
468 77         95 $set1{$term} = 1;
469             }
470 25         43 my @set1Terms = keys %set1;
471 25         25 ${$newPairHashList[$i]}{'set1'} = \@set1Terms;
  25         26  
472            
473             #remove duplicates from set2
474 25         36 my %set2 = ();
475 25         21 foreach my $term (@{$set2Ref}) {
  25         25  
476 109         105 $set2{$term} = 1;
477             }
478 25         36 my @set2Terms = keys %set2;
479 25         31 ${$newPairHashList[$i]}{'set2'} = \@set2Terms;
  25         74  
480             }
481              
482             #################################
483             # Return values
484             ##################################
485             #return the pair hash list of linking sets
486 17 100       22 if ($recordStats) {
487 8 100       11 if ($recordCounts) {
488 4         5 $npp = scalar keys %vocab;
489             }
490 8         40 return \%n1p, \%np1, $npp, \%matrix, \@newPairHashList;
491             }
492             else {
493 9         35 return \@newPairHashList;
494             }
495             }
496              
497              
498             1;
499             __END__