File Coverage

blib/lib/UMLS/Association.pm
Criterion Covered Total %
statement 121 188 64.3
branch 27 60 45.0
condition 6 27 22.2
subroutine 14 18 77.7
pod 0 6 0.0
total 168 299 56.1


line stmt bran cond sub pod time code
1             # UMLS::Association
2             #
3             # Perl module for scoring the semantic association of terms in the Unified
4             # Medical Language System (UMLS).
5             #
6             # Copyright (c) 2015
7             #
8             # Sam Henry, Virginia Commonwealth University
9             # henryst at vcu.edu
10             #
11             # Bridget McInnes, Virginia Commonwealth University
12             # btmcinees at vcu.edu
13             #
14             # Keith Herbert, Virginia Commonwealth University
15             # herbertkb at vcu.edu
16             #
17             # Alexander D. McQuilkin, Virginia Commonwealth University
18             # alexmcq99 at yahoo.com
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36              
37             =head1 NAME
38              
39             UMLS::Association - A suite of Perl modules that implement a number of semantic
40             association measures in order to calculate the semantic association between two
41             concepts in the UMLS.
42              
43              
44             =head1 INSTALL
45              
46             To install the module, run the following magic commands:
47              
48             perl Makefile.PL
49             make
50             make test
51             make install
52              
53             This will install the module in the standard location. You will, most
54             probably, require root privileges to install in standard system
55             directories. To install in a non-standard directory, specify a prefix
56             during the 'perl Makefile.PL' stage as:
57              
58             perl Makefile.PL PREFIX=/home/sid
59              
60             It is possible to modify other parameters during installation. The
61             details of these can be found in the ExtUtils::MakeMaker
62             documentation. However, it is highly recommended not messing around
63             with other parameters, unless you know what you're doing.
64              
65             =head1 DESCRIPTION
66              
67             This package provides a Perl interface to
68              
69             =head1 INITIALIZING THE MODULE
70              
71             To create an instance of the interface object, using default values
72             for all configuration options:
73              
74             use UMLS::Association;
75             my $associaton = UMLS::Association->new();
76              
77             More information is provided in the INSTALL file.
78              
79             =cut
80             package UMLS::Association;
81              
82 1     1   329 use Fcntl;
  1         1  
  1         162  
83 1     1   4 use strict;
  1         1  
  1         18  
84 1     1   4 use warnings;
  1         1  
  1         16  
85 1     1   450 use bytes;
  1         10  
  1         3  
86              
87 1     1   337 use UMLS::Association::StatFinder;
  1         1  
  1         21  
88 1     1   352 use UMLS::Association::ErrorHandler;
  1         2  
  1         30  
89              
90             my $pkg = "UMLS::Association";
91              
92 1     1   5 use vars qw($VERSION);
  1         1  
  1         1135  
93              
94             $VERSION = '0.17';
95              
96             my $errorhandler = "";
97             my $statfinder_G = "";
98             my $debug = 0;
99             my $precision_G = 4; #precision of the output
100              
101              
102             # -------------------- Class methods start here --------------------
103              
104             # method to create a new UMLS::Association object
105             # input : $params <- reference to hash containing the parameters
106             # output: $self
107             sub new {
108 24     24 0 1027 my $self = {};
109 24         27 my $className = shift;
110 24         21 my $params = shift;
111              
112             # bless the object.
113 24         27 bless($self, $className);
114              
115             # initialize error handler
116 24         48 $errorhandler = UMLS::Association::ErrorHandler->new();
117 24 50       52 if(! defined $errorhandler) {
118 0         0 print STDERR "The error handler did not get passed properly.\n";
119 0         0 exit;
120             }
121            
122             # Initialize the object.
123 24         38 $self->_initialize($params);
124              
125 24         33 return $self;
126             }
127              
128             # initialize the variables and set the parameters
129             # input : $params <- reference to hash containing the parameters
130             # output: none, but $self is initialized
131             sub _initialize {
132 24     24   23 my $self = shift;
133 24         21 my $params = shift;
134              
135             # check self
136 24         19 my $function = "_initialize";
137 24 50 33     66 if(!defined $self || !ref $self) {
138 0         0 $errorhandler->_error($pkg, $function, "", 2);
139             }
140 24         24 my $paramCount = 0;
141 24 100       33 if ($params->{'mwa'}) {$paramCount++;}
  4         5  
142 24 100       28 if ($params->{'lta'}) {$paramCount++;}
  4         4  
143 24 100       28 if ($params->{'lsa'}) {$paramCount++;}
  5         5  
144 24 100       29 if ($params->{'sbc'}) {$paramCount++;}
  4         3  
145 24 100       30 if ($params->{'wsa'}) {$paramCount++;}
  2         2  
146 24 50       41 if ($paramCount > 1) {
147 0         0 $errorhandler->_error($pkg, $function, "Only one of lta, mwa, lsa, sbc, wsa may be specified", 12);
148             }
149              
150             # set parameters
151 24 50       40 if ($params->{'precision'}) {
152 0         0 $precision_G = $params->{'precision'};
153             }
154 24 50       26 if ($params->{'debug'}) {
155 0         0 $debug = 1;
156             }
157              
158             # set the statfinder
159 24         22 $params->{'association'} = $self; #set associaiton for WSA
160 24         48 $statfinder_G = UMLS::Association::StatFinder->new($params);
161 24 50       41 if(! defined $statfinder_G) {
162 0         0 my $str = "The UMLS::Association::StatFinder object was not created.";
163 0         0 $errorhandler->_error($pkg, $function, $str, 8);
164             }
165             }
166              
167             # returns the version currently being used
168             # input : none
169             # output: the version number being used
170             sub version {
171 0     0 0 0 my $self = shift;
172 0         0 return $VERSION;
173             }
174              
175             ##########################################################################
176             # Public Association Interface
177             ##########################################################################
178             # All association scores are computed through a data structure, the pair hash
179             # list. This forces all the modes of operation to use the same code, and allows
180             # all data to be retreived in a single pass of a matrix file
181             # The pair hash list is an array of pairHashRefs. The pair hash is a
182             # hash with two keys, 'set1' and 'set2' each of these keys holds an arrayRef of
183             # cuis which correspond to cuis in that set. This allows for lists of pairs of
184             # sets of CUIs to be computed
185             # In the case where only a single pair computation is needed, or rather
186             # than a set, just a single cui is needed, each function still wraps the
187             # values into a pairHashList. 'set1' cuis are the leading cuis in the pair, and
188             # 'set2 are the trailing cuis in the pair'
189              
190              
191             # calculates association for a list of single cui pairs
192             # input: $cuiPairListRef - an array ref of comma seperated cui pairs
193             # the first in the pair is the leading,
194             # second in the pair is the trailing
195             # $measure - a string specifying the association measure to use
196             # output: \@scores - an array ref of scores corresponding to the assocaition
197             # score for each of the pairHashes that were input
198             sub calculateAssociation_termPairList {
199 0     0 0 0 my $self = shift;
200 0         0 my $cuiPairListRef = shift;
201 0         0 my $measure = shift;
202              
203             #create the cuiPairs hash datastructure
204 0         0 my @pairHashes = ();
205 0         0 foreach my $pair (@{$cuiPairListRef}) {
  0         0  
206             #grab the cuis from the pair
207 0         0 (my $cui1, my $cui2) = split(',',$pair);
208 0         0 push @pairHashes, $self->_createPairHash_singleTerms($cui1,$cui2);
209             }
210              
211             #return the array of association scores for each pair
212 0         0 return $self->_calculateAssociation_pairHashList(\@pairHashes, $measure);
213             }
214              
215             # calculates association for a single cui pair
216             # input: $cui1 - the leading cui
217             # $cui2 - the trailing cui
218             # $measure - a string specifying the association measure to use
219             # output: $score - the association between the cuis
220             sub calculateAssociation_termPair {
221 0     0 0 0 my $self = shift;
222 0         0 my $cui1 = shift;
223 0         0 my $cui2 = shift;
224 0         0 my $measure = shift;
225            
226             #create the pairHash List
227 0         0 my @pairHashes = ();
228 0         0 push @pairHashes, $self->_createPairHash_singleTerms($cui1,$cui2);
229              
230             #return the association score, which is the first (and only)
231             # values of the return array
232 0         0 return ${$self->_calculateAssociation_pairHashList(\@pairHashes, $measure)}[0];
  0         0  
233             }
234              
235             # calculates association for two sets of cuis (leading and trailing cuis)
236             # input: \@cuis1Ref - a ref to an array of leading cuis
237             # \@cuis2Ref - a ref to an array of trailing cuis
238             # $measure - a string specifying the association measure to use
239             # output: $score - the association between the cui sets
240             sub calculateAssociation_setPair {
241 14     14 0 38 my $self = shift;
242 14         10 my $cuis1Ref = shift;
243 14         9 my $cuis2Ref = shift;
244 14         14 my $measure = shift;
245              
246             #create the cuiPairs hash datasetructure
247 14         13 my @pairHashes = ();
248 14         20 push @pairHashes, $self->_createPairHash_termLists($cuis1Ref, $cuis2Ref);
249              
250             #return the association score, which is the first (and only)
251             # value of the return array
252 14         14 return ${$self->_calculateAssociation_pairHashList(\@pairHashes, $measure)}[0];
  14         18  
253             }
254              
255              
256             # calculate association between a list of cui set pairs. The cui lists are
257             # passed in as parallel arrays of sets of cuis, where assoc(cuis1[i], and cuis2[i]
258             # are calcualted.
259             # input: \@cuis1Ref - an array ref to an array of arrays, where each element
260             # of the array contains a set of cuis
261             # \@cuis2Ref - an array ref to an array of arrays of the same format as
262             # \@cuis1Ref
263             # $measure - a string specifying the association measure to use
264             # \@scores - an array ref of scores corresponding to the assocaition
265             # score for each of the pairHashes that were input
266             sub calculateAssociation_setPairList {
267 8     8 0 20 my $self = shift;
268 8         8 my $cuis1Ref = shift;
269 8         7 my $cuis2Ref = shift;
270 8         6 my $measure = shift;
271            
272             #create the pair hash for each set of pairs
273 8         9 my @pairHashes = ();
274 8         8 for (my $i = 0; $i < scalar @{$cuis1Ref}; $i++) {
  24         29  
275             #turn the cui arrays into a hash ref
276 16         15 push @pairHashes, $self->_createPairHash_termLists(${$cuis1Ref}[$i],${$cuis2Ref}[$i]);
  16         15  
  16         19  
277             }
278              
279             #return the array of association scores for each pair
280 8         12 return $self->_calculateAssociation_pairHashList(\@pairHashes, $measure);
281             }
282              
283              
284              
285              
286              
287              
288             ##########################################################################
289             # PairHash Creators
290             ##########################################################################
291              
292             # creates a pair hash object from two cuis
293             # input: $cui1 - the leading cui in the pair hash
294             # $cui2 - the trailing cui in the pair hash
295             # output: \%pairHash - a ref to a pairHash
296             sub _createPairHash_singleTerms {
297 0     0   0 my $self = shift;
298 0         0 my $cui1 = shift;
299 0         0 my $cui2 = shift;
300              
301             #create the hash data structures
302 0         0 my %pairHash = ();
303              
304             #populate the @cuiLists
305             #set the cui lists to the concept
306 0         0 my @cui1List = ();
307 0         0 push @cui1List, $cui1;
308 0         0 my @cui2List = ();
309 0         0 push @cui2List, $cui2;
310              
311 0         0 $pairHash{'set1'} = \@cui1List;
312 0         0 $pairHash{'set2'} = \@cui2List;
313              
314 0         0 return \%pairHash;
315             }
316              
317              
318             # Creates a pair hash from two cui lists
319             # input: \@set1Ref - the leading cuis of the pair set
320             # \@set2Ref - the trailing cuis of the pair set
321             # output: \%pairHash - a ref to a pairHash
322             sub _createPairHash_termLists {
323 30     30   36 my $self = shift;
324 30         25 my $set1Ref = shift;
325 30         22 my $set2Ref = shift;
326              
327             #create the hash data structures
328 30         27 my %pairHash = ();
329              
330             #set the cui lists to the pair hash directly
331 30         37 $pairHash{'set1'} = $set1Ref;
332 30         27 $pairHash{'set2'} = $set2Ref;
333            
334              
335 30         39 return \%pairHash;
336              
337             }
338              
339             ##########################################################################
340             # Association Calculators
341             ##########################################################################
342              
343             # calculate association for each of the pairHashes in the input list of
344             # cui pair hashes
345             # input: $pairHashListRef - an array ref of cui pairHashes
346             # output: \@scores - an array ref of scores corresponding to the assocaition
347             # score for each of the pairHashes that were input
348             sub _calculateAssociation_pairHashList {
349 24     24   22 my $self = shift;
350 24         22 my $pairHashListRef = shift;
351 24         16 my $measure = shift;
352              
353             #retreive observed counts for each pairHash
354 24         56 my $statsListRef = $statfinder_G->getObservedCounts($pairHashListRef);
355            
356             #calculate associaiton score for each pairHash
357 24         24 my @scores = ();
358 24         22 foreach my $statsRef(@{$statsListRef}) {
  24         24  
359             #grab stats for this pairHash from the list of stats
360 35         32 my $n11 = ${$statsRef}[0];
  35         36  
361 35         48 my $n1p = ${$statsRef}[1];
  35         33  
362 35         25 my $np1 = ${$statsRef}[2];
  35         31  
363 35         29 my $npp = ${$statsRef}[3];
  35         33  
364            
365             #calculate the association score
366 35         47 push @scores, $self->_calculateAssociation_fromObservedCounts($n11, $n1p, $np1, $npp, $measure);
367             }
368              
369             #return the association scores for all pairHashes in the list
370             return \@scores
371 24         74 }
372              
373             # calculates an association score from the provided values
374             # input: $n11 <- n11 for the cui pair
375             # $npp <- npp for the dataset
376             # $n1p <- n1p for the cui pair
377             # $np1 <- np1 for the cui pair
378             # $statistic <- the string specifying the stat to calc
379             # output: the statistic (association score) between the two concepts
380             sub _calculateAssociation_fromObservedCounts {
381             #grab parameters
382 35     35   35 my $self = shift;
383 35         27 my $n11 = shift;
384 35         28 my $n1p = shift;
385 35         31 my $np1 = shift;
386 35         26 my $npp = shift;
387 35         33 my $statistic = shift;
388              
389             #print the values used in computation if debugging
390 35 50       38 if ($debug) {
391 0         0 print "-----n11, n1p, np1, npp = $n11, $n1p, $np1, $npp\n";
392             }
393              
394             #set frequency and marginal totals
395 35         81 my %values = (n11=>$n11,
396             n1p=>$n1p,
397             np1=>$np1,
398             npp=>$npp);
399            
400             #return cannot compute, or 0
401             #if($n1p < 0 || $np1 < 0) { #NOTE, this kind of makes sense, says if there as an error then return -1
402             # the method I am doing now just says if any didn't occurr in the dataset then return -1
403 35 50 33     105 if($n1p <= 0 || $np1 <= 0 || $npp <= 0) {
      33        
404 0         0 return -1.000;
405             }
406 35 100       42 if($n11 <= 0) {
407 2         4 return 0.000;
408             }
409            
410             #set default statistic
411 33 50       45 if(!defined $statistic) {
412 0         0 die ("ERROR: no association measure defined\n");
413             }
414              
415             #set statistic module (Text::NSP)
416 33         28 my $includename = ""; my $usename = ""; my $ngram = 2; #NOTE: calculation actually change slightly with 3-grams, 4-grams, etc... does it make enough of a difference to matter?
  33         26  
  33         32  
417 33 100 33     123 if ($statistic eq "freq") {
    50 33        
    50 33        
    50 0        
    0 0        
    0 0        
    0          
    0          
418 7         11 return $n11;
419             }
420             elsif($statistic eq "ll") {
421 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
422 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
423             }
424             elsif($statistic eq "pmi" || $statistic eq "tmi" || $statistic eq "ps") {
425 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
426 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
427             }
428             elsif($statistic eq "x2"||$statistic eq "phi") {
429 26         41 $usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
430 26         164 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
431             }
432             elsif($statistic eq "leftFisher"||$statistic eq "rightFisher"||$statistic eq "twotailed") {
433 0 0       0 if($statistic eq "leftFisher") { $statistic = "left"; }
  0 0       0  
434 0         0 elsif($statistic eq "rightFisher") { $statistic = "right"; }
435 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::Fisher::'.$statistic;
436 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Fisher',$statistic.'.pm');
437             }
438             elsif($statistic eq "dice" || $statistic eq "jaccard") {
439 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::Dice::'.$statistic;
440 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Dice',$statistic.'.pm');
441             }
442             elsif($statistic eq "odds") {
443 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::'.$statistic;
444 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D',$statistic.'.pm');
445             }
446             elsif($statistic eq "tscore") {
447 0         0 $usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
448 0         0 $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
449             }
450            
451             # import module
452 26         465 require $includename;
453 26         3663 import $usename;
454            
455             # get statistics (From NSP package)
456 26         62 my $statisticValue = calculateStatistic(%values);
457            
458             # check for errors/warnings from statistics.pm
459 26         1657 my $errorMessage="";
460 26         33 my $errorCode = getErrorCode();
461 26 50       99 if (defined $errorCode) {
462 0 0       0 if($errorCode =~ /^1/) {
463 0         0 printf(STDERR "Error from statistic library!\n Error code: %d\n", $errorCode);
464 0         0 $errorMessage = getErrorMessage();
465 0 0       0 print STDERR " Error message: $errorMessage\n" if( $errorMessage ne "");
466 0         0 exit; # exit on error
467             }
468 0 0       0 if ($errorCode =~ /^2/) {
469 0         0 printf(STDERR "Warning from statistic library!\n Warning code: %d\n", $errorCode);
470 0         0 $errorMessage = getErrorMessage();
471 0 0       0 print STDERR " Warning message: $errorMessage\n" if( $errorMessage ne "");
472 0         0 print STDERR "Skipping ngram\n";
473 0         0 next; # if warning, dont save the statistic value just computed
474             }
475             }
476              
477             #return statistic to given precision. if no precision given, default is 4
478 26         33 my $floatFormat = join '', '%', '.', $precision_G, 'f';
479 26         130 my $statScore = sprintf $floatFormat, $statisticValue;
480              
481 26         72 return $statScore;
482             }
483              
484              
485             1;
486              
487             __END__