File Coverage

blib/lib/UMLS/Association.pm
Criterion Covered Total %
statement 47 184 25.5
branch 10 58 17.2
condition 2 30 6.6
subroutine 10 20 50.0
pod 0 6 0.0
total 69 298 23.1


line stmt bran cond sub pod time code
1             # UMLS::Association
2             #
3             # Perl module for scoring the semantic association of terms in the Unified
4             # Medical Language System (UMLS).
5             #
6             # This module borrows heavily from the UMLS::Interface package so you will
7             # see similarities
8             #
9             # Copyright (c) 2015
10             #
11             # Bridget T. McInnes, Virginia Commonwealth University
12             # btmcinnes at vcu.edu
13             #
14             # Keith Herbert, Virginia Commonwealth University
15             # herbertkb at vcu.edu
16             #
17             # Alexander D. McQuilkin, Virginia Commonwealth University
18             # alexmcq99 at yahoo.com
19             #
20             # Sam Henry, Virginia Commonwealth University
21             # henryst at vcu.edu
22             #
23             # This program is free software; you can redistribute it and/or
24             # modify it under the terms of the GNU General Public License
25             # as published by the Free Software Foundation; either version 2
26             # of the License, or (at your option) any later version.
27             #
28             # This program is distributed in the hope that it will be useful,
29             # but WITHOUT ANY WARRANTY; without even the implied warranty of
30             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31             # GNU General Public License for more details.
32             #
33             # You should have received a copy of the GNU General Public License
34             # along with this program; if not, write to
35             #
36             # The Free Software Foundation, Inc.,
37             # 59 Temple Place - Suite 330,
38             # Boston, MA 02111-1307, USA.
39              
40             =head1 NAME
41              
42             UMLS::Association - A suite of Perl modules that implement a number of semantic
43             association measures in order to calculate the semantic association between two
44             concepts in the UMLS.
45              
46             =head1 SYNOPSIS
47              
48              
49             =head1 INSTALL
50              
51             To install the module, run the following magic commands:
52              
53             perl Makefile.PL
54             make
55             make test
56             make install
57              
58             This will install the module in the standard location. You will, most
59             probably, require root privileges to install in standard system
60             directories. To install in a non-standard directory, specify a prefix
61             during the 'perl Makefile.PL' stage as:
62              
63             perl Makefile.PL PREFIX=/home/sid
64              
65             It is possible to modify other parameters during installation. The
66             details of these can be found in the ExtUtils::MakeMaker
67             documentation. However, it is highly recommended not messing around
68             with other parameters, unless you know what you're doing.
69              
70             =head1 DESCRIPTION
71              
72             This package provides a Perl interface to
73              
74             =head1 DATABASE SETUP
75              
76             The interface assumes that the CUI network extracted from the MetaMapped
77             Medline Baseline is present in a mysql database. The name of the database
78             can be passed as configuration options at initialization. However, if the
79             names of the databases are not provided at initialization, then default
80             value is used -- the database is called 'CUI_BIGRAMS'.
81              
82             The CUI_BIGRAMS database must contain four? tables:
83             1. N11
84             2. N1P
85             3. NP1
86             4. NPP
87              
88             All other tables in the databases will be ignored, and any of these
89             tables missing would raise an error.
90              
91             A script explaining how to create the CUI network and the mysql database
92             are in the INSTALL file.
93              
94             If the files that are being parsed are large, "ERROR 1206: The total number
95             of locks exceeds the lock table size" may occur. This can be corrected by increasing
96             the lock table size of mysql. This is done by increasing the innodb_buffer_pool_size
97             variable in your my.cnf file. If the variable does not exist in the my.cnf file simply
98             add a line such as:
99             "innodb_buffer_pool_size=1G"
100             which sets the size to 1 GB. Once updated mysql must be restarted for the changes to
101             take effect.
102              
103             =head1 INITIALIZING THE MODULE
104              
105             To create an instance of the interface object, using default values
106             for all configuration options:
107              
108             use UMLS::Association;
109             my $associaton = UMLS::Association->new();
110              
111             Database connection options can be passed through the my.cnf file. For
112             example:
113             [client]
114             user =
115             password =
116             port = 3306
117             socket = /tmp/mysql.sock
118             database = mmb
119              
120             Or through the by passing the connection information when first
121             instantiating an instance. For example:
122              
123             $associaton = UMLS::Association->new({"driver" => "mysql",
124             "database" => "$database",
125             "username" => "$username",
126             "password" => "$password",
127             "hostname" => "$hostname",
128             "socket" => "$socket"});
129              
130             'driver' -> Default value 'mysql'. This option specifies the Perl
131             DBD driver that should be used to access the
132             database. This implies that the some other DBMS
133             system (such as PostgresSQL) could also be used,
134             as long as there exist Perl DBD drivers to
135             access the database.
136             'database' -> Default value 'CUI_BIGRAM'. This option specifies the name
137             of the database.
138             'hostname' -> Default value 'localhost'. The name or the IP address
139             of the machine on which the database server is
140             running.
141             'socket' -> Default value '/tmp/mysql.sock'. The socket on which
142             the database server is using.
143             'port' -> The port number on which the database server accepts
144             connections.
145             'username' -> Username to use to connect to the database server. If
146             not provided, the module attempts to connect as an
147             anonymous user.
148             'password' -> Password for access to the database server. If not
149             provided, the module attempts to access the server
150             without a password.
151              
152             More information is provided in the INSTALL file.
153              
154             =head1 PARAMETERS
155              
156             You can also pass other parameters which controls the functionality
157             of the Association.pm module.
158              
159             $assoc = UMLS::Association->new({"measure" => "lch"});
160              
161             'measure' -> This modifies the association measure
162              
163             =head1 FUNCTION DESCRIPTIONS
164              
165             =cut
166              
167             package UMLS::Association;
168              
169 1     1   577 use Fcntl;
  1         3  
  1         288  
170 1     1   8 use strict;
  1         4  
  1         33  
171 1     1   6 use warnings;
  1         3  
  1         38  
172 1     1   1314 use DBI;
  1         16662  
  1         61  
173 1     1   504 use bytes;
  1         13  
  1         5  
174              
175 1     1   401 use UMLS::Association::StatFinder;
  1         2  
  1         39  
176 1     1   341 use UMLS::Association::ErrorHandler;
  1         3  
  1         68  
177              
178             my $errorhandler = "";
179             my $statfinder_G = "";
180              
181             my $pkg = "UMLS::Association";
182              
183 1     1   10 use vars qw($VERSION);
  1         3  
  1         1587  
184              
185             $VERSION = '0.15';
186            
187             my $debug = 0;
188             my $umls_G = undef;
189             my $conceptExpansion_G = 0;
190             my $precision_G = 4; #precision of the output
191              
192              
193             # UMLS-specific stuff ends ----------
194              
195             # -------------------- Class methods start here --------------------
196              
197             # method to create a new UMLS::Association object
198             # input : $params <- reference to hash containing the parameters
199             # output: $self
200             sub new {
201 1     1 0 44 my $self = {};
202 1         3 my $className = shift;
203 1         2 my $params = shift;
204              
205             # bless the object.
206 1         3 bless($self, $className);
207              
208             # initialize error handler
209 1         9 $errorhandler = UMLS::Association::ErrorHandler->new();
210 1 50       5 if(! defined $errorhandler) {
211 0         0 print STDERR "The error handler did not get passed properly.\n";
212 0         0 exit;
213             }
214            
215             # Initialize the object.
216 1         5 $self->_initialize($params);
217              
218 1         2 return $self;
219             }
220              
221             # initialize the variables and set the parameters
222             # input : $params <- reference to hash containing the parameters
223             # output: none, but $self is initialized
224             sub _initialize {
225 1     1   2 my $self = shift;
226 1         2 my $params = shift;
227              
228             # check self
229 1         1 my $function = "_initialize";
230 1 50 33     8 if(!defined $self || !ref $self) {
231 0         0 $errorhandler->_error($pkg, $function, "", 2);
232             }
233 1         2 my $paramCount = 0;
234 1 50       4 if ($params->{'mwa'}) {$paramCount++;}
  0         0  
235 1 50       3 if ($params->{'lta'}) {$paramCount++;}
  0         0  
236 1 50       4 if ($params->{'vsa'}) {$paramCount++;}
  0         0  
237 1 50       3 if ($paramCount > 1) {
238 0         0 $errorhandler->_error($pkg, $function, "Only one of LTA, MWA, and VSA may be specified", 12);
239             }
240              
241             # set parameters
242 1 50       3 if ($params->{'conceptexpansion'}) {
243 0         0 $conceptExpansion_G = 1;
244             }
245 1 50       3 if ($params->{'precision'}) {
246 0         0 $precision_G = $params->{'precision'};
247             }
248 1         2 $umls_G = $params->{'umls'};
249              
250             # set the statfinder
251 1         8 $statfinder_G = UMLS::Association::StatFinder->new($params);
252 1 50       3 if(! defined $statfinder_G) {
253 0         0 my $str = "The UMLS::Association::StatFinder object was not created.";
254 0         0 $errorhandler->_error($pkg, $function, $str, 8);
255             }
256              
257             #require UMLS::Interface to be defined if using a DB, or if
258             # using concept expansion
259 1 50 33     4 if ($conceptExpansion_G && !defined $umls_G) {
260 0           die( "ERROR initializing Association: UMLS::Interface (params{umls}) must be defined when using database queries or when using concept expansion\n");
261             }
262             }
263              
264             # returns the version currently being used
265             # input : none
266             # output: the version number being used
267             sub version {
268 0     0 0   my $self = shift;
269 0           return $VERSION;
270             }
271              
272             ##########################################################################
273             # Public Association Interface
274             ##########################################################################
275             # All association scores are computed through a data structure, the pair hash
276             # list. This forces all the modes of operation to use the same code, and allows
277             # all data to be retreived in a single pass of a matrix file, or efficient DB
278             # queries. The pair hash list is an array of pairHashRefs. The pair hash is a
279             # hash with two keys, 'set1' and 'set2' each of these keys holds an arrayRef of
280             # cuis which correspond to cuis in that set. This allows for lists of pairs of
281             # sets of CUIs to be computed, either through concept expansion or input as a
282             # set. In the case where only a single pair computation is needed, or rather
283             # than a set, just a single cui is needed, each function still wraps the
284             # values into a pairHashList. 'set1' cuis are the leading cuis in the pair, and
285             # 'set2 are the trailing cuis in the pair'
286              
287              
288             # calculates association for a list of single cui pairs
289             # input: $cuiPairsFromFileRef - an array ref of comma seperated cui pairs
290             # the first in the pair is the leading,
291             # second in the pair is the trailing
292             # $measure - a string specifying the association measure to use
293             # output: $score - the association between the cuis
294             sub calculateAssociation_termPairList {
295 0     0 0   my $self = shift;
296 0           my $cuiPairListRef = shift;
297 0           my $measure = shift;
298              
299             #create the cuiPairs hash datasetructure
300 0           my @pairHashes = ();
301 0           foreach my $pair (@{$cuiPairListRef}) {
  0            
302             #grab the cuis from the pair
303 0           (my $cui1, my $cui2) = split(',',$pair);
304 0           push @pairHashes, $self->_createPairHash_singleTerms($cui1,$cui2);
305             }
306              
307             #return the array of association scores for each pair
308 0           return $self->_calculateAssociation_pairHashList(\@pairHashes, $measure);
309             }
310              
311             # calculates association for a single cui pair
312             # input: $cui1 - the leading cui
313             # $cui2 - the trailing cui
314             # $measure - a string specifying the association measure to use
315             # output: $score - the association between the cuis
316             sub calculateAssociation_termPair {
317 0     0 0   my $self = shift;
318 0           my $cui1 = shift;
319 0           my $cui2 = shift;
320 0           my $measure = shift;
321            
322             #create the pairHash List
323 0           my @pairHashes = ();
324 0           push @pairHashes, $self->_createPairHash_singleTerms($cui1,$cui2);
325              
326             #return the association score, which is the first (and only)
327             # values of the return array
328 0           return ${$self->_calculateAssociation_pairHashList(\@pairHashes, $measure)}[0];
  0            
329             }
330              
331             # calculates association for two sets of cuis (leading and trailing cuis)
332             # input: \@cuis1Ref - a ref to an array of leading cuis
333             # \@cuis2Ref - a ref to an array of trailing cuis
334             # $measure - a string specifying the association measure to use
335             # output: $score - the association between the cui sets
336             sub calculateAssociation_setPair {
337 0     0 0   my $self = shift;
338 0           my $cuis1Ref = shift;
339 0           my $cuis2Ref = shift;
340 0           my $measure = shift;
341              
342             #create the cuiPairs hash datasetructure
343 0           my @pairHashes = ();
344 0           push @pairHashes, $self->createPairHash_termList($cuis1Ref, $cuis2Ref);
345              
346             #return the association score, which is the first (and only)
347             # value of the return array
348 0           return ${$self->_calculateAssociation_pairHashList(\@pairHashes, $measure)}[0];
  0            
349             }
350              
351              
352             # calculate association between a list of cui pairs
353             # input:
354             # output:
355       0 0   sub calculateAssociation_setPairList {
356              
357             #TODO
358              
359             }
360              
361             ##########################################################################
362             # PairHash Creators
363             ##########################################################################
364              
365             # creates a pair hash object from two cuis
366             # input: $cui1 - the leading cui in the pair hash
367             # $cui2 - the trailing cui in the pair hash
368             # output: \%pairHash - a ref to a pairHash
369             sub _createPairHash_singleTerms {
370 0     0     my $self = shift;
371 0           my $cui1 = shift;
372 0           my $cui2 = shift;
373              
374             #create the hash data structures
375 0           my %pairHash = ();
376              
377             #populate the @cuiLists
378 0 0         if ($conceptExpansion_G) {
379             #set the cui lists to the expanded concept
380 0           $pairHash{'set1'} = $self->_expandConcept($cui1);
381 0           $pairHash{'set2'} = $self->_expandConcept($cui2);
382             }
383             else {
384             #set the cui lists to the concept
385 0           my @cui1List = ();
386 0           push @cui1List, $cui1;
387 0           my @cui2List = ();
388 0           push @cui2List, $cui2;
389              
390 0           $pairHash{'set1'} = \@cui1List;
391 0           $pairHash{'set2'} = \@cui2List;
392             }
393 0           return \%pairHash;
394             }
395              
396              
397             # Creates a pair hash from two cui lists
398             # input:
399             # output:
400             sub _createPairHash_termLists {
401 0     0     my $self = shift;
402 0           my $set1Ref = shift;
403 0           my $set2Ref = shift;
404              
405             #TODO
406              
407             }
408              
409             ##########################################################################
410             # Association Calculators
411             ##########################################################################
412              
413             # calculate association for each of the pairHashes in the input list of
414             # cui pair hashes
415             # input: $pairHashListRef - an array ref of cui pairHashes
416             # output: \@scores - an array ref of scores corresponding to the assocaition
417             # score for each of the pairHashes that were input
418             sub _calculateAssociation_pairHashList {
419 0     0     my $self = shift;
420 0           my $pairHashListRef = shift;
421 0           my $measure = shift;
422              
423             #retreive observed counts for each pairHash
424 0           my $statsListRef = $statfinder_G->getObservedCounts($pairHashListRef);
425            
426             #calculate associaiton score for each pairHash
427 0           my @scores = ();
428 0           foreach my $statsRef(@{$statsListRef}) {
  0            
429             #grab stats for this pairHash from the list of stats
430 0           my $n11 = ${$statsRef}[0];
  0            
431 0           my $n1p = ${$statsRef}[1];
  0            
432 0           my $np1 = ${$statsRef}[2];
  0            
433 0           my $npp = ${$statsRef}[3];
  0            
434            
435             #calculate the association score
436 0           push @scores, $self->_calculateAssociation_fromObservedCounts($n11, $n1p, $np1, $npp, $measure);
437             }
438              
439             #return the association scores for all pairHashes in the list
440             return \@scores
441 0           }
442              
443             # calculates an association score from the provided values
444             # NOTE: Please be careful when writing code that uses this
445             # method. Results may become inconsistent if you don't check
446             # that CUIs occur in the hierarchy before calling
447             # e.g. C0009951 does not occur in the SNOMEDCT Hierarchy but
448             # it likely occurs in the association database so if not check
449             # is made an association score will be calculate for it, but it has not
450             # been done in reported results from this application
451             # input: $n11 <- n11 for the cui pair
452             # $npp <- npp for the dataset
453             # $n1p <- n1p for the cui pair
454             # $np1 <- np1 for the cui pair
455             # $statistic <- the string specifying the stat to calc
456             # output: the statistic (association score) between the two concepts
457             sub _calculateAssociation_fromObservedCounts {
458             #grab parameters
459 0     0     my $self = shift;
460 0           my $n11 = shift;
461 0           my $n1p = shift;
462 0           my $np1 = shift;
463 0           my $npp = shift;
464 0           my $statistic = shift;
465              
466             #set frequency and marginal totals
467 0           my %values = (n11=>$n11,
468             n1p=>$n1p,
469             np1=>$np1,
470             npp=>$npp);
471            
472             #return cannot compute, or 0
473             #if($n1p < 0 || $np1 < 0) { #NOTE, this kind of makes sense, says if there as an error then return -1
474             # the method I am doing now just says if any didn't occurr in the dataset then return -1
475 0 0 0       if($n1p <= 0 || $np1 <= 0 || $npp <= 0) {
      0        
476 0           return -1.000;
477             }
478 0 0         if($n11 <= 0) {
479 0           return 0.000;
480             }
481            
482             #set default statistic
483 0 0         if(!defined $statistic) {
484 0           die ("ERROR: no association measure defined\n");
485             }
486              
487             #set statistic module (Text::NSP)
488 0           my $includename = ""; my $usename = ""; my $ngram = 2; #TODO, what is this ngram parameter
  0            
  0            
489 0 0 0       if ($statistic eq "freq") {
    0 0        
    0 0        
    0 0        
    0 0        
    0 0        
    0          
    0          
490 0           return $n11;
491             }
492             elsif($statistic eq "ll") {
493 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
494 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
495             }
496             elsif($statistic eq "pmi" || $statistic eq "tmi" || $statistic eq "ps") {
497 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
498 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
499             }
500             elsif($statistic eq "x2"||$statistic eq "phi") {
501 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
502 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
503             }
504             elsif($statistic eq "leftFisher"||$statistic eq "rightFisher"||$statistic eq "twotailed") {
505 0 0         if($statistic eq "leftFisher") { $statistic = "left"; }
  0 0          
506 0           elsif($statistic eq "rightFisher") { $statistic = "right"; }
507 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::Fisher::'.$statistic;
508 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Fisher',$statistic.'.pm');
509             }
510             elsif($statistic eq "dice" || $statistic eq "jaccard") {
511 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::Dice::'.$statistic;
512 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Dice',$statistic.'.pm');
513             }
514             elsif($statistic eq "odds") {
515 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::'.$statistic;
516 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D',$statistic.'.pm');
517             }
518             elsif($statistic eq "tscore") {
519 0           $usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
520 0           $includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
521             }
522            
523             # import module
524 0           require $includename;
525 0           import $usename;
526            
527             # get statistics (From NSP package)
528 0           my $statisticValue = calculateStatistic(%values);
529            
530             # check for errors/warnings from statistics.pm
531 0           my $errorMessage="";
532 0           my $errorCode = getErrorCode();
533 0 0         if (defined $errorCode) {
534 0 0         if($errorCode =~ /^1/) {
535 0           printf(STDERR "Error from statistic library!\n Error code: %d\n", $errorCode);
536 0           $errorMessage = getErrorMessage();
537 0 0         print STDERR " Error message: $errorMessage\n" if( $errorMessage ne "");
538 0           exit; # exit on error
539             }
540 0 0         if ($errorCode =~ /^2/) {
541 0           printf(STDERR "Warning from statistic library!\n Warning code: %d\n", $errorCode);
542 0           $errorMessage = getErrorMessage();
543 0 0         print STDERR " Warning message: $errorMessage\n" if( $errorMessage ne "");
544 0           print STDERR "Skipping ngram\n";
545 0           next; # if warning, dont save the statistic value just computed
546             }
547             }
548              
549             #return statistic to given precision. if no precision given, default is 4
550 0           my $floatFormat = join '', '%', '.', $precision_G, 'f';
551 0           my $statScore = sprintf $floatFormat, $statisticValue;
552              
553 0           return $statScore;
554             }
555              
556             #################################################
557             # Utilitiy Functions
558             #################################################
559              
560             # Applies concept expansion by creating an array
561             # of the input concept and all of its UMLS d
562             # descendants
563             # input : $cui - the cui that will be expanded
564             # output: \@cuis - the expanded concept array
565             sub _expandConcept {
566 0     0     my $self = shift;
567 0           my $cui = shift;
568              
569             #find all descendants
570 0           my $descendantsRef = $umls_G->findDescendants($cui);
571              
572             #add all cuis to the expanded cuis list
573 0           my @cuis = ();
574 0           push @cuis, $cui;
575 0           foreach my $desc (keys %{$descendantsRef}) {
  0            
576 0           push @cuis, $desc;
577             }
578            
579             #return the expanded cuis array
580 0           return \@cuis;
581             }
582              
583             1;
584              
585             __END__