File Coverage

blib/lib/UMLS/Association/StatFinder.pm
Criterion Covered Total %
statement 44 663 6.6
branch 6 146 4.1
condition 3 78 3.8
subroutine 9 29 31.0
pod 0 2 0.0
total 62 918 6.7


line stmt bran cond sub pod time code
1             #UMLS::Association
2             #
3             # Perl module for scoring the semantic association of terms in the Unified
4             # Medical Language System (UMLS).
5             #
6             # Copyright (c) 2015
7             #
8             # Bridget T. McInnes, Virginia Commonwealth University
9             # btmcinnes at vcu.edu
10             #
11             # Keith Herbert, Virginia Commonwealth University
12             # herbertkb at vcu.edu
13             #
14             # Alexander D. McQuilkin, Virginia Commonwealth University
15             # alexmcq99 at yahoo.com
16             #
17             # Sam Henry, Virginia Commonwealth University
18             # henryst at vcu.edu
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36              
37             package UMLS::Association::StatFinder;
38              
39 1     1   7 use Fcntl;
  1         3  
  1         214  
40 1     1   6 use strict;
  1         2  
  1         19  
41 1     1   4 use warnings;
  1         1  
  1         29  
42 1     1   4 use DBI;
  1         2  
  1         30  
43 1     1   5 use bytes;
  1         1  
  1         4  
44 1     1   18 use File::Spec;
  1         1  
  1         5346  
45              
46             # error handling variables
47             my $errorhandler = "";
48              
49             my $pkg = "UMLS::Association::StatFinder";
50              
51             # debug variables
52             #local(*DEBUG_FILE);
53              
54             #NOTE: every global variable is followed by a _G with the
55             # exception of debug error handler, and constants which are all caps
56             # global variables
57             my $debug = 0; #in debug mode or not
58              
59             #global options variables
60             my $assocDB_G;
61             my $lta_G = 0; #1 or 0 is using lta or not
62             my $mwa_G = 0; #1 or 0 if using mwa or not
63             my $vsa_G = 0; #1 or 0 if using vsa or not
64             my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not
65             my $matrix_G = 0; #matrix file name is using a matrix file rather than DB
66              
67             ######################################################################
68             # Initialization Functions
69             ######################################################################
70             # method to create a new UMLS::Association::StatFinder object
71             # input : $params <- reference to hash of database parameters
72             # output: $self
73             sub new {
74             #grab params and create self
75 1     1 0 2 my $self = {};
76 1         2 my $className = shift;
77 1         2 my $params = shift;
78              
79             #bless the object.
80 1         2 bless($self, $className);
81              
82             #initialize error handler
83 1         5 $errorhandler = UMLS::Association::ErrorHandler->new();
84 1 50       13 if(! defined $errorhandler) {
85 0         0 print STDERR "The error handler did not get passed properly.\n";
86 0         0 exit;
87             }
88              
89             # initialize the object.
90 1         3 $debug = 0;
91 1         8 $self->_initialize($params);
92 1         3 return $self;
93             }
94              
95             # method to initialize the UMLS::Association::StatFinder object.
96             # input : $parameters <- reference to a hash of database parameters
97             # output: none, but $self is initialized
98             sub _initialize {
99             #grab parameters
100 1     1   4 my $self = shift;
101 1         3 my $paramsRef = shift;
102 1         4 my %params = %{$paramsRef};
  1         7  
103              
104             #set global variables using option hash
105 1         5 $lta_G = $params{'lta'};
106 1         3 $mwa_G = $params{'mwa'};
107 1         3 $vsa_G = $params{'vsa'};
108 1         4 $noOrder_G = $params{'noorder'};
109 1         4 $matrix_G = $params{'matrix'};
110              
111             #connect to the database of association scores
112 1 50       4 if (!$matrix_G) {
113 0         0 $self->_setDatabase($paramsRef);
114             }
115            
116             #error checking
117 1         2 my $function = "_initialize";
118 1         4 &_debug($function);
119 1 50 33     5 if(!defined $self || !ref $self) {
120 0         0 $errorhandler->_error($pkg, $function, "", 2);
121             }
122              
123             #TODO, remove this once I have DB implemented
124             #check that a matrix is specified for options (need to implement DB mode)
125 1 50 33     11 if (!$matrix_G && $mwa_G) {
126 0         0 $errorhandler->_error($pkg, $function, "MWA requires the --matrix option", 12);
127             }
128 1 50 33     6 if (!$matrix_G && $vsa_G) {
129 0         0 $errorhandler->_error($pkg, $function, "VSA requires the --matrix option", 12);
130             }
131             }
132              
133             sub _debug {
134 1     1   2 my $function = shift;
135 1 50       3 if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; }
  0            
136             }
137              
138             # method to set the association database
139             # input : $params <- reference to a hash
140             # output: none, but association database is set and initialized
141             sub _setDatabase {
142 0     0     my $self = shift;
143 0           my $params = shift;
144              
145 0           my $function = "_setDatabase";
146 0           &_debug($function);
147              
148             # check self
149 0 0 0       if(!defined $self || !ref $self) {
150 0           $errorhandler->_error($pkg, $function, "", 2);
151             }
152              
153             # check the params
154 0 0         $params = {} if(!defined $params);
155              
156             # get the database connection parameters
157 0           my $database = $params->{'database'};
158 0           my $hostname = $params->{'hostname'};
159 0           my $socket = $params->{'socket'};
160 0           my $port = $params->{'port'};
161 0           my $username = $params->{'username'};
162 0           my $password = $params->{'password'};
163              
164             # set up defaults if the options were not passed
165 0 0         if(! defined $database) { $database = "cuicounts"; }
  0            
166 0 0         if(! defined $socket) { $socket = "/var/run/mysqld/mysqld.sock"; }
  0            
167 0 0         if(! defined $hostname) { $hostname = "localhost"; }
  0            
168              
169             # initialize the database handler
170 0           $assocDB_G = "";
171              
172             # create the database object...
173 0 0 0       if(defined $username and defined $password) {
174 0 0         if($debug) { print STDERR "Connecting with username and password\n"; }
  0            
175 0           $assocDB_G = DBI->connect("DBI:mysql:database=$database;mysql_socket=$socket;host=$hostname",$username, $password, {RaiseError => 0});
176             }
177             else {
178 0 0         if($debug) { print STDERR "Connecting using the my.cnf file\n"; }
  0            
179 0           my $dsn = "DBI:mysql:umls;mysql_read_default_group=client;database=$database";
180 0           $assocDB_G = DBI->connect($dsn);
181             }
182              
183             # check if there is an error
184 0           $errorhandler->_checkDbError($pkg, $function, $assocDB_G);
185              
186             # check that the db exists
187 0 0         if(!$assocDB_G) { $errorhandler->_error($pkg, $function, "Error with db.", 3); }
  0            
188              
189             # set database parameters
190 0           $assocDB_G->{'mysql_enable_utf8'} = 1;
191 0           $assocDB_G->do('SET NAMES utf8');
192 0           $assocDB_G->{mysql_auto_reconnect} = 1;
193             }
194              
195             ######################################################################
196             # public interface to get observed counts
197             ######################################################################
198              
199             # Gets observed counts (n11, n1p, np1, npp) of the cui sets
200             # input: $pairHashListRef - a ref to an array of pairHashes
201             # output: \@allStatsRef - a ref to an array of observed counts 4-tuples
202             # each 4-tuple consists of in order:
203             # $n11, $n1p, $np1, and $npp
204             # and they correspond to the observed counts of
205             # each of the pairHashes passed in
206             sub getObservedCounts {
207             #grab parameters
208 0     0 0   my $self = shift;
209 0           my $pairHashListRef = shift;
210              
211             #error checking
212 0           my $function = "getObservedCounts";
213 0 0 0       if(!defined $self || !ref $self) {
214 0           $errorhandler->_error($pkg, $function, "", 2);
215             }
216              
217             #calculate n11, n1p, np1, npp using a matrix or DB
218             # and according to the method of various other options
219 0           my $allStatsRef = -1;
220 0 0         if ($lta_G) {
    0          
    0          
221 0           $allStatsRef = $self->_getStats_LTA($pairHashListRef);
222             }
223             elsif ($mwa_G) {
224 0           $allStatsRef = $self->_getStats_MWA($pairHashListRef);
225             }
226             elsif ($vsa_G) {
227 0           $allStatsRef = $self->_getStats_VSA($pairHashListRef);
228             }
229             else {
230 0 0         if ($matrix_G) {
231 0           $allStatsRef = $self->_getStats_matrix($pairHashListRef);
232             }
233             else {
234 0           $allStatsRef = $self->_getStats_DB($pairHashListRef);
235             }
236             }
237              
238             #return a reference to a list of stats for each pairHash
239 0           return $allStatsRef;
240             }
241              
242              
243             ######################################################################
244             # functions to get statistical information about the cuis using a DB
245             ######################################################################
246              
247             # gets N11, N1P, NP1, NPP for a pairHashList using a database
248             # input : $pairHashListRef <- ref to a pairHashList
249             # output: $\@data <- array ref containing array refs of four values
250             # for each pair Hash, $n11, $n1p, $np1, and $npp
251             sub _getStats_DB {
252             #grab parameters
253 0     0     my $self = shift;
254 0           my $pairHashListRef = shift;
255            
256             #error checking
257 0           my $function = "_getStats_DB";
258 0 0 0       if(!defined $self || !ref $self) {
259 0           $errorhandler->_error($pkg, $function, "", 2);
260             }
261              
262             #compute observed counts for each pair hash
263 0           my @data = ();
264 0           my $npp = $self->_getNpp_DB();
265 0           foreach my $pairHashRef(@{$pairHashListRef}) {
  0            
266              
267             #grab the data from a DB
268 0           my $n11 = $self->_getN11_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'});
  0            
  0            
269 0           my $n1p = $self->_getN1p_DB(${$pairHashRef}{'set1'});
  0            
270 0           my $np1 = $self->_getNp1_DB(${$pairHashRef}{'set2'});
  0            
271              
272             #store the data
273 0           my @values = ($n11, $n1p, $np1, $npp);
274 0           push @data, \@values;
275             }
276              
277             #return the data
278 0           return \@data;
279             }
280              
281             # Gets N11 of the cui pair using a database
282             # input: $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
283             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
284             # output: $n11 <- n11 of cui sets
285             sub _getN11_DB {
286             #grab parameters
287 0     0     my $self = shift;
288 0           my $cuis1Ref = shift;
289 0           my $cuis2Ref = shift;
290              
291             #error checking
292 0           my $function = "_getN11";
293 0 0 0       if(!defined $self || !ref $self) {
294 0           $errorhandler->_error($pkg, $function, "", 2);
295             }
296            
297             #build a query string for n11
298 0           my $firstCui = shift @{$cuis1Ref};
  0            
299 0           my $queryString = "select SUM(n_11) from N_11 where ((cui_1 = '$firstCui' ";
300 0           foreach my $cui (@{$cuis1Ref}) {
  0            
301 0           $queryString .= "or cui_1 = '$cui' ";
302             }
303 0           unshift @{$cuis1Ref}, $firstCui;
  0            
304              
305             #set all cui2's
306 0           $firstCui = shift @{$cuis2Ref};
  0            
307 0           $queryString .= ") and (cui_2 = '$firstCui' ";
308 0           foreach my $cui (@{$cuis2Ref}) {
  0            
309 0           $queryString .= "or cui_2 = '$cui' ";
310             }
311 0           unshift @{$cuis2Ref}, $firstCui;
  0            
312              
313             #finalize the query string
314 0 0         if ($noOrder_G) {
315             #swap the positions of the cuis
316 0           $firstCui = shift @{$cuis2Ref};
  0            
317 0           $queryString .= ")) or ((cui_1 = '$firstCui' ";
318 0           foreach my $cui (@{$cuis2Ref}) {
  0            
319 0           $queryString .= "or cui_1 = '$cui' ";
320             }
321 0           unshift @{$cuis2Ref}, $firstCui;
  0            
322              
323 0           $firstCui = shift @{$cuis1Ref};
  0            
324 0           $queryString .= ") and (cui_2 = '$firstCui' ";
325 0           foreach my $cui (@{$cuis1Ref}) {
  0            
326 0           $queryString .= "or cui_2 = '$cui' ";
327             }
328 0           unshift @{$cuis1Ref}, $firstCui;
  0            
329             }
330 0           $queryString .= "));";
331            
332             #query the DB and return n11
333 0           my $n11 = shift @{$assocDB_G->selectcol_arrayref($queryString)};
  0            
334 0 0         if (!defined $n11) {
335 0           $n11 = 0;
336             }
337 0           return $n11;
338             }
339              
340             # Method to return the np1 of a concept using a database
341             # input : $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
342             # output: $np1 <- number of times the cuis2Ref set occurs in second bigram position
343             sub _getNp1_DB {
344 0     0     my $self = shift;
345 0           my $cuis2Ref = shift;
346            
347             #error checking
348 0           my $function = "_getNp1_DB";
349 0 0 0       if(!defined $self || !ref $self) {
350 0           $errorhandler->_error($pkg, $function, "", 2);
351             }
352              
353             #build a query string for all where cui2's are in the second position
354 0           my $firstCui = shift @{$cuis2Ref};
  0            
355 0           my $queryString = "select SUM(n_11) from N_11 where (cui_2 = '$firstCui' ";
356 0           foreach my $cui (@{$cuis2Ref}) {
  0            
357 0           $queryString .= "or cui_2 = '$cui' ";
358             }
359 0           unshift @{$cuis2Ref}, $firstCui;
  0            
360              
361             #finalize the query string
362 0 0         if ($noOrder_G) {
363             #add where cui2 is in the first position
364 0           $firstCui = shift @{$cuis2Ref};
  0            
365 0           $queryString .= ") or (cui_1 = '$firstCui' ";
366 0           foreach my $cui (@{$cuis2Ref}) {
  0            
367 0           $queryString .= "or cui_1 = '$cui' ";
368             }
369 0           unshift @{$cuis2Ref}, $firstCui;
  0            
370             }
371 0           $queryString .= ");";
372              
373             #query the db to retrive np1
374 0           my $np1 = shift @{$assocDB_G->selectcol_arrayref($queryString)};
  0            
375 0 0         if (!defined $np1) {
376 0           $np1 = -1;
377             }
378 0           return $np1;
379             }
380              
381             # Method to return the n1p of a concept from a database
382             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
383             # output: $n1p <- number of times cuis in cuis1 set occurs in first bigram position
384             sub _getN1p_DB {
385 0     0     my $self = shift;
386 0           my $cuis1Ref = shift;
387              
388             #error checking
389 0           my $function = "_getN1p";
390 0 0 0       if(!defined $self || !ref $self) {
391 0           $errorhandler->_error($pkg, $function, "", 2);
392             }
393            
394             #build the query string for all where cui1's are in the first position
395 0           my $firstCui = shift @{$cuis1Ref};
  0            
396 0           my $queryString = "select SUM(n_11) from N_11 where (cui_1 = '$firstCui' ";
397 0           foreach my $cui (@{$cuis1Ref}) {
  0            
398 0           $queryString .= "or cui_1 = '$cui' ";
399             }
400 0           unshift @{$cuis1Ref}, $firstCui;
  0            
401              
402             #finalize the query string
403 0 0         if ($noOrder_G) {
404             #add where cui1 is in the second position
405 0           $firstCui = shift @{$cuis1Ref};
  0            
406 0           $queryString .= ") or (cui_2 = '$firstCui' ";
407 0           foreach my $cui (@{$cuis1Ref}) {
  0            
408 0           $queryString .= "or cui_2 = '$cui' ";
409             }
410 0           unshift @{$cuis1Ref}, $firstCui;
  0            
411             }
412 0           $queryString .= ");";
413              
414             #query the db to retrive n1p
415 0           my $n1p = shift @{$assocDB_G->selectcol_arrayref($queryString)};
  0            
416 0 0         if (!defined $n1p) {
417 0           $n1p = -1;
418             }
419 0           return $n1p;
420             }
421              
422             # Method to calculate npp from a DB
423             # input : none
424             # output: $npp
425             sub _getNpp_DB {
426 0     0     my $self = shift;
427            
428             #error checking
429 0           my $function = "getNpp_DB";
430 0 0 0       if(!defined $self || !ref $self) {
431 0           $errorhandler->_error($pkg, $function, "", 2);
432             }
433              
434             #get npp, the number of co-occurrences
435 0           my $npp = shift @{$assocDB_G->selectcol_arrayref("select sum(N_11) from N_11")};
  0            
436              
437             #update $npp for noOrder, since Cuis can be trailing or leading its 2x ordered npp
438 0 0         if ($noOrder_G) {
439 0           $npp *= 2;
440             }
441              
442             #return npp
443 0 0         if($npp <= 0) { $errorhandler->_error($pkg, $function, "", 5); }
  0            
444 0           return $npp;
445             }
446              
447             ########################################################################
448             # functions to get statistical information about the cuis using a matrix
449             ########################################################################
450              
451              
452             # Gets arrays of all first (leading) and second (trailing) cuis
453             # This is used when retreiving data from a matrix flat file
454             # input: $pairHashListRef - a ref to an array of pairHashes
455             # output: (\@cuis1, \@cuis2) - two array refs, the first contains
456             # all leading cuis in the dataset, the
457             # second contains all trailing cuis in
458             # the dataset.
459             sub _getAllLeadingAndTrailingCuis {
460 0     0     my $self = shift;
461 0           my $pairHashListRef = shift;
462              
463             #create a list of all possible cuis in the first and second positions
464 0           my @cuis1 = ();
465 0           my @cuis2 = ();
466 0           foreach my $pairHashRef(@{$pairHashListRef}) {
  0            
467 0           foreach my $cui(@{${$pairHashRef}{'set1'}}) {
  0            
  0            
468 0           push @cuis1, $cui;
469             }
470 0           foreach my $cui(@{${$pairHashRef}{'set2'}}) {
  0            
  0            
471 0           push @cuis2, $cui;
472             }
473             }
474 0           return (\@cuis1, \@cuis2);
475             }
476              
477              
478             # gets N11, N1P, NP1, NPP for a pairHashList using a matrix
479             # input : $pairHashListRef <- ref to a pairHashList
480             # output: $\@data <- array ref containing array refs of four values
481             # for each pair Hash, $n11, $n1p, $np1, and $npp
482             sub _getStats_matrix {
483             #grab parameters
484 0     0     my $self = shift;
485 0           my $pairHashListRef = shift;
486              
487             #error checking
488 0           my $function = "_getStats_matrix";
489 0 0 0       if(!defined $self || !ref $self) {
490 0           $errorhandler->_error($pkg, $function, "", 2);
491             }
492            
493             #get all observed counts for all possible cuis in the term pairs
494 0           (my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef);
495 0           my $countsRef = $self->_getObservedCounts_matrix($cuis1Ref, $cuis2Ref);
496 0           my $n11AllRef = ${$countsRef}[0];
  0            
497 0           my $n1pAllRef = ${$countsRef}[1];
  0            
498 0           my $np1AllRef = ${$countsRef}[2];
  0            
499 0           my $npp = ${$countsRef}[3];
  0            
500              
501             #update $npp for noOrder, since Cuis can be trailing or leading its 2x ordered npp
502 0 0         if ($noOrder_G) {
503 0           $npp *= 2;
504             }
505              
506             #get values for each pairHash based on what was retreived from the matrix
507 0           my @data = ();
508 0           foreach my $pairHashRef (@{$pairHashListRef}) {
  0            
509 0           my $n11 = $self->_getN11_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}, $n11AllRef);
  0            
  0            
510 0           my $n1p = $self->_getN1p_matrix(${$pairHashRef}{'set1'}, $n11AllRef, $n1pAllRef, $np1AllRef);
  0            
511 0           my $np1 = $self->_getNp1_matrix(${$pairHashRef}{'set2'}, $n11AllRef, $n1pAllRef, $np1AllRef);
  0            
512            
513 0           my @vals = ($n11, $n1p, $np1, $npp);
514 0           push @data, \@vals;
515             }
516            
517             #return the data
518 0           return \@data;
519             }
520              
521             #computes the observed counts for all combinations of the cuis passed in
522             #doing this in a single function makes it so all values can be computed with a
523             #single pass of the input file, making execution time much faster
524             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
525             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
526             # output: $\@counts <- array ref containing four sets of values:
527             # \%n11, \%n1p, \%np1, and $npp for the cui pairs
528             # hashes are indexed: $n11{"$cui1,$cui2"}, $n1p{$cui},
529             # $np1{$cui}
530             sub _getObservedCounts_matrix {
531 0     0     my $self = shift;
532 0           my $cuis1Ref = shift;
533 0           my $cuis2Ref = shift;
534              
535             #convert cui arrays to hashes, makes looping thru
536             # the file faster
537 0           my %cuis1 = ();
538 0           foreach my $cui(@{$cuis1Ref}) {
  0            
539 0           $cuis1{$cui} = 1;
540             }
541 0           my %cuis2 = ();
542 0           foreach my $cui(@{$cuis2Ref}) {
  0            
543 0           $cuis2{$cui} = 1;
544             }
545              
546             #precalculate values for all cuis and cui pairs
547 0           my %n11 = ();
548 0           my %n1p = ();
549 0           my %np1 = ();
550 0           my $npp = 0;
551 0 0         open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n";
552 0           while (my $line = ) {
553             #get cuis and value from the line
554 0           chomp $line;
555 0           my ($cui1, $cui2, $num) = split /\t/, $line;
556              
557             #record any occurrence of any cui1 or 2, in case order is ignored
558 0 0 0       if (exists $cuis1{$cui1} || exists $cuis1{$cui2}
      0        
      0        
559             || exists $cuis2{$cui1} || exists $cuis2{$cui2}) {
560 0           $n1p{$cui1} += $num;
561 0           $np1{$cui2} += $num;
562 0           $n11{"$cui1,$cui2"} = $num;
563             }
564              
565             #update npp
566 0           $npp += $num;
567             }
568 0           close IN;
569              
570             #return counts
571 0           my @counts = (\%n11, \%n1p, \%np1, $npp);
572 0           return \@counts;
573             }
574              
575             # Gets N11 of the cui pair using a matrix
576             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
577             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
578             # $n11AllRef <- ref to an array containing n11 values for all possible
579             # cui pairs of the cuis1 and cuis2, of the form
580             # n11All{"$cui1,$cui2"}=value. See _getObservedCounts_matrix
581             # output: $n11 <- frequency of co-occurrences of the cuis in the cui sets
582             sub _getN11_matrix {
583             #grab parameters
584 0     0     my $self = shift;
585 0           my $cuis1Ref = shift;
586 0           my $cuis2Ref = shift;
587 0           my $n11AllRef = shift;
588              
589             #error checking
590 0           my $function = "_getN11_matrix";
591 0 0 0       if(!defined $self || !ref $self) {
592 0           $errorhandler->_error($pkg, $function, "", 2);
593             }
594              
595             #calculate n11 as the sum n11s for all combinations of
596             # cuis1, cuis2 (order matters, cui1 must be first)
597 0           my $n11 = 0;
598 0           foreach my $cui1 (@{$cuis1Ref}) {
  0            
599 0           foreach my $cui2 (@{$cuis2Ref}) {
  0            
600 0           my $num = ${$n11AllRef}{"$cui1,$cui2"};
  0            
601 0 0         if(defined $num) {
602 0           $n11 += $num;
603             }
604             }
605             }
606              
607             #update values if ignoring word order
608 0 0         if($noOrder_G) {
609             #add all n11's, now with the order reversed
610 0           foreach my $cui1 (@{$cuis1Ref}) {
  0            
611 0           foreach my $cui2 (@{$cuis2Ref}) {
  0            
612 0           my $num = ${$n11AllRef}{"$cui2,$cui1"};
  0            
613 0 0         if(defined $num) {
614 0           $n11 += $num;
615             }
616             }
617             }
618             }
619              
620 0           return $n11;
621             }
622              
623             # gets N1P for a concept using a matrix
624             # input : $cuis1Ref <- reference to an array containing the first cuis in a set of cui pairs
625             # $countsRef <- ref to an array containing n11, n1p, np1, and npp counts
626             # for the cui combinations. See _getObservedCounts_matrix()
627             # $n1pAllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2,
628             # of the form n1pAll{$cui} = value. See _getObservedCounts_matrix
629             # $np1AllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2,
630             # of the form np1All{$cui} = value. See _getObservedCounts_matrix
631             # output: $n1p <- the number of times the set of concepts occurs in first position
632             sub _getN1p_matrix {
633             #grab parameters
634 0     0     my $self = shift;
635 0           my $cuis1Ref = shift;
636 0           my $n11AllRef = shift;
637 0           my $n1pAllRef = shift;
638 0           my $np1AllRef = shift;
639              
640             #error checking
641 0           my $function = "_getN1P_matrix";
642 0 0 0       if(!defined $self || !ref $self) {
643 0           $errorhandler->_error($pkg, $function, "", 2);
644             }
645            
646             #calculate n1p as the sum of n1p's for all cuis1
647 0           my $n1p = 0;
648 0           foreach my $cui (@{$cuis1Ref}) {
  0            
649 0           my $num = ${$n1pAllRef}{$cui};
  0            
650 0 0         if(defined $num) {
651 0           $n1p += $num;
652             }
653             }
654              
655             #update values if ignoring word order
656 0 0         if ($noOrder_G) {
657             #add all np1's to n1p
658 0           foreach my $cui (@{$cuis1Ref}) {
  0            
659 0           my $num = ${$np1AllRef}{$cui};
  0            
660 0 0         if(defined $num) {
661 0           $n1p += $num;
662             }
663             }
664              
665             #avoid double counting occurrences with self, subtract them
666 0           foreach my $cui1(@{$cuis1Ref}) {
  0            
667 0           foreach my $cui2(@{$cuis1Ref}) {
  0            
668 0           my $val = ${$n11AllRef}{"$cui1,$cui2"};
  0            
669 0 0         if (defined $val) {
670 0           $n1p -= $val;
671             }
672             }
673             }
674             }
675              
676             #set n1p to -1 if there are no values for it since this indicates
677             # there is not enough information to calculate the score
678 0 0         if ($n1p == 0) {
679 0           $n1p = -1;
680             }
681              
682             #return the value
683 0           return $n1p;
684             }
685              
686             # gets NP1 for a concept using a matrix
687             # input : $cuis2Ref <- reference to an array containing the first cuis in a set of cui pairs
688             # $countsRef <- ref to an array containing n11, n1p, np1, and npp counts
689             # for the cui combinations. See _getObservedCounts_matrix()
690             # $n1pAllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2,
691             # of the form n1pAll{$cui} = value. See _getObservedCounts_matrix
692             # $np1AllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2,
693             # of the form np1All{$cui} = value. See _getObservedCounts_matrix
694             # output: $np1 <- the number of times the set of concepts occurs in second position
695             sub _getNp1_matrix {
696             #grab parameters
697 0     0     my $self = shift;
698 0           my $cuis2Ref = shift;
699 0           my $n11AllRef = shift;
700 0           my $n1pAllRef = shift;
701 0           my $np1AllRef = shift;
702              
703             #calculate np1 as the sum of np1's for all cuis2
704 0           my $np1 = 0;
705 0           foreach my $cui (@{$cuis2Ref}) {
  0            
706 0           my $num = ${$np1AllRef}{$cui};
  0            
707 0 0         if (defined $num) {
708 0           $np1 += $num;
709             }
710             }
711              
712             #update values if ignoring word order
713 0 0         if ($noOrder_G) {
714             #add all n1p's to np1s
715 0           foreach my $cui (@{$cuis2Ref}) {
  0            
716 0           my $num = ${$n1pAllRef}{$cui};
  0            
717 0 0         if (defined $num) {
718 0           $np1 += $num;
719             }
720             }
721              
722             #avoid double counting occurrences with self, subtract them
723 0           foreach my $cui1(@{$cuis2Ref}) {
  0            
724 0           foreach my $cui2(@{$cuis2Ref}) {
  0            
725 0           my $val = ${$n11AllRef}{"$cui1,$cui2"};
  0            
726 0 0         if (defined $val) {
727 0           $np1 -= $val;
728             }
729             }
730             }
731             }
732              
733             #set n1p to -1 if there are no values for it since this indicates
734             # there is not enough information to calculate the score
735 0 0         if ($np1 == 0) {
736 0           $np1 = -1;
737             }
738              
739             #return the value
740 0           return $np1;
741             }
742              
743              
744             ########################################################################
745             # functions to get statistical information about the cuis LTA, MWA, VSA
746             ########################################################################
747             # Gets contingency table values for Linking Term Association (LTA)
748             # input : $pairHashListRef <- ref to a pairHashList
749             # output: $\@data <- valuesarray ref containing array refs of four values
750             # for each pairHash in the pairHash list. The
751             # values are $n11, $n1p, $np1, and $npp
752             sub _getStats_LTA {
753             #grab parameters
754 0     0     my $self = shift;
755 0           my $pairHashListRef = shift;
756            
757             #error checking
758 0           my $function = "_getStats_LTA";
759 0 0 0       if(!defined $self || !ref $self) {
760 0           $errorhandler->_error($pkg, $function, "", 2);
761             }
762             #get data from the matrix
763 0           (my $cooccurrences1ListRef, my $cooccurrences2ListRef,
764             my $numCooccurrences, my $numUniqueCuis)
765             = $self->_readMatrixValues_Linking($pairHashListRef);
766              
767             #for LTA, npp= num unique cuis in the dataset
768 0           my $npp = $numUniqueCuis;
769              
770             #calculate stats for each pairHash based on the co-occurrences data
771 0           my @data = ();
772 0           for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  0            
773            
774             #calculate n1p and np1 as the number of co-occurring terms
775 0           my $n1p = scalar keys %{${$cooccurrences1ListRef}[$i]};
  0            
  0            
776 0           my $np1 = scalar keys %{${$cooccurrences2ListRef}[$i]};
  0            
  0            
777              
778             #calculate n11
779 0           my $n11 = 0;
780             #Find number of CUIs that co-occur with both CUI 1 and CUI 2
781 0           foreach my $cui (keys %{${$cooccurrences1ListRef}[$i]}) {
  0            
  0            
782 0 0         if (exists ${${$cooccurrences2ListRef}[$i]}{$cui}) {
  0            
  0            
783 0           $n11++;
784             }
785             }
786              
787             #store the data for this pairHash
788 0           my @vals = ($n11, $n1p, $np1, $npp);
789 0           push @data, \@vals;
790             }
791              
792             #return the data
793 0           return \@data;
794             }
795              
796              
797             # Gets contingency table values for Minimum Weight Association (MWA)
798             # input : $pairHashListRef <- ref to a pairHashList
799             # output: $\@data <- array ref containing array refs of four values
800             # for each pairHash in the pairHash list. The
801             # values are $n11, $n1p, $np1, and $npp
802             sub _getStats_MWA {
803             #grab parameters
804 0     0     my $self = shift;
805 0           my $pairHashListRef = shift;
806            
807             #error checking
808 0           my $function = "_getStats_MWA";
809 0 0 0       if(!defined $self || !ref $self) {
810 0           $errorhandler->_error($pkg, $function, "", 2);
811             }
812            
813             #get data from the matrix
814 0           (my $cooccurrences1ListRef, my $cooccurrences2ListRef,
815             my $numCooccurrences, my $numUniqueCuis)
816             = $self->_readMatrixValues_Linking($pairHashListRef);
817              
818             #for MWA, npp= numCooccurrences in the dataset
819 0           my $npp = $numCooccurrences;
820              
821             #calculate stats for each pairHash based on the co-occurrences data
822 0           my @data = ();
823 0           for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  0            
824 0           my $set1CoRef = ${$cooccurrences1ListRef}[$i];
  0            
825 0           my $set2CoRef = ${$cooccurrences2ListRef}[$i];
  0            
826              
827             #calculate n1p and np1 as the number of co-occurrences for the term
828 0           my $n1p = 0;
829 0           foreach my $cui (keys %{$set1CoRef}) {
  0            
830 0           $n1p += ${$set1CoRef}{$cui};
  0            
831             }
832 0           my $np1 = 0;
833 0           foreach my $cui (keys %{$set2CoRef}) {
  0            
834 0           $np1 += ${$set2CoRef}{$cui};
  0            
835             }
836              
837             #Find $n11, the min co-occurrence value of the pair
838 0           my $n11 = 0;
839 0           foreach my $cui (keys %{$set1CoRef}) {
  0            
840             #if this cui co-occurs with both sets, then increment n11
841 0 0         if (exists ${$set2CoRef}{$cui}) {
  0            
842             #increment n11 by the minimum of the co-occurrences
843 0           my $min = ${$set1CoRef}{$cui};
  0            
844 0 0         if (${$set2CoRef}{$cui} < $min) {
  0            
845 0           $min = ${$set2CoRef}{$cui};
  0            
846             }
847 0           $n11+=$min;
848             }
849             }
850              
851             #store the data for this pairHash
852 0           my @vals = ($n11, $n1p, $np1, $npp);
853 0           push @data, \@vals;
854             }
855              
856             #return the data
857 0           return \@data;
858             }
859              
860              
861             # Gets contingency table values for Vector Set Association (VSA)
862             # input : $pairHashListRef <- ref to a pairHashList
863             # output: $\@data <- array ref containing array refs of four values
864             # for each pairHash in the pairHash list. The
865             # values are $n11, $n1p, $np1, and $npp
866             sub _getStats_VSA {
867             #grab parameters
868 0     0     my $self = shift;
869 0           my $pairHashListRef = shift;
870            
871             #error checking
872 0           my $function = "_getStats_VSA";
873 0 0 0       if(!defined $self || !ref $self) {
874 0           $errorhandler->_error($pkg, $function, "", 2);
875             }
876             #get data from the matrix
877 0           (my $cooccurrences1ListRef, my $cooccurrences2ListRef,
878             my $numCooccurrences, my $numUniqueCuis)
879             = $self->_readMatrixValues_Linking($pairHashListRef);
880              
881             #convert the cooccurrence lists to pairHashLists
882 0           my @newPairHashList = ();
883 0           for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
  0            
884 0           my %pairHash = ();
885            
886             #make set 1 an array
887 0           my @set1 = ();
888 0           foreach my $key (keys %{${$cooccurrences1ListRef}[$i]}) {
  0            
  0            
889 0           push @set1, $key;
890             }
891 0           $pairHash{'set1'} = \@set1;
892              
893             #make set 2 an array
894 0           my @set2 = ();
895 0           foreach my $key (keys %{${$cooccurrences2ListRef}[$i]}) {
  0            
  0            
896 0           push @set2, $key;
897             }
898 0           $pairHash{'set2'} = \@set2;
899              
900             #add the pairHash to the pairHashList
901 0           push @newPairHashList, \%pairHash;
902             }
903             #So, at this point we have converted the sets of B terms
904             # into a pairhashlist.
905             #Next we find the stats for each of those pair hashes and
906             # use that as the stats for the original pair.
907             # in this way we are finding the assocaition between
908             # sets of co-occurring terms of the original terms
909 0           my $allStatsRef;
910 0 0         if ($matrix_G) {
911 0           $allStatsRef = $self->_getStats_matrix(\@newPairHashList);
912             }
913             else {
914 0           $allStatsRef = $self->_getStats_DB(\@newPairHashList);
915             }
916             #all stats ref contains n11, np1, n1p, and npp for
917             # each of the pair hashes
918 0           return $allStatsRef;
919             }
920              
921              
922              
923             # Gets co-occurrence data for each of the pairHashes in the pairHashList
924             # and gets global stats, total number of co-occurrences in the dataset,
925             # and the number of unique cuis in the dataset. The co-occurrences data
926             # is returned in the form of a co-occurrences hash for cuis1 and cuis2
927             # of the pairHash. Each co-occurrences hash is:
928             # $cooccurrences1{$cui2} = $val
929             # There is no distinction between different cuis of cuis1
930             # input : $pairHashListRef <- ref to a pairHashList
931             # output: $\@data <- array ref containing array refs of four values
932             sub _readMatrixValues_Linking {
933             #grab parameters
934 0     0     my $self = shift;
935 0           my $pairHashListRef = shift;
936            
937             #error checking
938 0           my $function = "_readMatrixValues_Linking";
939 0 0 0       if(!defined $self || !ref $self) {
940 0           $errorhandler->_error($pkg, $function, "", 2);
941             }
942              
943             #Get co-occurrences with each set of CUIs
944             # for each set of cuis we find a list of cuis that co-occur with that set
945             # this is done for cuis1 and cuis2. Once retreiving these two lists
946             # of co-occurring cuis, we can calculate LTA based on the overlap of
947             # co-occurrences.
948 0           my @cooccurrences1List;
949             my @cooccurrences2List;
950 0           my $totalCooccurrences = 0;
951 0           my $totalUniqueCuis = 0;
952 0 0         if ($matrix_G) {
953             #get observed counts for all data
954 0           (my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef);
955 0           (my $n1pAllRef, my $np1AllRef, $totalCooccurrences, $totalUniqueCuis)
956             = $self->_getObserved_matrix_Linking($cuis1Ref, $cuis2Ref);
957              
958             #get co-occurrence data for each pairHash
959 0           foreach my $pairHashRef(@{$pairHashListRef}) {
  0            
960             (my $cooccurrences1Ref, my $cooccurrences2Ref) = $self
961 0           ->_getCUICooccurrences_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'},
  0            
  0            
962             $n1pAllRef, $np1AllRef);
963              
964 0           push @cooccurrences1List, $cooccurrences1Ref;
965 0           push @cooccurrences2List, $cooccurrences2Ref;
966             }
967             }
968             else {
969             #get total co-occurrences and total unique cuis
970 0           $totalCooccurrences = $self->_getNpp_DB();
971              
972             #get npp, the number of unique cuis
973             #TODO, query is slightly wrong. If the there are cuis that occur in the second position ONLY this will be wrong. I need to merge the CUI 1 and CUI2 tables then select distinct elements
974 0           $totalUniqueCuis = shift @{$assocDB_G->selectcol_arrayref("SELECT COUNT(cui_1) FROM (SELECT DISTINCT cui_1 FROM N_11) AS names")};
  0            
975              
976             #TODO, check this with MWA now ...will need to code it
977             #get co-occurrence data for each pair hash
978 0           foreach my $pairHashRef(@{$pairHashListRef}) {
  0            
979             (my $cooccurrences1Ref, my $cooccurrences2Ref) = $self
980 0           ->_getCUICooccurrences_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'});
  0            
  0            
981 0           push @cooccurrences1List, $cooccurrences1Ref;
982 0           push @cooccurrences2List, $cooccurrences2Ref;
983             }
984             }
985              
986 0           return (\@cooccurrences1List, \@cooccurrences2List, $totalCooccurrences, $totalUniqueCuis);
987             }
988              
989              
990             # computes the observed co-occurrences for all combinations of the cuis passed in
991             # doing this in a single function makes it so all values can be computed with a
992             # single pass of the input file, making execution time much faster
993             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
994             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
995             # output: $n1pAllRef <- a ref to a hash of hashes that contains co-occurence
996             # data organized as:
997             # matrix{leadingCUI}{trailingCUI} = cooccurrencecount
998             # $np1AllRef <- a ref to a hash of hashes that contains co-occurence
999             # data organized as:
1000             # matrix{trailingCUI}{leadingCUI} = cooccurrencecount
1001             # $cooccurrenceCount <- the total number of co-occurrences in
1002             # the dataset
1003             # $numUniquCuis <- the number of unique cuis in the dataset
1004             sub _getObserved_matrix_Linking {
1005             #grab parameters
1006 0     0     my $self = shift;
1007 0           my $cuis1Ref = shift;
1008 0           my $cuis2Ref = shift;
1009              
1010             #convert cui arrays to hashes, makes looping thru
1011             # the file faster
1012 0           my %cuis1 = ();
1013 0           foreach my $cui(@{$cuis1Ref}) {
  0            
1014 0           $cuis1{$cui} = 1;
1015             }
1016 0           my %cuis2 = ();
1017 0           foreach my $cui(@{$cuis2Ref}) {
  0            
1018 0           $cuis2{$cui} = 1;
1019             }
1020              
1021             #get stats
1022 0           my %n1pAll = ();
1023 0           my %np1All = ();
1024 0           my %uniqueCuis = ();
1025 0           my $cooccurrenceCount = 0;
1026 0 0         open IN, $matrix_G or die "Cannot open matrix_G for input: $matrix_G\n";
1027 0           while (my $line = ) {
1028             #get cuis and value fro mthe line
1029 0           chomp $line;
1030 0           my ($cui1, $cui2, $num) = split /\t/, $line;
1031              
1032             #update unique cui lists
1033 0           $uniqueCuis{$cui1} = 1;
1034 0           $uniqueCuis{$cui2} = 1;
1035              
1036             #update co-occurrence count
1037 0           $cooccurrenceCount += $num;
1038              
1039             #update n1pAll and np1All. These just record data
1040             # so we record any possible co-occurrence that matters
1041             # with or without order mattering so just check
1042             # if a CUI of interest is anywhere on the line
1043 0 0 0       if (exists $cuis1{$cui1} || exists $cuis2{$cui2}
      0        
      0        
1044             || exists $cuis1{$cui2} || exists $cuis2{$cui1}) {
1045              
1046             #update n1pAll
1047             #create n1p{$cui1} hash if needed
1048 0 0         if (!defined $n1pAll{$cui1}) {
1049 0           my %newHash = ();
1050 0           $n1pAll{$cui1} = \%newHash;
1051             }
1052              
1053             #add cui2 and value
1054 0           ${$n1pAll{$cui1}}{$cui2} = $num;
  0            
1055              
1056             #update np1All
1057             #create np1{$cui2} hash if needed
1058 0 0         if (!defined $np1All{$cui2}) {
1059 0           my %newHash = ();
1060 0           $np1All{$cui2} = \%newHash;
1061             }
1062              
1063             #add cui1 and value
1064 0           ${$np1All{$cui2}}{$cui1} = $num;
  0            
1065              
1066             }
1067             }
1068 0           close IN;
1069              
1070             #return the observed values
1071 0           return (\%n1pAll, \%np1All, $cooccurrenceCount, (scalar keys %uniqueCuis));
1072             }
1073              
1074              
1075             # Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using
1076             # a matrix. This is the first step in computing linking term associations
1077             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
1078             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
1079             # $n1pAllRef <- a ref to a hash of hashes that contains co-occurence
1080             # data organized as:
1081             # matrix{leadingCUI}{trailingCUI} = cooccurrencecount
1082             # $np1AllRef <- a ref to a hash of hashes that contains co-occurence
1083             # data organized as:
1084             # matrix{trailingCUI}{leadingCUI} = cooccurrencecount
1085             # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1,
1086             # values are the co-occurrence count
1087             # \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2,
1088             # values are the co-occurrence count
1089             sub _getCUICooccurrences_matrix {
1090             #grab parameters
1091 0     0     my $self = shift;
1092 0           my $cuis1Ref = shift;
1093 0           my $cuis2Ref = shift;
1094 0           my $n1pAllRef = shift;
1095 0           my $np1AllRef = shift;
1096              
1097             #error checking
1098 0           my $function = "_getCUICooccurrences";
1099 0 0 0       if(!defined $self || !ref $self) {
1100 0           $errorhandler->_error($pkg, $function, "", 2);
1101             }
1102              
1103             #get lists of explicitly co-occurring CUIs for each concept
1104             #add trailing cui co-occurrences to cui1Data
1105 0           my %cooccurrences1;
1106 0           foreach my $cui1 (@{$cuis1Ref}){
  0            
1107 0 0         if (defined ${$n1pAllRef}{$cui1}) {
  0            
1108 0           foreach my $cui2 (keys %{${$n1pAllRef}{$cui1}}) {
  0            
  0            
1109 0           $cooccurrences1{$cui2} = ${${$n1pAllRef}{$cui1}}{$cui2};
  0            
  0            
1110             }
1111             }
1112             }
1113              
1114             #add leading cui co-occurrences to cui2Data
1115 0           my %cooccurrences2;
1116 0           foreach my $cui2 (@{$cuis2Ref}){
  0            
1117 0 0         if (defined ${$np1AllRef}{$cui2}) {
  0            
1118 0           foreach my $cui1 (keys %{${$np1AllRef}{$cui2}}) {
  0            
  0            
1119 0           $cooccurrences2{$cui1} = ${${$np1AllRef}{$cui2}}{$cui1};
  0            
  0            
1120             }
1121             }
1122             }
1123            
1124             #add more CUIs if order doesn't matter
1125 0 0         if ($noOrder_G) {
1126             #add leading cui co-occurrences to cui1Data
1127 0           foreach my $cui1 (@{$cuis1Ref}){
  0            
1128 0 0         if (defined ${$np1AllRef}{$cui1}) {
  0            
1129 0           foreach my $cui2 (keys %{${$np1AllRef}{$cui1}}) {
  0            
  0            
1130 0           $cooccurrences1{$cui2} = ${${$np1AllRef}{$cui1}}{$cui2};
  0            
  0            
1131             }
1132             }
1133             }
1134             #add trailing cui co-occurrences to cui2Data
1135 0           foreach my $cui2 (@{$cuis2Ref}){
  0            
1136 0 0         if (defined ${$n1pAllRef}{$cui2}) {
  0            
1137 0           foreach my $cui1 (keys %{${$n1pAllRef}{$cui2}}) {
  0            
  0            
1138 0           $cooccurrences2{$cui1} = ${${$n1pAllRef}{$cui2}}{$cui1};
  0            
  0            
1139             }
1140             }
1141             }
1142             }
1143            
1144 0           return (\%cooccurrences1, \%cooccurrences2);
1145             }
1146              
1147              
1148             # Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using
1149             # a database. This is the first step in computing linking term associations
1150             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
1151             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
1152             # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1,
1153             # values are 1
1154             # \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2,
1155             # values are 1
1156             sub _getCUICooccurrences_DB {
1157             #grab parameters
1158 0     0     my $self = shift;
1159 0           my $cuis1Ref = shift;
1160 0           my $cuis2Ref = shift;
1161            
1162             #error checking
1163 0           my $function = "_getCUICooccurrences_DB";
1164 0 0 0       if(!defined $self || !ref $self) {
1165 0           $errorhandler->_error($pkg, $function, "", 2);
1166             }
1167              
1168             #get hashes of co-occurring CUIs
1169 0           my %cooccurrences1 = ();
1170 0           my %cooccurrences2 = ();
1171              
1172             #query DB to get cuis, where concept 1 is the leading cui
1173 0           my $firstCui = shift @{$cuis1Ref};
  0            
1174 0           my $query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' ";
1175 0           foreach my $cui (@{$cuis1Ref}) {
  0            
1176 0           $query .= "OR N_11.cui_1 = '$cui' ";
1177             }
1178 0           $query .= ") AND N_11.n_11 > 0;";
1179 0           my @cuis = @{$assocDB_G->selectcol_arrayref($query)};
  0            
1180 0           unshift @{$cuis1Ref}, $firstCui;
  0            
1181              
1182             #turn CUIs into a hash of cui1's cooccurrences
1183 0           foreach my $cui (@cuis) {
1184 0           $cooccurrences1{$cui} = 1;
1185             }
1186              
1187             #query DB to get cuis, where concept 2 is the trailing cui
1188 0           $firstCui = shift @{$cuis2Ref};
  0            
1189 0           $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' ";
1190 0           foreach my $cui (@{$cuis2Ref}) {
  0            
1191 0           $query .= "OR N_11.cui_2 = '$cui' ";
1192             }
1193 0           $query .= ") AND N_11.n_11 > 0;";
1194 0           @cuis = @{$assocDB_G->selectcol_arrayref($query)};
  0            
1195 0           unshift @{$cuis2Ref}, $firstCui;
  0            
1196              
1197             #turn CUIs into a hash of cui2's co-occurrences
1198 0           foreach my $cui (@cuis) {
1199 0           $cooccurrences2{$cui} = 1;
1200             }
1201              
1202             #add additional cuis if order doesn't matter
1203 0 0         if($noOrder_G) {
1204             #get cuis, where concept 1 is the trailing cui
1205 0           $firstCui = shift @{$cuis1Ref};
  0            
1206 0           my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' ";
1207 0           foreach my $cui (@{$cuis1Ref}) {
  0            
1208 0           $query .= "OR N_11.cui_2 = '$cui' ";
1209             }
1210 0           $query .= ") AND N_11.n_11 > 0;";
1211 0           @cuis = @{$assocDB_G->selectcol_arrayref($query)};
  0            
1212 0           unshift @{$cuis1Ref}, $firstCui;
  0            
1213              
1214             #add cuis to the hash of cui1's co-occurrences
1215 0           foreach my $cui (@cuis) {
1216 0           $cooccurrences1{$cui} = 1;
1217             }
1218              
1219             #get cuis, where concept 2 is the leading cui
1220 0           $firstCui = shift @{$cuis2Ref};
  0            
1221 0           $query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' ";
1222 0           foreach my $cui (@{$cuis2Ref}) {
  0            
1223 0           $query .= "OR N_11.cui_1 = '$cui' ";
1224             }
1225 0           $query .= ") AND N_11.n_11 > 0;";
1226 0           @cuis = @{$assocDB_G->selectcol_arrayref($query)};
  0            
1227 0           unshift @{$cuis2Ref}, $firstCui;
  0            
1228              
1229             #add cuis to the hash of cui2's co-occurrences
1230 0           foreach my $cui (@cuis) {
1231 0           $cooccurrences2{$cui} = 1;
1232             }
1233             }
1234              
1235             #return the cui co-occurrences
1236 0           return (\%cooccurrences1, \%cooccurrences2);
1237             }
1238              
1239              
1240             =comment
1241             # Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using
1242             # a database. This is the first step in computing linking term associations
1243             # input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
1244             # $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
1245             # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1,
1246             # values are 1
1247             # \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2,
1248             # values are 1
1249             sub _getCUICooccurrences_DB {
1250             #grab parameters
1251             my $self = shift;
1252             my $cuis1Ref = shift;
1253             my $cuis2Ref = shift;
1254            
1255             #error checking
1256             my $function = "_getCUICooccurrences_DB";
1257             if(!defined $self || !ref $self) {
1258             $errorhandler->_error($pkg, $function, "", 2);
1259             }
1260              
1261             #get hashes of co-occurring CUIs
1262             my %cooccurrences1 = ();
1263             my %cooccurrences2 = ();
1264              
1265             #query DB to get cuis, where concept 1 is the leading cui
1266             my $firstCui = shift @{$cuis1Ref};
1267             my $query = "SELECT * FROM N_11 WHERE (N_11.cui_1 = '$firstCui' ";
1268             foreach my $cui (@{$cuis1Ref}) {
1269             $query .= "OR N_11.cui_1 = '$cui' ";
1270             }
1271             $query .= ") AND N_11.n_11 > 0;";
1272             my $sth = $assocDB_G->prepare($query);
1273             $sth->execute();
1274             my @rows = @{$sth->fetchall_arrayref()};
1275             unshift @{$cuis1Ref}, $firstCui;
1276              
1277             #turn CUIs into a hash of cui1's cooccurrences
1278             foreach my $rowRef (@rows) {
1279             print STDERR join(' ', @{$rowRef})."\n";
1280             }
1281             #TODO - this is done, it works ... it gets back the whole relevant table. Now fill up as needed.
1282              
1283              
1284             my @cuis;
1285             #query DB to get cuis, where concept 2 is the trailing cui
1286             $firstCui = shift @{$cuis2Ref};
1287             $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' ";
1288             foreach my $cui (@{$cuis2Ref}) {
1289             $query .= "OR N_11.cui_2 = '$cui' ";
1290             }
1291             $query .= ") AND N_11.n_11 > 0;";
1292             @cuis = @{$assocDB_G->selectcol_arrayref($query)};
1293             unshift @{$cuis2Ref}, $firstCui;
1294              
1295             #turn CUIs into a hash of cui2's co-occurrences
1296             foreach my $cui (@cuis) {
1297             $cooccurrences2{$cui} = 1;
1298             }
1299              
1300             #add additional cuis if order doesn't matter
1301             if($noOrder_G) {
1302             #get cuis, where concept 1 is the trailing cui
1303             $firstCui = shift @{$cuis1Ref};
1304             my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' ";
1305             foreach my $cui (@{$cuis1Ref}) {
1306             $query .= "OR N_11.cui_2 = '$cui' ";
1307             }
1308             $query .= ") AND N_11.n_11 > 0;";
1309             @cuis = @{$assocDB_G->selectcol_arrayref($query)};
1310             unshift @{$cuis1Ref}, $firstCui;
1311              
1312             #add cuis to the hash of cui1's co-occurrences
1313             foreach my $cui (@cuis) {
1314             $cooccurrences1{$cui} = 1;
1315             }
1316              
1317             #get cuis, where concept 2 is the leading cui
1318             $firstCui = shift @{$cuis2Ref};
1319             $query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' ";
1320             foreach my $cui (@{$cuis2Ref}) {
1321             $query .= "OR N_11.cui_1 = '$cui' ";
1322             }
1323             $query .= ") AND N_11.n_11 > 0;";
1324             @cuis = @{$assocDB_G->selectcol_arrayref($query)};
1325             unshift @{$cuis2Ref}, $firstCui;
1326              
1327             #add cuis to the hash of cui2's co-occurrences
1328             foreach my $cui (@cuis) {
1329             $cooccurrences2{$cui} = 1;
1330             }
1331             }
1332              
1333             #return the cui co-occurrences
1334             return (\%cooccurrences1, \%cooccurrences2);
1335             }
1336             =cut
1337              
1338             1;
1339              
1340             __END__