| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #UMLS::Association | 
| 2 |  |  |  |  |  |  | # | 
| 3 |  |  |  |  |  |  | # Perl module for scoring the semantic association of terms in the Unified | 
| 4 |  |  |  |  |  |  | # Medical Language System (UMLS). | 
| 5 |  |  |  |  |  |  | # | 
| 6 |  |  |  |  |  |  | # Copyright (c) 2015 | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # Bridget T. McInnes, Virginia Commonwealth University | 
| 9 |  |  |  |  |  |  | # btmcinnes at vcu.edu | 
| 10 |  |  |  |  |  |  | # | 
| 11 |  |  |  |  |  |  | # Keith Herbert, Virginia Commonwealth University | 
| 12 |  |  |  |  |  |  | # herbertkb at vcu.edu | 
| 13 |  |  |  |  |  |  | # | 
| 14 |  |  |  |  |  |  | # Alexander D. McQuilkin, Virginia Commonwealth University | 
| 15 |  |  |  |  |  |  | # alexmcq99 at yahoo.com | 
| 16 |  |  |  |  |  |  | # | 
| 17 |  |  |  |  |  |  | # Sam Henry, Virginia Commonwealth University | 
| 18 |  |  |  |  |  |  | # henryst at vcu.edu | 
| 19 |  |  |  |  |  |  | # | 
| 20 |  |  |  |  |  |  | # This program is free software; you can redistribute it and/or | 
| 21 |  |  |  |  |  |  | # modify it under the terms of the GNU General Public License | 
| 22 |  |  |  |  |  |  | # as published by the Free Software Foundation; either version 2 | 
| 23 |  |  |  |  |  |  | # of the License, or (at your option) any later version. | 
| 24 |  |  |  |  |  |  | # | 
| 25 |  |  |  |  |  |  | # This program is distributed in the hope that it will be useful, | 
| 26 |  |  |  |  |  |  | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 27 |  |  |  |  |  |  | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 28 |  |  |  |  |  |  | # GNU General Public License for more details. | 
| 29 |  |  |  |  |  |  | # | 
| 30 |  |  |  |  |  |  | # You should have received a copy of the GNU General Public License | 
| 31 |  |  |  |  |  |  | # along with this program; if not, write to | 
| 32 |  |  |  |  |  |  | # | 
| 33 |  |  |  |  |  |  | # The Free Software Foundation, Inc., | 
| 34 |  |  |  |  |  |  | # 59 Temple Place - Suite 330, | 
| 35 |  |  |  |  |  |  | # Boston, MA  02111-1307, USA. | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | package UMLS::Association::StatFinder; | 
| 38 |  |  |  |  |  |  |  | 
| 39 | 1 |  |  | 1 |  | 7 | use Fcntl; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 214 |  | 
| 40 | 1 |  |  | 1 |  | 6 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 19 |  | 
| 41 | 1 |  |  | 1 |  | 4 | use warnings; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 42 | 1 |  |  | 1 |  | 4 | use DBI; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 30 |  | 
| 43 | 1 |  |  | 1 |  | 5 | use bytes; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 4 |  | 
| 44 | 1 |  |  | 1 |  | 18 | use File::Spec; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 5346 |  | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | #  error handling variables | 
| 47 |  |  |  |  |  |  | my $errorhandler = ""; | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | my $pkg = "UMLS::Association::StatFinder"; | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | #  debug variables | 
| 52 |  |  |  |  |  |  | #local(*DEBUG_FILE); | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | #NOTE: every global variable name ends in _G, with the | 
| 55 |  |  |  |  |  |  | # exception of debug, the error handler, and constants, which are all caps | 
| 56 |  |  |  |  |  |  | #  global variables | 
| 57 |  |  |  |  |  |  | my $debug     = 0; #in debug mode or not | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | #global options variables | 
| 60 |  |  |  |  |  |  | my $assocDB_G; | 
| 61 |  |  |  |  |  |  | my $lta_G = 0; #1 or 0 if using lta or not | 
| 62 |  |  |  |  |  |  | my $mwa_G = 0; #1 or 0 if using mwa or not | 
| 63 |  |  |  |  |  |  | my $vsa_G = 0; #1 or 0 if using vsa or not | 
| 64 |  |  |  |  |  |  | my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not | 
| 65 |  |  |  |  |  |  | my $matrix_G = 0; #matrix file name if using a matrix file rather than DB | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | ###################################################################### | 
| 68 |  |  |  |  |  |  | #                 Initialization Functions | 
| 69 |  |  |  |  |  |  | ###################################################################### | 
| 70 |  |  |  |  |  |  | #  method to create a new UMLS::Association::StatFinder object | 
| 71 |  |  |  |  |  |  | #  input : $params <- reference to hash of database parameters | 
| 72 |  |  |  |  |  |  | #  output: $self | 
| 73 |  |  |  |  |  |  | sub new { | 
| 74 |  |  |  |  |  |  | #grab params and create self | 
| 75 | 1 |  |  | 1 | 0 | 2 | my $self = {}; | 
| 76 | 1 |  |  |  |  | 2 | my $className = shift; | 
| 77 | 1 |  |  |  |  | 2 | my $params = shift; | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | #bless the object. | 
| 80 | 1 |  |  |  |  | 2 | bless($self, $className); | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | #initialize error handler | 
| 83 | 1 |  |  |  |  | 5 | $errorhandler = UMLS::Association::ErrorHandler->new(); | 
| 84 | 1 | 50 |  |  |  | 13 | if(! defined $errorhandler) { | 
| 85 | 0 |  |  |  |  | 0 | print STDERR "The error handler did not get passed properly.\n"; | 
| 86 | 0 |  |  |  |  | 0 | exit; | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | # initialize the object. | 
| 90 | 1 |  |  |  |  | 3 | $debug = 0; | 
| 91 | 1 |  |  |  |  | 8 | $self->_initialize($params); | 
| 92 | 1 |  |  |  |  | 3 | return $self; | 
| 93 |  |  |  |  |  |  | } | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | #  method to initialize the UMLS::Association::StatFinder object. | 
| 96 |  |  |  |  |  |  | #  input : $parameters <- reference to a hash of database parameters | 
| 97 |  |  |  |  |  |  | #  output: none, but $self is initialized | 
| 98 |  |  |  |  |  |  | sub _initialize { | 
| 99 |  |  |  |  |  |  | #grab parameters | 
| 100 | 1 |  |  | 1 |  | 4 | my $self = shift; | 
| 101 | 1 |  |  |  |  | 3 | my $paramsRef = shift; | 
| 102 | 1 |  |  |  |  | 4 | my %params = %{$paramsRef}; | 
|  | 1 |  |  |  |  | 7 |  | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | #set global variables using option hash | 
| 105 | 1 |  |  |  |  | 5 | $lta_G = $params{'lta'}; | 
| 106 | 1 |  |  |  |  | 3 | $mwa_G = $params{'mwa'}; | 
| 107 | 1 |  |  |  |  | 3 | $vsa_G = $params{'vsa'}; | 
| 108 | 1 |  |  |  |  | 4 | $noOrder_G = $params{'noorder'}; | 
| 109 | 1 |  |  |  |  | 4 | $matrix_G = $params{'matrix'}; | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | #connect to the database of association scores | 
| 112 | 1 | 50 |  |  |  | 4 | if (!$matrix_G) { | 
| 113 | 0 |  |  |  |  | 0 | $self->_setDatabase($paramsRef); | 
| 114 |  |  |  |  |  |  | } | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | #error checking | 
| 117 | 1 |  |  |  |  | 2 | my $function = "_initialize"; | 
| 118 | 1 |  |  |  |  | 4 | &_debug($function); | 
| 119 | 1 | 50 | 33 |  |  | 5 | if(!defined $self || !ref $self) { | 
| 120 | 0 |  |  |  |  | 0 | $errorhandler->_error($pkg, $function, "", 2); | 
| 121 |  |  |  |  |  |  | } | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | #TODO, remove this once I have DB implemented | 
| 124 |  |  |  |  |  |  | #check that a matrix is specified for options (need to implement DB mode) | 
| 125 | 1 | 50 | 33 |  |  | 11 | if (!$matrix_G && $mwa_G) { | 
| 126 | 0 |  |  |  |  | 0 | $errorhandler->_error($pkg, $function, "MWA requires the --matrix option", 12); | 
| 127 |  |  |  |  |  |  | } | 
| 128 | 1 | 50 | 33 |  |  | 6 | if (!$matrix_G && $vsa_G) { | 
| 129 | 0 |  |  |  |  | 0 | $errorhandler->_error($pkg, $function, "VSA requires the --matrix option", 12); | 
| 130 |  |  |  |  |  |  | } | 
| 131 |  |  |  |  |  |  | } | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | sub _debug { | 
| 134 | 1 |  |  | 1 |  | 2 | my $function = shift; | 
| 135 | 1 | 50 |  |  |  | 3 | if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; } | 
|  | 0 |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | } | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | #  method to set the association database | 
| 139 |  |  |  |  |  |  | #  input : $params <- reference to a hash | 
| 140 |  |  |  |  |  |  | #  output: none, but association database is set and initialized | 
| 141 |  |  |  |  |  |  | sub _setDatabase  { | 
| 142 | 0 |  |  | 0 |  |  | my $self   = shift; | 
| 143 | 0 |  |  |  |  |  | my $params = shift; | 
| 144 |  |  |  |  |  |  |  | 
| 145 | 0 |  |  |  |  |  | my $function = "_setDatabase"; | 
| 146 | 0 |  |  |  |  |  | &_debug($function); | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | #  check self | 
| 149 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 150 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | #  check the params | 
| 154 | 0 | 0 |  |  |  |  | $params = {} if(!defined $params); | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | #  get the database connection parameters | 
| 157 | 0 |  |  |  |  |  | my $database     = $params->{'database'}; | 
| 158 | 0 |  |  |  |  |  | my $hostname     = $params->{'hostname'}; | 
| 159 | 0 |  |  |  |  |  | my $socket       = $params->{'socket'}; | 
| 160 | 0 |  |  |  |  |  | my $port         = $params->{'port'}; | 
| 161 | 0 |  |  |  |  |  | my $username     = $params->{'username'}; | 
| 162 | 0 |  |  |  |  |  | my $password     = $params->{'password'}; | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | #  set up defaults if the options were not passed | 
| 165 | 0 | 0 |  |  |  |  | if(! defined $database) { $database = "cuicounts";            } | 
|  | 0 |  |  |  |  |  |  | 
| 166 | 0 | 0 |  |  |  |  | if(! defined $socket)   { $socket   = "/var/run/mysqld/mysqld.sock"; } | 
|  | 0 |  |  |  |  |  |  | 
| 167 | 0 | 0 |  |  |  |  | if(! defined $hostname) { $hostname = "localhost";       } | 
|  | 0 |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | #  initialize the database handler | 
| 170 | 0 |  |  |  |  |  | $assocDB_G  = ""; | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | #  create the database object... | 
| 173 | 0 | 0 | 0 |  |  |  | if(defined $username and defined $password) { | 
| 174 | 0 | 0 |  |  |  |  | if($debug) { print STDERR "Connecting with username and password\n"; } | 
|  | 0 |  |  |  |  |  |  | 
| 175 | 0 |  |  |  |  |  | $assocDB_G = DBI->connect("DBI:mysql:database=$database;mysql_socket=$socket;host=$hostname",$username, $password, {RaiseError => 0}); | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  | else { | 
| 178 | 0 | 0 |  |  |  |  | if($debug) { print STDERR "Connecting using the my.cnf file\n"; } | 
|  | 0 |  |  |  |  |  |  | 
| 179 | 0 |  |  |  |  |  | my $dsn = "DBI:mysql:umls;mysql_read_default_group=client;database=$database"; | 
| 180 | 0 |  |  |  |  |  | $assocDB_G = DBI->connect($dsn); | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | #  check if there is an error | 
| 184 | 0 |  |  |  |  |  | $errorhandler->_checkDbError($pkg, $function, $assocDB_G); | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | #  check that the db exists | 
| 187 | 0 | 0 |  |  |  |  | if(!$assocDB_G) { $errorhandler->_error($pkg, $function, "Error with db.", 3); } | 
|  | 0 |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | #  set database parameters | 
| 190 | 0 |  |  |  |  |  | $assocDB_G->{'mysql_enable_utf8'} = 1; | 
| 191 | 0 |  |  |  |  |  | $assocDB_G->do('SET NAMES utf8'); | 
| 192 | 0 |  |  |  |  |  | $assocDB_G->{mysql_auto_reconnect} = 1; | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | ###################################################################### | 
| 196 |  |  |  |  |  |  | #           public interface to get observed counts | 
| 197 |  |  |  |  |  |  | ###################################################################### | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | # Gets observed counts (n11, n1p, np1, npp) of the cui sets | 
| 200 |  |  |  |  |  |  | # input: $pairHashListRef - a ref to an array of pairHashes | 
| 201 |  |  |  |  |  |  | # output: \@allStatsRef - a ref to an array of observed counts 4-tuples | 
| 202 |  |  |  |  |  |  | #                         each 4-tuple consists of in order: | 
| 203 |  |  |  |  |  |  | #                         $n11, $n1p, $np1, and $npp | 
| 204 |  |  |  |  |  |  | #                         and they correspond to the observed counts of | 
| 205 |  |  |  |  |  |  | #                         each of the pairHashes passed in | 
| 206 |  |  |  |  |  |  | sub getObservedCounts { | 
| 207 |  |  |  |  |  |  | #grab parameters | 
| 208 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 209 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | #error checking | 
| 212 | 0 |  |  |  |  |  | my $function = "getObservedCounts"; | 
| 213 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 214 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | #calculate n11, n1p, np1, npp using a matrix or DB | 
| 218 |  |  |  |  |  |  | # and according to the method of various other options | 
| 219 | 0 |  |  |  |  |  | my $allStatsRef = -1; | 
| 220 | 0 | 0 |  |  |  |  | if ($lta_G) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 221 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_LTA($pairHashListRef); | 
| 222 |  |  |  |  |  |  | } | 
| 223 |  |  |  |  |  |  | elsif ($mwa_G) { | 
| 224 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_MWA($pairHashListRef); | 
| 225 |  |  |  |  |  |  | } | 
| 226 |  |  |  |  |  |  | elsif ($vsa_G) { | 
| 227 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_VSA($pairHashListRef); | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  | else { | 
| 230 | 0 | 0 |  |  |  |  | if ($matrix_G) { | 
| 231 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_matrix($pairHashListRef); | 
| 232 |  |  |  |  |  |  | } | 
| 233 |  |  |  |  |  |  | else { | 
| 234 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_DB($pairHashListRef); | 
| 235 |  |  |  |  |  |  | } | 
| 236 |  |  |  |  |  |  | } | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | #return a reference to a list of stats for each pairHash | 
| 239 | 0 |  |  |  |  |  | return $allStatsRef; | 
| 240 |  |  |  |  |  |  | } | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | ###################################################################### | 
| 244 |  |  |  |  |  |  | # functions to get statistical information about the cuis using a DB | 
| 245 |  |  |  |  |  |  | ###################################################################### | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | # gets N11, N1P, NP1, NPP for a pairHashList using a database | 
| 248 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 249 |  |  |  |  |  |  | #  output: $\@data  <- array ref containing array refs of four values | 
| 250 |  |  |  |  |  |  | #                      for each pair Hash, $n11, $n1p, $np1, and $npp | 
| 251 |  |  |  |  |  |  | sub _getStats_DB { | 
| 252 |  |  |  |  |  |  | #grab parameters | 
| 253 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 254 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | #error checking | 
| 257 | 0 |  |  |  |  |  | my $function = "_getStats_DB"; | 
| 258 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 259 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 260 |  |  |  |  |  |  | } | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | #compute observed counts for each pair hash | 
| 263 | 0 |  |  |  |  |  | my @data = (); | 
| 264 | 0 |  |  |  |  |  | my $npp = $self->_getNpp_DB(); | 
| 265 | 0 |  |  |  |  |  | foreach my $pairHashRef(@{$pairHashListRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | #grab the data from a DB | 
| 268 | 0 |  |  |  |  |  | my $n11 = $self->_getN11_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 269 | 0 |  |  |  |  |  | my $n1p = $self->_getN1p_DB(${$pairHashRef}{'set1'}); | 
|  | 0 |  |  |  |  |  |  | 
| 270 | 0 |  |  |  |  |  | my $np1 = $self->_getNp1_DB(${$pairHashRef}{'set2'}); | 
|  | 0 |  |  |  |  |  |  | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | #store the data | 
| 273 | 0 |  |  |  |  |  | my @values = ($n11, $n1p, $np1, $npp); | 
| 274 | 0 |  |  |  |  |  | push @data, \@values; | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | #return the data | 
| 278 | 0 |  |  |  |  |  | return  \@data; | 
| 279 |  |  |  |  |  |  | } | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | #  Gets N11 of the cui pair using a database | 
| 282 |  |  |  |  |  |  | #  input:  $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 283 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 284 |  |  |  |  |  |  | #  output: $n11  <- n11 of cui sets | 
| 285 |  |  |  |  |  |  | sub _getN11_DB { | 
| 286 |  |  |  |  |  |  | #grab parameters | 
| 287 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 288 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 289 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | #error checking | 
| 292 | 0 |  |  |  |  |  | my $function = "_getN11"; | 
| 293 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 294 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 295 |  |  |  |  |  |  | } | 
| 296 |  |  |  |  |  |  |  | 
| 297 |  |  |  |  |  |  | #build a query string for n11 | 
| 298 | 0 |  |  |  |  |  | my $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 299 | 0 |  |  |  |  |  | my $queryString = "select SUM(n_11) from N_11 where ((cui_1 = '$firstCui' "; | 
| 300 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 301 | 0 |  |  |  |  |  | $queryString .= "or cui_1 = '$cui' "; | 
| 302 |  |  |  |  |  |  | } | 
| 303 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  |  | 
| 305 |  |  |  |  |  |  | #set all cui2's | 
| 306 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 307 | 0 |  |  |  |  |  | $queryString .= ") and (cui_2 = '$firstCui' "; | 
| 308 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 309 | 0 |  |  |  |  |  | $queryString .= "or cui_2 = '$cui' "; | 
| 310 |  |  |  |  |  |  | } | 
| 311 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  |  | 
| 313 |  |  |  |  |  |  | #finalize the query string | 
| 314 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 315 |  |  |  |  |  |  | #swap the positions of the cuis | 
| 316 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 317 | 0 |  |  |  |  |  | $queryString .= ")) or ((cui_1 = '$firstCui' "; | 
| 318 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 319 | 0 |  |  |  |  |  | $queryString .= "or cui_1 = '$cui' "; | 
| 320 |  |  |  |  |  |  | } | 
| 321 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  |  | 
| 323 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 324 | 0 |  |  |  |  |  | $queryString .= ") and (cui_2 = '$firstCui' "; | 
| 325 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 326 | 0 |  |  |  |  |  | $queryString .= "or cui_2 = '$cui' "; | 
| 327 |  |  |  |  |  |  | } | 
| 328 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 329 |  |  |  |  |  |  | } | 
| 330 | 0 |  |  |  |  |  | $queryString .= "));"; | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | #query the DB and return n11 | 
| 333 | 0 |  |  |  |  |  | my $n11 = shift @{$assocDB_G->selectcol_arrayref($queryString)}; | 
|  | 0 |  |  |  |  |  |  | 
| 334 | 0 | 0 |  |  |  |  | if (!defined $n11) { | 
| 335 | 0 |  |  |  |  |  | $n11 = 0; | 
| 336 |  |  |  |  |  |  | } | 
| 337 | 0 |  |  |  |  |  | return $n11; | 
| 338 |  |  |  |  |  |  | } | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | #  Method to return the np1 of a concept using a database | 
| 341 |  |  |  |  |  |  | #  input : $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 342 |  |  |  |  |  |  | #  output: $np1 <- number of times the cuis2Ref set occurs in second bigram position | 
| 343 |  |  |  |  |  |  | sub _getNp1_DB { | 
| 344 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 345 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | #error checking | 
| 348 | 0 |  |  |  |  |  | my $function = "_getNp1_DB"; | 
| 349 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 350 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 351 |  |  |  |  |  |  | } | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | #build a query string for all where cui2's are in the second position | 
| 354 | 0 |  |  |  |  |  | my $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 355 | 0 |  |  |  |  |  | my $queryString = "select SUM(n_11) from N_11 where (cui_2 = '$firstCui' "; | 
| 356 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 357 | 0 |  |  |  |  |  | $queryString .= "or cui_2 = '$cui' "; | 
| 358 |  |  |  |  |  |  | } | 
| 359 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  | #finalize the query string | 
| 362 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 363 |  |  |  |  |  |  | #add where cui2 is in the first position | 
| 364 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 365 | 0 |  |  |  |  |  | $queryString .= ") or (cui_1 = '$firstCui' "; | 
| 366 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 367 | 0 |  |  |  |  |  | $queryString .= "or cui_1 = '$cui' "; | 
| 368 |  |  |  |  |  |  | } | 
| 369 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | } | 
| 371 | 0 |  |  |  |  |  | $queryString .= ");"; | 
| 372 |  |  |  |  |  |  |  | 
| 373 |  |  |  |  |  |  | #query the db to retrieve np1 | 
| 374 | 0 |  |  |  |  |  | my $np1 = shift @{$assocDB_G->selectcol_arrayref($queryString)}; | 
|  | 0 |  |  |  |  |  |  | 
| 375 | 0 | 0 |  |  |  |  | if (!defined $np1) { | 
| 376 | 0 |  |  |  |  |  | $np1 = -1; | 
| 377 |  |  |  |  |  |  | } | 
| 378 | 0 |  |  |  |  |  | return $np1; | 
| 379 |  |  |  |  |  |  | } | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | #  Method to return the n1p of a concept from a database | 
| 382 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 383 |  |  |  |  |  |  | #  output: $n1p <- number of times cuis in cuis1 set occur in first bigram position | 
| 384 |  |  |  |  |  |  | sub _getN1p_DB { | 
| 385 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 386 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | #error checking | 
| 389 | 0 |  |  |  |  |  | my $function = "_getN1p"; | 
| 390 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 391 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 392 |  |  |  |  |  |  | } | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | #build the query string for all where cui1's are in the first position | 
| 395 | 0 |  |  |  |  |  | my $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 396 | 0 |  |  |  |  |  | my $queryString = "select SUM(n_11) from N_11 where (cui_1 = '$firstCui' "; | 
| 397 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 398 | 0 |  |  |  |  |  | $queryString .= "or cui_1 = '$cui' "; | 
| 399 |  |  |  |  |  |  | } | 
| 400 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 401 |  |  |  |  |  |  |  | 
| 402 |  |  |  |  |  |  | #finalize the query string | 
| 403 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 404 |  |  |  |  |  |  | #add where cui1 is in the second position | 
| 405 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 406 | 0 |  |  |  |  |  | $queryString .= ") or (cui_2 = '$firstCui' "; | 
| 407 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 408 | 0 |  |  |  |  |  | $queryString .= "or cui_2 = '$cui' "; | 
| 409 |  |  |  |  |  |  | } | 
| 410 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 411 |  |  |  |  |  |  | } | 
| 412 | 0 |  |  |  |  |  | $queryString .= ");"; | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | #query the db to retrieve n1p | 
| 415 | 0 |  |  |  |  |  | my $n1p = shift @{$assocDB_G->selectcol_arrayref($queryString)}; | 
|  | 0 |  |  |  |  |  |  | 
| 416 | 0 | 0 |  |  |  |  | if (!defined $n1p) { | 
| 417 | 0 |  |  |  |  |  | $n1p = -1; | 
| 418 |  |  |  |  |  |  | } | 
| 419 | 0 |  |  |  |  |  | return $n1p; | 
| 420 |  |  |  |  |  |  | } | 
| 421 |  |  |  |  |  |  |  | 
| 422 |  |  |  |  |  |  | #  Method to calculate npp from a DB | 
| 423 |  |  |  |  |  |  | #  input : none | 
| 424 |  |  |  |  |  |  | #  output: $npp | 
| 425 |  |  |  |  |  |  | sub _getNpp_DB { | 
| 426 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | #error checking | 
| 429 | 0 |  |  |  |  |  | my $function = "getNpp_DB"; | 
| 430 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 431 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 432 |  |  |  |  |  |  | } | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | #get npp, the number of co-occurrences | 
| 435 | 0 |  |  |  |  |  | my $npp = shift @{$assocDB_G->selectcol_arrayref("select sum(N_11) from N_11")}; | 
|  | 0 |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  |  | 
| 437 |  |  |  |  |  |  | #update $npp for noOrder: since cuis can be trailing or leading, it's 2x the ordered npp | 
| 438 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 439 | 0 |  |  |  |  |  | $npp *= 2; | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | #report an error if npp is not positive, otherwise return npp | 
| 443 | 0 | 0 |  |  |  |  | if($npp <= 0) { $errorhandler->_error($pkg, $function, "", 5); } | 
|  | 0 |  |  |  |  |  |  | 
| 444 | 0 |  |  |  |  |  | return $npp; | 
| 445 |  |  |  |  |  |  | } | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  | ######################################################################## | 
| 448 |  |  |  |  |  |  | # functions to get statistical information about the cuis using a matrix | 
| 449 |  |  |  |  |  |  | ######################################################################## | 
| 450 |  |  |  |  |  |  |  | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | # Gets arrays of all first (leading) and second (trailing) cuis | 
| 453 |  |  |  |  |  |  | # This is used when retrieving data from a matrix flat file | 
| 454 |  |  |  |  |  |  | # input:  $pairHashListRef - a ref to an array of pairHashes | 
| 455 |  |  |  |  |  |  | # output: (\@cuis1, \@cuis2) - two array refs, the first contains | 
| 456 |  |  |  |  |  |  | #                              all leading cuis in the dataset, the | 
| 457 |  |  |  |  |  |  | #                              second contains all trailing cuis in | 
| 458 |  |  |  |  |  |  | #                              the dataset. | 
| 459 |  |  |  |  |  |  | sub _getAllLeadingAndTrailingCuis { | 
| 460 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 461 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 462 |  |  |  |  |  |  |  | 
| 463 |  |  |  |  |  |  | #create a list of all possible cuis in the first and second positions | 
| 464 | 0 |  |  |  |  |  | my @cuis1 = (); | 
| 465 | 0 |  |  |  |  |  | my @cuis2 = (); | 
| 466 | 0 |  |  |  |  |  | foreach my $pairHashRef(@{$pairHashListRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 467 | 0 |  |  |  |  |  | foreach my $cui(@{${$pairHashRef}{'set1'}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 468 | 0 |  |  |  |  |  | push @cuis1, $cui; | 
| 469 |  |  |  |  |  |  | } | 
| 470 | 0 |  |  |  |  |  | foreach my $cui(@{${$pairHashRef}{'set2'}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 471 | 0 |  |  |  |  |  | push @cuis2, $cui; | 
| 472 |  |  |  |  |  |  | } | 
| 473 |  |  |  |  |  |  | } | 
| 474 | 0 |  |  |  |  |  | return (\@cuis1, \@cuis2); | 
| 475 |  |  |  |  |  |  | } | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  |  | 
| 478 |  |  |  |  |  |  | # gets N11, N1P, NP1, NPP for a pairHashList using a matrix | 
| 479 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 480 |  |  |  |  |  |  | #  output: $\@data  <- array ref containing array refs of four values | 
| 481 |  |  |  |  |  |  | #                      for each pair Hash, $n11, $n1p, $np1, and $npp | 
| 482 |  |  |  |  |  |  | sub _getStats_matrix { | 
| 483 |  |  |  |  |  |  | #grab parameters | 
| 484 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 485 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 486 |  |  |  |  |  |  |  | 
| 487 |  |  |  |  |  |  | #error checking | 
| 488 | 0 |  |  |  |  |  | my $function = "_getStats_matrix"; | 
| 489 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 490 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 491 |  |  |  |  |  |  | } | 
| 492 |  |  |  |  |  |  |  | 
| 493 |  |  |  |  |  |  | #get all observed counts for all possible cuis in the term pairs | 
| 494 | 0 |  |  |  |  |  | (my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef); | 
| 495 | 0 |  |  |  |  |  | my $countsRef = $self->_getObservedCounts_matrix($cuis1Ref, $cuis2Ref); | 
| 496 | 0 |  |  |  |  |  | my $n11AllRef = ${$countsRef}[0]; | 
|  | 0 |  |  |  |  |  |  | 
| 497 | 0 |  |  |  |  |  | my $n1pAllRef = ${$countsRef}[1]; | 
|  | 0 |  |  |  |  |  |  | 
| 498 | 0 |  |  |  |  |  | my $np1AllRef = ${$countsRef}[2]; | 
|  | 0 |  |  |  |  |  |  | 
| 499 | 0 |  |  |  |  |  | my $npp = ${$countsRef}[3]; | 
|  | 0 |  |  |  |  |  |  | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  | #update $npp for noOrder: since cuis can be trailing or leading, it's 2x the ordered npp | 
| 502 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 503 | 0 |  |  |  |  |  | $npp *= 2; | 
| 504 |  |  |  |  |  |  | } | 
| 505 |  |  |  |  |  |  |  | 
| 506 |  |  |  |  |  |  | #get values for each pairHash based on what was retrieved from the matrix | 
| 507 | 0 |  |  |  |  |  | my @data = (); | 
| 508 | 0 |  |  |  |  |  | foreach my $pairHashRef (@{$pairHashListRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 509 | 0 |  |  |  |  |  | my $n11 = $self->_getN11_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}, $n11AllRef); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 510 | 0 |  |  |  |  |  | my $n1p = $self->_getN1p_matrix(${$pairHashRef}{'set1'}, $n11AllRef, $n1pAllRef, $np1AllRef); | 
|  | 0 |  |  |  |  |  |  | 
| 511 | 0 |  |  |  |  |  | my $np1 = $self->_getNp1_matrix(${$pairHashRef}{'set2'}, $n11AllRef, $n1pAllRef, $np1AllRef); | 
|  | 0 |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  |  | 
| 513 | 0 |  |  |  |  |  | my @vals = ($n11, $n1p, $np1, $npp); | 
| 514 | 0 |  |  |  |  |  | push @data, \@vals; | 
| 515 |  |  |  |  |  |  | } | 
| 516 |  |  |  |  |  |  |  | 
| 517 |  |  |  |  |  |  | #return the data | 
| 518 | 0 |  |  |  |  |  | return \@data; | 
| 519 |  |  |  |  |  |  | } | 
| 520 |  |  |  |  |  |  |  | 
| 521 |  |  |  |  |  |  | #computes the observed counts for all combinations of the cuis passed in | 
| 522 |  |  |  |  |  |  | #doing this in a single function makes it so all values can be computed with a | 
| 523 |  |  |  |  |  |  | #single pass of the input file, making execution time much faster | 
| 524 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 525 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 526 |  |  |  |  |  |  | #  output: $\@counts  <- array ref containing four sets of values: | 
| 527 |  |  |  |  |  |  | #                      \%n11, \%n1p, \%np1, and $npp for the cui pairs | 
| 528 |  |  |  |  |  |  | #                      hashes are indexed: $n11{"$cui1,$cui2"}, $n1p{$cui}, | 
| 529 |  |  |  |  |  |  | #                                          $np1{$cui} | 
| 530 |  |  |  |  |  |  | sub _getObservedCounts_matrix { | 
| 531 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 532 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 533 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 534 |  |  |  |  |  |  |  | 
| 535 |  |  |  |  |  |  | #convert cui arrays to hashes, makes looping thru | 
| 536 |  |  |  |  |  |  | # the file faster | 
| 537 | 0 |  |  |  |  |  | my %cuis1 = (); | 
| 538 | 0 |  |  |  |  |  | foreach my $cui(@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 539 | 0 |  |  |  |  |  | $cuis1{$cui} = 1; | 
| 540 |  |  |  |  |  |  | } | 
| 541 | 0 |  |  |  |  |  | my %cuis2 = (); | 
| 542 | 0 |  |  |  |  |  | foreach my $cui(@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 543 | 0 |  |  |  |  |  | $cuis2{$cui} = 1; | 
| 544 |  |  |  |  |  |  | } | 
| 545 |  |  |  |  |  |  |  | 
| 546 |  |  |  |  |  |  | #precalculate values for all cuis and cui pairs | 
| 547 | 0 |  |  |  |  |  | my %n11 = (); | 
| 548 | 0 |  |  |  |  |  | my %n1p = (); | 
| 549 | 0 |  |  |  |  |  | my %np1 = (); | 
| 550 | 0 |  |  |  |  |  | my $npp = 0; | 
| 551 | 0 | 0 |  |  |  |  | open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n"; | 
| 552 | 0 |  |  |  |  |  | while (my $line = <IN>) { | 
| 553 |  |  |  |  |  |  | #get cuis and value from the line | 
| 554 | 0 |  |  |  |  |  | chomp $line; | 
| 555 | 0 |  |  |  |  |  | my ($cui1, $cui2, $num) = split /\t/, $line; | 
| 556 |  |  |  |  |  |  |  | 
| 557 |  |  |  |  |  |  | #record any occurrence of any cui1 or 2, in case order is ignored | 
| 558 | 0 | 0 | 0 |  |  |  | if (exists $cuis1{$cui1} || exists $cuis1{$cui2} | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 559 |  |  |  |  |  |  | || exists $cuis2{$cui1} || exists $cuis2{$cui2}) { | 
| 560 | 0 |  |  |  |  |  | $n1p{$cui1} += $num; | 
| 561 | 0 |  |  |  |  |  | $np1{$cui2} += $num; | 
| 562 | 0 |  |  |  |  |  | $n11{"$cui1,$cui2"} = $num; | 
| 563 |  |  |  |  |  |  | } | 
| 564 |  |  |  |  |  |  |  | 
| 565 |  |  |  |  |  |  | #update npp | 
| 566 | 0 |  |  |  |  |  | $npp += $num; | 
| 567 |  |  |  |  |  |  | } | 
| 568 | 0 |  |  |  |  |  | close IN; | 
| 569 |  |  |  |  |  |  |  | 
| 570 |  |  |  |  |  |  | #return counts | 
| 571 | 0 |  |  |  |  |  | my @counts = (\%n11, \%n1p, \%np1, $npp); | 
| 572 | 0 |  |  |  |  |  | return \@counts; | 
| 573 |  |  |  |  |  |  | } | 
| 574 |  |  |  |  |  |  |  | 
| 575 |  |  |  |  |  |  | #  Gets N11 of the cui pair using a matrix | 
| 576 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 577 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 578 |  |  |  |  |  |  | #          $n11AllRef <- ref to a hash containing n11 values for all possible | 
| 579 |  |  |  |  |  |  | #                        cui pairs of the cuis1 and cuis2, of the form | 
| 580 |  |  |  |  |  |  | #                        n11All{"$cui1,$cui2"}=value. See _getObservedCounts_matrix | 
| 581 |  |  |  |  |  |  | #  output: $n11      <- frequency of co-occurrences of the cuis in the cui sets | 
| 582 |  |  |  |  |  |  | sub _getN11_matrix { | 
| 583 |  |  |  |  |  |  | #grab parameters | 
| 584 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 585 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 586 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 587 | 0 |  |  |  |  |  | my $n11AllRef = shift; | 
| 588 |  |  |  |  |  |  |  | 
| 589 |  |  |  |  |  |  | #error checking | 
| 590 | 0 |  |  |  |  |  | my $function = "_getN11_matrix"; | 
| 591 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 592 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 593 |  |  |  |  |  |  | } | 
| 594 |  |  |  |  |  |  |  | 
| 595 |  |  |  |  |  |  | #calculate n11 as the sum n11s for all combinations of | 
| 596 |  |  |  |  |  |  | # cuis1, cuis2 (order matters, cui1 must be first) | 
| 597 | 0 |  |  |  |  |  | my $n11 = 0; | 
| 598 | 0 |  |  |  |  |  | foreach my $cui1 (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 599 | 0 |  |  |  |  |  | foreach my $cui2 (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 600 | 0 |  |  |  |  |  | my $num = ${$n11AllRef}{"$cui1,$cui2"}; | 
|  | 0 |  |  |  |  |  |  | 
| 601 | 0 | 0 |  |  |  |  | if(defined $num) { | 
| 602 | 0 |  |  |  |  |  | $n11 += $num; | 
| 603 |  |  |  |  |  |  | } | 
| 604 |  |  |  |  |  |  | } | 
| 605 |  |  |  |  |  |  | } | 
| 606 |  |  |  |  |  |  |  | 
| 607 |  |  |  |  |  |  | #update values if ignoring word order | 
| 608 | 0 | 0 |  |  |  |  | if($noOrder_G) { | 
| 609 |  |  |  |  |  |  | #add all n11's, now with the order reversed | 
| 610 | 0 |  |  |  |  |  | foreach my $cui1 (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 611 | 0 |  |  |  |  |  | foreach my $cui2 (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 612 | 0 |  |  |  |  |  | my $num = ${$n11AllRef}{"$cui2,$cui1"}; | 
|  | 0 |  |  |  |  |  |  | 
| 613 | 0 | 0 |  |  |  |  | if(defined $num) { | 
| 614 | 0 |  |  |  |  |  | $n11 += $num; | 
| 615 |  |  |  |  |  |  | } | 
| 616 |  |  |  |  |  |  | } | 
| 617 |  |  |  |  |  |  | } | 
| 618 |  |  |  |  |  |  | } | 
| 619 |  |  |  |  |  |  |  | 
| 620 | 0 |  |  |  |  |  | return $n11; | 
| 621 |  |  |  |  |  |  | } | 
| 622 |  |  |  |  |  |  |  | 
| 623 |  |  |  |  |  |  | #  gets N1P for a concept using a matrix | 
| 624 |  |  |  |  |  |  | #  input : $cuis1Ref <- reference to an array containing the first cuis in a set of cui pairs | 
| 625 |  |  |  |  |  |  | #          $n11AllRef <- ref to a hash containing n11 values for the cui pairs, of the form | 
| 626 |  |  |  |  |  |  | #                        n11All{"$cui1,$cui2"} = value. See _getObservedCounts_matrix() | 
| 627 |  |  |  |  |  |  | #          $n1pAllRef <- ref to a hash containing n1p values for all cuis of cuis1 and cuis2, | 
| 628 |  |  |  |  |  |  | #                        of the form n1pAll{$cui} = value. See _getObservedCounts_matrix | 
| 629 |  |  |  |  |  |  | #          $np1AllRef <- ref to a hash containing np1 values for all cuis of cuis1 and cuis2, | 
| 630 |  |  |  |  |  |  | #                        of the form np1All{$cui} = value. See _getObservedCounts_matrix | 
| 631 |  |  |  |  |  |  | #  output: $n1p      <- the number of times the set of concepts occurs in first position | 
| 632 |  |  |  |  |  |  | sub _getN1p_matrix { | 
| 633 |  |  |  |  |  |  | #grab parameters | 
| 634 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 635 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 636 | 0 |  |  |  |  |  | my $n11AllRef = shift; | 
| 637 | 0 |  |  |  |  |  | my $n1pAllRef = shift; | 
| 638 | 0 |  |  |  |  |  | my $np1AllRef = shift; | 
| 639 |  |  |  |  |  |  |  | 
| 640 |  |  |  |  |  |  | #error checking | 
| 641 | 0 |  |  |  |  |  | my $function = "_getN1P_matrix"; | 
| 642 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 643 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 644 |  |  |  |  |  |  | } | 
| 645 |  |  |  |  |  |  |  | 
| 646 |  |  |  |  |  |  | #calculate n1p as the sum of n1p's for all cuis1 | 
| 647 | 0 |  |  |  |  |  | my $n1p = 0; | 
| 648 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 649 | 0 |  |  |  |  |  | my $num = ${$n1pAllRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 650 | 0 | 0 |  |  |  |  | if(defined $num) { | 
| 651 | 0 |  |  |  |  |  | $n1p += $num; | 
| 652 |  |  |  |  |  |  | } | 
| 653 |  |  |  |  |  |  | } | 
| 654 |  |  |  |  |  |  |  | 
| 655 |  |  |  |  |  |  | #update values if ignoring word order | 
| 656 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 657 |  |  |  |  |  |  | #add all np1's to n1p | 
| 658 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 659 | 0 |  |  |  |  |  | my $num = ${$np1AllRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 660 | 0 | 0 |  |  |  |  | if(defined $num) { | 
| 661 | 0 |  |  |  |  |  | $n1p += $num; | 
| 662 |  |  |  |  |  |  | } | 
| 663 |  |  |  |  |  |  | } | 
| 664 |  |  |  |  |  |  |  | 
| 665 |  |  |  |  |  |  | #avoid double counting occurrences with self, subtract them | 
| 666 | 0 |  |  |  |  |  | foreach my $cui1(@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 667 | 0 |  |  |  |  |  | foreach my $cui2(@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 668 | 0 |  |  |  |  |  | my $val = ${$n11AllRef}{"$cui1,$cui2"}; | 
|  | 0 |  |  |  |  |  |  | 
| 669 | 0 | 0 |  |  |  |  | if (defined $val) { | 
| 670 | 0 |  |  |  |  |  | $n1p -= $val; | 
| 671 |  |  |  |  |  |  | } | 
| 672 |  |  |  |  |  |  | } | 
| 673 |  |  |  |  |  |  | } | 
| 674 |  |  |  |  |  |  | } | 
| 675 |  |  |  |  |  |  |  | 
| 676 |  |  |  |  |  |  | #set n1p to -1 if there are no values for it since this indicates | 
| 677 |  |  |  |  |  |  | # there is not enough information to calculate the score | 
| 678 | 0 | 0 |  |  |  |  | if ($n1p == 0) { | 
| 679 | 0 |  |  |  |  |  | $n1p = -1; | 
| 680 |  |  |  |  |  |  | } | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | #return the value | 
| 683 | 0 |  |  |  |  |  | return $n1p; | 
| 684 |  |  |  |  |  |  | } | 
| 685 |  |  |  |  |  |  |  | 
| 686 |  |  |  |  |  |  | #  gets NP1 for a concept using a matrix | 
| 687 |  |  |  |  |  |  | #  input : $cuis2Ref <- reference to an array containing the second cuis in a set of cui pairs | 
| 688 |  |  |  |  |  |  | #          $n11AllRef <- ref to a hash containing n11 values for the cui pairs, of the form | 
| 689 |  |  |  |  |  |  | #                        n11All{"$cui1,$cui2"} = value. See _getObservedCounts_matrix() | 
| 690 |  |  |  |  |  |  | #          $n1pAllRef <- ref to a hash containing n1p values for all cuis of cuis1 and cuis2, | 
| 691 |  |  |  |  |  |  | #                        of the form n1pAll{$cui} = value. See _getObservedCounts_matrix | 
| 692 |  |  |  |  |  |  | #          $np1AllRef <- ref to a hash containing np1 values for all cuis of cuis1 and cuis2, | 
| 693 |  |  |  |  |  |  | #                        of the form np1All{$cui} = value. See _getObservedCounts_matrix | 
| 694 |  |  |  |  |  |  | #  output: $np1      <- the number of times the set of concepts occurs in second position | 
| 695 |  |  |  |  |  |  | sub _getNp1_matrix { | 
| 696 |  |  |  |  |  |  | #grab parameters | 
| 697 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 698 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 699 | 0 |  |  |  |  |  | my $n11AllRef = shift; | 
| 700 | 0 |  |  |  |  |  | my $n1pAllRef = shift; | 
| 701 | 0 |  |  |  |  |  | my $np1AllRef = shift; | 
| 702 |  |  |  |  |  |  |  | 
| 703 |  |  |  |  |  |  | #calculate np1 as the sum of np1's for all cuis2 | 
| 704 | 0 |  |  |  |  |  | my $np1 = 0; | 
| 705 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 706 | 0 |  |  |  |  |  | my $num = ${$np1AllRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 707 | 0 | 0 |  |  |  |  | if (defined $num) { | 
| 708 | 0 |  |  |  |  |  | $np1 += $num; | 
| 709 |  |  |  |  |  |  | } | 
| 710 |  |  |  |  |  |  | } | 
| 711 |  |  |  |  |  |  |  | 
| 712 |  |  |  |  |  |  | #update values if ignoring word order | 
| 713 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 714 |  |  |  |  |  |  | #add all n1p's to np1s | 
| 715 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 716 | 0 |  |  |  |  |  | my $num = ${$n1pAllRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 717 | 0 | 0 |  |  |  |  | if (defined $num) { | 
| 718 | 0 |  |  |  |  |  | $np1 += $num; | 
| 719 |  |  |  |  |  |  | } | 
| 720 |  |  |  |  |  |  | } | 
| 721 |  |  |  |  |  |  |  | 
| 722 |  |  |  |  |  |  | #avoid double counting occurrences with self, subtract them | 
| 723 | 0 |  |  |  |  |  | foreach my $cui1(@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 724 | 0 |  |  |  |  |  | foreach my $cui2(@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 725 | 0 |  |  |  |  |  | my $val = ${$n11AllRef}{"$cui1,$cui2"}; | 
|  | 0 |  |  |  |  |  |  | 
| 726 | 0 | 0 |  |  |  |  | if (defined $val) { | 
| 727 | 0 |  |  |  |  |  | $np1 -= $val; | 
| 728 |  |  |  |  |  |  | } | 
| 729 |  |  |  |  |  |  | } | 
| 730 |  |  |  |  |  |  | } | 
| 731 |  |  |  |  |  |  | } | 
| 732 |  |  |  |  |  |  |  | 
| 733 |  |  |  |  |  |  | #set np1 to -1 if there are no values for it since this indicates | 
| 734 |  |  |  |  |  |  | # there is not enough information to calculate the score | 
| 735 | 0 | 0 |  |  |  |  | if ($np1 == 0) { | 
| 736 | 0 |  |  |  |  |  | $np1 = -1; | 
| 737 |  |  |  |  |  |  | } | 
| 738 |  |  |  |  |  |  |  | 
| 739 |  |  |  |  |  |  | #return the value | 
| 740 | 0 |  |  |  |  |  | return $np1; | 
| 741 |  |  |  |  |  |  | } | 
| 742 |  |  |  |  |  |  |  | 
| 743 |  |  |  |  |  |  |  | 
| 744 |  |  |  |  |  |  | ######################################################################## | 
| 745 |  |  |  |  |  |  | # functions to get statistical information about the cuis LTA, MWA, VSA | 
| 746 |  |  |  |  |  |  | ######################################################################## | 
| 747 |  |  |  |  |  |  | #  Gets contingency table values for Linking Term Association (LTA) | 
| 748 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 749 |  |  |  |  |  |  | #  output: $\@data  <- array ref containing array refs of four values | 
| 750 |  |  |  |  |  |  | #                      for each pairHash in the pairHash list. The | 
| 751 |  |  |  |  |  |  | #                      values are $n11, $n1p, $np1, and $npp | 
| 752 |  |  |  |  |  |  | sub _getStats_LTA { | 
| 753 |  |  |  |  |  |  | #grab parameters | 
| 754 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 755 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | #error checking | 
| 758 | 0 |  |  |  |  |  | my $function = "_getStats_LTA"; | 
| 759 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 760 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 761 |  |  |  |  |  |  | } | 
| 762 |  |  |  |  |  |  | #get data from the matrix | 
| 763 | 0 |  |  |  |  |  | (my $cooccurrences1ListRef, my $cooccurrences2ListRef, | 
| 764 |  |  |  |  |  |  | my $numCooccurrences, my $numUniqueCuis) | 
| 765 |  |  |  |  |  |  | = $self->_readMatrixValues_Linking($pairHashListRef); | 
| 766 |  |  |  |  |  |  |  | 
| 767 |  |  |  |  |  |  | #for LTA, npp= num unique cuis in the dataset | 
| 768 | 0 |  |  |  |  |  | my $npp = $numUniqueCuis; | 
| 769 |  |  |  |  |  |  |  | 
| 770 |  |  |  |  |  |  | #calculate stats for each pairHash based on the co-occurrences data | 
| 771 | 0 |  |  |  |  |  | my @data = (); | 
| 772 | 0 |  |  |  |  |  | for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { | 
|  | 0 |  |  |  |  |  |  | 
| 773 |  |  |  |  |  |  |  | 
| 774 |  |  |  |  |  |  | #calculate n1p and np1 as the number of co-occurring terms | 
| 775 | 0 |  |  |  |  |  | my $n1p = scalar keys %{${$cooccurrences1ListRef}[$i]}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 776 | 0 |  |  |  |  |  | my $np1 = scalar keys %{${$cooccurrences2ListRef}[$i]}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 777 |  |  |  |  |  |  |  | 
| 778 |  |  |  |  |  |  | #calculate n11 | 
| 779 | 0 |  |  |  |  |  | my $n11 = 0; | 
| 780 |  |  |  |  |  |  | #Find number of CUIs that co-occur with both CUI 1 and CUI 2 | 
| 781 | 0 |  |  |  |  |  | foreach my $cui (keys %{${$cooccurrences1ListRef}[$i]}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 782 | 0 | 0 |  |  |  |  | if (exists ${${$cooccurrences2ListRef}[$i]}{$cui}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 783 | 0 |  |  |  |  |  | $n11++; | 
| 784 |  |  |  |  |  |  | } | 
| 785 |  |  |  |  |  |  | } | 
| 786 |  |  |  |  |  |  |  | 
| 787 |  |  |  |  |  |  | #store the data for this pairHash | 
| 788 | 0 |  |  |  |  |  | my @vals = ($n11, $n1p, $np1, $npp); | 
| 789 | 0 |  |  |  |  |  | push @data, \@vals; | 
| 790 |  |  |  |  |  |  | } | 
| 791 |  |  |  |  |  |  |  | 
| 792 |  |  |  |  |  |  | #return the data | 
| 793 | 0 |  |  |  |  |  | return  \@data; | 
| 794 |  |  |  |  |  |  | } | 
| 795 |  |  |  |  |  |  |  | 
| 796 |  |  |  |  |  |  |  | 
| 797 |  |  |  |  |  |  | #  Gets contingency table values for Minimum Weight Association (MWA) | 
| 798 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 799 |  |  |  |  |  |  | #  output: $\@data  <- array ref containing array refs of four values | 
| 800 |  |  |  |  |  |  | #                      for each pairHash in the pairHash list. The | 
| 801 |  |  |  |  |  |  | #                      values are $n11, $n1p, $np1, and $npp | 
| 802 |  |  |  |  |  |  | sub _getStats_MWA { | 
| 803 |  |  |  |  |  |  | #grab parameters | 
| 804 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 805 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 806 |  |  |  |  |  |  |  | 
| 807 |  |  |  |  |  |  | #error checking | 
| 808 | 0 |  |  |  |  |  | my $function = "_getStats_MWA"; | 
| 809 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 810 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 811 |  |  |  |  |  |  | } | 
| 812 |  |  |  |  |  |  |  | 
| 813 |  |  |  |  |  |  | #get data from the matrix | 
| 814 | 0 |  |  |  |  |  | (my $cooccurrences1ListRef, my $cooccurrences2ListRef, | 
| 815 |  |  |  |  |  |  | my $numCooccurrences, my $numUniqueCuis) | 
| 816 |  |  |  |  |  |  | = $self->_readMatrixValues_Linking($pairHashListRef); | 
| 817 |  |  |  |  |  |  |  | 
| 818 |  |  |  |  |  |  | #for MWA, npp= numCooccurrences in the dataset | 
| 819 | 0 |  |  |  |  |  | my $npp = $numCooccurrences; | 
| 820 |  |  |  |  |  |  |  | 
| 821 |  |  |  |  |  |  | #calculate stats for each pairHash based on the co-occurrences data | 
| 822 | 0 |  |  |  |  |  | my @data = (); | 
| 823 | 0 |  |  |  |  |  | for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { | 
|  | 0 |  |  |  |  |  |  | 
| 824 | 0 |  |  |  |  |  | my $set1CoRef = ${$cooccurrences1ListRef}[$i]; | 
|  | 0 |  |  |  |  |  |  | 
| 825 | 0 |  |  |  |  |  | my $set2CoRef = ${$cooccurrences2ListRef}[$i]; | 
|  | 0 |  |  |  |  |  |  | 
| 826 |  |  |  |  |  |  |  | 
| 827 |  |  |  |  |  |  | #calculate n1p and np1 as the number of co-occurrences for the term | 
| 828 | 0 |  |  |  |  |  | my $n1p = 0; | 
| 829 | 0 |  |  |  |  |  | foreach my $cui (keys %{$set1CoRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 830 | 0 |  |  |  |  |  | $n1p += ${$set1CoRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 831 |  |  |  |  |  |  | } | 
| 832 | 0 |  |  |  |  |  | my $np1 = 0; | 
| 833 | 0 |  |  |  |  |  | foreach my $cui (keys %{$set2CoRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 834 | 0 |  |  |  |  |  | $np1 += ${$set2CoRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 835 |  |  |  |  |  |  | } | 
| 836 |  |  |  |  |  |  |  | 
| 837 |  |  |  |  |  |  | #Find $n11, the min co-occurrence value of the pair | 
| 838 | 0 |  |  |  |  |  | my $n11 = 0; | 
| 839 | 0 |  |  |  |  |  | foreach my $cui (keys %{$set1CoRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 840 |  |  |  |  |  |  | #if this cui co-occurs with both sets, then increment n11 | 
| 841 | 0 | 0 |  |  |  |  | if (exists ${$set2CoRef}{$cui}) { | 
|  | 0 |  |  |  |  |  |  | 
| 842 |  |  |  |  |  |  | #increment n11 by the minimum of the co-occurrences | 
| 843 | 0 |  |  |  |  |  | my $min = ${$set1CoRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 844 | 0 | 0 |  |  |  |  | if (${$set2CoRef}{$cui} < $min) { | 
|  | 0 |  |  |  |  |  |  | 
| 845 | 0 |  |  |  |  |  | $min = ${$set2CoRef}{$cui}; | 
|  | 0 |  |  |  |  |  |  | 
| 846 |  |  |  |  |  |  | } | 
| 847 | 0 |  |  |  |  |  | $n11+=$min; | 
| 848 |  |  |  |  |  |  | } | 
| 849 |  |  |  |  |  |  | } | 
| 850 |  |  |  |  |  |  |  | 
| 851 |  |  |  |  |  |  | #store the data for this pairHash | 
| 852 | 0 |  |  |  |  |  | my @vals = ($n11, $n1p, $np1, $npp); | 
| 853 | 0 |  |  |  |  |  | push @data, \@vals; | 
| 854 |  |  |  |  |  |  | } | 
| 855 |  |  |  |  |  |  |  | 
| 856 |  |  |  |  |  |  | #return the data | 
| 857 | 0 |  |  |  |  |  | return  \@data; | 
| 858 |  |  |  |  |  |  | } | 
| 859 |  |  |  |  |  |  |  | 
| 860 |  |  |  |  |  |  |  | 
| 861 |  |  |  |  |  |  | #  Gets contingency table values for Vector Set Association (VSA) | 
| 862 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 863 |  |  |  |  |  |  | #  output: $\@data  <- array ref containing array refs of four values | 
| 864 |  |  |  |  |  |  | #                      for each pairHash in the pairHash list. The | 
| 865 |  |  |  |  |  |  | #                      values are $n11, $n1p, $np1, and $npp | 
| 866 |  |  |  |  |  |  | sub _getStats_VSA { | 
| 867 |  |  |  |  |  |  | #grab parameters | 
| 868 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 869 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 870 |  |  |  |  |  |  |  | 
| 871 |  |  |  |  |  |  | #error checking | 
| 872 | 0 |  |  |  |  |  | my $function = "_getStats_VSA"; | 
| 873 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 874 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 875 |  |  |  |  |  |  | } | 
| 876 |  |  |  |  |  |  | #get data from the matrix | 
| 877 | 0 |  |  |  |  |  | (my $cooccurrences1ListRef, my $cooccurrences2ListRef, | 
| 878 |  |  |  |  |  |  | my $numCooccurrences, my $numUniqueCuis) | 
| 879 |  |  |  |  |  |  | = $self->_readMatrixValues_Linking($pairHashListRef); | 
| 880 |  |  |  |  |  |  |  | 
| 881 |  |  |  |  |  |  | #convert the cooccurrence lists to pairHashLists | 
| 882 | 0 |  |  |  |  |  | my @newPairHashList = (); | 
| 883 | 0 |  |  |  |  |  | for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { | 
|  | 0 |  |  |  |  |  |  | 
| 884 | 0 |  |  |  |  |  | my %pairHash = (); | 
| 885 |  |  |  |  |  |  |  | 
| 886 |  |  |  |  |  |  | #make set 1 an array | 
| 887 | 0 |  |  |  |  |  | my @set1 = (); | 
| 888 | 0 |  |  |  |  |  | foreach my $key (keys %{${$cooccurrences1ListRef}[$i]}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 889 | 0 |  |  |  |  |  | push @set1, $key; | 
| 890 |  |  |  |  |  |  | } | 
| 891 | 0 |  |  |  |  |  | $pairHash{'set1'} = \@set1; | 
| 892 |  |  |  |  |  |  |  | 
| 893 |  |  |  |  |  |  | #make set 2 an array | 
| 894 | 0 |  |  |  |  |  | my @set2 = (); | 
| 895 | 0 |  |  |  |  |  | foreach my $key (keys %{${$cooccurrences2ListRef}[$i]}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 896 | 0 |  |  |  |  |  | push @set2, $key; | 
| 897 |  |  |  |  |  |  | } | 
| 898 | 0 |  |  |  |  |  | $pairHash{'set2'} = \@set2; | 
| 899 |  |  |  |  |  |  |  | 
| 900 |  |  |  |  |  |  | #add the pairHash to the pairHashList | 
| 901 | 0 |  |  |  |  |  | push @newPairHashList, \%pairHash; | 
| 902 |  |  |  |  |  |  | } | 
| 903 |  |  |  |  |  |  | #So, at this point we have converted the sets of B terms | 
| 904 |  |  |  |  |  |  | # into a pairhashlist. | 
| 905 |  |  |  |  |  |  | #Next we find the stats for each of those pair hashes and | 
| 906 |  |  |  |  |  |  | # use that as the stats for the original pair. | 
| 907 |  |  |  |  |  |  | # in this way we are finding the association between | 
| 908 |  |  |  |  |  |  | # sets of co-occurring terms of the original terms | 
| 909 | 0 |  |  |  |  |  | my $allStatsRef; | 
| 910 | 0 | 0 |  |  |  |  | if ($matrix_G) { | 
| 911 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_matrix(\@newPairHashList); | 
| 912 |  |  |  |  |  |  | } | 
| 913 |  |  |  |  |  |  | else { | 
| 914 | 0 |  |  |  |  |  | $allStatsRef = $self->_getStats_DB(\@newPairHashList); | 
| 915 |  |  |  |  |  |  | } | 
| 916 |  |  |  |  |  |  | #all stats ref contains n11, n1p, np1, and npp for | 
| 917 |  |  |  |  |  |  | # each of the pair hashes | 
| 918 | 0 |  |  |  |  |  | return $allStatsRef; | 
| 919 |  |  |  |  |  |  | } | 
| 920 |  |  |  |  |  |  |  | 
| 921 |  |  |  |  |  |  |  | 
| 922 |  |  |  |  |  |  |  | 
| 923 |  |  |  |  |  |  | #  Gets co-occurrence data for each of the pairHashes in the pairHashList | 
| 924 |  |  |  |  |  |  | #  and gets global stats, total number of co-occurrences in the dataset, | 
| 925 |  |  |  |  |  |  | #  and the number of unique cuis in the dataset. The co-occurrences data | 
| 926 |  |  |  |  |  |  | #  is returned in the form of a co-occurrences hash for cuis1 and cuis2 | 
| 927 |  |  |  |  |  |  | #  of the pairHash. Each co-occurrences hash is: | 
| 928 |  |  |  |  |  |  | #              $cooccurrences1{$cui2} = $val | 
| 929 |  |  |  |  |  |  | #  There is no distinction between different cuis of cuis1 | 
| 930 |  |  |  |  |  |  | #  input : $pairHashListRef <- ref to a pairHashList | 
| 931 |  |  |  |  |  |  | #  output: a list of four values: \@cooccurrences1List, \@cooccurrences2List, $totalCooccurrences, $totalUniqueCuis | 
| 932 |  |  |  |  |  |  | sub _readMatrixValues_Linking { | 
| 933 |  |  |  |  |  |  | #grab parameters | 
| 934 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 935 | 0 |  |  |  |  |  | my $pairHashListRef = shift; | 
| 936 |  |  |  |  |  |  |  | 
| 937 |  |  |  |  |  |  | #error checking | 
| 938 | 0 |  |  |  |  |  | my $function = "_readMatrixValues_Linking"; | 
| 939 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 940 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 941 |  |  |  |  |  |  | } | 
| 942 |  |  |  |  |  |  |  | 
| 943 |  |  |  |  |  |  | #Get co-occurrences with each set of CUIs | 
| 944 |  |  |  |  |  |  | # for each set of cuis we find a list of cuis that co-occur with that set | 
| 945 |  |  |  |  |  |  | # this is done for cuis1 and cuis2. After retrieving these two lists | 
| 946 |  |  |  |  |  |  | # of co-occurring cuis, we can calculate LTA based on the overlap of | 
| 947 |  |  |  |  |  |  | # co-occurrences. | 
| 948 | 0 |  |  |  |  |  | my @cooccurrences1List; | 
| 949 |  |  |  |  |  |  | my @cooccurrences2List; | 
| 950 | 0 |  |  |  |  |  | my $totalCooccurrences = 0; | 
| 951 | 0 |  |  |  |  |  | my $totalUniqueCuis = 0; | 
| 952 | 0 | 0 |  |  |  |  | if ($matrix_G) { | 
| 953 |  |  |  |  |  |  | #get observed counts for all data | 
| 954 | 0 |  |  |  |  |  | (my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef); | 
| 955 | 0 |  |  |  |  |  | (my $n1pAllRef, my $np1AllRef, $totalCooccurrences, $totalUniqueCuis) | 
| 956 |  |  |  |  |  |  | = $self->_getObserved_matrix_Linking($cuis1Ref, $cuis2Ref); | 
| 957 |  |  |  |  |  |  |  | 
| 958 |  |  |  |  |  |  | #get co-occurrence data for each pairHash | 
| 959 | 0 |  |  |  |  |  | foreach my $pairHashRef(@{$pairHashListRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 960 |  |  |  |  |  |  | (my $cooccurrences1Ref, my $cooccurrences2Ref) = $self | 
| 961 | 0 |  |  |  |  |  | ->_getCUICooccurrences_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}, | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 962 |  |  |  |  |  |  | $n1pAllRef, $np1AllRef); | 
| 963 |  |  |  |  |  |  |  | 
| 964 | 0 |  |  |  |  |  | push @cooccurrences1List, $cooccurrences1Ref; | 
| 965 | 0 |  |  |  |  |  | push @cooccurrences2List, $cooccurrences2Ref; | 
| 966 |  |  |  |  |  |  | } | 
| 967 |  |  |  |  |  |  | } | 
| 968 |  |  |  |  |  |  | else { | 
| 969 |  |  |  |  |  |  | #get total co-occurrences and total unique cuis | 
| 970 | 0 |  |  |  |  |  | $totalCooccurrences = $self->_getNpp_DB(); | 
| 971 |  |  |  |  |  |  |  | 
| 972 |  |  |  |  |  |  | #get npp, the number of unique cuis | 
| 973 |  |  |  |  |  |  | #TODO, query is slightly wrong. If there are cuis that occur in the second position ONLY this will be wrong. I need to merge the CUI 1 and CUI2 tables then select distinct elements | 
| 974 | 0 |  |  |  |  |  | $totalUniqueCuis = shift @{$assocDB_G->selectcol_arrayref("SELECT COUNT(cui_1) FROM (SELECT DISTINCT cui_1 FROM N_11) AS names")}; | 
|  | 0 |  |  |  |  |  |  | 
| 975 |  |  |  |  |  |  |  | 
| 976 |  |  |  |  |  |  | #TODO, check this with MWA now ...will need to code it | 
| 977 |  |  |  |  |  |  | #get co-occurrence data for each pair hash | 
| 978 | 0 |  |  |  |  |  | foreach my $pairHashRef(@{$pairHashListRef}) { | 
|  | 0 |  |  |  |  |  |  | 
| 979 |  |  |  |  |  |  | (my $cooccurrences1Ref, my $cooccurrences2Ref) = $self | 
| 980 | 0 |  |  |  |  |  | ->_getCUICooccurrences_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 981 | 0 |  |  |  |  |  | push @cooccurrences1List, $cooccurrences1Ref; | 
| 982 | 0 |  |  |  |  |  | push @cooccurrences2List, $cooccurrences2Ref; | 
| 983 |  |  |  |  |  |  | } | 
| 984 |  |  |  |  |  |  | } | 
| 985 |  |  |  |  |  |  |  | 
| 986 | 0 |  |  |  |  |  | return (\@cooccurrences1List, \@cooccurrences2List, $totalCooccurrences, $totalUniqueCuis); | 
| 987 |  |  |  |  |  |  | } | 
| 988 |  |  |  |  |  |  |  | 
| 989 |  |  |  |  |  |  |  | 
| 990 |  |  |  |  |  |  | # computes the observed co-occurrences for all combinations of the cuis passed in | 
| 991 |  |  |  |  |  |  | # doing this in a single function makes it so all values can be computed with a | 
| 992 |  |  |  |  |  |  | # single pass of the input file, making execution time much faster | 
| 993 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 994 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 995 |  |  |  |  |  |  | #  output: $n1pAllRef <- a ref to a hash of hashes that contains co-occurrence | 
| 996 |  |  |  |  |  |  | #                        data organized as: | 
| 997 |  |  |  |  |  |  | #                        matrix{leadingCUI}{trailingCUI} = cooccurrencecount | 
| 998 |  |  |  |  |  |  | #          $np1AllRef <- a ref to a hash of hashes that contains co-occurrence | 
| 999 |  |  |  |  |  |  | #                        data organized as: | 
| 1000 |  |  |  |  |  |  | #                        matrix{trailingCUI}{leadingCUI} = cooccurrencecount | 
| 1001 |  |  |  |  |  |  | #          $cooccurrenceCount <- the total number of co-occurrences in | 
| 1002 |  |  |  |  |  |  | #                                the dataset | 
| 1003 |  |  |  |  |  |  | #          $numUniqueCuis <- the number of unique cuis in the dataset | 
| 1004 |  |  |  |  |  |  | sub _getObserved_matrix_Linking { | 
| 1005 |  |  |  |  |  |  | #grab parameters | 
| 1006 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 1007 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 1008 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  | #convert cui arrays to hashes, makes looping thru | 
| 1011 |  |  |  |  |  |  | # the file faster | 
| 1012 | 0 |  |  |  |  |  | my %cuis1 = (); | 
| 1013 | 0 |  |  |  |  |  | foreach my $cui(@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1014 | 0 |  |  |  |  |  | $cuis1{$cui} = 1; | 
| 1015 |  |  |  |  |  |  | } | 
| 1016 | 0 |  |  |  |  |  | my %cuis2 = (); | 
| 1017 | 0 |  |  |  |  |  | foreach my $cui(@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1018 | 0 |  |  |  |  |  | $cuis2{$cui} = 1; | 
| 1019 |  |  |  |  |  |  | } | 
| 1020 |  |  |  |  |  |  |  | 
| 1021 |  |  |  |  |  |  | #get stats | 
| 1022 | 0 |  |  |  |  |  | my %n1pAll = (); | 
| 1023 | 0 |  |  |  |  |  | my %np1All = (); | 
| 1024 | 0 |  |  |  |  |  | my %uniqueCuis = (); | 
| 1025 | 0 |  |  |  |  |  | my $cooccurrenceCount = 0; | 
| 1026 | 0 | 0 |  |  |  |  | open IN, $matrix_G or die "Cannot open matrix_G for input: $matrix_G\n"; | 
| 1027 | 0 |  |  |  |  |  | while (my $line = <IN>) { | 
| 1028 |  |  |  |  |  |  | #get cuis and value from the line | 
| 1029 | 0 |  |  |  |  |  | chomp $line; | 
| 1030 | 0 |  |  |  |  |  | my ($cui1, $cui2, $num) = split /\t/, $line; | 
| 1031 |  |  |  |  |  |  |  | 
| 1032 |  |  |  |  |  |  | #update unique cui lists | 
| 1033 | 0 |  |  |  |  |  | $uniqueCuis{$cui1} = 1; | 
| 1034 | 0 |  |  |  |  |  | $uniqueCuis{$cui2} = 1; | 
| 1035 |  |  |  |  |  |  |  | 
| 1036 |  |  |  |  |  |  | #update co-occurrence count | 
| 1037 | 0 |  |  |  |  |  | $cooccurrenceCount += $num; | 
| 1038 |  |  |  |  |  |  |  | 
| 1039 |  |  |  |  |  |  | #update n1pAll and np1All. These only record data, so we | 
| 1040 |  |  |  |  |  |  | # store every co-occurrence that could matter, whether or | 
| 1041 |  |  |  |  |  |  | # not order matters: just check whether a CUI of interest | 
| 1042 |  |  |  |  |  |  | # appears anywhere on the line | 
| 1043 | 0 | 0 | 0 |  |  |  | if (exists $cuis1{$cui1} || exists $cuis2{$cui2} | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 1044 |  |  |  |  |  |  | || exists $cuis1{$cui2} || exists $cuis2{$cui1}) { | 
| 1045 |  |  |  |  |  |  |  | 
| 1046 |  |  |  |  |  |  | #update n1pAll | 
| 1047 |  |  |  |  |  |  | #create n1p{$cui1} hash if needed | 
| 1048 | 0 | 0 |  |  |  |  | if (!defined $n1pAll{$cui1}) { | 
| 1049 | 0 |  |  |  |  |  | my %newHash = (); | 
| 1050 | 0 |  |  |  |  |  | $n1pAll{$cui1} = \%newHash; | 
| 1051 |  |  |  |  |  |  | } | 
| 1052 |  |  |  |  |  |  |  | 
| 1053 |  |  |  |  |  |  | #add cui2 and value | 
| 1054 | 0 |  |  |  |  |  | ${$n1pAll{$cui1}}{$cui2} = $num; | 
|  | 0 |  |  |  |  |  |  | 
| 1055 |  |  |  |  |  |  |  | 
| 1056 |  |  |  |  |  |  | #update np1All | 
| 1057 |  |  |  |  |  |  | #create np1{$cui2} hash if needed | 
| 1058 | 0 | 0 |  |  |  |  | if (!defined $np1All{$cui2}) { | 
| 1059 | 0 |  |  |  |  |  | my %newHash = (); | 
| 1060 | 0 |  |  |  |  |  | $np1All{$cui2} = \%newHash; | 
| 1061 |  |  |  |  |  |  | } | 
| 1062 |  |  |  |  |  |  |  | 
| 1063 |  |  |  |  |  |  | #add cui1 and value | 
| 1064 | 0 |  |  |  |  |  | ${$np1All{$cui2}}{$cui1} = $num; | 
|  | 0 |  |  |  |  |  |  | 
| 1065 |  |  |  |  |  |  |  | 
| 1066 |  |  |  |  |  |  | } | 
| 1067 |  |  |  |  |  |  | } | 
| 1068 | 0 |  |  |  |  |  | close IN; | 
| 1069 |  |  |  |  |  |  |  | 
| 1070 |  |  |  |  |  |  | #return the observed values | 
| 1071 | 0 |  |  |  |  |  | return (\%n1pAll, \%np1All, $cooccurrenceCount, (scalar keys %uniqueCuis)); | 
| 1072 |  |  |  |  |  |  | } | 
| 1073 |  |  |  |  |  |  |  | 
| 1074 |  |  |  |  |  |  |  | 
| 1075 |  |  |  |  |  |  | # Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using | 
| 1076 |  |  |  |  |  |  | # a matrix. This is the first step in computing linking term associations | 
| 1077 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 1078 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 1079 |  |  |  |  |  |  | #          $n1pAllRef <- a ref to a hash of hashes that contains co-occurrence | 
| 1080 |  |  |  |  |  |  | #                        data organized as: | 
| 1081 |  |  |  |  |  |  | #                        matrix{leadingCUI}{trailingCUI} = cooccurrencecount | 
| 1082 |  |  |  |  |  |  | #          $np1AllRef <- a ref to a hash of hashes that contains co-occurrence | 
| 1083 |  |  |  |  |  |  | #                        data organized as: | 
| 1084 |  |  |  |  |  |  | #                        matrix{trailingCUI}{leadingCUI} = cooccurrencecount | 
| 1085 |  |  |  |  |  |  | # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, | 
| 1086 |  |  |  |  |  |  | #                             values are the co-occurrence count | 
| 1087 |  |  |  |  |  |  | #         \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cui 2, | 
| 1088 |  |  |  |  |  |  | #                             values are the co-occurrence count | 
| 1089 |  |  |  |  |  |  | sub _getCUICooccurrences_matrix { | 
| 1090 |  |  |  |  |  |  | #grab parameters | 
| 1091 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 1092 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 1093 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 1094 | 0 |  |  |  |  |  | my $n1pAllRef = shift; | 
| 1095 | 0 |  |  |  |  |  | my $np1AllRef = shift; | 
| 1096 |  |  |  |  |  |  |  | 
| 1097 |  |  |  |  |  |  | #error checking | 
| 1098 | 0 |  |  |  |  |  | my $function = "_getCUICooccurrences"; | 
| 1099 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 1100 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 1101 |  |  |  |  |  |  | } | 
| 1102 |  |  |  |  |  |  |  | 
| 1103 |  |  |  |  |  |  | #get lists of explicitly co-occurring CUIs for each concept | 
| 1104 |  |  |  |  |  |  | #add trailing cui co-occurrences to cui1Data | 
| 1105 | 0 |  |  |  |  |  | my %cooccurrences1; | 
| 1106 | 0 |  |  |  |  |  | foreach my $cui1 (@{$cuis1Ref}){ | 
|  | 0 |  |  |  |  |  |  | 
| 1107 | 0 | 0 |  |  |  |  | if (defined ${$n1pAllRef}{$cui1}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1108 | 0 |  |  |  |  |  | foreach my $cui2 (keys %{${$n1pAllRef}{$cui1}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1109 | 0 |  |  |  |  |  | $cooccurrences1{$cui2} = ${${$n1pAllRef}{$cui1}}{$cui2}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1110 |  |  |  |  |  |  | } | 
| 1111 |  |  |  |  |  |  | } | 
| 1112 |  |  |  |  |  |  | } | 
| 1113 |  |  |  |  |  |  |  | 
| 1114 |  |  |  |  |  |  | #add leading cui co-occurrences to cui2Data | 
| 1115 | 0 |  |  |  |  |  | my %cooccurrences2; | 
| 1116 | 0 |  |  |  |  |  | foreach my $cui2 (@{$cuis2Ref}){ | 
|  | 0 |  |  |  |  |  |  | 
| 1117 | 0 | 0 |  |  |  |  | if (defined ${$np1AllRef}{$cui2}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1118 | 0 |  |  |  |  |  | foreach my $cui1 (keys %{${$np1AllRef}{$cui2}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1119 | 0 |  |  |  |  |  | $cooccurrences2{$cui1} = ${${$np1AllRef}{$cui2}}{$cui1}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1120 |  |  |  |  |  |  | } | 
| 1121 |  |  |  |  |  |  | } | 
| 1122 |  |  |  |  |  |  | } | 
| 1123 |  |  |  |  |  |  |  | 
| 1124 |  |  |  |  |  |  | #add more CUIs if order doesn't matter | 
| 1125 | 0 | 0 |  |  |  |  | if ($noOrder_G) { | 
| 1126 |  |  |  |  |  |  | #add leading cui co-occurrences to cui1Data | 
| 1127 | 0 |  |  |  |  |  | foreach my $cui1 (@{$cuis1Ref}){ | 
|  | 0 |  |  |  |  |  |  | 
| 1128 | 0 | 0 |  |  |  |  | if (defined ${$np1AllRef}{$cui1}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1129 | 0 |  |  |  |  |  | foreach my $cui2 (keys %{${$np1AllRef}{$cui1}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1130 | 0 |  |  |  |  |  | $cooccurrences1{$cui2} = ${${$np1AllRef}{$cui1}}{$cui2}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1131 |  |  |  |  |  |  | } | 
| 1132 |  |  |  |  |  |  | } | 
| 1133 |  |  |  |  |  |  | } | 
| 1134 |  |  |  |  |  |  | #add trailing cui co-occurrences to cui2Data | 
| 1135 | 0 |  |  |  |  |  | foreach my $cui2 (@{$cuis2Ref}){ | 
|  | 0 |  |  |  |  |  |  | 
| 1136 | 0 | 0 |  |  |  |  | if (defined ${$n1pAllRef}{$cui2}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1137 | 0 |  |  |  |  |  | foreach my $cui1 (keys %{${$n1pAllRef}{$cui2}}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1138 | 0 |  |  |  |  |  | $cooccurrences2{$cui1} = ${${$n1pAllRef}{$cui2}}{$cui1}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 1139 |  |  |  |  |  |  | } | 
| 1140 |  |  |  |  |  |  | } | 
| 1141 |  |  |  |  |  |  | } | 
| 1142 |  |  |  |  |  |  | } | 
| 1143 |  |  |  |  |  |  |  | 
| 1144 | 0 |  |  |  |  |  | return (\%cooccurrences1, \%cooccurrences2); | 
| 1145 |  |  |  |  |  |  | } | 
| 1146 |  |  |  |  |  |  |  | 
| 1147 |  |  |  |  |  |  |  | 
| 1148 |  |  |  |  |  |  | # Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using | 
| 1149 |  |  |  |  |  |  | # a database. This is the first step in computing linking term associations | 
| 1150 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 1151 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 1152 |  |  |  |  |  |  | # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, | 
| 1153 |  |  |  |  |  |  | #                             values are 1 | 
| 1154 |  |  |  |  |  |  | #         \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cui 2, | 
| 1155 |  |  |  |  |  |  | #                             values are 1 | 
| 1156 |  |  |  |  |  |  | sub _getCUICooccurrences_DB { | 
| 1157 |  |  |  |  |  |  | #grab parameters | 
| 1158 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 1159 | 0 |  |  |  |  |  | my $cuis1Ref = shift; | 
| 1160 | 0 |  |  |  |  |  | my $cuis2Ref = shift; | 
| 1161 |  |  |  |  |  |  |  | 
| 1162 |  |  |  |  |  |  | #error checking | 
| 1163 | 0 |  |  |  |  |  | my $function = "_getCUICooccurrences_DB"; | 
| 1164 | 0 | 0 | 0 |  |  |  | if(!defined $self || !ref $self) { | 
| 1165 | 0 |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 1166 |  |  |  |  |  |  | } | 
| 1167 |  |  |  |  |  |  |  | 
| 1168 |  |  |  |  |  |  | #get hashes of co-occurring CUIs | 
| 1169 | 0 |  |  |  |  |  | my %cooccurrences1 = (); | 
| 1170 | 0 |  |  |  |  |  | my %cooccurrences2 = (); | 
| 1171 |  |  |  |  |  |  |  | 
| 1172 |  |  |  |  |  |  | #query DB to get cuis, where concept 1 is the leading cui | 
| 1173 | 0 |  |  |  |  |  | my $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 1174 | 0 |  |  |  |  |  | my $query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; | 
| 1175 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1176 | 0 |  |  |  |  |  | $query .= "OR N_11.cui_1 = '$cui' "; | 
| 1177 |  |  |  |  |  |  | } | 
| 1178 | 0 |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1179 | 0 |  |  |  |  |  | my @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
|  | 0 |  |  |  |  |  |  | 
| 1180 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 1181 |  |  |  |  |  |  |  | 
| 1182 |  |  |  |  |  |  | #turn CUIs into a hash of cui1's cooccurrences | 
| 1183 | 0 |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1184 | 0 |  |  |  |  |  | $cooccurrences1{$cui} = 1; | 
| 1185 |  |  |  |  |  |  | } | 
| 1186 |  |  |  |  |  |  |  | 
| 1187 |  |  |  |  |  |  | #query DB to get cuis, where concept 2 is the trailing cui | 
| 1188 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 1189 | 0 |  |  |  |  |  | $query =  "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; | 
| 1190 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1191 | 0 |  |  |  |  |  | $query .= "OR N_11.cui_2 = '$cui' "; | 
| 1192 |  |  |  |  |  |  | } | 
| 1193 | 0 |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1194 | 0 |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
|  | 0 |  |  |  |  |  |  | 
| 1195 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 1196 |  |  |  |  |  |  |  | 
| 1197 |  |  |  |  |  |  | #turn CUIs into a hash of cui2's co-occurrences | 
| 1198 | 0 |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1199 | 0 |  |  |  |  |  | $cooccurrences2{$cui} = 1; | 
| 1200 |  |  |  |  |  |  | } | 
| 1201 |  |  |  |  |  |  |  | 
| 1202 |  |  |  |  |  |  | #add additional cuis if order doesn't matter | 
| 1203 | 0 | 0 |  |  |  |  | if($noOrder_G) { | 
| 1204 |  |  |  |  |  |  | #get cuis, where concept 1 is the trailing cui | 
| 1205 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis1Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 1206 | 0 |  |  |  |  |  | my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; | 
| 1207 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1208 | 0 |  |  |  |  |  | $query .= "OR N_11.cui_2 = '$cui' "; | 
| 1209 |  |  |  |  |  |  | } | 
| 1210 | 0 |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1211 | 0 |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
|  | 0 |  |  |  |  |  |  | 
| 1212 | 0 |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 1213 |  |  |  |  |  |  |  | 
| 1214 |  |  |  |  |  |  | #add cuis to the hash of cui1's co-occurrences | 
| 1215 | 0 |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1216 | 0 |  |  |  |  |  | $cooccurrences1{$cui} = 1; | 
| 1217 |  |  |  |  |  |  | } | 
| 1218 |  |  |  |  |  |  |  | 
| 1219 |  |  |  |  |  |  | #get cuis, where concept 2 is the leading cui | 
| 1220 | 0 |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
|  | 0 |  |  |  |  |  |  | 
| 1221 | 0 |  |  |  |  |  | $query =  "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; | 
| 1222 | 0 |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
|  | 0 |  |  |  |  |  |  | 
| 1223 | 0 |  |  |  |  |  | $query .= "OR N_11.cui_1 = '$cui' "; | 
| 1224 |  |  |  |  |  |  | } | 
| 1225 | 0 |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1226 | 0 |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
|  | 0 |  |  |  |  |  |  | 
| 1227 | 0 |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
|  | 0 |  |  |  |  |  |  | 
| 1228 |  |  |  |  |  |  |  | 
| 1229 |  |  |  |  |  |  | #add cuis to the hash of cui2's co-occurrences | 
| 1230 | 0 |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1231 | 0 |  |  |  |  |  | $cooccurrences2{$cui} = 1; | 
| 1232 |  |  |  |  |  |  | } | 
| 1233 |  |  |  |  |  |  | } | 
| 1234 |  |  |  |  |  |  |  | 
| 1235 |  |  |  |  |  |  | #return the cui co-occurrences | 
| 1236 | 0 |  |  |  |  |  | return (\%cooccurrences1, \%cooccurrences2); | 
| 1237 |  |  |  |  |  |  | } | 
| 1238 |  |  |  |  |  |  |  | 
| 1239 |  |  |  |  |  |  |  | 
| 1240 |  |  |  |  |  |  | =comment | 
| 1241 |  |  |  |  |  |  | # Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using | 
| 1242 |  |  |  |  |  |  | # a database. This is the first step in computing linking term associations | 
| 1243 |  |  |  |  |  |  | #  input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs | 
| 1244 |  |  |  |  |  |  | #          $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs | 
| 1245 |  |  |  |  |  |  | # output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, | 
| 1246 |  |  |  |  |  |  | #                             values are 1 | 
| 1247 |  |  |  |  |  |  | #         \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cui 2, | 
| 1248 |  |  |  |  |  |  | #                             values are 1 | 
| 1249 |  |  |  |  |  |  | sub _getCUICooccurrences_DB { | 
| 1250 |  |  |  |  |  |  | #grab parameters | 
| 1251 |  |  |  |  |  |  | my $self = shift; | 
| 1252 |  |  |  |  |  |  | my $cuis1Ref = shift; | 
| 1253 |  |  |  |  |  |  | my $cuis2Ref = shift; | 
| 1254 |  |  |  |  |  |  |  | 
| 1255 |  |  |  |  |  |  | #error checking | 
| 1256 |  |  |  |  |  |  | my $function = "_getCUICooccurrences_DB"; | 
| 1257 |  |  |  |  |  |  | if(!defined $self || !ref $self) { | 
| 1258 |  |  |  |  |  |  | $errorhandler->_error($pkg, $function, "", 2); | 
| 1259 |  |  |  |  |  |  | } | 
| 1260 |  |  |  |  |  |  |  | 
| 1261 |  |  |  |  |  |  | #get hashes of co-occurring CUIs | 
| 1262 |  |  |  |  |  |  | my %cooccurrences1 = (); | 
| 1263 |  |  |  |  |  |  | my %cooccurrences2 = (); | 
| 1264 |  |  |  |  |  |  |  | 
| 1265 |  |  |  |  |  |  | #query DB to get cuis, where concept 1 is the leading cui | 
| 1266 |  |  |  |  |  |  | my $firstCui = shift @{$cuis1Ref}; | 
| 1267 |  |  |  |  |  |  | my $query = "SELECT * FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; | 
| 1268 |  |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
| 1269 |  |  |  |  |  |  | $query .= "OR N_11.cui_1 = '$cui' "; | 
| 1270 |  |  |  |  |  |  | } | 
| 1271 |  |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1272 |  |  |  |  |  |  | my $sth = $assocDB_G->prepare($query); | 
| 1273 |  |  |  |  |  |  | $sth->execute(); | 
| 1274 |  |  |  |  |  |  | my @rows = @{$sth->fetchall_arrayref()}; | 
| 1275 |  |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
| 1276 |  |  |  |  |  |  |  | 
| 1277 |  |  |  |  |  |  | #turn CUIs into a hash of cui1's cooccurrences | 
| 1278 |  |  |  |  |  |  | foreach my $rowRef (@rows) { | 
| 1279 |  |  |  |  |  |  | print STDERR join(' ', @{$rowRef})."\n"; | 
| 1280 |  |  |  |  |  |  | } | 
| 1281 |  |  |  |  |  |  | #TODO - this is done, it works ... it gets back the whole relevant table. Now fill up as needed. | 
| 1282 |  |  |  |  |  |  |  | 
| 1283 |  |  |  |  |  |  |  | 
| 1284 |  |  |  |  |  |  | my @cuis; | 
| 1285 |  |  |  |  |  |  | #query DB to get cuis, where concept 2 is the trailing cui | 
| 1286 |  |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
| 1287 |  |  |  |  |  |  | $query =  "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; | 
| 1288 |  |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
| 1289 |  |  |  |  |  |  | $query .= "OR N_11.cui_2 = '$cui' "; | 
| 1290 |  |  |  |  |  |  | } | 
| 1291 |  |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1292 |  |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
| 1293 |  |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
| 1294 |  |  |  |  |  |  |  | 
| 1295 |  |  |  |  |  |  | #turn CUIs into a hash of cui2's co-occurrences | 
| 1296 |  |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1297 |  |  |  |  |  |  | $cooccurrences2{$cui} = 1; | 
| 1298 |  |  |  |  |  |  | } | 
| 1299 |  |  |  |  |  |  |  | 
| 1300 |  |  |  |  |  |  | #add additional cuis if order doesn't matter | 
| 1301 |  |  |  |  |  |  | if($noOrder_G) { | 
| 1302 |  |  |  |  |  |  | #get cuis, where concept 1 is the trailing cui | 
| 1303 |  |  |  |  |  |  | $firstCui = shift @{$cuis1Ref}; | 
| 1304 |  |  |  |  |  |  | my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; | 
| 1305 |  |  |  |  |  |  | foreach my $cui (@{$cuis1Ref}) { | 
| 1306 |  |  |  |  |  |  | $query .= "OR N_11.cui_2 = '$cui' "; | 
| 1307 |  |  |  |  |  |  | } | 
| 1308 |  |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1309 |  |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
| 1310 |  |  |  |  |  |  | unshift @{$cuis1Ref}, $firstCui; | 
| 1311 |  |  |  |  |  |  |  | 
| 1312 |  |  |  |  |  |  | #add cuis to the hash of cui1's co-occurrences | 
| 1313 |  |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1314 |  |  |  |  |  |  | $cooccurrences1{$cui} = 1; | 
| 1315 |  |  |  |  |  |  | } | 
| 1316 |  |  |  |  |  |  |  | 
| 1317 |  |  |  |  |  |  | #get cuis, where concept 2 is the leading cui | 
| 1318 |  |  |  |  |  |  | $firstCui = shift @{$cuis2Ref}; | 
| 1319 |  |  |  |  |  |  | $query =  "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; | 
| 1320 |  |  |  |  |  |  | foreach my $cui (@{$cuis2Ref}) { | 
| 1321 |  |  |  |  |  |  | $query .= "OR N_11.cui_1 = '$cui' "; | 
| 1322 |  |  |  |  |  |  | } | 
| 1323 |  |  |  |  |  |  | $query .= ") AND N_11.n_11 > 0;"; | 
| 1324 |  |  |  |  |  |  | @cuis = @{$assocDB_G->selectcol_arrayref($query)}; | 
| 1325 |  |  |  |  |  |  | unshift @{$cuis2Ref}, $firstCui; | 
| 1326 |  |  |  |  |  |  |  | 
| 1327 |  |  |  |  |  |  | #add cuis to the hash of cui2's co-occurrences | 
| 1328 |  |  |  |  |  |  | foreach my $cui (@cuis) { | 
| 1329 |  |  |  |  |  |  | $cooccurrences2{$cui} = 1; | 
| 1330 |  |  |  |  |  |  | } | 
| 1331 |  |  |  |  |  |  | } | 
| 1332 |  |  |  |  |  |  |  | 
| 1333 |  |  |  |  |  |  | #return the cui co-occurrences | 
| 1334 |  |  |  |  |  |  | return (\%cooccurrences1, \%cooccurrences2); | 
| 1335 |  |  |  |  |  |  | } | 
| 1336 |  |  |  |  |  |  | =cut | 
| 1337 |  |  |  |  |  |  |  | 
| 1338 |  |  |  |  |  |  | 1; | 
| 1339 |  |  |  |  |  |  |  | 
| 1340 |  |  |  |  |  |  | __END__ |