| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#UMLS::Association |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# Perl module for scoring the semantic association of terms in the Unified |
|
4
|
|
|
|
|
|
|
# Medical Language System (UMLS). |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Copyright (c) 2015 |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Bridget T. McInnes, Virginia Commonwealth University |
|
9
|
|
|
|
|
|
|
# btmcinnes at vcu.edu |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Keith Herbert, Virginia Commonwealth University |
|
12
|
|
|
|
|
|
|
# herbertkb at vcu.edu |
|
13
|
|
|
|
|
|
|
# |
|
14
|
|
|
|
|
|
|
# Alexander D. McQuilkin, Virginia Commonwealth University |
|
15
|
|
|
|
|
|
|
# alexmcq99 at yahoo.com |
|
16
|
|
|
|
|
|
|
# |
|
17
|
|
|
|
|
|
|
# Sam Henry, Virginia Commonwealth University |
|
18
|
|
|
|
|
|
|
# henryst at vcu.edu |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
|
21
|
|
|
|
|
|
|
# modify it under the terms of the GNU General Public License |
|
22
|
|
|
|
|
|
|
# as published by the Free Software Foundation; either version 2 |
|
23
|
|
|
|
|
|
|
# of the License, or (at your option) any later version. |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
|
26
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
27
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
28
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
29
|
|
|
|
|
|
|
# |
|
30
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
31
|
|
|
|
|
|
|
# along with this program; if not, write to |
|
32
|
|
|
|
|
|
|
# |
|
33
|
|
|
|
|
|
|
# The Free Software Foundation, Inc., |
|
34
|
|
|
|
|
|
|
# 59 Temple Place - Suite 330, |
|
35
|
|
|
|
|
|
|
# Boston, MA 02111-1307, USA. |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
package UMLS::Association::StatFinder; |
|
38
|
|
|
|
|
|
|
|
|
39
|
1
|
|
|
1
|
|
7
|
use Fcntl; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
214
|
|
|
40
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
19
|
|
|
41
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
29
|
|
|
42
|
1
|
|
|
1
|
|
4
|
use DBI; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
30
|
|
|
43
|
1
|
|
|
1
|
|
5
|
use bytes; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
4
|
|
|
44
|
1
|
|
|
1
|
|
18
|
use File::Spec; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
5346
|
|
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# error handling variables |
|
47
|
|
|
|
|
|
|
my $errorhandler = ""; |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my $pkg = "UMLS::Association::StatFinder"; |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# debug variables |
|
52
|
|
|
|
|
|
|
#local(*DEBUG_FILE); |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
#NOTE: every global variable is followed by a _G with the |
|
55
|
|
|
|
|
|
|
# exception of debug error handler, and constants which are all caps |
|
56
|
|
|
|
|
|
|
# global variables |
|
57
|
|
|
|
|
|
|
my $debug = 0; #in debug mode or not |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
#global options variables |
|
60
|
|
|
|
|
|
|
my $assocDB_G; |
|
61
|
|
|
|
|
|
|
my $lta_G = 0; #1 or 0 is using lta or not |
|
62
|
|
|
|
|
|
|
my $mwa_G = 0; #1 or 0 if using mwa or not |
|
63
|
|
|
|
|
|
|
my $vsa_G = 0; #1 or 0 if using vsa or not |
|
64
|
|
|
|
|
|
|
my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not |
|
65
|
|
|
|
|
|
|
my $matrix_G = 0; #matrix file name is using a matrix file rather than DB |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
###################################################################### |
|
68
|
|
|
|
|
|
|
# Initialization Functions |
|
69
|
|
|
|
|
|
|
###################################################################### |
|
70
|
|
|
|
|
|
|
# method to create a new UMLS::Association::StatFinder object |
|
71
|
|
|
|
|
|
|
# input : $params <- reference to hash of database parameters |
|
72
|
|
|
|
|
|
|
# output: $self |
|
73
|
|
|
|
|
|
|
sub new { |
|
74
|
|
|
|
|
|
|
#grab params and create self |
|
75
|
1
|
|
|
1
|
0
|
2
|
my $self = {}; |
|
76
|
1
|
|
|
|
|
2
|
my $className = shift; |
|
77
|
1
|
|
|
|
|
2
|
my $params = shift; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
#bless the object. |
|
80
|
1
|
|
|
|
|
2
|
bless($self, $className); |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#initialize error handler |
|
83
|
1
|
|
|
|
|
5
|
$errorhandler = UMLS::Association::ErrorHandler->new(); |
|
84
|
1
|
50
|
|
|
|
13
|
if(! defined $errorhandler) { |
|
85
|
0
|
|
|
|
|
0
|
print STDERR "The error handler did not get passed properly.\n"; |
|
86
|
0
|
|
|
|
|
0
|
exit; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# initialize the object. |
|
90
|
1
|
|
|
|
|
3
|
$debug = 0; |
|
91
|
1
|
|
|
|
|
8
|
$self->_initialize($params); |
|
92
|
1
|
|
|
|
|
3
|
return $self; |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# method to initialize the UMLS::Association::StatFinder object. |
|
96
|
|
|
|
|
|
|
# input : $parameters <- reference to a hash of database parameters |
|
97
|
|
|
|
|
|
|
# output: none, but $self is initialized |
|
98
|
|
|
|
|
|
|
sub _initialize { |
|
99
|
|
|
|
|
|
|
#grab parameters |
|
100
|
1
|
|
|
1
|
|
4
|
my $self = shift; |
|
101
|
1
|
|
|
|
|
3
|
my $paramsRef = shift; |
|
102
|
1
|
|
|
|
|
4
|
my %params = %{$paramsRef}; |
|
|
1
|
|
|
|
|
7
|
|
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
#set global variables using option hash |
|
105
|
1
|
|
|
|
|
5
|
$lta_G = $params{'lta'}; |
|
106
|
1
|
|
|
|
|
3
|
$mwa_G = $params{'mwa'}; |
|
107
|
1
|
|
|
|
|
3
|
$vsa_G = $params{'vsa'}; |
|
108
|
1
|
|
|
|
|
4
|
$noOrder_G = $params{'noorder'}; |
|
109
|
1
|
|
|
|
|
4
|
$matrix_G = $params{'matrix'}; |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
#connect to the database of association scores |
|
112
|
1
|
50
|
|
|
|
4
|
if (!$matrix_G) { |
|
113
|
0
|
|
|
|
|
0
|
$self->_setDatabase($paramsRef); |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
#error checking |
|
117
|
1
|
|
|
|
|
2
|
my $function = "_initialize"; |
|
118
|
1
|
|
|
|
|
4
|
&_debug($function); |
|
119
|
1
|
50
|
33
|
|
|
5
|
if(!defined $self || !ref $self) { |
|
120
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "", 2); |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
#TODO, remove this once I have DB implemented |
|
124
|
|
|
|
|
|
|
#check that a matrix is specified for options (need to implement DB mode) |
|
125
|
1
|
50
|
33
|
|
|
11
|
if (!$matrix_G && $mwa_G) { |
|
126
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "MWA requires the --matrix option", 12); |
|
127
|
|
|
|
|
|
|
} |
|
128
|
1
|
50
|
33
|
|
|
6
|
if (!$matrix_G && $vsa_G) { |
|
129
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "VSA requires the --matrix option", 12); |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _debug { |
|
134
|
1
|
|
|
1
|
|
2
|
my $function = shift; |
|
135
|
1
|
50
|
|
|
|
3
|
if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; } |
|
|
0
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# method to set the association database |
|
139
|
|
|
|
|
|
|
# input : $params <- reference to a hash |
|
140
|
|
|
|
|
|
|
# output: none, but association database is set and initialized |
|
141
|
|
|
|
|
|
|
sub _setDatabase { |
|
142
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
143
|
0
|
|
|
|
|
|
my $params = shift; |
|
144
|
|
|
|
|
|
|
|
|
145
|
0
|
|
|
|
|
|
my $function = "_setDatabase"; |
|
146
|
0
|
|
|
|
|
|
&_debug($function); |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# check self |
|
149
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
150
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# check the params |
|
154
|
0
|
0
|
|
|
|
|
$params = {} if(!defined $params); |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# get the database connection parameters |
|
157
|
0
|
|
|
|
|
|
my $database = $params->{'database'}; |
|
158
|
0
|
|
|
|
|
|
my $hostname = $params->{'hostname'}; |
|
159
|
0
|
|
|
|
|
|
my $socket = $params->{'socket'}; |
|
160
|
0
|
|
|
|
|
|
my $port = $params->{'port'}; |
|
161
|
0
|
|
|
|
|
|
my $username = $params->{'username'}; |
|
162
|
0
|
|
|
|
|
|
my $password = $params->{'password'}; |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# set up defaults if the options were not passed |
|
165
|
0
|
0
|
|
|
|
|
if(! defined $database) { $database = "cuicounts"; } |
|
|
0
|
|
|
|
|
|
|
|
166
|
0
|
0
|
|
|
|
|
if(! defined $socket) { $socket = "/var/run/mysqld/mysqld.sock"; } |
|
|
0
|
|
|
|
|
|
|
|
167
|
0
|
0
|
|
|
|
|
if(! defined $hostname) { $hostname = "localhost"; } |
|
|
0
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# initialize the database handler |
|
170
|
0
|
|
|
|
|
|
$assocDB_G = ""; |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# create the database object... |
|
173
|
0
|
0
|
0
|
|
|
|
if(defined $username and defined $password) { |
|
174
|
0
|
0
|
|
|
|
|
if($debug) { print STDERR "Connecting with username and password\n"; } |
|
|
0
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
|
$assocDB_G = DBI->connect("DBI:mysql:database=$database;mysql_socket=$socket;host=$hostname",$username, $password, {RaiseError => 0}); |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
else { |
|
178
|
0
|
0
|
|
|
|
|
if($debug) { print STDERR "Connecting using the my.cnf file\n"; } |
|
|
0
|
|
|
|
|
|
|
|
179
|
0
|
|
|
|
|
|
my $dsn = "DBI:mysql:umls;mysql_read_default_group=client;database=$database"; |
|
180
|
0
|
|
|
|
|
|
$assocDB_G = DBI->connect($dsn); |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# check if there is an error |
|
184
|
0
|
|
|
|
|
|
$errorhandler->_checkDbError($pkg, $function, $assocDB_G); |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# check that the db exists |
|
187
|
0
|
0
|
|
|
|
|
if(!$assocDB_G) { $errorhandler->_error($pkg, $function, "Error with db.", 3); } |
|
|
0
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# set database parameters |
|
190
|
0
|
|
|
|
|
|
$assocDB_G->{'mysql_enable_utf8'} = 1; |
|
191
|
0
|
|
|
|
|
|
$assocDB_G->do('SET NAMES utf8'); |
|
192
|
0
|
|
|
|
|
|
$assocDB_G->{mysql_auto_reconnect} = 1; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
###################################################################### |
|
196
|
|
|
|
|
|
|
# public interface to get observed counts |
|
197
|
|
|
|
|
|
|
###################################################################### |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Gets observed counts (n11, n1p, np1, npp) of the cui sets |
|
200
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
|
201
|
|
|
|
|
|
|
# output: \@allStatsRef - a ref to an array of observed counts 4-tuples |
|
202
|
|
|
|
|
|
|
# each 4-tuple consists of in order: |
|
203
|
|
|
|
|
|
|
# $n11, $n1p, $np1, and $npp |
|
204
|
|
|
|
|
|
|
# and they correspond to the observed counts of |
|
205
|
|
|
|
|
|
|
# each of the pairHashes passed in |
|
206
|
|
|
|
|
|
|
sub getObservedCounts { |
|
207
|
|
|
|
|
|
|
#grab parameters |
|
208
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
209
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
#error checking |
|
212
|
0
|
|
|
|
|
|
my $function = "getObservedCounts"; |
|
213
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
214
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
#calculate n11, n1p, np1, npp using a matrix or DB |
|
218
|
|
|
|
|
|
|
# and according to the method of various other options |
|
219
|
0
|
|
|
|
|
|
my $allStatsRef = -1; |
|
220
|
0
|
0
|
|
|
|
|
if ($lta_G) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
221
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_LTA($pairHashListRef); |
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
elsif ($mwa_G) { |
|
224
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_MWA($pairHashListRef); |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
elsif ($vsa_G) { |
|
227
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_VSA($pairHashListRef); |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
else { |
|
230
|
0
|
0
|
|
|
|
|
if ($matrix_G) { |
|
231
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_matrix($pairHashListRef); |
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
else { |
|
234
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_DB($pairHashListRef); |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
} |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
#return a reference to a list of stats for each pairHash |
|
239
|
0
|
|
|
|
|
|
return $allStatsRef; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
###################################################################### |
|
244
|
|
|
|
|
|
|
# functions to get statistical information about the cuis using a DB |
|
245
|
|
|
|
|
|
|
###################################################################### |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# gets N11, N1P, NP1, NPP for a pairHashList using a database |
|
248
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
249
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
|
250
|
|
|
|
|
|
|
# for each pair Hash, $n11, $n1p, $np1, and $npp |
|
251
|
|
|
|
|
|
|
sub _getStats_DB { |
|
252
|
|
|
|
|
|
|
#grab parameters |
|
253
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
254
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
#error checking |
|
257
|
0
|
|
|
|
|
|
my $function = "_getStats_DB"; |
|
258
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
259
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
#compute observed counts for each pair hash |
|
263
|
0
|
|
|
|
|
|
my @data = (); |
|
264
|
0
|
|
|
|
|
|
my $npp = $self->_getNpp_DB(); |
|
265
|
0
|
|
|
|
|
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
|
0
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
#grab the data from a DB |
|
268
|
0
|
|
|
|
|
|
my $n11 = $self->_getN11_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
269
|
0
|
|
|
|
|
|
my $n1p = $self->_getN1p_DB(${$pairHashRef}{'set1'}); |
|
|
0
|
|
|
|
|
|
|
|
270
|
0
|
|
|
|
|
|
my $np1 = $self->_getNp1_DB(${$pairHashRef}{'set2'}); |
|
|
0
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
#store the data |
|
273
|
0
|
|
|
|
|
|
my @values = ($n11, $n1p, $np1, $npp); |
|
274
|
0
|
|
|
|
|
|
push @data, \@values; |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
#return the data |
|
278
|
0
|
|
|
|
|
|
return \@data; |
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Gets N11 of the cui pair using a database |
|
282
|
|
|
|
|
|
|
# input: $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
283
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
284
|
|
|
|
|
|
|
# output: $n11 <- n11 of cui sets |
|
285
|
|
|
|
|
|
|
sub _getN11_DB { |
|
286
|
|
|
|
|
|
|
#grab parameters |
|
287
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
288
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
289
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
#error checking |
|
292
|
0
|
|
|
|
|
|
my $function = "_getN11"; |
|
293
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
294
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
#build a query string for n11 |
|
298
|
0
|
|
|
|
|
|
my $firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
299
|
0
|
|
|
|
|
|
my $queryString = "select SUM(n_11) from N_11 where ((cui_1 = '$firstCui' "; |
|
300
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
301
|
0
|
|
|
|
|
|
$queryString .= "or cui_1 = '$cui' "; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
#set all cui2's |
|
306
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
|
$queryString .= ") and (cui_2 = '$firstCui' "; |
|
308
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
309
|
0
|
|
|
|
|
|
$queryString .= "or cui_2 = '$cui' "; |
|
310
|
|
|
|
|
|
|
} |
|
311
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
#finalize the query string |
|
314
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
315
|
|
|
|
|
|
|
#swap the positions of the cuis |
|
316
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
317
|
0
|
|
|
|
|
|
$queryString .= ")) or ((cui_1 = '$firstCui' "; |
|
318
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
319
|
0
|
|
|
|
|
|
$queryString .= "or cui_1 = '$cui' "; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
|
|
323
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
$queryString .= ") and (cui_2 = '$firstCui' "; |
|
325
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
326
|
0
|
|
|
|
|
|
$queryString .= "or cui_2 = '$cui' "; |
|
327
|
|
|
|
|
|
|
} |
|
328
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
} |
|
330
|
0
|
|
|
|
|
|
$queryString .= "));"; |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
#query the DB and return n11 |
|
333
|
0
|
|
|
|
|
|
my $n11 = shift @{$assocDB_G->selectcol_arrayref($queryString)}; |
|
|
0
|
|
|
|
|
|
|
|
334
|
0
|
0
|
|
|
|
|
if (!defined $n11) { |
|
335
|
0
|
|
|
|
|
|
$n11 = 0; |
|
336
|
|
|
|
|
|
|
} |
|
337
|
0
|
|
|
|
|
|
return $n11; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# Method to return the np1 of a concept using a database |
|
341
|
|
|
|
|
|
|
# input : $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
342
|
|
|
|
|
|
|
# output: $np1 <- number of times the cuis2Ref set occurs in second bigram position |
|
343
|
|
|
|
|
|
|
sub _getNp1_DB { |
|
344
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
345
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
#error checking |
|
348
|
0
|
|
|
|
|
|
my $function = "_getNp1_DB"; |
|
349
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
350
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
#build a query string for all where cui2's are in the second position |
|
354
|
0
|
|
|
|
|
|
my $firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
355
|
0
|
|
|
|
|
|
my $queryString = "select SUM(n_11) from N_11 where (cui_2 = '$firstCui' "; |
|
356
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
357
|
0
|
|
|
|
|
|
$queryString .= "or cui_2 = '$cui' "; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
#finalize the query string |
|
362
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
363
|
|
|
|
|
|
|
#add where cui2 is in the first position |
|
364
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
365
|
0
|
|
|
|
|
|
$queryString .= ") or (cui_1 = '$firstCui' "; |
|
366
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
367
|
0
|
|
|
|
|
|
$queryString .= "or cui_1 = '$cui' "; |
|
368
|
|
|
|
|
|
|
} |
|
369
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
} |
|
371
|
0
|
|
|
|
|
|
$queryString .= ");"; |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
#query the db to retrive np1 |
|
374
|
0
|
|
|
|
|
|
my $np1 = shift @{$assocDB_G->selectcol_arrayref($queryString)}; |
|
|
0
|
|
|
|
|
|
|
|
375
|
0
|
0
|
|
|
|
|
if (!defined $np1) { |
|
376
|
0
|
|
|
|
|
|
$np1 = -1; |
|
377
|
|
|
|
|
|
|
} |
|
378
|
0
|
|
|
|
|
|
return $np1; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Method to return the n1p of a concept from a database |
|
382
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
383
|
|
|
|
|
|
|
# output: $n1p <- number of times cuis in cuis1 set occurs in first bigram position |
|
384
|
|
|
|
|
|
|
sub _getN1p_DB { |
|
385
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
386
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
#error checking |
|
389
|
0
|
|
|
|
|
|
my $function = "_getN1p"; |
|
390
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
391
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
#build the query string for all where cui1's are in the first position |
|
395
|
0
|
|
|
|
|
|
my $firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
396
|
0
|
|
|
|
|
|
my $queryString = "select SUM(n_11) from N_11 where (cui_1 = '$firstCui' "; |
|
397
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
398
|
0
|
|
|
|
|
|
$queryString .= "or cui_1 = '$cui' "; |
|
399
|
|
|
|
|
|
|
} |
|
400
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
#finalize the query string |
|
403
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
404
|
|
|
|
|
|
|
#add where cui1 is in the second position |
|
405
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
|
$queryString .= ") or (cui_2 = '$firstCui' "; |
|
407
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
408
|
0
|
|
|
|
|
|
$queryString .= "or cui_2 = '$cui' "; |
|
409
|
|
|
|
|
|
|
} |
|
410
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
} |
|
412
|
0
|
|
|
|
|
|
$queryString .= ");"; |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
#query the db to retrive n1p |
|
415
|
0
|
|
|
|
|
|
my $n1p = shift @{$assocDB_G->selectcol_arrayref($queryString)}; |
|
|
0
|
|
|
|
|
|
|
|
416
|
0
|
0
|
|
|
|
|
if (!defined $n1p) { |
|
417
|
0
|
|
|
|
|
|
$n1p = -1; |
|
418
|
|
|
|
|
|
|
} |
|
419
|
0
|
|
|
|
|
|
return $n1p; |
|
420
|
|
|
|
|
|
|
} |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Method to calculate npp from a DB |
|
423
|
|
|
|
|
|
|
# input : none |
|
424
|
|
|
|
|
|
|
# output: $npp |
|
425
|
|
|
|
|
|
|
sub _getNpp_DB { |
|
426
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
#error checking |
|
429
|
0
|
|
|
|
|
|
my $function = "getNpp_DB"; |
|
430
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
431
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
432
|
|
|
|
|
|
|
} |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
#get npp, the number of co-occurrences |
|
435
|
0
|
|
|
|
|
|
my $npp = shift @{$assocDB_G->selectcol_arrayref("select sum(N_11) from N_11")}; |
|
|
0
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
#update $npp for noOrder, since Cuis can be trailing or leading its 2x ordered npp |
|
438
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
439
|
0
|
|
|
|
|
|
$npp *= 2; |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
#return npp |
|
443
|
0
|
0
|
|
|
|
|
if($npp <= 0) { $errorhandler->_error($pkg, $function, "", 5); } |
|
|
0
|
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
|
return $npp; |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
######################################################################## |
|
448
|
|
|
|
|
|
|
# functions to get statistical information about the cuis using a matrix |
|
449
|
|
|
|
|
|
|
######################################################################## |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# Gets arrays of all first (leading) and second (trailing) cuis |
|
453
|
|
|
|
|
|
|
# This is used when retreiving data from a matrix flat file |
|
454
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
|
455
|
|
|
|
|
|
|
# output: (\@cuis1, \@cuis2) - two array refs, the first contains |
|
456
|
|
|
|
|
|
|
# all leading cuis in the dataset, the |
|
457
|
|
|
|
|
|
|
# second contains all trailing cuis in |
|
458
|
|
|
|
|
|
|
# the dataset. |
|
459
|
|
|
|
|
|
|
sub _getAllLeadingAndTrailingCuis { |
|
460
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
461
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
#create a list of all possible cuis in the first and second positions |
|
464
|
0
|
|
|
|
|
|
my @cuis1 = (); |
|
465
|
0
|
|
|
|
|
|
my @cuis2 = (); |
|
466
|
0
|
|
|
|
|
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
|
0
|
|
|
|
|
|
|
|
467
|
0
|
|
|
|
|
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
468
|
0
|
|
|
|
|
|
push @cuis1, $cui; |
|
469
|
|
|
|
|
|
|
} |
|
470
|
0
|
|
|
|
|
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
471
|
0
|
|
|
|
|
|
push @cuis2, $cui; |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
} |
|
474
|
0
|
|
|
|
|
|
return (\@cuis1, \@cuis2); |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
# gets N11, N1P, NP1, NPP for a pairHashList using a matrix |
|
479
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
480
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
|
481
|
|
|
|
|
|
|
# for each pair Hash, $n11, $n1p, $np1, and $npp |
|
482
|
|
|
|
|
|
|
sub _getStats_matrix { |
|
483
|
|
|
|
|
|
|
#grab parameters |
|
484
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
485
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
#error checking |
|
488
|
0
|
|
|
|
|
|
my $function = "_getStats_matrix"; |
|
489
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
490
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
491
|
|
|
|
|
|
|
} |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
#get all observed counts for all possible cuis in the term pairs |
|
494
|
0
|
|
|
|
|
|
(my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef); |
|
495
|
0
|
|
|
|
|
|
my $countsRef = $self->_getObservedCounts_matrix($cuis1Ref, $cuis2Ref); |
|
496
|
0
|
|
|
|
|
|
my $n11AllRef = ${$countsRef}[0]; |
|
|
0
|
|
|
|
|
|
|
|
497
|
0
|
|
|
|
|
|
my $n1pAllRef = ${$countsRef}[1]; |
|
|
0
|
|
|
|
|
|
|
|
498
|
0
|
|
|
|
|
|
my $np1AllRef = ${$countsRef}[2]; |
|
|
0
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
|
my $npp = ${$countsRef}[3]; |
|
|
0
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
#update $npp for noOrder, since Cuis can be trailing or leading its 2x ordered npp |
|
502
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
503
|
0
|
|
|
|
|
|
$npp *= 2; |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
#get values for each pairHash based on what was retreived from the matrix |
|
507
|
0
|
|
|
|
|
|
my @data = (); |
|
508
|
0
|
|
|
|
|
|
foreach my $pairHashRef (@{$pairHashListRef}) { |
|
|
0
|
|
|
|
|
|
|
|
509
|
0
|
|
|
|
|
|
my $n11 = $self->_getN11_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}, $n11AllRef); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
510
|
0
|
|
|
|
|
|
my $n1p = $self->_getN1p_matrix(${$pairHashRef}{'set1'}, $n11AllRef, $n1pAllRef, $np1AllRef); |
|
|
0
|
|
|
|
|
|
|
|
511
|
0
|
|
|
|
|
|
my $np1 = $self->_getNp1_matrix(${$pairHashRef}{'set2'}, $n11AllRef, $n1pAllRef, $np1AllRef); |
|
|
0
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
|
|
513
|
0
|
|
|
|
|
|
my @vals = ($n11, $n1p, $np1, $npp); |
|
514
|
0
|
|
|
|
|
|
push @data, \@vals; |
|
515
|
|
|
|
|
|
|
} |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
#return the data |
|
518
|
0
|
|
|
|
|
|
return \@data; |
|
519
|
|
|
|
|
|
|
} |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
#computes the observed counts for all combinations of the cuis passed in |
|
522
|
|
|
|
|
|
|
#doing this in a single function makes it so all values can be computed with a |
|
523
|
|
|
|
|
|
|
#single pass of the input file, making execution time much faster |
|
524
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
525
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
526
|
|
|
|
|
|
|
# output: $\@counts <- array ref containing four sets of values: |
|
527
|
|
|
|
|
|
|
# \%n11, \%n1p, \%np1, and $npp for the cui pairs |
|
528
|
|
|
|
|
|
|
# hashes are indexed: $n11{"$cui1,$cui2"}, $n1p{$cui}, |
|
529
|
|
|
|
|
|
|
# $np1{$cui} |
|
530
|
|
|
|
|
|
|
sub _getObservedCounts_matrix { |
|
531
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
532
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
533
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
#convert cui arrays to hashes, makes looping thru |
|
536
|
|
|
|
|
|
|
# the file faster |
|
537
|
0
|
|
|
|
|
|
my %cuis1 = (); |
|
538
|
0
|
|
|
|
|
|
foreach my $cui(@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
539
|
0
|
|
|
|
|
|
$cuis1{$cui} = 1; |
|
540
|
|
|
|
|
|
|
} |
|
541
|
0
|
|
|
|
|
|
my %cuis2 = (); |
|
542
|
0
|
|
|
|
|
|
foreach my $cui(@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
543
|
0
|
|
|
|
|
|
$cuis2{$cui} = 1; |
|
544
|
|
|
|
|
|
|
} |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
#precalculate values for all cuis and cui pairs |
|
547
|
0
|
|
|
|
|
|
my %n11 = (); |
|
548
|
0
|
|
|
|
|
|
my %n1p = (); |
|
549
|
0
|
|
|
|
|
|
my %np1 = (); |
|
550
|
0
|
|
|
|
|
|
my $npp = 0; |
|
551
|
0
|
0
|
|
|
|
|
open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n"; |
|
552
|
0
|
|
|
|
|
|
while (my $line = ) { |
|
553
|
|
|
|
|
|
|
#get cuis and value from the line |
|
554
|
0
|
|
|
|
|
|
chomp $line; |
|
555
|
0
|
|
|
|
|
|
my ($cui1, $cui2, $num) = split /\t/, $line; |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
#record any occurrence of any cui1 or 2, in case order is ignored |
|
558
|
0
|
0
|
0
|
|
|
|
if (exists $cuis1{$cui1} || exists $cuis1{$cui2} |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
559
|
|
|
|
|
|
|
|| exists $cuis2{$cui1} || exists $cuis2{$cui2}) { |
|
560
|
0
|
|
|
|
|
|
$n1p{$cui1} += $num; |
|
561
|
0
|
|
|
|
|
|
$np1{$cui2} += $num; |
|
562
|
0
|
|
|
|
|
|
$n11{"$cui1,$cui2"} = $num; |
|
563
|
|
|
|
|
|
|
} |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
#update npp |
|
566
|
0
|
|
|
|
|
|
$npp += $num; |
|
567
|
|
|
|
|
|
|
} |
|
568
|
0
|
|
|
|
|
|
close IN; |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
#return counts |
|
571
|
0
|
|
|
|
|
|
my @counts = (\%n11, \%n1p, \%np1, $npp); |
|
572
|
0
|
|
|
|
|
|
return \@counts; |
|
573
|
|
|
|
|
|
|
} |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# Gets N11 of the cui pair using a matrix |
|
576
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
577
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
578
|
|
|
|
|
|
|
# $n11AllRef <- ref to an array containing n11 values for all possible |
|
579
|
|
|
|
|
|
|
# cui pairs of the cuis1 and cuis2, of the form |
|
580
|
|
|
|
|
|
|
# n11All{"$cui1,$cui2"}=value. See _getObservedCounts_matrix |
|
581
|
|
|
|
|
|
|
# output: $n11 <- frequency of co-occurrences of the cuis in the cui sets |
|
582
|
|
|
|
|
|
|
sub _getN11_matrix { |
|
583
|
|
|
|
|
|
|
#grab parameters |
|
584
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
585
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
586
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
587
|
0
|
|
|
|
|
|
my $n11AllRef = shift; |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
#error checking |
|
590
|
0
|
|
|
|
|
|
my $function = "_getN11_matrix"; |
|
591
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
592
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
593
|
|
|
|
|
|
|
} |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
#calculate n11 as the sum n11s for all combinations of |
|
596
|
|
|
|
|
|
|
# cuis1, cuis2 (order matters, cui1 must be first) |
|
597
|
0
|
|
|
|
|
|
my $n11 = 0; |
|
598
|
0
|
|
|
|
|
|
foreach my $cui1 (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
599
|
0
|
|
|
|
|
|
foreach my $cui2 (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
600
|
0
|
|
|
|
|
|
my $num = ${$n11AllRef}{"$cui1,$cui2"}; |
|
|
0
|
|
|
|
|
|
|
|
601
|
0
|
0
|
|
|
|
|
if(defined $num) { |
|
602
|
0
|
|
|
|
|
|
$n11 += $num; |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
} |
|
605
|
|
|
|
|
|
|
} |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
#update values if ignoring word order |
|
608
|
0
|
0
|
|
|
|
|
if($noOrder_G) { |
|
609
|
|
|
|
|
|
|
#add all n11's, now with the order reversed |
|
610
|
0
|
|
|
|
|
|
foreach my $cui1 (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
611
|
0
|
|
|
|
|
|
foreach my $cui2 (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
612
|
0
|
|
|
|
|
|
my $num = ${$n11AllRef}{"$cui2,$cui1"}; |
|
|
0
|
|
|
|
|
|
|
|
613
|
0
|
0
|
|
|
|
|
if(defined $num) { |
|
614
|
0
|
|
|
|
|
|
$n11 += $num; |
|
615
|
|
|
|
|
|
|
} |
|
616
|
|
|
|
|
|
|
} |
|
617
|
|
|
|
|
|
|
} |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
|
|
620
|
0
|
|
|
|
|
|
return $n11; |
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# gets N1P for a concept using a matrix |
|
624
|
|
|
|
|
|
|
# input : $cuis1Ref <- reference to an array containing the first cuis in a set of cui pairs |
|
625
|
|
|
|
|
|
|
# $countsRef <- ref to an array containing n11, n1p, np1, and npp counts |
|
626
|
|
|
|
|
|
|
# for the cui combinations. See _getObservedCounts_matrix() |
|
627
|
|
|
|
|
|
|
# $n1pAllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2, |
|
628
|
|
|
|
|
|
|
# of the form n1pAll{$cui} = value. See _getObservedCounts_matrix |
|
629
|
|
|
|
|
|
|
# $np1AllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2, |
|
630
|
|
|
|
|
|
|
# of the form np1All{$cui} = value. See _getObservedCounts_matrix |
|
631
|
|
|
|
|
|
|
# output: $n1p <- the number of times the set of concepts occurs in first position |
|
632
|
|
|
|
|
|
|
sub _getN1p_matrix { |
|
633
|
|
|
|
|
|
|
#grab parameters |
|
634
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
635
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
636
|
0
|
|
|
|
|
|
my $n11AllRef = shift; |
|
637
|
0
|
|
|
|
|
|
my $n1pAllRef = shift; |
|
638
|
0
|
|
|
|
|
|
my $np1AllRef = shift; |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
#error checking |
|
641
|
0
|
|
|
|
|
|
my $function = "_getN1P_matrix"; |
|
642
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
643
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
644
|
|
|
|
|
|
|
} |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
#calculate n1p as the sum of n1p's for all cuis1 |
|
647
|
0
|
|
|
|
|
|
my $n1p = 0; |
|
648
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
649
|
0
|
|
|
|
|
|
my $num = ${$n1pAllRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
650
|
0
|
0
|
|
|
|
|
if(defined $num) { |
|
651
|
0
|
|
|
|
|
|
$n1p += $num; |
|
652
|
|
|
|
|
|
|
} |
|
653
|
|
|
|
|
|
|
} |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
#update values if ignoring word order |
|
656
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
657
|
|
|
|
|
|
|
#add all np1's to n1p |
|
658
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
659
|
0
|
|
|
|
|
|
my $num = ${$np1AllRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
660
|
0
|
0
|
|
|
|
|
if(defined $num) { |
|
661
|
0
|
|
|
|
|
|
$n1p += $num; |
|
662
|
|
|
|
|
|
|
} |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
#avoid double counting occurrences with self, subtract them |
|
666
|
0
|
|
|
|
|
|
foreach my $cui1(@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
667
|
0
|
|
|
|
|
|
foreach my $cui2(@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
668
|
0
|
|
|
|
|
|
my $val = ${$n11AllRef}{"$cui1,$cui2"}; |
|
|
0
|
|
|
|
|
|
|
|
669
|
0
|
0
|
|
|
|
|
if (defined $val) { |
|
670
|
0
|
|
|
|
|
|
$n1p -= $val; |
|
671
|
|
|
|
|
|
|
} |
|
672
|
|
|
|
|
|
|
} |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
} |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
#set n1p to -1 if there are no values for it since this indicates |
|
677
|
|
|
|
|
|
|
# there is not enough information to calculate the score |
|
678
|
0
|
0
|
|
|
|
|
if ($n1p == 0) { |
|
679
|
0
|
|
|
|
|
|
$n1p = -1; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
#return the value |
|
683
|
0
|
|
|
|
|
|
return $n1p; |
|
684
|
|
|
|
|
|
|
} |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
# gets NP1 for a concept using a matrix |
|
687
|
|
|
|
|
|
|
# input : $cuis2Ref <- reference to an array containing the first cuis in a set of cui pairs |
|
688
|
|
|
|
|
|
|
# $countsRef <- ref to an array containing n11, n1p, np1, and npp counts |
|
689
|
|
|
|
|
|
|
# for the cui combinations. See _getObservedCounts_matrix() |
|
690
|
|
|
|
|
|
|
# $n1pAllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2, |
|
691
|
|
|
|
|
|
|
# of the form n1pAll{$cui} = value. See _getObservedCounts_matrix |
|
692
|
|
|
|
|
|
|
# $np1AllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2, |
|
693
|
|
|
|
|
|
|
# of the form np1All{$cui} = value. See _getObservedCounts_matrix |
|
694
|
|
|
|
|
|
|
# output: $np1 <- the number of times the set of concepts occurs in second position |
|
695
|
|
|
|
|
|
|
sub _getNp1_matrix { |
|
696
|
|
|
|
|
|
|
#grab parameters |
|
697
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
698
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
699
|
0
|
|
|
|
|
|
my $n11AllRef = shift; |
|
700
|
0
|
|
|
|
|
|
my $n1pAllRef = shift; |
|
701
|
0
|
|
|
|
|
|
my $np1AllRef = shift; |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
#calculate np1 as the sum of np1's for all cuis2 |
|
704
|
0
|
|
|
|
|
|
my $np1 = 0; |
|
705
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
706
|
0
|
|
|
|
|
|
my $num = ${$np1AllRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
707
|
0
|
0
|
|
|
|
|
if (defined $num) { |
|
708
|
0
|
|
|
|
|
|
$np1 += $num; |
|
709
|
|
|
|
|
|
|
} |
|
710
|
|
|
|
|
|
|
} |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
#update values if ignoring word order |
|
713
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
714
|
|
|
|
|
|
|
#add all n1p's to np1s |
|
715
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
716
|
0
|
|
|
|
|
|
my $num = ${$n1pAllRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
717
|
0
|
0
|
|
|
|
|
if (defined $num) { |
|
718
|
0
|
|
|
|
|
|
$np1 += $num; |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
} |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
#avoid double counting occurrences with self, subtract them |
|
723
|
0
|
|
|
|
|
|
foreach my $cui1(@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
724
|
0
|
|
|
|
|
|
foreach my $cui2(@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
725
|
0
|
|
|
|
|
|
my $val = ${$n11AllRef}{"$cui1,$cui2"}; |
|
|
0
|
|
|
|
|
|
|
|
726
|
0
|
0
|
|
|
|
|
if (defined $val) { |
|
727
|
0
|
|
|
|
|
|
$np1 -= $val; |
|
728
|
|
|
|
|
|
|
} |
|
729
|
|
|
|
|
|
|
} |
|
730
|
|
|
|
|
|
|
} |
|
731
|
|
|
|
|
|
|
} |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
#set n1p to -1 if there are no values for it since this indicates |
|
734
|
|
|
|
|
|
|
# there is not enough information to calculate the score |
|
735
|
0
|
0
|
|
|
|
|
if ($np1 == 0) { |
|
736
|
0
|
|
|
|
|
|
$np1 = -1; |
|
737
|
|
|
|
|
|
|
} |
|
738
|
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
#return the value |
|
740
|
0
|
|
|
|
|
|
return $np1; |
|
741
|
|
|
|
|
|
|
} |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
######################################################################## |
|
745
|
|
|
|
|
|
|
# functions to get statistical information about the cuis LTA, MWA, VSA |
|
746
|
|
|
|
|
|
|
######################################################################## |
|
747
|
|
|
|
|
|
|
# Gets contingency table values for Linking Term Association (LTA) |
|
748
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
749
|
|
|
|
|
|
|
# output: $\@data <- valuesarray ref containing array refs of four values |
|
750
|
|
|
|
|
|
|
# for each pairHash in the pairHash list. The |
|
751
|
|
|
|
|
|
|
# values are $n11, $n1p, $np1, and $npp |
|
752
|
|
|
|
|
|
|
sub _getStats_LTA { |
|
753
|
|
|
|
|
|
|
#grab parameters |
|
754
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
755
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
756
|
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
#error checking |
|
758
|
0
|
|
|
|
|
|
my $function = "_getStats_LTA"; |
|
759
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
760
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
761
|
|
|
|
|
|
|
} |
|
762
|
|
|
|
|
|
|
#get data from the matrix |
|
763
|
0
|
|
|
|
|
|
(my $cooccurrences1ListRef, my $cooccurrences2ListRef, |
|
764
|
|
|
|
|
|
|
my $numCooccurrences, my $numUniqueCuis) |
|
765
|
|
|
|
|
|
|
= $self->_readMatrixValues_Linking($pairHashListRef); |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
#for LTA, npp= num unique cuis in the dataset |
|
768
|
0
|
|
|
|
|
|
my $npp = $numUniqueCuis; |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
#calculate stats for each pairHash based on the co-occurrences data |
|
771
|
0
|
|
|
|
|
|
my @data = (); |
|
772
|
0
|
|
|
|
|
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
|
0
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
#calculate n1p and np1 as the number of co-occurring terms |
|
775
|
0
|
|
|
|
|
|
my $n1p = scalar keys %{${$cooccurrences1ListRef}[$i]}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
776
|
0
|
|
|
|
|
|
my $np1 = scalar keys %{${$cooccurrences2ListRef}[$i]}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
#calculate n11 |
|
779
|
0
|
|
|
|
|
|
my $n11 = 0; |
|
780
|
|
|
|
|
|
|
#Find number of CUIs that co-occur with both CUI 1 and CUI 2 |
|
781
|
0
|
|
|
|
|
|
foreach my $cui (keys %{${$cooccurrences1ListRef}[$i]}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
782
|
0
|
0
|
|
|
|
|
if (exists ${${$cooccurrences2ListRef}[$i]}{$cui}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
783
|
0
|
|
|
|
|
|
$n11++; |
|
784
|
|
|
|
|
|
|
} |
|
785
|
|
|
|
|
|
|
} |
|
786
|
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
#store the data for this pairHash |
|
788
|
0
|
|
|
|
|
|
my @vals = ($n11, $n1p, $np1, $npp); |
|
789
|
0
|
|
|
|
|
|
push @data, \@vals; |
|
790
|
|
|
|
|
|
|
} |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
#return the data |
|
793
|
0
|
|
|
|
|
|
return \@data; |
|
794
|
|
|
|
|
|
|
} |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
# Gets contingency table values for Minimum Weight Association (MWA) |
|
798
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
799
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
|
800
|
|
|
|
|
|
|
# for each pairHash in the pairHash list. The |
|
801
|
|
|
|
|
|
|
# values are $n11, $n1p, $np1, and $npp |
|
802
|
|
|
|
|
|
|
sub _getStats_MWA { |
|
803
|
|
|
|
|
|
|
#grab parameters |
|
804
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
805
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
#error checking |
|
808
|
0
|
|
|
|
|
|
my $function = "_getStats_MWA"; |
|
809
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
810
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
811
|
|
|
|
|
|
|
} |
|
812
|
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
#get data from the matrix |
|
814
|
0
|
|
|
|
|
|
(my $cooccurrences1ListRef, my $cooccurrences2ListRef, |
|
815
|
|
|
|
|
|
|
my $numCooccurrences, my $numUniqueCuis) |
|
816
|
|
|
|
|
|
|
= $self->_readMatrixValues_Linking($pairHashListRef); |
|
817
|
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
#for MWA, npp= numCooccurrences in the dataset |
|
819
|
0
|
|
|
|
|
|
my $npp = $numCooccurrences; |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
#calculate stats for each pairHash based on the co-occurrences data |
|
822
|
0
|
|
|
|
|
|
my @data = (); |
|
823
|
0
|
|
|
|
|
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
|
0
|
|
|
|
|
|
|
|
824
|
0
|
|
|
|
|
|
my $set1CoRef = ${$cooccurrences1ListRef}[$i]; |
|
|
0
|
|
|
|
|
|
|
|
825
|
0
|
|
|
|
|
|
my $set2CoRef = ${$cooccurrences2ListRef}[$i]; |
|
|
0
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
#calculate n1p and np1 as the number of co-occurrences for the term |
|
828
|
0
|
|
|
|
|
|
my $n1p = 0; |
|
829
|
0
|
|
|
|
|
|
foreach my $cui (keys %{$set1CoRef}) { |
|
|
0
|
|
|
|
|
|
|
|
830
|
0
|
|
|
|
|
|
$n1p += ${$set1CoRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
} |
|
832
|
0
|
|
|
|
|
|
my $np1 = 0; |
|
833
|
0
|
|
|
|
|
|
foreach my $cui (keys %{$set2CoRef}) { |
|
|
0
|
|
|
|
|
|
|
|
834
|
0
|
|
|
|
|
|
$np1 += ${$set2CoRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
#Find $n11, the min co-occurrence value of the pair |
|
838
|
0
|
|
|
|
|
|
my $n11 = 0; |
|
839
|
0
|
|
|
|
|
|
foreach my $cui (keys %{$set1CoRef}) { |
|
|
0
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
#if this cui co-occurs with both sets, then increment n11 |
|
841
|
0
|
0
|
|
|
|
|
if (exists ${$set2CoRef}{$cui}) { |
|
|
0
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
#increment n11 by the minimum of the co-occurrences |
|
843
|
0
|
|
|
|
|
|
my $min = ${$set1CoRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
844
|
0
|
0
|
|
|
|
|
if (${$set2CoRef}{$cui} < $min) { |
|
|
0
|
|
|
|
|
|
|
|
845
|
0
|
|
|
|
|
|
$min = ${$set2CoRef}{$cui}; |
|
|
0
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
} |
|
847
|
0
|
|
|
|
|
|
$n11+=$min; |
|
848
|
|
|
|
|
|
|
} |
|
849
|
|
|
|
|
|
|
} |
|
850
|
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
#store the data for this pairHash |
|
852
|
0
|
|
|
|
|
|
my @vals = ($n11, $n1p, $np1, $npp); |
|
853
|
0
|
|
|
|
|
|
push @data, \@vals; |
|
854
|
|
|
|
|
|
|
} |
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
#return the data |
|
857
|
0
|
|
|
|
|
|
return \@data; |
|
858
|
|
|
|
|
|
|
} |
|
859
|
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
# Gets contingency table values for Vector Set Association (VSA) |
|
862
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
863
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
|
864
|
|
|
|
|
|
|
# for each pairHash in the pairHash list. The |
|
865
|
|
|
|
|
|
|
# values are $n11, $n1p, $np1, and $npp |
|
866
|
|
|
|
|
|
|
sub _getStats_VSA { |
|
867
|
|
|
|
|
|
|
#grab parameters |
|
868
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
869
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
#error checking |
|
872
|
0
|
|
|
|
|
|
my $function = "_getStats_VSA"; |
|
873
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
874
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
875
|
|
|
|
|
|
|
} |
|
876
|
|
|
|
|
|
|
#get data from the matrix |
|
877
|
0
|
|
|
|
|
|
(my $cooccurrences1ListRef, my $cooccurrences2ListRef, |
|
878
|
|
|
|
|
|
|
my $numCooccurrences, my $numUniqueCuis) |
|
879
|
|
|
|
|
|
|
= $self->_readMatrixValues_Linking($pairHashListRef); |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
#convert the cooccurrence lists to pairHashLists |
|
882
|
0
|
|
|
|
|
|
my @newPairHashList = (); |
|
883
|
0
|
|
|
|
|
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
|
0
|
|
|
|
|
|
|
|
884
|
0
|
|
|
|
|
|
my %pairHash = (); |
|
885
|
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
#make set 1 an array |
|
887
|
0
|
|
|
|
|
|
my @set1 = (); |
|
888
|
0
|
|
|
|
|
|
foreach my $key (keys %{${$cooccurrences1ListRef}[$i]}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
889
|
0
|
|
|
|
|
|
push @set1, $key; |
|
890
|
|
|
|
|
|
|
} |
|
891
|
0
|
|
|
|
|
|
$pairHash{'set1'} = \@set1; |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
#make set 2 an array |
|
894
|
0
|
|
|
|
|
|
my @set2 = (); |
|
895
|
0
|
|
|
|
|
|
foreach my $key (keys %{${$cooccurrences2ListRef}[$i]}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
896
|
0
|
|
|
|
|
|
push @set2, $key; |
|
897
|
|
|
|
|
|
|
} |
|
898
|
0
|
|
|
|
|
|
$pairHash{'set2'} = \@set2; |
|
899
|
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
#add the pairHash to the pairHashList |
|
901
|
0
|
|
|
|
|
|
push @newPairHashList, \%pairHash; |
|
902
|
|
|
|
|
|
|
} |
|
903
|
|
|
|
|
|
|
#So, at this point we have converted the sets of B terms |
|
904
|
|
|
|
|
|
|
# into a pairhashlist. |
|
905
|
|
|
|
|
|
|
#Next we find the stats for each of those pair hashes and |
|
906
|
|
|
|
|
|
|
# use that as the stats for the original pair. |
|
907
|
|
|
|
|
|
|
# in this way we are finding the assocaition between |
|
908
|
|
|
|
|
|
|
# sets of co-occurring terms of the original terms |
|
909
|
0
|
|
|
|
|
|
my $allStatsRef; |
|
910
|
0
|
0
|
|
|
|
|
if ($matrix_G) { |
|
911
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_matrix(\@newPairHashList); |
|
912
|
|
|
|
|
|
|
} |
|
913
|
|
|
|
|
|
|
else { |
|
914
|
0
|
|
|
|
|
|
$allStatsRef = $self->_getStats_DB(\@newPairHashList); |
|
915
|
|
|
|
|
|
|
} |
|
916
|
|
|
|
|
|
|
#all stats ref contains n11, np1, n1p, and npp for |
|
917
|
|
|
|
|
|
|
# each of the pair hashes |
|
918
|
0
|
|
|
|
|
|
return $allStatsRef; |
|
919
|
|
|
|
|
|
|
} |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
# Gets co-occurrence data for each of the pairHashes in the pairHashList |
|
924
|
|
|
|
|
|
|
# and gets global stats, total number of co-occurrences in the dataset, |
|
925
|
|
|
|
|
|
|
# and the number of unique cuis in the dataset. The co-occurrences data |
|
926
|
|
|
|
|
|
|
# is returned in the form of a co-occurrences hash for cuis1 and cuis2 |
|
927
|
|
|
|
|
|
|
# of the pairHash. Each co-occurrences hash is: |
|
928
|
|
|
|
|
|
|
# $cooccurrences1{$cui2} = $val |
|
929
|
|
|
|
|
|
|
# There is no distinction between different cuis of cuis1 |
|
930
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
|
931
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
|
932
|
|
|
|
|
|
|
sub _readMatrixValues_Linking { |
|
933
|
|
|
|
|
|
|
#grab parameters |
|
934
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
935
|
0
|
|
|
|
|
|
my $pairHashListRef = shift; |
|
936
|
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
#error checking |
|
938
|
0
|
|
|
|
|
|
my $function = "_readMatrixValues_Linking"; |
|
939
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
940
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
941
|
|
|
|
|
|
|
} |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
#Get co-occurrences with each set of CUIs |
|
944
|
|
|
|
|
|
|
# for each set of cuis we find a list of cuis that co-occur with that set |
|
945
|
|
|
|
|
|
|
# this is done for cuis1 and cuis2. Once retreiving these two lists |
|
946
|
|
|
|
|
|
|
# of co-occurring cuis, we can calculate LTA based on the overlap of |
|
947
|
|
|
|
|
|
|
# co-occurrences. |
|
948
|
0
|
|
|
|
|
|
my @cooccurrences1List; |
|
949
|
|
|
|
|
|
|
my @cooccurrences2List; |
|
950
|
0
|
|
|
|
|
|
my $totalCooccurrences = 0; |
|
951
|
0
|
|
|
|
|
|
my $totalUniqueCuis = 0; |
|
952
|
0
|
0
|
|
|
|
|
if ($matrix_G) { |
|
953
|
|
|
|
|
|
|
#get observed counts for all data |
|
954
|
0
|
|
|
|
|
|
(my $cuis1Ref, my $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef); |
|
955
|
0
|
|
|
|
|
|
(my $n1pAllRef, my $np1AllRef, $totalCooccurrences, $totalUniqueCuis) |
|
956
|
|
|
|
|
|
|
= $self->_getObserved_matrix_Linking($cuis1Ref, $cuis2Ref); |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
#get co-occurrence data for each pairHash |
|
959
|
0
|
|
|
|
|
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
|
0
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
(my $cooccurrences1Ref, my $cooccurrences2Ref) = $self |
|
961
|
0
|
|
|
|
|
|
->_getCUICooccurrences_matrix(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}, |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
$n1pAllRef, $np1AllRef); |
|
963
|
|
|
|
|
|
|
|
|
964
|
0
|
|
|
|
|
|
push @cooccurrences1List, $cooccurrences1Ref; |
|
965
|
0
|
|
|
|
|
|
push @cooccurrences2List, $cooccurrences2Ref; |
|
966
|
|
|
|
|
|
|
} |
|
967
|
|
|
|
|
|
|
} |
|
968
|
|
|
|
|
|
|
else { |
|
969
|
|
|
|
|
|
|
#get total co-occurrences and total unique cuis |
|
970
|
0
|
|
|
|
|
|
$totalCooccurrences = $self->_getNpp_DB(); |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
#get npp, the number of unique cuis |
|
973
|
|
|
|
|
|
|
#TODO, query is slightly wrong. If the there are cuis that occur in the second position ONLY this will be wrong. I need to merge the CUI 1 and CUI2 tables then select distinct elements |
|
974
|
0
|
|
|
|
|
|
$totalUniqueCuis = shift @{$assocDB_G->selectcol_arrayref("SELECT COUNT(cui_1) FROM (SELECT DISTINCT cui_1 FROM N_11) AS names")}; |
|
|
0
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
#TODO, check this with MWA now ...will need to code it |
|
977
|
|
|
|
|
|
|
#get co-occurrence data for each pair hash |
|
978
|
0
|
|
|
|
|
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
|
0
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
(my $cooccurrences1Ref, my $cooccurrences2Ref) = $self |
|
980
|
0
|
|
|
|
|
|
->_getCUICooccurrences_DB(${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'}); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
981
|
0
|
|
|
|
|
|
push @cooccurrences1List, $cooccurrences1Ref; |
|
982
|
0
|
|
|
|
|
|
push @cooccurrences2List, $cooccurrences2Ref; |
|
983
|
|
|
|
|
|
|
} |
|
984
|
|
|
|
|
|
|
} |
|
985
|
|
|
|
|
|
|
|
|
986
|
0
|
|
|
|
|
|
return (\@cooccurrences1List, \@cooccurrences2List, $totalCooccurrences, $totalUniqueCuis); |
|
987
|
|
|
|
|
|
|
} |
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
# computes the observed co-occurrences for all combinations of the cuis passed in |
|
991
|
|
|
|
|
|
|
# doing this in a single function makes it so all values can be computed with a |
|
992
|
|
|
|
|
|
|
# single pass of the input file, making execution time much faster |
|
993
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
994
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
995
|
|
|
|
|
|
|
# output: $n1pAllRef <- a ref to a hash of hashes that contains co-occurence |
|
996
|
|
|
|
|
|
|
# data organized as: |
|
997
|
|
|
|
|
|
|
# matrix{leadingCUI}{trailingCUI} = cooccurrencecount |
|
998
|
|
|
|
|
|
|
# $np1AllRef <- a ref to a hash of hashes that contains co-occurence |
|
999
|
|
|
|
|
|
|
# data organized as: |
|
1000
|
|
|
|
|
|
|
# matrix{trailingCUI}{leadingCUI} = cooccurrencecount |
|
1001
|
|
|
|
|
|
|
# $cooccurrenceCount <- the total number of co-occurrences in |
|
1002
|
|
|
|
|
|
|
# the dataset |
|
1003
|
|
|
|
|
|
|
# $numUniquCuis <- the number of unique cuis in the dataset |
|
1004
|
|
|
|
|
|
|
sub _getObserved_matrix_Linking { |
|
1005
|
|
|
|
|
|
|
#grab parameters |
|
1006
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
1007
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
1008
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
#convert cui arrays to hashes, makes looping thru |
|
1011
|
|
|
|
|
|
|
# the file faster |
|
1012
|
0
|
|
|
|
|
|
my %cuis1 = (); |
|
1013
|
0
|
|
|
|
|
|
foreach my $cui(@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1014
|
0
|
|
|
|
|
|
$cuis1{$cui} = 1; |
|
1015
|
|
|
|
|
|
|
} |
|
1016
|
0
|
|
|
|
|
|
my %cuis2 = (); |
|
1017
|
0
|
|
|
|
|
|
foreach my $cui(@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1018
|
0
|
|
|
|
|
|
$cuis2{$cui} = 1; |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
#get stats |
|
1022
|
0
|
|
|
|
|
|
my %n1pAll = (); |
|
1023
|
0
|
|
|
|
|
|
my %np1All = (); |
|
1024
|
0
|
|
|
|
|
|
my %uniqueCuis = (); |
|
1025
|
0
|
|
|
|
|
|
my $cooccurrenceCount = 0; |
|
1026
|
0
|
0
|
|
|
|
|
open IN, $matrix_G or die "Cannot open matrix_G for input: $matrix_G\n"; |
|
1027
|
0
|
|
|
|
|
|
while (my $line = ) { |
|
1028
|
|
|
|
|
|
|
#get cuis and value fro mthe line |
|
1029
|
0
|
|
|
|
|
|
chomp $line; |
|
1030
|
0
|
|
|
|
|
|
my ($cui1, $cui2, $num) = split /\t/, $line; |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
#update unique cui lists |
|
1033
|
0
|
|
|
|
|
|
$uniqueCuis{$cui1} = 1; |
|
1034
|
0
|
|
|
|
|
|
$uniqueCuis{$cui2} = 1; |
|
1035
|
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
#update co-occurrence count |
|
1037
|
0
|
|
|
|
|
|
$cooccurrenceCount += $num; |
|
1038
|
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
#update n1pAll and np1All. These just record data |
|
1040
|
|
|
|
|
|
|
# so we record any possible co-occurrence that matters |
|
1041
|
|
|
|
|
|
|
# with or without order mattering so just check |
|
1042
|
|
|
|
|
|
|
# if a CUI of interest is anywhere on the line |
|
1043
|
0
|
0
|
0
|
|
|
|
if (exists $cuis1{$cui1} || exists $cuis2{$cui2} |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
|| exists $cuis1{$cui2} || exists $cuis2{$cui1}) { |
|
1045
|
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
#update n1pAll |
|
1047
|
|
|
|
|
|
|
#create n1p{$cui1} hash if needed |
|
1048
|
0
|
0
|
|
|
|
|
if (!defined $n1pAll{$cui1}) { |
|
1049
|
0
|
|
|
|
|
|
my %newHash = (); |
|
1050
|
0
|
|
|
|
|
|
$n1pAll{$cui1} = \%newHash; |
|
1051
|
|
|
|
|
|
|
} |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
#add cui2 and value |
|
1054
|
0
|
|
|
|
|
|
${$n1pAll{$cui1}}{$cui2} = $num; |
|
|
0
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
#update np1All |
|
1057
|
|
|
|
|
|
|
#create np1{$cui2} hash if needed |
|
1058
|
0
|
0
|
|
|
|
|
if (!defined $np1All{$cui2}) { |
|
1059
|
0
|
|
|
|
|
|
my %newHash = (); |
|
1060
|
0
|
|
|
|
|
|
$np1All{$cui2} = \%newHash; |
|
1061
|
|
|
|
|
|
|
} |
|
1062
|
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
#add cui1 and value |
|
1064
|
0
|
|
|
|
|
|
${$np1All{$cui2}}{$cui1} = $num; |
|
|
0
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
} |
|
1067
|
|
|
|
|
|
|
} |
|
1068
|
0
|
|
|
|
|
|
close IN; |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
#return the observed values |
|
1071
|
0
|
|
|
|
|
|
return (\%n1pAll, \%np1All, $cooccurrenceCount, (scalar keys %uniqueCuis)); |
|
1072
|
|
|
|
|
|
|
} |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using |
|
1076
|
|
|
|
|
|
|
# a matrix. This is the first step in computing linking term associations |
|
1077
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
1078
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
1079
|
|
|
|
|
|
|
# $n1pAllRef <- a ref to a hash of hashes that contains co-occurence |
|
1080
|
|
|
|
|
|
|
# data organized as: |
|
1081
|
|
|
|
|
|
|
# matrix{leadingCUI}{trailingCUI} = cooccurrencecount |
|
1082
|
|
|
|
|
|
|
# $np1AllRef <- a ref to a hash of hashes that contains co-occurence |
|
1083
|
|
|
|
|
|
|
# data organized as: |
|
1084
|
|
|
|
|
|
|
# matrix{trailingCUI}{leadingCUI} = cooccurrencecount |
|
1085
|
|
|
|
|
|
|
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, |
|
1086
|
|
|
|
|
|
|
# values are the co-occurrence count |
|
1087
|
|
|
|
|
|
|
# \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2, |
|
1088
|
|
|
|
|
|
|
# values are the co-occurrence count |
|
1089
|
|
|
|
|
|
|
sub _getCUICooccurrences_matrix { |
|
1090
|
|
|
|
|
|
|
#grab parameters |
|
1091
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
1092
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
1093
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
1094
|
0
|
|
|
|
|
|
my $n1pAllRef = shift; |
|
1095
|
0
|
|
|
|
|
|
my $np1AllRef = shift; |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
#error checking |
|
1098
|
0
|
|
|
|
|
|
my $function = "_getCUICooccurrences"; |
|
1099
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
1100
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
1101
|
|
|
|
|
|
|
} |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
#get lists of explicitly co-occurring CUIs for each concept |
|
1104
|
|
|
|
|
|
|
#add trailing cui co-occurrences to cui1Data |
|
1105
|
0
|
|
|
|
|
|
my %cooccurrences1; |
|
1106
|
0
|
|
|
|
|
|
foreach my $cui1 (@{$cuis1Ref}){ |
|
|
0
|
|
|
|
|
|
|
|
1107
|
0
|
0
|
|
|
|
|
if (defined ${$n1pAllRef}{$cui1}) { |
|
|
0
|
|
|
|
|
|
|
|
1108
|
0
|
|
|
|
|
|
foreach my $cui2 (keys %{${$n1pAllRef}{$cui1}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1109
|
0
|
|
|
|
|
|
$cooccurrences1{$cui2} = ${${$n1pAllRef}{$cui1}}{$cui2}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
} |
|
1111
|
|
|
|
|
|
|
} |
|
1112
|
|
|
|
|
|
|
} |
|
1113
|
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
#add leading cui co-occurrences to cui2Data |
|
1115
|
0
|
|
|
|
|
|
my %cooccurrences2; |
|
1116
|
0
|
|
|
|
|
|
foreach my $cui2 (@{$cuis2Ref}){ |
|
|
0
|
|
|
|
|
|
|
|
1117
|
0
|
0
|
|
|
|
|
if (defined ${$np1AllRef}{$cui2}) { |
|
|
0
|
|
|
|
|
|
|
|
1118
|
0
|
|
|
|
|
|
foreach my $cui1 (keys %{${$np1AllRef}{$cui2}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1119
|
0
|
|
|
|
|
|
$cooccurrences2{$cui1} = ${${$np1AllRef}{$cui2}}{$cui1}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
} |
|
1121
|
|
|
|
|
|
|
} |
|
1122
|
|
|
|
|
|
|
} |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
#add more CUIs if order doesn't matter |
|
1125
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
|
1126
|
|
|
|
|
|
|
#add leading cui co-occurrences to cui1Data |
|
1127
|
0
|
|
|
|
|
|
foreach my $cui1 (@{$cuis1Ref}){ |
|
|
0
|
|
|
|
|
|
|
|
1128
|
0
|
0
|
|
|
|
|
if (defined ${$np1AllRef}{$cui1}) { |
|
|
0
|
|
|
|
|
|
|
|
1129
|
0
|
|
|
|
|
|
foreach my $cui2 (keys %{${$np1AllRef}{$cui1}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1130
|
0
|
|
|
|
|
|
$cooccurrences1{$cui2} = ${${$np1AllRef}{$cui1}}{$cui2}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
} |
|
1132
|
|
|
|
|
|
|
} |
|
1133
|
|
|
|
|
|
|
} |
|
1134
|
|
|
|
|
|
|
#add trailing cui co-occurrences to cui2Data |
|
1135
|
0
|
|
|
|
|
|
foreach my $cui2 (@{$cuis2Ref}){ |
|
|
0
|
|
|
|
|
|
|
|
1136
|
0
|
0
|
|
|
|
|
if (defined ${$n1pAllRef}{$cui2}) { |
|
|
0
|
|
|
|
|
|
|
|
1137
|
0
|
|
|
|
|
|
foreach my $cui1 (keys %{${$n1pAllRef}{$cui2}}) { |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1138
|
0
|
|
|
|
|
|
$cooccurrences2{$cui1} = ${${$n1pAllRef}{$cui2}}{$cui1}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
} |
|
1140
|
|
|
|
|
|
|
} |
|
1141
|
|
|
|
|
|
|
} |
|
1142
|
|
|
|
|
|
|
} |
|
1143
|
|
|
|
|
|
|
|
|
1144
|
0
|
|
|
|
|
|
return (\%cooccurrences1, \%cooccurrences2); |
|
1145
|
|
|
|
|
|
|
} |
|
1146
|
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using |
|
1149
|
|
|
|
|
|
|
# a database. This is the first step in computing linking term associations |
|
1150
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
1151
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
1152
|
|
|
|
|
|
|
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, |
|
1153
|
|
|
|
|
|
|
# values are 1 |
|
1154
|
|
|
|
|
|
|
# \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2, |
|
1155
|
|
|
|
|
|
|
# values are 1 |
|
1156
|
|
|
|
|
|
|
sub _getCUICooccurrences_DB { |
|
1157
|
|
|
|
|
|
|
#grab parameters |
|
1158
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
1159
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
1160
|
0
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
1161
|
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
#error checking |
|
1163
|
0
|
|
|
|
|
|
my $function = "_getCUICooccurrences_DB"; |
|
1164
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
|
1165
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
#get hashes of co-occurring CUIs |
|
1169
|
0
|
|
|
|
|
|
my %cooccurrences1 = (); |
|
1170
|
0
|
|
|
|
|
|
my %cooccurrences2 = (); |
|
1171
|
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
#query DB to get cuis, where concept 1 is the leading cui |
|
1173
|
0
|
|
|
|
|
|
my $firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
1174
|
0
|
|
|
|
|
|
my $query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
|
1175
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1176
|
0
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
|
1177
|
|
|
|
|
|
|
} |
|
1178
|
0
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1179
|
0
|
|
|
|
|
|
my @cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
|
0
|
|
|
|
|
|
|
|
1180
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
#turn CUIs into a hash of cui1's cooccurrences |
|
1183
|
0
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1184
|
0
|
|
|
|
|
|
$cooccurrences1{$cui} = 1; |
|
1185
|
|
|
|
|
|
|
} |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
#query DB to get cuis, where concept 2 is the trailing cui |
|
1188
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
1189
|
0
|
|
|
|
|
|
$query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
|
1190
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1191
|
0
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
|
1192
|
|
|
|
|
|
|
} |
|
1193
|
0
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1194
|
0
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
|
0
|
|
|
|
|
|
|
|
1195
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
#turn CUIs into a hash of cui2's co-occurrences |
|
1198
|
0
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1199
|
0
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
|
1200
|
|
|
|
|
|
|
} |
|
1201
|
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
#add additional cuis if order doesn't matter |
|
1203
|
0
|
0
|
|
|
|
|
if($noOrder_G) { |
|
1204
|
|
|
|
|
|
|
#get cuis, where concept 1 is the trailing cui |
|
1205
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis1Ref}; |
|
|
0
|
|
|
|
|
|
|
|
1206
|
0
|
|
|
|
|
|
my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
|
1207
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1208
|
0
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
|
1209
|
|
|
|
|
|
|
} |
|
1210
|
0
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1211
|
0
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
|
0
|
|
|
|
|
|
|
|
1212
|
0
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
#add cuis to the hash of cui1's co-occurrences |
|
1215
|
0
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1216
|
0
|
|
|
|
|
|
$cooccurrences1{$cui} = 1; |
|
1217
|
|
|
|
|
|
|
} |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
#get cuis, where concept 2 is the leading cui |
|
1220
|
0
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
|
0
|
|
|
|
|
|
|
|
1221
|
0
|
|
|
|
|
|
$query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
|
1222
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
|
0
|
|
|
|
|
|
|
|
1223
|
0
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
|
1224
|
|
|
|
|
|
|
} |
|
1225
|
0
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1226
|
0
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
|
0
|
|
|
|
|
|
|
|
1227
|
0
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
|
0
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
#add cuis to the hash of cui2's co-occurrences |
|
1230
|
0
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1231
|
0
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
|
1232
|
|
|
|
|
|
|
} |
|
1233
|
|
|
|
|
|
|
} |
|
1234
|
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
#return the cui co-occurrences |
|
1236
|
0
|
|
|
|
|
|
return (\%cooccurrences1, \%cooccurrences2); |
|
1237
|
|
|
|
|
|
|
} |
|
1238
|
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
=comment |
|
1241
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occurr with the sets of cuis1 and cuis 2 using |
|
1242
|
|
|
|
|
|
|
# a database. This is the first step in computing linking term associations |
|
1243
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
|
1244
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
|
1245
|
|
|
|
|
|
|
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, |
|
1246
|
|
|
|
|
|
|
# values are 1 |
|
1247
|
|
|
|
|
|
|
# \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 2, |
|
1248
|
|
|
|
|
|
|
# values are 1 |
|
1249
|
|
|
|
|
|
|
sub _getCUICooccurrences_DB { |
|
1250
|
|
|
|
|
|
|
#grab parameters |
|
1251
|
|
|
|
|
|
|
my $self = shift; |
|
1252
|
|
|
|
|
|
|
my $cuis1Ref = shift; |
|
1253
|
|
|
|
|
|
|
my $cuis2Ref = shift; |
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
#error checking |
|
1256
|
|
|
|
|
|
|
my $function = "_getCUICooccurrences_DB"; |
|
1257
|
|
|
|
|
|
|
if(!defined $self || !ref $self) { |
|
1258
|
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
|
1259
|
|
|
|
|
|
|
} |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
#get hashes of co-occurring CUIs |
|
1262
|
|
|
|
|
|
|
my %cooccurrences1 = (); |
|
1263
|
|
|
|
|
|
|
my %cooccurrences2 = (); |
|
1264
|
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
#query DB to get cuis, where concept 1 is the leading cui |
|
1266
|
|
|
|
|
|
|
my $firstCui = shift @{$cuis1Ref}; |
|
1267
|
|
|
|
|
|
|
my $query = "SELECT * FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
|
1268
|
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
1269
|
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
|
1270
|
|
|
|
|
|
|
} |
|
1271
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1272
|
|
|
|
|
|
|
my $sth = $assocDB_G->prepare($query); |
|
1273
|
|
|
|
|
|
|
$sth->execute(); |
|
1274
|
|
|
|
|
|
|
my @rows = @{$sth->fetchall_arrayref()}; |
|
1275
|
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
1276
|
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
#turn CUIs into a hash of cui1's cooccurrences |
|
1278
|
|
|
|
|
|
|
foreach my $rowRef (@rows) { |
|
1279
|
|
|
|
|
|
|
print STDERR join(' ', @{$rowRef})."\n"; |
|
1280
|
|
|
|
|
|
|
} |
|
1281
|
|
|
|
|
|
|
#TODO - this is done, it works ... it gets back the whole relevant table. Now fill up as needed. |
|
1282
|
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
my @cuis; |
|
1285
|
|
|
|
|
|
|
#query DB to get cuis, where concept 2 is the trailing cui |
|
1286
|
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
1287
|
|
|
|
|
|
|
$query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
|
1288
|
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
1289
|
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
|
1290
|
|
|
|
|
|
|
} |
|
1291
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1292
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
1293
|
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
#turn CUIs into a hash of cui2's co-occurrences |
|
1296
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1297
|
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
|
1298
|
|
|
|
|
|
|
} |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
#add additional cuis if order doesn't matter |
|
1301
|
|
|
|
|
|
|
if($noOrder_G) { |
|
1302
|
|
|
|
|
|
|
#get cuis, where concept 1 is the trailing cui |
|
1303
|
|
|
|
|
|
|
$firstCui = shift @{$cuis1Ref}; |
|
1304
|
|
|
|
|
|
|
my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
|
1305
|
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
1306
|
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
|
1307
|
|
|
|
|
|
|
} |
|
1308
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1309
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
1310
|
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
|
1311
|
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
#add cuis to the hash of cui1's co-occurrences |
|
1313
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1314
|
|
|
|
|
|
|
$cooccurrences1{$cui} = 1; |
|
1315
|
|
|
|
|
|
|
} |
|
1316
|
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
#get cuis, where concept 2 is the leading cui |
|
1318
|
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
|
1319
|
|
|
|
|
|
|
$query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
|
1320
|
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
|
1321
|
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
|
1322
|
|
|
|
|
|
|
} |
|
1323
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
|
1324
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
|
1325
|
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
|
1326
|
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
#add cuis to the hash of cui2's co-occurrences |
|
1328
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
|
1329
|
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
|
1330
|
|
|
|
|
|
|
} |
|
1331
|
|
|
|
|
|
|
} |
|
1332
|
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
#return the cui co-occurrences |
|
1334
|
|
|
|
|
|
|
return (\%cooccurrences1, \%cooccurrences2); |
|
1335
|
|
|
|
|
|
|
} |
|
1336
|
|
|
|
|
|
|
=cut |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
1; |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
__END__ |