line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################################### |
4
|
|
|
|
|
|
|
# # |
5
|
|
|
|
|
|
|
# Author: Clint Cuffy # |
6
|
|
|
|
|
|
|
# Date: 06/16/2016 # |
7
|
|
|
|
|
|
|
# Revised: 02/19/2017 # |
8
|
|
|
|
|
|
|
# UMLS Similarity - Medline XML-To-Word2Vec Input Format Conversion Module # |
9
|
|
|
|
|
|
|
# # |
10
|
|
|
|
|
|
|
###################################################################################### |
11
|
|
|
|
|
|
|
# # |
12
|
|
|
|
|
|
|
# Description: # |
13
|
|
|
|
|
|
|
# ============ # |
14
|
|
|
|
|
|
|
# Perl Medline XML-To-Word2Vec Input Format Conversion Module # |
15
|
|
|
|
|
|
|
# for the "word2vec" package. # |
16
|
|
|
|
|
|
|
# Features: # |
17
|
|
|
|
|
|
|
# ========= # |
18
|
|
|
|
|
|
|
# Supports Parsing Individual Files or Directories # |
19
|
|
|
|
|
|
|
# Plain XML files or .gz XML files (extracts and processes in RAM) # |
20
|
|
|
|
|
|
|
# Include results by specified Date Ranges: 00/00/0000 Format # |
21
|
|
|
|
|
|
|
# Include results by title, abstract or both per article # |
22
|
|
|
|
|
|
|
# Multi-Threading Support - Divides work by number of threads # |
23
|
|
|
|
|
|
|
# Text Compoundify # |
24
|
|
|
|
|
|
|
# # |
25
|
|
|
|
|
|
|
###################################################################################### |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package Word2vec::Xmltow2v; |
29
|
|
|
|
|
|
|
|
30
|
4
|
|
|
4
|
|
11188
|
use strict; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
96
|
|
31
|
4
|
|
|
4
|
|
12
|
use warnings; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
76
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Standard Package(s) |
34
|
4
|
|
|
4
|
|
2005
|
use utf8; |
|
4
|
|
|
|
|
31
|
|
|
4
|
|
|
|
|
15
|
|
35
|
4
|
|
|
4
|
|
2009
|
use threads; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
use threads::shared; |
37
|
|
|
|
|
|
|
use IO::Uncompress::Gunzip qw(gunzip $GunzipError); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# CPAN Package(s) |
40
|
|
|
|
|
|
|
use Cwd; |
41
|
|
|
|
|
|
|
use File::Type; |
42
|
|
|
|
|
|
|
use Text::Unidecode; |
43
|
|
|
|
|
|
|
use XML::Twig; |
44
|
|
|
|
|
|
|
use Sys::CpuAffinity; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Word2Vec Utility Package(s) |
47
|
|
|
|
|
|
|
use Word2vec::Bst; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
use vars qw($VERSION); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
$VERSION = '0.02'; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Global Variables |
56
|
|
|
|
|
|
|
my $debugLock :shared; |
57
|
|
|
|
|
|
|
my $writeLock :shared; |
58
|
|
|
|
|
|
|
my $queueLock :shared; |
59
|
|
|
|
|
|
|
my $appendLock :shared; |
60
|
|
|
|
|
|
|
my @xmlJobQueue :shared; |
61
|
|
|
|
|
|
|
my $totalJobCount :shared; |
62
|
|
|
|
|
|
|
my $finishedJobCount :shared; |
63
|
|
|
|
|
|
|
my $preCompWordCount :shared; |
64
|
|
|
|
|
|
|
my $postCompWordCount :shared; |
65
|
|
|
|
|
|
|
my $compoundWordCount :shared; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
###################################################################################### |
69
|
|
|
|
|
|
|
# Constructor |
70
|
|
|
|
|
|
|
###################################################################################### |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Compile-time hook for the package. Intentionally empty: it is kept only as
# a placeholder for future module-level set-up.
BEGIN {
    # CONSTRUCTOR : DO SOMETHING HERE
}
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
###################################################################################### |
79
|
|
|
|
|
|
|
# Deconstructor |
80
|
|
|
|
|
|
|
###################################################################################### |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Interpreter-shutdown hook for the package. Intentionally empty: it is kept
# only as a placeholder for future module-level tear-down.
END {
    # DECONSTRUCTOR : DO SOMETHING HERE
}
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
###################################################################################### |
89
|
|
|
|
|
|
|
# new Class Operator |
90
|
|
|
|
|
|
|
###################################################################################### |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Constructs a new Word2vec::Xmltow2v object.
#
# All arguments are positional and optional; an undefined argument selects the
# default shown in parentheses:
#   debugLog (0), writeLog (0), storeTitle (1), storeAbstract (1),
#   quickParse (0), compoundifyText (0), numOfThreads (number of CPUs),
#   workingDir (current directory), savePath (current directory),
#   beginDate ("00/00/0000"), endDate ("99/99/9999"),
#   xmlStringToParse ("(null)"), textCorpusStr (""), fileHandle,
#   twigHandler (always replaced by a freshly built XML::Twig parser),
#   parsedCount (0), tempDate (""), tempStr (""), compoundWordAry ([]),
#   compoundWordBST (new Word2vec::Bst), maxCompoundWordLength (20),
#   overwriteExistingFile (0), compoundWordCount (no default).
#
# Side effects: resets the package-wide shared job queue and counters, opens
# 'Xmltow2vLog.txt' when writeLog is true, and immediately parses
# xmlStringToParse when one was supplied.
#
# Returns the blessed object.
sub new
{
    my $class = shift;

    # Private member variables, in the order the constructor accepts them.
    my @fieldNames = qw(
        _debugLog _writeLog _storeTitle _storeAbstract _quickParse _compoundifyText
        _numOfThreads _workingDir _savePath _beginDate _endDate _xmlStringToParse
        _textCorpusStr _fileHandle _twigHandler _parsedCount _tempDate _tempStr
        _compoundWordAry _compoundWordBST _maxCompoundWordLength _overwriteExistingFile
        _compoundWordCount
    );

    # Arguments that were not supplied become undef, exactly like a 'shift' would.
    my $self = {};
    @{ $self }{ @fieldNames } = @_[ 0 .. $#fieldNames ];

    # Constant defaults for every member that was left undefined
    my %simpleDefaultFor = (
        _debugLog              => 0,
        _writeLog              => 0,
        _storeTitle            => 1,
        _storeAbstract         => 1,
        _quickParse            => 0,
        _compoundifyText       => 0,
        _beginDate             => "00/00/0000",
        _endDate               => "99/99/9999",
        _xmlStringToParse      => "(null)",
        _textCorpusStr         => "",
        _twigHandler           => 0,
        _parsedCount           => 0,
        _tempDate              => "",
        _tempStr               => "",
        _outputFileName        => "textcorpus.txt",
        _maxCompoundWordLength => 20,
        _overwriteExistingFile => 0,
    );

    for my $fieldName ( keys( %simpleDefaultFor ) )
    {
        $self->{ $fieldName } = $simpleDefaultFor{ $fieldName } if !defined ( $self->{ $fieldName } );
    }

    # Computed defaults are only evaluated when they are actually needed
    $self->{ _numOfThreads }    = Sys::CpuAffinity::getNumCpus() if !defined ( $self->{ _numOfThreads } );
    $self->{ _workingDir }      = Cwd::getcwd()                  if !defined ( $self->{ _workingDir } );
    $self->{ _savePath }        = Cwd::getcwd()                  if !defined ( $self->{ _savePath } );
    $self->{ _compoundWordAry } = []                             if !defined ( $self->{ _compoundWordAry } );
    $self->{ _compoundWordBST } = Word2vec::Bst->new()           if !defined ( $self->{ _compoundWordBST } );

    # Initialize Thread Safe Counting Variables
    @xmlJobQueue       = ();
    $totalJobCount     = 0;
    $finishedJobCount  = 0;
    $compoundWordCount = 0;
    $preCompWordCount  = 0;
    $postCompWordCount = 0;

    # Open File Handler if checked variable is true
    if( $self->{ _writeLog } )
    {
        require IO::Handle;    # provides autoflush() on lexical file handles

        if( open( my $logHandle, '>:utf8', 'Xmltow2vLog.txt' ) )
        {
            $logHandle->autoflush( 1 );    # Auto-flushes writes to log
            $self->{ _fileHandle } = $logHandle;
        }
        else
        {
            # Without a usable handle, file logging is switched off instead of
            # dying on the autoflush call or on a later write.
            warn( "Word2vec::Xmltow2v::new - Warning: Unable to open 'Xmltow2vLog.txt' for writing: $! - file logging disabled\n" );
            $self->{ _writeLog } = 0;
        }
    }

    # Declare XML parser
    # Quick Parse Method(s): Much Faster With Less Hardware Requirements and Accuracy
    if( $self->{ _quickParse } == 1 )
    {
        $self->{ _twigHandler } = XML::Twig->new(
            twig_handlers =>
            {
                'DateCreated'   => sub { _QuickParseDateCreated( @_, $self ) },
                'Journal'       => sub { _QuickParseJournal( @_, $self ) },
                'Article'       => sub { _QuickParseArticle( @_, $self ) },
                'OtherAbstract' => sub { _QuickParseOtherAbstract( @_, $self ) },
            },
        );
    }
    # Default Parse Method: Much Slower With High RAM Requirements and Better Accuracy
    else
    {
        $self->{ _twigHandler } = XML::Twig->new(
            twig_handlers =>
            {
                'MedlineCitationSet' => sub { _ParseMedlineCitationSet( @_, $self ) },
            },
        );
    }

    bless $self, $class;

    $self->WriteLog( "New - Debug On" );
    $self->WriteLog( "New - QuickParse Enabled" ) if( $self->{ _quickParse } == 1 );

    if( $self->{ _xmlStringToParse } ne "(null)" )
    {
        if( $self->_CheckForNullData ( $self->{ _xmlStringToParse } ) )
        {
            $self->WriteLog( "New - Error: XML String is null" );
        }
        else
        {
            $self->{ _twigHandler }->parse( $self->{ _xmlStringToParse } );
        }
    }
    else
    {
        $self->WriteLog( "New - No XML String Argument To Parse" );
    }

    return $self;
}
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
###################################################################################### |
213
|
|
|
|
|
|
|
# DESTROY |
214
|
|
|
|
|
|
|
###################################################################################### |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# Object destructor: closes the log file handle stored in the object, when
# one is present.
sub DESTROY
{
    my $self      = shift;
    my $logHandle = $self->{ _fileHandle };

    # Close FileHandle
    close( $logHandle ) if( $logHandle );
}
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
###################################################################################### |
226
|
|
|
|
|
|
|
# Module Functions |
227
|
|
|
|
|
|
|
###################################################################################### |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Converts Medline XML data into the text corpus.
#
# Parameters:
#   $dir - Path of a single file or of a directory; defaults to the object's
#          working directory when undefined.
#
# For a file, the data is read and parsed in the calling thread. For a
# directory, every entry whose name contains ".xml" or ".gz" is queued and the
# queue is processed by GetNumOfThreads() worker threads running
# _ThreadedConvert.
#
# Returns -1 when the date check fails, when neither titles nor abstracts are
# to be stored, when the file data reads as "(null)" or when the directory
# cannot be opened; returns 0 otherwise (including when $dir is neither a file
# nor a directory, which is only logged).
sub ConvertMedlineXMLToW2V
{
    my ( $self, $dir ) = @_;
    $dir = $self->GetWorkingDir() if !defined ( $dir );

    # Check(s)
    if( $self->_DateCheck() == -1 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2v - Error: Date Check Failed" );
        return -1;
    }

    if( $self->GetStoreTitle() == 0 && $self->GetStoreAbstract() == 0 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Error: StoreTitle and StoreAbstract Variables Set To 0 - No Data Will Be Extracted" );
        return -1;
    }

    # Check To See If Overwrite Existing File Option Is Enabled And Overwrite
    if( $self->GetOverwriteExistingFile() == 1 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Overwrite Existing File Option Enabled" );

        my $savePath = $self->GetSavePath();

        if( -e $savePath )
        {
            $self->WriteLog( "ConvertMedlineXMLToW2V - Existing File Found - Removing Existing File" );
            unlink( $savePath )
                or $self->WriteLog( "ConvertMedlineXMLToW2V - Warning: Unable To Remove Existing File: $!" );
        }
    }

    my $isFileOrDir = $self->IsFileOrDirectory( $dir );

    # Process File In Working Directory
    if( $isFileOrDir eq "file" )
    {
        $self->SetXMLStringToParse( $self->_ReadXMLDataFromFile( $dir ) );
        return -1 if ( $self->GetXMLStringToParse() eq "(null)" );

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing XML File: $dir" );
        $self->_ParseXMLString( $self->GetXMLStringToParse() );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing Complete" );
    }
    # Process All Files In Directory
    elsif( $isFileOrDir eq "dir" )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - No File Specified/Using Directory Option" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Obtaining File(s) In Directory" );

        # Read File Name(s) From Specified Directory
        my $dirHandle;

        if( !opendir( $dirHandle, $dir ) )
        {
            $self->WriteLog( "ConvertMedlineXMLToW2V - Error: Can't open $dir: $!" );
            return -1;
        }

        for my $file ( readdir( $dirHandle ) )
        {
            # Plain XML and gzip-compressed entries are both accepted. A single
            # test queues each entry once, even when its name contains both
            # ".xml" and ".gz".
            push( @xmlJobQueue, $file ) if ( index( $file, ".xml" ) != -1 || index( $file, ".gz" ) != -1 );
        }

        closedir( $dirHandle );

        # Set Total Job Count And Restart The Finished Counter For This Run
        $totalJobCount    = @xmlJobQueue;
        $finishedJobCount = 0;

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing $totalJobCount File(s)" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Starting Worker Thread(s) / Compiling Text Corpus" );

        # Start Thread(s) - only the workers created here are joined below
        my @workers = ();

        for( my $i = 0; $i < $self->GetNumOfThreads(); $i++ )
        {
            my $thread = threads->create( "_ThreadedConvert", $self, $dir );

            if( defined( $thread ) )
            {
                push( @workers, $thread );
            }
            else
            {
                $self->WriteLog( "ConvertMedlineXMLToW2V - Warning: Unable To Create Worker Thread" );
            }
        }

        # Join All Worker Threads Prior To Termination
        for my $thread ( @workers )
        {
            $thread->join();
        }

        if( $self->GetDebugLog() == 0 )
        {
            print( "Parsed $finishedJobCount of $totalJobCount Files\n" );
            print( "Number Of Compound Words: $compoundWordCount\n" );
            print( "Number Of Words (Before Compounding): $preCompWordCount\n" );
            print( "Number Of Words (After Compounding): $postCompWordCount\n" );
        }

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsed $finishedJobCount of $totalJobCount Files" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Compound Words: $compoundWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Words (Before Compounding): $preCompWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Words (After Compounding): $postCompWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing Complete" );

        # Clean Up - the clear methods must be invoked on the object, a plain
        # function call would not touch it
        $self->ClearTempStr();
        $self->ClearTextCorpusStr();
        $totalJobCount     = 0;
        $finishedJobCount  = 0;
        $preCompWordCount  = 0;
        $compoundWordCount = 0;
        $postCompWordCount = 0;
    }
    else
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Unknown Parameter Type: Not File Or Directory" );
    }

    return 0;
}
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
# Worker thread body: repeatedly takes one file name from the shared job
# queue, reads and parses that file, then appends the resulting text corpus to
# the save path.
#
# Parameters:
#   $dir - Directory that contains the queued files.
#
# Returns 1 when the queue is already empty on entry (the thread is not
# needed), 0 once the queue has been drained.
sub _ThreadedConvert
{
    my ( $self, $dir ) = @_;

    my $tid = threads->tid();

    if( @xmlJobQueue == 0 )
    {
        $self->WriteLog( "_ThreadedConvert - Warning: Requested Thread $tid Not Needed/Threads Exceed Work Load - Terminating Thread" );
        return 1;
    }

    $self->WriteLog( "_ThreadedConvert - Starting Thread: $tid" );
    $self->WriteLog( "_ThreadedConvert - Thread $tid Parsing File(s) In Job Queue" );

    while( 1 )
    {
        my $file;

        # Prevent Other Threads From Reading Shared Job Queue (Array) At The Same Time
        {
            lock( $queueLock );

            # Fetch A File Name To Parse. 'shift' really removes the entry, so
            # the queue shrinks with every job; undefined slots are skipped.
            while( !defined( $file ) && @xmlJobQueue > 0 )
            {
                $file = shift( @xmlJobQueue );
            }

            # Increment Parsed File Counter (protected by the queue lock)
            $finishedJobCount++ if defined( $file );
        }

        # Exit The Main Loop Once The Queue Is Exhausted
        last if !defined( $file );

        print( "Thread $tid: Parsing $file\n" ) if ( !$self->GetDebugLog() );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Processing File: $file" );
        $self->SetXMLStringToParse( $self->_ReadXMLDataFromFile( "$dir/$file" ) );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Parsing XML Data" );
        $self->_ParseXMLString( $self->GetXMLStringToParse() );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Parsed $file" );
        print( "Thread $tid: Parsed $file\n" ) if ( !$self->GetDebugLog() );
        $self->_SaveTextCorpusToFile( $self->GetSavePath(), 1 );
        $self->ClearTextCorpusStr();
    }

    $self->WriteLog( "_ThreadedConvert - Thread $tid Finished - Terminating" );

    return 0;
}
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# Parses an XML string with the object's twig handler.
#
# Parameters:
#   second argument - XML data to parse; an undefined value is treated as "".
#
# Returns -1 when the parse requirements are not met or when the data is
# reported as null by _CheckForNullData, 0 after a parse.
sub _ParseXMLString
{
    my $self    = shift;
    my $xmlData = shift;

    $xmlData = "" unless defined( $xmlData );

    # Bail out early when there is nothing to parse or no parser to use
    return -1 if( $self->_CheckParseRequirements( $xmlData ) eq -1 );

    if( $self->_CheckForNullData( $xmlData ) )
    {
        $self->WriteLog( "_ParseXMLString - Cannot Parse (null) string" );
        return -1;
    }

    $self->{ _twigHandler }->parse( $xmlData );
    $self->WriteLog( "_ParseXMLString: Released PubmedArticle from memory" );

    # Print how many entries were parsed
    $self->WriteLog( "_ParseXMLString: Parsed " . $self->GetParsedCount() . " entries" );

    return 0;
}
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Verifies that a parse can be attempted.
#
# Parameters:
#   second argument - XML data that is about to be parsed.
#
# Returns -1 (after logging the reason) when the data is undefined or empty,
# or when GetTwigHandler() compares equal to 0; returns 0 otherwise.
sub _CheckParseRequirements
{
    my $self    = shift;
    my $xmlData = shift;

    my $problem;

    if( !defined( $xmlData ) || $xmlData eq "" )
    {
        $problem = "_CheckParseRequirements - Error: Nothing To Parse";
    }
    elsif( $self->GetTwigHandler() == 0 )
    {
        $problem = "_CheckParseRequirements - Error: Unable To Parse XML Data/TwigHandler = (null)";
    }

    # No problem recorded: all requirements are met
    return 0 unless defined( $problem );

    $self->WriteLog( $problem );
    return -1;
}
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Checks to see if Medline XML data in memory is a null string.
#
# Parameters:
#   $temp - String to examine.
#
# Returns 1 when $temp is undefined or begins with the "(null)" marker,
# 0 otherwise.
sub _CheckForNullData
{
    my ( $self, $temp ) = @_;
    my $nullStr = "(null)";

    # Undefined data carries nothing to parse
    return 1 if !defined( $temp );

    # The marker only counts when it sits at the very start of the string.
    # (Storing the result of "index(...) != -1" would keep a boolean instead of
    # the position, so the position is compared directly.)
    return 1 if ( index( $temp, $nullStr ) == 0 );

    # Return False
    return 0;
}
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# Removes the XML version declaration and DOCTYPE lines prior to parsing the
# XML string.
#
# Parameters:
#   $temp - Reference to the XML string; the string is modified in place.
#
# Every line that contains neither marker is kept, each terminated by "\n".
# Returns the filtered string, or nothing when $temp is not a reference to a
# defined string.
sub _RemoveXMLVersion
{
    my ( $self, $temp ) = @_;

    return if ( ref( $temp ) ne 'SCALAR' || !defined( ${$temp} ) );

    # Checking For XML Version
    # NOTE(review): this literal was unreadable in the reviewed copy and has
    # been restored as the start of an XML declaration - confirm against the
    # original source.
    my $xmlVersion = '<?xml';
    my $docType    = '!DOCTYPE';

    my @keptLines = grep { index( $_, $xmlVersion ) == -1 && index( $_, $docType ) == -1 }
                    split( /\n/, ${$temp} );

    my $newXMLString = join( "", map { $_ . "\n" } @keptLines );

    ${$temp} = $newXMLString;

    return $newXMLString;
}
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# XML::Twig handler for the 'MedlineCitationSet' element (default parse mode).
#
# Receives the twig, the matched element and - appended by the handler closure
# created in new() - the converter object. Every child element is parsed as one
# article; when the article date lies in the configured range its text is
# appended to the text corpus, compoundified first when that option is enabled.
sub _ParseMedlineCitationSet
{
    my ( $twig, $citationSet, $self ) = @_;

    # Date range test for the article that was parsed last
    my $dateIsInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    for my $citation ( $citationSet->children() )
    {
        # Parse XML Data
        my $wasParsed = $self->_ParseMedlineArticle( $citation );

        if( $self->GetCompoundifyText() == 1 && $dateIsInRange->() )
        {
            # Compoundify the lower-cased article text before storing it
            my $compoundified = $self->CompoundifyString( lc( $self->GetTempStr() ) );
            $self->AppendStrToTextCorpus( $compoundified );
        }
        elsif( $dateIsInRange->() )
        {
            # Store the article text unchanged
            $self->AppendStrToTextCorpus( $self->GetTempStr() );
        }

        # Clear string placeholders
        $self->ClearTempStr();
        $self->ClearTempDate();

        # Increment Parsed Counter
        $self->{ _parsedCount }++ if ( $wasParsed == 1 );

        # Release the stored XML section from memory (not fully tested)
        $citation->purge() if defined( $citation );
    }

    # Release the stored XML section from memory (not fully tested)
    $citationSet->purge();
    $self->WriteLog( "_ParseMedlineCitationSet: Released PubmedArticleSet group from memory" );
}
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# Parses one article element by dispatching each of its children on the
# element tag: 'Article', 'DateCreated' (stored as the temporary date) and
# 'OtherAbstract' are handled, any other tag is only logged. Each child is
# purged after it has been handled.
#
# Parameters:
#   $medlineArticle - XML::Twig element of the article.
#
# Always returns 1.
sub _ParseMedlineArticle
{
    my ( $self, $medlineArticle ) = @_;

    my %handlerFor = (
        "Article"       => sub { $self->_ParseArticle( $_[0] ); },
        "DateCreated"   => sub { $self->SetTempDate( $self->_ParseDateCreated( $_[0] ) ); },
        "OtherAbstract" => sub { $self->_ParseOtherAbstract( $_[0] ); },
    );

    foreach my $child ( $medlineArticle->children() )
    {
        my $handler = $handlerFor{ $child->tag() };

        if( defined( $handler ) )
        {
            $handler->( $child );
        }
        else
        {
            $self->WriteLog( "_ParseMedlineArticle - (New Data Found) - Tag: " . $child->tag() . ", Field: " . $child->field() );
        }

        # Release article from memory
        $child->purge();
    }

    return 1;
}
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
# Extracts the creation date from a 'DateCreated' element.
#
# Parameters:
#   $article - XML::Twig element whose 'Day', 'Month' and 'Year' children hold
#              the date parts.
#
# A part that is missing or empty is replaced by "00" (day, month) or "0000"
# (year).
#
# Returns the date as a "Month/Day/Year" string.
sub _ParseDateCreated
{
    my ( $self, $article ) = @_;

    # Date parts found so far; undef marks a part that has not been seen
    my %partFor = ( "Day" => undef, "Month" => undef, "Year" => undef );

    for my $date ( $article->children() )
    {
        my $tag = $date->tag();
        $partFor{ $tag } = $date->field() if exists( $partFor{ $tag } );
    }

    # Check(s) - the defaults apply to missing parts as well as to empty ones
    my $day   = ( defined( $partFor{ "Day" } )   && $partFor{ "Day" }   ne "" ) ? $partFor{ "Day" }   : "00";
    my $month = ( defined( $partFor{ "Month" } ) && $partFor{ "Month" } ne "" ) ? $partFor{ "Month" } : "00";
    my $year  = ( defined( $partFor{ "Year" } )  && $partFor{ "Year" }  ne "" ) ? $partFor{ "Year" }  : "0000";

    $self->WriteLog( "_ParseDateCreated - Month: $month, Day: $day, Year: $year " );

    return "$month/$day/$year";
}
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# Parses the children of an 'Article' element.
#
# 'Journal' children are handed to _ParseJournal. The text of 'ArticleTitle'
# and 'Abstract' children is converted with Text::Unidecode, chomped, appended
# to the temporary string when the matching store option equals 1, and logged.
# Any other tag is only logged.
#
# Parameters:
#   $article - XML::Twig element of the article.
sub _ParseArticle
{
    my ( $self, $article ) = @_;

    # Store option that governs each text-carrying tag
    my %storeCheckFor = (
        "ArticleTitle" => sub { return $self->GetStoreTitle() == 1; },
        "Abstract"     => sub { return $self->GetStoreAbstract() == 1; },
    );

    foreach my $node ( $article->children() )
    {
        my $tag = $node->tag();

        if( $tag eq "Journal" )
        {
            $self->_ParseJournal( $node );
            next;
        }

        my $shouldStore = $storeCheckFor{ $tag };

        if( !defined( $shouldStore ) )
        {
            $self->WriteLog( "_ParseArticle - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $text = Text::Unidecode::unidecode( $node->field() );
        chomp( $text );

        # Store String
        $self->AppendToTempStr( $text ) if ( $shouldStore->() );

        $self->WriteLog( "_ParseArticle - Tag: " . $node->tag() . ", Field: " . $text );
    }
}
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
# Parses the children of a 'Journal' element.
#
# The text of each 'Title' child is converted with Text::Unidecode, chomped,
# appended to the temporary string when the store-title option equals 1, and
# logged. Any other tag is only logged.
#
# Parameters:
#   second argument - XML::Twig element of the journal.
sub _ParseJournal
{
    my $self    = shift;
    my $journal = shift;

    foreach my $node ( $journal->children() )
    {
        if( $node->tag() ne "Title" )
        {
            $self->WriteLog( "_ParseJournal - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $title = Text::Unidecode::unidecode( $node->field() );
        chomp( $title );

        # Store String
        $self->AppendToTempStr( $title ) if ( $self->GetStoreTitle() == 1 );

        $self->WriteLog( "_ParseJournal - Tag: " . $node->tag() . ", Field: " . $title );
    }
}
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
# Parses the children of an 'OtherAbstract' element.
#
# The text of each 'AbstractText' child is converted with Text::Unidecode,
# chomped, appended to the temporary string when the store-abstract option
# equals 1, and logged. Any other tag is only logged.
#
# Parameters:
#   second argument - XML::Twig element of the abstract.
sub _ParseOtherAbstract
{
    my $self     = shift;
    my $abstract = shift;

    foreach my $node ( $abstract->children() )
    {
        if( $node->tag() ne "AbstractText" )
        {
            $self->WriteLog( "_ParseOtherAbstract - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $abstractText = Text::Unidecode::unidecode( $node->field() );
        chomp( $abstractText );

        # Store String
        $self->AppendToTempStr( $abstractText ) if ( $self->GetStoreAbstract() == 1 );

        $self->WriteLog( "_ParseOtherAbstract - Tag: " . $node->tag() . ", Field: " . $abstractText );
    }
}
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# Handler that extracts the "Day", "Month" and "Year" children of a date
# element and stores them on $self as "Month/Day/Year" via SetTempDate().
#
# A part that is missing, undefined or empty falls back to "00" (day, month)
# or "0000" (year). Previously the parts were initialised to the empty string,
# so the defined() fallbacks never applied to an absent child and dates such
# as "06//2016" or "//" were stored.
#
#   $twigSelf - First Handler Argument (Unused Here)
#   $article  - Element Object Providing children() And purge()
#   $self     - Xmltow2v Object Receiving The Parsed Date
sub _QuickParseDateCreated
{
    my ( $twigSelf, $article, $self ) = @_;

    # Clear Old Date
    $self->ClearTempDate();

    # Collect The Date Parts (Last Occurrence Of A Tag Wins)
    my %part = ( Day => undef, Month => undef, Year => undef );

    for my $date ( $article->children() )
    {
        my $tag = $date->tag();
        next if !defined( $tag ) || !exists( $part{ $tag } );

        $part{ $tag } = $date->field();
    }

    # Check(s) - Substitute Defaults For Missing / Empty Parts
    my $day   = ( defined( $part{ Day }   ) && $part{ Day }   ne "" ) ? $part{ Day }   : "00";
    my $month = ( defined( $part{ Month } ) && $part{ Month } ne "" ) ? $part{ Month } : "00";
    my $year  = ( defined( $part{ Year }  ) && $part{ Year }  ne "" ) ? $part{ Year }  : "0000";

    $self->WriteLog( "_QuickParseDateCreated - Month: $month, Day: $day, Year: $year " );

    $self->SetTempDate( "$month/$day/$year" );

    # Free Memory
    $article->purge();
}
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
# Handler for a journal element. "Title" children are transliterated,
# chomped and (when GetStoreTitle() == 1) appended to the temporary string;
# other children are only logged. Afterwards, if the temporary date lies
# within the configured begin/end range, the temporary string is appended to
# the text corpus - lower-cased and compoundified first when
# GetCompoundifyText() == 1. The temporary string is then cleared and the
# element purged.
#
#   $twigSelf    - First Handler Argument (Unused Here)
#   $journalRoot - Element Object Providing children() And purge()
#   $self        - Xmltow2v Object
sub _QuickParseJournal
{
    my ( $twigSelf, $journalRoot, $self ) = @_;

    foreach my $node ( $journalRoot->children() )
    {
        my $tagName = $node->tag();

        # Anything Other Than A Title Is Only Reported
        if( $tagName ne "Title" )
        {
            $self->WriteLog( "_QuickParseJournal - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
            next;
        }

        my $title = Text::Unidecode::unidecode( $node->field() );
        chomp( $title );

        # Store String
        $self->AppendToTempStr( $title ) if ( $self->GetStoreTitle() == 1 );

        $self->WriteLog( "_QuickParseJournal - Tag: " . $tagName . ", Field: " . $title );
    }

    # Date Range Test, Evaluated Lazily So It Runs Exactly When The Conditions Below Reach It
    my $dateInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear String Placeholders
    $self->ClearTempStr();

    # Free Memory
    $journalRoot->purge();
}
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# Handler for an article element. "ArticleTitle" children are stored when
# GetStoreTitle() == 1 and "Abstract" children when GetStoreAbstract() == 1
# (both transliterated and chomped first); other children are only logged.
# Afterwards, if the temporary date lies within the configured begin/end
# range, the temporary string is appended to the text corpus - lower-cased and
# compoundified first when GetCompoundifyText() == 1. The temporary string is
# then cleared and the element purged.
#
#   $twigSelf - First Handler Argument (Unused Here)
#   $article  - Element Object Providing children() And purge()
#   $self     - Xmltow2v Object
sub _QuickParseArticle
{
    my ( $twigSelf, $article, $self ) = @_;

    # Tag Name => Name Of The Getter Deciding Whether That Tag's Text Is Stored
    my %storeGetterFor = (
        "ArticleTitle" => "GetStoreTitle",
        "Abstract"     => "GetStoreAbstract",
    );

    foreach my $node ( $article->children() )
    {
        my $tagName     = $node->tag();
        my $storeGetter = $storeGetterFor{ $tagName };

        # Unhandled Tags Are Only Reported
        if( !defined( $storeGetter ) )
        {
            $self->WriteLog( "_QuickParseArticle - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
            next;
        }

        my $text = Text::Unidecode::unidecode( $node->field() );
        chomp( $text );

        # Store String
        $self->AppendToTempStr( $text ) if ( $self->$storeGetter() == 1 );

        $self->WriteLog( "_QuickParseArticle - Tag: " . $tagName . ", Field: " . $text );
    }

    # Date Range Test, Evaluated Lazily So It Runs Exactly When The Conditions Below Reach It
    my $dateInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear String Placeholders
    $self->ClearTempStr();

    # Free Memory
    $article->purge();
}
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
# Handler for an "other abstract" element. "AbstractText" children are
# transliterated, chomped and (when GetStoreAbstract() == 1) appended to the
# temporary string; other children are only logged. Afterwards, if the
# temporary date lies within the configured begin/end range, the temporary
# string is appended to the text corpus - lower-cased and compoundified first
# when GetCompoundifyText() == 1. The temporary string is then cleared and the
# element purged.
#
#   $twigSelf     - First Handler Argument (Unused Here)
#   $abstractRoot - Element Object Providing children() And purge()
#   $self         - Xmltow2v Object
sub _QuickParseOtherAbstract
{
    my ( $twigSelf, $abstractRoot, $self ) = @_;

    foreach my $node ( $abstractRoot->children() )
    {
        my $tagName = $node->tag();

        # Anything Other Than Abstract Text Is Only Reported
        if( $tagName ne "AbstractText" )
        {
            $self->WriteLog( "_QuickParseOtherAbstract - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
            next;
        }

        my $abstractText = Text::Unidecode::unidecode( $node->field() );
        chomp( $abstractText );

        # Store String
        $self->AppendToTempStr( $abstractText ) if ( $self->GetStoreAbstract() == 1 );

        $self->WriteLog( "_QuickParseOtherAbstract - Tag: " . $tagName . ", Field: " . $abstractText );
    }

    # Date Range Test, Evaluated Lazily So It Runs Exactly When The Conditions Below Reach It
    my $dateInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear String Placeholders
    $self->ClearTempStr();

    # Free Memory
    $abstractRoot->purge();
}
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
# Builds the compound word binary search tree from the compound word array
# held by this object, installs the resulting root node on the tree object and
# then clears the compound word array.
#
# Returns 0 on success, -1 when the compound word array is empty.
sub CreateCompoundWordBST
{
    my ( $self ) = @_;

    $self->WriteLog( "CreateCompoundWordBST - Creating Binary Search Tree From Compound Word Array" );

    my $tree  = $self->GetCompoundWordBST();
    my @words = $self->GetCompoundWordAry();

    # Check(s)
    unless( @words )
    {
        $self->WriteLog( "CreateCompoundWordBST - Error: Cannot Create BST / Compound Word Array Is Empty - Have You Read The Compound Word File To Memory?" );
        return -1;
    }

    # Build The Tree Over The Whole Array (First Index .. Last Index)
    my $root = $tree->CreateBST( \@words, 0, $#words, undef );
    $tree->SetRootNode( $root );

    # Clean-Up
    $self->ClearCompoundWordAry();

    $self->WriteLog( "CreateCompoundWordBST - Compound Word Binary Search Tree Created" );

    return 0;
}
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
# Rewrites a whitespace separated string so that every compound word found by
# _CompoundifySearch() is emitted with its words joined by underscores; all
# other words are copied unchanged. Every emitted token is followed by a
# single space (so a non-empty result ends with a space).
#
#   $str - String To Compoundify
#
# Returns the rewritten string, or "(null)" when $str is undefined.
#
# Side effects: increments $compoundWordCount once per compound word found and
# $postCompWordCount once per emitted token (both are declared outside this
# sub).
#
# Fix: the search window is now clamped to the last valid index. The old
# test ( $i + $max > $arySize ) let the window end at index $arySize when
# $i + $max == $arySize, which put an undefined element into the slice passed
# to _CompoundifySearch().
sub CompoundifyString
{
    my ( $self, $str ) = @_;

    return "(null)" if !defined ( $str );

    $self->WriteLog( "CompoundifyString - Compoundifying String - $str" );

    my @strAry = split( ' ', $str );
    $str = "";

    my $lastWordIndex         = $#strAry;
    my $maxCompoundWordLength = $self->GetMaxCompoundWordLength();

    for( my $i = 0; $i < @strAry; $i++ )
    {
        # Window Of Candidate Words Starting At $i, Never Running Past The Last Word
        my $lastIndex = $i + $maxCompoundWordLength;
        $lastIndex = $lastWordIndex if ( $lastIndex > $lastWordIndex );
        my @tempAry = @strAry[$i..$lastIndex];

        my $node = $self->_CompoundifySearch( \@tempAry, undef, $strAry[$i], 0 );

        # Compound Word(s) Found
        if( defined( $node ) )
        {
            # Split Compound Word Data And Set Next Index After Located Compound Word(s)
            my @nodeDataAry = split( ' ', $node->data );
            $i += @nodeDataAry - 1;

            # Add Compound Words To The Return String
            $str .= join( '_', @nodeDataAry ) . " ";

            # Increment Compound Word Counter
            $compoundWordCount++;
        }
        # No Compound Word(s) Found
        else
        {
            # Add Single Word At Array Index To Return String
            $str .= $strAry[$i] . " ";
        }

        # Increment Word Counter
        $postCompWordCount++;
    }

    $self->WriteLog( "CompoundifyString - Compounded String - $str" );

    return $str;
}
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
# Recursive helper for CompoundifyString(). Starting from $searchStr, it keeps
# appending the next word of the window while the tree's "contains" search
# still matches, remembering the most recent exact match as $oldNode.
#
#   $strAryRef - Reference To The Window Of Candidate Words
#   $oldNode   - Best Exact Match Found So Far (Or undef)
#   $searchStr - Phrase Searched For In This Step
#   $index     - Index (Within The Window) Of The Last Word In $searchStr
#
# Returns the node of the matched compound word, or undef when there is no
# match or a required argument is undefined.
sub _CompoundifySearch
{
    my ( $self, $strAryRef, $oldNode, $searchStr, $index ) = @_;

    # Checks(s)
    for my $required ( $strAryRef, $searchStr, $index )
    {
        return undef if !defined( $required );
    }

    my $wordCount = scalar( @{ $strAryRef } );
    my $tree      = $self->GetCompoundWordBST();

    my $match = $tree->BSTContainsSearch( $tree->GetRootNode(), $searchStr );

    if( defined( $match ) && $index < $wordCount )
    {
        $index++;

        # Make Sure Returned Node Data Is Equal With Search String Or Return Old Node
        $match = $tree->BSTExactSearch( $tree->GetRootNode(), $searchStr );
        $match = $oldNode if !defined( $match );

        # More Words Available: Extend The Phrase By One Word And Search Again
        if( $index < $wordCount )
        {
            $searchStr .= ( " " . $strAryRef->[$index] );
            return $self->_CompoundifySearch( $strAryRef, $match, $searchStr, $index );
        }
    }

    # Post Check(s)
    $match = undef if defined( $match ) && ( $match->data ne $searchStr );

    # Keep The Old Node Only If Its Words Form The Leading Words Of The Search Phrase
    if( defined( $oldNode ) )
    {
        my @searchWords = split( ' ', $searchStr );
        my @nodeWords   = split( ' ', $oldNode->data );

        if( @searchWords > @nodeWords )
        {
            my $leadingWords = join( ' ', @searchWords[ 0 .. $#nodeWords ] );
            $oldNode = undef if $leadingWords ne join( ' ', @nodeWords );
        }
        elsif( @searchWords < @nodeWords )
        {
            $oldNode = undef;
        }
        elsif( $oldNode->data ne $searchStr )
        {
            $oldNode = undef;
        }
    }

    return $match if defined( $match );

    # Bug Fix: If Search Word Found At First Array Index And Second Word Not Found.
    #          Prevent Invalid Data From Being Returned.
    return undef if $index == 1;

    return $oldNode;
}
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
# Reads a compound word list (one entry per line) from $fileDir, normalises
# every line with RemoveSpecialCharactersFromString(), sorts the entries and
# stores them via SetCompoundWordAry(). Compoundify Text is switched on when
# at least one entry was read and off otherwise.
#
#   $fileDir                      - Path Of The Compound Word File
#   $autoSetMaxCompoundWordLength - When Defined, The Max Compound Word Length
#                                   Is Reset To 0 And Raised To The Largest
#                                   Word Count Seen On Any Line
#
# Returns 0 on success and -1 when the path is undefined, does not exist,
# cannot be opened, or the max compound word length exceeds 100.
#
# Fix: a failed open() is now logged and reported with -1 instead of being
# read as an empty file; the misspelled sub name in one log message is
# corrected.
sub ReadCompoundWordDataFromFile
{
    my ( $self, $fileDir, $autoSetMaxCompoundWordLength ) = @_;

    if( !defined ( $fileDir ) )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Directory Not Defined" );
        return -1;
    }

    if( !( -e "$fileDir" ) )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Directory/File Does Not Exist" );
        return -1;
    }

    $self->WriteLog( "ReadCompoundWordDataFromFile - Reading Compound Word File: \"$fileDir\"" );

    # Open Compound Word File
    my $fileHandle;

    if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
        return -1;
    }

    # Prepare Max Compound Word Length
    $self->SetMaxCompoundWordLength( 0 ) if defined ( $autoSetMaxCompoundWordLength );

    my @dataAry = ();

    while( my $row = <$fileHandle> )
    {
        chomp( $row );
        $row = $self->RemoveSpecialCharactersFromString( $row );
        push( @dataAry, $row );

        # Find Max Compound Word Length
        my @words = split( ' ', $row );
        my $size  = @words;
        $self->SetMaxCompoundWordLength( $size ) if defined( $autoSetMaxCompoundWordLength ) && ( $self->GetMaxCompoundWordLength() < $size );
    }

    close( $fileHandle );

    if( $self->GetMaxCompoundWordLength() > 100 )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Compound Word Length > 100" );
        return -1;
    }

    $self->WriteLog( "ReadCompoundWordDataFromFile - Auto Set Max Compound Word Length To \"" . $self->GetMaxCompoundWordLength() . "\"") if defined ( $autoSetMaxCompoundWordLength );
    $self->WriteLog( "ReadCompoundWordDataFromFile - Reading Complete" );
    $self->WriteLog( "ReadCompoundWordDataFromFile - Sorting Compound Word List" );

    @dataAry = sort( @dataAry );
    $self->SetCompoundWordAry( \@dataAry );

    if( @dataAry > 0 )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Stored " . @dataAry . " Compound Words In Memory" );
        $self->WriteLog( "ReadCompoundWordDataFromFile - Detected Compound Word Array Data / Auto-Setting Compoundify Text = 1" );
        $self->SetCompoundifyText( 1 );
    }
    else
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - No Compound Word Array Data Detected / Auto-Setting Compoundify Text = 0" );
        $self->SetCompoundifyText( 0 );
    }

    $self->WriteLog( "ReadCompoundWordDataFromFile - Sorting Complete" );

    return 0;
}
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
# Writes every entry returned by GetCompoundWordAry() to $savePath, one entry
# per line, as UTF-8. An existing file is overwritten.
#
#   $savePath - Path Of The File To Write
#
# Returns 0 on success and -1 when no path was given or the file could not be
# opened or closed.
#
# Fix: open() and close() are now checked; previously a failed open was
# ignored and the sub still reported success.
sub SaveCompoundWordListToFile
{
    my ( $self, $savePath ) = @_;

    if( !defined( $savePath ) )
    {
        $self->WriteLog( "SaveCompoundWordListToFile - Error: Save Path Not Specified" );
        return -1;
    }

    $self->WriteLog( "SaveCompoundWordListToFile - Saving Compound Word List To \"$savePath\"" );

    # Create File Handle
    my $fileHandle;

    if( !open( $fileHandle, '>:encoding(UTF-8)', "$savePath" ) )
    {
        $self->WriteLog( "SaveCompoundWordListToFile - Error: Unable To Open File \"$savePath\" - $!" );
        return -1;
    }

    # Write Data To File
    for my $compoundWord ( $self->GetCompoundWordAry() )
    {
        print( { $fileHandle } "$compoundWord\n" );
    }

    # Buffered Write Errors Surface When The Handle Is Closed
    if( !close( $fileHandle ) )
    {
        $self->WriteLog( "SaveCompoundWordListToFile - Error: Unable To Write File \"$savePath\" - $!" );
        return -1;
    }

    $self->WriteLog( "SaveCompoundWordListToFile - Compound Word List Saved To \"$savePath\"" );

    return 0;
}
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# Reads a UTF-8 text file and returns its lines concatenated into one string,
# each line chomped and prefixed with a single space (so a non-empty result
# starts with a space).
#
#   $fileDir - Path Of The File To Read
#
# Returns the concatenated text, or "(null)" when the path is undefined, does
# not exist or cannot be opened.
#
# Fix: a failed open() is now logged and reported with "(null)" instead of
# silently yielding an empty string.
sub ReadTextFromFile
{
    my ( $self, $fileDir ) = @_;

    if( !defined ( $fileDir ) )
    {
        $self->WriteLog( "ReadTextFromFile - Error: Directory Not Defined" );
        return "(null)";
    }

    if( !( -e "$fileDir" ) )
    {
        $self->WriteLog( "ReadTextFromFile - Error: Directory/File Does Not Exist" );
        return "(null)";
    }

    # Read Text Data From File To Memory
    my $fileHandle;

    if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
    {
        $self->WriteLog( "ReadTextFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
        return "(null)";
    }

    my $str = "";

    while( my $row = <$fileHandle> )
    {
        chomp $row;
        $str .= " $row";
    }

    close( $fileHandle );

    $self->WriteLog( "ReadTextFromFile - Reading Complete" );

    return $str;
}
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
# Writes $str to $savePath as UTF-8, overwriting any existing file.
#
#   $savePath - Path Of The File To Write
#   $str      - Text To Write (An Undefined Value Is Written As An Empty String)
#
# Returns 0 on success and -1 when no path was given or the file could not be
# opened or closed.
#
# Fix: open() and close() are now checked; previously a failed open was
# ignored and the sub still reported success.
sub SaveTextToFile
{
    my ( $self, $savePath, $str ) = @_;

    if( !defined( $savePath ) )
    {
        $self->WriteLog( "SaveTextToFile - Error: No Save Path Specified" );
        return -1;
    }

    $str = "" if !defined( $str );

    $self->WriteLog( "SaveTextToFile - Saving Data To \"$savePath\"" );

    # Create File Handle (Always Overwrites An Existing File)
    my $fileHandle;

    if( !open( $fileHandle, '>:encoding(UTF-8)', "$savePath" ) )
    {
        $self->WriteLog( "SaveTextToFile - Error: Unable To Open File \"$savePath\" - $!" );
        return -1;
    }

    # Write Data To File
    print( { $fileHandle } "$str" );

    # Buffered Write Errors Surface When The Handle Is Closed
    if( !close( $fileHandle ) )
    {
        $self->WriteLog( "SaveTextToFile - Error: Unable To Write File \"$savePath\" - $!" );
        return -1;
    }

    $self->WriteLog( "SaveTextToFile - File Saved To \"$savePath\"" );

    return 0;
}
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
# Loads the content of an XML file into memory. A path containing ".gz" is
# decompressed with IO::Uncompress::Gunzip straight into a string; any other
# file is read line by line as UTF-8, with every line terminated by "\n".
#
#   $fileDir - Path Of The XML (Or Gzipped XML) File
#
# Returns the file content, or "(null)" when the path is undefined, does not
# exist or cannot be opened. Dies when gunzip fails.
#
# NOTE(review): the gunzip branch does not go through the UTF-8 input layer
# used by the plain-file branch - confirm the consumer handles both forms.
#
# Fix: a failed open() is now logged and reported with "(null)" instead of
# silently yielding an empty string, and the gunzip failure message now
# includes the reason from $GunzipError.
sub _ReadXMLDataFromFile
{
    my ( $self, $fileDir ) = @_;

    if( !defined ( $fileDir ) )
    {
        $self->WriteLog( "_ReadXMLDataFromFile - Error: Directory Not Defined" );
        return "(null)";
    }

    if( !( -e "$fileDir" ) )
    {
        $self->WriteLog( "_ReadXMLDataFromFile - Error: Directory/File Does Not Exist" );
        return "(null)";
    }

    my $data = "";

    # Extract XML File From GZip To Memory
    if ( index( $fileDir, ".gz" ) != -1 )
    {
        IO::Uncompress::Gunzip::gunzip( "$fileDir" => \$data )
            or die "gunzip failed: $IO::Uncompress::Gunzip::GunzipError\n";
    }
    # Read XML Data From File To Memory
    else
    {
        my $fileHandle;

        if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
        {
            $self->WriteLog( "_ReadXMLDataFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
            return "(null)";
        }

        while( my $row = <$fileHandle> )
        {
            chomp $row;
            $data .= "$row\n";
        }

        close( $fileHandle );
    }

    $self->WriteLog( "_ReadXMLDataFromFile - Reading Data Complete/Data Stored" );

    return $data;
}
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
# Writes the text corpus string (GetTextCorpusStr()) to $savePath as UTF-8
# while holding a lock on $writeLock, so that only one thread writes at a
# time.
#
#   $savePath     - Path Of The File To Write
#   $appendToFile - 1 Appends To The File, Any Other Value Overwrites It; When
#                   Undefined The Value Of GetOverwriteExitingFile() Is Used
#
# NOTE(review): the default for an "append" flag is taken from a getter named
# "overwrite existing file" - confirm the two settings really share polarity.
#
# Returns 1 on success and -1 when no path was given or the file could not be
# opened or closed.
#
# Fix: open() and close() are now checked, and a flag value other than 0 or 1
# no longer leaves the file handle undefined (which made print() die).
sub _SaveTextCorpusToFile
{
    my ( $self, $savePath, $appendToFile ) = @_;

    # Prevent Other Threads From Writing At The Same Time
    {
        lock( $writeLock );

        if( !defined( $savePath ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: No Save Path Specified" );
            return -1;
        }

        $appendToFile = $self->GetOverwriteExitingFile() if !defined ( $appendToFile );

        $self->WriteLog( "_SaveTextCorpusToFile - Saving Text Corpus To \"$savePath\"" );

        # Append To File If $appendToFile == 1, Over Write File Otherwise
        my $appendRequested = ( defined( $appendToFile ) && $appendToFile == 1 );
        my $openMode        = $appendRequested ? '>>:encoding(UTF-8)' : '>:encoding(UTF-8)';

        # Create File Handle
        my $fileHandle;

        if( !open( $fileHandle, $openMode, "$savePath" ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: Unable To Open File \"$savePath\" - $!" );
            return -1;
        }

        # Write Data To File
        print( { $fileHandle } $self->GetTextCorpusStr() );

        # Buffered Write Errors Surface When The Handle Is Closed
        if( !close( $fileHandle ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: Unable To Write File \"$savePath\" - $!" );
            return -1;
        }

        $self->WriteLog( "_SaveTextCorpusToFile - Text Corpus Saved To \"$savePath\"" );
    }

    return 1;
}
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
# Tests whether a date lies within an inclusive date range. All three dates
# use the "Month/Day/Year" format and are compared numerically, year first,
# then month, then day.
#
#   $date      - Date To Test
#   $beginDate - First Date Of The Range (Defaults To GetBeginDate())
#   $endDate   - Last Date Of The Range (Defaults To GetEndDate())
#
# Returns 1 when $date is inside the range; 0 when it is outside, when $date
# is undefined, when any date does not split into exactly three parts, or when
# any part is negative.
sub IsDateInSpecifiedRange
{
    my ( $self, $date, $beginDate, $endDate ) = @_;

    if( !defined ( $date ) )
    {
        $self->WriteLog( "Error: Date Not Specified To Check Against Date Range" );
        return 0;
    }

    # Fall Back To The Configured Range Limits (Both Warnings Are Logged First)
    my $useDefaultBegin = !defined ( $beginDate );
    my $useDefaultEnd   = !defined ( $endDate );

    $self->WriteLog( "Warning - BeginDate Parameter Not Specified - Using Default Value: " . $self->GetBeginDate() ) if $useDefaultBegin;
    $self->WriteLog( "Warning - EndDate Parameter Not Specified - Using Default Value: " . $self->GetEndDate() ) if $useDefaultEnd;

    $beginDate = $self->GetBeginDate() if $useDefaultBegin;
    $endDate   = $self->GetEndDate()   if $useDefaultEnd;

    my @dateParts  = split( '/', $date );
    my @beginParts = split( '/', $beginDate );
    my @endParts   = split( '/', $endDate );

    # Check(s) - Each Date Must Consist Of Exactly Three Parts
    for my $candidate ( [ $date, \@dateParts ], [ $beginDate, \@beginParts ], [ $endDate, \@endParts ] )
    {
        my ( $text, $parts ) = @{ $candidate };
        next if @{ $parts } == 3;

        $self->WriteLog( "Invalid Date Format - Requested Format: Month/Day/Year : Specified Format - $text" );
        return 0;
    }

    my ( $dateMonth,  $dateDay,  $dateYear  ) = @dateParts;
    my ( $beginMonth, $beginDay, $beginYear ) = @beginParts;
    my ( $endMonth,   $endDay,   $endYear   ) = @endParts;

    # Check(s) - No Negative Parts
    return 0 if grep { $_ < 0 } ( $dateYear,  $beginYear,  $endYear,
                                  $dateMonth, $beginMonth, $endMonth,
                                  $dateDay,   $beginDay,   $endDay );

    # Year Outside The Range
    return 0 if ( $dateYear < $beginYear || $dateYear > $endYear );

    my $inBeginYear = ( $dateYear == $beginYear );
    my $inEndYear   = ( $dateYear == $endYear );

    # Month Outside The Range Within A Boundary Year
    return 0 if ( $inBeginYear && $dateMonth < $beginMonth );
    return 0 if ( $inEndYear   && $dateMonth > $endMonth );

    # Day Outside The Range Within A Boundary Month
    return 0 if ( $inBeginYear && $dateMonth == $beginMonth && $dateDay < $beginDay );
    return 0 if ( $inEndYear   && $dateMonth == $endMonth   && $dateDay > $endDay );

    return 1;
}
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
# Classifies a file system path.
#
#   $path - Path To Inspect
#
# Returns "file" for a plain file, "dir" for a directory and "unknown" when
# the path is undefined, does not exist, or exists but is neither a plain file
# nor a directory.
#
# Fix: an existing path of another type (device, socket, pipe, ...) used to
# fall off the end of the sub and return a false value instead of "unknown".
sub IsFileOrDirectory
{
    my ( $self, $path ) = @_;

    # Check(s)
    return "unknown" if !defined( $path );
    return "unknown" if !( -e $path );

    return "file" if ( -f $path );
    return "dir"  if ( -d $path );

    # Exists, But Is Neither A Plain File Nor A Directory
    return "unknown";
}
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
# Normalises a string for compound word matching: lower-cases it, drops "'s",
# turns hyphens and every character other than 'a'-'z', CR and LF into single
# spaces, rewrites line endings for the OS reported by GetOSType(), trims both
# ends and collapses repeated whitespace.
#
#   $str - String To Clean (An Undefined Value Is Treated As An Empty String)
#
# Returns the cleaned string.
#
# Fix: the replacement line ending used to default to the empty string, so on
# any OS other than "MSWin32", "linux" and "MacOS" every line break was
# deleted and the words on either side were fused. "\012" is now the default.
sub RemoveSpecialCharactersFromString
{
    my ( $self, $str ) = @_;

    $str = "" if !defined( $str );

    $str = lc( $str );                      # Convert all characters to lowercase
    $str =~ s/ +/ /g;                       # Remove duplicate white spaces between words
    $str =~ s/'s//g;                        # Remove "'s" characters (Apostrophe 's')
    $str =~ s/-/ /g;                        # Replace all hyphen characters to spaces
    $str =~ tr/a-z\015\012/ /cs;            # Replace all characters except 'a' to 'z' and new-line characters with a space

    # Convert String Line Ending Suitable To The Target
    my $os = $self->GetOSType();
    $os = "" if !defined( $os );

    my $lineEnding = "\012";
    $lineEnding = "\015\012" if ( $os eq "MSWin32" );
    $lineEnding = "\015"     if ( $os eq "MacOS" );

    $str =~ s/(\015\012|\012|\015)/$lineEnding/g;

    # Removes Spaces At Both Ends Of String And More Than Once Space In-Between Ends
    $str =~ s/^\s+|\s(?=\s)|\s+$//g;

    return $str;
}
1303
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
# Determines the type of a file by handing its path to
# File::Type's checktype_filename().
#
#   $filePath - Path Of The File To Inspect
#
# Returns whatever checktype_filename() reports for that path.
sub GetFileType
{
    my ( $self, $filePath ) = @_;

    # A Throw-Away Checker Object Is Sufficient; The Call Is Made In Scalar Context
    return scalar( File::Type->new()->checktype_filename( $filePath ) );
}
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
# Validates the begin/end dates stored in the object.
#
# Both dates are read through GetBeginDate()/GetEndDate().  Each must consist
# of three numeric components in "Month/Day/Year" or "Month-Day-Year" order and
# the begin date must not be later than the end date.  A date written with "-"
# delimiters is rewritten with "/" delimiters and stored back through
# SetBeginDate()/SetEndDate().
#
# Input : None
# Output: 0 = Dates are valid / -1 = Error (the reason is passed to WriteLog())
sub _DateCheck
{
    my ( $self ) = @_;

    my $beginDate = $self->GetBeginDate();
    my $endDate   = $self->GetEndDate();

    # Check(s)
    if( !defined( $beginDate ) )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Not Defined" );
        return -1;
    }

    if( !defined( $endDate ) )
    {
        $self->WriteLog( "_DateCheck - Error: End Date Not Defined" );
        return -1;
    }

    # Parse Begin Date ("/" takes precedence when both delimiters are present)
    my $delimiter = "";
    $delimiter = "-" if index( $beginDate, "-" ) != -1;
    $delimiter = "/" if index( $beginDate, "/" ) != -1;

    if( $delimiter eq "" )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Improper Format" );
        return -1;
    }

    my @bDateAry = split( $delimiter, $beginDate );

    # Only a date with exactly three numeric components may be inspected
    # numerically; anything else is reported by the basic checks below instead
    # of being silently treated as zero.
    my $bDateOk = ( @bDateAry == 3 && scalar( grep { $_ !~ /^\s*\d+\s*$/ } @bDateAry ) == 0 ) ? 1 : 0;

    # Check For Default Begin Date And Adjust Accordingly
    if( $bDateOk && $bDateAry[0] == 0 && $bDateAry[1] == 0 && $bDateAry[2] == 0 )
    {
        @bDateAry = ( 1, 1, 0 );
    }

    # Set Date In Proper Format
    if( $delimiter eq "-" )
    {
        $beginDate = join( '/', @bDateAry );
        $self->SetBeginDate( $beginDate );
    }

    # Parse End Date
    $delimiter = "";
    $delimiter = "-" if index( $endDate, "-" ) != -1;
    $delimiter = "/" if index( $endDate, "/" ) != -1;

    if( $delimiter eq "" )
    {
        $self->WriteLog( "_DateCheck - Error: End Date Improper Format" );
        return -1;
    }

    my @eDateAry = split( $delimiter, $endDate );
    my $eDateOk  = ( @eDateAry == 3 && scalar( grep { $_ !~ /^\s*\d+\s*$/ } @eDateAry ) == 0 ) ? 1 : 0;

    # Check For Default End Date And Adjust Accordingly
    if( $eDateOk && $eDateAry[0] == 99 && $eDateAry[1] == 99 && $eDateAry[2] == 9999 )
    {
        @eDateAry = ( 12, 31, 9999 );
    }

    # Set Date In Proper Format
    if( $delimiter eq "-" )
    {
        $endDate = join( '/', @eDateAry );
        $self->SetEndDate( $endDate );
    }

    # Basic Checks (both dates are reported before giving up)
    $self->WriteLog( "_DateCheck - Error: Begin Date Not Specified In \"Month/Day/Year\" or \"Month-Day-Year\" Format" ) if ( !$bDateOk );
    $self->WriteLog( "_DateCheck - Error: End Date Not Specified In \"Month/Day/Year\" or \"Month-Day-Year\" Format" ) if ( !$eDateOk );
    return -1 if ( !$bDateOk || !$eDateOk );

    # Range checks: [ component index, component name, minimum, maximum ]
    for my $rule ( [ 0, "Month", 1, 12 ], [ 1, "Day", 1, 31 ], [ 2, "Year", 0, 9999 ] )
    {
        my ( $index, $label, $min, $max ) = @{ $rule };
        my $outOfRange = 0;

        for my $date ( [ "Begin", \@bDateAry ], [ "End", \@eDateAry ] )
        {
            my ( $which, $parts ) = @{ $date };
            next if ( $parts->[$index] >= $min && $parts->[$index] <= $max );

            $self->WriteLog( "_DateCheck - Error: Incorrect $which Date $label Value - Expected Value: $min-$max / Specified Value: " . $parts->[$index] );
            $outOfRange = 1;
        }

        return -1 if ( $outOfRange );
    }

    # Advanced Checks (begin date must not be later than end date)
    if( $bDateAry[2] > $eDateAry[2] )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Year > End Date Year" );
        return -1;
    }

    if( $bDateAry[2] == $eDateAry[2] && $bDateAry[0] > $eDateAry[0] )
    {
        $self->WriteLog( "_DateCheck - Error: Years Equal, Begin Date Month > End Date Month" );
        return -1;
    }

    if( $bDateAry[2] == $eDateAry[2] && $bDateAry[0] == $eDateAry[0] && $bDateAry[1] > $eDateAry[1] )
    {
        $self->WriteLog( "_DateCheck - Error: Years And Months Equal, Begin Date Day > End Date Day" );
        return -1;
    }

    return 0;
}
1409
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
# Returns the operating system name Perl was built for (the value of $^O),
# e.g. "linux" or "MSWin32".
sub GetOSType
{
    my $osName = $^O;
    return $osName;
}
1415
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
###################################################################################### |
1418
|
|
|
|
|
|
|
# Accessors |
1419
|
|
|
|
|
|
|
###################################################################################### |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
# Returns the console debug-output flag, initialising it to 0 when unset.
sub GetDebugLog
{
    my ( $self ) = @_;

    unless( defined( $self->{ _debugLog } ) )
    {
        $self->{ _debugLog } = 0;
    }

    return $self->{ _debugLog };
}
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
# Returns the write-to-log-file flag, initialising it to 0 when unset.
sub GetWriteLog
{
    my ( $self ) = @_;

    unless( defined( $self->{ _writeLog } ) )
    {
        $self->{ _writeLog } = 0;
    }

    return $self->{ _writeLog };
}
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
# Returns the store-article-titles flag, initialising it to 1 when unset.
sub GetStoreTitle
{
    my ( $self ) = @_;

    unless( defined( $self->{ _storeTitle } ) )
    {
        $self->{ _storeTitle } = 1;
    }

    return $self->{ _storeTitle };
}
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
# Returns the store-article-abstracts flag, initialising it to 1 when unset.
sub GetStoreAbstract
{
    my ( $self ) = @_;

    unless( defined( $self->{ _storeAbstract } ) )
    {
        $self->{ _storeAbstract } = 1;
    }

    return $self->{ _storeAbstract };
}
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
# Returns the quick-parse flag, initialising it to 0 when unset.
sub GetQuickParse
{
    my ( $self ) = @_;

    unless( defined( $self->{ _quickParse } ) )
    {
        $self->{ _quickParse } = 0;
    }

    return $self->{ _quickParse };
}
1455
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
# Returns the compoundify-text flag, initialising it to 0 when unset.
sub GetCompoundifyText
{
    my ( $self ) = @_;

    unless( defined( $self->{ _compoundifyText } ) )
    {
        $self->{ _compoundifyText } = 0;
    }

    return $self->{ _compoundifyText };
}
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
# Returns the configured number of threads.  When none has been set, the
# value reported by Sys::CpuAffinity::getNumCpus() is stored and returned.
sub GetNumOfThreads
{
    my ( $self ) = @_;

    unless( defined( $self->{ _numOfThreads } ) )
    {
        $self->{ _numOfThreads } = Sys::CpuAffinity::getNumCpus();
    }

    return $self->{ _numOfThreads };
}
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
# Returns the working directory.  When none has been set, the process's
# current directory (Cwd::getcwd()) is stored and returned.
sub GetWorkingDir
{
    my ( $self ) = @_;

    unless( defined( $self->{ _workingDir } ) )
    {
        $self->{ _workingDir } = Cwd::getcwd();
    }

    return $self->{ _workingDir };
}
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
# Returns the save path, initialising it to the placeholder "(null)" when unset.
sub GetSavePath
{
    my ( $self ) = @_;

    unless( defined( $self->{ _savePath } ) )
    {
        $self->{ _savePath } = "(null)";
    }

    return $self->{ _savePath };
}
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
# Returns the begin date of the date range, initialising it to the default
# "00/00/0000" when unset.
sub GetBeginDate
{
    my ( $self ) = @_;

    unless( defined( $self->{ _beginDate } ) )
    {
        $self->{ _beginDate } = "00/00/0000";
    }

    return $self->{ _beginDate };
}
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
# Returns the end date of the date range, initialising it to the default
# "99/99/9999" when unset.
sub GetEndDate
{
    my ( $self ) = @_;

    unless( defined( $self->{ _endDate } ) )
    {
        $self->{ _endDate } = "99/99/9999";
    }

    return $self->{ _endDate };
}
1497
|
|
|
|
|
|
|
|
1498
|
|
|
|
|
|
|
# Returns the XML string held for parsing, initialising it to the placeholder
# "(null)" when unset.
sub GetXMLStringToParse
{
    my ( $self ) = @_;

    unless( defined( $self->{ _xmlStringToParse } ) )
    {
        $self->{ _xmlStringToParse } = "(null)";
    }

    return $self->{ _xmlStringToParse };
}
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
# Returns the accumulated text corpus string, initialising it to "" when unset.
sub GetTextCorpusStr
{
    my ( $self ) = @_;

    unless( defined( $self->{ _textCorpusStr } ) )
    {
        $self->{ _textCorpusStr } = "";
    }

    return $self->{ _textCorpusStr };
}
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
# Returns the stored log file handle, or undef when none has been set.
# The "_fileHandle" slot is created (holding undef) if it did not exist yet.
sub GetFileHandle
{
    my ( $self ) = @_;

    unless( defined( $self->{ _fileHandle } ) )
    {
        $self->{ _fileHandle } = undef;
    }

    return $self->{ _fileHandle };
}
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
# Returns the stored twig handler, initialising it to the placeholder "(null)"
# when unset.
sub GetTwigHandler
{
    my ( $self ) = @_;

    unless( defined( $self->{ _twigHandler } ) )
    {
        $self->{ _twigHandler } = "(null)";
    }

    return $self->{ _twigHandler };
}
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
# Returns the parsed-article counter, initialising it to 0 when unset.
sub GetParsedCount
{
    my ( $self ) = @_;

    unless( defined( $self->{ _parsedCount } ) )
    {
        $self->{ _parsedCount } = 0;
    }

    return $self->{ _parsedCount };
}
1532
|
|
|
|
|
|
|
|
1533
|
|
|
|
|
|
|
# Returns the temporary string buffer, initialising it to "" when unset.
sub GetTempStr
{
    my ( $self ) = @_;

    unless( defined( $self->{ _tempStr } ) )
    {
        $self->{ _tempStr } = "";
    }

    return $self->{ _tempStr };
}
1539
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
# Returns the temporary date value, initialising it to "" when unset.
sub GetTempDate
{
    my ( $self ) = @_;

    unless( defined( $self->{ _tempDate } ) )
    {
        $self->{ _tempDate } = "";
    }

    return $self->{ _tempDate };
}
1546
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
# Returns the stored compound words as a list (a copy of the array's elements).
#
# Input : None
# Output: List of compound words; the empty list when none have been stored.
sub GetCompoundWordAry
{
    my ( $self ) = @_;

    # The slot must hold an array reference.  Assigning "()" to a scalar slot
    # stores undef, and dereferencing undef below is fatal under "use strict".
    $self->{ _compoundWordAry } = [] if !defined ( $self->{ _compoundWordAry } );

    return @{ $self->{ _compoundWordAry } };
}
1553
|
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
|
# Returns the compound word binary search tree.  When none has been stored,
# a new Word2vec::Bst object is created, stored and returned.
sub GetCompoundWordBST
{
    my ( $self ) = @_;

    unless( defined( $self->{ _compoundWordBST } ) )
    {
        $self->{ _compoundWordBST } = Word2vec::Bst->new();
    }

    return $self->{ _compoundWordBST };
}
1560
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
# Returns the maximum compound word length, initialising it to 20 when unset.
sub GetMaxCompoundWordLength
{
    my ( $self ) = @_;

    unless( defined( $self->{ _maxCompoundWordLength } ) )
    {
        $self->{ _maxCompoundWordLength } = 20;
    }

    return $self->{ _maxCompoundWordLength };
}
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
# Returns the overwrite-existing-file flag, initialising it to 0 when unset.
sub GetOverwriteExistingFile
{
    my ( $self ) = @_;

    unless( defined( $self->{ _overwriteExistingFile } ) )
    {
        $self->{ _overwriteExistingFile } = 0;
    }

    return $self->{ _overwriteExistingFile };
}
1574
|
|
|
|
|
|
|
|
1575
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
###################################################################################### |
1577
|
|
|
|
|
|
|
# Mutators |
1578
|
|
|
|
|
|
|
###################################################################################### |
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
# Stores the store-article-titles flag and returns the stored value.
sub SetStoreTitle
{
    my ( $self, $value ) = @_;

    $self->{ _storeTitle } = $value;

    return $self->{ _storeTitle };
}
1585
|
|
|
|
|
|
|
|
1586
|
|
|
|
|
|
|
# Stores the store-article-abstracts flag and returns the stored value.
sub SetStoreAbstract
{
    my ( $self, $value ) = @_;

    $self->{ _storeAbstract } = $value;

    return $self->{ _storeAbstract };
}
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
# Stores the working directory path and returns the stored value.
sub SetWorkingDir
{
    my ( $self, $dir ) = @_;

    $self->{ _workingDir } = $dir;

    return $self->{ _workingDir };
}
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
# Stores the save path and returns the stored value.
sub SetSavePath
{
    my ( $self, $dir ) = @_;

    $self->{ _savePath } = $dir;

    return $self->{ _savePath };
}
1603
|
|
|
|
|
|
|
|
1604
|
|
|
|
|
|
|
# Stores the quick-parse flag and returns the stored value.
sub SetQuickParse
{
    my ( $self, $value ) = @_;

    $self->{ _quickParse } = $value;

    return $self->{ _quickParse };
}
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
# Stores the compoundify-text flag and returns the stored value.
sub SetCompoundifyText
{
    my ( $self, $value ) = @_;

    $self->{ _compoundifyText } = $value;

    return $self->{ _compoundifyText };
}
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
# Stores the number of threads and returns the stored value.
# A negative value is logged as a warning and replaced by the value reported
# by Sys::CpuAffinity::getNumCpus().
sub SetNumOfThreads
{
    my ( $self, $value ) = @_;

    # Check
    if( $value < 0 )
    {
        $self->WriteLog( "SetNumOfThreads - Warning: Number Of Threads Value < 0 / Setting Default Value" );
        $value = Sys::CpuAffinity::getNumCpus();
    }

    $self->{ _numOfThreads } = $value;

    return $self->{ _numOfThreads };
}
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
# Stores the begin date string and returns the stored value.
# No format validation is performed here.
sub SetBeginDate
{
    my ( $self, $str ) = @_;

    $self->{ _beginDate } = $str;

    return $self->{ _beginDate };
}
1632
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
# Stores the end date string and returns the stored value.
# No format validation is performed here.
sub SetEndDate
{
    my ( $self, $str ) = @_;

    $self->{ _endDate } = $str;

    return $self->{ _endDate };
}
1638
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
# Stores the XML string to parse and returns the stored value.
sub SetXMLStringToParse
{
    my ( $self, $str ) = @_;

    $self->{ _xmlStringToParse } = $str;

    return $self->{ _xmlStringToParse };
}
1644
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
# Replaces the text corpus string and returns the stored value.
sub SetTextCorpusStr
{
    my ( $self, $str ) = @_;

    $self->{ _textCorpusStr } = $str;

    return $self->{ _textCorpusStr };
}
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
# Appends a whitespace-normalised copy of $str, followed by one space, to the
# text corpus string.  Undefined or empty input is ignored.
#
# Input : $str -> Text to append. (String)
# Output: None when the input is ignored.
sub AppendStrToTextCorpus
{
    my ( $self, $str ) = @_;

    # Definedness is tested first so that an undefined argument never reaches
    # the string comparison (which would emit an "uninitialized" warning).
    return if ( !defined( $str ) || $str eq "" );

    # Prevent Other Threads From Appending Data At The Same Time
    {
        lock( $appendLock );

        # Removes Spaces At Both Ends Of String And More Than One Space In-Between Ends
        $str =~ s/^\s+|\s(?=\s)|\s+$//g;

        # Append string to text corpus
        $self->{ _textCorpusStr } .= "$str ";
    }
}
1668
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
# Resets the text corpus string to "" and returns the (now empty) stored value.
sub ClearTextCorpusStr
{
    my ( $self ) = @_;

    $self->{ _textCorpusStr } = "";

    return $self->{ _textCorpusStr };
}
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
# Cleans $str and stores it as the temporary string, returning the stored value.
# Cleaning runs the text through RemoveSpecialCharactersFromString() and then
# Text::Unidecode::unidecode(), in that order.
sub SetTempStr
{
    my ( $self, $str ) = @_;

    my $stripped    = $self->RemoveSpecialCharactersFromString( $str );
    my $asciiOnlyStr = Text::Unidecode::unidecode( $stripped );

    $self->{ _tempStr } = $asciiOnlyStr;

    return $self->{ _tempStr };
}
1685
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
# Cleans $str, adds its word count to the file-level $preCompWordCount counter
# and appends the cleaned text plus one trailing space to the temporary string.
# Returns the temporary string after the append.
sub AppendToTempStr
{
    my ( $self, $str ) = @_;

    # Same cleaning steps as SetTempStr(): special-character removal followed
    # by Text::Unidecode transliteration.
    $str = Text::Unidecode::unidecode( $self->RemoveSpecialCharactersFromString( $str ) );

    # Removes Spaces At Both Ends Of String And More Than One Space In-Between Ends
    $str =~ s/^\s+|\s(?=\s)|\s+$//g;

    # Increment Word Counter By The Number Of Whitespace-Separated Words
    my @tokens = split( ' ', $str );
    $preCompWordCount += scalar( @tokens );

    # Append string to temporary string
    $self->{ _tempStr } .= "$str ";

    return $self->{ _tempStr };
}
1705
|
|
|
|
|
|
|
|
1706
|
|
|
|
|
|
|
# Resets the temporary string to "" and returns the (now empty) stored value.
sub ClearTempStr
{
    my ( $self ) = @_;

    $self->{ _tempStr } = "";

    return $self->{ _tempStr };
}
1711
|
|
|
|
|
|
|
|
1712
|
|
|
|
|
|
|
# Stores the temporary date string and returns the stored value.
sub SetTempDate
{
    my ( $self, $str ) = @_;

    $self->{ _tempDate } = $str;

    return $self->{ _tempDate };
}
1717
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
# Resets the temporary date to "" and returns the (now empty) stored value.
sub ClearTempDate
{
    my ( $self ) = @_;

    $self->{ _tempDate } = "";

    return $self->{ _tempDate };
}
1723
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
# Replaces the stored compound word array with a copy of the given array.
#
# Input : $aryRef -> Reference to an array of compound words; an undefined
#                    value is treated as an empty array.
# Output: Result of the list assignment (the number of elements copied when
#         called in scalar context).
sub SetCompoundWordAry
{
    my ( $self, $aryRef ) = @_;

    # Only dereference the current slot when it actually holds something;
    # dereferencing undef here is fatal under "use strict" on first use.
    my $current = $self->{ _compoundWordAry };

    if( defined( $current ) && @{ $current } > 0 )
    {
        $self->WriteLog( "Warning: Setting CompoundWordArray when array is already defined - Clearing Previous Array" );
        undef( $self->{ _compoundWordAry } );
    }

    $aryRef = [] if !defined( $aryRef );

    # Copies the elements; the caller's array is not shared with the object.
    return @{ $self->{ _compoundWordAry } } = @{ $aryRef };
}
1731
|
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
|
# Discards the stored compound word array and leaves a fresh empty array in
# its place.  Returns the (empty) contents of the new array.
sub ClearCompoundWordAry
{
    my ( $self ) = @_;

    my $emptied = [];
    $self->{ _compoundWordAry } = $emptied;

    return @{ $emptied };
}
1738
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
# Stores the given compound word binary search tree and returns it.
# An already-stored tree is first logged as a warning, explicitly destroyed
# via its DESTROY() method and released.
sub SetCompoundWordBST
{
    my ( $self, $bst ) = @_;

    if( defined( $self->{ _compoundWordBST } ) )
    {
        $self->WriteLog( "Warning: Setting CompoundWordBST when BST is already defined - Clearing Previous BST" );
        $self->{ _compoundWordBST }->DESTROY();
        undef( $self->{ _compoundWordBST } );
    }

    $self->{ _compoundWordBST } = $bst;

    return $self->{ _compoundWordBST };
}
1747
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
# Releases the stored compound word binary search tree.
# Returns the slot's value afterwards, which is always undef.
sub ClearCompoundWordBST
{
    my ( $self ) = @_;

    $self->{ _compoundWordBST } = undef;

    return $self->{ _compoundWordBST };
}
1754
|
|
|
|
|
|
|
|
1755
|
|
|
|
|
|
|
# Stores the maximum compound word length and returns the stored value.
sub SetMaxCompoundWordLength
{
    my ( $self, $value ) = @_;

    $self->{ _maxCompoundWordLength } = $value;

    return $self->{ _maxCompoundWordLength };
}
1760
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
# Stores the overwrite-existing-file flag and returns the stored value.
sub SetOverwriteExistingFile
{
    my ( $self, $value ) = @_;

    $self->{ _overwriteExistingFile } = $value;

    return $self->{ _overwriteExistingFile };
}
1766
|
|
|
|
|
|
|
|
1767
|
|
|
|
|
|
|
|
1768
|
|
|
|
|
|
|
###################################################################################### |
1769
|
|
|
|
|
|
|
# Debug Functions |
1770
|
|
|
|
|
|
|
###################################################################################### |
1771
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
# Returns the current local time as a zero-padded "HH:MM:SS" string.
sub GetTime
{
    my ( $self ) = @_;

    # localtime() in list context starts with ( seconds, minutes, hours, ... )
    my ( $seconds, $minutes, $hours ) = localtime();

    return sprintf( "%02d:%02d:%02d", $hours, $minutes, $seconds );
}
1783
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
# Returns the current local date as "Month/Day/Year" (month and day are not
# zero-padded, the year has four digits).
sub GetDate
{
    my ( $self ) = @_;

    # localtime() indices: 3 = day of month, 4 = month (0-based), 5 = years since 1900
    my @timeParts = localtime();
    my $month     = $timeParts[4] + 1;
    my $dayOfMon  = $timeParts[3];
    my $fullYear  = $timeParts[5] + 1900;

    return join( "/", $month, $dayOfMon, $fullYear );
}
1794
|
|
|
|
|
|
|
|
1795
|
|
|
|
|
|
|
# Emits a timestamped log message to the console and/or the log file.
#
# Input : $string       -> Message text; nothing happens when it is undefined.
#         $printNewLine -> 0 suppresses the trailing new-line (Default = 1)
# Output: None
#
# Console output happens when GetDebugLog() is true, file output when
# GetWriteLog() is true and a file handle is available via GetFileHandle().
# Objects whose class is not exactly "Word2vec::Xmltow2v" are refused with a
# console notice.
sub WriteLog
{
    my ( $self, $string, $printNewLine ) = @_;

    return if !defined ( $string );
    $printNewLine = 1 if !defined ( $printNewLine );

    # Prevent Other Threads From Writing At The Same Time
    lock( $debugLock );

    my $calledOnOwnObject = ( ref ( $self ) eq "Word2vec::Xmltow2v" );
    my $refusalNotice     = " - xmltow2v: Cannot Call WriteLog() From Outside Module!\n";

    # Console Output
    if( $self->GetDebugLog() )
    {
        unless( $calledOnOwnObject )
        {
            print( GetDate() . " " . GetTime() . $refusalNotice );
            return;
        }

        print( GetDate() . " " . GetTime() . " - xmltow2v::$string" );
        print( "\n" ) if( $printNewLine != 0 );
    }

    # Log File Output
    if( $self->GetWriteLog() )
    {
        unless( $calledOnOwnObject )
        {
            print( GetDate() . " " . GetTime() . $refusalNotice );
            return;
        }

        my $logHandle = $self->GetFileHandle();

        if( defined( $logHandle ) )
        {
            print { $logHandle } GetDate() . " " . GetTime() . " - xmltow2v::$string";
            print { $logHandle } "\n" if( $printNewLine != 0 );
        }
    }
}
1837
|
|
|
|
|
|
|
|
1838
|
|
|
|
|
|
|
#################### All Modules Are To Output "1"(True) at EOF ###################### |
1839
|
|
|
|
|
|
|
1; |
1840
|
|
|
|
|
|
|
|
1841
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
=head1 NAME |
1843
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
Word2vec::Xmltow2v - Medline XML-To-W2V Module. |
1845
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
=head1 SYNOPSIS |
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
# Parameters: Debug Output = True, Write Log = False, StoreTitle = True, StoreAbstract = True, Quick Parse = True, CompoundifyText = True, Use Multi-Threading (2 Threads) |
1851
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new( 1, 0, 1, 1, 1, 1, 2 ); # Note: Specifying no parameters implies default settings. |
1852
|
|
|
|
|
|
|
$xmlconv->SetWorkingDir( "Medline/XML/Directory/Here" ); |
1853
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "textcorpus.txt" ); |
1854
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
1855
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
1856
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
1857
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
1858
|
|
|
|
|
|
|
$xmlconv->SetOverwriteExistingFile( 1 ); |
1859
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
# If Compound Word File Exists, Store It In Memory And Create Compound Word Binary Search Tree |
1861
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "compoundword.txt", 1 ); |
1862
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
1863
|
|
|
|
|
|
|
|
1864
|
|
|
|
|
|
|
# Parse XML Files or Directory Of Files |
1865
|
|
|
|
|
|
|
$xmlconv->ConvertMedlineXMLToW2V( "/xmlDirectory/" ); |
1866
|
|
|
|
|
|
|
undef( $xmlconv ); |
1867
|
|
|
|
|
|
|
|
1868
|
|
|
|
|
|
|
=head1 DESCRIPTION |
1869
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
Word2vec::Xmltow2v is an XML-to-text module which converts Medline XML article title |
1871
|
|
|
|
|
|
|
and abstract data, given a date range, into a plain text corpus for use |
1872
|
|
|
|
|
|
|
with Word2vec::Interface. It also "compoundifies" during text corpus compilation |
1873
|
|
|
|
|
|
|
given a compound word file. |
1874
|
|
|
|
|
|
|
|
1875
|
|
|
|
|
|
|
=head2 Main Functions |
1876
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
=head3 new |
1878
|
|
|
|
|
|
|
|
1879
|
|
|
|
|
|
|
Description: |
1880
|
|
|
|
|
|
|
|
1881
|
|
|
|
|
|
|
Returns a new 'Word2vec::Xmltow2v' module object. |
1882
|
|
|
|
|
|
|
|
1883
|
|
|
|
|
|
|
Note: Specifying no parameters implies default options. |
1884
|
|
|
|
|
|
|
|
1885
|
|
|
|
|
|
|
Default Parameters: |
1886
|
|
|
|
|
|
|
debugLog = 0 |
1887
|
|
|
|
|
|
|
writeLog = 0 |
1888
|
|
|
|
|
|
|
storeTitle = 1 |
1889
|
|
|
|
|
|
|
storeAbstract = 1 |
1890
|
|
|
|
|
|
|
quickParse = 0 |
1891
|
|
|
|
|
|
|
compoundifyText = 0 |
1892
|
|
|
|
|
|
|
numOfThreads = Number of CPUs/CPU cores (1 thread per core/CPU) |
1893
|
|
|
|
|
|
|
workingDir = Current Directory |
1894
|
|
|
|
|
|
|
savePath = Current Directory |
1895
|
|
|
|
|
|
|
beginDate = "00/00/0000" |
1896
|
|
|
|
|
|
|
endDate = "99/99/9999" |
1897
|
|
|
|
|
|
|
xmlStringToParse = "(null)" |
1898
|
|
|
|
|
|
|
textCorpusString = "" |
1899
|
|
|
|
|
|
|
twigHandler = 0 |
1900
|
|
|
|
|
|
|
parsedCount = 0 |
1901
|
|
|
|
|
|
|
tempDate = "" |
1902
|
|
|
|
|
|
|
tempStr = "" |
1903
|
|
|
|
|
|
|
outputFileName = "textcorpus.txt" |
1904
|
|
|
|
|
|
|
compoundWordAry = () |
1905
|
|
|
|
|
|
|
compoundWordBST = Word2vec::Bst->new() |
1906
|
|
|
|
|
|
|
maxCompoundWordLength = 0 |
1907
|
|
|
|
|
|
|
overwriteExistingFile = 0 |
1908
|
|
|
|
|
|
|
|
1909
|
|
|
|
|
|
|
Input: |
1910
|
|
|
|
|
|
|
|
1911
|
|
|
|
|
|
|
$debugLog -> Instructs module to print debug statements to the console. (1 = True / 0 = False) |
1912
|
|
|
|
|
|
|
$writeLog -> Instructs module to print debug statements to a log file. (1 = True / 0 = False) |
1913
|
|
|
|
|
|
|
$storeTitle -> Instructs module to store Medline article titles during text corpus compilation. (1 = True / 0 = False) |
1914
|
|
|
|
|
|
|
$storeAbstract -> Instructs module to store Medline article abstracts during text corpus compilation. (1 = True / 0 = False) |
1915
|
|
|
|
|
|
|
$quickParse -> Instructs module to utilize quick XML parsing Functions for known Medline article title and abstract tags. (1 = True / 0 = False) |
1916
|
|
|
|
|
|
|
$compoundifyText -> Instructs module to compoundify text on the fly given a compound word file. This is automatically set |
1917
|
|
|
|
|
|
|
when reading the compound word file to memory regardless of user setting. (1 = True / 0 = False) |
1918
|
|
|
|
|
|
|
$numOfThreads -> Specifies the number of worker threads which parse Medline XML files simultaneously to create the text corpus. |
1919
|
|
|
|
|
|
|
This speeds up text corpus generation by the number of physical cores present on a given machine. (Positive integer value) |
1920
|
|
|
|
|
|
|
i.e. Using four threads on an Intel i7 machine makes text corpus generation roughly four times faster than running single-threaded. |
1921
|
|
|
|
|
|
|
$workingDir -> Specifies the current working directory. (String) |
1922
|
|
|
|
|
|
|
$savePath -> Specifies the save path for text corpus generation. (String) |
1923
|
|
|
|
|
|
|
$beginDate -> Specifies the beginning date range for Medline article text corpus composition. (Format: XX/XX/XXXX) |
1924
|
|
|
|
|
|
|
$endDate -> Specifies the ending date range for Medline article text corpus composition. (Format: XX/XX/XXXX) |
1925
|
|
|
|
|
|
|
$xmlStringToParse -> Storage location for the current Medline XML file in memory. (String) |
1926
|
|
|
|
|
|
|
$textCorpusString -> Temporary storage location for text corpus generation in memory. (String) |
1927
|
|
|
|
|
|
|
$twigHandler -> XML::Twig object location. |
1928
|
|
|
|
|
|
|
$parsedCount -> Number of parsed Medline articles during text corpus generation. |
1929
|
|
|
|
|
|
|
$tempDate -> Temporary storage location for current Medline article date during text corpus compilation. |
1930
|
|
|
|
|
|
|
$tempStr -> Temporary storage location for current Medline article title/abstract during text corpus compilation. |
1931
|
|
|
|
|
|
|
$outputFileName -> Output file path/name. |
1932
|
|
|
|
|
|
|
$compoundWordAry -> Storage location for compound words, used to compoundify text. (Array) <- Deprecated |
1933
|
|
|
|
|
|
|
$compoundWordBST -> Storage location for compound words, used to compoundify text. (Binary Search Tree) <- Supersedes '$compoundWordAry' |
1934
|
|
|
|
|
|
|
$maxCompoundWordLength -> Maximum number of words able to be compoundified in one phrase. ie "six_sea_snakes_were_sailing" = 5 compoundified words. |
1935
|
|
|
|
|
|
|
The compounding algorithm will attempt to compoundify no more than this set value, even though the compound word list could |
1936
|
|
|
|
|
|
|
possibly contain larger compounded phrases. |
1937
|
|
|
|
|
|
|
$overwriteExistingFile -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. |
1938
|
|
|
|
|
|
|
|
1939
|
|
|
|
|
|
|
Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested. Maximum recommended parameters to be specified include: |
1940
|
|
|
|
|
|
|
"debugLog, writeLog, storeTitle, storeAbstract, quickParse, compoundifyText, numOfThreads, workingDir, savePath, beginDate, endDate" |
1941
|
|
|
|
|
|
|
|
1942
|
|
|
|
|
|
|
Output: |
1943
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
Word2vec::Xmltow2v object. |
1945
|
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
|
Example: |
1947
|
|
|
|
|
|
|
|
1948
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1949
|
|
|
|
|
|
|
|
1950
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); # Note: Specifying no parameters implies default settings as listed above. |
1951
|
|
|
|
|
|
|
|
1952
|
|
|
|
|
|
|
undef( $xmlconv ); |
1953
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
# Or |
1955
|
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1957
|
|
|
|
|
|
|
|
1958
|
|
|
|
|
|
|
# Parameters: Debug Output = True, Write Log = False, StoreTitle = True, StoreAbstract = True, Quick Parse = True, CompoundifyText = True, Use Multi-Threading (2 Threads) |
1959
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new( 1, 0, 1, 1, 1, 1, 2 ); |
1960
|
|
|
|
|
|
|
|
1961
|
|
|
|
|
|
|
undef( $xmlconv ); |
1962
|
|
|
|
|
|
|
|
1963
|
|
|
|
|
|
|
=head3 DESTROY |
1964
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
Description: |
1966
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
Removes module objects and variables from memory. |
1968
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
Input: |
1970
|
|
|
|
|
|
|
|
1971
|
|
|
|
|
|
|
None |
1972
|
|
|
|
|
|
|
|
1973
|
|
|
|
|
|
|
Output: |
1974
|
|
|
|
|
|
|
|
1975
|
|
|
|
|
|
|
None |
1976
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
Example: |
1978
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1980
|
|
|
|
|
|
|
|
1981
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
1982
|
|
|
|
|
|
|
|
1983
|
|
|
|
|
|
|
$xmlconv->DESTROY(); |
1984
|
|
|
|
|
|
|
undef( $xmlconv ); |
1985
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
=head3 ConvertMedlineXMLToW2V |
1987
|
|
|
|
|
|
|
|
1988
|
|
|
|
|
|
|
Description: |
1989
|
|
|
|
|
|
|
|
1990
|
|
|
|
|
|
|
Parses specified parameter Medline XML file or directory of files, creating a text corpus. Returns 0 if successful or -1 if an error occurs. |
1991
|
|
|
|
|
|
|
|
1992
|
|
|
|
|
|
|
Note: Supports plain Medline XML or gun-zipped XML files. |
1993
|
|
|
|
|
|
|
|
1994
|
|
|
|
|
|
|
Input: |
1995
|
|
|
|
|
|
|
|
1996
|
|
|
|
|
|
|
$filePath -> XML file path to parse. (This can be a single file or directory of XML/XML.gz files). |
1997
|
|
|
|
|
|
|
|
1998
|
|
|
|
|
|
|
Output: |
1999
|
|
|
|
|
|
|
|
2000
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-Successful |
2001
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
Example: |
2003
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2005
|
|
|
|
|
|
|
|
2006
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); # Note: Specifying no parameters implies default settings |
2007
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "testCorpus.txt" ); |
2008
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
2009
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
2010
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
2011
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
2012
|
|
|
|
|
|
|
$xmlconv->SetOverwriteExistingFile( 1 ); |
2013
|
|
|
|
|
|
|
$xmlconv->ConvertMedlineXMLToW2V( "/xmlDirectory/" ); |
2014
|
|
|
|
|
|
|
undef( $xmlconv ); |
2015
|
|
|
|
|
|
|
|
2016
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
=head3 _ThreadedConvert |
2018
|
|
|
|
|
|
|
|
2019
|
|
|
|
|
|
|
Description: |
2020
|
|
|
|
|
|
|
|
2021
|
|
|
|
|
|
|
Multi-Threaded Medline XML to text corpus conversion function. |
2022
|
|
|
|
|
|
|
|
2023
|
|
|
|
|
|
|
Input: |
2024
|
|
|
|
|
|
|
|
2025
|
|
|
|
|
|
|
$directory -> File directory or directory of files to parse. |
2026
|
|
|
|
|
|
|
|
2027
|
|
|
|
|
|
|
Output: |
2028
|
|
|
|
|
|
|
|
2029
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2030
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
Example: |
2032
|
|
|
|
|
|
|
|
2033
|
|
|
|
|
|
|
Warning: This is a private function called by 'ConvertMedlineXMLToW2V()'. It should not be called outside of xmltow2v module. |
2034
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
=head3 _ParseXMLString |
2036
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
Description: |
2038
|
|
|
|
|
|
|
|
2039
|
|
|
|
|
|
|
Parses passed string parameter for Medline XML article title and abstract data and appends found data to the text corpus. |
2040
|
|
|
|
|
|
|
|
2041
|
|
|
|
|
|
|
Input: |
2042
|
|
|
|
|
|
|
|
2043
|
|
|
|
|
|
|
$string -> Medline XML string data to parse. |
2044
|
|
|
|
|
|
|
|
2045
|
|
|
|
|
|
|
Output: |
2046
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
None |
2048
|
|
|
|
|
|
|
|
2049
|
|
|
|
|
|
|
Example: |
2050
|
|
|
|
|
|
|
|
2051
|
|
|
|
|
|
|
Warning: This is a private function called by "ConvertMedlineXMLToW2V()" and "_ThreadedConvert()". It should not be called outside of xmltow2v module. |
2052
|
|
|
|
|
|
|
|
2053
|
|
|
|
|
|
|
=head3 _CheckParseRequirements |
2054
|
|
|
|
|
|
|
|
2055
|
|
|
|
|
|
|
Description: |
2056
|
|
|
|
|
|
|
|
2057
|
|
|
|
|
|
|
Checks passed string parameter to see if it contains relevant data and XML::Twig handler is initialized. |
2058
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
Input: |
2060
|
|
|
|
|
|
|
|
2061
|
|
|
|
|
|
|
$string -> String data to check |
2062
|
|
|
|
|
|
|
|
2063
|
|
|
|
|
|
|
Output: |
2064
|
|
|
|
|
|
|
|
2065
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2066
|
|
|
|
|
|
|
|
2067
|
|
|
|
|
|
|
Example: |
2068
|
|
|
|
|
|
|
|
2069
|
|
|
|
|
|
|
Warning: This is a private function called by "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2070
|
|
|
|
|
|
|
|
2071
|
|
|
|
|
|
|
=head3 _CheckForNullData |
2072
|
|
|
|
|
|
|
|
2073
|
|
|
|
|
|
|
Description: |
2074
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
Checks passed string parameter for "(null)" string. |
2076
|
|
|
|
|
|
|
|
2077
|
|
|
|
|
|
|
Input: |
2078
|
|
|
|
|
|
|
|
2079
|
|
|
|
|
|
|
$string -> String data to be checked. |
2080
|
|
|
|
|
|
|
|
2081
|
|
|
|
|
|
|
Output: |
2082
|
|
|
|
|
|
|
|
2083
|
|
|
|
|
|
|
$value -> '1' = True/Null data or '0' = False/Valid data |
2084
|
|
|
|
|
|
|
|
2085
|
|
|
|
|
|
|
Example: |
2086
|
|
|
|
|
|
|
|
2087
|
|
|
|
|
|
|
Warning: This is a private function called by "new()" and "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2088
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
=head3 _RemoveXMLVersion |
2090
|
|
|
|
|
|
|
|
2091
|
|
|
|
|
|
|
Description: |
2092
|
|
|
|
|
|
|
|
2093
|
|
|
|
|
|
|
Removes the XML Version string prior to parsing the XML string data. (Deprecated) |
2094
|
|
|
|
|
|
|
|
2095
|
|
|
|
|
|
|
Input: |
2096
|
|
|
|
|
|
|
|
2097
|
|
|
|
|
|
|
$string -> Medline XML string data |
2098
|
|
|
|
|
|
|
|
2099
|
|
|
|
|
|
|
Output: |
2100
|
|
|
|
|
|
|
|
2101
|
|
|
|
|
|
|
None |
2102
|
|
|
|
|
|
|
|
2103
|
|
|
|
|
|
|
Example: |
2104
|
|
|
|
|
|
|
|
2105
|
|
|
|
|
|
|
Warning: This is a private function called by "new()" and "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2106
|
|
|
|
|
|
|
|
2107
|
|
|
|
|
|
|
=head3 _ParseMedlineCitationSet |
2108
|
|
|
|
|
|
|
|
2109
|
|
|
|
|
|
|
Description: |
2110
|
|
|
|
|
|
|
|
2111
|
|
|
|
|
|
|
Parses 'MedlineCitationSet' tag data in Medline XML file. |
2112
|
|
|
|
|
|
|
|
2113
|
|
|
|
|
|
|
Input: |
2114
|
|
|
|
|
|
|
|
2115
|
|
|
|
|
|
|
$twigHandler -> XML::Twig handler |
2116
|
|
|
|
|
|
|
$root -> Beginning of XML directory to parse. ( Directory in Medline XML string data ) |
2117
|
|
|
|
|
|
|
|
2118
|
|
|
|
|
|
|
Output: |
2119
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
None |
2121
|
|
|
|
|
|
|
|
2122
|
|
|
|
|
|
|
Example: |
2123
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2125
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
=head3 _ParseMedlineArticle |
2127
|
|
|
|
|
|
|
|
2128
|
|
|
|
|
|
|
Description: |
2129
|
|
|
|
|
|
|
|
2130
|
|
|
|
|
|
|
Parses 'MedlineArticle' tag data in Medline XML file. |
2131
|
|
|
|
|
|
|
|
2132
|
|
|
|
|
|
|
Input: |
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
$medlineArticle -> Current Medline article directory in XML data (XML::Twig directory) |
2135
|
|
|
|
|
|
|
|
2136
|
|
|
|
|
|
|
Output: |
2137
|
|
|
|
|
|
|
|
2138
|
|
|
|
|
|
|
$value -> '1' = Finished parsing Medline article. |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
Example: |
2141
|
|
|
|
|
|
|
|
2142
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2143
|
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
|
=head3 _ParseDateCreated |
2145
|
|
|
|
|
|
|
|
2146
|
|
|
|
|
|
|
Description: |
2147
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
Parses 'DateCreated' tag data in Medline XML file. |
2149
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
Input: |
2151
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
$article -> Current Medline article in XML data (XML::Twig directory) |
2153
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
Output: |
2155
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
$date -> 'XX/XX/XXXX' (Month/Day/Year) |
2157
|
|
|
|
|
|
|
|
2158
|
|
|
|
|
|
|
Example: |
2159
|
|
|
|
|
|
|
|
2160
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2161
|
|
|
|
|
|
|
|
2162
|
|
|
|
|
|
|
=head3 _ParseArticle |
2163
|
|
|
|
|
|
|
|
2164
|
|
|
|
|
|
|
Description: |
2165
|
|
|
|
|
|
|
|
2166
|
|
|
|
|
|
|
Parses 'Article' tag data in Medline XML file. Fetches 'ArticleTitle', 'Journal' and 'Abstract' XML tags. |
2167
|
|
|
|
|
|
|
|
2168
|
|
|
|
|
|
|
Input: |
2169
|
|
|
|
|
|
|
|
2170
|
|
|
|
|
|
|
$article -> Current Medline article in XML data (XML::Twig directory) |
2171
|
|
|
|
|
|
|
|
2172
|
|
|
|
|
|
|
Output: |
2173
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
None |
2175
|
|
|
|
|
|
|
|
2176
|
|
|
|
|
|
|
Example: |
2177
|
|
|
|
|
|
|
|
2178
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2179
|
|
|
|
|
|
|
|
2180
|
|
|
|
|
|
|
=head3 _ParseJournal |
2181
|
|
|
|
|
|
|
|
2182
|
|
|
|
|
|
|
Description: |
2183
|
|
|
|
|
|
|
|
2184
|
|
|
|
|
|
|
Parses 'Journal' tag data in Medline XML file. Fetches 'Title' XML tag. |
2185
|
|
|
|
|
|
|
|
2186
|
|
|
|
|
|
|
Input: |
2187
|
|
|
|
|
|
|
|
2188
|
|
|
|
|
|
|
$journalRoot -> Current Medline journal directory in XML data (XML::Twig directory) |
2189
|
|
|
|
|
|
|
|
2190
|
|
|
|
|
|
|
Output: |
2191
|
|
|
|
|
|
|
|
2192
|
|
|
|
|
|
|
None |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
Example: |
2195
|
|
|
|
|
|
|
|
2196
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2197
|
|
|
|
|
|
|
|
2198
|
|
|
|
|
|
|
=head3 _ParseOtherAbstract |
2199
|
|
|
|
|
|
|
|
2200
|
|
|
|
|
|
|
Description: |
2201
|
|
|
|
|
|
|
|
2202
|
|
|
|
|
|
|
Parses 'Abstract' tag data in Medline XML file. Fetches 'AbstractText' XML tag. |
2203
|
|
|
|
|
|
|
|
2204
|
|
|
|
|
|
|
Input: |
2205
|
|
|
|
|
|
|
|
2206
|
|
|
|
|
|
|
$abstractRoot -> Current Medline abstract directory in XML data (XML::Twig directory) |
2207
|
|
|
|
|
|
|
|
2208
|
|
|
|
|
|
|
Output: |
2209
|
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
None |
2211
|
|
|
|
|
|
|
|
2212
|
|
|
|
|
|
|
Example: |
2213
|
|
|
|
|
|
|
|
2214
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2215
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
=head3 _QuickParseDateCreated |
2217
|
|
|
|
|
|
|
|
2218
|
|
|
|
|
|
|
Description: |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
Parses 'DateCreated' tag data in Medline XML file. Used when 'QuickParse' member variable is enabled. Sets $tempDate member variable to parsed 'DateCreated' tag data. |
2221
|
|
|
|
|
|
|
|
2222
|
|
|
|
|
|
|
Input: |
2223
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler |
2225
|
|
|
|
|
|
|
$article -> Current Medline article directory in XML data (XML::Twig directory) |
2226
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
Output: |
2228
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
None |
2230
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
Example: |
2232
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2234
|
|
|
|
|
|
|
|
2235
|
|
|
|
|
|
|
=head3 _QuickParseJournal |
2236
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
Description: |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
Parses 'Journal' tag data in Medline XML file. Fetches 'Title' XML tag. Used when 'QuickParse' member variable is enabled. |
2240
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2241
|
|
|
|
|
|
|
|
2242
|
|
|
|
|
|
|
Input: |
2243
|
|
|
|
|
|
|
|
2244
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2245
|
|
|
|
|
|
|
$journalRoot -> Current Medline journal directory in XML data (XML::Twig directory) |
2246
|
|
|
|
|
|
|
|
2247
|
|
|
|
|
|
|
Output: |
2248
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
None |
2250
|
|
|
|
|
|
|
|
2251
|
|
|
|
|
|
|
Example: |
2252
|
|
|
|
|
|
|
|
2253
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2254
|
|
|
|
|
|
|
|
2255
|
|
|
|
|
|
|
=head3 _QuickParseArticle |
2256
|
|
|
|
|
|
|
|
2257
|
|
|
|
|
|
|
Description: |
2258
|
|
|
|
|
|
|
|
2259
|
|
|
|
|
|
|
Parses 'Article' tag data in Medline XML file. Fetches 'ArticleTitle' and 'Abstract' XML tags. Used when 'QuickParse' member variable is enabled. |
2260
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2261
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
Input: |
2263
|
|
|
|
|
|
|
|
2264
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2265
|
|
|
|
|
|
|
$article -> Current Medline article directory in XML data (XML::Twig directory) |
2266
|
|
|
|
|
|
|
|
2267
|
|
|
|
|
|
|
Output: |
2268
|
|
|
|
|
|
|
|
2269
|
|
|
|
|
|
|
None |
2270
|
|
|
|
|
|
|
|
2271
|
|
|
|
|
|
|
Example: |
2272
|
|
|
|
|
|
|
|
2273
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2274
|
|
|
|
|
|
|
|
2275
|
|
|
|
|
|
|
=head3 _QuickParseOtherAbstract |
2276
|
|
|
|
|
|
|
|
2277
|
|
|
|
|
|
|
Description: |
2278
|
|
|
|
|
|
|
|
2279
|
|
|
|
|
|
|
Parses 'Abstract' tag data in Medline XML file. Fetches 'AbstractText' XML tag. Used when 'QuickParse' member variable is enabled. |
2280
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2281
|
|
|
|
|
|
|
|
2282
|
|
|
|
|
|
|
Input: |
2283
|
|
|
|
|
|
|
|
2284
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2285
|
|
|
|
|
|
|
$abstractRoot -> Current Medline abstract directory in XML data (XML::Twig directory) |
2286
|
|
|
|
|
|
|
|
2287
|
|
|
|
|
|
|
Output: |
2288
|
|
|
|
|
|
|
|
2289
|
|
|
|
|
|
|
None |
2290
|
|
|
|
|
|
|
|
2291
|
|
|
|
|
|
|
Example: |
2292
|
|
|
|
|
|
|
|
2293
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2294
|
|
|
|
|
|
|
|
2295
|
|
|
|
|
|
|
=head3 CreateCompoundWordBST |
2296
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
Description: |
2298
|
|
|
|
|
|
|
|
2299
|
|
|
|
|
|
|
Creates a binary search tree using compound word data in memory and stores root node. This also clears the compound word array afterwards. |
2300
|
|
|
|
|
|
|
|
2301
|
|
|
|
|
|
|
Warning: Compound word file must be loaded into memory using ReadCompoundWordDataFromFile() prior to calling this method. This function |
2302
|
|
|
|
|
|
|
will also delete the compound word array upon completion as it will no longer be necessary. |
2303
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
Input: |
2305
|
|
|
|
|
|
|
|
2306
|
|
|
|
|
|
|
None |
2307
|
|
|
|
|
|
|
|
2308
|
|
|
|
|
|
|
Output: |
2309
|
|
|
|
|
|
|
|
2310
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2311
|
|
|
|
|
|
|
|
2312
|
|
|
|
|
|
|
Example: |
2313
|
|
|
|
|
|
|
|
2314
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2315
|
|
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2317
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2318
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
2319
|
|
|
|
|
|
|
|
2320
|
|
|
|
|
|
|
=head3 CompoundifyString |
2321
|
|
|
|
|
|
|
|
2322
|
|
|
|
|
|
|
Description: |
2323
|
|
|
|
|
|
|
|
2324
|
|
|
|
|
|
|
Compoundifies string parameter based on compound word data in memory using the compound word binary search tree. |
2325
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
Warning: Compound word file must be loaded into memory using ReadCompoundWordDataFromFile() prior to calling this method. |
2327
|
|
|
|
|
|
|
|
2328
|
|
|
|
|
|
|
Input: |
2329
|
|
|
|
|
|
|
|
2330
|
|
|
|
|
|
|
$string -> String to compoundify |
2331
|
|
|
|
|
|
|
|
2332
|
|
|
|
|
|
|
Output: |
2333
|
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
$string -> Compounded string or "(null)" if string parameter is not defined. |
2335
|
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
Example: |
2337
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2339
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2341
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2342
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
2343
|
|
|
|
|
|
|
my $compoundedString = $xmlconv->CompoundifyString( "String to compoundify" ); |
2344
|
|
|
|
|
|
|
print( "Compounded String: $compoundedString\n" ); |
2345
|
|
|
|
|
|
|
|
2346
|
|
|
|
|
|
|
undef( $xmlconv ); |
2347
|
|
|
|
|
|
|
|
2348
|
|
|
|
|
|
|
=head3 _CompoundifySearch |
2349
|
|
|
|
|
|
|
|
2350
|
|
|
|
|
|
|
Description: |
2351
|
|
|
|
|
|
|
|
2352
|
|
|
|
|
|
|
Recursive method used by CompoundifyString() to fetch compound word data in binary search tree. |
2353
|
|
|
|
|
|
|
|
2354
|
|
|
|
|
|
|
Warning: This function requires specific parameters and should not be called outside of CompoundifyString() method. |
2355
|
|
|
|
|
|
|
|
2356
|
|
|
|
|
|
|
Input: |
2357
|
|
|
|
|
|
|
|
2358
|
|
|
|
|
|
|
$stringArrayRef -> Array reference containing string data |
2359
|
|
|
|
|
|
|
$oldNode -> Last 'Word2vec::Node' data match was found |
2360
|
|
|
|
|
|
|
$searchStr -> Search phrase |
2361
|
|
|
|
|
|
|
$index -> Current string array index |
2362
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
Output: |
2364
|
|
|
|
|
|
|
|
2365
|
|
|
|
|
|
|
Word2vec::Node -> Last node containing positive search phrase match |
2366
|
|
|
|
|
|
|
|
2367
|
|
|
|
|
|
|
Example: |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
Warning: This is a private function and is called by 'CompoundifyString()'. It should not be called outside of xmltow2v module. |
2370
|
|
|
|
|
|
|
|
2371
|
|
|
|
|
|
|
=head3 ReadCompoundWordDataFromFile |
2372
|
|
|
|
|
|
|
|
2373
|
|
|
|
|
|
|
Description: |
2374
|
|
|
|
|
|
|
|
2375
|
|
|
|
|
|
|
Reads compound word file and stores in memory. $autoSetMaxCompWordLength parameter is not required to be set. This |
2376
|
|
|
|
|
|
|
parameter instructs the method to auto set the maximum compound word length dependent on the longest compound word found. |
2377
|
|
|
|
|
|
|
|
2378
|
|
|
|
|
|
|
Note: $autoSetMaxCompWordLength options: defined = True and Undefined = False. |
2379
|
|
|
|
|
|
|
|
2380
|
|
|
|
|
|
|
Input: |
2381
|
|
|
|
|
|
|
|
2382
|
|
|
|
|
|
|
$filePath -> Compound word file path |
2383
|
|
|
|
|
|
|
$autoSetMaxCompWordLength -> Maximum length of a given compoundified phrase the module's compoundify algorithm will permit. |
2384
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
Note: Calling this method with $autoSetMaxCompWordLength defined will automatically set the maxCompoundWordLength variable to the longest compound phrase. |
2386
|
|
|
|
|
|
|
|
2387
|
|
|
|
|
|
|
Output: |
2388
|
|
|
|
|
|
|
|
2389
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2390
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
Example: |
2392
|
|
|
|
|
|
|
|
2393
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2394
|
|
|
|
|
|
|
|
2395
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2396
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt", 1 ); |
2397
|
|
|
|
|
|
|
|
2398
|
|
|
|
|
|
|
undef( $xmlconv ); |
2399
|
|
|
|
|
|
|
|
2400
|
|
|
|
|
|
|
=head3 SaveCompoundWordListToFile |
2401
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
Description: |
2403
|
|
|
|
|
|
|
|
2404
|
|
|
|
|
|
|
Saves compound word data in memory to a specified file location. |
2405
|
|
|
|
|
|
|
|
2406
|
|
|
|
|
|
|
Input: |
2407
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
$savePath -> Path to save compound word list to file. |
2409
|
|
|
|
|
|
|
|
2410
|
|
|
|
|
|
|
Output: |
2411
|
|
|
|
|
|
|
|
2412
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2413
|
|
|
|
|
|
|
|
2414
|
|
|
|
|
|
|
Example: |
2415
|
|
|
|
|
|
|
|
2416
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2417
|
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2419
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2420
|
|
|
|
|
|
|
$xmlconv->SaveCompoundWordListToFile( "samples/newcompoundword.txt" ); |
2421
|
|
|
|
|
|
|
undef( $xmlconv ); |
2422
|
|
|
|
|
|
|
|
2423
|
|
|
|
|
|
|
=head3 ReadTextFromFile |
2424
|
|
|
|
|
|
|
|
2425
|
|
|
|
|
|
|
Description: |
2426
|
|
|
|
|
|
|
|
2427
|
|
|
|
|
|
|
Reads a plain text file with utf8 encoding into memory. Returns string data if successful and "(null)" if unsuccessful. |
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
Input: |
2430
|
|
|
|
|
|
|
|
2431
|
|
|
|
|
|
|
$filePath -> Text file to read into memory |
2432
|
|
|
|
|
|
|
|
2433
|
|
|
|
|
|
|
Output: |
2434
|
|
|
|
|
|
|
|
2435
|
|
|
|
|
|
|
$string -> String data if successful or "(null)" if un-successful. |
2436
|
|
|
|
|
|
|
|
2437
|
|
|
|
|
|
|
Example: |
2438
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2440
|
|
|
|
|
|
|
|
2441
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2442
|
|
|
|
|
|
|
my $textData = $xmlconv->ReadTextFromFile( "samples/textcorpus.txt" ); |
2443
|
|
|
|
|
|
|
print( "Text Data: $textData\n" ); |
2444
|
|
|
|
|
|
|
undef( $xmlconv ); |
2445
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
=head3 SaveTextToFile |
2447
|
|
|
|
|
|
|
|
2448
|
|
|
|
|
|
|
Description: |
2449
|
|
|
|
|
|
|
|
2450
|
|
|
|
|
|
|
Saves a plain text file with utf8 encoding in a specified location. |
2451
|
|
|
|
|
|
|
|
2452
|
|
|
|
|
|
|
Input: |
2453
|
|
|
|
|
|
|
|
2454
|
|
|
|
|
|
|
$savePath -> Path to save string data. |
2455
|
|
|
|
|
|
|
$string -> String to save |
2456
|
|
|
|
|
|
|
|
2457
|
|
|
|
|
|
|
Output: |
2458
|
|
|
|
|
|
|
|
2459
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2460
|
|
|
|
|
|
|
|
2461
|
|
|
|
|
|
|
Example: |
2462
|
|
|
|
|
|
|
|
2463
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2464
|
|
|
|
|
|
|
|
2465
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2466
|
|
|
|
|
|
|
my $result = $xmlconv->SaveTextToFile( "text.txt", "Hello world!" ); |
2467
|
|
|
|
|
|
|
|
2468
|
|
|
|
|
|
|
print( "File saved\n" ) if $result == 0; |
2469
|
|
|
|
|
|
|
print( "File unable to save\n" ) if $result == -1; |
2470
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
undef( $xmlconv ); |
2472
|
|
|
|
|
|
|
|
2473
|
|
|
|
|
|
|
=head3 _ReadXMLDataFromFile |
2474
|
|
|
|
|
|
|
|
2475
|
|
|
|
|
|
|
Description: |
2476
|
|
|
|
|
|
|
|
2477
|
|
|
|
|
|
|
Reads an XML file from a specified location. Returns string in memory if successful and "(null)" if unsuccessful. |
2478
|
|
|
|
|
|
|
|
2479
|
|
|
|
|
|
|
Input: |
2480
|
|
|
|
|
|
|
|
2481
|
|
|
|
|
|
|
$filePath -> File to read given path |
2482
|
|
|
|
|
|
|
|
2483
|
|
|
|
|
|
|
Output: |
2484
|
|
|
|
|
|
|
|
2485
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2486
|
|
|
|
|
|
|
|
2487
|
|
|
|
|
|
|
Example: |
2488
|
|
|
|
|
|
|
|
2489
|
|
|
|
|
|
|
Warning: This is a private function and is called by XML::Twig parsing functions. It should not be called outside of xmltow2v module. |
2490
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
=head3 _SaveTextCorpusToFile |
2492
|
|
|
|
|
|
|
|
2493
|
|
|
|
|
|
|
Description: |
2494
|
|
|
|
|
|
|
|
2495
|
|
|
|
|
|
|
Saves text corpus data to specified file path. This method will append to any existing file if $appendToFile parameter |
2496
|
|
|
|
|
|
|
is defined or "overwrite" option is disabled. Enabling "overwrite" option will overwrite any existing files. |
2497
|
|
|
|
|
|
|
|
2498
|
|
|
|
|
|
|
Input: |
2499
|
|
|
|
|
|
|
|
2500
|
|
|
|
|
|
|
$savePath -> Path to save the text corpus |
2501
|
|
|
|
|
|
|
$appendToFile -> Specifies whether the module will overwrite any existing data or append to existing text corpus data. |
2502
|
|
|
|
|
|
|
|
2503
|
|
|
|
|
|
|
Note: Leaving this variable undefined will fetch the "Overwrite" member variable and set the value to this parameter. |
2504
|
|
|
|
|
|
|
|
2505
|
|
|
|
|
|
|
Output: |
2506
|
|
|
|
|
|
|
|
2507
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2508
|
|
|
|
|
|
|
|
2509
|
|
|
|
|
|
|
Example: |
2510
|
|
|
|
|
|
|
|
2511
|
|
|
|
|
|
|
Warning: This is a private function and is called by XML::Twig parsing functions. It should not be called outside of xmltow2v module. |
2512
|
|
|
|
|
|
|
|
2513
|
|
|
|
|
|
|
=head3 IsDateInSpecifiedRange |
2514
|
|
|
|
|
|
|
|
2515
|
|
|
|
|
|
|
Description: |
2516
|
|
|
|
|
|
|
|
2517
|
|
|
|
|
|
|
Checks to see if $date is within $beginDate and $endDate range. Returns 1 if true and 0 if false. |
2518
|
|
|
|
|
|
|
|
2519
|
|
|
|
|
|
|
Note: Date Format: XX/XX/XXXX (Month/Day/Year) |
2520
|
|
|
|
|
|
|
|
2521
|
|
|
|
|
|
|
Input: |
2522
|
|
|
|
|
|
|
|
2523
|
|
|
|
|
|
|
$date -> Date to check against minimum and maximum data range. (String) |
2524
|
|
|
|
|
|
|
$beginDate -> Minimum date range (String) |
2525
|
|
|
|
|
|
|
$endDate -> Maximum date range (String) |
2526
|
|
|
|
|
|
|
|
2527
|
|
|
|
|
|
|
Output: |
2528
|
|
|
|
|
|
|
|
2529
|
|
|
|
|
|
|
$value -> '1' = True/Date is within specified range Or '0' = False/Date is not within specified range. |
2530
|
|
|
|
|
|
|
|
2531
|
|
|
|
|
|
|
Example: |
2532
|
|
|
|
|
|
|
|
2533
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2534
|
|
|
|
|
|
|
|
2535
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2536
|
|
|
|
|
|
|
print( "Is \"01/01/2004\" within the date range: \"02/21/1985\" to \"08/13/2016\"?\n" ); |
2537
|
|
|
|
|
|
|
print( "Yes\n" ) if $xmlconv->IsDateInSpecifiedRange( "01/01/2004", "02/21/1985", "08/13/2016" ) == 1; |
2538
|
|
|
|
|
|
|
print( "No\n" ) if $xmlconv->IsDateInSpecifiedRange( "01/01/2004", "02/21/1985", "08/13/2016" ) == 0; |
2539
|
|
|
|
|
|
|
|
2540
|
|
|
|
|
|
|
undef( $xmlconv ); |
2541
|
|
|
|
|
|
|
|
2542
|
|
|
|
|
|
|
=head3 IsFileOrDirectory |
2543
|
|
|
|
|
|
|
|
2544
|
|
|
|
|
|
|
Description: |
2545
|
|
|
|
|
|
|
|
2546
|
|
|
|
|
|
|
Checks to see if specified path is a file or directory. |
2547
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
Input: |
2549
|
|
|
|
|
|
|
|
2550
|
|
|
|
|
|
|
$path -> File or directory path. (String) |
2551
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
Output: |
2553
|
|
|
|
|
|
|
|
2554
|
|
|
|
|
|
|
$string -> Returns: "file" = file, "dir" = directory and "unknown" if the path is not a file or directory (undefined). |
2555
|
|
|
|
|
|
|
|
2556
|
|
|
|
|
|
|
Example: |
2557
|
|
|
|
|
|
|
|
2558
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2559
|
|
|
|
|
|
|
|
2560
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2561
|
|
|
|
|
|
|
my $path = "path/to/a/directory"; |
2562
|
|
|
|
|
|
|
|
2563
|
|
|
|
|
|
|
print( "Is \"$path\" a file or directory? " . $xmlconv->IsFileOrDirectory( $path ) . "\n" ); |
2564
|
|
|
|
|
|
|
|
2565
|
|
|
|
|
|
|
$path = "path/to/a/file.file"; |
2566
|
|
|
|
|
|
|
|
2567
|
|
|
|
|
|
|
print( "Is \"$path\" a file or directory? " . $xmlconv->IsFileOrDirectory( $path ) . "\n" ); |
2568
|
|
|
|
|
|
|
|
2569
|
|
|
|
|
|
|
undef( $xmlconv ); |
2570
|
|
|
|
|
|
|
|
2571
|
|
|
|
|
|
|
=head3 RemoveSpecialCharactersFromString |
2572
|
|
|
|
|
|
|
|
2573
|
|
|
|
|
|
|
Description: |
2574
|
|
|
|
|
|
|
|
2575
|
|
|
|
|
|
|
Removes special characters from string parameter, removes extra spaces and converts text to lowercase. |
2576
|
|
|
|
|
|
|
|
2577
|
|
|
|
|
|
|
Note: This method is called when parsing and compiling Medline title/abstract data. |
2578
|
|
|
|
|
|
|
|
2579
|
|
|
|
|
|
|
Input: |
2580
|
|
|
|
|
|
|
|
2581
|
|
|
|
|
|
|
$string -> String passed to remove special characters from and convert to lowercase. |
2582
|
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
Output: |
2584
|
|
|
|
|
|
|
|
2585
|
|
|
|
|
|
|
$string -> String with all special characters removed and converted to lowercase. |
2586
|
|
|
|
|
|
|
|
2587
|
|
|
|
|
|
|
Example: |
2588
|
|
|
|
|
|
|
|
2589
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2590
|
|
|
|
|
|
|
|
2591
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2592
|
|
|
|
|
|
|
|
2593
|
|
|
|
|
|
|
my $str = "Heart Attack is$ an!@ also KNOWN as an Acute MYOCARDIAL inFARCTion!"; |
2594
|
|
|
|
|
|
|
|
2595
|
|
|
|
|
|
|
print( "Original String: $str\n" ); |
2596
|
|
|
|
|
|
|
|
2597
|
|
|
|
|
|
|
$str = $xmlconv->RemoveSpecialCharactersFromString( $str ); |
2598
|
|
|
|
|
|
|
|
2599
|
|
|
|
|
|
|
print( "Modified String: $str\n" ); |
2600
|
|
|
|
|
|
|
|
2601
|
|
|
|
|
|
|
undef( $xmlconv ); |
2602
|
|
|
|
|
|
|
|
2603
|
|
|
|
|
|
|
=head3 GetFileType |
2604
|
|
|
|
|
|
|
|
2605
|
|
|
|
|
|
|
Description: |
2606
|
|
|
|
|
|
|
|
2607
|
|
|
|
|
|
|
Returns file data type (string). |
2608
|
|
|
|
|
|
|
|
2609
|
|
|
|
|
|
|
Input: |
2610
|
|
|
|
|
|
|
|
2611
|
|
|
|
|
|
|
$filePath -> File to check located at file path |
2612
|
|
|
|
|
|
|
|
2613
|
|
|
|
|
|
|
Output: |
2614
|
|
|
|
|
|
|
|
2615
|
|
|
|
|
|
|
$string -> File type |
2616
|
|
|
|
|
|
|
|
2617
|
|
|
|
|
|
|
Example: |
2618
|
|
|
|
|
|
|
|
2619
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2620
|
|
|
|
|
|
|
|
2621
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2622
|
|
|
|
|
|
|
my $fileType = $xmlconv->GetFileType( "samples/textcorpus.txt" ); |
2623
|
|
|
|
|
|
|
|
2624
|
|
|
|
|
|
|
undef( $xmlconv ); |
2625
|
|
|
|
|
|
|
|
2626
|
|
|
|
|
|
|
=head3 _DateCheck |
2627
|
|
|
|
|
|
|
|
2628
|
|
|
|
|
|
|
Description: |
2629
|
|
|
|
|
|
|
|
2630
|
|
|
|
|
|
|
Checks specified begin and end date strings for formatting and logic errors. |
2631
|
|
|
|
|
|
|
|
2632
|
|
|
|
|
|
|
Input: |
2633
|
|
|
|
|
|
|
|
2634
|
|
|
|
|
|
|
None |
2635
|
|
|
|
|
|
|
|
2636
|
|
|
|
|
|
|
Output: |
2637
|
|
|
|
|
|
|
|
2638
|
|
|
|
|
|
|
$value -> "0" = Passed Checks / "-1" = Failed Checks |
2639
|
|
|
|
|
|
|
|
2640
|
|
|
|
|
|
|
Example: |
2641
|
|
|
|
|
|
|
|
2642
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2643
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2645
|
|
|
|
|
|
|
print "Passed Date Checks\n" if ( $xmlconv->_DateCheck() == 0 ); |
2646
|
|
|
|
|
|
|
print "Failed Date Checks\n" if ( $xmlconv->_DateCheck() == -1 ); |
2647
|
|
|
|
|
|
|
|
2648
|
|
|
|
|
|
|
undef( $xmlconv ); |
2649
|
|
|
|
|
|
|
|
2650
|
|
|
|
|
|
|
=head2 Accessor Functions |
2651
|
|
|
|
|
|
|
|
2652
|
|
|
|
|
|
|
=head3 GetDebugLog |
2653
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
Description: |
2655
|
|
|
|
|
|
|
|
2656
|
|
|
|
|
|
|
Returns the _debugLog member variable set during Word2vec::Xmltow2v object initialization in the new() function. |
2657
|
|
|
|
|
|
|
|
2658
|
|
|
|
|
|
|
Input: |
2659
|
|
|
|
|
|
|
|
2660
|
|
|
|
|
|
|
None |
2661
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
Output: |
2663
|
|
|
|
|
|
|
|
2664
|
|
|
|
|
|
|
$value -> '0' = False, '1' = True |
2665
|
|
|
|
|
|
|
|
2666
|
|
|
|
|
|
|
Example: |
2667
|
|
|
|
|
|
|
|
2668
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2669
|
|
|
|
|
|
|
|
2670
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2671
|
|
|
|
|
|
|
my $debugLog = $xmlconv->GetDebugLog(); |
2672
|
|
|
|
|
|
|
|
2673
|
|
|
|
|
|
|
print( "Debug Logging Enabled\n" ) if $debugLog == 1; |
2674
|
|
|
|
|
|
|
print( "Debug Logging Disabled\n" ) if $debugLog == 0; |
2675
|
|
|
|
|
|
|
|
2676
|
|
|
|
|
|
|
|
2677
|
|
|
|
|
|
|
undef( $xmlconv ); |
2678
|
|
|
|
|
|
|
|
2679
|
|
|
|
|
|
|
=head3 GetWriteLog |
2680
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
Description: |
2682
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
Returns the _writeLog member variable set during Word2vec::Xmltow2v object initialization in the new() function. |
2684
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
Input: |
2686
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
None |
2688
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
Output: |
2690
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
$value -> '0' = False, '1' = True |
2692
|
|
|
|
|
|
|
|
2693
|
|
|
|
|
|
|
Example: |
2694
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2696
|
|
|
|
|
|
|
|
2697
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2698
|
|
|
|
|
|
|
my $writeLog = $xmlconv->GetWriteLog(); |
2699
|
|
|
|
|
|
|
|
2700
|
|
|
|
|
|
|
print( "Write Logging Enabled\n" ) if $writeLog == 1; |
2701
|
|
|
|
|
|
|
print( "Write Logging Disabled\n" ) if $writeLog == 0; |
2702
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
undef( $xmlconv ); |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
=head3 GetStoreTitle |
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
Description: |
2708
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
Returns the _storeTitle member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2710
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
Input: |
2712
|
|
|
|
|
|
|
|
2713
|
|
|
|
|
|
|
None |
2714
|
|
|
|
|
|
|
|
2715
|
|
|
|
|
|
|
Output: |
2716
|
|
|
|
|
|
|
|
2717
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2718
|
|
|
|
|
|
|
|
2719
|
|
|
|
|
|
|
Example: |
2720
|
|
|
|
|
|
|
|
2721
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2722
|
|
|
|
|
|
|
|
2723
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2724
|
|
|
|
|
|
|
my $storeTitle = $xmlconv->GetStoreTitle(); |
2725
|
|
|
|
|
|
|
|
2726
|
|
|
|
|
|
|
print( "Store Title Option: Enabled\n" ) if $storeTitle == 1; |
2727
|
|
|
|
|
|
|
print( "Store Title Option: Disabled\n" ) if $storeTitle == 0; |
2728
|
|
|
|
|
|
|
|
2729
|
|
|
|
|
|
|
undef( $xmlconv ); |
2730
|
|
|
|
|
|
|
|
2731
|
|
|
|
|
|
|
=head3 GetStoreAbstract |
2732
|
|
|
|
|
|
|
|
2733
|
|
|
|
|
|
|
Description: |
2734
|
|
|
|
|
|
|
|
2735
|
|
|
|
|
|
|
Returns the _storeAbstract member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2736
|
|
|
|
|
|
|
|
2737
|
|
|
|
|
|
|
Input: |
2738
|
|
|
|
|
|
|
|
2739
|
|
|
|
|
|
|
None |
2740
|
|
|
|
|
|
|
|
2741
|
|
|
|
|
|
|
Output: |
2742
|
|
|
|
|
|
|
|
2743
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2744
|
|
|
|
|
|
|
|
2745
|
|
|
|
|
|
|
Example: |
2746
|
|
|
|
|
|
|
|
2747
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2748
|
|
|
|
|
|
|
|
2749
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2750
|
|
|
|
|
|
|
my $storeAbstract = $xmlconv->GetStoreAbstract(); |
2751
|
|
|
|
|
|
|
|
2752
|
|
|
|
|
|
|
print( "Store Abstract Option: Enabled\n" ) if $storeAbstract == 1; |
2753
|
|
|
|
|
|
|
print( "Store Abstract Option: Disabled\n" ) if $storeAbstract == 0; |
2754
|
|
|
|
|
|
|
|
2755
|
|
|
|
|
|
|
undef( $xmlconv ); |
2756
|
|
|
|
|
|
|
|
2757
|
|
|
|
|
|
|
=head3 GetQuickParse |
2758
|
|
|
|
|
|
|
|
2759
|
|
|
|
|
|
|
Description: |
2760
|
|
|
|
|
|
|
|
2761
|
|
|
|
|
|
|
Returns the _quickParse member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2762
|
|
|
|
|
|
|
|
2763
|
|
|
|
|
|
|
Input: |
2764
|
|
|
|
|
|
|
|
2765
|
|
|
|
|
|
|
None |
2766
|
|
|
|
|
|
|
|
2767
|
|
|
|
|
|
|
Output: |
2768
|
|
|
|
|
|
|
|
2769
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2770
|
|
|
|
|
|
|
|
2771
|
|
|
|
|
|
|
Example: |
2772
|
|
|
|
|
|
|
|
2773
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2774
|
|
|
|
|
|
|
|
2775
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2776
|
|
|
|
|
|
|
my $quickParse = $xmlconv->GetQuickParse(); |
2777
|
|
|
|
|
|
|
|
2778
|
|
|
|
|
|
|
print( "Quick Parse Option: Enabled\n" ) if $quickParse == 1; |
2779
|
|
|
|
|
|
|
print( "Quick Parse Option: Disabled\n" ) if $quickParse == 0; |
2780
|
|
|
|
|
|
|
|
2781
|
|
|
|
|
|
|
undef( $xmlconv ); |
2782
|
|
|
|
|
|
|
|
2783
|
|
|
|
|
|
|
=head3 GetCompoundifyText |
2784
|
|
|
|
|
|
|
|
2785
|
|
|
|
|
|
|
Description: |
2786
|
|
|
|
|
|
|
|
2787
|
|
|
|
|
|
|
Returns the _compoundifyText member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2788
|
|
|
|
|
|
|
|
2789
|
|
|
|
|
|
|
Input: |
2790
|
|
|
|
|
|
|
|
2791
|
|
|
|
|
|
|
None |
2792
|
|
|
|
|
|
|
|
2793
|
|
|
|
|
|
|
Output: |
2794
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2796
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
Example: |
2798
|
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2800
|
|
|
|
|
|
|
|
2801
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2802
|
|
|
|
|
|
|
my $compoundify = $xmlconv->GetCompoundifyText(); |
2803
|
|
|
|
|
|
|
|
2804
|
|
|
|
|
|
|
print( "Compoundify Text Option: Enabled\n" ) if $compoundify == 1; |
2805
|
|
|
|
|
|
|
print( "Compoundify Text Option: Disabled\n" ) if $compoundify == 0; |
2806
|
|
|
|
|
|
|
|
2807
|
|
|
|
|
|
|
undef( $xmlconv ); |
2808
|
|
|
|
|
|
|
|
2809
|
|
|
|
|
|
|
=head3 GetNumOfThreads |
2810
|
|
|
|
|
|
|
|
2811
|
|
|
|
|
|
|
Description: |
2812
|
|
|
|
|
|
|
|
2813
|
|
|
|
|
|
|
Returns the _numOfThreads member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2814
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
Input: |
2816
|
|
|
|
|
|
|
|
2817
|
|
|
|
|
|
|
None |
2818
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
Output: |
2820
|
|
|
|
|
|
|
|
2821
|
|
|
|
|
|
|
$value -> Number of threads |
2822
|
|
|
|
|
|
|
|
2823
|
|
|
|
|
|
|
Example: |
2824
|
|
|
|
|
|
|
|
2825
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2826
|
|
|
|
|
|
|
|
2827
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2828
|
|
|
|
|
|
|
my $numOfThreads = $xmlconv->GetNumOfThreads(); |
2829
|
|
|
|
|
|
|
|
2830
|
|
|
|
|
|
|
print( "Number of threads: $numOfThreads\n" ); |
2831
|
|
|
|
|
|
|
|
2832
|
|
|
|
|
|
|
undef( $xmlconv ); |
2833
|
|
|
|
|
|
|
|
2834
|
|
|
|
|
|
|
=head3 GetWorkingDir |
2835
|
|
|
|
|
|
|
|
2836
|
|
|
|
|
|
|
Description: |
2837
|
|
|
|
|
|
|
|
2838
|
|
|
|
|
|
|
Returns the _workingDir member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2839
|
|
|
|
|
|
|
|
2840
|
|
|
|
|
|
|
Input: |
2841
|
|
|
|
|
|
|
|
2842
|
|
|
|
|
|
|
None |
2843
|
|
|
|
|
|
|
|
2844
|
|
|
|
|
|
|
Output: |
2845
|
|
|
|
|
|
|
|
2846
|
|
|
|
|
|
|
$string -> Working directory string |
2847
|
|
|
|
|
|
|
|
2848
|
|
|
|
|
|
|
Example: |
2849
|
|
|
|
|
|
|
|
2850
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2851
|
|
|
|
|
|
|
|
2852
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2853
|
|
|
|
|
|
|
my $workingDirectory = $xmlconv->GetWorkingDir(); |
2854
|
|
|
|
|
|
|
|
2855
|
|
|
|
|
|
|
print( "Working Directory: $workingDirectory\n" ); |
2856
|
|
|
|
|
|
|
|
2857
|
|
|
|
|
|
|
undef( $xmlconv ); |
2858
|
|
|
|
|
|
|
|
2859
|
|
|
|
|
|
|
=head3 GetSavePath |
2860
|
|
|
|
|
|
|
|
2861
|
|
|
|
|
|
|
Description: |
2862
|
|
|
|
|
|
|
|
2863
|
|
|
|
|
|
|
Returns the _saveDir member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2864
|
|
|
|
|
|
|
|
2865
|
|
|
|
|
|
|
Input: |
2866
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
None |
2868
|
|
|
|
|
|
|
|
2869
|
|
|
|
|
|
|
Output: |
2870
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
$string -> Save directory string |
2872
|
|
|
|
|
|
|
|
2873
|
|
|
|
|
|
|
Example: |
2874
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2876
|
|
|
|
|
|
|
|
2877
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2878
|
|
|
|
|
|
|
my $savePath = $xmlconv->GetSavePath(); |
2879
|
|
|
|
|
|
|
|
2880
|
|
|
|
|
|
|
print( "Save Directory: $savePath\n" ); |
2881
|
|
|
|
|
|
|
|
2882
|
|
|
|
|
|
|
undef( $xmlconv ); |
2883
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
=head3 GetBeginDate |
2885
|
|
|
|
|
|
|
|
2886
|
|
|
|
|
|
|
Description: |
2887
|
|
|
|
|
|
|
|
2888
|
|
|
|
|
|
|
Returns the _beginDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2889
|
|
|
|
|
|
|
|
2890
|
|
|
|
|
|
|
Input: |
2891
|
|
|
|
|
|
|
|
2892
|
|
|
|
|
|
|
None |
2893
|
|
|
|
|
|
|
|
2894
|
|
|
|
|
|
|
Output: |
2895
|
|
|
|
|
|
|
|
2896
|
|
|
|
|
|
|
$date -> Beginning date range - Format: XX/XX/XXXX (Mon/Day/Year) |
2897
|
|
|
|
|
|
|
|
2898
|
|
|
|
|
|
|
Example: |
2899
|
|
|
|
|
|
|
|
2900
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2901
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2903
|
|
|
|
|
|
|
my $date = $xmlconv->GetBeginDate(); |
2904
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
print( "Date: $date\n" ); |
2906
|
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
|
undef( $xmlconv ); |
2908
|
|
|
|
|
|
|
|
2909
|
|
|
|
|
|
|
=head3 GetEndDate |
2910
|
|
|
|
|
|
|
|
2911
|
|
|
|
|
|
|
Description: |
2912
|
|
|
|
|
|
|
|
2913
|
|
|
|
|
|
|
Returns the _endDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2914
|
|
|
|
|
|
|
|
2915
|
|
|
|
|
|
|
Input: |
2916
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
None |
2918
|
|
|
|
|
|
|
|
2919
|
|
|
|
|
|
|
Output: |
2920
|
|
|
|
|
|
|
|
2921
|
|
|
|
|
|
|
$date -> End date range - Format: XX/XX/XXXX (Mon/Day/Year). |
2922
|
|
|
|
|
|
|
|
2923
|
|
|
|
|
|
|
Example: |
2924
|
|
|
|
|
|
|
|
2925
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2926
|
|
|
|
|
|
|
|
2927
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2928
|
|
|
|
|
|
|
my $date = $xmlconv->GetEndDate(); |
2929
|
|
|
|
|
|
|
|
2930
|
|
|
|
|
|
|
print( "Date: $date\n" ); |
2931
|
|
|
|
|
|
|
|
2932
|
|
|
|
|
|
|
undef( $xmlconv ); |
2933
|
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
|
=head3 GetXMLStringToParse |
2935
|
|
|
|
|
|
|
|
2936
|
|
|
|
|
|
|
Returns the XML data (string) to be parsed. |
2937
|
|
|
|
|
|
|
|
2938
|
|
|
|
|
|
|
Description: |
2939
|
|
|
|
|
|
|
|
2940
|
|
|
|
|
|
|
Returns the _xmlStringToParse member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2941
|
|
|
|
|
|
|
|
2942
|
|
|
|
|
|
|
Input: |
2943
|
|
|
|
|
|
|
|
2944
|
|
|
|
|
|
|
None |
2945
|
|
|
|
|
|
|
|
2946
|
|
|
|
|
|
|
Output: |
2947
|
|
|
|
|
|
|
|
2948
|
|
|
|
|
|
|
$string -> Medline XML data string |
2949
|
|
|
|
|
|
|
|
2950
|
|
|
|
|
|
|
Example: |
2951
|
|
|
|
|
|
|
|
2952
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2953
|
|
|
|
|
|
|
|
2954
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2955
|
|
|
|
|
|
|
my $xmlStr = $xmlconv->GetXMLStringToParse(); |
2956
|
|
|
|
|
|
|
|
2957
|
|
|
|
|
|
|
print( "XML String: $xmlStr\n" ); |
2958
|
|
|
|
|
|
|
|
2959
|
|
|
|
|
|
|
undef( $xmlconv ); |
2960
|
|
|
|
|
|
|
|
2961
|
|
|
|
|
|
|
=head3 GetTextCorpusStr |
2962
|
|
|
|
|
|
|
|
2963
|
|
|
|
|
|
|
Description: |
2964
|
|
|
|
|
|
|
|
2965
|
|
|
|
|
|
|
Returns the _textCorpusStr member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2966
|
|
|
|
|
|
|
|
2967
|
|
|
|
|
|
|
Input: |
2968
|
|
|
|
|
|
|
|
2969
|
|
|
|
|
|
|
None |
2970
|
|
|
|
|
|
|
|
2971
|
|
|
|
|
|
|
Output: |
2972
|
|
|
|
|
|
|
|
2973
|
|
|
|
|
|
|
$string -> Text corpus string |
2974
|
|
|
|
|
|
|
|
2975
|
|
|
|
|
|
|
Example: |
2976
|
|
|
|
|
|
|
|
2977
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2978
|
|
|
|
|
|
|
|
2979
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2980
|
|
|
|
|
|
|
my $str = $xmlconv->GetTextCorpusStr(); |
2981
|
|
|
|
|
|
|
|
2982
|
|
|
|
|
|
|
print( "Text Corpus: $str\n" ); |
2983
|
|
|
|
|
|
|
|
2984
|
|
|
|
|
|
|
undef( $xmlconv ); |
2985
|
|
|
|
|
|
|
|
2986
|
|
|
|
|
|
|
=head3 GetFileHandle |
2987
|
|
|
|
|
|
|
|
2988
|
|
|
|
|
|
|
Description: |
2989
|
|
|
|
|
|
|
|
2990
|
|
|
|
|
|
|
Returns the _fileHandle member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2991
|
|
|
|
|
|
|
|
2992
|
|
|
|
|
|
|
Warning: This is a private function. File handle is used by WriteLog() method. Do not manipulate this file handle as errors can result. |
2993
|
|
|
|
|
|
|
|
2994
|
|
|
|
|
|
|
Input: |
2995
|
|
|
|
|
|
|
|
2996
|
|
|
|
|
|
|
None |
2997
|
|
|
|
|
|
|
|
2998
|
|
|
|
|
|
|
Output: |
2999
|
|
|
|
|
|
|
|
3000
|
|
|
|
|
|
|
$fileHandle -> Returns file handle for WriteLog() method. |
3001
|
|
|
|
|
|
|
|
3002
|
|
|
|
|
|
|
Example: |
3003
|
|
|
|
|
|
|
|
3004
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3005
|
|
|
|
|
|
|
|
3006
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3007
|
|
|
|
|
|
|
my $fileHandle = $xmlconv->GetFileHandle(); |
3008
|
|
|
|
|
|
|
|
3009
|
|
|
|
|
|
|
undef( $xmlconv ); |
3010
|
|
|
|
|
|
|
|
3011
|
|
|
|
|
|
|
=head3 GetTwigHandler |
3012
|
|
|
|
|
|
|
|
3013
|
|
|
|
|
|
|
Returns XML::Twig handler. |
3014
|
|
|
|
|
|
|
|
3015
|
|
|
|
|
|
|
Description: |
3016
|
|
|
|
|
|
|
|
3017
|
|
|
|
|
|
|
Returns the _twigHandler member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3018
|
|
|
|
|
|
|
|
3019
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3020
|
|
|
|
|
|
|
|
3021
|
|
|
|
|
|
|
Input: |
3022
|
|
|
|
|
|
|
|
3023
|
|
|
|
|
|
|
None |
3024
|
|
|
|
|
|
|
|
3025
|
|
|
|
|
|
|
Output: |
3026
|
|
|
|
|
|
|
|
3027
|
|
|
|
|
|
|
$twigHandler -> XML::Twig handler. |
3028
|
|
|
|
|
|
|
|
3029
|
|
|
|
|
|
|
Example: |
3030
|
|
|
|
|
|
|
|
3031
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3032
|
|
|
|
|
|
|
|
3033
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3034
|
|
|
|
|
|
|
my $xmlHandler = $xmlconv->GetTwigHandler(); |
3035
|
|
|
|
|
|
|
|
3036
|
|
|
|
|
|
|
undef( $xmlconv ); |
3037
|
|
|
|
|
|
|
|
3038
|
|
|
|
|
|
|
=head3 GetParsedCount |
3039
|
|
|
|
|
|
|
|
3040
|
|
|
|
|
|
|
Description: |
3041
|
|
|
|
|
|
|
|
3042
|
|
|
|
|
|
|
Returns the _parsedCount member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3043
|
|
|
|
|
|
|
|
3044
|
|
|
|
|
|
|
Input: |
3045
|
|
|
|
|
|
|
|
3046
|
|
|
|
|
|
|
None |
3047
|
|
|
|
|
|
|
|
3048
|
|
|
|
|
|
|
Output: |
3049
|
|
|
|
|
|
|
|
3050
|
|
|
|
|
|
|
$value -> Number of parsed Medline articles. |
3051
|
|
|
|
|
|
|
|
3052
|
|
|
|
|
|
|
Example: |
3053
|
|
|
|
|
|
|
|
3054
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3055
|
|
|
|
|
|
|
|
3056
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3057
|
|
|
|
|
|
|
my $numOfParsed = $xmlconv->GetParsedCount(); |
3058
|
|
|
|
|
|
|
|
3059
|
|
|
|
|
|
|
print( "Number of parsed Medline articles: $numOfParsed\n" ); |
3060
|
|
|
|
|
|
|
|
3061
|
|
|
|
|
|
|
undef( $xmlconv ); |
3062
|
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
=head3 GetTempStr |
3064
|
|
|
|
|
|
|
|
3065
|
|
|
|
|
|
|
Description: |
3066
|
|
|
|
|
|
|
|
3067
|
|
|
|
|
|
|
Returns the _tempStr member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3068
|
|
|
|
|
|
|
|
3069
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. Used by module as a temporary storage |
3070
|
|
|
|
|
|
|
location for parsed Medline 'Title' and 'Abstract' flag string data. |
3071
|
|
|
|
|
|
|
|
3072
|
|
|
|
|
|
|
Input: |
3073
|
|
|
|
|
|
|
|
3074
|
|
|
|
|
|
|
None |
3075
|
|
|
|
|
|
|
|
3076
|
|
|
|
|
|
|
Output: |
3077
|
|
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
$string -> Temporary string storage location. |
3079
|
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
|
Example: |
3081
|
|
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3083
|
|
|
|
|
|
|
|
3084
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3085
|
|
|
|
|
|
|
my $tempStr = $xmlconv->GetTempStr(); |
3086
|
|
|
|
|
|
|
|
3087
|
|
|
|
|
|
|
print( "Temp String: $tempStr\n" ); |
3088
|
|
|
|
|
|
|
|
3089
|
|
|
|
|
|
|
undef( $xmlconv ); |
3090
|
|
|
|
|
|
|
|
3091
|
|
|
|
|
|
|
=head3 GetTempDate |
3092
|
|
|
|
|
|
|
|
3093
|
|
|
|
|
|
|
Description: |
3094
|
|
|
|
|
|
|
|
3095
|
|
|
|
|
|
|
Returns the _tempDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3096
|
|
|
|
|
|
|
Used by module as a temporary storage location for parsed Medline 'DateCreated' flag string data. |
3097
|
|
|
|
|
|
|
|
3098
|
|
|
|
|
|
|
Input: |
3099
|
|
|
|
|
|
|
|
3100
|
|
|
|
|
|
|
None |
3101
|
|
|
|
|
|
|
|
3102
|
|
|
|
|
|
|
Output: |
3103
|
|
|
|
|
|
|
|
3104
|
|
|
|
|
|
|
$date -> Date string - Format: XX/XX/XXXX (Mon/Day/Year). |
3105
|
|
|
|
|
|
|
|
3106
|
|
|
|
|
|
|
Example: |
3107
|
|
|
|
|
|
|
|
3108
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3109
|
|
|
|
|
|
|
|
3110
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3111
|
|
|
|
|
|
|
my $date = $xmlconv->GetTempDate(); |
3112
|
|
|
|
|
|
|
|
3113
|
|
|
|
|
|
|
print( "Temp Date: $date\n" ); |
3114
|
|
|
|
|
|
|
|
3115
|
|
|
|
|
|
|
undef( $xmlconv ); |
3116
|
|
|
|
|
|
|
|
3117
|
|
|
|
|
|
|
=head3 GetCompoundWordAry |
3118
|
|
|
|
|
|
|
|
3119
|
|
|
|
|
|
|
Description: |
3120
|
|
|
|
|
|
|
|
3121
|
|
|
|
|
|
|
Returns the _compoundWordAry member array reference set during Word2vec::Xmltow2v object instantiation of new function. |
3122
|
|
|
|
|
|
|
|
3123
|
|
|
|
|
|
|
Warning: Compound word data must be loaded in memory first via ReadCompoundWordDataFromFile(). |
3124
|
|
|
|
|
|
|
|
3125
|
|
|
|
|
|
|
Input: |
3126
|
|
|
|
|
|
|
|
3127
|
|
|
|
|
|
|
None |
3128
|
|
|
|
|
|
|
|
3129
|
|
|
|
|
|
|
Output: |
3130
|
|
|
|
|
|
|
|
3131
|
|
|
|
|
|
|
$arrayReference -> Compound word array reference. |
3132
|
|
|
|
|
|
|
|
3133
|
|
|
|
|
|
|
Example: |
3134
|
|
|
|
|
|
|
|
3135
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3136
|
|
|
|
|
|
|
|
3137
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3138
|
|
|
|
|
|
|
my $arrayReference = $xmlconv->GetCompoundWordAry(); |
3139
|
|
|
|
|
|
|
my @compoundWord = @{ $arrayReference }; |
3140
|
|
|
|
|
|
|
|
3141
|
|
|
|
|
|
|
print( "Compound Word Array: @compoundWord\n" ); |
3142
|
|
|
|
|
|
|
|
3143
|
|
|
|
|
|
|
undef( $xmlconv ); |
3144
|
|
|
|
|
|
|
|
3145
|
|
|
|
|
|
|
=head3 GetCompoundWordBST |
3146
|
|
|
|
|
|
|
|
3147
|
|
|
|
|
|
|
Description: |
3148
|
|
|
|
|
|
|
|
3149
|
|
|
|
|
|
|
Returns the _compoundWordBST member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3150
|
|
|
|
|
|
|
|
3151
|
|
|
|
|
|
|
Input: |
3152
|
|
|
|
|
|
|
|
3153
|
|
|
|
|
|
|
None |
3154
|
|
|
|
|
|
|
|
3155
|
|
|
|
|
|
|
Output: |
3156
|
|
|
|
|
|
|
|
3157
|
|
|
|
|
|
|
$bst -> Compound word binary search tree. |
3158
|
|
|
|
|
|
|
|
3159
|
|
|
|
|
|
|
Example: |
3160
|
|
|
|
|
|
|
|
3161
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3162
|
|
|
|
|
|
|
|
3163
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3164
|
|
|
|
|
|
|
my $bst = $xmlconv->GetCompoundWordBST(); |
3165
|
|
|
|
|
|
|
|
3166
|
|
|
|
|
|
|
undef( $xmlconv ); |
3167
|
|
|
|
|
|
|
|
3168
|
|
|
|
|
|
|
=head3 GetMaxCompoundWordLength |
3169
|
|
|
|
|
|
|
|
3170
|
|
|
|
|
|
|
Description: |
3171
|
|
|
|
|
|
|
|
3172
|
|
|
|
|
|
|
Returns the _maxCompoundWordLength member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3173
|
|
|
|
|
|
|
|
3174
|
|
|
|
|
|
|
Note: If not defined, it is automatically set to and returns 20. |
3175
|
|
|
|
|
|
|
|
3176
|
|
|
|
|
|
|
Input: |
3177
|
|
|
|
|
|
|
|
3178
|
|
|
|
|
|
|
None |
3179
|
|
|
|
|
|
|
|
3180
|
|
|
|
|
|
|
Output: |
3181
|
|
|
|
|
|
|
|
3182
|
|
|
|
|
|
|
$value -> Maximum number of compound words in a given phrase. |
3183
|
|
|
|
|
|
|
|
3184
|
|
|
|
|
|
|
Example: |
3185
|
|
|
|
|
|
|
|
3186
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3187
|
|
|
|
|
|
|
|
3188
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3189
|
|
|
|
|
|
|
my $compoundWordLength = $xmlconv->GetMaxCompoundWordLength(); |
3190
|
|
|
|
|
|
|
|
3191
|
|
|
|
|
|
|
print( "Maximum Compound Word Length: $compoundWordLength\n" ); |
3192
|
|
|
|
|
|
|
|
3193
|
|
|
|
|
|
|
undef( $xmlconv ); |
3194
|
|
|
|
|
|
|
|
3195
|
|
|
|
|
|
|
=head3 GetOverwriteExistingFile |
3196
|
|
|
|
|
|
|
|
3197
|
|
|
|
|
|
|
Description: |
3198
|
|
|
|
|
|
|
|
3199
|
|
|
|
|
|
|
Returns the _overwriteExistingFile member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3200
|
|
|
|
|
|
|
Enables overwriting of existing text corpus if set to '1' or appends to the existing text corpus if set to '0'. |
3201
|
|
|
|
|
|
|
|
3202
|
|
|
|
|
|
|
Input: |
3203
|
|
|
|
|
|
|
|
3204
|
|
|
|
|
|
|
None |
3205
|
|
|
|
|
|
|
|
3206
|
|
|
|
|
|
|
Output: |
3207
|
|
|
|
|
|
|
|
3208
|
|
|
|
|
|
|
$value -> '1' = Overwrite existing file / '0' = Append to existing file. |
3209
|
|
|
|
|
|
|
|
3210
|
|
|
|
|
|
|
Example: |
3211
|
|
|
|
|
|
|
|
3212
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3213
|
|
|
|
|
|
|
|
3214
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3215
|
|
|
|
|
|
|
my $overwriteExistingFile = $xmlconv->GetOverwriteExistingFile(); |
3216
|
|
|
|
|
|
|
|
3217
|
|
|
|
|
|
|
print( "Overwrite Existing File? YES\n" ) if ( $overwriteExistingFile == 1 ); |
3218
|
|
|
|
|
|
|
print( "Overwrite Existing File? NO\n" ) if ( $overwriteExistingFile == 0 ); |
3219
|
|
|
|
|
|
|
|
3220
|
|
|
|
|
|
|
undef( $xmlconv ); |
3221
|
|
|
|
|
|
|
|
3222
|
|
|
|
|
|
|
=head2 Mutator Functions |
3223
|
|
|
|
|
|
|
|
3224
|
|
|
|
|
|
|
=head3 SetStoreTitle |
3225
|
|
|
|
|
|
|
|
3226
|
|
|
|
|
|
|
Description: |
3227
|
|
|
|
|
|
|
|
3228
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to store article title if true or omit if false. |
3229
|
|
|
|
|
|
|
|
3230
|
|
|
|
|
|
|
Input: |
3231
|
|
|
|
|
|
|
|
3232
|
|
|
|
|
|
|
$value -> '1' = Store Titles / '0' = Omit Titles |
3233
|
|
|
|
|
|
|
|
3234
|
|
|
|
|
|
|
Output: |
3235
|
|
|
|
|
|
|
|
3236
|
|
|
|
|
|
|
None |
3237
|
|
|
|
|
|
|
|
3238
|
|
|
|
|
|
|
Example: |
3239
|
|
|
|
|
|
|
|
3240
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3241
|
|
|
|
|
|
|
|
3242
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3243
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
3244
|
|
|
|
|
|
|
|
3245
|
|
|
|
|
|
|
undef( $xmlconv ); |
3246
|
|
|
|
|
|
|
|
3247
|
|
|
|
|
|
|
=head3 SetStoreAbstract |
3248
|
|
|
|
|
|
|
|
3249
|
|
|
|
|
|
|
Description: |
3250
|
|
|
|
|
|
|
|
3251
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to store article abstracts if true or omit if false. |
3252
|
|
|
|
|
|
|
|
3253
|
|
|
|
|
|
|
Input: |
3254
|
|
|
|
|
|
|
|
3255
|
|
|
|
|
|
|
$value -> '1' = Store Abstracts / '0' = Omit Abstracts |
3256
|
|
|
|
|
|
|
|
3257
|
|
|
|
|
|
|
Output: |
3258
|
|
|
|
|
|
|
|
3259
|
|
|
|
|
|
|
None |
3260
|
|
|
|
|
|
|
|
3261
|
|
|
|
|
|
|
Example: |
3262
|
|
|
|
|
|
|
|
3263
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3264
|
|
|
|
|
|
|
|
3265
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3266
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
3267
|
|
|
|
|
|
|
|
3268
|
|
|
|
|
|
|
undef( $xmlconv ); |
3269
|
|
|
|
|
|
|
|
3270
|
|
|
|
|
|
|
=head3 SetWorkingDir |
3271
|
|
|
|
|
|
|
|
3272
|
|
|
|
|
|
|
Description: |
3273
|
|
|
|
|
|
|
|
3274
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Represents the working directory. |
3275
|
|
|
|
|
|
|
|
3276
|
|
|
|
|
|
|
Input: |
3277
|
|
|
|
|
|
|
|
3278
|
|
|
|
|
|
|
$string -> Working directory string |
3279
|
|
|
|
|
|
|
|
3280
|
|
|
|
|
|
|
Output: |
3281
|
|
|
|
|
|
|
|
3282
|
|
|
|
|
|
|
None |
3283
|
|
|
|
|
|
|
|
3284
|
|
|
|
|
|
|
Example: |
3285
|
|
|
|
|
|
|
|
3286
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3287
|
|
|
|
|
|
|
|
3288
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3289
|
|
|
|
|
|
|
$xmlconv->SetWorkingDir( "/samples/" ); |
3290
|
|
|
|
|
|
|
|
3291
|
|
|
|
|
|
|
undef( $xmlconv ); |
3292
|
|
|
|
|
|
|
|
3293
|
|
|
|
|
|
|
=head3 SetSavePath |
3294
|
|
|
|
|
|
|
|
3295
|
|
|
|
|
|
|
Description: |
3296
|
|
|
|
|
|
|
|
3297
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Represents the text corpus save path. |
3298
|
|
|
|
|
|
|
|
3299
|
|
|
|
|
|
|
Input: |
3300
|
|
|
|
|
|
|
|
3301
|
|
|
|
|
|
|
$string -> Text corpus save path |
3302
|
|
|
|
|
|
|
|
3303
|
|
|
|
|
|
|
Output: |
3304
|
|
|
|
|
|
|
|
3305
|
|
|
|
|
|
|
None |
3306
|
|
|
|
|
|
|
|
3307
|
|
|
|
|
|
|
Example: |
3308
|
|
|
|
|
|
|
|
3309
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3310
|
|
|
|
|
|
|
|
3311
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3312
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "samples/textcorpus.txt" ); |
3313
|
|
|
|
|
|
|
|
3314
|
|
|
|
|
|
|
undef( $xmlconv ); |
3315
|
|
|
|
|
|
|
|
3316
|
|
|
|
|
|
|
=head3 SetQuickParse |
3317
|
|
|
|
|
|
|
|
3318
|
|
|
|
|
|
|
Description: |
3319
|
|
|
|
|
|
|
|
3320
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to utilize quick parse |
3321
|
|
|
|
|
|
|
routines to speed up text corpus compilation. This method is somewhat less accurate due to its non-exhaustive nature. |
3322
|
|
|
|
|
|
|
|
3323
|
|
|
|
|
|
|
Input: |
3324
|
|
|
|
|
|
|
|
3325
|
|
|
|
|
|
|
$value -> '1' = Enable Quick Parse / '0' = Disable Quick Parse |
3326
|
|
|
|
|
|
|
|
3327
|
|
|
|
|
|
|
Output: |
3328
|
|
|
|
|
|
|
|
3329
|
|
|
|
|
|
|
None |
3330
|
|
|
|
|
|
|
|
3331
|
|
|
|
|
|
|
Example: |
3332
|
|
|
|
|
|
|
|
3333
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3334
|
|
|
|
|
|
|
|
3335
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3336
|
|
|
|
|
|
|
$xmlconv->SetQuickParse( 1 ); |
3337
|
|
|
|
|
|
|
|
3338
|
|
|
|
|
|
|
undef( $xmlconv ); |
3339
|
|
|
|
|
|
|
|
3340
|
|
|
|
|
|
|
=head3 SetCompoundifyText |
3341
|
|
|
|
|
|
|
|
3342
|
|
|
|
|
|
|
Description: |
3343
|
|
|
|
|
|
|
|
3344
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to utilize 'compoundify' option if true. |
3345
|
|
|
|
|
|
|
|
3346
|
|
|
|
|
|
|
Warning: This requires compound word data to be loaded into memory with ReadCompoundWordDataFromFile() method prior |
3347
|
|
|
|
|
|
|
to executing text corpus compilation. |
3348
|
|
|
|
|
|
|
|
3349
|
|
|
|
|
|
|
Input: |
3350
|
|
|
|
|
|
|
|
3351
|
|
|
|
|
|
|
$value -> '1' = Compoundify text / '0' = Do not compoundify text |
3352
|
|
|
|
|
|
|
|
3353
|
|
|
|
|
|
|
Output: |
3354
|
|
|
|
|
|
|
|
3355
|
|
|
|
|
|
|
None |
3356
|
|
|
|
|
|
|
|
3357
|
|
|
|
|
|
|
Example: |
3358
|
|
|
|
|
|
|
|
3359
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3360
|
|
|
|
|
|
|
|
3361
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3362
|
|
|
|
|
|
|
$xmlconv->SetCompoundifyText( 1 ); |
3363
|
|
|
|
|
|
|
|
3364
|
|
|
|
|
|
|
undef( $xmlconv ); |
3365
|
|
|
|
|
|
|
|
3366
|
|
|
|
|
|
|
=head3 SetNumOfThreads |
3367
|
|
|
|
|
|
|
|
3368
|
|
|
|
|
|
|
Description: |
3369
|
|
|
|
|
|
|
|
3370
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Sets the requested number of threads to parse Medline XML files |
3371
|
|
|
|
|
|
|
and compile the text corpus. |
3372
|
|
|
|
|
|
|
|
3373
|
|
|
|
|
|
|
Input: |
3374
|
|
|
|
|
|
|
|
3375
|
|
|
|
|
|
|
$value -> Integer (Positive value) |
3376
|
|
|
|
|
|
|
|
3377
|
|
|
|
|
|
|
Output: |
3378
|
|
|
|
|
|
|
|
3379
|
|
|
|
|
|
|
None |
3380
|
|
|
|
|
|
|
|
3381
|
|
|
|
|
|
|
Example: |
3382
|
|
|
|
|
|
|
|
3383
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3384
|
|
|
|
|
|
|
|
3385
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3386
|
|
|
|
|
|
|
$xmlconv->SetNumOfThreads( 4 ); |
3387
|
|
|
|
|
|
|
|
3388
|
|
|
|
|
|
|
undef( $xmlconv ); |
3389
|
|
|
|
|
|
|
|
3390
|
|
|
|
|
|
|
=head3 SetBeginDate |
3391
|
|
|
|
|
|
|
|
3392
|
|
|
|
|
|
|
Description: |
3393
|
|
|
|
|
|
|
|
3394
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets beginning date range for earliest articles to store, by |
3395
|
|
|
|
|
|
|
'DateCreated' Medline tag, within the text corpus during compilation. |
3396
|
|
|
|
|
|
|
|
3397
|
|
|
|
|
|
|
Note: Expected format - "XX/XX/XXXX" (Mon/Day/Year) |
3398
|
|
|
|
|
|
|
|
3399
|
|
|
|
|
|
|
Input: |
3400
|
|
|
|
|
|
|
|
3401
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3402
|
|
|
|
|
|
|
|
3403
|
|
|
|
|
|
|
Output: |
3404
|
|
|
|
|
|
|
|
3405
|
|
|
|
|
|
|
None |
3406
|
|
|
|
|
|
|
|
3407
|
|
|
|
|
|
|
Example: |
3408
|
|
|
|
|
|
|
|
3409
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3410
|
|
|
|
|
|
|
|
3411
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3412
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
3413
|
|
|
|
|
|
|
|
3414
|
|
|
|
|
|
|
undef( $xmlconv ); |
3415
|
|
|
|
|
|
|
|
3416
|
|
|
|
|
|
|
=head3 SetEndDate |
3417
|
|
|
|
|
|
|
|
3418
|
|
|
|
|
|
|
Description: |
3419
|
|
|
|
|
|
|
|
3420
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets ending date range for latest article to store, by |
3421
|
|
|
|
|
|
|
'DateCreated' Medline tag, within the text corpus during compilation. |
3422
|
|
|
|
|
|
|
|
3423
|
|
|
|
|
|
|
Note: Expected format - "XX/XX/XXXX" (Mon/Day/Year) |
3424
|
|
|
|
|
|
|
|
3425
|
|
|
|
|
|
|
Input: |
3426
|
|
|
|
|
|
|
|
3427
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3428
|
|
|
|
|
|
|
|
3429
|
|
|
|
|
|
|
Output: |
3430
|
|
|
|
|
|
|
|
3431
|
|
|
|
|
|
|
None |
3432
|
|
|
|
|
|
|
|
3433
|
|
|
|
|
|
|
Example: |
3434
|
|
|
|
|
|
|
|
3435
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3436
|
|
|
|
|
|
|
|
3437
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3438
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
3439
|
|
|
|
|
|
|
|
3440
|
|
|
|
|
|
|
undef( $xmlconv ); |
3441
|
|
|
|
|
|
|
|
3442
|
|
|
|
|
|
|
=head3 SetXMLStringToParse |
3443
|
|
|
|
|
|
|
|
3444
|
|
|
|
|
|
|
Description: |
3445
|
|
|
|
|
|
|
|
3446
|
|
|
|
|
|
|
Sets member variable to passed string parameter. This string normally consists of Medline XML data to be |
3447
|
|
|
|
|
|
|
parsed for text corpus compilation. |
3448
|
|
|
|
|
|
|
|
3449
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3450
|
|
|
|
|
|
|
|
3451
|
|
|
|
|
|
|
Input: |
3452
|
|
|
|
|
|
|
|
3453
|
|
|
|
|
|
|
$string -> String |
3454
|
|
|
|
|
|
|
|
3455
|
|
|
|
|
|
|
Output: |
3456
|
|
|
|
|
|
|
|
3457
|
|
|
|
|
|
|
None |
3458
|
|
|
|
|
|
|
|
3459
|
|
|
|
|
|
|
Example: |
3460
|
|
|
|
|
|
|
|
3461
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3462
|
|
|
|
|
|
|
|
3463
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3464
|
|
|
|
|
|
|
$xmlconv->SetXMLStringToParse( "Hello World!" ); |
3465
|
|
|
|
|
|
|
|
3466
|
|
|
|
|
|
|
undef( $xmlconv ); |
3467
|
|
|
|
|
|
|
|
3468
|
|
|
|
|
|
|
=head3 SetTextCorpusStr |
3469
|
|
|
|
|
|
|
|
3470
|
|
|
|
|
|
|
Description: |
3471
|
|
|
|
|
|
|
|
3472
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Overwrites any stored text corpus data in memory to the string parameter. |
3473
|
|
|
|
|
|
|
|
3474
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3475
|
|
|
|
|
|
|
|
3476
|
|
|
|
|
|
|
Input: |
3477
|
|
|
|
|
|
|
|
3478
|
|
|
|
|
|
|
$string -> String |
3479
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
Output: |
3481
|
|
|
|
|
|
|
|
3482
|
|
|
|
|
|
|
None |
3483
|
|
|
|
|
|
|
|
3484
|
|
|
|
|
|
|
Example: |
3485
|
|
|
|
|
|
|
|
3486
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3487
|
|
|
|
|
|
|
|
3488
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3489
|
|
|
|
|
|
|
$xmlconv->SetTextCorpusStr( "Hello World!" ); |
3490
|
|
|
|
|
|
|
|
3491
|
|
|
|
|
|
|
undef( $xmlconv ); |
3492
|
|
|
|
|
|
|
|
3493
|
|
|
|
|
|
|
=head3 AppendStrToTextCorpus |
3494
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
Description: |
3496
|
|
|
|
|
|
|
|
3497
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Appends string parameter to text corpus string in memory. |
3498
|
|
|
|
|
|
|
|
3499
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3500
|
|
|
|
|
|
|
|
3501
|
|
|
|
|
|
|
Input: |
3502
|
|
|
|
|
|
|
|
3503
|
|
|
|
|
|
|
$string -> String |
3504
|
|
|
|
|
|
|
|
3505
|
|
|
|
|
|
|
Output: |
3506
|
|
|
|
|
|
|
|
3507
|
|
|
|
|
|
|
None |
3508
|
|
|
|
|
|
|
|
3509
|
|
|
|
|
|
|
Example: |
3510
|
|
|
|
|
|
|
|
3511
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3512
|
|
|
|
|
|
|
|
3513
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3514
|
|
|
|
|
|
|
$xmlconv->AppendStrToTextCorpus( "Hello World!" ); |
3515
|
|
|
|
|
|
|
|
3516
|
|
|
|
|
|
|
undef( $xmlconv ); |
3517
|
|
|
|
|
|
|
|
3518
|
|
|
|
|
|
|
=head3 ClearTextCorpus |
3519
|
|
|
|
|
|
|
|
3520
|
|
|
|
|
|
|
Description: |
3521
|
|
|
|
|
|
|
|
3522
|
|
|
|
|
|
|
Clears text corpus data in memory. |
3523
|
|
|
|
|
|
|
|
3524
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3525
|
|
|
|
|
|
|
|
3526
|
|
|
|
|
|
|
Input: |
3527
|
|
|
|
|
|
|
|
3528
|
|
|
|
|
|
|
None |
3529
|
|
|
|
|
|
|
|
3530
|
|
|
|
|
|
|
Output: |
3531
|
|
|
|
|
|
|
|
3532
|
|
|
|
|
|
|
None |
3533
|
|
|
|
|
|
|
|
3534
|
|
|
|
|
|
|
Example: |
3535
|
|
|
|
|
|
|
|
3536
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3537
|
|
|
|
|
|
|
|
3538
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3539
|
|
|
|
|
|
|
$xmlconv->ClearTextCorpus(); |
3540
|
|
|
|
|
|
|
|
3541
|
|
|
|
|
|
|
undef( $xmlconv ); |
3542
|
|
|
|
|
|
|
|
3543
|
|
|
|
|
|
|
=head3 SetTempStr |
3544
|
|
|
|
|
|
|
|
3545
|
|
|
|
|
|
|
Description: |
3546
|
|
|
|
|
|
|
|
3547
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets temporary member string to passed string parameter. |
3548
|
|
|
|
|
|
|
(Temporary placeholder for Medline Title and Abstract data). |
3549
|
|
|
|
|
|
|
|
3550
|
|
|
|
|
|
|
Note: This removes special characters and converts all characters to lowercase. |
3551
|
|
|
|
|
|
|
|
3552
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3553
|
|
|
|
|
|
|
|
3554
|
|
|
|
|
|
|
Input: |
3555
|
|
|
|
|
|
|
|
3556
|
|
|
|
|
|
|
$string -> String |
3557
|
|
|
|
|
|
|
|
3558
|
|
|
|
|
|
|
Output: |
3559
|
|
|
|
|
|
|
|
3560
|
|
|
|
|
|
|
None |
3561
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
Example: |
3563
|
|
|
|
|
|
|
|
3564
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3565
|
|
|
|
|
|
|
|
3566
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3567
|
|
|
|
|
|
|
$xmlconv->SetTempStr( "Hello World!" ); |
3568
|
|
|
|
|
|
|
|
3569
|
|
|
|
|
|
|
undef( $xmlconv ); |
3570
|
|
|
|
|
|
|
|
3571
|
|
|
|
|
|
|
=head3 AppendToTempStr |
3572
|
|
|
|
|
|
|
|
3573
|
|
|
|
|
|
|
Description: |
3574
|
|
|
|
|
|
|
|
3575
|
|
|
|
|
|
|
Appends string parameter to temporary member string in memory. |
3576
|
|
|
|
|
|
|
|
3577
|
|
|
|
|
|
|
Note: This removes special characters and converts all characters to lowercase. |
3578
|
|
|
|
|
|
|
|
3579
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3580
|
|
|
|
|
|
|
|
3581
|
|
|
|
|
|
|
Input: |
3582
|
|
|
|
|
|
|
|
3583
|
|
|
|
|
|
|
$string -> String |
3584
|
|
|
|
|
|
|
|
3585
|
|
|
|
|
|
|
Output: |
3586
|
|
|
|
|
|
|
|
3587
|
|
|
|
|
|
|
None |
3588
|
|
|
|
|
|
|
|
3589
|
|
|
|
|
|
|
Example: |
3590
|
|
|
|
|
|
|
|
3591
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3592
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3594
|
|
|
|
|
|
|
$xmlconv->AppendToTempStr( "Hello World!" ); |
3595
|
|
|
|
|
|
|
|
3596
|
|
|
|
|
|
|
undef( $xmlconv ); |
3597
|
|
|
|
|
|
|
|
3598
|
|
|
|
|
|
|
=head3 ClearTempStr |
3599
|
|
|
|
|
|
|
|
3600
|
|
|
|
|
|
|
Clears the temporary string storage in memory. |
3601
|
|
|
|
|
|
|
|
3602
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3603
|
|
|
|
|
|
|
|
3604
|
|
|
|
|
|
|
Input: |
3605
|
|
|
|
|
|
|
|
3606
|
|
|
|
|
|
|
None |
3607
|
|
|
|
|
|
|
|
3608
|
|
|
|
|
|
|
Output: |
3609
|
|
|
|
|
|
|
|
3610
|
|
|
|
|
|
|
None |
3611
|
|
|
|
|
|
|
|
3612
|
|
|
|
|
|
|
Example: |
3613
|
|
|
|
|
|
|
|
3614
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3615
|
|
|
|
|
|
|
|
3616
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3617
|
|
|
|
|
|
|
$xmlconv->ClearTempStr(); |
3618
|
|
|
|
|
|
|
|
3619
|
|
|
|
|
|
|
undef( $xmlconv ); |
3620
|
|
|
|
|
|
|
|
3621
|
|
|
|
|
|
|
=head3 SetTempDate |
3622
|
|
|
|
|
|
|
|
3623
|
|
|
|
|
|
|
Description: |
3624
|
|
|
|
|
|
|
|
3625
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets temporary date string to passed string. |
3626
|
|
|
|
|
|
|
|
3627
|
|
|
|
|
|
|
Note: Date Format - "XX/XX/XXXX" (Mon/Day/Year) |
3628
|
|
|
|
|
|
|
|
3629
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3630
|
|
|
|
|
|
|
|
3631
|
|
|
|
|
|
|
Input: |
3632
|
|
|
|
|
|
|
|
3633
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3634
|
|
|
|
|
|
|
|
3635
|
|
|
|
|
|
|
Output: |
3636
|
|
|
|
|
|
|
|
3637
|
|
|
|
|
|
|
None |
3638
|
|
|
|
|
|
|
|
3639
|
|
|
|
|
|
|
Example: |
3640
|
|
|
|
|
|
|
|
3641
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3642
|
|
|
|
|
|
|
|
3643
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3644
|
|
|
|
|
|
|
$xmlconv->SetTempDate( "08/13/2016" ); |
3645
|
|
|
|
|
|
|
|
3646
|
|
|
|
|
|
|
undef( $xmlconv ); |
3647
|
|
|
|
|
|
|
|
3648
|
|
|
|
|
|
|
=head3 ClearTempDate |
3649
|
|
|
|
|
|
|
|
3650
|
|
|
|
|
|
|
Description: |
3651
|
|
|
|
|
|
|
|
3652
|
|
|
|
|
|
|
Clears the temporary date storage location in memory. |
3653
|
|
|
|
|
|
|
|
3654
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3655
|
|
|
|
|
|
|
|
3656
|
|
|
|
|
|
|
Input: |
3657
|
|
|
|
|
|
|
|
3658
|
|
|
|
|
|
|
None |
3659
|
|
|
|
|
|
|
|
3660
|
|
|
|
|
|
|
Output: |
3661
|
|
|
|
|
|
|
|
3662
|
|
|
|
|
|
|
None |
3663
|
|
|
|
|
|
|
|
3664
|
|
|
|
|
|
|
Example: |
3665
|
|
|
|
|
|
|
|
3666
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3667
|
|
|
|
|
|
|
|
3668
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3669
|
|
|
|
|
|
|
$xmlconv->ClearTempDate(); |
3670
|
|
|
|
|
|
|
|
3671
|
|
|
|
|
|
|
undef( $xmlconv ); |
3672
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
=head3 SetCompoundWordAry |
3674
|
|
|
|
|
|
|
|
3675
|
|
|
|
|
|
|
Description: |
3676
|
|
|
|
|
|
|
|
3677
|
|
|
|
|
|
|
Sets member variable to de-referenced passed array reference parameter. Stores compound word array by |
3678
|
|
|
|
|
|
|
de-referencing array reference parameter. |
3679
|
|
|
|
|
|
|
|
3680
|
|
|
|
|
|
|
Note: Clears previous data if existing. |
3681
|
|
|
|
|
|
|
|
3682
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3683
|
|
|
|
|
|
|
|
3684
|
|
|
|
|
|
|
Input: |
3685
|
|
|
|
|
|
|
|
3686
|
|
|
|
|
|
|
$arrayReference -> Array reference of compound words |
3687
|
|
|
|
|
|
|
|
3688
|
|
|
|
|
|
|
Output: |
3689
|
|
|
|
|
|
|
|
3690
|
|
|
|
|
|
|
None |
3691
|
|
|
|
|
|
|
|
3692
|
|
|
|
|
|
|
Example: |
3693
|
|
|
|
|
|
|
|
3694
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3695
|
|
|
|
|
|
|
|
3696
|
|
|
|
|
|
|
my @compoundWordAry = ( "big dog", "respiratory failure", "seven large masses" ); |
3697
|
|
|
|
|
|
|
|
3698
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3699
|
|
|
|
|
|
|
$xmlconv->SetCompoundWordAry( \@compoundWordAry ); |
3700
|
|
|
|
|
|
|
|
3701
|
|
|
|
|
|
|
undef( $xmlconv ); |
3702
|
|
|
|
|
|
|
|
3703
|
|
|
|
|
|
|
=head3 ClearCompoundWordAry |
3704
|
|
|
|
|
|
|
|
3705
|
|
|
|
|
|
|
Description: |
3706
|
|
|
|
|
|
|
|
3707
|
|
|
|
|
|
|
Clears compound word array in memory. |
3708
|
|
|
|
|
|
|
|
3709
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3710
|
|
|
|
|
|
|
|
3711
|
|
|
|
|
|
|
Input: |
3712
|
|
|
|
|
|
|
|
3713
|
|
|
|
|
|
|
None |
3714
|
|
|
|
|
|
|
|
3715
|
|
|
|
|
|
|
Output: |
3716
|
|
|
|
|
|
|
|
3717
|
|
|
|
|
|
|
None |
3718
|
|
|
|
|
|
|
|
3719
|
|
|
|
|
|
|
Example: |
3720
|
|
|
|
|
|
|
|
3721
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3722
|
|
|
|
|
|
|
|
3723
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3724
|
|
|
|
|
|
|
$xmlconv->ClearCompoundWordAry(); |
3725
|
|
|
|
|
|
|
|
3726
|
|
|
|
|
|
|
undef( $xmlconv ); |
3727
|
|
|
|
|
|
|
|
3728
|
|
|
|
|
|
|
=head3 SetCompoundWordBST |
3729
|
|
|
|
|
|
|
|
3730
|
|
|
|
|
|
|
Description: |
3731
|
|
|
|
|
|
|
|
3732
|
|
|
|
|
|
|
Sets member variable to passed Word2vec::Bst parameter. Sets compound word binary search tree to passed binary tree parameter. |
3733
|
|
|
|
|
|
|
|
3734
|
|
|
|
|
|
|
Note: Un-defines previous binary tree if existing. |
3735
|
|
|
|
|
|
|
|
3736
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3737
|
|
|
|
|
|
|
|
3738
|
|
|
|
|
|
|
Input: |
3739
|
|
|
|
|
|
|
|
3740
|
|
|
|
|
|
|
Word2vec::Bst -> Binary Search Tree |
3741
|
|
|
|
|
|
|
|
3742
|
|
|
|
|
|
|
Output: |
3743
|
|
|
|
|
|
|
|
3744
|
|
|
|
|
|
|
None |
3745
|
|
|
|
|
|
|
|
3746
|
|
|
|
|
|
|
Example: |
3747
|
|
|
|
|
|
|
|
3748
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3749
|
|
|
|
|
|
|
|
3750
|
|
|
|
|
|
|
my @compoundWordAry = ( "big dog", "respiratory failure", "seven large masses" ); |
3751
|
|
|
|
|
|
|
@compoundWordAry = sort( @compoundWordAry ); |
3752
|
|
|
|
|
|
|
|
3753
|
|
|
|
|
|
|
my $arySize = @compoundWordAry; |
3754
|
|
|
|
|
|
|
|
3755
|
|
|
|
|
|
|
my $bst = Word2vec::Bst->new(); |
3756
|
|
|
|
|
|
|
$bst->CreateTree( \@compoundWordAry, 0, $arySize, undef ); |
3757
|
|
|
|
|
|
|
|
3758
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3759
|
|
|
|
|
|
|
$xmlconv->SetCompoundWordBST( $bst ); |
3760
|
|
|
|
|
|
|
|
3761
|
|
|
|
|
|
|
undef( $xmlconv ); |
3762
|
|
|
|
|
|
|
|
3763
|
|
|
|
|
|
|
=head3 ClearCompoundWordBST |
3764
|
|
|
|
|
|
|
|
3765
|
|
|
|
|
|
|
Description: |
3766
|
|
|
|
|
|
|
|
3767
|
|
|
|
|
|
|
Clears/Un-defines existing compound word binary search tree from memory. |
3768
|
|
|
|
|
|
|
|
3769
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3770
|
|
|
|
|
|
|
|
3771
|
|
|
|
|
|
|
Input: |
3772
|
|
|
|
|
|
|
|
3773
|
|
|
|
|
|
|
None |
3774
|
|
|
|
|
|
|
|
3775
|
|
|
|
|
|
|
Output: |
3776
|
|
|
|
|
|
|
|
3777
|
|
|
|
|
|
|
None |
3778
|
|
|
|
|
|
|
|
3779
|
|
|
|
|
|
|
Example: |
3780
|
|
|
|
|
|
|
|
3781
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3782
|
|
|
|
|
|
|
|
3783
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3784
|
|
|
|
|
|
|
$xmlconv->ClearCompoundWordBST(); |
3785
|
|
|
|
|
|
|
|
3786
|
|
|
|
|
|
|
undef( $xmlconv ); |
3787
|
|
|
|
|
|
|
|
3788
|
|
|
|
|
|
|
=head3 SetMaxCompoundWordLength |
3789
|
|
|
|
|
|
|
|
3790
|
|
|
|
|
|
|
Description: |
3791
|
|
|
|
|
|
|
|
3792
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Sets maximum number of compound words in a phrase for comparison. |
3793
|
|
|
|
|
|
|
|
3794
|
|
|
|
|
|
|
i.e. "medical campus of Virginia Commonwealth University" can be interpreted as a compound word of 6 words. |
3795
|
|
|
|
|
|
|
Setting this variable to 3 will only attempt compoundifying a maximum amount of three words. |
3796
|
|
|
|
|
|
|
The result would be "medical_campus_of Virginia commonwealth university" even though an exact representation |
3797
|
|
|
|
|
|
|
of this compounded string can exist. Setting this variable to 6 will result in compounding all six words if |
3798
|
|
|
|
|
|
|
they exist in the compound word array/bst. |
3799
|
|
|
|
|
|
|
|
3800
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3801
|
|
|
|
|
|
|
|
3802
|
|
|
|
|
|
|
Input: |
3803
|
|
|
|
|
|
|
|
3804
|
|
|
|
|
|
|
$value -> Integer |
3805
|
|
|
|
|
|
|
|
3806
|
|
|
|
|
|
|
Output: |
3807
|
|
|
|
|
|
|
|
3808
|
|
|
|
|
|
|
None |
3809
|
|
|
|
|
|
|
|
3810
|
|
|
|
|
|
|
Example: |
3811
|
|
|
|
|
|
|
|
3812
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3813
|
|
|
|
|
|
|
|
3814
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3815
|
|
|
|
|
|
|
$xmlconv->SetMaxCompoundWordLength( 8 ); |
3816
|
|
|
|
|
|
|
|
3817
|
|
|
|
|
|
|
undef( $xmlconv ); |
3818
|
|
|
|
|
|
|
|
3819
|
|
|
|
|
|
|
=head3 SetOverwriteExistingFile |
3820
|
|
|
|
|
|
|
|
3821
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Requires 0 = False or 1 = True. Sets option to overwrite |
3822
|
|
|
|
|
|
|
existing text corpus during compilation if 1 or append to existing text corpus if 0. |
3823
|
|
|
|
|
|
|
|
3824
|
|
|
|
|
|
|
=head2 Debug Functions |
3825
|
|
|
|
|
|
|
|
3826
|
|
|
|
|
|
|
=head3 GetTime |
3827
|
|
|
|
|
|
|
|
3828
|
|
|
|
|
|
|
Description: |
3829
|
|
|
|
|
|
|
|
3830
|
|
|
|
|
|
|
Returns current time string in "Hour:Minute:Second" format. |
3831
|
|
|
|
|
|
|
|
3832
|
|
|
|
|
|
|
Input: |
3833
|
|
|
|
|
|
|
|
3834
|
|
|
|
|
|
|
None |
3835
|
|
|
|
|
|
|
|
3836
|
|
|
|
|
|
|
Output: |
3837
|
|
|
|
|
|
|
|
3838
|
|
|
|
|
|
|
$string -> XX:XX:XX ("Hour:Minute:Second") |
3839
|
|
|
|
|
|
|
|
3840
|
|
|
|
|
|
|
Example: |
3841
|
|
|
|
|
|
|
|
3842
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3843
|
|
|
|
|
|
|
|
3844
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3845
|
|
|
|
|
|
|
my $time = $xmlconv->GetTime(); |
3846
|
|
|
|
|
|
|
|
3847
|
|
|
|
|
|
|
print( "Current Time: $time\n" ) if defined( $time ); |
3848
|
|
|
|
|
|
|
|
3849
|
|
|
|
|
|
|
undef( $xmlconv ); |
3850
|
|
|
|
|
|
|
|
3851
|
|
|
|
|
|
|
=head3 GetDate |
3852
|
|
|
|
|
|
|
|
3853
|
|
|
|
|
|
|
Description: |
3854
|
|
|
|
|
|
|
|
3855
|
|
|
|
|
|
|
Returns current month, day and year string in "Month/Day/Year" format. |
3856
|
|
|
|
|
|
|
|
3857
|
|
|
|
|
|
|
Input: |
3858
|
|
|
|
|
|
|
|
3859
|
|
|
|
|
|
|
None |
3860
|
|
|
|
|
|
|
|
3861
|
|
|
|
|
|
|
Output: |
3862
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
$string -> XX/XX/XXXX ("Month/Day/Year") |
3864
|
|
|
|
|
|
|
|
3865
|
|
|
|
|
|
|
Example: |
3866
|
|
|
|
|
|
|
|
3867
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3868
|
|
|
|
|
|
|
|
3869
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3870
|
|
|
|
|
|
|
my $date = $xmlconv->GetDate(); |
3871
|
|
|
|
|
|
|
|
3872
|
|
|
|
|
|
|
print( "Current Date: $date\n" ) if defined( $date ); |
3873
|
|
|
|
|
|
|
|
3874
|
|
|
|
|
|
|
undef( $xmlconv ); |
3875
|
|
|
|
|
|
|
|
3876
|
|
|
|
|
|
|
=head3 WriteLog |
3877
|
|
|
|
|
|
|
|
3878
|
|
|
|
|
|
|
Description: |
3879
|
|
|
|
|
|
|
|
3880
|
|
|
|
|
|
|
Prints passed string parameter to the console, log file or both depending on user options. |
3881
|
|
|
|
|
|
|
|
3882
|
|
|
|
|
|
|
Note: printNewLine parameter prints a new line character following the string if the parameter |
3883
|
|
|
|
|
|
|
is undefined and does not if parameter is 0. |
3884
|
|
|
|
|
|
|
|
3885
|
|
|
|
|
|
|
Input: |
3886
|
|
|
|
|
|
|
|
3887
|
|
|
|
|
|
|
$string -> String to print to the console/log file. |
3888
|
|
|
|
|
|
|
$value -> 0 = Do not print newline character after string, all else prints new line character including 'undef'. |
3889
|
|
|
|
|
|
|
|
3890
|
|
|
|
|
|
|
Output: |
3891
|
|
|
|
|
|
|
|
3892
|
|
|
|
|
|
|
None |
3893
|
|
|
|
|
|
|
|
3894
|
|
|
|
|
|
|
Example: |
3895
|
|
|
|
|
|
|
|
3896
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3897
|
|
|
|
|
|
|
|
3898
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3899
|
|
|
|
|
|
|
$xmlconv->WriteLog( "Hello World" ); |
3900
|
|
|
|
|
|
|
|
3901
|
|
|
|
|
|
|
undef( $xmlconv ); |
3902
|
|
|
|
|
|
|
|
3903
|
|
|
|
|
|
|
=head1 Author |
3904
|
|
|
|
|
|
|
|
3905
|
|
|
|
|
|
|
Clint Cuffy, Virginia Commonwealth University |
3906
|
|
|
|
|
|
|
|
3907
|
|
|
|
|
|
|
=head1 COPYRIGHT |
3908
|
|
|
|
|
|
|
|
3909
|
|
|
|
|
|
|
Copyright (c) 2016 |
3910
|
|
|
|
|
|
|
|
3911
|
|
|
|
|
|
|
Bridget T McInnes, Virginia Commonwealth University |
3912
|
|
|
|
|
|
|
btmcinnes at vcu dot edu |
3913
|
|
|
|
|
|
|
|
3914
|
|
|
|
|
|
|
Clint Cuffy, Virginia Commonwealth University |
3915
|
|
|
|
|
|
|
cuffyca at vcu dot edu |
3916
|
|
|
|
|
|
|
|
3917
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
3918
|
|
|
|
|
|
|
under the terms of the GNU General Public License as published by the Free |
3919
|
|
|
|
|
|
|
Software Foundation; either version 2 of the License, or (at your option) |
3920
|
|
|
|
|
|
|
any later version. |
3921
|
|
|
|
|
|
|
|
3922
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT |
3923
|
|
|
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
3924
|
|
|
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
3925
|
|
|
|
|
|
|
|
3926
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with |
3927
|
|
|
|
|
|
|
this program; if not, write to: |
3928
|
|
|
|
|
|
|
|
3929
|
|
|
|
|
|
|
The Free Software Foundation, Inc., |
3930
|
|
|
|
|
|
|
59 Temple Place - Suite 330, |
3931
|
|
|
|
|
|
|
Boston, MA 02111-1307, USA. |
3932
|
|
|
|
|
|
|
|
3933
|
|
|
|
|
|
|
=cut |