File Coverage

blib/lib/Word2vec/Word2vec.pm
Criterion Covered Total %
statement 401 1177 34.0
branch 182 924 19.7
condition 22 291 7.5
subroutine 88 93 94.6
pod 77 85 90.5
total 770 2570 29.9


line stmt bran cond sub pod time code
1             #!usr/bin/perl
2              
3             ######################################################################################
4             # #
5             # Author: Clint Cuffy #
6             # Date: 06/16/2016 #
7             # Revised: 11/06/2017 #
8             # UMLS Similarity Word2Vec Executable Interface Module #
9             # #
10             ######################################################################################
11             # #
12             # Description: #
13             # ============ #
14             # Perl "word2vec" executable interface for UMLS Similarity #
15             # Features: #
16             # ========= #
17             # Supports Word2Vec Training Using Standard Options #
18             # Conversion of Word2Vec Binary Format To Plain Text And Vice Versa #
19             # Cosine Similarity Between Two Words #
20             # Summed Cosine Similarity #
21             # Average Cosine Similarity #
22             # Multi-Word Cosine Similarity #
23             # Manipulation of Word Vectors (Addition/Subtraction/Average) #
24             # #
25             ######################################################################################
26              
27              
28             package Word2vec::Word2vec;
29              
30 4     4   43404 use strict;
  4         18  
  4         113  
31 4     4   21 use warnings;
  4         7  
  4         96  
32              
33             # Standard Package(s)
34 4     4   17 use Cwd;
  4         8  
  4         212  
35 4     4   1242 use Encode qw( decode encode );
  4         28111  
  4         252  
36              
37              
38 4     4   25 use vars qw($VERSION);
  4         7  
  4         158  
39              
40             $VERSION = '0.03';
41              
42              
43             ######################################################################################
44             # Constructor
45             ######################################################################################
46              
47             BEGIN
48       4     {
49             # CONSTRUCTOR : DO SOMETHING HERE
50             }
51              
52              
53             ######################################################################################
54             # Deconstructor
55             ######################################################################################
56              
57             END
58       4     {
59             # DECONSTRUCTOR : DO SOMETHING HERE
60             }
61              
62              
63             ######################################################################################
64             # new Class Operator
65             ######################################################################################
66              
67             sub new
68             {
69 1     1 1 116 my $class = shift;
70 1         30 my $self = {
71             # Private Member Variables
72             _debugLog => shift, # Boolean (Binary): 0 = False, 1 = True
73             _writeLog => shift, # Boolean (Binary): 0 = False, 1 = True
74             _trainFileName => shift, # String
75             _outputFileName => shift, # String
76             _wordVecSize => shift, # Int
77             _windowSize => shift, # Int
78             _sample => shift, # Float
79             _hSoftMax => shift, # Int
80             _negative => shift, # Int
81             _numOfThreads => shift, # Int
82             _numOfIterations => shift, # Int
83             _minCount => shift, # Int
84             _alpha => shift, # Float
85             _classes => shift, # Int
86             _debug => shift, # Int
87             _binaryOutput => shift, # Boolean (Binary): 0 = False, 1 = True
88             _saveVocab => shift, # String (File Name To Save To)
89             _readVocab => shift, # String (File Name To Read From)
90             _useCBOW => shift, # Boolean (Binary): 0 = Use Skip-Gram Model, 1 = Use CBOW (Default)
91             _workingDir => shift, # String
92             _word2VecExeDir => shift, # String
93             _hashRefOfWordVectors => shift, # Hash Reference of Word2Vec Vectors
94             _overwriteOldFile => shift, # Boolean (Binary): 0 = False, 1 = True
95             _sparseVectorMode => shift, # Boolean (Binary): 0 = False, 1 = True
96             _vectorLength => shift, # Int
97             _numberOfWords => shift, # Int
98             _minimizeMemoryUsage => shift, # Boolean (Binary): 0 = False, 1 = True
99             };
100              
101             # Set debug log variable to false if not defined
102 1 50       9 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
103 1 50       7 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
104 1 50       6 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
105 1 50       6 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
106 1 50       6 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
107 1 50       5 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
108 1 50       7 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
109 1 50       6 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
110 1 50       7 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
111 1 50       8 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
112 1 50       6 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
113 1 50       6 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
114 1 50       6 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
115 1 50       5 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
116 1 50       5 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
117 1 50       6 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
118 1 50       6 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
119 1 50       30 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
120              
121 1 50 33     13 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 1 );
122 1 50 33     6 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 0 );
123              
124 1 50       15 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
125              
126 1         5 my %hash = ();
127 1 50       6 $self->{ _hashRefOfWordVectors } = \%hash if !defined ( $self->{ _hashRefOfWordVectors } );
128 1 50       7 $self->{ _overwriteOldFile } = 0 if !defined $self->{ _overwriteOldFile };
129 1 50       5 $self->{ _sparseVectorMode } = 0 if !defined $self->{ _sparseVectorMode };
130 1 50       5 $self->{ _vectorLength } = 0 if !defined $self->{ _vectorLength };
131 1 50       11 $self->{ _numberOfWords } = 0 if !defined $self->{ _numberOfWords };
132 1 50       5 $self->{ _minimizeMemoryUsage } = 1 if !defined $self->{ _minimizeMemoryUsage };
133              
134              
135             # Try To Locate Word2Vec Executable Files Path
136 1         4 for my $dir ( @INC )
137             {
138 11 50       84 $self->{ _word2VecExeDir } = "$dir/External/Word2vec" if ( -e "$dir/External/Word2vec" ); # Test Directory
139 11 50       65 $self->{ _word2VecExeDir } = "$dir/../External/Word2vec" if ( -e "$dir/../External/Word2vec" ); # Dev Directory
140 11 50       64 $self->{ _word2VecExeDir } = "$dir/../../External/Word2vec" if ( -e "$dir/../../External/Word2vec" ); # Dev Directory
141 11 100       91 $self->{ _word2VecExeDir } = "$dir/Word2vec/External/Word2vec" if ( -e "$dir/Word2vec/External/Word2vec" ); # Release Directory
142             }
143              
144             # Open File Handler if checked variable is true
145 1 50       7 if( $self->{ _writeLog } )
146             {
147 0         0 open( $self->{ _fileHandle }, '>:encoding(UTF-8)', 'Word2vecLog.txt' );
148 0         0 $self->{ _fileHandle }->autoflush( 1 ); # Auto-flushes writes to log file
149             }
150              
151 1         3 bless $self, $class;
152              
153 1         7 $self->WriteLog( "New - Debug On" );
154 1 50       9 $self->WriteLog( "New - Word2Vec Executable Directory Found" ) if defined( $self->{ _word2VecExeDir } );
155 1 50       9 $self->WriteLog( "New - Setting Word2Vec Executable Directory To: \"" . $self->{ _word2VecExeDir } . "\"" ) if defined( $self->{ _word2VecExeDir } );
156              
157 1         5 return $self;
158             }
159              
160              
161             ######################################################################################
162             # DESTROY
163             ######################################################################################
164              
165             sub DESTROY
166             {
167 1     1   4 my ( $self ) = @_;
168              
169             # Close FileHandle
170 1 50       95 close( $self->{ _fileHandle } ) if( $self->{ _fileHandle } );
171             }
172              
173              
174             ######################################################################################
175             # Module Functions
176             ######################################################################################
177              
178             sub ExecuteTraining
179             {
180 2     2 1 6 my ( $self, $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
181              
182             # Pre-Training Check(s)
183 2         5 my $executableFileDir = $self->GetWord2VecExeDir() . "/word2vec";
184 2 50       5 $executableFileDir .= ".exe" if $self->GetOSType() eq "MSWin32";
185              
186             # Override Train File Path Member Variable With Specified Train File Parameter
187 2 50       8 $self->WriteLog( "ExecuteTraining - \"TrainFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $trainFilePath );
188 2 50       6 $trainFilePath = $self->GetTrainFilePath() if !defined( $trainFilePath );
189              
190             # Override Output File Path Member Variable With Specified Train File Parameter
191 2 50       6 $self->WriteLog( "ExecuteTraining - \"OutputFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $outputFilePath );
192 2 50       4 $outputFilePath = $self->GetOutputFilePath() if !defined( $outputFilePath );
193              
194             # Override Overwrite Member Variable With Specified Train File Parameter
195 2 50       11 $self->WriteLog( "ExecuteTraining - \"Overwrite\" Parameter Specified / Overriding Member Variable" ) if defined( $overwrite );
196 2 50       6 $overwrite = $self->GetOverwriteOldFile() if !defined( $overwrite );
197              
198             # Check For 'word2vec' Executable and trainFile
199 2 50       56 $self->WriteLog( "ExecuteTraining - Error: \"word2vec\" Executable File Cannot Be Found" ) if !( -e "$executableFileDir" );
200 2 50       14 return -1 if !( -e "$executableFileDir" );
201 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Not Found" ) if !( -e "$trainFilePath" );
202 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Size = 0 bytes - No Data In Training File" ) if ( -z "$trainFilePath" );
203 0 0 0     0 return -1 if !( -e "$trainFilePath" ) || ( -z "$trainFilePath" );
204              
205             # Checks To See If Training Is Set To Use CBOW or Skip-Gram Model
206 0 0       0 $self->WriteLog( "ExecuteTraining - Attn: Continuous Bag Of Words Model = 0, Using Skip-Gram Model" ) if $self->GetUseCBOW() == 0;
207              
208             # Checks For Existing Output File And Returns -1 If Overwrite Option Is Not Enabled
209 0 0 0     0 $self->WriteLog( "ExecuteTraining - Warning: \"$outputFilePath\" Already Exists - Canceling Training" ) if ( -e "$outputFilePath" && $overwrite == 0 );
210 0 0 0     0 $self->WriteLog( "ExecuteTraining - Try Enabling \"Overwrite\" Option or Delete \"$outputFilePath\" In Working Directory" ) if ( -e "$outputFilePath" && $overwrite == 0 );
211 0 0 0     0 return -1 if ( -e "$outputFilePath" && $overwrite == 0 );
212              
213             # Fetch Other Training Parameters
214 0 0       0 $self->WriteLog( "ExecuteTraining - \"VectorSize\" Parameter Defined / Overriding Member Variable" ) if defined( $vectorSize );
215 0 0       0 $vectorSize = $self->GetWordVecSize() if !defined( $vectorSize );
216              
217 0 0       0 $self->WriteLog( "ExecuteTraining - \"WindowSize\" Parameter Defined / Overriding Member Variable" ) if defined( $windowSize );
218 0 0       0 $windowSize = $self->GetWindowSize() if !defined( $windowSize );
219              
220 0 0       0 $self->WriteLog( "ExecuteTraining - \"Min-Count\" Parameter Defined / Overriding Member Variable" ) if defined( $minCount );
221 0 0       0 $minCount = $self->GetMinCount() if !defined( $minCount );
222              
223 0 0       0 $self->WriteLog( "ExecuteTraining - \"Sample\" Parameter Defined / Overriding Member Variable" ) if defined( $sample );
224 0 0       0 $sample = $self->GetSample() if !defined( $sample );
225              
226 0 0       0 $self->WriteLog( "ExecuteTraining - \"Negative\" Parameter Defined / Overriding Member Variable" ) if defined( $negative );
227 0 0       0 $negative = $self->GetNegative() if !defined( $negative );
228              
229 0 0       0 $self->WriteLog( "ExecuteTraining - \"Alpha\" Parameter Defined / Overriding Member Variable" ) if defined( $alpha );
230 0 0       0 $alpha = $self->GetAlpha() if !defined( $alpha );
231              
232 0 0       0 $self->WriteLog( "ExecuteTraining - \"HSoftMax\" Parameter Defined / Overriding Member Variable" ) if defined( $hs );
233 0 0       0 $hs = $self->GetHSoftMax() if !defined( $hs );
234              
235 0 0       0 $self->WriteLog( "ExecuteTraining - \"Binary\" Parameter Defined / Overriding Member Variable" ) if defined( $binary );
236 0 0       0 $binary = $self->GetBinaryOutput() if !defined( $binary );
237              
238 0 0       0 $self->WriteLog( "ExecuteTraining - \"NumOfThreads\" Parameter Defined / Overriding Member Variable" ) if defined( $numOfThreads );
239 0 0       0 $numOfThreads = $self->GetNumOfThreads() if !defined( $numOfThreads );
240              
241 0 0       0 $self->WriteLog( "ExecuteTraining - \"Iterations\" Parameter Defined / Overriding Member Variable" ) if defined( $iterations );
242 0 0       0 $iterations = $self->GetNumOfIterations() if !defined( $iterations );
243              
244 0 0       0 $self->WriteLog( "ExecuteTraining - \"CBOW\" Parameter Defined / Overriding Member Variable" ) if defined( $useCBOW );
245 0 0       0 $useCBOW = $self->GetUseCBOW() if !defined( $useCBOW );
246              
247 0 0       0 $self->WriteLog( "ExecuteTraining - \"Classes\" Parameter Defined / Overriding Member Variable" ) if defined( $classes );
248 0 0       0 $classes = $self->GetClasses() if !defined( $classes );
249              
250 0 0       0 $self->WriteLog( "ExecuteTraining - \"ReadVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $readVocab );
251 0 0       0 $readVocab = $self->GetReadVocabFilePath() if !defined( $readVocab );
252              
253 0 0       0 $self->WriteLog( "ExecuteTraining - \"SaveVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $saveVocab );
254 0 0       0 $saveVocab = $self->GetSaveVocabFilePath() if !defined( $saveVocab );
255              
256 0 0       0 $self->WriteLog( "ExecuteTraining - \"Debug\" Parameter Defined / Overriding Member Variable" ) if defined( $debug );
257 0 0       0 $debug = $self->GetDebugTraining() if !defined( $debug );
258              
259             # Setting Up Command String
260 0         0 my $command = "\"$executableFileDir\" ";
261 0         0 $command .= ( "-train \"" . $trainFilePath . "\" " );
262 0         0 $command .= ( "-output \"" . $outputFilePath . "\" " );
263 0         0 $command .= ( "-size " . $vectorSize . " " );
264 0         0 $command .= ( "-window " . $windowSize . " " );
265 0         0 $command .= ( "-sample " . $sample . " " );
266 0         0 $command .= ( "-hs " . $hs . " " );
267 0         0 $command .= ( "-negative " . $negative . " " );
268 0         0 $command .= ( "-threads " . $numOfThreads . " " );
269 0         0 $command .= ( "-iter " . $iterations . " " );
270 0         0 $command .= ( "-min-count " . $minCount . " " );
271 0         0 $command .= ( "-alpha " . $alpha . " " );
272 0         0 $command .= ( "-classes " . $classes . " " );
273 0         0 $command .= ( "-binary " . $binary . " " );
274 0         0 $command .= ( "-cbow " . $useCBOW . " " );
275 0 0 0     0 $command .= ( "-read-vocab " . $readVocab . " " ) if ( defined( $readVocab ) && $readVocab ne "" );
276 0 0 0     0 $command .= ( "-save-vocab " . $saveVocab . " " ) if ( defined( $saveVocab ) && $saveVocab ne "" );
277 0         0 $command .= ( "-debug " . $debug . " " );
278              
279 0         0 $self->WriteLog( "Executing Command: $command" );
280              
281             # Execute External System Command To Train "word2vec"
282             # Execute command without capturing program output
283 0         0 my $result = system( "$command" );
284              
285 0         0 print "\n";
286              
287             # Post-Training Check(s)
288 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Unable To Spawn Executable File - Try Running '--clean' Command And Re-compile Executables" ) if ( $result == 65280 );
289              
290 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Does Not Exist" ) if !( -e "$outputFilePath" );
291 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Size = Zero" ) if ( -z "$outputFilePath" );
292 0 0 0     0 $result = -1 if ( !( -e "$outputFilePath" ) || ( -z "$outputFilePath" ) );
293              
294 0 0 0     0 $self->WriteLog( "ExecuteTraining - Training Successful" ) if $result == 0 && ( -e "$outputFilePath" );
295 0 0       0 $self->WriteLog( "ExecuteTraining - Training Unsuccessful" ) if $result != 0;
296              
297 0         0 return $result;
298             }
299              
300             sub ExecuteStringTraining
301             {
302 1     1 1 970 my ( $self, $trainingStr, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary,
303             $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
304              
305             # Check(s)
306 1 50       5 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Not Defined" ) if !defined( $trainingStr );
307 1 50       3 return -1 if !defined( $trainingStr );
308              
309 1 50       4 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Empty" ) if ( $trainingStr eq "" );
310 1 50       4 return -1 if ( $trainingStr eq "" );
311              
312             # Save Training String To Temporary File
313 1         2 my $result = 0;
314              
315 1         4 $self->WriteLog( "ExecuteStringTraining - Saving Training String To Temporary File At Working Directory: \"" . $self->GetWorkingDir() . "\"" );
316              
317 1         3 my $tempFilePath = $self->GetWorkingDir() . "/w2vtemp.txt";
318 1 50       72 open( my $fileHandle, ">:encoding(utf8)", "$tempFilePath" ) or $result = -1;
319              
320 1 50       53 $self->WriteLog( "ExecuteStringTraining - Error Creating File Handle : $!" ) if ( $result == -1 );
321 1 50       4 return -1 if ( $result == -1 );
322              
323             # Print Training String Data To File
324 1 50       46 print( $fileHandle "$trainingStr" ) if defined( $fileHandle );
325              
326 1         14 close( $fileHandle );
327 1         4 undef( $fileHandle );
328              
329 1         3 $self->WriteLog( "ExecuteStringTraining - Temporary Training String File Saved" );
330              
331 1         4 $result = $self->ExecuteTraining( $tempFilePath, $outputFilePath, $vectorSize, $windowSize,
332             $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads,
333             $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite );
334              
335 1         4 $self->WriteLog( "ExecuteStringTraining - Removing Temporary Training String Data File" );
336 1         47 unlink( $tempFilePath );
337              
338 1 50       5 $self->WriteLog( "ExecuteStringTraining - Finished" ) if ( $result == 0 );
339 1 50 33     7 $self->WriteLog( "ExecuteStringTraining - Finished With Errors" ) if ( $result == -1 && $self->GetWriteLog() == 0 );
340 1 50 33     6 $self->WriteLog( "ExecuteStringTraining - Finished With Errors / See Log File For Details" ) if ( $result == -1 && $self->GetWriteLog() == 1 ) ;
341              
342 1         6 return $result;
343             }
344              
345             sub ComputeCosineSimilarity
346             {
347 1     1 1 5 my ( $self, $wordA, $wordB ) = @_;
348              
349             # Check(s)
350 1 50 33     4 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
351 1 50       4 $self->WriteLog( "ComputeCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
352 1 50       4 return undef if ( $self->IsVectorDataInMemory() == 0 );
353              
354 0 0 0     0 $self->WriteLog( "ComputeCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
355 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
356              
357 0         0 $self->WriteLog( "ComputeCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
358              
359 0         0 my @wordAVtr = ();
360 0         0 my @wordBVtr = ();
361              
362              
363             # Search Dictionary For Specified Words
364 0         0 my $wordAData = $self->GetWordVector( $wordA );
365 0         0 my $wordBData = $self->GetWordVector( $wordB );
366 0 0       0 @wordAVtr = split( ' ', $wordAData ) if defined( $wordAData );
367 0 0       0 @wordBVtr = split( ' ', $wordBData ) if defined( $wordBData );
368              
369             # Post Search Check(s)
370 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordA\" Not In Dictionary" ) if @wordAVtr == 0;
371 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordB\" Not In Dictionary" ) if @wordBVtr == 0;
372 0 0 0     0 return undef if @wordAVtr == 0 || @wordBVtr == 0;
373              
374             # Remove Word From Vector To Compute Cosine Similarity Based On Vector Values
375 0         0 shift( @wordAVtr );
376 0         0 shift( @wordBVtr );
377 0         0 my $wordAVtrSize = @wordAVtr;
378 0         0 my $wordBVtrSize = @wordBVtr;
379              
380             # Check(s)
381 0 0       0 $wordAVtrSize = 0 if !defined( $wordAVtrSize );
382 0 0       0 $wordBVtrSize = 0 if !defined( $wordBVtrSize );
383              
384 0         0 $self->WriteLog( "ComputeCosineSimilarity - Words Present In Dictionary" );
385              
386             # Cosine Similarity => cos(angle) = -> ->
387             # A * B
388             # -------------------
389             # -> ->
390             # || A || * || B ||
391             #
392             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
393              
394 0         0 my $dpA = 0;
395 0         0 my $dpB = 0;
396 0         0 my $ldpA = 0;
397 0         0 my $ldpB = 0;
398 0         0 my $dpAB = 0;
399              
400             # Compute Dot Product Of VectorA
401 0         0 for my $value ( @wordAVtr )
402             {
403 0         0 $dpA += ( $value * $value );
404             }
405              
406             # Compute Dot Product Of VectorB
407 0         0 for my $value ( @wordBVtr )
408             {
409 0         0 $dpB += ( $value * $value );
410             }
411              
412             # Compute $ldpA & $ldpB
413 0         0 $ldpA = sqrt( $dpA );
414 0         0 $ldpB = sqrt( $dpB );
415              
416             # Compute Cosine Similarity Between Vector A & Vector B
417 0         0 for( my $i = 0; $i < $wordAVtrSize; $i++ )
418             {
419             # Compute Value If Not Dividing By Zero
420 0 0 0     0 $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
421             }
422              
423             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
424 0         0 return sprintf( "%.6f", $dpAB );
425             }
426              
427             sub ComputeAvgOfWordsCosineSimilarity
428             {
429 1     1 1 5 my ( $self, $wordA, $wordB ) = @_;
430              
431             # Check(s)
432 1 50 33     3 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
433 1 50       10 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
434 1 50       3 return undef if ( $self->IsVectorDataInMemory() == 0 );
435              
436 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
437 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
438              
439 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: One Or More Arguments Consisting Of Empty String" ) if ( $wordA eq "" || $wordB eq "" );
440 0 0 0     0 return undef if ( $wordA eq "" || $wordB eq "" );
441              
442              
443 0         0 my @wordAAry = split( ' ', $wordA );
444 0         0 my @wordBAry = split( ' ', $wordB );
445              
446             # Check(s)
447 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: One Or More Arguments Contains No Data" ) if ( @wordAAry == 0 || @wordBAry == 0 );
448 0 0 0     0 return undef if ( @wordAAry == 0 || @wordBAry == 0 );
449              
450 0         0 $wordA = $self->ComputeAverageOfWords( \@wordAAry );
451 0         0 $wordB = $self->ComputeAverageOfWords( \@wordBAry );
452              
453             # Check(s)
454 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordAAry\"" ) if !defined( $wordA );
455 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordBAry\"" ) if !defined( $wordB );
456 0 0 0     0 return undef if !defined( $wordA ) || !defined( $wordB );
457              
458 0         0 my @avgAVtr = split( ' ', $wordA );
459 0         0 my @avgBVtr = split( ' ', $wordB );
460 0         0 my $avgAVtrSize = @avgAVtr;
461 0         0 my $avgBVtrSize = @avgBVtr;
462              
463             # Check(s)
464 0 0       0 $avgAVtrSize = 0 if !defined( $avgAVtrSize );
465 0 0       0 $avgBVtrSize = 0 if !defined( $avgBVtrSize );
466              
467 0         0 undef( $wordA );
468 0         0 undef( $wordB );
469              
470             # Compute Cosine Similarity Between Word Averages
471              
472             # Cosine Similarity => cos(angle) = -> ->
473             # A * B
474             # -------------------
475             # -> ->
476             # || A || * || B ||
477             #
478             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
479              
480 0         0 my $dpA = 0;
481 0         0 my $dpB = 0;
482 0         0 my $ldpA = 0;
483 0         0 my $ldpB = 0;
484 0         0 my $dpAB = 0;
485              
486             # Compute Dot Product Of VectorA
487 0         0 for my $value ( @avgAVtr )
488             {
489 0         0 $dpA += ( $value * $value );
490             }
491              
492             # Compute Dot Product Of VectorB
493 0         0 for my $value ( @avgBVtr )
494             {
495 0         0 $dpB += ( $value * $value );
496             }
497              
498             # Compute $ldpA & $ldpB
499 0         0 $ldpA = sqrt( $dpA );
500 0         0 $ldpB = sqrt( $dpB );
501              
502             # Compute Cosine Similarity Between Vector A & Vector B
503 0         0 for( my $i = 0; $i < $avgAVtrSize; $i++ )
504             {
505             # Compute Value If Not Dividing By Zero
506 0 0 0     0 $dpAB += ( ( $avgAVtr[$i] / $ldpA ) * ( $avgBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
507             }
508              
509             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
510 0         0 return sprintf( "%.6f", $dpAB );
511             }
512              
513             sub ComputeMultiWordCosineSimilarity
514             {
515 2     2 1 7 my ( $self, $wordA, $wordB, $allWordsMustExist ) = @_;
516              
517             # Check(s)
518 2 50 33     5 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
519 2 50       8 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
520 2 50       4 return undef if ( $self->IsVectorDataInMemory() == 0 );
521              
522 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
523 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
524              
525 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Warning: \"All Words Must Exist\" Parameter Not Specified / Default = False" ) if !defined( $allWordsMustExist );
526 0 0       0 $allWordsMustExist = 0 if !defined( $allWordsMustExist );
527              
528 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
529              
530 0         0 my @wordAVtr = ();
531 0         0 my @wordBVtr = ();
532              
533              
534             # Split Words To Check For Existence In Dictionary
535 0         0 my @wordAAry = split( ' ', $wordA );
536 0         0 my @wordBAry = split( ' ', $wordB );
537 0         0 my $wordsFoundA = "";
538 0         0 my $wordsFoundB = "";
539              
540             # Check(s)
541 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: One Or More Arguments Contains No Data" ) if ( @wordAAry == 0 || @wordBAry == 0 );
542 0 0 0     0 return undef if ( @wordAAry == 0 || @wordBAry == 0 );
543              
544             # Search Dictionary For Specified Words
545 0         0 for my $word ( @wordAAry )
546             {
547 0         0 my $wordData = $self->GetWordVector( $word );
548              
549 0 0       0 if( defined( $wordData ) )
550             {
551 0         0 my @wordVtr = split( ' ', $wordData );
552 0         0 push( @wordAVtr, [ @wordVtr ] );
553 0         0 $wordsFoundA .= ( " " . $word );
554             }
555             }
556              
557 0         0 for my $word ( @wordBAry )
558             {
559 0         0 my $wordData = $self->GetWordVector( $word );
560              
561 0 0       0 if( defined( $wordData ) )
562             {
563 0         0 my @wordVtr = split( ' ', $wordData );
564 0         0 push( @wordBVtr, [ @wordVtr ] );
565 0         0 $wordsFoundB .= ( " " . $word );
566             }
567             }
568              
569              
570             # Post Search Check(s)
571 0         0 my $error = 0;
572 0         0 for( my $i = 0; $i < @wordAAry; $i++ )
573             {
574 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordAAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundA, $wordAAry[$i] ) == -1;
575 0 0 0     0 $error = 1 if index( $wordsFoundA, $wordAAry[$i] ) == -1 && $allWordsMustExist == 1;
576             }
577              
578 0         0 for( my $i = 0; $i < @wordBAry; $i++ )
579             {
580 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordBAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundB, $wordBAry[$i] ) == -1;
581 0 0 0     0 $error = 1 if index( $wordsFoundB, $wordBAry[$i] ) == -1 && $allWordsMustExist == 1;
582             }
583              
584 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Comparing Empty String / No Found Words" ) if ( $wordsFoundA eq "" || $wordsFoundB eq "" );
585 0 0 0     0 $error = 1 if ( $wordsFoundA eq "" || $wordsFoundB eq "" );
586              
587 0 0       0 return undef if $error != 0;
588              
589              
590 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Words Present In Dictionary" );
591              
592             # Remove Words From Word Vectors
593 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
594             {
595 0         0 my @tempAry = @{ $wordAVtr[$i] };
  0         0  
596 0         0 shift( @tempAry );
597 0         0 $wordAVtr[$i] = \@tempAry;
598             }
599              
600 0         0 for( my $i = 0; $i < @wordBVtr; $i++ )
601             {
602 0         0 my @tempAry = @{ $wordBVtr[$i] };
  0         0  
603 0         0 shift( @tempAry );
604 0         0 $wordBVtr[$i] = \@tempAry;
605             }
606              
607              
608             # Compute Sum Of Compound Words
609 0         0 my @wordASumAry = ();
610 0         0 my @wordBSumAry = ();
611              
612 0         0 my $wordVtrASize = @{ $wordAVtr[0] };
  0         0  
613 0         0 my $wordVtrBSize = @{ $wordBVtr[0] };
  0         0  
614              
615 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
616             {
617 0         0 my $value = 0;
618              
619 0         0 for my $aryRef ( @wordAVtr )
620             {
621 0         0 $value += $aryRef->[$i];
622             }
623              
624 0         0 push( @wordASumAry, $value );
625             }
626              
627 0         0 for( my $i = 0; $i < $wordVtrBSize; $i++ )
628             {
629 0         0 my $value = 0;
630              
631 0         0 for my $aryRef ( @wordBVtr )
632             {
633 0         0 $value += $aryRef->[$i];
634             }
635              
636 0         0 push( @wordBSumAry, $value );
637             }
638              
639              
640             # Cosine Similarity => cos(angle) = -> ->
641             # A * B
642             # -------------------
643             # -> ->
644             # || A || * || B ||
645             #
646             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
647              
648 0         0 my $dpA = 0;
649 0         0 my $dpB = 0;
650 0         0 my $ldpA = 0;
651 0         0 my $ldpB = 0;
652 0         0 my $dpAB = 0;
653              
654             # Compute Dot Product Of VectorA
655 0         0 for my $value ( @wordASumAry )
656             {
657 0         0 $dpA += ( $value * $value );
658             }
659              
660             # Compute Dot Product Of VectorB
661 0         0 for my $value ( @wordBSumAry )
662             {
663 0         0 $dpB += ( $value * $value );
664             }
665              
666             # Compute $ldpA & $ldpB
667 0         0 $ldpA = sqrt( $dpA );
668 0         0 $ldpB = sqrt( $dpB );
669              
670             # Compute Cosine Similarity Between Vector A & Vector B
671 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
672             {
673             # Compute Value If Not Dividing By Zero
674 0 0 0     0 $dpAB += ( ( $wordASumAry[$i] / $ldpA ) * ( $wordBSumAry[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
675             }
676              
677             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
678 0         0 return sprintf( "%.6f", $dpAB );
679             }
680              
# Computes the cosine similarity of two word vectors supplied as strings.
#
# Parameters:
#   $wordAData - whitespace-separated numeric components of the first vector
#   $wordBData - whitespace-separated numeric components of the second vector
#
# Returns:
#   The cosine similarity formatted with six decimal places ("%.6f"),
#   "0.000000" when either vector has zero magnitude, or undef when an
#   argument is missing/empty or the two vectors differ in length.
sub ComputeCosineSimilarityOfWordVectors
{
    my ( $self, $wordAData, $wordBData ) = @_;

    # Check(s)
    if( !defined( $wordAData ) || !defined( $wordBData ) )
    {
        $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    if( $wordAData eq "" || $wordBData eq "" )
    {
        $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: One Or More Word Vectors Consist Of No Data" );
        return undef;
    }

    $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Computing Cosine Similarity Of Word Vectors: \"$wordAData\" and \"$wordBData\"" );

    my @wordAVtr = split( ' ', $wordAData );
    my @wordBVtr = split( ' ', $wordBData );

    # Cosine similarity is only defined for vectors of the same dimension.
    # Reject mismatched input the same way the vector add/subtract methods do,
    # instead of reading past the end of the shorter vector.
    if( @wordAVtr != @wordBVtr )
    {
        $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: Cannot Compute Cosine Similarity / Vtr Sizes Not Equal" );
        return undef;
    }

    $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Words Present In Dictionary" );

    # Cosine Similarity => cos(angle) =      A . B
    #                                   -----------------
    #                                   || A || * || B ||
    #
    # Computed here as the dot product of the two unit-length vectors:
    # every component is divided by its own vector's magnitude first.

    my $dpA  = 0;
    my $dpB  = 0;
    my $dpAB = 0;

    # Dot Product Of Each Vector With Itself (Squared Magnitude)
    for my $value ( @wordAVtr )
    {
        $dpA += ( $value * $value );
    }

    for my $value ( @wordBVtr )
    {
        $dpB += ( $value * $value );
    }

    # Vector Magnitudes
    my $ldpA = sqrt( $dpA );
    my $ldpB = sqrt( $dpB );

    # A zero-magnitude vector has no direction; leave the similarity at 0
    # rather than dividing by zero.
    if( $ldpA != 0 && $ldpB != 0 )
    {
        for my $i ( 0 .. $#wordAVtr )
        {
            $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) );
        }
    }

    # Return Cosine Similarity Value Rounded To Six Decimal Places
    return sprintf( "%.6f", $dpAB );
}
749              
# Interactive console loop: reads lines from STDIN, treats the first two
# whitespace-separated tokens of each line as words and reports the value
# returned by ComputeCosineSimilarity() for them.
#
# The loop ends when the user enters "EXIT" or STDIN reaches end-of-file.
# Every message is passed to WriteLog(); it is additionally printed to the
# console when GetDebugLog() == 0.
#
# Returns undef when no vector data is in memory; otherwise returns nothing.
sub CosSimWithUserInput
{
    my ( $self ) = @_;

    # Check
    if( $self->IsVectorDataInMemory() == 0 )
    {
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if $self->GetDebugLog() == 0;
        $self->WriteLog( "CosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" );
        return undef;
    }

    my $prompt = "Input (Type \"EXIT\" to exit): ";

    $self->WriteLog( $prompt, 0 );
    print( $prompt ) if $self->GetDebugLog() == 0;

    while ( my $input = <STDIN> )
    {
        chomp( $input );
        return if $input eq "EXIT";

        my @wordAry = split( ' ', $input );

        # Fewer Than Two Words: Warn, Re-Prompt And Read The Next Line
        if( @wordAry < 2 )
        {
            $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" );
            $self->WriteLog( $prompt, 0 );

            # Print Data To Console When DebugLog == 0
            if( $self->GetDebugLog() == 0 )
            {
                print( "Warning: Requires two words for input - ex \"man woman\" \n" );
                print( $prompt );
            }

            next;
        }

        my $value = $self->ComputeCosineSimilarity( $wordAry[0], $wordAry[1] );
        $self->WriteLog( "Result: $value" ) if defined ( $value );
        $self->WriteLog( $prompt, 0 );

        # Print Data To Console When DebugLog == 0
        if( $self->GetDebugLog() == 0 )
        {
            print( "Error: One Or More Words Not Present In Dictionary\n" ) if !defined ( $value );
            print( "Result: $value\n" ) if defined ( $value );
            print( $prompt );
        }
    }

    return;
}
788              
# Interactive console loop for multi-word (compound) cosine similarity.
#
# Reads lines from STDIN. The first two whitespace-separated tokens of each
# line are the two terms to compare; within a term, ':' separates the
# individual words (each ':' is replaced with a space before the term is
# handed to ComputeMultiWordCosineSimilarity()).
#
# The loop ends when the user enters "EXIT" or STDIN reaches end-of-file.
# Every message is passed to WriteLog(); it is additionally printed to the
# console when GetDebugLog() == 0.
#
# Returns undef when no vector data is in memory; otherwise returns nothing.
sub MultiWordCosSimWithUserInput
{
    my ( $self ) = @_;

    # Check
    if( $self->IsVectorDataInMemory() == 0 )
    {
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if $self->GetDebugLog() == 0;
        $self->WriteLog( "MultiWordCosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" );
        return undef;
    }

    my $prompt = "Input (Type \"EXIT\" to exit): ";

    $self->WriteLog( $prompt, 0 );
    print( $prompt ) if $self->GetDebugLog() == 0;

    while ( my $input = <STDIN> )
    {
        chomp( $input );
        return if $input eq "EXIT";

        my @wordAry = split( ' ', $input );

        # Fewer Than Two Terms: Warn, Re-Prompt And Read The Next Line
        if( @wordAry < 2 )
        {
            $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" );
            $self->WriteLog( $prompt, 0 );

            # Print Data To Console When DebugLog == 0
            if( $self->GetDebugLog() == 0 )
            {
                print( "Warning: Requires two words for input - ex \"man woman\"\n" );
                print( $prompt );
            }

            next;
        }

        # Convert "word1:word2:..." Into "word1 word2 ..." For Each Term
        my $arg1  = join( ' ', split( ':', $wordAry[0] ) );
        my $arg2  = join( ' ', split( ':', $wordAry[1] ) );
        my $value = $self->ComputeMultiWordCosineSimilarity( $arg1, $arg2 );
        $self->WriteLog( "Result: $value" ) if defined ( $value );
        $self->WriteLog( $prompt, 0 );

        # Print Data To Console When DebugLog == 0
        if( $self->GetDebugLog() == 0 )
        {
            print( "Error: One Or More Words Not Present In Dictionary\n" ) if !defined ( $value );
            print( "Result: $value\n" ) if defined ( $value );
            print( $prompt );
        }
    }

    return;
}
831              
# Computes the element-wise average of the vectors of all words from the
# supplied list that are found by GetWordVector().
#
# Parameters:
#   $wordAryRef - reference to an array of words to average
#
# Returns:
#   A space-separated string of averaged vector components, or undef when
#   no vector data is in memory, the argument is not defined, or no result
#   vector could be built (e.g. none of the words were found).
#
# Four code paths exist, selected by GetSparseVectorMode() (dense/sparse
# vector data) and GetMinimizeMemoryUsage() (keep every found vector and
# combine afterwards / accumulate into the result while searching). The
# arithmetic of each path is kept as-is:
#   - dense, normal memory  : raw values are summed, divided by the number of
#                             stored vectors, then rounded to six decimals.
#   - every other path      : each value is rounded to six decimals before it
#                             is summed; the sums are divided and re-rounded
#                             only when more than one word was found.
sub ComputeAverageOfWords
{
    my ( $self, $wordAryRef ) = @_;

    # Check(s)
    if( $self->IsVectorDataInMemory() == 0 )
    {
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if $self->GetDebugLog() == 0;
        $self->WriteLog( "ComputeAverageOfWords - Error: Dictionary Is Empty / No Vector Data In Memory" );
        return undef;
    }

    if( !defined( $wordAryRef ) )
    {
        $self->WriteLog( "Error: Method Requires Array Reference Argument / Argument Not Defined" );
        return undef;
    }

    my @wordAry       = @{ $wordAryRef };
    my $denseMode     = ( $self->GetSparseVectorMode()    == 0 );
    my $normalMemory  = ( $self->GetMinimizeMemoryUsage() == 0 );

    my @foundWords    = ();
    my @foundWordData = ();     # Only Filled In Normal Memory Usage Mode
    my @resultAry     = ();
    my $wordDataSize  = 0;

    $self->WriteLog( "ComputeAverageOfWords - Locating Words In Vocabulary/Dictionary" );

    # Find Words
    for my $word ( @wordAry )
    {
        # Fetch Word From Vocabulary/Dictionary (Raw Sparse Text In Sparse Mode)
        my $result = $denseMode ? $self->GetWordVector( $word ) : $self->GetWordVector( $word, 1 );

        # Words Missing From The Vocabulary Contribute Nothing
        next if !defined( $result );

        # Store Found Word
        push( @foundWords, $word );

        # Dense Vector Data Algorithm
        if( $denseMode )
        {
            # First Element Is The Word Itself, The Rest Are Its Components
            my @wordData = split( ' ', $result );

            # Set Word Vector Length
            $wordDataSize = @wordData - 1 if $wordDataSize == 0;

            if( $normalMemory )
            {
                # Store Found Word Vector Data
                push( @foundWordData, [ @wordData ] ) if @wordData > 0;
            }
            else
            {
                # Create And Zero Fill The Result Vector If Not Already Done
                @resultAry = ( "0.000000" ) x $wordDataSize if ( @resultAry == 0 && $wordDataSize != 0 );

                # Accumulate Components Rounded To Six Decimal Places
                for my $i ( 1 .. $#wordData )
                {
                    $resultAry[$i-1] += sprintf( "%.6f", $wordData[$i] );
                }
            }
        }
        # Sparse Vector Data Algorithm
        elsif( $normalMemory )
        {
            # Store Found Word Vector Data
            push( @foundWordData, $self->ConvertRawSparseTextToVectorDataHash( $result ) );

            $wordDataSize = $self->GetVectorLength() if $wordDataSize == 0;
        }
        else
        {
            # Create And Zero Fill The Result Vector If Not Already Done.
            # Done only once a word has actually been found, so that "no
            # words found" yields undef like the other three modes instead
            # of an all-zero vector.
            @resultAry = ( "0.000000" ) x $self->GetVectorLength() if @resultAry == 0;

            my $wordData = $self->ConvertRawSparseTextToVectorDataHash( $result );

            # Copy Hash Element Data To Defined Array Indices (Hash Keys Are 1-Based)
            for my $key ( keys( %{ $wordData } ) )
            {
                $resultAry[$key-1] += sprintf( "%.6f", $wordData->{$key} );
            }
        }
    }

    $self->WriteLog( "ComputeAverageOfWords - Found: \"" . @foundWords . "\" Of \"" . @wordAry . "\" Words" );
    $self->WriteLog( "ComputeAverageOfWords - Computing Average Of Found Word(s): @foundWords" ) if @foundWords > 0;

    if( $normalMemory && $denseMode )
    {
        # Sum Values Of All Found Word Vectors / Dense Vector Format
        for my $i ( 0 .. $wordDataSize - 1 )
        {
            my $value = 0;

            # Index 0 Of Each Stored Vector Holds The Word, Hence The +1
            for my $aryRef ( @foundWordData )
            {
                $value += $aryRef->[$i+1];
            }

            # Compute Average And Round Decimal Places Greater Than Six
            push( @resultAry, sprintf( "%.6f", $value / scalar( @foundWordData ) ) );
        }
    }
    else
    {
        if( $normalMemory )
        {
            # Sum Values Of All Found Word Vectors / Sparse Vector Format
            @resultAry = ( "0.000000" ) x $wordDataSize;

            for my $hashRef ( @foundWordData )
            {
                for my $key ( keys( %{ $hashRef } ) )
                {
                    $resultAry[$key-1] += sprintf( "%.6f", $hashRef->{$key} );
                }
            }
        }

        # Compute Average Of All Result Vector Elements
        if( @foundWords > 1 )
        {
            for my $i ( 0 .. $#resultAry )
            {
                $resultAry[$i] = sprintf( "%.6f", $resultAry[$i] / scalar( @foundWords ) );
            }
        }
    }

    if( @resultAry == 0 )
    {
        $self->WriteLog( "ComputeAverageOfWords - Completed With Errors" );
        return undef;
    }

    $self->WriteLog( "ComputeAverageOfWords - Complete" );

    return join( ' ', @resultAry );
}
1061              
# Looks up two words with GetWordVector() and returns the element-wise sum
# of their vectors.
#
# Parameters:
#   $wordA, $wordB - the words whose vectors are added
#
# Returns:
#   A space-separated string of summed components, or undef when no vector
#   data is in memory, an argument is missing, either word is not found, or
#   the two fetched vectors differ in length.
sub AddTwoWords
{
    my ( $self, $wordA, $wordB ) = @_;

    # Nothing Can Be Looked Up Without Vector Data In Memory
    if( $self->IsVectorDataInMemory() == 0 )
    {
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if $self->GetDebugLog() == 0;
        $self->WriteLog( "AddTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" );
        return undef;
    }

    # Both Words Are Mandatory
    unless( defined( $wordA ) && defined( $wordB ) )
    {
        $self->WriteLog( "AddTwoWords - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    my $vectorTextA = $self->GetWordVector( $wordA );
    my $vectorTextB = $self->GetWordVector( $wordB );

    # Report Every Missing Word Before Giving Up
    my $missing = 0;

    if( !defined( $vectorTextA ) )
    {
        $self->WriteLog( "AddTwoWords - Error: \"$wordA\" Not In Dictionary" );
        $missing = 1;
    }

    if( !defined( $vectorTextB ) )
    {
        $self->WriteLog( "AddTwoWords - Error: \"$wordB\" Not In Dictionary" );
        $missing = 1;
    }

    return undef if $missing;

    # Leading Element Of Each List Is The Word Itself
    my ( $labelA, @lhs ) = split( ' ', $vectorTextA );
    my ( $labelB, @rhs ) = split( ' ', $vectorTextB );

    # Lengths Of The Numeric Parts Differ Exactly When The Full Lists Differ,
    # Except That An Empty List And A Word-Only List Both Have No Components
    my $sizeA = defined( $labelA ) ? @lhs + 1 : 0;
    my $sizeB = defined( $labelB ) ? @rhs + 1 : 0;

    if( $sizeA != $sizeB )
    {
        $self->WriteLog( "AddTwoWords - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" );
        return undef;
    }

    $self->WriteLog( "AddTwoWords - Adding Two Word Vectors" );

    my $resultStr = join( ' ', map { $lhs[$_] + $rhs[$_] } 0 .. $#lhs );

    $self->WriteLog( "AddTwoWords - Complete" );

    return $resultStr;
}
1108              
# Looks up two words with GetWordVector() and returns the element-wise
# difference of their vectors (vector of $wordA minus vector of $wordB).
#
# Parameters:
#   $wordA - word whose vector is the minuend
#   $wordB - word whose vector is the subtrahend
#
# Returns:
#   A space-separated string of component differences, or undef when no
#   vector data is in memory, an argument is missing, either word is not
#   found, or the two fetched vectors differ in length.
sub SubtractTwoWords
{
    my ( $self, $wordA, $wordB ) = @_;

    # Check(s)
    if( $self->IsVectorDataInMemory() == 0 )
    {
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if $self->GetDebugLog() == 0;
        $self->WriteLog( "SubtractTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" );
        return undef;
    }

    if( !defined ( $wordA ) || !defined ( $wordB ) )
    {
        $self->WriteLog( "SubtractTwoWords - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    my $wordAData = $self->GetWordVector( $wordA );
    my $wordBData = $self->GetWordVector( $wordB );

    # Log Every Missing Word Before Returning
    $self->WriteLog( "SubtractTwoWords - Error: \"$wordA\" Not In Dictionary" ) if !defined( $wordAData );
    $self->WriteLog( "SubtractTwoWords - Error: \"$wordB\" Not In Dictionary" ) if !defined( $wordBData );
    return undef if !defined( $wordAData ) || !defined( $wordBData );

    my @wordAVtr = split( ' ', $wordAData );
    my @wordBVtr = split( ' ', $wordBData );

    # More Check(s)
    if( @wordAVtr != @wordBVtr )
    {
        $self->WriteLog( "SubtractTwoWords - Cannot Subtract Two Word Vectors / Vtr Sizes Not Equal" );
        return undef;
    }

    # Remove Word From Word Vector (First Element)
    shift( @wordAVtr );
    shift( @wordBVtr );

    $self->WriteLog( "SubtractTwoWords - Subtracting Two Word Vectors" );

    my @resultVtr = ();

    for my $i ( 0 .. $#wordAVtr )
    {
        push( @resultVtr, $wordAVtr[$i] - $wordBVtr[$i] );
    }

    my $resultStr = join( ' ', @resultVtr );

    $self->WriteLog( "SubtractTwoWords - Complete" );

    return $resultStr;
}
1155              
# Adds two word vectors component by component.
#
# Parameters:
#   $wordA, $wordB - vectors given as whitespace-separated component strings
#
# Returns:
#   A space-separated string holding the sum of each component pair, or
#   undef when an argument is missing or the vectors differ in length.
sub AddTwoWordVectors
{
    my ( $self, $wordA, $wordB ) = @_;

    # Both Vector Strings Are Mandatory
    unless( defined( $wordA ) && defined( $wordB ) )
    {
        $self->WriteLog( "AddTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    my @lhs = split( ' ', $wordA );
    my @rhs = split( ' ', $wordB );

    # Component Counts Must Match
    if( scalar( @lhs ) != scalar( @rhs ) )
    {
        $self->WriteLog( "AddTwoWordVectors - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" );
        return undef;
    }

    $self->WriteLog( "AddTwoWordVectors - Adding Two Word Vectors" );

    my $resultStr = join( ' ', map { $lhs[$_] + $rhs[$_] } 0 .. $#lhs );

    $self->WriteLog( "AddTwoWordVectors - Complete" );

    return $resultStr;
}
1187              
# Subtracts one word vector from another component by component
# ($wordA minus $wordB).
#
# Parameters:
#   $wordA, $wordB - vectors given as whitespace-separated component strings
#
# Returns:
#   A space-separated string holding the difference of each component pair,
#   or undef when an argument is missing or the vectors differ in length.
sub SubtractTwoWordVectors
{
    my ( $self, $wordA, $wordB ) = @_;

    # Both Vector Strings Are Mandatory
    unless( defined( $wordA ) && defined( $wordB ) )
    {
        $self->WriteLog( "SubtractTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    my @minuend    = split( ' ', $wordA );
    my @subtrahend = split( ' ', $wordB );

    # Component Counts Must Match
    if( scalar( @minuend ) != scalar( @subtrahend ) )
    {
        $self->WriteLog( "SubtractTwoWordVectors - Cannot Subtract Two Word Vectors / Vtr Sizes Not Equal" );
        return undef;
    }

    $self->WriteLog( "SubtractTwoWordVectors - Subtracting Two Word Vectors" );

    my $resultStr = join( ' ', map { $minuend[$_] - $subtrahend[$_] } 0 .. $#minuend );

    $self->WriteLog( "SubtractTwoWordVectors - Complete" );

    return $resultStr;
}
1219              
# Computes the element-wise average of two word vectors.
#
# Parameters:
#   $wordA, $wordB - vectors given as whitespace-separated component strings
#
# Returns:
#   A space-separated string holding ( a + b ) / 2 for each component pair,
#   or undef when an argument is missing or the vectors differ in length.
sub AverageOfTwoWordVectors
{
    my ( $self, $wordA, $wordB ) = @_;

    # Check(s)
    if( !defined ( $wordA ) || !defined ( $wordB ) )
    {
        $self->WriteLog( "AverageOfTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" );
        return undef;
    }

    my @wordAVtr = split( ' ', $wordA );
    my @wordBVtr = split( ' ', $wordB );

    # More Check(s)
    if( @wordAVtr != @wordBVtr )
    {
        $self->WriteLog( "AverageOfTwoWordVectors - Cannot Compute Average Of Word Vectors / Vtr Sizes Not Equal" );
        return undef;
    }

    $self->WriteLog( "AverageOfTwoWordVectors - Averaging Two Word Vectors" );

    my @resultVtr = ();

    for my $i ( 0 .. $#wordAVtr )
    {
        # The average of two values is half their SUM (not half their difference)
        push( @resultVtr, ( $wordAVtr[$i] + $wordBVtr[$i] ) / 2 );
    }

    my $resultStr = join( ' ', @resultVtr );

    $self->WriteLog( "AverageOfTwoWordVectors - Complete" );

    return $resultStr;
}
1251              
# GetWordVector
#
# Looks up a word in the in-memory vocabulary and returns it together with
# its vector data as one string: "<word> <vector data>".
#
# Parameters:
#   $searchWord          - Vocabulary key to look up.
#   $returnRawSparseText - Optional flag.  ANY defined value (including 0)
#                          requests the stored sparse text unchanged when the
#                          module is in sparse vector mode.
#
# Returns:
#   The word followed by its vector data, or undef when no vector data is in
#   memory, the word is not in the vocabulary, or (sparse mode only) the
#   vector length is 0.
#
# In sparse vector mode the stored data is a flat list of "index value" pairs;
# unless raw text was requested it is expanded to a dense vector of
# GetVectorLength() elements, with "0.000000" in every unlisted position.
sub GetWordVector
{
    my ( $self, $searchWord, $returnRawSparseText ) = @_;

    # The flag is "was a third argument supplied", not its truthiness.
    $returnRawSparseText = defined( $returnRawSparseText ) ? 1 : 0;

    # Check(s)
    if( $self->IsVectorDataInMemory() == 0 )
    {
        # Only print to the console when debug logging is off.
        print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 );
        $self->WriteLog( "GetWordVector - Error: No Vector Data In Memory - Cannot Fetch Word Vector Data" );
        return undef;
    }

    my $wordVectorData = $self->GetVocabularyHash->{ $searchWord };

    if( !defined( $wordVectorData ) )
    {
        $self->WriteLog( "GetWordVector - Warning: \"$searchWord\" Not Found In Dictionary" );
        return undef;
    }

    # Dense Storage, Or Raw Sparse Text Requested: Return Stored Data As-Is
    return $searchWord . " " . $wordVectorData if ( $self->GetSparseVectorMode() != 1 );
    return $searchWord . " " . $wordVectorData if ( $returnRawSparseText == 1 );

    # Convert Sparse Format To Regular Format
    my $vectorSize = $self->GetVectorLength();

    if( $vectorSize == 0 )
    {
        $self->WriteLog( "GetWordVector - Error: Cannot Convert Sparse Data To Dense Format / Vector Length = 0 - Expects Vector Length >= 1" );
        return undef;
    }

    my @data = split( ' ', $wordVectorData );

    # Make Array Of Vector Size With All Zeros
    my @wordVector = ( "0.000000" ) x $vectorSize;

    # Walk the "index value" pairs.  Stepping by two replaces the former
    # "my $index = ... if ( $i % 2 == 0 )" construct, whose behaviour Perl
    # documents as undefined.  A trailing index without a value is ignored.
    for( my $i = 0; $i + 1 < @data; $i += 2 )
    {
        $wordVector[ $data[$i] ] = $data[$i+1];
    }

    # Standard Format Word Vector
    return $searchWord . " " . join( ' ', @wordVector );
}
1319              
# IsVectorDataInMemory
#
# Reports whether the vocabulary hash currently holds any entries.
#
# Returns:
#   1 when the hash returned by GetVocabularyHash() has at least one key,
#   0 otherwise.
sub IsVectorDataInMemory
{
    my ( $self ) = @_;

    # "keys" In Scalar Context Yields The Number Of Entries
    return ( scalar( keys %{ $self->GetVocabularyHash() } ) > 0 ) ? 1 : 0;
}
1329              
# IsWordOrCUIVectorData
#
# Guesses whether the vocabulary in memory is keyed by plain words or by
# CUI-style identifiers (the letter "c" followed only by digits, compared
# case-insensitively) by sampling one random vocabulary key.
#
# Returns:
#   "cui"  - the sampled key looks like a CUI.
#   "word" - the sampled key does not look like a CUI, or there are fewer
#            than three vocabulary entries to sample from.
#   undef  - no vector data is in memory.
sub IsWordOrCUIVectorData
{
    my ( $self ) = @_;

    # Check(s)
    if( $self->IsVectorDataInMemory() == 0 )
    {
        $self->WriteLog( "isWordOrCUIVectorData - Error: No Vector Vocabulary Data In Memory" );
        return undef;
    }

    my @vocabularyWords = sort( keys %{ $self->GetVocabularyHash() } );

    # The sample skips the first two sorted keys.  With fewer than three keys
    # nothing is left to sample; previously the index then pointed past the
    # end of the array and the undefined term fell through to "word" with
    # warnings.  Return the same answer explicitly.
    return "word" if ( @vocabularyWords < 3 );

    # Choose Random Word, Avoiding First Two Sorted Vocabulary Elements
    my $term = lc( $vocabularyWords[ 2 + int( rand( @vocabularyWords - 2 ) ) ] );

    # A CUI is "c" followed by digits only.  An anchored pattern replaces the
    # former split-on-"c" test, which also accepted terms with trailing "c"
    # characters (e.g. "c123c").
    return "cui" if ( $term =~ /\Ac[0-9]+\z/ );

    return "word";
}
1365              
# IsVectorDataSorted
#
# Checks whether a vocabulary hash carries the "sorted" marker in its header
# entry.  The header entry is keyed by GetNumberOfWords(); the data counts as
# sorted when that entry equals "<vector length> #$@RTED#".
#
# Parameters:
#   $aryRef - Optional hash reference to inspect.  When omitted, the hash
#             returned by GetVocabularyHash() is inspected.
#
# Returns:
#   1  - the marker is present.
#   0  - the marker is absent.
#   -1 - the inspected hash is empty.
sub IsVectorDataSorted
{
    my ( $self, $aryRef ) = @_;

    # Selecting the hash in one expression avoids "my $x = ... if COND",
    # whose behaviour Perl documents as undefined.
    my $vocabHashRef = defined( $aryRef ) ? $aryRef : $self->GetVocabularyHash();

    if( keys( %{ $vocabHashRef } ) == 0 )
    {
        $self->WriteLog( "IsVectorDataSorted - Error: No Vector Data In Memory" );
        return -1;
    }

    my $numOfWords   = $self->GetNumberOfWords();
    my $vectorLength = $self->GetVectorLength();
    my $headerEntry  = $vocabHashRef->{ $numOfWords };

    return 1 if defined( $headerEntry ) && $headerEntry eq "$vectorLength #\$\@RTED#";
    return 0;
}
1382              
# CheckWord2VecDataFileType
#
# Inspects the start of a word2vec vector file and classifies its format.
#
# Parameters:
#   $fileDir - Path of the vector data file.
#
# Returns:
#   "text"       - every inspected line has exactly <vector size> elements
#                  after the word.
#   "sparsetext" - text file in which at least one inspected line has a
#                  different number of elements.
#   "binary"     - the second line does not decode to a string of the same
#                  length as its raw bytes.
#   undef        - the path is undefined, does not exist, or cannot be opened.
#
# The first line is read as "<number of vectors> <vector size>".  The second
# line decides text versus binary.  For text files, further lines are scanned
# for the sparse check (up to 50 lines in total, or the header's vector count
# if that is smaller).
#
# NOTE(review): the text/binary test compares byte length with decoded
# character length, so a text line containing valid multi-byte UTF-8 would
# presumably be classified as "binary" - confirm whether that is intended.
sub CheckWord2VecDataFileType
{
    my ( $self, $fileDir ) = @_;

    # Check(s)
    if( !defined( $fileDir ) )
    {
        $self->WriteLog( "CheckWord2VecDataFileType - Error: File Path Not Defined" );
        return undef;
    }

    if( !( -e $fileDir ) )
    {
        $self->WriteLog( "CheckWord2VecDataFileType - Error: File Cannot Be Found / Does Not Exist" );
        return undef;
    }

    # The open mode string is kept exactly as the module has always used it.
    # A failed open now aborts; previously the error was only logged and the
    # code went on reading from the unopened handle and reported "text".
    my $fh;

    if( !open( $fh, "<:", "$fileDir" ) )
    {
        $self->WriteLog( "CheckWord2VecDataFileType - Error Opening File : $!" );
        return undef;
    }

    my $fileType          = "text";
    my $numOfWordVectors  = 0;
    my $sizeOfVectors     = 0;
    my $sparseVectorsFlag = 0;

    # First Line (Always Plain Text): Number Of Word Vectors And Vector Size
    my $headerLine    = <$fh>;
    my @dimensionsAry = defined( $headerLine ) ? split( ' ', $headerLine ) : ();

    if( @dimensionsAry >= 2 )
    {
        $numOfWordVectors = $dimensionsAry[0];
        $sizeOfVectors    = $dimensionsAry[1];
    }

    # Second Line: Determine Whether File Is Text Or Binary Format
    my $secondLine = <$fh>;

    if( defined( $secondLine ) )
    {
        # Encode::decode with FB_QUIET consumes its source string, so the
        # raw line is copied first.
        my $rawLine     = $secondLine;
        my $decodedLine = Encode::decode( "utf8", $secondLine, Encode::FB_QUIET );

        $fileType = "binary" if length( $rawLine ) != length( $decodedLine );

        # Check Second Line For Sparse Vector
        my @dataAry = split( ' ', $rawLine );
        $sparseVectorsFlag = 1 if ( @dataAry - 1 != $sizeOfVectors );
    }

    # Read A Couple Lines To Determine Whether Vectors Are 'Sparse' Or 'Full' Plain Vectors
    if( $fileType eq "text" )
    {
        # Capped at 50.  The former pair of "if" statements left this value
        # undefined when the header count was exactly 50, which skipped the
        # scan entirely.
        my $checkLength = ( $numOfWordVectors > 50 ) ? 50 : $numOfWordVectors;

        for( my $i = 0; $i < $checkLength - 2; $i++ )
        {
            my $data = <$fh>;
            last if !defined( $data );    # End Of File

            my @dataAry = split( ' ', $data );
            $sparseVectorsFlag = 1 if ( @dataAry - 1 != $sizeOfVectors );
        }

        $fileType = "sparsetext" if ( $sparseVectorsFlag == 1 );
    }

    close( $fh );

    return $fileType;
}
1455              
# ReadTrainedVectorDataFromFile
#
# Reads a trained word2vec vector file (plain text, sparse text or word2vec
# binary format, as reported by CheckWord2VecDataFileType) and either loads
# every vector into memory or searches the file for a single word.
#
# Parameters:
#   $fileDir    - Path of the vector data file.
#   $searchWord - Optional.  When defined, nothing is stored per word;
#                 the file is scanned for a line whose first element equals
#                 this word.
#
# Returns:
#   0        - load mode: the file was read.
#   <string> - search mode: the matching line, elements joined by one space
#              (the vocabulary hash is cleared before returning).
#   -1       - any failure: undefined/missing/empty/unopenable file, vector
#              data already in memory, unknown file format, malformed header
#              or sparse line, or (search mode) word not found.
#
# Side effects visible here: sets the number of words, the vector length and
# the sparse vector mode on the object, adds each line through
# AddWordVectorToVocabHash() in load mode, and prints a percentage progress
# indicator to standard output.
sub ReadTrainedVectorDataFromFile
{
    my ( $self, $fileDir, $searchWord ) = @_;

    # Check(s)
    # The path is validated before it is interpolated into any log message.
    if( !defined( $fileDir ) )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory Not Defined" );
        return -1;
    }

    $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading File \"$fileDir\"" );

    if( !( -e "$fileDir" ) )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory/File Does Not Exist" );
        return -1;
    }

    if( -z "$fileDir" )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Vector Data File Size = 0 bytes / File Contains No Data" );
        return -1;
    }

    if( $self->GetNumberOfWords() > 0 )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Module Already Contains Vector Training Data In Memory" );
        return -1;
    }

    if( defined( $searchWord ) )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Searching For Word \"$searchWord\" In Vector Data File \"$fileDir\"" );
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Warning: Vector Data Will Be Cleared From Memory After Search Is Complete" );
    }

    # Check To See If File Data Is Binary Or Text
    my $fileType = $self->CheckWord2VecDataFileType( $fileDir );

    if( !defined( $fileType ) )
    {
        $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Determine Vector Data Format" );
        return -1;
    }

    $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Plain Text Format\"" )         if $fileType eq "text";
    $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Sparse Vector Text Format\"" ) if $fileType eq "sparsetext";
    $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Word2Vec Binary Format\"" )    if $fileType eq "binary";

    my $isSparse = ( $fileType eq "sparsetext" ) ? 1 : 0;
    $self->WriteLog( "ReadTrainedVectorDataFromFile - Setting \"Sparse Vector Mode\" = True" ) if ( $isSparse == 1 );
    $self->SetSparseVectorMode( $isSparse );

    $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data" );


    # Read Trained Vector Data From File To Memory
    my $fileHandle;

    # Read Plain Text Data Format From File
    if( $fileType eq "text" )
    {
        my $lineCount = 0;

        if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
        {
            $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Open File \"$fileDir\" : $!" );
            return -1;
        }

        while( my $row = <$fileHandle> )
        {
            chomp $row;
            $row = lc( $row );

            # Progress Percent Indicator - Print Percentage Of File Loaded
            print( int( ( $lineCount / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );

            # Skip If Line Is Empty
            next if( length( $row ) == 0 );

            # First Line Holds Number Of Word Vectors And Vector Size
            if( $lineCount == 0 )
            {
                my @header = split( ' ', $row );

                if( @header < 2 )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords & VectorLength" );
                    close( $fileHandle );
                    return -1;
                }

                $self->SetNumberOfWords( $header[0] );
                $self->SetVectorLength( $header[1] );
            }

            # Search For Search Word And Return If Found
            if( defined( $searchWord ) )
            {
                my @data = split( ' ', $row );

                if( $data[0] eq $searchWord )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
                    $self->ClearVocabularyHash();
                    close( $fileHandle );
                    return join( ' ', @data );
                }
            }
            # Store Vector Data In Memory (The Header Line Is Stored As Well)
            else
            {
                $self->AddWordVectorToVocabHash( $row );
            }

            # Progress Percent Indicator - Return To Beginning Of Line
            print( "\r" ) if ( $self->GetNumberOfWords() > 0 );

            $lineCount++;
        }

        close( $fileHandle );
    }
    # Read Sparse Text Format From File
    elsif( $fileType eq "sparsetext" )
    {
        my $lineCount = 0;

        if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
        {
            $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Open File \"$fileDir\" : $!" );
            return -1;
        }

        while( my $row = <$fileHandle> )
        {
            chomp $row;
            $row = lc( $row );

            # Progress Percent Indicator - Print Percentage Of File Loaded
            print( int( ( $lineCount / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );

            # Skip If Line Is Empty
            next if( length( $row ) == 0 );

            # First Line Holds Number Of Word Vectors And Vector Size
            if( $lineCount == 0 )
            {
                my @header = split( ' ', $row );

                if( @header < 2 )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords" );
                    close( $fileHandle );
                    return -1;
                }

                my $numOfWordVectors = $header[0];
                my $vectorSize       = $header[1] - 1;

                $self->SetNumberOfWords( $numOfWordVectors );
                $self->SetVectorLength( $vectorSize + 1 );
            }
            else
            {
                my @data = split( ' ', $row );

                # A sparse line is "<word> <index> <value> <index> <value> ...",
                # so with the word included its element count must be odd.
                # An even count means an index is missing its value, e.g.
                #   "heart 1 0.002323 4 0.124342 16 0.005610 17"
                if( @data > 2 && @data % 2 == 0 )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Improper Sparse Vector Format - Index/Index Element Number Mis-Match" );
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Occured At Line #$lineCount: \"$row\"" );
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Clearing Vocabulary Array" );
                    $self->ClearVocabularyHash();
                    close( $fileHandle );    # Previously Left Open On This Path
                    return -1;
                }

                # The Word Must Be Present In The First Element
                if( !defined( $data[0] ) )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: First Element Of Data Array (Word) Not Defined - Line: $lineCount" );
                    close( $fileHandle );    # Previously Left Open On This Path
                    return -1;
                }
            }

            # Search For Search Word And Return If Found
            if( defined( $searchWord ) )
            {
                my @data = split( ' ', $row );

                if( $data[0] eq $searchWord )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
                    $self->ClearVocabularyHash();
                    close( $fileHandle );
                    return join( ' ', @data );
                }
            }
            # Store Vector Data In Memory (The Header Line Is Stored As Well)
            else
            {
                $self->AddWordVectorToVocabHash( $row );
            }

            # Progress Percent Indicator - Return To Beginning Of Line
            print( "\r" ) if ( $self->GetNumberOfWords() > 0 );

            $lineCount++;
        }

        close( $fileHandle );
    }
    # Read Word2Vec Binary Data Format From File
    elsif( $fileType eq "binary" )
    {
        # Three-argument open: the path is never interpreted as a mode or a
        # command, unlike the former two-argument form.
        if( !open( $fileHandle, '<', "$fileDir" ) )
        {
            $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Open File \"$fileDir\" : $!" );
            return -1;
        }

        binmode( $fileHandle );

        # Fetch "Number Of Words" and "Word Vector Size" From First Line
        my $row = <$fileHandle>;
        $row = "" if !defined( $row );
        chomp( $row );

        my @strAry = split( ' ', $row );

        # Check(s)
        # Covers an empty header line too.  This path used to execute "next"
        # outside of any loop or return an undefined value with the handle
        # still open; it now fails like the text formats do.
        if( @strAry < 2 )
        {
            $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords & VectorLength" );
            close( $fileHandle );
            return -1;
        }

        my $wordCount = $strAry[0];
        my $wordSize  = $strAry[1];
        my $count     = 1;

        $self->SetNumberOfWords( $wordCount );
        $self->SetVectorLength( $wordSize );

        # Add Word Count & Word Vector Size To Memory
        $self->AddWordVectorToVocabHash( "$row" );

        # Begin Fetching Data From File
        while( $count < $wordCount + 1 )
        {
            my $word           = "";
            my $wordVectorData = "";
            my $readingWord    = 1;

            # Progress Percent Indicator - Print Percentage Of File Loaded
            print( int( ( $count / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );

            # Fetch Word: Read Single Characters Up To The Separating Space
            while( $readingWord == 1 )
            {
                my $character = getc( $fileHandle );
                $character = "" if !defined( $character );

                # Drops the line feed that terminates the previous vector.
                chomp( $character );

                $word .= $character if $character ne " ";
                $readingWord = 0    if $character eq " ";

                # Check(s)
                if( eof( $fileHandle ) )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - ERROR: Unexpectedly Reached End Of File" );
                    $self->WriteLog( " Expected Word Count / Vector Size" );
                    $self->WriteLog( " $wordCount / $wordSize" );
                    $self->WriteLog( " Current Word Count" );
                    $self->WriteLog( " $count" );

                    # Ends Both The Word Loop And The Outer Loop
                    $count       = $wordCount + 1;
                    $readingWord = 0;
                }
            }

            # Fetch Word Vector Float Values
            for( my $i = 0; $i < $wordSize; $i++ )
            {
                # Read Specified Bytes Amount From File
                my $floatBytes = "";
                my $bytesRead  = read( $fileHandle, $floatBytes, 4 );    # Assumes size of floating point is 4 bytes

                # Stop on a short or failed read.  The raw bytes are NOT
                # chomp()ed: they are binary data, and stripping a trailing
                # 0x0A byte would leave too few bytes to unpack.
                last if !defined( $bytesRead ) || $bytesRead != 4;

                # Convert Binary Values To Float, Rounded At Sixth Decimal Place
                $wordVectorData .= ( " " . sprintf( "%.6f", unpack( "f", $floatBytes ) ) );
            }

            # Word Vector = Word + WordVectorData
            $word .= $wordVectorData;

            # Search For Search Word And Return If Found
            if( defined( $searchWord ) )
            {
                my @data = split( ' ', $word );

                if( $data[0] eq $searchWord )
                {
                    $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
                    $self->ClearVocabularyHash();
                    close( $fileHandle );
                    return join( ' ', @data );
                }
            }
            # Store Vector Data In Memory
            else
            {
                # Add Word Vector To Memory
                $self->AddWordVectorToVocabHash( $word ) if $word ne "";
            }

            $count++;

            # Progress Percent Indicator - Return To Beginning Of Line
            print( "\r" ) if ( $self->GetNumberOfWords() > 0 );
        }

        close( $fileHandle );
    }

    my $vocabHashRef  = $self->GetVocabularyHash();
    my $numberOfWords = defined( $vocabHashRef ) ? scalar( keys %{ $vocabHashRef } ) : 0;

    $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data Complete" );
    $self->WriteLog( "ReadTrainedVectorDataFromFile - $numberOfWords Word Vectors Stored In Memory" );

    # Used To Print New Line For Progress Percent Indicator
    print( "\n" );

    # Cannot Find Search Word In File
    return -1 if ( defined( $searchWord ) );

    return 0;
}
1786              
1787             sub SaveTrainedVectorDataToFile
1788             {
1789 3     3 1 9 my ( $self, $savePath, $saveFormat ) = @_;
1790              
1791             # Check(s)
1792 3 50       9 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Save Path Defined" ) if !defined( $savePath );
1793 3 50       7 return -1 if !defined ( $savePath );
1794              
1795 3 50       7 $saveFormat = 0 if !defined ( $saveFormat );
1796              
1797             # Save Data To File
1798 3         4 my $fileHandle;
1799              
1800             # Save Vector Data In Plain Text Format
1801 3 100       13 if ( $saveFormat == 0 )
    100          
    50          
1802             {
1803 1         6 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Text File: \"$savePath\"" );
1804              
1805 1 50       117 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1806 1         66 my $vocabHashRef = $self->GetVocabularyHash();
1807 1         4 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         7  
1808              
1809 1 50       4 if( $self->GetSparseVectorMode() == 1 )
1810             {
1811 0         0 my $numOfWords = $self->GetNumberOfWords();
1812 0         0 my $vectorSize = $self->GetVectorLength();
1813              
1814 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1815             {
1816             # Progress Percent Indicator - Print Percentage Of File Loaded
1817 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1818              
1819 0         0 my $wordVectorData = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1820              
1821             # Check(s)
1822 0 0       0 $self->WriteLog( "SaveTrainedVectorDataToFile - Warning: Word Vector Contains No Data / Empty String - Line: $i" ) if ( $wordVectorData eq "" );
1823 0 0       0 next if ( $wordVectorData eq "" );
1824              
1825 0 0       0 if( $i == 0 )
1826             {
1827 0         0 print( $fileHandle "$wordVectorData\n" )
1828             }
1829             else
1830             {
1831 0         0 my @data = split( ' ', $wordVectorData );
1832              
1833             # Get Word
1834 0         0 my $word = $data[0];
1835              
1836             # Make Array Of Vector Size With All Zeros
1837 0 0       0 my @wordVector = ( "0.000000" ) x $vectorSize if ( $vectorSize != 0 );
1838              
1839 0         0 for( my $j = 1; $j < @data; $j++ )
1840             {
1841             # If The Index ($i) Is Odd, Then The Element Is An Index
1842 0 0       0 my $index = $data[$j] if ( $j % 2 == 1 );
1843              
1844             # If The Index Is Defined, Then Next Element Is An Index Element
1845 0 0       0 my $element = $data[$j+1] if defined( $index );
1846              
1847             # Assign The Correct Index Element To The Specified Index
1848 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1849             }
1850              
1851             # Generate Regular Formatted Word Vector
1852 0         0 $word = $word . " " . join( ' ', @wordVector );
1853              
1854             # Print Dictionary/Vocabulary Vector Data To File
1855 0         0 print( $fileHandle "$word \n" );
1856              
1857             # Clear Array
1858 0         0 @data = ();
1859 0         0 @wordVector = ();
1860             }
1861              
1862             # Progress Percent Indicator - Return To Beginning Of Line
1863 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1864             }
1865             }
1866             else
1867             {
1868             # Get Number Of Word Vectors and Vector Array Size
1869 1         3 my $numOfWords = $self->GetNumberOfWords();
1870 1         4 my $vectorSize = $self->GetVectorLength();
1871              
1872             # Print Dictionary/Vocabulary Vector Data To File
1873 1         5 for( my $i = 0; $i < @dataAry; $i++ )
1874             {
1875             # Progress Percent Indicator - Print Percentage Of File Loaded
1876 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1877              
1878 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1879 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
1880 0 0       0 print( $fileHandle "$data \n" ) if ( $i > 0 );
1881              
1882             # Progress Percent Indicator - Return To Beginning Of Line
1883 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1884             }
1885             }
1886              
1887 1         12 close( $fileHandle );
1888 1         5 undef( $fileHandle );
1889              
1890 1         8 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1891             }
1892             # Save Vector Data In Word2Vec Binary Format
1893             elsif ( $saveFormat == 1 )
1894             {
1895 1         6 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Binary File: \"$savePath\"" );
1896              
1897             # Get Vocabulary and Vector Sizes
1898 1         3 my $vocabHashRef = $self->GetVocabularyHash();
1899 1         2 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         4  
1900              
1901             # Check(s)
1902 1 50       6 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Word2Vec Vector Data In Memory / Vocabulary Size == 0" ) if @dataAry == 0;
1903 1 50       6 return -1 if @dataAry == 0;
1904              
1905 0 0       0 open( $fileHandle, ">:raw", "$savePath" ) or return -1;
1906 0         0 binmode( $fileHandle ); # Not necessary as ":raw" implies binmode.
1907              
1908 0         0 my $headerStr = $dataAry[0] . " " . $vocabHashRef->{ $dataAry[0] };
1909 0         0 my @headerAry = split( ' ', $headerStr );
1910 0 0       0 return -1 if ( @headerAry < 2 );
1911              
1912 0         0 my $numOfWords = $headerAry[0];
1913 0         0 my $windowSize = $headerAry[1];
1914 0         0 @headerAry = ();
1915 0         0 undef( @headerAry );
1916              
1917             # Print Vocabulary and Windows Sizes To File With Line Feed
1918 0         0 print( $fileHandle "$headerStr\n" );
1919              
1920             # Print Word2Vec Vocabulary and Vector Data To File With Line Feed(s)
1921 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1922             {
1923             # Progress Percent Indicator - Print Percentage Of File Loaded
1924 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1925              
1926 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1927              
1928             # Check(s)
1929 0 0       0 next if ( $i == 0 );
1930              
1931             # Convert Sparse Vector Data To Dense Vector Format
1932 0 0       0 if ( $self->GetSparseVectorMode() == 1 )
1933             {
1934 0         0 my @tempAry = split( ' ', $data );
1935 0         0 my $word = $tempAry[0];
1936 0         0 @tempAry = ();
1937 0         0 @tempAry = @{ $self->ConvertRawSparseTextToVectorDataAry( $data ) };
  0         0  
1938 0         0 $data = "$word " . join( ' ', @tempAry );
1939 0         0 undef( @tempAry );
1940             }
1941              
1942 0         0 my @ary = split( ' ', $data );
1943 0 0       0 next if @ary < $windowSize;
1944              
1945             # Separate "Word" From "Vector Data"
1946 0         0 my $word = shift( @ary ) . " ";
1947 0         0 my $arySize = @ary;
1948              
1949             # Print Word To File
1950 0         0 print( $fileHandle $word );
1951              
1952             # Print Word Vector Data To File
1953 0         0 for my $value ( @ary )
1954             {
1955 0         0 print( $fileHandle pack( 'f', $value ) ); # Packs String Data In Decimal Binary Format
1956             }
1957              
1958             # Add Line Feed To End Of Word + Vector Data
1959 0         0 print( $fileHandle "\n" );
1960              
1961             # Progress Percent Indicator - Return To Beginning Of Line
1962 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1963             }
1964              
1965 0         0 close( $fileHandle );
1966 0         0 undef( $fileHandle );
1967              
1968 0         0 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1969             }
1970             # Save Vectors In Sparse Vector Format
1971             elsif ( $saveFormat == 2 )
1972             {
1973 1         5 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Sparse Text File: \"$savePath\"" );
1974              
1975 1 50       65 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1976 1         50 my $vocabHashRef = $self->GetVocabularyHash();
1977 1         3 my @dataAry = sort( keys( %{ $vocabHashRef } ) );
  1         5  
1978              
1979 1 50       3 if( $self->GetSparseVectorMode() == 1 )
1980             {
1981 0         0 for my $data ( @dataAry )
1982             {
1983 0         0 print( $fileHandle $data . " " . $vocabHashRef->{ $data } . "\n" );
1984             }
1985             }
1986             else
1987             {
1988             # Get Number Of Word Vectors and Vector Array Size
1989 1         4 my $numOfWords = $self->GetNumberOfWords();
1990 1         4 my $vectorSize = $self->GetVectorLength();
1991              
1992             # Print Dictionary/Vocabulary Vector Data To File
1993 1         4 for( my $i = 0; $i < @dataAry; $i++ )
1994             {
1995             # Progress Percent Indicator - Print Percentage Of File Saved
1996 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1997              
1998 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1999 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
2000              
2001 0 0 0     0 if( $i > 0 && defined( $data ) )
2002             {
2003 0         0 my @wordAry = split( ' ', $data );
2004              
2005 0         0 my $word = $wordAry[0];
2006              
2007             # Print The Vector Word To The File
2008 0         0 print( $fileHandle "$word" );
2009              
2010             # Print Vector Data To File
2011 0         0 for( my $j = 1; $j < @wordAry; $j++ )
2012             {
2013 0         0 my $index = $j - 1;
2014 0         0 my $value = $wordAry[$j];
2015 0 0       0 print( $fileHandle " $index $value" ) if ( $value != 0 );
2016             }
2017              
2018 0         0 print( $fileHandle " \n" );
2019             }
2020              
2021             # Progress Percent Indicator - Return To Beginning Of Line
2022 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
2023             }
2024             }
2025              
2026 1         9 close( $fileHandle );
2027 1         7 undef( $fileHandle );
2028              
2029 1         3 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
2030             }
2031              
2032             # Used To Print New Line For Progress Percent Indicator
2033 2         54 print( "\n" );
2034              
2035 2         16 return 0;
2036             }
2037              
2038             sub StringsAreEqual
2039             {
2040 2     2 1 7 my ( $self , $strA, $strB ) = @_;
2041              
2042 2         4 $strA = lc( $strA );
2043 2         4 $strB = lc( $strB );
2044              
2045 2 100       9 return 0 if length( $strA ) != length( $strB );
2046 1 50       7 return 0 if index( $strA, $strB ) != 0;
2047              
2048 1         4 return 1;
2049             }
2050              
2051             sub RemoveWordFromWordVectorString
2052             {
2053 3     3 1 15 my ( $self, $dataStr ) = @_;
2054              
2055             # Check(s)
2056 3 50       13 return undef if !defined( $dataStr );
2057              
2058             # Using shift On @tempAry Works As Well
2059 0         0 my @tempAry = split( ' ', $dataStr, 2 );
2060 0         0 $dataStr = $tempAry[1];
2061              
2062 0         0 undef( @tempAry );
2063              
2064 0         0 return $dataStr;
2065             }
2066              
2067             sub ConvertRawSparseTextToVectorDataAry
2068             {
2069 1     1 1 4 my ( $self, $rawSparseText ) = @_;
2070              
2071             # Check(s)
2072 1 50       4 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
2073 1 50       3 return () if !defined( $rawSparseText );
2074              
2075 1 50       4 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
2076 1 50       3 return () if ( $rawSparseText eq "" );
2077              
2078 1         5 my $vectorSize = $self->GetVectorLength();
2079              
2080 1 50       5 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
2081 1 50       33 return () if ( $vectorSize == 0 );
2082              
2083             # Begin Data Conversion
2084 0         0 my @data = split( ' ', $rawSparseText );
2085              
2086             # Make Array Of Vector Size With All Zeros
2087 0         0 my @wordVector = ( "0.000000" ) x $vectorSize;
2088              
2089 0         0 for( my $i = 0; $i < @data; $i++ )
2090             {
2091             # Skip First Element / First Element Contains Word
2092 0 0       0 next if $i == 0;
2093              
2094             # If The Index ($i) Is Odd, Then The Element Is An Index
2095 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
2096              
2097             # If The Index Is Defined, Then The Next Element Is The Value Stored At That Index
2098 0 0       0 my $element = $data[$i+1] if defined( $index );
2099              
2100             # Assign The Correct Index Element To The Specified Index
2101 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
2102             }
2103              
2104             # Clear Data
2105 0         0 undef( @data );
2106 0         0 @data = ();
2107 0         0 $rawSparseText = undef;
2108              
2109 0         0 return \@wordVector;
2110             }
2111              
2112             sub ConvertRawSparseTextToVectorDataHash
2113             {
2114 0     0 1 0 my ( $self, $rawSparseText ) = @_;
2115              
2116             # Check(s)
2117 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
2118 0 0       0 return () if !defined( $rawSparseText );
2119              
2120 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
2121 0 0       0 return () if ( $rawSparseText eq "" );
2122              
2123 0         0 my $vectorSize = $self->GetVectorLength();
2124              
2125 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
2126 0 0       0 return () if ( $vectorSize == 0 );
2127              
2128             # Begin Data Conversion
2129 0         0 my @data = split( ' ', $rawSparseText );
2130              
2131 0         0 my %wordHash;
2132              
2133 0         0 for( my $i = 0; $i < @data; $i++ )
2134             {
2135             # Skip First Element / First Element Contains Word
2136 0 0       0 next if $i == 0;
2137              
2138             # If The Index ($i) Is Odd, Then The Element Is An Index
2139 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
2140              
2141             # If The Index Is Defined, Then The Next Element Is The Value Stored At That Index
2142 0 0       0 my $element = $data[$i+1] if defined( $index );
2143              
2144             # Assign The Correct Index Element To The Specified Index
2145 0 0 0     0 $wordHash{$index} = $element if defined( $index ) && defined( $element );
2146             }
2147              
2148             # Clear Data
2149 0         0 undef( @data );
2150 0         0 @data = ();
2151 0         0 $rawSparseText = undef;
2152              
2153 0         0 return \%wordHash;
2154             }
2155              
2156             sub GetOSType
2157             {
2158 2     2 1 4 my ( $self ) = @_;
2159 2         10 return $^O;
2160             }
2161              
2162              
2163             ######################################################################################
2164             # Accessors
2165             ######################################################################################
2166              
2167             sub GetDebugLog
2168             {
2169 64     64 1 785 my ( $self ) = @_;
2170 64 50       123 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
2171 64         168 return $self->{ _debugLog };
2172             }
2173              
2174             sub GetWriteLog
2175             {
2176 54     54 1 82 my ( $self ) = @_;
2177 54 50       106 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
2178 54         135 return $self->{ _writeLog };
2179             }
2180              
2181             sub GetFileHandle
2182             {
2183 1     1 1 3 my ( $self ) = @_;
2184 1 50       4 $self->{ _fileHandle } = undef if !defined ( $self->{ _fileHandle } );
2185 1         4 return $self->{ _fileHandle };
2186             }
2187              
2188             sub GetTrainFilePath
2189             {
2190 2     2 1 4 my ( $self ) = @_;
2191 2 50       7 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
2192 2         7 return $self->{ _trainFileName };
2193             }
2194              
2195             sub GetOutputFilePath
2196             {
2197 2     2 1 3 my ( $self ) = @_;
2198 2 50       8 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
2199 2         7 return $self->{ _outputFileName };
2200             }
2201              
2202             sub GetWordVecSize
2203             {
2204 2     2 1 5 my ( $self ) = @_;
2205 2 50       7 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
2206 2         7 return $self->{ _wordVecSize };
2207             }
2208              
2209             sub GetWindowSize
2210             {
2211 2     2 1 6 my ( $self ) = @_;
2212 2 50       5 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
2213 2         7 return $self->{ _windowSize };
2214             }
2215              
2216             sub GetSample
2217             {
2218 2     2 1 6 my ( $self ) = @_;
2219 2 50       17 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
2220 2         12 return $self->{ _sample };
2221             }
2222              
2223             sub GetHSoftMax
2224             {
2225 2     2 1 5 my ( $self ) = @_;
2226 2 50       7 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
2227 2         10 return $self->{ _hSoftMax };
2228             }
2229              
2230             sub GetNegative
2231             {
2232 2     2 1 10 my ( $self ) = @_;
2233 2 50       7 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
2234 2         8 return $self->{ _negative };
2235             }
2236              
2237             sub GetNumOfThreads
2238             {
2239 2     2 1 4 my ( $self ) = @_;
2240 2 50       7 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
2241 2         6 return $self->{ _numOfThreads };
2242             }
2243              
2244             sub GetNumOfIterations
2245             {
2246 2     2 1 4 my ( $self ) = @_;
2247 2 50       6 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
2248 2         7 return $self->{ _numOfIterations };
2249             }
2250              
2251             sub GetMinCount
2252             {
2253 2     2 1 3 my ( $self ) = @_;
2254 2 50       7 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
2255 2         7 return $self->{ _minCount };
2256             }
2257              
2258             sub GetAlpha
2259             {
2260 3     3 1 7 my ( $self ) = @_;
2261 3 50 33     17 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 1 );
2262 3 50 33     9 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 0 );
2263 3         9 return $self->{ _alpha };
2264             }
2265              
2266             sub GetClasses
2267             {
2268 2     2 1 210 my ( $self ) = @_;
2269 2 50       7 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
2270 2         7 return $self->{ _classes };
2271             }
2272              
2273             sub GetDebugTraining
2274             {
2275 2     2 1 6 my ( $self ) = @_;
2276 2 50       7 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
2277 2         10 return $self->{ _debug };
2278             }
2279              
2280             sub GetBinaryOutput
2281             {
2282 2     2 1 6 my ( $self ) = @_;
2283 2 50       7 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
2284 2         17 return $self->{ _binaryOutput };
2285             }
2286              
2287             sub GetSaveVocabFilePath
2288             {
2289 2     2 1 5 my ( $self ) = @_;
2290 2 50       6 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
2291 2         9 return $self->{ _saveVocab };
2292             }
2293              
2294             sub GetReadVocabFilePath
2295             {
2296 2     2 1 5 my ( $self ) = @_;
2297 2 50       5 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
2298 2         9 return $self->{ _readVocab };
2299             }
2300              
2301             sub GetUseCBOW
2302             {
2303 2     2 1 3 my ( $self ) = @_;
2304 2 50       7 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
2305 2         7 return $self->{ _useCBOW };
2306             }
2307              
2308             sub GetWorkingDir
2309             {
2310 5     5 1 11 my ( $self ) = @_;
2311 5 50       11 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
2312 5         24 return $self->{ _workingDir };
2313             }
2314              
2315             sub GetWord2VecExeDir
2316             {
2317 5     5 1 13 my ( $self ) = @_;
2318 5 50       10 $self->{ _word2VecExeDir } = "" if !defined( $self->{ _word2VecExeDir } );
2319 5         14 return $self->{ _word2VecExeDir };
2320             }
2321              
2322             sub GetVocabularyHash
2323             {
2324 57     57 1 85 my ( $self ) = @_;
2325 57 50       116 $self->{ _hashRefOfWordVectors } = undef if !defined ( $self->{ _hashRefOfWordVectors } );
2326 57         110 return $self->{ _hashRefOfWordVectors };
2327             }
2328              
2329             sub GetOverwriteOldFile
2330             {
2331 4     4 1 10 my ( $self ) = @_;
2332 4 50       10 $self->{ _overwriteOldFile } = 0 if !defined ( $self->{ _overwriteOldFile } );
2333 4         9 return $self->{ _overwriteOldFile };
2334             }
2335              
2336             sub GetSparseVectorMode
2337             {
2338 4     4 0 8 my ( $self ) = @_;
2339 4 50       13 $self->{ _sparseVectorMode } = 0 if !defined ( $self->{ _sparseVectorMode } );
2340 4         43 return $self->{ _sparseVectorMode };
2341             }
2342              
2343             sub GetVectorLength
2344             {
2345 5     5 0 13 my ( $self ) = @_;
2346 5 50       18 $self->{ _vectorLength } = 0 if !defined ( $self->{ _vectorLength } );
2347 5         16 return $self->{ _vectorLength };
2348             }
2349              
2350             sub GetNumberOfWords
2351             {
2352 4     4 0 9 my ( $self ) = @_;
2353 4 50       14 $self->{ _numberOfWords } = 0 if !defined ( $self->{ _numberOfWords } );
2354 4         11 return $self->{ _numberOfWords };
2355             }
2356              
2357             sub GetMinimizeMemoryUsage
2358             {
2359 2     2 0 6 my ( $self ) = @_;
2360 2 50       8 $self->{ _minimizeMemoryUsage } = 1 if !defined ( $self->{ _minimizeMemoryUsage } );
2361 2         6 return $self->{ _minimizeMemoryUsage };
2362             }
2363              
2364              
2365             ######################################################################################
2366             # Mutators
2367             ######################################################################################
2368              
2369             sub SetTrainFilePath
2370             {
2371 2     2 1 5 my ( $self, $str ) = @_;
2372 2         4 return $self->{ _trainFileName } = $str;
2373             }
2374              
2375             sub SetOutputFilePath
2376             {
2377 2     2 1 5 my ( $self, $str ) = @_;
2378 2         3 return $self->{ _outputFileName } = $str;
2379             }
2380              
2381             sub SetWordVecSize
2382             {
2383 2     2 1 5 my ( $self, $value ) = @_;
2384 2         4 return $self->{ _wordVecSize } = $value;
2385             }
2386              
2387             sub SetWindowSize
2388             {
2389 2     2 1 5 my ( $self, $value ) = @_;
2390 2         5 return $self->{ _windowSize } = $value;
2391             }
2392              
2393             sub SetSample
2394             {
2395 2     2 1 5 my ( $self, $value ) = @_;
2396 2         4 return $self->{ _sample } = $value;
2397             }
2398              
2399             sub SetHSoftMax
2400             {
2401 2     2 1 6 my ( $self, $value ) = @_;
2402 2         6 return $self->{ _hSoftMax } = $value;
2403             }
2404              
2405             sub SetNegative
2406             {
2407 2     2 1 5 my ( $self, $value ) = @_;
2408 2         6 return $self->{ _negative } = $value;
2409             }
2410              
2411             sub SetNumOfThreads
2412             {
2413 2     2 1 4 my ( $self, $value ) = @_;
2414 2         4 return $self->{ _numOfThreads } = $value;
2415             }
2416              
2417             sub SetNumOfIterations
2418             {
2419 2     2 1 4 my ( $self, $value ) = @_;
2420 2         4 return $self->{ _numOfIterations } = $value;
2421             }
2422              
2423             sub SetMinCount
2424             {
2425 2     2 1 3 my ( $self, $value ) = @_;
2426 2         5 return $self->{ _minCount } = $value;
2427             }
2428              
2429             sub SetAlpha
2430             {
2431 2     2 1 7 my ( $self, $value ) = @_;
2432 2         4 return $self->{ _alpha } = $value;
2433             }
2434              
2435             sub SetClasses
2436             {
2437 2     2 1 5 my ( $self, $value ) = @_;
2438 2         4 return $self->{ _classes } = $value;
2439             }
2440              
2441             sub SetDebugTraining
2442             {
2443 1     1 1 3 my ( $self, $value ) = @_;
2444 1         2 return $self->{ _debug } = $value;
2445             }
2446              
2447             sub SetBinaryOutput
2448             {
2449 1     1 1 3 my ( $self, $value ) = @_;
2450 1         3 return $self->{ _binaryOutput } = $value;
2451             }
2452              
2453             sub SetSaveVocabFilePath
2454             {
2455 2     2 1 4 my ( $self, $str ) = @_;
2456 2         4 return $self->{ _saveVocab } = $str;
2457             }
2458              
2459             sub SetReadVocabFilePath
2460             {
2461 2     2 1 5 my ( $self, $str ) = @_;
2462 2         5 return $self->{ _readVocab } = $str;
2463             }
2464              
2465             sub SetUseCBOW
2466             {
2467 2     2 1 5 my ( $self, $value ) = @_;
2468 2         2 return $self->{ _useCBOW } = $value;
2469             }
2470              
2471             sub SetWorkingDir
2472             {
2473 2     2 1 6 my ( $self, $dir ) = @_;
2474 2         4 return $self->{ _workingDir } = $dir;
2475             }
2476              
2477             sub SetWord2VecExeDir
2478             {
2479 2     2 1 6 my ( $self, $dir ) = @_;
2480 2         4 return $self->{ _word2VecExeDir } = $dir;
2481             }
2482              
2483             sub SetVocabularyHash
2484             {
2485 1     1 1 3 my ( $self, $ref ) = @_;
2486 1 50       4 return if !defined( $ref );
2487 1         2 return $self->{ _hashRefOfWordVectors } = $ref;
2488             }
2489              
2490             sub ClearVocabularyHash
2491             {
2492 4     4 1 29 my ( $self ) = @_;
2493              
2494 4         17 $self->SetNumberOfWords( 0 );
2495 4         13 $self->SetVectorLength( 0 );
2496              
2497 4         7 undef( %{ $self->{ _hashRefOfWordVectors } } );
  4         9  
2498              
2499 4         7 my %hash;
2500 4         10 return $self->{ _hashRefOfWordVectors } = \%hash;
2501             }
2502              
2503             sub AddWordVectorToVocabHash
2504             {
2505 0     0 1 0 my ( $self, $wordVectorStr ) = @_;
2506 0 0       0 return if !defined( $wordVectorStr );
2507 0         0 my @tempAry = split( ' ', $wordVectorStr, 2 );
2508              
2509             # Check(s)
2510 0 0       0 return if !defined( $self->{ _hashRefOfWordVectors } );
2511 0 0       0 return if ( @tempAry != 2 );
2512              
2513 0         0 $self->{ _hashRefOfWordVectors }->{ $tempAry[0] } = $tempAry[1];
2514             }
2515              
2516             sub SetOverwriteOldFile
2517             {
2518 1     1 1 2 my ( $self, $temp ) = @_;
2519 1         3 return $self->{ _overwriteOldFile } = $temp;
2520             }
2521              
2522             sub SetSparseVectorMode
2523             {
2524 2     2 0 5 my ( $self, $temp ) = @_;
2525 2         4 return $self->{ _sparseVectorMode } = $temp;
2526             }
2527              
2528             sub SetVectorLength
2529             {
2530 6     6 0 9 my ( $self, $temp ) = @_;
2531 6         11 return $self->{ _vectorLength } = $temp;
2532             }
2533              
2534             sub SetNumberOfWords
2535             {
2536 6     6 0 12 my ( $self, $temp ) = @_;
2537 6         11 return $self->{ _numberOfWords } = $temp;
2538             }
2539              
2540             sub SetMinimizeMemoryUsage
2541             {
2542 2     2 0 4 my ( $self, $temp ) = @_;
2543 2 100       7 $self->WriteLog( "SetMinimalMemoryUsage - Normal Memory Mode Enabled" ) if ( $temp == 0 );
2544 2 100       5 $self->WriteLog( "SetMinimalMemoryUsage - Low Memory Mode Enabled" ) if ( $temp == 1 );
2545 2         4 return $self->{ _minimizeMemoryUsage } = $temp;
2546             }
2547              
2548              
2549             ######################################################################################
2550             # Debug Functions
2551             ######################################################################################
2552              
2553             sub GetTime
2554             {
2555 1     1 1 2 my ( $self ) = @_;
2556 1         60 my( $sec, $min, $hour ) = localtime();
2557              
2558 1 50       6 if( $hour < 10 )
2559             {
2560 1         3 $hour = "0$hour";
2561             }
2562              
2563 1 50       5 if( $min < 10 )
2564             {
2565 0         0 $min = "0$min";
2566             }
2567              
2568 1 50       3 if( $sec < 10 )
2569             {
2570 1         2 $sec = "0$sec";
2571             }
2572              
2573 1         5 return "$hour:$min:$sec";
2574             }
2575              
2576             sub GetDate
2577             {
2578 1     1 1 4 my ( $self ) = @_;
2579 1         12 my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime();
2580              
2581 1         3 $mon += 1;
2582 1         3 $year += 1900;
2583              
2584 1         4 return "$mon/$mday/$year";
2585             }
2586              
2587             sub WriteLog
2588             {
2589 51     51 1 89 my ( $self ) = shift;
2590 51         67 my $string = shift;
2591 51         64 my $printNewLine = shift;
2592              
2593 51 50       89 return if !defined ( $string );
2594 51 50       86 $printNewLine = 1 if !defined ( $printNewLine );
2595              
2596              
2597 51 50       91 if( $self->GetDebugLog() )
2598             {
2599 0 0       0 if( ref ( $self ) ne "Word2vec::Word2vec" )
2600             {
2601 0         0 print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2602 0         0 return;
2603             }
2604              
2605 0 0       0 $string = "" if !defined ( $string );
2606 0         0 print GetDate() . " " . GetTime() . " - Word2vec::$string";
2607 0 0       0 print "\n" if( $printNewLine != 0 );
2608             }
2609              
2610 51 50       95 if( $self->GetWriteLog() )
2611             {
2612 0 0         if( ref ( $self ) ne "Word2vec::Word2vec" )
2613             {
2614 0           print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2615 0           return;
2616             }
2617              
2618 0           my $fileHandle = $self->GetFileHandle();
2619              
2620 0 0         if( defined( $fileHandle ) )
2621             {
2622 0           print( $fileHandle GetDate() . " " . GetTime() . " - Word2vec::$string" );
2623 0 0         print( $fileHandle "\n" ) if( $printNewLine != 0 );
2624             }
2625             }
2626             }
2627              
2628             #################### All Modules Are To Output "1"(True) at EOF ######################
2629             1;
2630              
2631              
2632             =head1 NAME
2633              
2634             Word2vec::Word2vec - word2vec wrapper module.
2635              
2636             =head1 SYNOPSIS
2637              
2638             # Parameters: Enabled Debug Logging, Disabled Write Logging
2639             my $w2v = Word2vec::Word2vec->new( 1, 0 ); # Note: Specifying no parameters implies default settings.
2640              
2641             $w2v->SetTrainFilePath( "textCorpus.txt" );
2642             $w2v->SetOutputFilePath( "vectors.bin" );
2643             $w2v->SetWordVecSize( 200 );
2644             $w2v->SetWindowSize( 8 );
2645             $w2v->SetSample( 0.0001 );
2646             $w2v->SetNegative( 25 );
2647             $w2v->SetHSoftMax( 0 );
2648             $w2v->SetBinaryOutput( 0 );
2649             $w2v->SetNumOfThreads( 20 );
2650             $w2v->SetNumOfIterations( 12 );
2651             $w2v->SetUseCBOW( 1 );
2652             $w2v->SetOverwriteOldFile( 0 );
2653              
2654             $w2v->ExecuteTraining();
2655              
2656             undef( $w2v );
2657              
2658             # or
2659              
2660             use Word2vec::Word2vec;
2661              
2662             my $w2v = Word2vec::Word2vec->new(); # Note: Specifying no parameters implies default settings.
2663              
2664             $w2v->ExecuteTraining( $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative,
2665             $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab,
2666             $saveVocab, $debug, $overwrite );
2667              
2668             undef( $w2v );
2669              
2670             =head1 DESCRIPTION
2671              
2672             Word2vec::Word2vec is a word2vec package tool that trains text corpus data using the word2vec tool, provides multiple avenues for cosine
2673             similarity computation, manipulation of word vectors and conversion of word2vec's binary format to human readable text.
2674              
2675             =head2 Main Functions
2676              
2677             =head3 new
2678              
2679             Description:
2680              
2681             Returns a new "Word2vec::Word2vec" module object.
2682              
2683             Note: Specifying no parameters implies default options.
2684              
2685             Default Parameters:
2686             debugLog = 0
2687             writeLog = 0
2688             trainFileName = ""
2689             outputFileName = ""
2690             wordVecSize = 100
2691             sample = 0.001
2692             hSoftMax = 0
2693             negative = 5
2694             numOfThreads = 12
2695             numOfIterations = 5
2696             minCount = 5
2697             alpha = 0.05 (CBOW) or 0.025 (Skip-Gram)
2698             classes = 0
2699             debug = 2
2700             binaryOutput = 1
2701             saveVocab = ""
2702             readVocab = ""
2703             useCBOW = 1
2704             workingDir = Current Directory
2705             hashRefOfWordVectors = ()
2706             overwriteOldFile = 0
2707              
2708             Input:
2709              
2710             $debugLog -> Instructs module to print debug statements to the console. (1 = True / 0 = False)
2711             $writeLog -> Instructs module to print debug statements to a log file. (1 = True / 0 = False)
2712             $trainFileName -> Specifies the training text corpus file path. (String)
2713             $outputFileName -> Specifies the word2vec post training output file path. (String)
2714             $wordVecSize -> Specifies word2vec word vector parameter size. (Integer)
2715             $sample -> Specifies word2vec sample parameter value. (Float)
2716             $hSoftMax -> Specifies word2vec HSoftMax parameter value. (Integer)
2717             $negative -> Specifies word2vec negative parameter value. (Integer)
2718             $numOfThreads -> Specifies word2vec number of threads parameter value. (Integer)
2719             $numOfIterations -> Specifies word2vec number of iterations parameter value. (Integer)
2720             $minCount -> Specifies word2vec min-count parameter value. (Integer)
2721             $alpha -> Specifies word2vec alpha parameter value. (Float)
2722             $classes -> Specifies word2vec classes parameter value. (Integer)
2723             $debug -> Specifies word2vec debug training parameter value. (Integer: '0' = No Debug, '1' = Debug, '2' = Even more debug info)
2724             $binaryOutput -> Specifies word2vec binary output mode parameter value. (Integer: '1' = Binary, '0' = Plain Text)
2725             $saveVocab -> Specifies word2vec save vocabulary file path. (String)
2726             $readVocab -> Specifies word2vec read vocabulary file path. (String)
2727             $useCBOW -> Specifies word2vec CBOW algorithm parameter value. (Integer: '1' = CBOW, '0' = Skip-Gram)
2728             $workingDir -> Specifies module working directory. (String)
2729             $hashRefOfWordVectors -> Storage location for loaded word2vec trained vector data file in memory. (Hash)
2730             $overwriteOldFile -> Instructs the module whether to overwrite any existing data with the same output file name and path. ( '1' or '0' )
2731              
2732             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2733              
2734             Output:
2735              
2736             Word2vec::Word2vec object.
2737              
2738             Example:
2739              
2740             use Word2vec::Word2vec;
2741              
2742             my $w2v = Word2vec::Word2vec->new();
2743              
2744             undef( $w2v );
2745              
2746             =head3 DESTROY
2747              
2748             Description:
2749              
2750             Removes member variables and file handle from memory.
2751              
2752             Input:
2753              
2754             None
2755              
2756             Output:
2757              
2758             None
2759              
2760             Example:
2761              
2762             use Word2vec::Word2vec;
2763              
2764             my $w2v = Word2vec::Word2vec->new();
2765             $w2v->DESTROY();
2766              
2767             undef( $w2v );
2768              
2769             =head3 ExecuteTraining
2770              
2771             Executes word2vec training based on parameters. Parameter variables have higher precedence
2772             than member variables. Any parameter specified will override its respective member variable.
2773              
2774             Note: If no parameters are specified, this module executes word2vec training based on preset
2775             member variables. Returns a status value indicating whether training was successful.
2776              
2777             Input:
2778              
2779             $trainFilePath -> Specifies word2vec text corpus training file in a given path. (String)
2780             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2781             $vectorSize -> Size of word2vec word vectors. (Integer)
2782             $windowSize -> Maximum skip length between words. (Integer)
2783             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2784             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2785             $negative -> Number of negative examples. (Integer)
2786             $alpha -> Sets the starting learning rate. (Float)
2787             $hs -> Hierarchical Soft-max (Integer)
2788             $binary -> Save trained data as binary mode. (Integer)
2789             $numOfThreads -> Number of word2vec training threads. (Integer)
2790             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2791             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2792             $classes -> Output word classes rather than word vectors. (Integer)
2793             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2794             $saveVocab -> Save vocabulary to file path. (String)
2795             $debug -> Set word2vec debug mode. (Integer)
2796             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2797              
2798             Note: It is not recommended to specify all ExecuteTraining() parameters, as it has not been thoroughly tested.
2799              
2800             Output:
2801              
2802             $value -> '0' = Successful / '-1' = Un-successful
2803              
2804             Example:
2805              
2806             use Word2vec::Word2vec;
2807              
2808             my $w2v = Word2vec::Word2vec->new();
2809             $w2v->SetTrainFilePath( "textcorpus.txt" );
2810             $w2v->SetOutputFilePath( "vectors.bin" );
2811             $w2v->SetWordVecSize( 200 );
2812             $w2v->SetWindowSize( 8 );
2813             $w2v->SetSample( 0.0001 );
2814             $w2v->SetNegative( 25 );
2815             $w2v->SetHSoftMax( 0 );
2816             $w2v->SetBinaryOutput( 0 );
2817             $w2v->SetNumOfThreads( 20 );
2818             $w2v->SetNumOfIterations( 15 );
2819             $w2v->SetUseCBOW( 1 );
2820             $w2v->SetOverwriteOldFile( 0 );
2821             $w2v->ExecuteTraining();
2822              
2823             undef( $w2v );
2824              
2825             # or
2826              
2827             use Word2vec::Word2vec;
2828              
2829             my $w2v = Word2vec::Word2vec->new();
2830             $w2v->ExecuteTraining( "textcorpus.txt", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2831              
2832             undef( $w2v );
2833              
2834             =head3 ExecuteStringTraining
2835              
2836             Executes word2vec training based on parameters. Parameter variables have higher precedence
2837             than member variables. Any parameter specified will override its respective member variable.
2838              
2839             Note: If no parameters are specified, this module executes word2vec training based on preset
2840             member variables. Returns a status value indicating whether training was successful.
2841              
2842             Input:
2843              
2844             $trainingStr -> String to train with word2vec.
2845             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2846             $vectorSize -> Size of word2vec word vectors. (Integer)
2847             $windowSize -> Maximum skip length between words. (Integer)
2848             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2849             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2850             $negative -> Number of negative examples. (Integer)
2851             $alpha -> Sets the starting learning rate. (Float)
2852             $hs -> Hierarchical Soft-max (Integer)
2853             $binary -> Save trained data as binary mode. (Integer)
2854             $numOfThreads -> Number of word2vec training threads. (Integer)
2855             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2856             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2857             $classes -> Output word classes rather than word vectors. (Integer)
2858             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2859             $saveVocab -> Save vocabulary to file path. (String)
2860             $debug -> Set word2vec debug mode. (Integer)
2861             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2862              
2863             Note: It is not recommended to specify all ExecuteStringTraining() parameters, as it has not been thoroughly tested.
2864              
2865             Output:
2866              
2867             $value -> '0' = Successful / '-1' = Un-successful
2868              
2869             Example:
2870              
2871             use Word2vec::Word2vec;
2872              
2873             my $w2v = Word2vec::Word2vec->new();
2874             $w2v->SetOutputFilePath( "vectors.bin" );
2875             $w2v->SetWordVecSize( 200 );
2876             $w2v->SetWindowSize( 8 );
2877             $w2v->SetSample( 0.0001 );
2878             $w2v->SetNegative( 25 );
2879             $w2v->SetHSoftMax( 0 );
2880             $w2v->SetBinaryOutput( 0 );
2881             $w2v->SetNumOfThreads( 20 );
2882             $w2v->SetNumOfIterations( 15 );
2883             $w2v->SetUseCBOW( 1 );
2884             $w2v->SetOverwriteOldFile( 0 );
2885             $w2v->ExecuteStringTraining( "string to train here" );
2886              
2887             undef( $w2v );
2888              
2889             # or
2890              
2891             use Word2vec::Word2vec;
2892              
2893             my $w2v = Word2vec::Word2vec->new();
2894             $w2v->ExecuteStringTraining( "string to train here", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2895              
2896             undef( $w2v );
2897              
2898             =head3 ComputeCosineSimilarity
2899              
2900             Description:
2901              
2902             Computes cosine similarity between two words using trained word2vec vector data. Returns
2903             float value or undefined if one or more words are not in the dictionary.
2904              
2905             Note: Supports single words only and requires vector data to be in memory with ReadTrainedVectorDataFromFile() prior to function execution.
2906              
2907             Input:
2908              
2909             $string -> Single string word
2910             $string -> Single string word
2911              
2912             Output:
2913              
2914             $value -> Float or Undefined
2915              
2916             Example:
2917              
2918             use Word2vec::Word2vec;
2919              
2920             my $w2v = Word2vec::Word2vec->new();
2921             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2922             print "Cosine similarity between words: \"of\" and \"the\": " . $w2v->ComputeCosineSimilarity( "of", "the" ) . "\n";
2923              
2924             undef( $w2v );
2925              
2926             =head3 ComputeAvgOfWordsCosineSimilarity
2927              
2928             Description:
2929              
2930             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2931             Returns float value or undefined.
2932              
2933             Note: Supports multiple words concatenated by ' ' and requires vector data to be in memory prior
2934             to method execution. This method will not error out when a word is not located within the dictionary.
2935             It will take the average of all found words for each parameter then cosine similarity of both word vectors.
2936              
2937             Input:
2938              
2939             $string -> string of single or multiple words separated by ' ' (space).
2940             $string -> string of single or multiple words separated by ' ' (space).
2941              
2942             Output:
2943              
2944             $value -> Float or Undefined
2945              
2946             Example:
2947              
2948             use Word2vec::Word2vec;
2949              
2950             my $w2v = Word2vec::Word2vec->new();
2951             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2952             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2953             $w2v->ComputeAvgOfWordsCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2954              
2955             undef( $w2v );
2956              
2957             =head3 ComputeMultiWordCosineSimilarity
2958              
2959             Description:
2960              
2961             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2962              
2963             Note: Supports multiple words concatenated by ' ' (space) and requires vector data to be in memory prior to method execution.
2964             If $allWordsMustExist is set to true, this function will error out when a specified word is not found and return undefined.
2965              
2966             Input:
2967              
2968             $string -> string of single or multiple words separated by ' ' (space).
2969             $string -> string of single or multiple words separated by ' ' (space).
2970             $allWordsMustExist -> 1 = True, 0 or undef = False
2971              
2972             Output:
2973              
2974             $value -> Float or Undefined
2975              
2976             Example:
2977              
2978             use Word2vec::Word2vec;
2979              
2980             my $w2v = Word2vec::Word2vec->new();
2981             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2982             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2983             $w2v->ComputeMultiWordCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2984              
2985             undef( $w2v );
2986              
2987             =head3 ComputeCosineSimilarityOfWordVectors
2988              
2989             Description:
2990              
2991             Computes cosine similarity between two word vectors.
2992             Returns float value or undefined if one or more words are not in the dictionary.
2993              
2994             Note: Function parameters require actual word vector data with words removed.
2995              
2996             Input:
2997              
2998             $string -> string of word vector representation data separated by ' ' (space).
2999             $string -> string of word vector representation data separated by ' ' (space).
3000              
3001             Output:
3002              
3003             $value -> Float or Undefined
3004              
3005             Example:
3006              
3007             use Word2vec::Word2vec;
3008              
3009             my $word2vec = Word2vec::Word2vec->new();
3010             $word2vec->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3011             my $vectorAData = $word2vec->GetWordVector( "heart" );
3012             my $vectorBData = $word2vec->GetWordVector( "attack" );
3013              
3014             # Remove Words From Data
3015             $vectorAData = RemoveWordFromWordVectorString( $vectorAData );
3016             $vectorBData = RemoveWordFromWordVectorString( $vectorBData );
3017              
3018             print "Cosine similarity between words: \"heart\" and \"attack\": " .
3019             $word2vec->ComputeCosineSimilarityOfWordVectors( $vectorAData, $vectorBData ) . "\n";
3020              
3021             undef( $word2vec );
3022              
3023             =head3 CosSimWithUserInput
3024              
3025             Description:
3026              
3027             Computes cosine similarity between two words using trained word2vec vector data based on user input.
3028              
3029             Note: No compound word support.
3030              
3031             Warning: Requires vector data to be in memory prior to method execution.
3032              
3033             Input:
3034              
3035             None
3036              
3037             Output:
3038              
3039             None
3040              
3041             Example:
3042              
3043             use Word2vec::Word2vec;
3044              
3045             my $w2v = Word2vec::Word2vec->new();
3046             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3047             $w2v->CosSimWithUserInput();
3048              
3049             undef( $w2v );
3050              
3051             =head3 MultiWordCosSimWithUserInput
3052              
3053             Description:
3054              
3055             Computes cosine similarity between two words or compound words using trained word2vec vector data based on user input.
3056              
3057             Note: Supports multiple words concatenated by ':'.
3058              
3059             Warning: Requires vector data to be in memory prior to method execution.
3060              
3061             Input:
3062              
3063             None
3064              
3065             Output:
3066              
3067             None
3068              
3069             Example:
3070              
3071             use Word2vec::Word2vec;
3072              
3073             my $w2v = Word2vec::Word2vec->new();
3074             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3075             $w2v->MultiWordCosSimWithUserInput();
3076              
3077             undef( $w2v );
3078              
3079              
3080             =head3 ComputeAverageOfWords
3081              
3082             Description:
3083              
3084             Computes the average of the word vectors of all found words given an array reference parameter of
3085             plain text words. Returns average values (string) or undefined.
3086              
3087             Warning: Requires vector data to be in memory prior to method execution.
3088              
3089             Input:
3090              
3091             $arrayReference -> Array reference of words
3092              
3093             Output:
3094              
3095             $string -> String of word2vec word average values
3096              
3097             Example:
3098              
3099             use Word2vec::Word2vec;
3100              
3101             my $w2v = Word2vec::Word2vec->new();
3102             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3103             my $data = $w2v->ComputeAverageOfWords( [ "of", "the", "and" ] );
3104             print( "Computed Average Of Words: $data" ) if defined( $data );
3105              
3106             undef( $w2v );
3107              
3108             =head3 AddTwoWords
3109              
3110             Description:
3111              
3112             Adds two word vectors and returns the result.
3113              
3114             Warning: This method also requires vector data to be in memory prior to method execution.
3115              
3116             Input:
3117              
3118             $string -> Word to add
3119             $string -> Word to add
3120              
3121             Output:
3122              
3123             $string -> String of word2vec summed word values
3124              
3125             Example:
3126              
3127             use Word2vec::Word2vec;
3128              
3129             my $w2v = Word2vec::Word2vec->new();
3130             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3131              
3132             my $data = $w2v->AddTwoWords( "heart", "attack" );
3133             print( "Computed Sum Of Words: $data" ) if defined( $data );
3134              
3135             undef( $w2v );
3136              
3137             =head3 SubtractTwoWords
3138              
3139             Description:
3140              
3141             Subtracts two word vectors and returns the result.
3142              
3143             Warning: This method also requires vector data to be in memory prior to method execution.
3144              
3145             Input:
3146              
3147             $string -> Word to subtract
3148             $string -> Word to subtract
3149              
3150             Output:
3151              
3152             $string -> String of word2vec difference between word values
3153              
3154             Example:
3155              
3156             use Word2vec::Word2vec;
3157              
3158             my $w2v = Word2vec::Word2vec->new();
3159             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3160              
3161             my $data = $w2v->SubtractTwoWords( "king", "man" );
3162             print( "Computed Difference Of Words: $data" ) if defined( $data );
3163              
3164             undef( $w2v );
3165              
3166             =head3 AddTwoWordVectors
3167              
3168             Description:
3169              
3170             Adds two vector data strings and returns the result.
3171              
3172             Warning: Text word must be removed from vector data prior to calling this method. This method
3173             also requires vector data to be in memory prior to method execution.
3174              
3175             Input:
3176              
3177             $string -> Word2vec word vector data (with string word removed)
3178             $string -> Word2vec word vector data (with string word removed)
3179              
3180             Output:
3181              
3182             $string -> String of word2vec summed word values
3183              
3184             Example:
3185              
3186             use Word2vec::Word2vec;
3187              
3188             my $w2v = Word2vec::Word2vec->new();
3189             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3190             my $wordAData = $w2v->GetWordVector( "of" );
3191             my $wordBData = $w2v->GetWordVector( "the" );
3192              
3193             # Removing Words From Vector Data Array
3194             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3195             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3196              
3197             my $data = $w2v->AddTwoWordVectors( $wordAData, $wordBData );
3198             print( "Computed Sum Of Words: $data" ) if defined( $data );
3199              
3200             undef( $w2v );
3201              
3202             =head3 SubtractTwoWordVectors
3203              
3204             Description:
3205              
3206             Subtracts two vector data strings and returns the result.
3207              
3208             Warning: Text word must be removed from vector data prior to calling this method. This method
3209             also requires vector data to be in memory prior to method execution.
3210              
3211             Input:
3212              
3213             $string -> Word2vec word vector data (with string word removed)
3214             $string -> Word2vec word vector data (with string word removed)
3215              
3216             Output:
3217              
3218             $string -> String of word2vec difference between word values
3219              
3220             Example:
3221              
3222             use Word2vec::Word2vec;
3223              
3224             my $w2v = Word2vec::Word2vec->new();
3225             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3226             my $wordAData = $w2v->GetWordVector( "of" );
3227             my $wordBData = $w2v->GetWordVector( "the" );
3228              
3229             # Removing Words From Vector Data Array
3230             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3231             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3232              
3233             my $data = $w2v->SubtractTwoWordVectors( $wordAData, $wordBData );
3234             print( "Computed Difference Of Words: $data" ) if defined( $data );
3235              
3236             undef( $w2v );
3237              
3238             =head3 AverageOfTwoWordVectors
3239              
3240             Description:
3241              
3242             Computes the average of two vectors and returns the result.
3243              
3244             Warning: Text word must be removed from vector data prior to calling this method. This method
3245             also requires vector data to be in memory prior to method execution.
3246              
3247             Input:
3248              
3249             $string -> Word2vec word vector data (with string word removed)
3250             $string -> Word2vec word vector data (with string word removed)
3251              
3252             Output:
3253              
3254             $string -> String of word2vec average between word values
3255              
3256             Example:
3257              
3258             use Word2vec::Word2vec;
3259              
3260             my $w2v = Word2vec::Word2vec->new();
3261             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3262             my $wordAData = $w2v->GetWordVector( "of" );
3263             my $wordBData = $w2v->GetWordVector( "the" );
3264              
3265             # Removing Words From Vector Data Array
3266             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3267             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3268              
3269             my $data = $w2v->AverageOfTwoWordVectors( $wordAData, $wordBData );
3270             print( "Computed Average Of Words: $data" ) if defined( $data );
3271              
3272             undef( $w2v );
3273              
3274             =head3 GetWordVector
3275              
3276             Description:
3277              
3278             Searches dictionary in memory for the specified string argument and returns the vector data.
3279             Returns undefined if not found.
3280              
3281             Warning: Requires vector data to be in memory prior to method execution.
3282              
3283             Input:
3284              
3285             $string -> Word to locate in word2vec vocabulary/dictionary
3286              
3287             Output:
3288              
3289             $string -> Found word2vec word + word vector data or undefined.
3290              
3291             Example:
3292              
3293             use Word2vec::Word2vec;
3294              
3295             my $w2v = Word2vec::Word2vec->new();
3296             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3297             my $wordData = $w2v->GetWordVector( "of" );
3298             print( "Word2vec Word Data: $wordData\n" ) if defined( $wordData );
3299              
3300             undef( $w2v );
3301              
3302             =head3 IsVectorDataInMemory
3303              
3304             Description:
3305              
3306             Checks to see if vector data has been loaded in memory.
3307              
3308             Input:
3309              
3310             None
3311              
3312             Output:
3313              
3314             $value -> '1' = True / '0' = False
3315              
3316             Example:
3317              
3318             use Word2vec::Word2vec;
3319              
3320             my $w2v = Word2vec::Word2vec->new();
3321             my $result = $w2v->IsVectorDataInMemory();
3322              
3323             print( "No vector data in memory\n" ) if $result == 0;
3324             print( "Yes vector data in memory\n" ) if $result == 1;
3325              
3326             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3327             $result = $w2v->IsVectorDataInMemory();
3328             print( "No vector data in memory\n" ) if $result == 0;
3329             print( "Yes vector data in memory\n" ) if $result == 1;
3330              
3331             undef( $w2v );
3332              
3333             =head3 IsWordOrCUIVectorData
3334              
3335             Description:
3336              
3337             Checks to see if vector data consists of word or CUI terms.
3338              
3339             Input:
3340              
3341             None
3342              
3343             Output:
3344              
3345             $string -> 'cui', 'word' or undef
3346              
3347             Example:
3348              
3349             use Word2vec::Word2vec;
3350              
3351             my $w2v = Word2vec::Word2vec->new();
3352             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3353             my $isWordOrCUIData = $w2v->IsWordOrCUIVectorData();
3354              
3355             print( "Vector Data Consists Of \"$isWordOrCUIData\" Terms\n" ) if defined( $isWordOrCUIData );
3356             print( "Cannot Determine Type Of Terms\n" ) if !defined( $isWordOrCUIData );
3357              
3358             undef( $w2v );
3359              
3360             =head3 IsVectorDataSorted
3361              
3362             Description:
3363              
3364             Checks to see if vector data header is signed as sorted in memory.
3365              
3366             Input:
3367              
3368             None
3369              
3370             Output:
3371              
3372             $value -> '1' = True / '0' = False
3373              
3374             Example:
3375              
3376             use Word2vec::Word2vec;
3377              
3378             my $w2v = Word2vec::Word2vec->new();
3379             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3380              
3381             my $result = $w2v->IsVectorDataSorted();
3382              
3383             print( "No vector data is not sorted\n" ) if $result == 0;
3384             print( "Yes vector data is sorted\n" ) if $result == 1;
3385              
3386             undef( $w2v );
3387              
3388             =head3 CheckWord2VecDataFileType
3389              
3390             Description:
3391              
3392             Checks specified file to see if vector data is in binary or plain text format. Returns 'text'
3393             for plain text and 'binary' for binary data.
3394              
3395             Input:
3396              
3397             $string -> File path
3398              
3399             Output:
3400              
3401             $string -> File Type ( "text" = Plain text file / "binary" = Binary data file )
3402              
3403             Example:
3404              
3405             use Word2vec::Word2vec;
3406              
3407             my $w2v = Word2vec::Word2vec->new();
3408             my $fileType = $w2v->CheckWord2VecDataFileType( "samples/samplevectors.bin" );
3409              
3410             print( "FileType: $fileType\n" ) if defined( $fileType );
3411              
3412             undef( $w2v );
3413              
3414             =head3 ReadTrainedVectorDataFromFile
3415              
3416             Description:
3417              
3418             Reads trained vector data from the specified file path into memory, or searches the file for a specific word's vector data. This function supports and
3419             automatically detects word2vec binary, plain text and sparse vector data formats.
3420              
3421             Note: If search word is undefined, the entire vector file is loaded in memory. If a search word is defined only the vector data is returned or undef.
3422              
3423             Input:
3424              
3425             $string -> Word2vec trained vector data file path
3426             $searchWord -> Searches trained vector data file for specific word vector
3427              
3428             Output:
3429              
3430             $value -> '0' = Successful / '-1' = Un-successful
3431              
3432             Example:
3433              
3434             # Loading data in memory
3435             use Word2vec::Word2vec;
3436              
3437             my $w2v = Word2vec::Word2vec->new();
3438             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3439              
3440             print( "Success Loading Data\n" ) if $result == 0;
3441             print( "Un-successful, Data Not Loaded\n" ) if $result == -1;
3442              
3443             undef( $w2v );
3444              
3445             # or
3446              
3447             # Searching vector data file for a specific word vector
3448             use Word2vec::Word2vec;
3449              
3450             my $w2v = Word2vec::Word2vec->new();
3451             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin", "medical" );
3452              
3453             print( "Found Vector Data In File\n" ) if $result != -1;
3454             print( "Vector Data Not Found\n" ) if $result == -1;
3455              
3456             undef( $w2v );
3457              
3458             =head3 SaveTrainedVectorDataToFile
3459              
3460             Description:
3461              
3462             Saves trained vector data at the location specified. Setting the 'binaryFormat' parameter to '1' will
3463             save in word2vec's binary format.
3464              
3465             Input:
3466              
3467             $string -> Save Path
3468             $binaryFormat -> Integer ( '1' = Save data in word2vec binary format / '0' = Save as plain text )
3469              
3470             Note: Leaving $binaryFormat as undefined will save the file in plain text format.
3471              
3472             Warning: If the vector data is stored as a binary search tree, this method will error out gracefully.
3473              
3474             Output:
3475              
3476             $value -> '0' = Successful / '-1' = Un-successful
3477              
3478             Example:
3479              
3480             use Word2vec::Word2vec;
3481              
3482             my $w2v = Word2vec::Word2vec->new();
3483              
3484             # Instruct the module to store the method as an array, not a BST.
3485             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3486             $w2v->SaveTrainedVectorDataToFile( "samples/newvectors.bin" );
3487              
3488             undef( $w2v );
3489              
3490             =head3 StringsAreEqual
3491              
3492             Description:
3493              
3494             Compares two strings to check for equality, ignoring case-sensitivity.
3495              
3496             Note: This method is not case-sensitive, i.e. "string" equals "StRiNg".
3497              
3498             Input:
3499              
3500             $string -> String to compare
3501             $string -> String to compare
3502              
3503             Output:
3504              
3505             $value -> '1' = Strings are equal / '0' = Strings are not equal
3506              
3507             Example:
3508              
3509             use Word2vec::Word2vec;
3510              
3511             my $w2v = Word2vec::Word2vec->new();
3512             my $result = $w2v->StringsAreEqual( "hello world", "HeLlO wOrLd" );
3513              
3514             print( "Strings are equal!\n" ) if $result == 1;
3515             print( "Strings are not equal!\n" ) if $result == 0;
3516              
3517             undef( $w2v );
3518              
3519             =head3 RemoveWordFromWordVectorString
3520              
3521             Description:
3522              
3523             Given a vector data string as input, it removes the vector word from the string, returning only the vector data.
3524              
3525             Input:
3526              
3527             $string -> Vector word & data string.
3528              
3529             Output:
3530              
3531             $string -> Vector data string.
3532              
3533             Example:
3534              
3535             use Word2vec::Word2vec;
3536              
3537             my $w2v = Word2vec::Word2vec->new();
3538             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3539              
3540             my $vectorData = $w2v->RemoveWordFromWordVectorString( $str );
3541              
3542             print( "Success!\n" ) if length( $vectorData ) < length( $str );
3543              
3544             undef( $w2v );
3545              
3546             =head3 ConvertRawSparseTextToVectorDataAry
3547              
3548             Description:
3549              
3550             Converts sparse vector string to a dense vector format data array.
3551              
3552             Input:
3553              
3554             $string -> Vector data string.
3555              
3556             Output:
3557              
3558             $arrayReference -> Reference to array of vector data.
3559              
3560             Example:
3561              
3562             use Word2vec::Word2vec;
3563              
3564             my $w2v = Word2vec::Word2vec->new();
3565             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3566              
3567             my @vectorData = @{ $w2v->ConvertRawSparseTextToVectorDataAry( $str ) };
3568              
3569             print( "Data conversion successful!\n" ) if @vectorData > 0;
3570             print( "Data conversion un-successful!\n" ) if @vectorData == 0;
3571              
3572             undef( $w2v );
3573              
3574             =head3 ConvertRawSparseTextToVectorDataHash
3575              
3576             Description:
3577              
3578             Converts sparse vector string to a dense vector format data hash.
3579              
3580             Input:
3581              
3582             $string -> Vector data string.
3583              
3584             Output:
3585              
3586             $hashReference -> Reference to hash of vector data.
3587              
3588             Example:
3589              
3590             use Word2vec::Word2vec;
3591              
3592             my $w2v = Word2vec::Word2vec->new();
3593             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3594              
3595             my %vectorData = %{ $w2v->ConvertRawSparseTextToVectorDataHash( $str ) };
3596              
3597             print( "Data conversion successful!\n" ) if ( keys %vectorData ) > 0;
3598             print( "Data conversion un-successful!\n" ) if ( keys %vectorData ) == 0;
3599              
3600             undef( $w2v );
3601              
3602             =head3 GetOSType
3603              
3604             Description:
3605              
3606             Returns (string) operating system type.
3607              
3608             Input:
3609              
3610             None
3611              
3612             Output:
3613              
3614             $string -> Operating System String
3615              
3616             Example:
3617              
3618             use Word2vec::Word2vec;
3619              
3620             my $w2v = Word2vec::Word2vec->new();
3621             my $os = $w2v->GetOSType();
3622              
3623             print( "Operating System: $os\n" );
3624              
3625             undef( $w2v );
3626              
3627             =head2 Accessor Functions
3628              
3629             =head3 GetDebugLog
3630              
3631             Description:
3632              
3633             Returns the _debugLog member variable set during Word2vec::Word2vec object initialization of new function.
3634              
3635             Input:
3636              
3637             None
3638              
3639             Output:
3640              
3641             $value -> '0' = False, '1' = True
3642              
3643             Example:
3644              
3645             use Word2vec::Word2vec;
3646              
3647             my $w2v = Word2vec::Word2vec->new();
3648             my $debugLog = $w2v->GetDebugLog();
3649              
3650             print( "Debug Logging Enabled\n" ) if $debugLog == 1;
3651             print( "Debug Logging Disabled\n" ) if $debugLog == 0;
3652              
3653              
3654             undef( $w2v );
3655              
3656             =head3 GetWriteLog
3657              
3658             Description:
3659              
3660             Returns the _writeLog member variable set during Word2vec::Word2vec object initialization of new function.
3661              
3662             Input:
3663              
3664             None
3665              
3666             Output:
3667              
3668             $value -> '0' = False, '1' = True
3669              
3670             Example:
3671              
3672             use Word2vec::Word2vec;
3673              
3674             my $w2v = Word2vec::Word2vec->new();
3675             my $writeLog = $w2v->GetWriteLog();
3676              
3677             print( "Write Logging Enabled\n" ) if $writeLog == 1;
3678             print( "Write Logging Disabled\n" ) if $writeLog == 0;
3679              
3680             undef( $w2v );
3681              
3682             =head3 GetFileHandle
3683              
3684             Description:
3685              
3686             Returns the _fileHandle member variable set during Word2vec::Word2vec object instantiation of new function.
3687              
3688             Warning: This is a private function. File handle is used by WriteLog() method. Do not manipulate this file handle as errors can result.
3689              
3690             Input:
3691              
3692             None
3693              
3694             Output:
3695              
3696             $fileHandle -> Returns file handle for WriteLog() method or undefined.
3697              
3698             Example:
3699              
3700             use Word2vec::Word2vec;
3701              
3702             my $w2v = Word2vec::Word2vec->new();
3703             my $fileHandle = $w2v->GetFileHandle();
3704              
3705             undef( $w2v );
3706              
3707             =head3 GetTrainFilePath
3708              
3709             Description:
3710              
3711             Returns the _trainFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3712              
3713             Input:
3714              
3715             None
3716              
3717             Output:
3718              
3719             $string -> Returns word2vec training text corpus file path.
3720              
3721             Example:
3722              
3723             use Word2vec::Word2vec;
3724              
3725             my $w2v = Word2vec::Word2vec->new();
3726             my $filePath = $w2v->GetTrainFilePath();
3727             print( "Training File Path: $filePath\n" );
3728              
3729             undef( $w2v );
3730              
3731             =head3 GetOutputFilePath
3732              
3733             Description:
3734              
3735             Returns the _outputFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3736              
3737             Input:
3738              
3739             None
3740              
3741             Output:
3742              
3743             $string -> Returns post word2vec training output file path.
3744              
3745             Example:
3746              
3747             use Word2vec::Word2vec;
3748              
3749             my $w2v = Word2vec::Word2vec->new();
3750             my $filePath = $w2v->GetOutputFilePath();
3751             print( "File Path: $filePath\n" );
3752              
3753             undef( $w2v );
3754              
3755             =head3 GetWordVecSize
3756              
3757             Description:
3758              
3759             Returns the _wordVecSize member variable set during Word2vec::Word2vec object instantiation of new function.
3760              
3761             Input:
3762              
3763             None
3764              
3765             Output:
3766              
3767             $value -> Returns (integer) size of word2vec word vectors. Default value = 100
3768              
3769             Example:
3770              
3771             use Word2vec::Word2vec;
3772              
3773             my $w2v = Word2vec::Word2vec->new();
3774             my $value = $w2v->GetWordVecSize();
3775             print( "Word Vector Size: $value\n" );
3776              
3777             undef( $w2v );
3778              
3779             =head3 GetWindowSize
3780              
3781             Description:
3782              
3783             Returns the _windowSize member variable set during Word2vec::Word2vec object instantiation of new function.
3784              
3785             Input:
3786              
3787             None
3788              
3789             Output:
3790              
3791             $value -> Returns (integer) word2vec window size. Default value = 5
3792              
3793             Example:
3794              
3795             use Word2vec::Word2vec;
3796              
3797             my $w2v = Word2vec::Word2vec->new();
3798             my $value = $w2v->GetWindowSize();
3799             print( "Window Size: $value\n" );
3800              
3801             undef( $w2v );
3802              
3803             =head3 GetSample
3804              
3805             Description:
3806              
3807             Returns the _sample member variable set during Word2vec::Word2vec object instantiation of new function.
3808              
3809             Input:
3810              
3811             None
3812              
3813             Output:
3814              
3815             $value -> Returns (float) word2vec sample size. Default value = 0.001
3816              
3817             Example:
3818              
3819             use Word2vec::Word2vec;
3820              
3821             my $w2v = Word2vec::Word2vec->new();
3822             my $value = $w2v->GetSample();
3823             print( "Sample: $value\n" );
3824              
3825             undef( $w2v );
3826              
3827             =head3 GetHSoftMax
3828              
3829             Description:
3830              
3831             Returns the _hSoftMax member variable set during Word2vec::Word2vec object instantiation of new function.
3832              
3833             Input:
3834              
3835             None
3836              
3837             Output:
3838              
3839             $value -> Returns (integer) word2vec HSoftMax value. Default = 0
3840              
3841             Example:
3842              
3843             use Word2vec::Word2vec;
3844              
3845             my $w2v = Word2vec::Word2vec->new();
3846             my $value = $w2v->GetHSoftMax();
3847             print( "HSoftMax: $value\n" );
3848              
3849             undef( $w2v );
3850              
3851             =head3 GetNegative
3852              
3853             Description:
3854              
3855             Returns the _negative member variable set during Word2vec::Word2vec object instantiation of new function.
3856              
3857             Input:
3858              
3859             None
3860              
3861             Output:
3862              
3863             $value -> Returns (integer) word2vec negative value. Default = 5
3864              
3865             Example:
3866              
3867             use Word2vec::Word2vec;
3868              
3869             my $w2v = Word2vec::Word2vec->new();
3870             my $value = $w2v->GetNegative();
3871             print( "Negative: $value\n" );
3872              
3873             undef( $w2v );
3874              
3875             =head3 GetNumOfThreads
3876              
3877             Description:
3878              
3879             Returns the _numOfThreads member variable set during Word2vec::Word2vec object instantiation of new function.
3880              
3881             Input:
3882              
3883             None
3884              
3885             Output:
3886              
3887             $value -> Returns (integer) word2vec number of threads to use during training. Default = 12
3888              
3889             Example:
3890              
3891             use Word2vec::Word2vec;
3892              
3893             my $w2v = Word2vec::Word2vec->new();
3894             my $value = $w2v->GetNumOfThreads();
3895             print( "Number of threads: $value\n" );
3896              
3897             undef( $w2v );
3898              
3899             =head3 GetNumOfIterations
3900              
3901             Description:
3902              
3903             Returns the _iterations member variable set during Word2vec::Word2vec object instantiation of new function.
3904              
3905             Input:
3906              
3907             None
3908              
3909             Output:
3910              
3911             $value -> Returns (integer) number of word2vec training iterations. Default = 5
3912              
3913             Example:
3914              
3915             use Word2vec::Word2vec;
3916              
3917             my $w2v = Word2vec::Word2vec->new();
3918             my $value = $w2v->GetNumOfIterations();
3919             print( "Number of iterations: $value\n" );
3920              
3921             undef( $w2v );
3922              
3923             =head3 GetMinCount
3924              
3925             Description:
3926              
3927             Returns the _minCount member variable set during Word2vec::Word2vec object instantiation of new function.
3928              
3929             Input:
3930              
3931             None
3932              
3933             Output:
3934              
3935             $value -> Returns (integer) word2vec min-count value. Default = 5
3936              
3937             Example:
3938              
3939             use Word2vec::Word2vec;
3940              
3941             my $w2v = Word2vec::Word2vec->new();
3942             my $value = $w2v->GetMinCount();
3943             print( "Min Count: $value\n" );
3944              
3945             undef( $w2v );
3946              
3947             =head3 GetAlpha
3948              
3949             Description:
3950              
3951             Returns the _alpha member variable set during Word2vec::Word2vec object instantiation of new function.
3952              
3953             Input:
3954              
3955             None
3956              
3957             Output:
3958              
3959             $value -> Returns (float) word2vec alpha value. Default = 0.05 for CBOW and 0.025 for Skip-Gram.
3960              
3961             Example:
3962              
3963             use Word2vec::Word2vec;
3964              
3965             my $w2v = Word2vec::Word2vec->new();
3966             my $value = $w2v->GetAlpha();
3967             print( "Alpha: $value\n" );
3968              
3969             undef( $w2v );
3970              
3971             =head3 GetClasses
3972              
3973             Description:
3974              
3975             Returns the _classes member variable set during Word2vec::Word2vec object instantiation of new function.
3976              
3977             Input:
3978              
3979             None
3980              
3981             Output:
3982              
3983             $value -> Returns (integer) word2vec classes value. Default = 0
3984              
3985             Example:
3986              
3987             use Word2vec::Word2vec;
3988              
3989             my $w2v = Word2vec::Word2vec->new();
3990             my $value = $w2v->GetClasses();
3991             print( "Classes: $value\n" );
3992              
3993             undef( $w2v );
3994              
3995             =head3 GetDebugTraining
3996              
3997             Description:
3998              
3999             Returns the _debug member variable set during Word2vec::Word2vec object instantiation of new function.
4000              
4001             Note: 0 = No debug output, 1 = Enable debug output, 2 = Even more debug output
4002              
4003             Input:
4004              
4005             None
4006              
4007             Output:
4008              
4009             $value -> Returns (integer) word2vec debug value. Default = 2
4010              
4011             Example:
4012              
4013             use Word2vec::Word2vec;
4014              
4015             my $w2v = Word2vec::Word2vec->new();
4016             my $value = $w2v->GetDebugTraining();
4017             print( "Debug: $value\n" );
4018              
4019             undef( $w2v );
4020              
4021             =head3 GetBinaryOutput
4022              
4023             Description:
4024              
4025             Returns the _binaryOutput member variable set during Word2vec::Word2vec object instantiation of new function.
4026              
4027             Note: 1 = Save trained vector data in binary format, 0 = Save trained vector data in plain text format.
4028              
4029             Input:
4030              
4031             None
4032              
4033             Output:
4034              
4035             $value -> Returns (integer) word2vec binary flag. Default = 0
4036              
4037             Example:
4038              
4039             use Word2vec::Word2vec;
4040              
4041             my $w2v = Word2vec::Word2vec->new();
4042             my $value = $w2v->GetBinaryOutput();
4043             print( "Binary Output: $value\n" );
4044              
4045             undef( $w2v );
4046              
4047             =head3 GetReadVocabFilePath
4048              
4049             Description:
4050              
4051             Returns the _readVocab member variable set during Word2vec::Word2vec object instantiation of new function.
4052              
4053             Input:
4054              
4055             None
4056              
4057             Output:
4058              
4059             $string -> Returns (string) word2vec read vocabulary file name or empty string if not set.
4060              
4061             Example:
4062              
4063             use Word2vec::Word2vec;
4064              
4065             my $w2v = Word2vec::Word2vec->new();
4066             my $str = $w2v->GetReadVocabFilePath();
4067             print( "Read Vocab File Path: $str\n" );
4068              
4069             undef( $w2v );
4070              
4071             =head3 GetSaveVocabFilePath
4072              
4073             Description:
4074              
4075             Returns the _saveVocab member variable set during Word2vec::Word2vec object instantiation of new function.
4076              
4077             Input:
4078              
4079             None
4080              
4081             Output:
4082              
4083             $string -> Returns (string) word2vec save vocabulary file name or empty string if not set.
4084              
4085             Example:
4086              
4087             use Word2vec::Word2vec;
4088              
4089             my $w2v = Word2vec::Word2vec->new();
4090             my $str = $w2v->GetSaveVocabFilePath();
4091             print( "Save Vocab File Path: $str\n" );
4092              
4093             undef( $w2v );
4094              
4095             =head3 GetUseCBOW
4096              
4097             Description:
4098              
4099             Returns the _useCBOW member variable set during Word2vec::Word2vec object instantiation of new function.
4100              
4101             Note: 0 = Skip-Gram Model, 1 = Continuous Bag Of Words Model.
4102              
4103             Input:
4104              
4105             None
4106              
4107             Output:
4108              
4109             $value -> Returns (integer) word2vec Continuous-Bag-Of-Words flag. Default = 1
4110              
4111             Example:
4112              
4113             use Word2vec::Word2vec;
4114              
4115             my $w2v = Word2vec::Word2vec->new();
4116             my $value = $w2v->GetUseCBOW();
4117             print( "Use CBOW?: $value\n" );
4118              
4119             undef( $w2v );
4120              
4121             =head3 GetWorkingDir
4122              
4123             Description:
4124              
4125             Returns the _workingDir member variable set during Word2vec::Word2vec object instantiation of new function.
4126              
4127             Input:
4128              
4129             None
4130              
4131             Output:
4132              
4133             $value -> Returns (string) working directory path or current directory if not specified.
4134              
4135             Example:
4136              
4137             use Word2vec::Word2vec;
4138              
4139             my $w2v = Word2vec::Word2vec->new();
4140             my $str = $w2v->GetWorkingDir();
4141             print( "Working Directory: $str\n" );
4142              
4143             undef( $w2v );
4144              
4145             =head3 GetWord2VecExeDir
4146              
4147             Description:
4148              
4149             Returns the _word2VecExeDir member variable set during Word2vec::Word2vec object instantiation of new function.
4150              
4151             Input:
4152              
4153             None
4154              
4155             Output:
4156              
4157             $value -> Returns (string) word2vec executable directory path or empty string if not specified.
4158              
4159             Example:
4160              
4161             use Word2vec::Word2vec;
4162              
4163             my $w2v = Word2vec::Word2vec->new();
4164             my $str = $w2v->GetWord2VecExeDir();
4165             print( "Word2Vec Executable File Directory: $str\n" );
4166              
4167             undef( $w2v );
4168              
4169             =head3 GetVocabularyHash
4170              
4171             Description:
4172              
4173             Returns the _hashRefOfWordVectors member variable set during Word2vec::Word2vec object instantiation of new function.
4174              
4175             Input:
4176              
4177             None
4178              
4179             Output:
4180              
4181             $value -> Returns hash reference of vocabulary/dictionary words. (Word2vec trained data in memory)
4182              
4183             Example:
4184              
4185             use Word2vec::Word2vec;
4186              
4187             my $w2v = Word2vec::Word2vec->new();
4188             my @vocabulary = $w2v->GetVocabularyHash();
4189              
4190             undef( $w2v );
4191              
4192             =head3 GetOverwriteOldFile
4193              
4194             Description:
4195              
4196             Returns the _overwriteOldFile member variable set during Word2vec::Word2vec object instantiation of new function.
4197              
4198             Input:
4199              
4200             None
4201              
4202             Output:
4203              
4204             $value -> Returns 1 = True or 0 = False.
4205              
4206             Example:
4207              
4208             use Word2vec::Word2vec;
4209              
4210             my $w2v = Word2vec::Word2vec->new();
4211             my $value = $w2v->GetOverwriteOldFile();
4212             print( "Overwrite Existing File?: $value\n" );
4213              
4214             undef( $w2v );
4215              
4216             =head2 Mutator Functions
4217              
4218             =head3 SetTrainFilePath
4219              
4220             Description:
4221              
4222             Sets member variable to string parameter. Sets training file path.
4223              
4224             Input:
4225              
4226             $string -> Text corpus training file path
4227              
4228             Output:
4229              
4230             None
4231              
4232             Example:
4233              
4234             use Word2vec::Word2vec;
4235              
4236             my $w2v = Word2vec::Word2vec->new();
4237             $w2v->SetTrainFilePath( "samples/textcorpus.txt" );
4238              
4239             undef( $w2v );
4240              
4241             =head3 SetOutputFilePath
4242              
4243             Description:
4244              
4245             Sets member variable to string parameter. Sets output file path.
4246              
4247             Input:
4248              
4249             $string -> Post word2vec training save file path
4250              
4251             Output:
4252              
4253             None
4254              
4255             Example:
4256              
4257             use Word2vec::Word2vec;
4258              
4259             my $w2v = Word2vec::Word2vec->new();
4260             $w2v->SetOutputFilePath( "samples/tempvectors.bin" );
4261              
4262             undef( $w2v );
4263              
4264             =head3 SetWordVecSize
4265              
4266             Description:
4267              
4268             Sets member variable to integer parameter. Sets word2vec word vector size.
4269              
4270             Input:
4271              
4272             $value -> Word2vec word vector size
4273              
4274             Output:
4275              
4276             None
4277              
4278             Example:
4279              
4280             use Word2vec::Word2vec;
4281              
4282             my $w2v = Word2vec::Word2vec->new();
4283             $w2v->SetWordVecSize( 100 );
4284              
4285             undef( $w2v );
4286              
4287             =head3 SetWindowSize
4288              
4289             Description:
4290              
4291             Sets member variable to integer parameter. Sets word2vec window size.
4292              
4293             Input:
4294              
4295             $value -> Word2vec window size
4296              
4297             Output:
4298              
4299             None
4300              
4301             Example:
4302              
4303             use Word2vec::Word2vec;
4304              
4305             my $w2v = Word2vec::Word2vec->new();
4306             $w2v->SetWindowSize( 8 );
4307              
4308             undef( $w2v );
4309              
4310             =head3 SetSample
4311              
4312             Description:
4313              
4314             Sets member variable to integer parameter. Sets word2vec sample size.
4315              
4316             Input:
4317              
4318             $value -> Word2vec sample size
4319              
4320             Output:
4321              
4322             None
4323              
4324             Example:
4325              
4326             use Word2vec::Word2vec;
4327              
4328             my $w2v = Word2vec::Word2vec->new();
4329             $w2v->SetSample( 3 );
4330              
4331             undef( $w2v );
4332              
4333             =head3 SetHSoftMax
4334              
4335             Description:
4336              
4337             Sets member variable to integer parameter. Sets word2vec HSoftMax value.
4338              
4339             Input:
4340              
4341             $value -> Word2vec HSoftMax value
4342              
4343             Output:
4344              
4345             None
4346              
4347             Example:
4348              
4349             use Word2vec::Word2vec;
4350              
4351             my $w2v = Word2vec::Word2vec->new();
4352             $w2v->SetHSoftMax( 12 );
4353              
4354             undef( $w2v );
4355              
4356             =head3 SetNegative
4357              
4358             Description:
4359              
4360             Sets member variable to integer parameter. Sets word2vec negative value.
4361              
4362             Input:
4363              
4364             $value -> Word2vec negative value
4365              
4366             Output:
4367              
4368             None
4369              
4370             Example:
4371              
4372             use Word2vec::Word2vec;
4373              
4374             my $w2v = Word2vec::Word2vec->new();
4375             $w2v->SetNegative( 12 );
4376              
4377             undef( $w2v );
4378              
4379             =head3 SetNumOfThreads
4380              
4381             Description:
4382              
4383             Sets member variable to integer parameter. Sets word2vec number of training threads to specified value.
4384              
4385             Input:
4386              
4387             $value -> Word2vec number of threads value
4388              
4389             Output:
4390              
4391             None
4392              
4393             Example:
4394              
4395             use Word2vec::Word2vec;
4396              
4397             my $w2v = Word2vec::Word2vec->new();
4398             $w2v->SetNumOfThreads( 12 );
4399              
4400             undef( $w2v );
4401              
4402             =head3 SetNumOfIterations
4403              
4404             Description:
4405              
4406             Sets member variable to integer parameter. Sets word2vec iterations value.
4407              
4408             Input:
4409              
4410             $value -> Word2vec number of iterations value
4411              
4412             Output:
4413              
4414             None
4415              
4416             Example:
4417              
4418             use Word2vec::Word2vec;
4419              
4420             my $w2v = Word2vec::Word2vec->new();
4421             $w2v->SetNumOfIterations( 12 );
4422              
4423             undef( $w2v );
4424              
4425             =head3 SetMinCount
4426              
4427             Description:
4428              
4429             Sets member variable to integer parameter. Sets word2vec min-count value.
4430              
4431             Input:
4432              
4433             $value -> Word2vec min-count value
4434              
4435             Output:
4436              
4437             None
4438              
4439             Example:
4440              
4441             use Word2vec::Word2vec;
4442              
4443             my $w2v = Word2vec::Word2vec->new();
4444             $w2v->SetMinCount( 7 );
4445              
4446             undef( $w2v );
4447              
4448             =head3 SetAlpha
4449              
4450             Description:
4451              
4452             Sets member variable to float parameter. Sets word2vec alpha value.
4453              
4454             Input:
4455              
4456             $value -> Word2vec alpha value. (Float)
4457              
4458             Output:
4459              
4460             None
4461              
4462             Example:
4463              
4464             use Word2vec::Word2vec;
4465              
4466             my $w2v = Word2vec::Word2vec->new();
4467             $w2v->SetAlpha( 0.0012 );
4468              
4469             undef( $w2v );
4470              
4471             =head3 SetClasses
4472              
4473             Description:
4474              
4475             Sets member variable to integer parameter. Sets word2vec classes value.
4476              
4477             Input:
4478              
4479             $value -> Word2vec classes value.
4480              
4481             Output:
4482              
4483             None
4484              
4485             Example:
4486              
4487             use Word2vec::Word2vec;
4488              
4489             my $w2v = Word2vec::Word2vec->new();
4490             $w2v->SetClasses( 0 );
4491              
4492             undef( $w2v );
4493              
4494             =head3 SetDebugTraining
4495              
4496             Description:
4497              
4498             Sets member variable to integer parameter. Sets word2vec debug parameter value.
4499              
4500             Input:
4501              
4502             $value -> Word2vec debug training value.
4503              
4504             Output:
4505              
4506             None
4507              
4508             Example:
4509              
4510             use Word2vec::Word2vec;
4511              
4512             my $w2v = Word2vec::Word2vec->new();
4513             $w2v->SetDebugTraining( 0 );
4514              
4515             undef( $w2v );
4516              
4517             =head3 SetBinaryOutput
4518              
4519             Description:
4520              
4521             Sets member variable to integer parameter. Sets word2vec binary parameter value.
4522              
4523             Input:
4524              
4525             $value -> Word2vec binary output mode value. ( '1' = Binary Output / '0' = Plain Text )
4526              
4527             Output:
4528              
4529             None
4530              
4531             Example:
4532              
4533             use Word2vec::Word2vec;
4534              
4535             my $w2v = Word2vec::Word2vec->new();
4536             $w2v->SetBinaryOutput( 1 );
4537              
4538             undef( $w2v );
4539              
4540             =head3 SetSaveVocabFilePath
4541              
4542             Description:
4543              
4544             Sets member variable to string parameter. Sets word2vec save vocabulary file name.
4545              
4546             Input:
4547              
4548             $string -> Word2vec save vocabulary file name and path.
4549              
4550             Output:
4551              
4552             None
4553              
4554             Example:
4555              
4556             use Word2vec::Word2vec;
4557              
4558             my $w2v = Word2vec::Word2vec->new();
4559             $w2v->SetSaveVocabFilePath( "samples/vocab.txt" );
4560              
4561             undef( $w2v );
4562              
4563             =head3 SetReadVocabFilePath
4564              
4565             Description:
4566              
4567             Sets member variable to string parameter. Sets word2vec read vocabulary file name.
4568              
4569             Input:
4570              
4571             $string -> Word2vec read vocabulary file name and path.
4572              
4573             Output:
4574              
4575             None
4576              
4577             Example:
4578              
4579             use Word2vec::Word2vec;
4580              
4581             my $w2v = Word2vec::Word2vec->new();
4582             $w2v->SetReadVocabFilePath( "samples/vocab.txt" );
4583              
4584             undef( $w2v );
4585              
4586             =head3 SetUseCBOW
4587              
4588             Description:
4589              
4590             Sets member variable to integer parameter. Sets word2vec CBOW parameter value.
4591              
4592             Input:
4593              
4594             $value -> Word2vec CBOW mode value.
4595              
4596             Output:
4597              
4598             None
4599              
4600             Example:
4601              
4602             use Word2vec::Word2vec;
4603              
4604             my $w2v = Word2vec::Word2vec->new();
4605             $w2v->SetUseCBOW( 1 );
4606              
4607             undef( $w2v );
4608              
4609             =head3 SetWorkingDir
4610              
4611             Description:
4612              
4613             Sets member variable to string parameter. Sets working directory.
4614              
4615             Input:
4616              
4617             $string -> Working directory
4618              
4619             Output:
4620              
4621             None
4622              
4623             Example:
4624              
4625             use Word2vec::Word2vec;
4626              
4627             my $w2v = Word2vec::Word2vec->new();
4628             $w2v->SetWorkingDir( "/samples" );
4629              
4630             undef( $w2v );
4631              
4632             =head3 SetWord2VecExeDir
4633              
4634             Description:
4635              
4636             Sets member variable to string parameter. Sets word2vec executable file directory.
4637              
4638             Input:
4639              
4640             $string -> Word2vec directory
4641              
4642             Output:
4643              
4644             None
4645              
4646             Example:
4647              
4648             use Word2vec::Word2vec;
4649              
4650             my $w2v = Word2vec::Word2vec->new();
4651             $w2v->SetWord2VecExeDir( "/word2vec" );
4652              
4653             undef( $w2v );
4654              
4655             =head3 SetVocabularyHash
4656              
4657             Description:
4658              
4659             Sets vocabulary/dictionary array to de-referenced array reference parameter.
4660              
4661             Warning: This will overwrite any existing vocabulary/dictionary array data.
4662              
4663             Input:
4664              
4665             $arrayReference -> Vocabulary/Dictionary array reference of word2vec word vectors.
4666              
4667             Output:
4668              
4669             None
4670              
4671             Example:
4672              
4673             use Word2vec::Word2vec;
4674              
4675             my $w2v = Word2vec::Word2vec->new();
4676             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
4677             my @vocab = $w2v->GetVocabularyHash();
4678             $w2v->SetVocabularyHash( \@vocab );
4679              
4680             undef( $w2v );
4681              
4682             =head3 ClearVocabularyHash
4683              
4684             Description:
4685              
4686             Clears vocabulary/dictionary array.
4687              
4688             Input:
4689              
4690             None
4691              
4692             Output:
4693              
4694             None
4695              
4696             Example:
4697              
4698             use Word2vec::Word2vec;
4699              
4700             my $w2v = Word2vec::Word2vec->new();
4701             $w2v->ClearVocabularyHash();
4702              
4703             undef( $w2v );
4704              
4705             =head3 AddWordVectorToVocabHash
4706              
4707             Description:
4708              
4709             Adds word vector string to vocabulary/dictionary.
4710              
4711             Input:
4712              
4713             $string -> Word2vec word vector string
4714              
4715             Output:
4716              
4717             None
4718              
4719             Example:
4720              
4721             use Word2vec::Word2vec;
4722              
4723             my $w2v = Word2vec::Word2vec->new();
4724              
4725             # Note: This is representational data of word2vec's word vector format and not actual data.
4726             $w2v->AddWordVectorToVocabHash( "of 0.4346 -0.1235 0.5789 0.2347 -0.0056 -0.0001" );
4727              
4728             undef( $w2v );
4729              
4730             =head3 SetOverwriteOldFile
4731              
4732             Description:
4733              
4734             Sets member variable to integer parameter. Enables overwriting output file if one already exists.
4735              
4736             Input:
4737              
4738             $value -> '1' = Overwrite existing file / '0' = Graceful termination when file with same name exists
4739              
4740             Output:
4741              
4742             None
4743              
4744             Example:
4745              
4746             use Word2vec::Word2vec;
4747              
4748             my $w2v = Word2vec::Word2vec->new();
4749             $w2v->SetOverwriteOldFile( 1 );
4750              
4751             undef( $w2v );
4752              
4753             =head2 Debug Functions
4754              
4755             =head3 GetTime
4756              
4757             Description:
4758              
4759             Returns current time string in "Hour:Minute:Second" format.
4760              
4761             Input:
4762              
4763             None
4764              
4765             Output:
4766              
4767             $string -> XX:XX:XX ("Hour:Minute:Second")
4768              
4769             Example:
4770              
4771             use Word2vec::Word2vec;
4772              
4773             my $w2v = Word2vec::Word2vec->new();
4774             my $time = $w2v->GetTime();
4775              
4776             print( "Current Time: $time\n" ) if defined( $time );
4777              
4778             undef( $w2v );
4779              
4780             =head3 GetDate
4781              
4782             Description:
4783              
4784             Returns current month, day and year string in "Month/Day/Year" format.
4785              
4786             Input:
4787              
4788             None
4789              
4790             Output:
4791              
4792             $string -> XX/XX/XXXX ("Month/Day/Year")
4793              
4794             Example:
4795              
4796             use Word2vec::Word2vec;
4797              
4798             my $w2v = Word2vec::Word2vec->new();
4799             my $date = $w2v->GetDate();
4800              
4801             print( "Current Date: $date\n" ) if defined( $date );
4802              
4803             undef( $w2v );
4804              
4805             =head3 WriteLog
4806              
4807             Description:
4808              
4809             Prints passed string parameter to the console, log file or both depending on user options.
4810              
4811             Note: A new line character is printed after the string when the printNewLine parameter is undefined
4812             or any value other than 0; the new line is omitted only when the parameter is 0.
4813              
4814             Input:
4815              
4816             $string -> String to print to the console/log file.
4817             $value -> 0 = Do not print newline character after string, all else prints new line character including 'undef'.
4818              
4819             Output:
4820              
4821             None
4822              
4823             Example:
4824              
4825             use Word2vec::Word2vec;
4826              
4827             my $w2v = Word2vec::Word2vec->new();
4828             $w2v->WriteLog( "Hello World" );
4829              
4830             undef( $w2v );
4831              
4832             =head1 Author
4833              
4834             Clint Cuffy, Virginia Commonwealth University
4835              
4836             =head1 COPYRIGHT
4837              
4838             Copyright (c) 2016
4839              
4840             Bridget T McInnes, Virginia Commonwealth University
4841             btmcinnes at vcu dot edu
4842              
4843             Clint Cuffy, Virginia Commonwealth University
4844             cuffyca at vcu dot edu
4845              
4846             This program is free software; you can redistribute it and/or modify it
4847             under the terms of the GNU General Public License as published by the Free
4848             Software Foundation; either version 2 of the License, or (at your option)
4849             any later version.
4850              
4851             This program is distributed in the hope that it will be useful, but WITHOUT
4852             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
4853             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
4854              
4855             You should have received a copy of the GNU General Public License along with
4856             this program; if not, write to:
4857              
4858             The Free Software Foundation, Inc.,
4859             59 Temple Place - Suite 330,
4860             Boston, MA 02111-1307, USA.
4861              
4862             =cut