File Coverage

blib/lib/NanoB2B/NER/Avgman.pm
Criterion Covered Total %
statement 12 236 5.0
branch 0 48 0.0
condition 0 3 0.0
subroutine 4 13 30.7
pod 0 8 0.0
total 16 308 5.1


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # NanoB2B-NER::NER::Avgman
3             #
4             # Averages Weka files and makes a nice output file
5             # Version 1.5
6             #
7             # Program by Milk
8              
9             package NanoB2B::NER::Avgman;
10              
11 1     1   6 use NanoB2B::UniversalRoutines;
  1         2  
  1         27  
12 1     1   4 use File::Path qw(make_path); #makes sub directories
  1         2  
  1         35  
13 1     1   5 use strict;
  1         1  
  1         16  
14 1     1   4 use warnings;
  1         2  
  1         2227  
15              
16             #### GLOBAL VARIABLES ####
17              
18             #option variables
19             my $program_dir;
20             my $bucketsNum = 10;
21             my $weka_dir;
22             my $debug = 0;
23              
24             #universal subroutines object
25             my %uniParams;
26             my $uniSub;
27              
28             #module variables
29             my @features;
30             my @wekaAttr = ("TP Rate", "FP Rate", "Precision", "Recall", "F-Measure", "MCC", "ROC Area", "PRC Area");
31             my @allBuckets;
32             my @sets;
33             my $entId = "_e";
34              
35             #### A SUPER PET IS ADOPTED ####
36              
37             # construction method to create a new Avgman object
38             # input : $directory <-- the name of the directory for the files
39             # $weka_dir <-- the weka directory name
40             # $features <-- the set of features [e.g. "ortho morph text pos cui sem"]
41             # $buckets <-- the number of buckets used for the k-fold cross validation
42             # $debug <-- the set of features to run on [e.g. omtpcs]
43             # output : $self <-- an instance of the Avgman object
44             sub new {
45             #grab class and parameters
46 0     0 0   my $self = {};
47 0           my $class = shift;
48 0 0         return undef if(ref $class);
49 0           my $params = shift;
50              
51             #reset all the arrays
52 0           %uniParams = ();
53 0           @features = ();
54 0           @allBuckets = ();
55 0           @sets = ();
56              
57             #bless this object
58 0           bless $self, $class;
59 0           $self->_init($params);
60 0           @allBuckets = (1..$bucketsNum);
61              
62             #retrieve parameters for universal-routines
63 0           $uniParams{'debug'} = $debug;
64 0           $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
65              
66             #make the features
67 0           my $item = "_";
68 0           foreach my $fs (@features){
69 0           $item .= substr($fs, 0, 1); #add to abbreviations for the name
70 0           push(@sets, $item);
71             }
72            
73              
74 0           return $self;
75             }
76              
77             # method to initialize the NanoB2B::NER::Avgman object.
78             # input : $parameters <- reference to a hash
79             # output:
80             sub _init {
81 0     0     my $self = shift;
82 0           my $params = shift;
83              
84 0 0         $params = {} if(!defined $params);
85              
86             # get some of the parameters
87 0           my $diroption = $params->{'directory'};
88 0           my $ftsoption = $params->{'features'};
89 0           my $bucketsNumoption = $params->{'buckets'};
90 0           my $wekadiroption = $params->{'weka_dir'};
91 0           my $debugoption = $params->{'debug'};
92              
93             #set the global variables
94 0 0         if(defined $debugoption){$debug = $debugoption;}
  0            
95 0 0         if(defined $diroption){$program_dir = $diroption;}
  0            
96 0 0         if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;}
  0            
97 0 0         if(defined $ftsoption){@features = split(' ', $ftsoption);}
  0            
98 0 0         if(defined $wekadiroption){$weka_dir = $wekadiroption};
  0            
99             }
100              
101              
102             ############### NOT YOUR AVGMAN! ################
103              
104             #prints weka results to an output file
105             # input : $name <-- name of the file with the weka data
106             # output : (weka average results file)
107             sub avg_file{
108 0     0 0   my $self = shift;
109 0           my $name = shift;
110              
111             #get the averaged weka data, individual weka data, and majority sense baseline
112 0           my %avg = averageWekaData($name);
113 0           my %ind = individualWekaData($name);
114 0           my $msb = getMSB($name);
115              
116             #open new file
117 0           my $direct = ("$program_dir/_WEKAS/$weka_dir");
118 0           make_path($direct);
119 0           my $file = "$direct/$name" . "_weka-results";
120 0           my $WEKA_SAVE;
121 0 0         open ($WEKA_SAVE, ">", "$file") || die ("Weka file $file could not be made in the _WEKAS directory! \n$!");
122              
123             #print the name
124 0           $uniSub->print2File($WEKA_SAVE, "ENTITY NAME\t\t: $name");
125              
126             #print the list of features
127 0           $uniSub->print2FileNoLine($WEKA_SAVE, "FEATURES\t\t: ");
128 0           foreach my $d(@features){$uniSub->print2FileNoLine($WEKA_SAVE, "$d, ");}
  0            
129 0           $uniSub->print2File($WEKA_SAVE, "");
130              
131             #print the msb
132 0           $msb = int($msb * 10**4) / 10**4;
133 0           $uniSub->print2File($WEKA_SAVE, "Majority Label Baseline : $msb");
134              
135 0           $uniSub->print2File($WEKA_SAVE, "\n");
136              
137             #print the averages
138 0           foreach my $key (sort keys %avg){
139 0           $uniSub->print2File($WEKA_SAVE, "$key - AVERAGES");
140 0           $uniSub->print2File($WEKA_SAVE, "-------------------------");
141              
142 0           my @arr = @{$avg{$key}};
  0            
143 0           my $arrLen = @arr;
144 0           for(my $a = 0; $a < $arrLen; $a++){
145 0           my $wekaThing = $wekaAttr[$a];
146 0           my $entry = $arr[$a];
147             #try to align the values based on the length
148 0           my $tab = "";
149 0 0         if(length($wekaThing) > 7){
    0          
150 0           $tab = "\t";
151             }elsif(length($wekaThing) < 4){
152 0           $tab = "\t\t\t";
153             }else{
154 0           $tab = "\t\t";
155             }
156              
157 0           my $in = "$wekaThing" . $tab . "-\t$entry";
158 0           $uniSub->print2File($WEKA_SAVE, $in);
159             }
160 0           $uniSub->print2File($WEKA_SAVE, "");
161             }
162 0           $uniSub->print2File($WEKA_SAVE, "\n");
163             #print the individual
164 0           foreach my $key (sort keys %ind){
165             #header
166 0           $uniSub->print2File($WEKA_SAVE, "$key - INDIVIDUAL");
167 0           $uniSub->print2File($WEKA_SAVE, "==================\n");
168 0           $uniSub->print2FileNoLine($WEKA_SAVE, "\t\t\t");
169 0           foreach my $r(@wekaAttr){
170 0           my $sr = "";
171 0 0         if(length($r) >= 6){$sr = (substr($r, 0, 6) . ".");}
  0 0          
172 0           elsif(length($r) < 4){$sr = "$r\t";}
173 0           else{$sr = $r;}
174 0           $uniSub->print2FileNoLine($WEKA_SAVE, "$sr\t");
175             }
176 0           $uniSub->print2File($WEKA_SAVE, "\n-------------------------------------------------------------------------------");
177              
178             #print the lines
179 0           my @arr = @{$ind{$key}};
  0            
180 0           my $arrLen = @arr;
181              
182 0           for(my $a = 0; $a < $arrLen;$a++){
183 0           my $b = $a + 1;
184 0           my $entry = $arr[$a];
185             #printColorDebug("on_red", "$name - $entry");
186              
187 0           my $in = "BUCKET $b\t$entry";
188 0           $uniSub->print2File($WEKA_SAVE, $in);
189             }
190            
191 0           $uniSub->print2File($WEKA_SAVE, "");
192             }
193              
194 0           close $WEKA_SAVE;
195              
196             }
197              
198              
199             #average the weka accuracy datas
200             # input : $name <-- name of the file with the weka data
201             # output : %featAvg <-- hash of each feature set's averages (in array form aligned to the wekaAttr)
202             sub averageWekaData{
203 0     0 0   my $name = shift;
204 0           my %featAvg = ();
205              
206 0           foreach my $item (@sets){
207 0           my %data = ();
208              
209 0           foreach my $bucket(@allBuckets){
210              
211             #import the wekaman
212 0           my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
213 0 0         open (WEKA, "$file") || die ("Unable to import $file from _WEKAS directory\n $!");
214              
215             #get lines
216 0           my @lines = ;
217 0           foreach my $line(@lines){chomp($line)};
  0            
218 0           my $len = @lines;
219              
220             #$uniSub->printColorDebug("on_red", "YOUR FACE!");
221              
222             #file doesn't have any contents for some reason
223 0 0         if($len == 0){
224 0           $uniSub->printColorDebug("red", "**ERROR: NOTHING FOUND FOR $file!**\n");
225 0           my $n = @wekaAttr;
226 0           my @values = (0) x $n;
227 0           push(@{$data{$bucket}}, @values);
  0            
228 0           next;
229             #return %featAvg;
230             }
231              
232             #get the rest of the array
233 0           my $keyword = "=== Error on test data ===";
234 0           my $index = $uniSub->getIndexofLine($keyword, \@lines);
235 0           my @result = @lines[$index..$len];
236              
237             #grab the only stuff you need
238 0           my $weightWord = " Yes";
239 0           my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
240 0           my $weightLine = $result[$weightIndex];
241              
242             #split it uuuuuup
243 0           my @values = split /\s+/, $weightLine;
244 0           my $valLen = @values;
245 0           my $valLen2 = $valLen - 2;
246 0           @values = @values[1..$valLen2];
247             #printArr(", ", @values);
248              
249             #add to overall
250 0           push(@{$data{$bucket}}, @values);
  0            
251             }
252            
253 0           my @averages = ();
254             #add all the stuffs
255 0           foreach my $bucket (@allBuckets){
256 0           my @wekaSet = @{$data{$bucket}};
  0            
257 0           my $wekaLen = @wekaSet;
258 0           for(my $e = 0; $e < $wekaLen; $e++){
259 0           my $entry = $wekaSet[$e];
260 0 0 0       if($entry ne "NaN" and $entry ne "?"){ #if value is unknown (from WEKA)
261 0           $averages[$e] += $entry;
262             }
263             }
264             }
265              
266             #printArr(", ", @avgLens);
267              
268             #divide them
269 0           foreach my $tea (@averages){
270 0           $tea /= $bucketsNum;
271             }
272              
273             #add the averages for the specific feature
274 0           push(@{$featAvg{$item}}, @averages);
  0            
275             }
276             #exit;
277 0           return %featAvg;
278             }
279              
280             #retrieve the weighted average lines from the individual buckets
281             # input : $name <-- name of the file with the weka data
282             # output : %featData <-- hash of arrays for each line of individual data for the buckets
283             sub individualWekaData{
284 0     0 0   my $name = shift;
285 0           my %featData = ();
286              
287 0           foreach my $item (@sets){ #for each feature specified
288 0           my @data = ();
289              
290 0           foreach my $bucket(@allBuckets){ #for each bucket in the set
291              
292             #import the wekaman
293 0           my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
294 0 0         open (WEKA, "$file") || die ("Unable to import $file from _WEKAS directory $!");
295              
296             #get lines
297 0           my @lines = ;
298 0           foreach my $line(@lines){chomp($line)};
  0            
299 0           my $len = @lines;
300              
301             #if the contents of the file are empty for some reason
302 0 0         if($len == 0){
303 0           $uniSub->printColorDebug("red", "**ERROR: NOTHING FOUND FOR $file!**\n");
304 0           my $n = @wekaAttr;
305 0           my @values = (0) x $n;
306 0           my $valueLine = join(" ", @values);
307 0           push(@data, $valueLine);
308 0           next;
309             #return %featAvg;
310             }
311              
312             #get the rest of the array
313 0           my $keyword = "=== Error on test data ===";
314 0           my $index = $uniSub->getIndexofLine($keyword, \@lines);
315 0           my @result = @lines[$index..$len];
316              
317             #grab the only stuff you need
318 0           my $weightWord = " Yes";
319 0           my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
320 0           my $weightLine = $result[$weightIndex];
321              
322             #split it up
323 0           my @values = split /\s+/, $weightLine;
324 0           my $valLen = @values;
325 0           $valLen -= 2;
326 0           my @values2 = @values[1..$valLen];
327              
328             #rejoin it
329 0           my $valLine = join("\t", @values2);
330              
331             #add it
332 0           push (@data, $valLine);
333             }
334 0           push (@{$featData{$item}}, @data);
  0            
335             }
336 0           return %featData;
337             }
338              
339             # grabs the majority sense-label baseline for the file
340             # input : $name <-- name of the file to get the entities from
341             # output : $msb <-- # of no instances / # of yes instances
342             sub getMSB{
343 0     0 0   my $file = shift;
344             #get the name of the file
345 0           my @n = split '/', $file;
346 0           my $l = @n;
347 0           my $filename = $n[$l - 1];
348 0           $filename = lc($filename);
349              
350             #import the data from the file
351 0 0         if($program_dir ne ""){open (FILE, "$program_dir/$file") || die ("what is this '$program_dir/$file' you speak of?\n$!");}
  0 0          
352 0 0         else{open (FILE, "$file") || die ("what is this '$file' you speak of? $!\n");}
353 0           my @fileLines = ;
354 0           foreach my $l(@fileLines){
355 0           $l = lc($l);
356             }
357              
358             #clean it up for two separate sets
359 0           my @tagSet = retagSet($filename, \@fileLines);
360              
361             #count and get results
362 0           return countInst(\@tagSet);
363             }
364              
365             # counts the # of no instances / # of all instances
366             # input : @set <-- set of lines to read and count
367             # output : $msb <-- # of no instances / # of all instances
368             sub countInst{
369 0     0 0   my $set_ref = shift;
370 0           my @set = @$set_ref;
371              
372 0           my $no_inst = 0;
373 0           my $all_inst = 0;
374              
375 0           foreach my $line(@set){
376 0           my @words = split(" ", $line);
377 0           foreach my $word(@words){
378             #print($word . "\n");
379             #if the word doesn't have a entity tag
380 0 0         if(!($word =~ /([\s\S]*($entId)$)/)){
381 0           $no_inst++; #count the words not annotated
382             }
383              
384 0           $all_inst++; #count all the words
385             }
386             }
387              
388             #return the ratio
389 0           return ($no_inst / $all_inst);
390             }
391              
392              
393              
394             ###### RETAGS THE LINE ######
395              
396             # turns the tagged entity words into special words with <> for the context words
397             # input : $input <-- the line to retag
398             # $id <-- the id within the tag to look for
399             # output : (.arff files)
400             sub retag{
401 0     0 0   my $input = shift;
402 0           my $id = shift;
403              
404 0           $id = lc($id);
405 0           my $line = lc($input);
406              
407             #get rid of any tags
408 0           my @words = split (" ", $line);
409 0           my @newSet = ();
410 0           my $charact = 0;
411 0           foreach my $word (@words){
412 0 0         if($charact){ #if currently annotating
413 0 0         if($word eq ""){ #stop tagging annotations
414 0           $charact = 0;
415             }else{ #add the entity tagger to the words
416 0           my $charWord = "$word"."$entId";
417 0           push @newSet, $charWord;
418             }
419             }else{
420 0 0         if($word eq ""){ #start annotating
421 0           $charact = 1;
422             }else{ #add normal word
423 0           push @newSet, $word;
424             }
425             }
426             }
427              
428             #clean up the new line
429 0           my $new_line = join " ", @newSet;
430 0           $new_line =~s/\b$entId\b//g;
431 0           $new_line = $uniSub->cleanWords($new_line);
432 0           return $new_line;
433             }
434              
435             # turns the tagged entity words in the entire file into special words with <> for the context words
436             # input : $name <-- the name of the file to use as the id tag
437             # @lines <-- the set of lines to retag
438             # output : @tagSet <-- set of retagged lines
439             sub retagSet{
440 0     0 0   my $name = shift;
441 0           my $lines_ref = shift;
442 0           my @lines = @$lines_ref;
443              
444 0           my @tagSet = ();
445 0           foreach my $line (@lines){
446             #retag the line
447 0           chomp($line);
448 0           my $tag_line = retag($line, $name);
449              
450             #add it to the set
451 0           push @tagSet, $tag_line;
452             }
453 0           return @tagSet;
454             }
455              
456             1;