File Coverage

blib/lib/NanoB2B/NER/Avgman.pm

Criterion	Covered	Total	%
statement	12	236	5.0
branch	0	48	0.0
condition	0	3	0.0
subroutine	4	13	30.7
pod	0	8	0.0
total	16	308	5.1

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2							# NanoB2B-NER::NER::Avgman
3							#
4							# Averages Weka files and makes a nice output file
5							# Version 1.5
6							#
7							# Program by Milk
8
9							package NanoB2B::NER::Avgman;
10
11	1			1		6	use NanoB2B::UniversalRoutines;
	1					2
	1					27
12	1			1		4	use File::Path qw(make_path); #makes sub directories
	1					2
	1					35
13	1			1		5	use strict;
	1					1
	1					16
14	1			1		4	use warnings;
	1					2
	1					2227
15
16							#### GLOBAL VARIABLES ####
17
18							#option variables
19							my $program_dir;
20							my $bucketsNum = 10;
21							my $weka_dir;
22							my $debug = 0;
23
24							#universal subroutines object
25							my %uniParams;
26							my $uniSub;
27
28							#module variables
29							my @features;
30							my @wekaAttr = ("TP Rate", "FP Rate", "Precision", "Recall", "F-Measure", "MCC", "ROC Area", "PRC Area");
31							my @allBuckets;
32							my @sets;
33							my $entId = "_e";
34
35							#### A SUPER PET IS ADOPTED ####
36
37							# construction method to create a new Avgman object
38							# input : $directory <-- the name of the directory for the files
39							# $weka_dir <-- the weka directory name
40							# $features <-- the set of features [e.g. "ortho morph text pos cui sem"]
41							# $buckets <-- the number of buckets used for the k-fold cross validation
42							# $debug <-- the set of features to run on [e.g. omtpcs]
43							# output : $self <-- an instance of the Avgman object
44							sub new {
45							#grab class and parameters
46	0			0	0		my $self = {};
47	0						my $class = shift;
48	0	0					return undef if(ref $class);
49	0						my $params = shift;
50
51							#reset all the arrays
52	0						%uniParams = ();
53	0						@features = ();
54	0						@allBuckets = ();
55	0						@sets = ();
56
57							#bless this object
58	0						bless $self, $class;
59	0						$self->_init($params);
60	0						@allBuckets = (1..$bucketsNum);
61
62							#retrieve parameters for universal-routines
63	0						$uniParams{'debug'} = $debug;
64	0						$uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
65
66							#make the features
67	0						my $item = "_";
68	0						foreach my $fs (@features){
69	0						$item .= substr($fs, 0, 1); #add to abbreviations for the name
70	0						push(@sets, $item);
71							}
72
73
74	0						return $self;
75							}
76
77							# method to initialize the NanoB2B::NER::Avgman object.
78							# input : $parameters <- reference to a hash
79							# output:
80							sub _init {
81	0			0			my $self = shift;
82	0						my $params = shift;
83
84	0	0					$params = {} if(!defined $params);
85
86							# get some of the parameters
87	0						my $diroption = $params->{'directory'};
88	0						my $ftsoption = $params->{'features'};
89	0						my $bucketsNumoption = $params->{'buckets'};
90	0						my $wekadiroption = $params->{'weka_dir'};
91	0						my $debugoption = $params->{'debug'};
92
93							#set the global variables
94	0	0					if(defined $debugoption){$debug = $debugoption;}
	0
95	0	0					if(defined $diroption){$program_dir = $diroption;}
	0
96	0	0					if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;}
	0
97	0	0					if(defined $ftsoption){@features = split(' ', $ftsoption);}
	0
98	0	0					if(defined $wekadiroption){$weka_dir = $wekadiroption};
	0
99							}
100
101
102							############### NOT YOUR AVGMAN! ################
103
104							#prints weka results to an output file
105							# input : $name <-- name of the file with the weka data
106							# output : (weka average results file)
107							sub avg_file{
108	0			0	0		my $self = shift;
109	0						my $name = shift;
110
111							#get the averaged weka data, individual weka data, and majority sense baseline
112	0						my %avg = averageWekaData($name);
113	0						my %ind = individualWekaData($name);
114	0						my $msb = getMSB($name);
115
116							#open new file
117	0						my $direct = ("$program_dir/_WEKAS/$weka_dir");
118	0						make_path($direct);
119	0						my $file = "$direct/$name" . "_weka-results";
120	0						my $WEKA_SAVE;
121	0	0					open ($WEKA_SAVE, ">", "$file") \|\| die ("Weka file $file could not be made in the _WEKAS directory! \n$!");
122
123							#print the name
124	0						$uniSub->print2File($WEKA_SAVE, "ENTITY NAME\t\t: $name");
125
126							#print the list of features
127	0						$uniSub->print2FileNoLine($WEKA_SAVE, "FEATURES\t\t: ");
128	0						foreach my $d(@features){$uniSub->print2FileNoLine($WEKA_SAVE, "$d, ");}
	0
129	0						$uniSub->print2File($WEKA_SAVE, "");
130
131							#print the msb
132	0						$msb = int($msb * 104) / 104;
133	0						$uniSub->print2File($WEKA_SAVE, "Majority Label Baseline : $msb");
134
135	0						$uniSub->print2File($WEKA_SAVE, "\n");
136
137							#print the averages
138	0						foreach my $key (sort keys %avg){
139	0						$uniSub->print2File($WEKA_SAVE, "$key - AVERAGES");
140	0						$uniSub->print2File($WEKA_SAVE, "-------------------------");
141
142	0						my @arr = @{$avg{$key}};
	0
143	0						my $arrLen = @arr;
144	0						for(my $a = 0; $a < $arrLen; $a++){
145	0						my $wekaThing = $wekaAttr[$a];
146	0						my $entry = $arr[$a];
147							#try to align the values based on the length
148	0						my $tab = "";
149	0	0					if(length($wekaThing) > 7){
		0
150	0						$tab = "\t";
151							}elsif(length($wekaThing) < 4){
152	0						$tab = "\t\t\t";
153							}else{
154	0						$tab = "\t\t";
155							}
156
157	0						my $in = "$wekaThing" . $tab . "-\t$entry";
158	0						$uniSub->print2File($WEKA_SAVE, $in);
159							}
160	0						$uniSub->print2File($WEKA_SAVE, "");
161							}
162	0						$uniSub->print2File($WEKA_SAVE, "\n");
163							#print the individual
164	0						foreach my $key (sort keys %ind){
165							#header
166	0						$uniSub->print2File($WEKA_SAVE, "$key - INDIVIDUAL");
167	0						$uniSub->print2File($WEKA_SAVE, "==================\n");
168	0						$uniSub->print2FileNoLine($WEKA_SAVE, "\t\t\t");
169	0						foreach my $r(@wekaAttr){
170	0						my $sr = "";
171	0	0					if(length($r) >= 6){$sr = (substr($r, 0, 6) . ".");}
	0	0
172	0						elsif(length($r) < 4){$sr = "$r\t";}
173	0						else{$sr = $r;}
174	0						$uniSub->print2FileNoLine($WEKA_SAVE, "$sr\t");
175							}
176	0						$uniSub->print2File($WEKA_SAVE, "\n-------------------------------------------------------------------------------");
177
178							#print the lines
179	0						my @arr = @{$ind{$key}};
	0
180	0						my $arrLen = @arr;
181
182	0						for(my $a = 0; $a < $arrLen;$a++){
183	0						my $b = $a + 1;
184	0						my $entry = $arr[$a];
185							#printColorDebug("on_red", "$name - $entry");
186
187	0						my $in = "BUCKET $b\t$entry";
188	0						$uniSub->print2File($WEKA_SAVE, $in);
189							}
190
191	0						$uniSub->print2File($WEKA_SAVE, "");
192							}
193
194	0						close $WEKA_SAVE;
195
196							}
197
198
199							#average the weka accuracy datas
200							# input : $name <-- name of the file with the weka data
201							# output : %featAvg <-- hash of each feature set's averages (in array form aligned to the wekaAttr)
202							sub averageWekaData{
203	0			0	0		my $name = shift;
204	0						my %featAvg = ();
205
206	0						foreach my $item (@sets){
207	0						my %data = ();
208
209	0						foreach my $bucket(@allBuckets){
210
211							#import the wekaman
212	0						my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
213	0	0					open (WEKA, "$file") \|\| die ("Unable to import $file from _WEKAS directory\n $!");
214
215							#get lines
216	0						my @lines = ;
217	0						foreach my $line(@lines){chomp($line)};
	0
218	0						my $len = @lines;
219
220							#$uniSub->printColorDebug("on_red", "YOUR FACE!");
221
222							#file doesn't have any contents for some reason
223	0	0					if($len == 0){
224	0						$uniSub->printColorDebug("red", "ERROR: NOTHING FOUND FOR $file!\n");
225	0						my $n = @wekaAttr;
226	0						my @values = (0) x $n;
227	0						push(@{$data{$bucket}}, @values);
	0
228	0						next;
229							#return %featAvg;
230							}
231
232							#get the rest of the array
233	0						my $keyword = "=== Error on test data ===";
234	0						my $index = $uniSub->getIndexofLine($keyword, \@lines);
235	0						my @result = @lines[$index..$len];
236
237							#grab the only stuff you need
238	0						my $weightWord = " Yes";
239	0						my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
240	0						my $weightLine = $result[$weightIndex];
241
242							#split it uuuuuup
243	0						my @values = split /\s+/, $weightLine;
244	0						my $valLen = @values;
245	0						my $valLen2 = $valLen - 2;
246	0						@values = @values[1..$valLen2];
247							#printArr(", ", @values);
248
249							#add to overall
250	0						push(@{$data{$bucket}}, @values);
	0
251							}
252
253	0						my @averages = ();
254							#add all the stuffs
255	0						foreach my $bucket (@allBuckets){
256	0						my @wekaSet = @{$data{$bucket}};
	0
257	0						my $wekaLen = @wekaSet;
258	0						for(my $e = 0; $e < $wekaLen; $e++){
259	0						my $entry = $wekaSet[$e];
260	0	0	0				if($entry ne "NaN" and $entry ne "?"){ #if value is unknown (from WEKA)
261	0						$averages[$e] += $entry;
262							}
263							}
264							}
265
266							#printArr(", ", @avgLens);
267
268							#divide them
269	0						foreach my $tea (@averages){
270	0						$tea /= $bucketsNum;
271							}
272
273							#add the averages for the specific feature
274	0						push(@{$featAvg{$item}}, @averages);
	0
275							}
276							#exit;
277	0						return %featAvg;
278							}
279
280							#retrieve the weighted average lines from the individual buckets
281							# input : $name <-- name of the file with the weka data
282							# output : %featData <-- hash of arrays for each line of individual data for the buckets
283							sub individualWekaData{
284	0			0	0		my $name = shift;
285	0						my %featData = ();
286
287	0						foreach my $item (@sets){ #for each feature specified
288	0						my @data = ();
289
290	0						foreach my $bucket(@allBuckets){ #for each bucket in the set
291
292							#import the wekaman
293	0						my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
294	0	0					open (WEKA, "$file") \|\| die ("Unable to import $file from _WEKAS directory $!");
295
296							#get lines
297	0						my @lines = ;
298	0						foreach my $line(@lines){chomp($line)};
	0
299	0						my $len = @lines;
300
301							#if the contents of the file are empty for some reason
302	0	0					if($len == 0){
303	0						$uniSub->printColorDebug("red", "ERROR: NOTHING FOUND FOR $file!\n");
304	0						my $n = @wekaAttr;
305	0						my @values = (0) x $n;
306	0						my $valueLine = join(" ", @values);
307	0						push(@data, $valueLine);
308	0						next;
309							#return %featAvg;
310							}
311
312							#get the rest of the array
313	0						my $keyword = "=== Error on test data ===";
314	0						my $index = $uniSub->getIndexofLine($keyword, \@lines);
315	0						my @result = @lines[$index..$len];
316
317							#grab the only stuff you need
318	0						my $weightWord = " Yes";
319	0						my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
320	0						my $weightLine = $result[$weightIndex];
321
322							#split it up
323	0						my @values = split /\s+/, $weightLine;
324	0						my $valLen = @values;
325	0						$valLen -= 2;
326	0						my @values2 = @values[1..$valLen];
327
328							#rejoin it
329	0						my $valLine = join("\t", @values2);
330
331							#add it
332	0						push (@data, $valLine);
333							}
334	0						push (@{$featData{$item}}, @data);
	0
335							}
336	0						return %featData;
337							}
338
339							# grabs the majority sense-label baseline for the file
340							# input : $name <-- name of the file to get the entities from
341							# output : $msb <-- # of no instances / # of yes instances
342							sub getMSB{
343	0			0	0		my $file = shift;
344							#get the name of the file
345	0						my @n = split '/', $file;
346	0						my $l = @n;
347	0						my $filename = $n[$l - 1];
348	0						$filename = lc($filename);
349
350							#import the data from the file
351	0	0					if($program_dir ne ""){open (FILE, "$program_dir/$file") \|\| die ("what is this '$program_dir/$file' you speak of?\n$!");}
	0	0
352	0	0					else{open (FILE, "$file") \|\| die ("what is this '$file' you speak of? $!\n");}
353	0						my @fileLines = ;
354	0						foreach my $l(@fileLines){
355	0						$l = lc($l);
356							}
357
358							#clean it up for two separate sets
359	0						my @tagSet = retagSet($filename, \@fileLines);
360
361							#count and get results
362	0						return countInst(\@tagSet);
363							}
364
365							# counts the # of no instances / # of all instances
366							# input : @set <-- set of lines to read and count
367							# output : $msb <-- # of no instances / # of all instances
368							sub countInst{
369	0			0	0		my $set_ref = shift;
370	0						my @set = @$set_ref;
371
372	0						my $no_inst = 0;
373	0						my $all_inst = 0;
374
375	0						foreach my $line(@set){
376	0						my @words = split(" ", $line);
377	0						foreach my $word(@words){
378							#print($word . "\n");
379							#if the word doesn't have a entity tag
380	0	0					if(!($word =~ /([\s\S]*($entId)$)/)){
381	0						$no_inst++; #count the words not annotated
382							}
383
384	0						$all_inst++; #count all the words
385							}
386							}
387
388							#return the ratio
389	0						return ($no_inst / $all_inst);
390							}
391
392
393
394							###### RETAGS THE LINE ######
395
396							# turns the tagged entity words into special words with <> for the context words
397							# input : $input <-- the line to retag
398							# $id <-- the id within the tag to look for
399							# output : (.arff files)
400							sub retag{
401	0			0	0		my $input = shift;
402	0						my $id = shift;
403
404	0						$id = lc($id);
405	0						my $line = lc($input);
406
407							#get rid of any tags
408	0						my @words = split (" ", $line);
409	0						my @newSet = ();
410	0						my $charact = 0;
411	0						foreach my $word (@words){
412	0	0					if($charact){ #if currently annotating
413	0	0					if($word eq ""){ #stop tagging annotations
414	0						$charact = 0;
415							}else{ #add the entity tagger to the words
416	0						my $charWord = "$word"."$entId";
417	0						push @newSet, $charWord;
418							}
419							}else{
420	0	0					if($word eq ""){ #start annotating
421	0						$charact = 1;
422							}else{ #add normal word
423	0						push @newSet, $word;
424							}
425							}
426							}
427
428							#clean up the new line
429	0						my $new_line = join " ", @newSet;
430	0						$new_line =~s/\b$entId\b//g;
431	0						$new_line = $uniSub->cleanWords($new_line);
432	0						return $new_line;
433							}
434
435							# turns the tagged entity words in the entire file into special words with <> for the context words
436							# input : $name <-- the name of the file to use as the id tag
437							# @lines <-- the set of lines to retag
438							# output : @tagSet <-- set of retagged lines
439							sub retagSet{
440	0			0	0		my $name = shift;
441	0						my $lines_ref = shift;
442	0						my @lines = @$lines_ref;
443
444	0						my @tagSet = ();
445	0						foreach my $line (@lines){
446							#retag the line
447	0						chomp($line);
448	0						my $tag_line = retag($line, $name);
449
450							#add it to the set
451	0						push @tagSet, $tag_line;
452							}
453	0						return @tagSet;
454							}
455
456							1;