File Coverage

blib/lib/NanoB2B/NER/Avgman.pm

Criterion	Covered	Total	%
statement	12	236	5.0
branch	0	48	0.0
condition	0	3	0.0
subroutine	4	13	30.7
pod	0	8	0.0
total	16	308	5.1

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2							# NanoB2B-NER::NER::Avgman
3							#
4							# Averages Weka files and makes a nice output file
5							# Version 1.5
6							#
7							# Program by Milk
8
9							package NanoB2B::NER::Avgman;
10
11	1			1		6	use NanoB2B::UniversalRoutines;
	1					2
	1					25
12	1			1		4	use File::Path qw(make_path); #makes sub directories
	1					2
	1					35
13	1			1		5	use strict;
	1					2
	1					18
14	1			1		4	use warnings;
	1					1
	1					1728
15
16							#### GLOBAL VARIABLES ####
17
18							#option variables
19							my $program_dir;
20							my $bucketsNum = 10;
21							my $weka_dir;
22							my $debug = 0;
23
24							#universal subroutines object
25							my %uniParams;
26							my $uniSub;
27
28							#module variables
29							my @features;
30							my @wekaAttr = ("TP Rate", "FP Rate", "Precision", "Recall", "F-Measure", "MCC", "ROC Area", "PRC Area");
31							my @allBuckets;
32							my @sets;
33							my $entId = "_e";
34
35							#### A SUPER PET IS ADOPTED ####
36
37							# construction method to create a new Avgman object
38							# input : $directory <-- the name of the directory for the files
39							# $weka_dir <-- the weka directory name
40							# $features <-- the set of features [e.g. "ortho morph text pos cui sem"]
41							# $buckets <-- the number of buckets used for the k-fold cross validation
42							# $debug <-- the set of features to run on [e.g. omtpcs]
43							# output : $self <-- an instance of the Avgman object
44							sub new {
45							#grab class and parameters
46	0			0	0		my $self = {};
47	0						my $class = shift;
48	0	0					return undef if(ref $class);
49	0						my $params = shift;
50
51							#reset all the arrays
52	0						%uniParams = ();
53	0						@features = ();
54	0						@allBuckets = ();
55	0						@sets = ();
56
57							#bless this object
58	0						bless $self, $class;
59	0						$self->_init($params);
60	0						@allBuckets = (1..$bucketsNum);
61
62							#retrieve parameters for universal-routines
63	0						$uniParams{'debug'} = $debug;
64	0						$uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
65
66							#make the features
67	0						my $item = "_";
68	0						foreach my $fs (@features){
69	0						$item .= substr($fs, 0, 1); #add to abbreviations for the name
70	0						push(@sets, $item);
71							}
72
73
74	0						return $self;
75							}
76
77							# method to initialize the NanoB2B::NER::Avgman object.
78							# input : $parameters <- reference to a hash
79							# output:
80							sub _init {
81	0			0			my $self = shift;
82	0						my $params = shift;
83
84	0	0					$params = {} if(!defined $params);
85
86							# get some of the parameters
87	0						my $diroption = $params->{'directory'};
88	0						my $ftsoption = $params->{'features'};
89	0						my $bucketsNumoption = $params->{'buckets'};
90	0						my $wekadiroption = $params->{'weka_dir'};
91	0						my $debugoption = $params->{'debug'};
92
93							#set the global variables
94	0	0					if(defined $debugoption){$debug = $debugoption;}
	0
95	0	0					if(defined $diroption){$program_dir = $diroption;}
	0
96	0	0					if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;}
	0
97	0	0					if(defined $ftsoption){@features = split(' ', $ftsoption);}
	0
98	0	0					if(defined $wekadiroption){$weka_dir = $wekadiroption};
	0
99							}
100
101
102							############### NOT YOUR AVGMAN! ################
103
104							#prints weka results to an output file
105							# input : $name <-- name of the file with the weka data
106							# output : (weka average results file)
107							sub avg_file{
108	0			0	0		my $self = shift;
109	0						my $name = shift;
110
111	0						my %avg = averageWekaData($name);
112	0						my %ind = individualWekaData($name);
113	0						my $msb = getMSB($name);
114
115							#open new file
116	0						my $direct = ("$program_dir/_WEKAS/$weka_dir");
117	0						make_path($direct);
118	0						my $file = "$direct/$name" . "_weka-results";
119	0						my $WEKA_SAVE;
120	0	0					open ($WEKA_SAVE, ">", "$file") \|\| die ("Aw man! $!");
121
122							#print the name
123	0						$uniSub->print2File($WEKA_SAVE, "ENTITY NAME\t\t: $name");
124
125							#print the list of features
126	0						$uniSub->print2FileNoLine($WEKA_SAVE, "FEATURES\t\t: ");
127	0						foreach my $d(@features){$uniSub->print2FileNoLine($WEKA_SAVE, "$d, ");}
	0
128	0						$uniSub->print2File($WEKA_SAVE, "");
129
130							#print the msb
131	0						$msb = int($msb * 104) / 104;
132	0						$uniSub->print2File($WEKA_SAVE, "Majority Label Baseline : $msb");
133
134	0						$uniSub->print2File($WEKA_SAVE, "\n");
135
136							#print the averages
137	0						foreach my $key (sort keys %avg){
138	0						$uniSub->print2File($WEKA_SAVE, "$key - AVERAGES");
139	0						$uniSub->print2File($WEKA_SAVE, "-------------------------");
140
141	0						my @arr = @{$avg{$key}};
	0
142	0						my $arrLen = @arr;
143	0						for(my $a = 0; $a < $arrLen; $a++){
144	0						my $wekaThing = $wekaAttr[$a];
145	0						my $entry = $arr[$a];
146	0						my $tab = "";
147	0	0					if(length($wekaThing) > 7){
		0
148	0						$tab = "\t";
149							}elsif(length($wekaThing) < 4){
150	0						$tab = "\t\t\t";
151							}else{
152	0						$tab = "\t\t";
153							}
154
155	0						my $in = "$wekaThing" . $tab . "-\t$entry";
156	0						$uniSub->print2File($WEKA_SAVE, $in);
157							}
158	0						$uniSub->print2File($WEKA_SAVE, "");
159							}
160	0						$uniSub->print2File($WEKA_SAVE, "\n");
161							#print the individual
162	0						foreach my $key (sort keys %ind){
163							#header
164	0						$uniSub->print2File($WEKA_SAVE, "$key - INDIVIDUAL");
165	0						$uniSub->print2File($WEKA_SAVE, "==================\n");
166	0						$uniSub->print2FileNoLine($WEKA_SAVE, "\t\t\t");
167	0						foreach my $r(@wekaAttr){
168	0						my $sr = "";
169	0	0					if(length($r) >= 6){$sr = (substr($r, 0, 6) . ".");}
	0	0
170	0						elsif(length($r) < 4){$sr = "$r\t";}
171	0						else{$sr = $r;}
172	0						$uniSub->print2FileNoLine($WEKA_SAVE, "$sr\t");
173							}
174	0						$uniSub->print2File($WEKA_SAVE, "\n-------------------------------------------------------------------------------");
175
176							#print the lines
177	0						my @arr = @{$ind{$key}};
	0
178	0						my $arrLen = @arr;
179
180	0						for(my $a = 0; $a < $arrLen;$a++){
181	0						my $b = $a + 1;
182	0						my $entry = $arr[$a];
183							#printColorDebug("on_red", "$name - $entry");
184
185	0						my $in = "BUCKET $b\t$entry";
186	0						$uniSub->print2File($WEKA_SAVE, $in);
187							}
188
189	0						$uniSub->print2File($WEKA_SAVE, "");
190							}
191
192	0						close $WEKA_SAVE;
193
194							}
195
196
197							#average the weka accuracy datas
198							# input : $name <-- name of the file with the weka data
199							# output : %featAvg <-- hash of each feature set's averages (in array form aligned to the wekaAttr)
200							sub averageWekaData{
201	0			0	0		my $name = shift;
202	0						my %featAvg = ();
203
204	0						foreach my $item (@sets){
205	0						my %data = ();
206
207	0						foreach my $bucket(@allBuckets){
208
209							#import the wekaman
210	0						my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
211	0	0					open (WEKA, "$file") \|\| die ("WHY NO FILE - $file?!");
212
213							#get lines
214	0						my @lines = ;
215	0						foreach my $line(@lines){chomp($line)};
	0
216	0						my $len = @lines;
217
218							#$uniSub->printColorDebug("on_red", "YOUR FACE!");
219
220	0	0					if($len == 0){
221	0						$uniSub->printColorDebug("red", "NOTHING FOUND FOR $file!\n");
222	0						my $n = @wekaAttr;
223	0						my @values = (0) x $n;
224	0						push(@{$data{$bucket}}, @values);
	0
225	0						next;
226							#return %featAvg;
227							}
228
229							#get the rest of the array
230	0						my $keyword = "=== Error on test data ===";
231	0						my $index = $uniSub->getIndexofLine($keyword, \@lines);
232	0						my @result = @lines[$index..$len];
233
234							#grab the only stuff you need
235	0						my $weightWord = " Yes";
236	0						my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
237	0						my $weightLine = $result[$weightIndex];
238
239							#split it uuuuuup
240	0						my @values = split /\s+/, $weightLine;
241	0						my $valLen = @values;
242	0						my $valLen2 = $valLen - 2;
243	0						@values = @values[1..$valLen2];
244							#printArr(", ", @values);
245
246							#add to overall
247	0						push(@{$data{$bucket}}, @values);
	0
248							}
249
250	0						my @averages = ();
251							#add all the stuffs
252	0						foreach my $bucket (@allBuckets){
253	0						my @wekaSet = @{$data{$bucket}};
	0
254	0						my $wekaLen = @wekaSet;
255	0						for(my $e = 0; $e < $wekaLen; $e++){
256	0						my $entry = $wekaSet[$e];
257	0	0	0				if($entry ne "NaN" and $entry ne "?"){
258	0						$averages[$e] += $entry;
259							}
260							}
261							}
262
263							#printArr(", ", @avgLens);
264
265							#divide them
266	0						foreach my $tea (@averages){
267	0						$tea /= $bucketsNum;
268							}
269
270							#ta-da averages
271	0						push(@{$featAvg{$item}}, @averages);
	0
272
273							#hello I am here(>'_')> hug me~
274							#ok then <(^_^<) ~
275							}
276							#exit;
277	0						return %featAvg;
278							}
279
280							#retrieve the weighted average lines from the individual buckets
281							# input : $name <-- name of the file with the weka data
282							# output : %featData <-- hash of arrays for each line of individual data for the buckets
283							sub individualWekaData{
284	0			0	0		my $name = shift;
285	0						my %featData = ();
286
287	0						foreach my $item (@sets){
288	0						my @data = ();
289
290	0						foreach my $bucket(@allBuckets){
291
292							#import the wekaman
293	0						my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket";
294	0	0					open (WEKA, "$file") \|\| die ("WHY NO FILE - $file?!");
295
296							#get lines
297	0						my @lines = ;
298	0						foreach my $line(@lines){chomp($line)};
	0
299	0						my $len = @lines;
300
301	0	0					if($len == 0){
302	0						$uniSub->printColorDebug("red", "NOTHING FOUND FOR $file!\n");
303	0						my $n = @wekaAttr;
304	0						my @values = (0) x $n;
305	0						my $valueLine = join(" ", @values);
306	0						push(@data, $valueLine);
307	0						next;
308							#return %featAvg;
309							}
310
311							#get the rest of the array
312	0						my $keyword = "=== Error on test data ===";
313	0						my $index = $uniSub->getIndexofLine($keyword, \@lines);
314	0						my @result = @lines[$index..$len];
315
316							#grab the only stuff you need
317	0						my $weightWord = " Yes";
318	0						my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result);
319	0						my $weightLine = $result[$weightIndex];
320
321							#split it uuup
322	0						my @values = split /\s+/, $weightLine;
323	0						my $valLen = @values;
324	0						$valLen -= 2;
325	0						my @values2 = @values[1..$valLen];
326
327							#rejoin it
328	0						my $valLine = join("\t", @values2);
329
330							#add it
331	0						push (@data, $valLine);
332							}
333	0						push (@{$featData{$item}}, @data);
	0
334							}
335	0						return %featData;
336							}
337
338							# grabs the majority sense-label baseline for the file
339							# input : $name <-- name of the file to get the entities from
340							# output : $msb <-- # of no instances / # of yes instances
341							sub getMSB{
342	0			0	0		my $file = shift;
343							#get the name of the file
344	0						my @n = split '/', $file;
345	0						my $l = @n;
346	0						my $filename = $n[$l - 1];
347	0						$filename = lc($filename);
348
349							#import the data from the file
350	0	0					if($program_dir ne ""){open (FILE, "$program_dir/$file") \|\| die ("what is this '$program_dir/$file' you speak of?\n");}
	0	0
351	0	0					else{open (FILE, "$file") \|\| die ("what is this 'file' you speak of?\n");}
352	0						my @fileLines = ;
353	0						foreach my $l(@fileLines){
354	0						$l = lc($l);
355							}
356
357							#clean it up for two separate sets
358	0						my @tagSet = retagSet($filename, \@fileLines);
359
360							#count and get results
361	0						return countInst(\@tagSet);
362							}
363
364							# counts the # of no instances / # of all instances
365							# input : @set <-- set of lines to read and count
366							# output : $msb <-- # of no instances / # of all instances
367
368							sub countInst{
369	0			0	0		my $set_ref = shift;
370	0						my @set = @$set_ref;
371
372	0						my $no_inst = 0;
373	0						my $all_inst = 0;
374
375	0						foreach my $line(@set){
376	0						my @words = split(" ", $line);
377	0						foreach my $word(@words){
378							#print($word . "\n");
379	0	0					if(!($word =~ /([\s\S]*($entId)$)/)){
380	0						$no_inst++;
381							}
382
383	0						$all_inst++;
384							}
385							}
386
387	0						return ($no_inst / $all_inst);
388							}
389
390
391
392							###### RETAGS THE LINE ######
393
394							# turns the tagged entity words into special words with <> for the context words
395							# input : $input <-- the line to retag
396							# $id <-- the id within the tag to look for
397							# output : (.arff files)
398							sub retag{
399	0			0	0		my $input = shift;
400	0						my $id = shift;
401
402	0						$id = lc($id);
403	0						my $line = lc($input);
404
405							#get rid of any tags
406	0						my @words = split (" ", $line);
407	0						my @newSet = ();
408	0						my $charact = 0;
409	0						foreach my $word (@words){
410	0	0					if($charact){
411	0	0					if($word eq ""){
412	0						$charact = 0;
413							}else{
414	0						my $charWord = "$word"."$entId";
415	0						push @newSet, $charWord;
416							}
417							}else{
418	0	0					if($word eq ""){
419	0						$charact = 1;
420							}else{
421	0						push @newSet, $word;
422							}
423							}
424							}
425
426							#clean up the new line
427	0						my $new_line = join " ", @newSet;
428	0						$new_line =~s/\b$entId\b//g;
429	0						$new_line = $uniSub->cleanWords($new_line);
430	0						return $new_line;
431							}
432
433							# turns the tagged entity words in the entire file into special words with <> for the context words
434							# input : $name <-- the name of the file to use as the id tag
435							# @lines <-- the set of lines to retag
436							# output : @tagSet <-- set of retagged lines
437							sub retagSet{
438	0			0	0		my $name = shift;
439	0						my $lines_ref = shift;
440	0						my @lines = @$lines_ref;
441
442	0						my @tagSet = ();
443	0						foreach my $line (@lines){
444							#retag the line
445	0						chomp($line);
446	0						my $tag_line = retag($line, $name);
447
448							#add it to the set
449	0						push @tagSet, $tag_line;
450							}
451	0						return @tagSet;
452							}
453
454							1;