| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #!/usr/bin/perl | 
| 2 |  |  |  |  |  |  | # NanoB2B-NER::NER::Avgman | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # Averages Weka files and makes a nice output file | 
| 5 |  |  |  |  |  |  | # Version 1.5 | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | # Program by Milk | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | package NanoB2B::NER::Avgman; | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 1 |  |  | 1 |  | 6 | use NanoB2B::UniversalRoutines; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 27 |  | 
| 12 | 1 |  |  | 1 |  | 4 | use File::Path qw(make_path);			#makes sub directories | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 35 |  | 
| 13 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 16 |  | 
| 14 | 1 |  |  | 1 |  | 4 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 2227 |  | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | ####          GLOBAL VARIABLES           #### | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | #option variables | 
| 19 |  |  |  |  |  |  | my $program_dir; | 
| 20 |  |  |  |  |  |  | my $bucketsNum = 10; | 
| 21 |  |  |  |  |  |  | my $weka_dir; | 
| 22 |  |  |  |  |  |  | my $debug = 0; | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | #universal subroutines object | 
| 25 |  |  |  |  |  |  | my %uniParams; | 
| 26 |  |  |  |  |  |  | my $uniSub; | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | #module variables | 
| 29 |  |  |  |  |  |  | my @features; | 
| 30 |  |  |  |  |  |  | my @wekaAttr = ("TP Rate", "FP Rate", "Precision", "Recall", "F-Measure", "MCC", "ROC Area", "PRC Area"); | 
| 31 |  |  |  |  |  |  | my @allBuckets; | 
| 32 |  |  |  |  |  |  | my @sets; | 
| 33 |  |  |  |  |  |  | my $entId = "_e"; | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | ####      A SUPER PET IS ADOPTED     #### | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | # construction method to create a new Avgman object | 
| 38 |  |  |  |  |  |  | # input  : $directory <-- the name of the directory for the files | 
| 39 |  |  |  |  |  |  | #		   $weka_dir  <-- the weka directory name | 
| 40 |  |  |  |  |  |  | #		   $features  <-- the set of features [e.g. "ortho morph text pos cui sem"] | 
| 41 |  |  |  |  |  |  | #		   $buckets   <-- the number of buckets used for the k-fold cross validation | 
| 42 |  |  |  |  |  |  | #		   $debug     <-- the set of features to run on [e.g. omtpcs] | 
| 43 |  |  |  |  |  |  | # output : $self      <-- an instance of the Avgman object | 
| 44 |  |  |  |  |  |  | sub new { | 
| 45 |  |  |  |  |  |  | #grab class and parameters | 
| 46 | 0 |  |  | 0 | 0 |  | my $self = {}; | 
| 47 | 0 |  |  |  |  |  | my $class = shift; | 
| 48 | 0 | 0 |  |  |  |  | return undef if(ref $class); | 
| 49 | 0 |  |  |  |  |  | my $params = shift; | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | #reset all the arrays | 
| 52 | 0 |  |  |  |  |  | %uniParams = (); | 
| 53 | 0 |  |  |  |  |  | @features = (); | 
| 54 | 0 |  |  |  |  |  | @allBuckets = (); | 
| 55 | 0 |  |  |  |  |  | @sets = (); | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | #bless this object | 
| 58 | 0 |  |  |  |  |  | bless $self, $class; | 
| 59 | 0 |  |  |  |  |  | $self->_init($params); | 
| 60 | 0 |  |  |  |  |  | @allBuckets = (1..$bucketsNum); | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | #retrieve parameters for universal-routines | 
| 63 | 0 |  |  |  |  |  | $uniParams{'debug'} = $debug; | 
| 64 | 0 |  |  |  |  |  | $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams); | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | #make the features | 
| 67 | 0 |  |  |  |  |  | my $item = "_"; | 
| 68 | 0 |  |  |  |  |  | foreach my $fs (@features){ | 
| 69 | 0 |  |  |  |  |  | $item .= substr($fs, 0, 1);		#add to abbreviations for the name | 
| 70 | 0 |  |  |  |  |  | push(@sets, $item); | 
| 71 |  |  |  |  |  |  | } | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  |  | 
| 74 | 0 |  |  |  |  |  | return $self; | 
| 75 |  |  |  |  |  |  | } | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | #  method to initialize the NanoB2B::NER::Avgman object. | 
| 78 |  |  |  |  |  |  | #  input : $parameters <- reference to a hash | 
| 79 |  |  |  |  |  |  | #  output: | 
| 80 |  |  |  |  |  |  | sub _init { | 
| 81 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 82 | 0 |  |  |  |  |  | my $params = shift; | 
| 83 |  |  |  |  |  |  |  | 
| 84 | 0 | 0 |  |  |  |  | $params = {} if(!defined $params); | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | #  get some of the parameters | 
| 87 | 0 |  |  |  |  |  | my $diroption = $params->{'directory'}; | 
| 88 | 0 |  |  |  |  |  | my $ftsoption = $params->{'features'}; | 
| 89 | 0 |  |  |  |  |  | my $bucketsNumoption = $params->{'buckets'}; | 
| 90 | 0 |  |  |  |  |  | my $wekadiroption = $params->{'weka_dir'}; | 
| 91 | 0 |  |  |  |  |  | my $debugoption = $params->{'debug'}; | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | #set the global variables | 
| 94 | 0 | 0 |  |  |  |  | if(defined $debugoption){$debug = $debugoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 95 | 0 | 0 |  |  |  |  | if(defined $diroption){$program_dir = $diroption;} | 
|  | 0 |  |  |  |  |  |  | 
| 96 | 0 | 0 |  |  |  |  | if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 97 | 0 | 0 |  |  |  |  | if(defined $ftsoption){@features = split(' ', $ftsoption);} | 
|  | 0 |  |  |  |  |  |  | 
| 98 | 0 | 0 |  |  |  |  | if(defined $wekadiroption){$weka_dir = $wekadiroption}; | 
|  | 0 |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | } | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | ###############			NOT YOUR AVGMAN! 		################ | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | #prints weka results to an output file | 
| 105 |  |  |  |  |  |  | # input  : $name 	  <-- name of the file with the weka data | 
| 106 |  |  |  |  |  |  | # output : (weka average results file) | 
| 107 |  |  |  |  |  |  | sub avg_file{ | 
| 108 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 109 | 0 |  |  |  |  |  | my $name = shift; | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | #get the averaged weka data, individual weka data, and majority sense baseline | 
| 112 | 0 |  |  |  |  |  | my %avg = averageWekaData($name); | 
| 113 | 0 |  |  |  |  |  | my %ind = individualWekaData($name); | 
| 114 | 0 |  |  |  |  |  | my $msb = getMSB($name); | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | #open new file | 
| 117 | 0 |  |  |  |  |  | my $direct = ("$program_dir/_WEKAS/$weka_dir"); | 
| 118 | 0 |  |  |  |  |  | make_path($direct); | 
| 119 | 0 |  |  |  |  |  | my $file = "$direct/$name" . "_weka-results"; | 
| 120 | 0 |  |  |  |  |  | my $WEKA_SAVE; | 
| 121 | 0 | 0 |  |  |  |  | open ($WEKA_SAVE, ">", "$file") || die ("Weka file $file could not be made in the _WEKAS directory! \n$!"); | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | #print the name | 
| 124 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "ENTITY NAME\t\t: $name"); | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  | #print the list of features | 
| 127 | 0 |  |  |  |  |  | $uniSub->print2FileNoLine($WEKA_SAVE, "FEATURES\t\t: "); | 
| 128 | 0 |  |  |  |  |  | foreach my $d(@features){$uniSub->print2FileNoLine($WEKA_SAVE, "$d, ");} | 
|  | 0 |  |  |  |  |  |  | 
| 129 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, ""); | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | #print the msb | 
| 132 | 0 |  |  |  |  |  | $msb = int($msb * 10**4) / 10**4; | 
| 133 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "Majority Label Baseline : $msb"); | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "\n"); | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | #print the averages | 
| 138 | 0 |  |  |  |  |  | foreach my $key (sort keys %avg){ | 
| 139 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "$key - AVERAGES"); | 
| 140 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "-------------------------"); | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 0 |  |  |  |  |  | my @arr = @{$avg{$key}}; | 
|  | 0 |  |  |  |  |  |  | 
| 143 | 0 |  |  |  |  |  | my $arrLen = @arr; | 
| 144 | 0 |  |  |  |  |  | for(my $a = 0; $a < $arrLen; $a++){ | 
| 145 | 0 |  |  |  |  |  | my $wekaThing = $wekaAttr[$a]; | 
| 146 | 0 |  |  |  |  |  | my $entry = $arr[$a]; | 
| 147 |  |  |  |  |  |  | #try to align the values based on the length | 
| 148 | 0 |  |  |  |  |  | my $tab = ""; | 
| 149 | 0 | 0 |  |  |  |  | if(length($wekaThing) > 7){ | 
|  |  | 0 |  |  |  |  |  | 
| 150 | 0 |  |  |  |  |  | $tab = "\t"; | 
| 151 |  |  |  |  |  |  | }elsif(length($wekaThing) < 4){ | 
| 152 | 0 |  |  |  |  |  | $tab = "\t\t\t"; | 
| 153 |  |  |  |  |  |  | }else{ | 
| 154 | 0 |  |  |  |  |  | $tab = "\t\t"; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  |  | 
| 157 | 0 |  |  |  |  |  | my $in = "$wekaThing" . $tab . "-\t$entry"; | 
| 158 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, $in); | 
| 159 |  |  |  |  |  |  | } | 
| 160 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, ""); | 
| 161 |  |  |  |  |  |  | } | 
| 162 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "\n"); | 
| 163 |  |  |  |  |  |  | #print the individual | 
| 164 | 0 |  |  |  |  |  | foreach my $key (sort keys %ind){ | 
| 165 |  |  |  |  |  |  | #header | 
| 166 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "$key - INDIVIDUAL"); | 
| 167 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "==================\n"); | 
| 168 | 0 |  |  |  |  |  | $uniSub->print2FileNoLine($WEKA_SAVE, "\t\t\t"); | 
| 169 | 0 |  |  |  |  |  | foreach my $r(@wekaAttr){ | 
| 170 | 0 |  |  |  |  |  | my $sr = ""; | 
| 171 | 0 | 0 |  |  |  |  | if(length($r) >= 6){$sr = (substr($r, 0, 6) . ".");} | 
|  | 0 | 0 |  |  |  |  |  | 
| 172 | 0 |  |  |  |  |  | elsif(length($r) < 4){$sr = "$r\t";} | 
| 173 | 0 |  |  |  |  |  | else{$sr = $r;} | 
| 174 | 0 |  |  |  |  |  | $uniSub->print2FileNoLine($WEKA_SAVE, "$sr\t"); | 
| 175 |  |  |  |  |  |  | } | 
| 176 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, "\n-------------------------------------------------------------------------------"); | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | #print the lines | 
| 179 | 0 |  |  |  |  |  | my @arr = @{$ind{$key}}; | 
|  | 0 |  |  |  |  |  |  | 
| 180 | 0 |  |  |  |  |  | my $arrLen = @arr; | 
| 181 |  |  |  |  |  |  |  | 
| 182 | 0 |  |  |  |  |  | for(my $a = 0; $a < $arrLen;$a++){ | 
| 183 | 0 |  |  |  |  |  | my $b = $a + 1; | 
| 184 | 0 |  |  |  |  |  | my $entry = $arr[$a]; | 
| 185 |  |  |  |  |  |  | #printColorDebug("on_red", "$name - $entry"); | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 0 |  |  |  |  |  | my $in = "BUCKET $b\t$entry"; | 
| 188 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, $in); | 
| 189 |  |  |  |  |  |  | } | 
| 190 |  |  |  |  |  |  |  | 
| 191 | 0 |  |  |  |  |  | $uniSub->print2File($WEKA_SAVE, ""); | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  |  | 
| 194 | 0 |  |  |  |  |  | close $WEKA_SAVE; | 
| 195 |  |  |  |  |  |  |  | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | #average the weka accuracy datas | 
| 200 |  |  |  |  |  |  | # input  : $name 	  <-- name of the file with the weka data | 
| 201 |  |  |  |  |  |  | # output : %featAvg   <-- hash of each feature set's averages (in array form aligned to the wekaAttr) | 
| 202 |  |  |  |  |  |  | sub averageWekaData{ | 
| 203 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 204 | 0 |  |  |  |  |  | my %featAvg = (); | 
| 205 |  |  |  |  |  |  |  | 
| 206 | 0 |  |  |  |  |  | foreach my $item (@sets){ | 
| 207 | 0 |  |  |  |  |  | my %data = (); | 
| 208 |  |  |  |  |  |  |  | 
| 209 | 0 |  |  |  |  |  | foreach my $bucket(@allBuckets){ | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | #import the wekaman | 
| 212 | 0 |  |  |  |  |  | my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket"; | 
| 213 | 0 | 0 |  |  |  |  | open (WEKA, "$file") || die ("Unable to import $file from _WEKAS directory\n $!"); | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | #get lines | 
| 216 | 0 |  |  |  |  |  | my @lines = ; | 
| 217 | 0 |  |  |  |  |  | foreach my $line(@lines){chomp($line)}; | 
|  | 0 |  |  |  |  |  |  | 
| 218 | 0 |  |  |  |  |  | my $len = @lines; | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_red", "YOUR FACE!"); | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | #file doesn't have any contents for some reason | 
| 223 | 0 | 0 |  |  |  |  | if($len == 0){ | 
| 224 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "**ERROR: NOTHING FOUND FOR $file!**\n"); | 
| 225 | 0 |  |  |  |  |  | my $n = @wekaAttr; | 
| 226 | 0 |  |  |  |  |  | my @values = (0) x $n; | 
| 227 | 0 |  |  |  |  |  | push(@{$data{$bucket}}, @values); | 
|  | 0 |  |  |  |  |  |  | 
| 228 | 0 |  |  |  |  |  | next; | 
| 229 |  |  |  |  |  |  | #return %featAvg; | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | #get the rest of the array | 
| 233 | 0 |  |  |  |  |  | my $keyword = "=== Error on test data ==="; | 
| 234 | 0 |  |  |  |  |  | my $index = $uniSub->getIndexofLine($keyword, \@lines); | 
| 235 | 0 |  |  |  |  |  | my @result = @lines[$index..$len]; | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | #grab the only stuff you need | 
| 238 | 0 |  |  |  |  |  | my $weightWord = "     Yes"; | 
| 239 | 0 |  |  |  |  |  | my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result); | 
| 240 | 0 |  |  |  |  |  | my $weightLine = $result[$weightIndex]; | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | #split it uuuuuup | 
| 243 | 0 |  |  |  |  |  | my @values = split /\s+/, $weightLine; | 
| 244 | 0 |  |  |  |  |  | my $valLen = @values; | 
| 245 | 0 |  |  |  |  |  | my $valLen2 = $valLen - 2; | 
| 246 | 0 |  |  |  |  |  | @values = @values[1..$valLen2]; | 
| 247 |  |  |  |  |  |  | #printArr(", ", @values); | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | #add to overall | 
| 250 | 0 |  |  |  |  |  | push(@{$data{$bucket}}, @values); | 
|  | 0 |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | } | 
| 252 |  |  |  |  |  |  |  | 
| 253 | 0 |  |  |  |  |  | my @averages = (); | 
| 254 |  |  |  |  |  |  | #add all the stuffs | 
| 255 | 0 |  |  |  |  |  | foreach my $bucket (@allBuckets){ | 
| 256 | 0 |  |  |  |  |  | my @wekaSet = @{$data{$bucket}}; | 
|  | 0 |  |  |  |  |  |  | 
| 257 | 0 |  |  |  |  |  | my $wekaLen = @wekaSet; | 
| 258 | 0 |  |  |  |  |  | for(my $e = 0; $e < $wekaLen; $e++){ | 
| 259 | 0 |  |  |  |  |  | my $entry = $wekaSet[$e]; | 
| 260 | 0 | 0 | 0 |  |  |  | if($entry ne "NaN" and $entry ne "?"){		#if value is unknown (from WEKA) | 
| 261 | 0 |  |  |  |  |  | $averages[$e] += $entry; | 
| 262 |  |  |  |  |  |  | } | 
| 263 |  |  |  |  |  |  | } | 
| 264 |  |  |  |  |  |  | } | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | #printArr(", ", @avgLens); | 
| 267 |  |  |  |  |  |  |  | 
| 268 |  |  |  |  |  |  | #divide them | 
| 269 | 0 |  |  |  |  |  | foreach my $tea (@averages){ | 
| 270 | 0 |  |  |  |  |  | $tea /= $bucketsNum; | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | #add the averages for the specific feature | 
| 274 | 0 |  |  |  |  |  | push(@{$featAvg{$item}}, @averages); | 
|  | 0 |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  | #exit; | 
| 277 | 0 |  |  |  |  |  | return %featAvg; | 
| 278 |  |  |  |  |  |  | } | 
| 279 |  |  |  |  |  |  |  | 
| 280 |  |  |  |  |  |  | #retrieve the weighted average lines from the individual buckets | 
| 281 |  |  |  |  |  |  | # input  : $name 	  <-- name of the file with the weka data | 
| 282 |  |  |  |  |  |  | # output : %featData   <-- hash of arrays for each line of individual data for the buckets | 
| 283 |  |  |  |  |  |  | sub individualWekaData{ | 
| 284 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 285 | 0 |  |  |  |  |  | my %featData = (); | 
| 286 |  |  |  |  |  |  |  | 
| 287 | 0 |  |  |  |  |  | foreach my $item (@sets){		#for each feature specified | 
| 288 | 0 |  |  |  |  |  | my @data = (); | 
| 289 |  |  |  |  |  |  |  | 
| 290 | 0 |  |  |  |  |  | foreach my $bucket(@allBuckets){	#for each bucket in the set | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | #import the wekaman | 
| 293 | 0 |  |  |  |  |  | my $file = "$program_dir/_WEKAS/$weka_dir/$name" . "_WEKA_DATA/$item/$name" . "_accuracy_$bucket"; | 
| 294 | 0 | 0 |  |  |  |  | open (WEKA, "$file") || die ("Unable to import $file from _WEKAS directory $!"); | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | #get lines | 
| 297 | 0 |  |  |  |  |  | my @lines = ; | 
| 298 | 0 |  |  |  |  |  | foreach my $line(@lines){chomp($line)}; | 
|  | 0 |  |  |  |  |  |  | 
| 299 | 0 |  |  |  |  |  | my $len = @lines; | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | #if the contents of the file are empty for some reason | 
| 302 | 0 | 0 |  |  |  |  | if($len == 0){ | 
| 303 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "**ERROR: NOTHING FOUND FOR $file!**\n"); | 
| 304 | 0 |  |  |  |  |  | my $n = @wekaAttr; | 
| 305 | 0 |  |  |  |  |  | my @values = (0) x $n; | 
| 306 | 0 |  |  |  |  |  | my $valueLine = join(" ", @values); | 
| 307 | 0 |  |  |  |  |  | push(@data, $valueLine); | 
| 308 | 0 |  |  |  |  |  | next; | 
| 309 |  |  |  |  |  |  | #return %featAvg; | 
| 310 |  |  |  |  |  |  | } | 
| 311 |  |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  | #get the rest of the array | 
| 313 | 0 |  |  |  |  |  | my $keyword = "=== Error on test data ==="; | 
| 314 | 0 |  |  |  |  |  | my $index = $uniSub->getIndexofLine($keyword, \@lines); | 
| 315 | 0 |  |  |  |  |  | my @result = @lines[$index..$len]; | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | #grab the only stuff you need | 
| 318 | 0 |  |  |  |  |  | my $weightWord = "     Yes"; | 
| 319 | 0 |  |  |  |  |  | my $weightIndex = $uniSub->getIndexofLine($weightWord, \@result); | 
| 320 | 0 |  |  |  |  |  | my $weightLine = $result[$weightIndex]; | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | #split it up | 
| 323 | 0 |  |  |  |  |  | my @values = split /\s+/, $weightLine; | 
| 324 | 0 |  |  |  |  |  | my $valLen = @values; | 
| 325 | 0 |  |  |  |  |  | $valLen -= 2; | 
| 326 | 0 |  |  |  |  |  | my @values2 = @values[1..$valLen]; | 
| 327 |  |  |  |  |  |  |  | 
| 328 |  |  |  |  |  |  | #rejoin it | 
| 329 | 0 |  |  |  |  |  | my $valLine = join("\t", @values2); | 
| 330 |  |  |  |  |  |  |  | 
| 331 |  |  |  |  |  |  | #add it | 
| 332 | 0 |  |  |  |  |  | push (@data, $valLine); | 
| 333 |  |  |  |  |  |  | } | 
| 334 | 0 |  |  |  |  |  | push (@{$featData{$item}}, @data); | 
|  | 0 |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | } | 
| 336 | 0 |  |  |  |  |  | return %featData; | 
| 337 |  |  |  |  |  |  | } | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | # grabs the majority sense-label baseline for the file | 
| 340 |  |  |  |  |  |  | # input  : $name      <-- name of the file to get the entities from | 
| 341 |  |  |  |  |  |  | # output : $msb       <-- # of no instances / # of yes instances | 
| 342 |  |  |  |  |  |  | sub getMSB{ | 
| 343 | 0 |  |  | 0 | 0 |  | my $file = shift; | 
| 344 |  |  |  |  |  |  | #get the name of the file | 
| 345 | 0 |  |  |  |  |  | my @n = split '/', $file; | 
| 346 | 0 |  |  |  |  |  | my $l = @n; | 
| 347 | 0 |  |  |  |  |  | my $filename = $n[$l - 1]; | 
| 348 | 0 |  |  |  |  |  | $filename = lc($filename); | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | #import the data from the file | 
| 351 | 0 | 0 |  |  |  |  | if($program_dir ne ""){open (FILE, "$program_dir/$file") || die ("what is this '$program_dir/$file' you speak of?\n$!");} | 
|  | 0 | 0 |  |  |  |  |  | 
| 352 | 0 | 0 |  |  |  |  | else{open (FILE, "$file") || die ("what is this '$file' you speak of? $!\n");} | 
| 353 | 0 |  |  |  |  |  | my @fileLines = ; | 
| 354 | 0 |  |  |  |  |  | foreach my $l(@fileLines){ | 
| 355 | 0 |  |  |  |  |  | $l = lc($l); | 
| 356 |  |  |  |  |  |  | } | 
| 357 |  |  |  |  |  |  |  | 
| 358 |  |  |  |  |  |  | #clean it up for two separate sets | 
| 359 | 0 |  |  |  |  |  | my @tagSet = retagSet($filename, \@fileLines); | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  | #count and get results | 
| 362 | 0 |  |  |  |  |  | return countInst(\@tagSet); | 
| 363 |  |  |  |  |  |  | } | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | # counts the # of no instances / # of all instances | 
| 366 |  |  |  |  |  |  | # input  : @set   <-- set of lines to read and count | 
| 367 |  |  |  |  |  |  | # output : $msb   <-- # of no instances / # of all instances | 
| 368 |  |  |  |  |  |  | sub countInst{ | 
| 369 | 0 |  |  | 0 | 0 |  | my $set_ref = shift; | 
| 370 | 0 |  |  |  |  |  | my @set = @$set_ref; | 
| 371 |  |  |  |  |  |  |  | 
| 372 | 0 |  |  |  |  |  | my $no_inst = 0; | 
| 373 | 0 |  |  |  |  |  | my $all_inst = 0; | 
| 374 |  |  |  |  |  |  |  | 
| 375 | 0 |  |  |  |  |  | foreach my $line(@set){ | 
| 376 | 0 |  |  |  |  |  | my @words = split(" ", $line); | 
| 377 | 0 |  |  |  |  |  | foreach my $word(@words){ | 
| 378 |  |  |  |  |  |  | #print($word . "\n"); | 
| 379 |  |  |  |  |  |  | #if the word doesn't have a entity tag | 
| 380 | 0 | 0 |  |  |  |  | if(!($word =~ /([\s\S]*($entId)$)/)){ | 
| 381 | 0 |  |  |  |  |  | $no_inst++;		#count the words not annotated | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  |  | 
| 384 | 0 |  |  |  |  |  | $all_inst++;		#count all the words | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  | } | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | #return the ratio | 
| 389 | 0 |  |  |  |  |  | return ($no_inst / $all_inst); | 
| 390 |  |  |  |  |  |  | } | 
| 391 |  |  |  |  |  |  |  | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | ######    RETAGS THE LINE    ###### | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | # turns the tagged entity words into special words with <> for the context words | 
| 397 |  |  |  |  |  |  | # input  : $input <-- the line to retag | 
| 398 |  |  |  |  |  |  | #		   $id    <-- the id within the tag to look for | 
| 399 |  |  |  |  |  |  | # output : (.arff files) | 
| 400 |  |  |  |  |  |  | sub retag{ | 
| 401 | 0 |  |  | 0 | 0 |  | my $input = shift; | 
| 402 | 0 |  |  |  |  |  | my $id = shift; | 
| 403 |  |  |  |  |  |  |  | 
| 404 | 0 |  |  |  |  |  | $id = lc($id); | 
| 405 | 0 |  |  |  |  |  | my $line = lc($input); | 
| 406 |  |  |  |  |  |  |  | 
| 407 |  |  |  |  |  |  | #get rid of any tags | 
| 408 | 0 |  |  |  |  |  | my @words = split (" ", $line); | 
| 409 | 0 |  |  |  |  |  | my @newSet = (); | 
| 410 | 0 |  |  |  |  |  | my $charact = 0; | 
| 411 | 0 |  |  |  |  |  | foreach my $word (@words){ | 
| 412 | 0 | 0 |  |  |  |  | if($charact){				#if currently annotating | 
| 413 | 0 | 0 |  |  |  |  | if($word eq ""){		#stop tagging annotations | 
| 414 | 0 |  |  |  |  |  | $charact = 0; | 
| 415 |  |  |  |  |  |  | }else{						#add the entity tagger to the words | 
| 416 | 0 |  |  |  |  |  | my $charWord = "$word"."$entId"; | 
| 417 | 0 |  |  |  |  |  | push @newSet, $charWord; | 
| 418 |  |  |  |  |  |  | } | 
| 419 |  |  |  |  |  |  | }else{ | 
| 420 | 0 | 0 |  |  |  |  | if($word eq ""){		#start annotating | 
| 421 | 0 |  |  |  |  |  | $charact = 1; | 
| 422 |  |  |  |  |  |  | }else{							#add normal word | 
| 423 | 0 |  |  |  |  |  | push @newSet, $word; | 
| 424 |  |  |  |  |  |  | } | 
| 425 |  |  |  |  |  |  | } | 
| 426 |  |  |  |  |  |  | } | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | #clean up the new line | 
| 429 | 0 |  |  |  |  |  | my $new_line = join " ", @newSet; | 
| 430 | 0 |  |  |  |  |  | $new_line =~s/\b$entId\b//g; | 
| 431 | 0 |  |  |  |  |  | $new_line = $uniSub->cleanWords($new_line); | 
| 432 | 0 |  |  |  |  |  | return $new_line; | 
| 433 |  |  |  |  |  |  | } | 
| 434 |  |  |  |  |  |  |  | 
| 435 |  |  |  |  |  |  | # turns the tagged entity words in the entire file into special words with <> for the context words | 
| 436 |  |  |  |  |  |  | # input  : $name  <-- the name of the file to use as the id tag | 
| 437 |  |  |  |  |  |  | #		   @lines   <-- the set of lines to retag | 
| 438 |  |  |  |  |  |  | # output : @tagSet <-- set of retagged lines | 
| 439 |  |  |  |  |  |  | sub retagSet{ | 
| 440 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 441 | 0 |  |  |  |  |  | my $lines_ref = shift; | 
| 442 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 443 |  |  |  |  |  |  |  | 
| 444 | 0 |  |  |  |  |  | my @tagSet = (); | 
| 445 | 0 |  |  |  |  |  | foreach my $line (@lines){ | 
| 446 |  |  |  |  |  |  | #retag the line | 
| 447 | 0 |  |  |  |  |  | chomp($line); | 
| 448 | 0 |  |  |  |  |  | my $tag_line = retag($line, $name); | 
| 449 |  |  |  |  |  |  |  | 
| 450 |  |  |  |  |  |  | #add it to the set | 
| 451 | 0 |  |  |  |  |  | push @tagSet, $tag_line; | 
| 452 |  |  |  |  |  |  | } | 
| 453 | 0 |  |  |  |  |  | return @tagSet; | 
| 454 |  |  |  |  |  |  | } | 
| 455 |  |  |  |  |  |  |  | 
| 456 |  |  |  |  |  |  | 1; |