File Coverage

blib/lib/NanoB2B/NER/Arffman.pm

Criterion	Covered	Total	%
statement	18	839	2.1
branch	0	212	0.0
condition	0	63	0.0
subroutine	6	41	14.6
pod	0	34	0.0
total	24	1189	2.0

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2							# NanoB2B-NER::NER::Arffman
3							#
4							# Creates ARFF files from annotated files
5							# Version 1.5
6							#
7							# Program by Milk
8
9							package NanoB2B::NER::Arffman;
10
11	1			1		6	use NanoB2B::UniversalRoutines;
	1					2
	1					25
12	1			1		4	use MetaMap::DataStructures;
	1					2
	1					17
13	1			1		4	use File::Path qw(make_path); #makes sub directories
	1					2
	1					38
14	1			1		6	use List::MoreUtils qw(uniq);
	1					2
	1					6
15
16	1			1		457	use strict;
	1					2
	1					15
17	1			1		4	use warnings;
	1					1
	1					6192
18
19							#option variables
20							my $debug = 1;
21							my $program_dir = "";
22							my $fileIndex = 0;
23							my $stopwords_file;
24							my $prefix = 3;
25							my $suffix = 3;
26							my $bucketsNum = 10;
27							my $is_cui = 0;
28							my $sparse_matrix = 0;
29							my $wcs = "";
30
31							#datastructure object
32							my %params = ();
33							my $dataStructures = MetaMap::DataStructures->new(\%params);
34
35							#universal subroutines object
36							my %uniParams = ();
37							my $uniSub;
38
39							#other general global variables
40							my @allBuckets;
41							my %fileHash;
42							my %metamapHash;
43							my %tokenHash;
44							my %conceptHash;
45							my %posHash;
46							my %semHash;
47							my %cuiHash;
48							my %orthoHash;
49							my @features;
50							my $selfId = "_self";
51							my $entId = "_e";
52							my $morphID = "_m";
53
54							my $stopRegex;
55
56							#### A HERO IS BORN ####
57
58							# construction method to create a new Arffman object
59							# input : $directory <-- the name of the directory for the files
60							# $name <-- name of the file to examine
61							# $features <-- the list of features to use [e.g. "ortho morph text pos cui sem"]
62							# $bucketsNum <-- the number of buckets to use for k-fold cross validation
63							# \$debug <-- run the program with debug print statements
64							# \$prefix <-- the number of letters to look at the beginning of each word
65							# \$suffix <-- the number of letters to look at the end of each word
66							# \$index <-- the index to start metamapping from in the set of files
67							# \$no_stopwords <-- exclude examining stop words [imported from the stop word list]
68							# output : $self <-- an instance of the Arffman object
69							sub new {
70							#grab class and parameters
71	0			0	0		my $self = {};
72	0						my $class = shift;
73	0	0					return undef if(ref $class);
74	0						my $params = shift;
75
76							#reset all arrays and hashes
77	0						@allBuckets = ();
78	0						%fileHash = ();
79	0						%metamapHash = ();
80	0						%tokenHash = ();
81	0						%conceptHash = ();
82	0						%posHash = ();
83	0						%semHash = ();
84	0						%cuiHash = ();
85	0						%orthoHash = ();
86	0						@features = ();
87
88							#bless this object
89	0						bless $self, $class;
90	0						$self->_init($params);
91	0						@allBuckets = (1..$bucketsNum);
92
93							#retrieve parameters for universal-routines
94	0						$uniParams{'debug'} = $debug;
95	0						$uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
96
97							#return the object
98	0						return $self;
99							}
100
101							# method to initialize the NanoB2B::NER::Arffman object.
102							# input : $parameters <- reference to a hash
103							# output:
104							sub _init {
105	0			0			my $self = shift;
106	0						my $params = shift;
107
108	0	0					$params = {} if(!defined $params);
109
110							# get some of the parameters
111	0						my $diroption = $params->{'directory'};
112	0						my $ftsoption = $params->{'features'};
113	0						my $bucketsNumoption = $params->{'bucketsNum'};
114	0						my $debugoption = $params->{'debug'};
115	0						my $prefixoption = $params->{'prefix'};
116	0						my $suffixoption = $params->{'suffix'};
117	0						my $indexoption = $params->{'index'};
118	0						my $stopwordoption = $params->{'stopwords'};
119	0						my $iscuioption = $params->{'is_cui'};
120	0						my $sparsematrixoption = $params->{'sparse_matrix'};
121	0						my $wcsoption = $params->{'wcs'};
122
123							#set the global variables
124	0	0					if(defined $debugoption){$debug = $debugoption;}
	0
125	0	0					if(defined $diroption){$program_dir = $diroption;}
	0
126	0	0					if(defined $indexoption){$fileIndex = $indexoption;}
	0
127	0	0					if(defined $stopwordoption){$stopwords_file = $stopwordoption;}
	0
128	0	0					if(defined $iscuioption){$is_cui = $iscuioption;}
	0
129	0	0					if(defined $sparsematrixoption){$sparse_matrix = $sparsematrixoption;}
	0
130	0	0					if(defined $prefixoption){$prefix = $prefixoption;}
	0
131	0	0					if(defined $suffixoption){$suffix = $suffixoption;}
	0
132	0	0					if(defined $wcsoption){$wcs = $wcsoption;}
	0
133	0	0					if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;}
	0
134	0	0					if(defined $ftsoption){@features = split(' ', $ftsoption);}
	0
135							}
136
137
138							####### ARFFMAN AND THE METHODS OF MADNESS #####
139
140
141							# opens a single file and runs it through the process of creating buckets
142							# extracting tokens and concepts, and creating arff files based on the features given
143							# input : $file <-- the name of the file to make into arff files
144							# output : a set of arff files
145							sub arff_file{
146	0			0	0		my $self = shift;
147	0						my $file = shift;
148
149							#define and reset temp var
150	0						my $indexer = 0;
151	0						%fileHash = ();
152	0						%metamapHash = ();
153	0						%tokenHash = ();
154	0						%conceptHash = ();
155	0						%posHash = ();
156	0						%semHash = ();
157	0						%cuiHash = ();
158
159							#get the name of the file
160	0						my @n = split '/', $file;
161	0						my $l = @n;
162	0						my $filename = $n[$l - 1];
163	0						$filename = lc($filename);
164
165	0						my $FILE;
166	0	0					open ($FILE, "$program_dir/$file") \|\| die ("what is this '$program_dir/$filename' you speak of?\n");
167	0						my @fileLines = <$FILE>;
168	0						my @orthoLines = @fileLines;
169							#my @orthoLines = ["Hi! I'm Milk", "I have a hamster named Scott", "I like pizza"];
170	0						foreach my $l(@fileLines){
171	0						$l = lc($l);
172							}
173	0						$uniSub->printColorDebug("on_red", "$filename");
174							#$uniSub->printColorDebug("on_cyan", "* $wcs *");
175
176							#get the total num of lines
177	0						my $totalLines = 0;
178	0						$totalLines = @fileLines;
179	0						$uniSub->printColorDebug("red", "Lines: $totalLines\n");
180
181							#clean it up for two separate sets
182	0						my @tagSet = retagSet($filename, \@fileLines);
183	0						my @cleanLines = untagSet($filename, \@fileLines);
184
185							#get the orthographic based lines
186							#my @orthoLines = ;
187	0						@orthoLines = retagSetOrtho(\@orthoLines);
188
189							#$uniSub->printColorDebug("red", "TAG SET: ");
190							#$uniSub->printArr(", ", \@tagSet);
191
192							####### ASSIGN THE VALUES TO HASHTABLES O KEEP TRACK OF THEM #######
193
194							#put all the lines in a file hash
195	0						$uniSub->printColorDebug("blue", "*Putting all the file lines into a hashtable....\n");
196	0						$indexer = 0;
197	0						foreach my $line (@tagSet){
198	0						$fileHash{$indexer} = $line;
199	0						$indexer++;
200							}
201
202							#put the orthographic lines in a hash
203	0						$indexer = 0;
204	0						foreach my $line (@orthoLines){
205							#$uniSub->printColorDebug("red", "$line\n");
206	0						$orthoHash{$indexer} = $line;
207	0						$indexer++;
208							}
209
210							#import the hashtables from saved data
211	0						importMetaData($filename);
212
213							#tokenize all the lines --> tokenhash
214	0						$uniSub->printColorDebug("blue", "*Tokenizing the lines into a hashtable....\n");
215	0						$indexer = 0;
216	0						my $totalTokens = 0;
217	0						my $totalConcepts = 0;
218	0						foreach my $line (@cleanLines){
219							#acquire the necessary variables
220	0						my $special_ID = "$indexer.ti.1";
221	0						my $meta = $metamapHash{$indexer};
222
223							#create citation first
224	0						$dataStructures->createFromTextWithId($meta, $special_ID);
225	0						my $citation = $dataStructures->getCitationWithId($special_ID);
226
227							#get tokens
228	0						my @tokensOut = $citation->getOrderedTokens();
229							#double array - extract the inner one
230	0						my @tokens = ();
231	0						foreach my $tt (@tokensOut){
232	0						my @newSet = @$tt;
233	0						push (@tokens, @newSet);
234							}
235	0						my $tnum = @tokens;
236	0						$totalTokens += $tnum;
237	0						push (@{$tokenHash{$indexer}}, @tokens);
	0
238
239							#get concepts
240	0						my @conceptsOut = $citation->getOrderedConcepts();
241							#double array - extract the inner one
242	0						my @concepts = ();
243	0						foreach my $cc (@conceptsOut){
244	0						my @newSet = @$cc;
245	0						push (@concepts, @newSet);
246							}
247	0						my $cnum = @concepts;
248	0						$totalConcepts += $cnum;
249	0						push (@{$conceptHash{$indexer}}, @concepts);
	0
250
251							#create the ordered POS lines
252	0						my @posOrder = orderedTokenPOS($cleanLines[$indexer], \@tokens);
253	0						my $posOrderLine = join " ", @posOrder;
254	0						$posHash{$indexer} = $posOrderLine;
255
256							#create the ordered semantic type lines
257	0						my @semantics = getConceptSets("sem", $line, \@concepts);
258	0						my $semanticsLine = join " ", @semantics;
259	0						$semHash{$indexer} = $semanticsLine;
260
261							#create the ordered cui lines
262	0						my @cuis = getConceptSets("cui", $line, \@concepts);
263	0						my $cuiLine = join " ", @cuis;
264	0						$cuiHash{$indexer} = $cuiLine;
265
266
267							#increment to the next set
268	0						$indexer++;
269							}
270	0						$uniSub->printColorDebug("red", "TOKENS: $totalTokens\n");
271	0						$uniSub->printColorDebug("red", "CONCEPTS: $totalConcepts\n");
272
273
274							####### BUCKET SORTING - TRAIN AND TEST DATA #######
275
276							#sort the lines to buckets
277	0						$uniSub->printColorDebug("blue", "*Making buckets....\n");
278	0						my %buckets = ();
279	0						%buckets = sort2Buckets($totalLines, $bucketsNum);
280
281	0						$uniSub->printColorDebug("blue", "*Making train and test files....\n");
282
283	0						zhu_li($filename, \%buckets);
284
285	0						$uniSub->printDebug("\n");
286
287							}
288
289
290
291							###################### LINE MANIPULATION #####################
292
293							###### RETAGS THE LINE ######
294
295							# turns the tagged entity words into special words with <> for the context words
296							# input : $input <-- the line to retag
297							# $id <-- the id within the tag to look for
298							# output : (.arff files)
299							sub retag{
300	0			0	0		my $input = shift;
301	0						my $id = shift;
302
303	0						$id = lc($id);
304	0						my $line = lc($input);
305
306							#get rid of any tags
307	0						my @words = split (" ", $line);
308	0						my @newSet = ();
309	0						my $charact = 0;
310	0						foreach my $word (@words){
311	0	0					if($charact){
312	0	0					if($word eq ""){
313	0						$charact = 0;
314							}else{
315	0						my $charWord = "$word"."$entId";
316	0						push @newSet, $charWord;
317							}
318							}else{
319	0	0					if($word eq ""){
320	0						$charact = 1;
321							}else{
322	0						push @newSet, $word;
323							}
324							}
325							}
326
327							#clean up the new line
328	0						my $new_line = join " ", @newSet;
329	0						$new_line =~s/\b$entId\b//og;
330	0						$new_line = $uniSub->cleanWords($new_line);
331	0						return $new_line;
332							}
333
334							# turns the tagged entity words in the entire file into special words with <> for the context words
335							# input : $name <-- the name of the file to use as the id tag
336							# @lines <-- the set of lines to retag
337							# output : @tagSet <-- set of retagged lines
338							sub retagSet{
339	0			0	0		my $name = shift;
340	0						my $lines_ref = shift;
341	0						my @lines = @$lines_ref;
342
343	0						my @tagSet = ();
344	0						foreach my $line (@lines){
345							#retag the line
346	0						chomp($line);
347	0						my $tag_line = retag($line, $name);
348
349							#add it to the set
350	0						push @tagSet, $tag_line;
351							}
352	0						return @tagSet;
353							}
354
355							#returns clean line with no tags or retaggings
356							# input : $line <-- the line to untag
357							# : $id <-- the id label to look for
358							# output : $input <-- untagged input line
359							sub untag{
360	0			0	0		my $line = shift;
361	0						my $id = shift;
362
363	0						my $input = lc($line);
364	0						$id = lc($id);
365	0						$input =~ s/ //og;
366	0						$input =~ s/ //og;
367	0						$input = $uniSub->cleanWords($input);
368	0						return $input;
369							}
370							#returns a clean set of lines
371							# input : $filename <-- the name of the file for use in the id tag
372							# : @lines <-- the set of lines to untag
373							# output : @clean_set <-- untagged set of lines
374							sub untagSet{
375	0			0	0		my $filename = shift;
376	0						my $lines_ref = shift;
377	0						my @lines = @$lines_ref;
378
379	0						my @clean_set = ();
380	0						foreach my $line(@lines){
381	0						my $cl = untag($line, $filename);
382	0						push @clean_set, $cl;
383							}
384	0						return @clean_set;
385							}
386
387
388							#import metamap hashtable data
389							# input : $name <-- the name of the file to import from
390							# output : (hashmap of metamap lines)
391							sub importMetaData{
392	0			0	0		my $name = shift;
393
394							#create a directory to save hashtable data
395	0						my $META;
396	0						my $subdir = "_METAMAPS";
397	0	0					open($META, "<", ("$program_dir/$subdir/" . $name . "_meta")) \|\| die ("HAHA No such thing!");
398
399
400							#import metamap data from the file
401	0						my @metaLines = <$META>;
402	0						my $metaCombo = join("", @metaLines);
403	0						my @newMetaLines = split("\n\n", $metaCombo);
404	0						my $t = @newMetaLines;
405	0						$uniSub->printColorDebug("red", "META LINES: $t\n");
406	0						my $key = 0;
407	0						foreach my $mm (@newMetaLines){
408	0						$metamapHash{$key} = $mm;
409	0						$key++;
410							}
411	0						close $META;
412							}
413
414							##### FOR THE ORTHO SET #####
415
416							#turns the tagged entity words into special words with <> for the context words
417							# input : $input <-- the line to retag
418							# output : $new_line <-- the retagged line
419							sub retagOrtho{
420	0			0	0		my $input = shift;
421	0						my $line = $input;
422
423							#get rid of any tags
424	0						my @words = split (" ", $line);
425	0						my @newSet = ();
426	0						my $charact = 0;
427	0						foreach my $word (@words){
428	0	0					if($charact){
429	0	0					if($word =~//){
430	0						$charact = 0;
431							}else{
432	0						my $charWord = "$word"."$entId";
433	0						push @newSet, $charWord;
434							}
435							}else{
436	0	0					if($word =~//g){
437	0						$charact = 1;
438							}else{
439	0						push @newSet, $word;
440							}
441							}
442							}
443
444							#clean up the new line
445	0						my $new_line = join " ", @newSet;
446	0						$new_line =~s/\b$entId\b//g;
447	0						$new_line = noASCIIOrtho($new_line);
448	0						return $new_line;
449							}
450							#turns the tagged entity words in the entire file into special words with <> for the context words
451							# input : @lines <-- the set of lines to retag
452							# output : @tagSet <-- the retagged line
453							sub retagSetOrtho{
454	0			0	0		my $lines_ref = shift;
455	0						my @lines = @$lines_ref;
456
457	0						my @tagSet = ();
458	0						foreach my $line (@lines){
459							#retag the line
460	0						chomp($line);
461	0						my $tag_line = retagOrtho($line);
462
463							#add it to the set
464	0						push @tagSet, $tag_line;
465							}
466	0						return @tagSet;
467							}
468
469							#cleans the line without getting rid of tags
470							# input : $line <-- line to clean up
471							# output : $new_in <-- the cleaned line
472							sub noASCIIOrtho{
473	0			0	0		my $line = shift;
474
475	0						my $new_in = $line;
476	0						$new_in =~ s/[^[:ascii:]]//g;
477	0						return $new_in
478							}
479
480
481							####################### TOKENS AND CONCEPT MANIPULATION #######################
482
483
484							#gets rid of any special tokens
485							# input : $text <-- the token text to fix
486							# output : $tokenText <-- a cleaned up token
487							sub cleanToken{
488	0			0	0		my $text = shift;
489
490	0						my $tokenText = $text;
491
492							#fix "# . #" tokens
493	0	0					if($tokenText =~ /\d+\s\.\s\d+/o){
494	0						$tokenText =~s/\s\.\s/\./og;
495							}
496
497							#fix "__ \' __" tokens
498	0	0					if($tokenText =~ /\w+\s\\\'\s\w+/o){
499	0						$tokenText =~s/\s\\\'\s//og;
500							}
501
502	0	0					if($tokenText =~ /[^a-zA-Z0-9]/o){
503	0						$tokenText = "";
504							}
505
506	0						return $tokenText;
507							}
508
509							#grabs the part-of-speech part of the token that's matched up with the bucket tokens
510							# input : @buktTokens <-- the set of tokens from the specific bucket[s]
511							# output : @posTokens <-- the part-of-speech tokens for the bucket
512							sub getTokensPOS{
513	0			0	0		my $bucketTokens_ref = shift;
514	0						my @buktTokens = @$bucketTokens_ref;
515
516							#finds the part of speech tokens
517	0						my @posTokens = ();
518	0						foreach my $token (@buktTokens){
519	0						my $pos = $token->{posTag};
520	0						push(@posTokens, $pos);
521							}
522	0						return @posTokens;
523							}
524
525							#gets the positions of the POS words from a line
526							# input : $cleanLine <-- the line to pinpoint the POS tokens to
527							# : @tokens <-- the set of tokens to use are reference
528							# output : @orderPOS <-- the part-of-speech tokens for the bucket
529							sub orderedTokenPOS{
530	0			0	0		my $cleanLine = shift;
531	0						my $tokens_ref = shift;
532	0						my @tokens = @$tokens_ref;
533
534							#$uniSub->printColorDebug("on_red", "TAG : $tagLine\n");
535	0						my @lineWords = split " ", $cleanLine;
536	0						my @orderPOS = ();
537
538							#make the connection between the word and the pos
539	0						my %text2Pos = ();
540	0						my @txtTokens = ();
541	0						foreach my $token (@tokens){
542	0						my $txt = $token->{text};
543	0						$txt = cleanToken($txt);
544							#$uniSub->printColorDebug("on_green", $txt);
545	0						push @txtTokens, $txt;
546	0						my $pos = $token->{posTag};
547	0						$text2Pos{$txt} = $pos;
548							}
549
550							#associate each tagged word with it
551	0						foreach my $word (@lineWords){
552	0	0					if($uniSub->inArr($word, \@txtTokens)){
553	0						my $newPos = $text2Pos{$word};
554	0						push @orderPOS, $newPos;
555							}else{
556	0						push @orderPOS, "undef";
557							}
558							}
559
560	0						return @orderPOS;
561							}
562
563							#gets the tagged parts of the concepts
564							# input : $type <-- what kind of concepts you want to extract [e.g. "sem", "cui"]
565							# : $line <-- the line to pinpoint the concepts to
566							# : @concepts <-- the total set of concepts to use
567							# output : @conceptSet <-- the set of concepts used within the line
568							sub getConceptSets{
569	0			0	0		my $type = shift;
570	0						my $line = shift;
571	0						my $concepts_ref = shift;
572	0						my @concepts = @$concepts_ref;
573
574	0						$line = lc($line);
575
576							#assign each concept by their text name
577	0						my @conceptsTxt = ();
578	0						foreach my $concept (@concepts){
579	0						my $ohboi = @$concept[0];
580	0						my $name = lc($ohboi->{text});
581	0						push (@conceptsTxt, $name);
582							}
583
584							#make a clean set of text words
585	0						my @txtIn = split / /, $line;
586	0						my @clean_txt = ();
587	0						foreach my $word (@txtIn){
588	0						push @clean_txt, $word;
589							}
590
591	0						my $totCon = @conceptsTxt;
592							#get the set needed
593	0						my @conceptSet = ();
594	0						for(my $f = 0; $f < $totCon; $f++){
595	0						my @concept = @{$concepts[$f]};
	0
596	0						my $txtCon = $conceptsTxt[$f];
597
598
599	0						foreach my $cc (@concept){
600
601							#get the right items
602	0						my @items = ();
603	0	0					if($type eq "sem"){
		0
		0
604	0						my $s = $cc->{semanticTypes};
605	0						@items = split /,/, $s;
606							}elsif($type eq "cui"){
607	0						my $c = $cc->{cui};
608	0						@items = split /,/, $c;
609							}elsif($type eq "text"){
610	0						my $t = $cc->{text};
611	0						@items = split /,/, $t;
612							}
613
614							#add to the concept set
615	0						push @conceptSet, @items;
616							}
617							}
618	0						return @conceptSet;
619							}
620
621
622							#retrieves the feature for a single word
623							# input : $word <-- the word to extract the features from
624							# : $type <-- what type of feature to extract [e.g. "pos", "sem", "cui"]
625							# output : if "pos" <-- a scalar part-of-speech value
626							# : else <-- an array of semantic or cui values (a single text value can have more than one of these)
627							sub getFeature{
628	0			0	0		my $word = shift;
629	0						my $type = shift;
630
631							#if retrieving pos tag
632	0	0	0				if($type eq "pos"){
		0
633							#get the token and it's pos tag
634	0						foreach my $key (sort keys %tokenHash){
635	0						foreach my $token(@{$tokenHash{$key}}){
	0
636	0						my $tokenTxt = $token->{text};
637	0	0					if($tokenTxt eq $word){
638	0						return $token->{posTag};
639							}
640							}
641							}
642	0						return "";
643							}elsif($type eq "sem" or $type eq "cui"){
644							#get the concept and it's cui or sem tag
645	0						foreach my $key (sort keys %conceptHash){
646	0						foreach my $concept (@{$conceptHash{$key}}){
	0
647	0						my $ohboi = @$concept[0];
648	0						my $name = lc($ohboi->{text});
649	0	0					if($name eq $word){
650	0	0					if($type eq "sem"){
		0
651	0						my @semArr = ();
652	0						foreach my $cc (@$concept){push(@semArr, $cc->{semanticTypes});}
	0
653	0						return @semArr;
654							}elsif($type eq "cui"){
655	0						my @cuiArr = ();
656	0						foreach my $cc (@$concept){push(@cuiArr, $cc->{cui});}
	0
657	0						return @cuiArr;
658							}
659							}
660							}
661							}
662	0						return "";
663							}
664	0						return "";
665							}
666
667							###################### BUCKETS - TRAIN AND TEST ARFF FILES #####################
668
669
670							#sorts the keys from the hashmaps into buckets so that certain values can be accessed
671							# input : $keyAmt <-- the number of lines or "keys" to divvy up into the buckets
672							# : $bucketNum <-- how many buckets to use
673							# output : %bucketList <-- the set of buckets with keys in them
674							sub sort2Buckets{
675	0			0	0		my $keyAmt = shift;
676	0						my $bucketNum = shift;
677
678							#create sets
679	0						my @keySet = (0..$keyAmt - 1); #set of keys
680	0						my %bucketList = (); #all of the buckets
681
682							#add some buckets to the bucket list
683	0						for(my $a = 1; $a <= $bucketNum; $a++){
684	0						$bucketList{$a} = [];
685							}
686
687							#sort the lines into buckets
688	0						my $bucketId = 1;
689	0						foreach my $key (@keySet){
690	0						push (@{$bucketList{$bucketId}}, $key); #add the line to the bucket
	0
691
692							#reset the id if at the max value
693	0	0					if($bucketId == $bucketNum){
694	0						$bucketId = 1;
695							}else{
696	0						$bucketId++;
697							}
698							}
699
700							#return the list of buckets
701	0						return %bucketList;
702							}
703
704							###################### ARFF STUFF #####################
705							#makes arff files for ortho, morpho, text, pos, cui, and sem attributes
706
707							#zhu li!! Do the thing!!
708							# input : $name <-- the name of the file
709							# : %bucketList <-- the set of buckets with keys in them
710							# output : (n arff files; n = # of buckets x (train and test) x # of features being used)
711							sub zhu_li{
712	0			0	0		my $name = shift;
713	0						my $bucketList_ref = shift;
714	0						my %buckets = %$bucketList_ref;
715
716							#grab the attributes
717	0						my %attrSets = ();
718	0						$uniSub->printColorDebug("bold green", "Retrieving attributes...\n");
719	0						foreach my $item(@features){
720	0						$uniSub->printColorDebug("bright_green", "\t$item attr\n");
721	0						my %setOfAttr = grabAttr($name, $item, \%buckets);
722	0						$attrSets{$item} = \%setOfAttr; #gets both the vector and arff based attributes
723							}
724
725	0	0					if(defined $stopwords_file){
726	0						$stopRegex = stop($stopwords_file);
727							}
728
729							#let's make some vectors!
730	0						$uniSub->printColorDebug("bold yellow", "Making Vectors...\n-------------------\n");
731	0						my @curFeatSet = ();
732	0						my $abbrev = "";
733
734							#run based on wcs
735	0						my $wcs_bucket;
736							my $wcs_feature;
737	0						my $wcs_found = 0;
738	0	0					if($wcs){
739	0						my @wcs_parts = split("-", $wcs);
740	0						$wcs_feature = $wcs_parts[1];
741	0						$wcs_bucket = $wcs_parts[0];
742							}
743
744
745							#iteratively add on the features [e.g. o, om, omt, omtp, omtpc, omtpcs]
746	0						foreach my $feature (@features){
747	0						$uniSub->printColorDebug("yellow", " $feature \n");
748	0						push(@curFeatSet, $feature);
749	0						$abbrev .= substr($feature, 0, 1); #add to abbreviations for the name
750
751							#$uniSub->printColorDebug("on_red", "$wcs - $wcs_found - $abbrev vs. $wcs_feature");
752	0	0	0				if(($wcs) && (!$wcs_found) && ($abbrev ne $wcs_feature)){
			0
753	0						print("SKIP \n");
754	0						next;
755							}
756
757							#go through each bucket
758	0						foreach my $bucket (sort keys %buckets){
759	0	0	0				if(($wcs) && (!$wcs_found) && ($bucket != $wcs_bucket)){
			0
760	0						print("\tSKIP\n");
761	0						next;
762							}else{
763	0						$wcs_found = 1;
764							}
765
766	0						my @range = $uniSub->bully($bucketsNum, $bucket);
767
768	0						$uniSub->printColorDebug("on_green", "BUCKET #$bucket");
769							#retrieve the vector attributes to use
770	0						my %vecAttrSet = ();
771	0						foreach my $curItem(@curFeatSet){
772	0	0					if($curItem eq "ortho"){
773	0						$vecAttrSet{$curItem} = ();
774							}else{
775							#get outer layer (tpcs)
776	0						my $a_ref = $attrSets{$curItem};
777	0						my %a = %$a_ref;
778
779							#get inner layer (vector)
780	0						my $b_ref = $a{vector};
781	0						my %b = %$b_ref;
782
783							#foreach my $key (sort keys %b){print "$key\n";}
784
785							#finally get the bucket layer (1..$bucketNum) based on range
786	0						my $c_ref = $b{$bucket};
787	0						my @c = @$c_ref;
788	0						$vecAttrSet{$curItem} = \@c;
789							}
790							}
791
792							### TRAIN ###
793	0						$uniSub->printColorDebug("bold blue", "\ttraining...\n");
794							#retrieve the lines to use
795	0						my @lineSetTrain = ();
796	0						my @bucketSetTrain = ();
797	0						foreach my $num (@range){push(@bucketSetTrain, @{$buckets{$num}});}
	0
	0
798	0						foreach my $key (@bucketSetTrain){push(@lineSetTrain, $orthoHash{$key});}
	0
799
800							#make the vector
801	0						my @vectorSetTrain = vectorMaker(\@lineSetTrain, \@curFeatSet, \%vecAttrSet);
802	0						$uniSub->printDebug("\n");
803
804							### TEST ###
805	0						$uniSub->printColorDebug("bold magenta", "\ttesting...\n");
806							#retrieve the lines to use
807	0						my @lineSetTest = ();
808	0						my @bucketSetTest = ();
809	0						push(@bucketSetTest, @{$buckets{$bucket}});
	0
810	0						foreach my $key (@bucketSetTest){push(@lineSetTest, $orthoHash{$key});}
	0
811
812							#make the vector
813	0						my @vectorSetTest = vectorMaker(\@lineSetTest, \@curFeatSet, \%vecAttrSet);
814	0						$uniSub->printDebug("\n");
815
816							### ARFF ###
817							#retrieve the arff attributes to use
818	0						my @arffAttrSet = ();
819	0						foreach my $curItem(@curFeatSet){
820	0	0					if($curItem eq "ortho"){
821							#get outer layer (ortho)
822	0						my $a_ref = $attrSets{$curItem};
823	0						my %a = %$a_ref;
824							#get the values from ortho
825	0						push(@arffAttrSet, @{$a{arff}});
	0
826							}else{
827							#get outer layer (mtpcs)
828	0						my $a_ref = $attrSets{$curItem};
829	0						my %a = %$a_ref;
830
831							#get inner layer (arff)
832	0						my $b_ref = $a{arff};
833	0						my %b = %$b_ref;
834
835							#finally get the bucket layer (1..$bucketNum) based on range
836	0						my $c_ref = $b{$bucket};
837	0						my @c = @$c_ref;
838	0						push(@arffAttrSet, @c);
839							}
840							}
841
842
843	0						$uniSub->printColorDebug("bright_yellow", "\tmaking arff files...\n");
844	0						$uniSub->printColorDebug("bright_red", "\t\tARFF TRAIN\n");
845	0						createARFF($name, $bucket, $abbrev, "train", \@arffAttrSet, \@vectorSetTrain);
846	0						$uniSub->printColorDebug("bright_red", "\t\tARFF TEST\n");
847	0						createARFF($name, $bucket, $abbrev, "test", \@arffAttrSet, \@vectorSetTest);
848							}
849							}
850
851							}
852
853							#create the arff file
854							# input : $name <-- the name of the file
855							# : $bucket <-- the index of the bucket you're testing [e.g. bucket #1]
856							# : $abbrev <-- the abbreviation label for the set of features
857							# : $type <-- train or test ARFF?
858							# : @attrARFFSet <-- the set of attributes exclusively for printing to the arff file
859							# : @vecSec <-- the set of vectors created
860							# output : (an arff file)
861							sub createARFF{
862	0			0	0		my $name = shift;
863	0						my $bucket = shift;
864	0						my $abbrev = shift;
865	0						my $type = shift;
866	0						my $attr_ref = shift;
867	0						my $vec_ref = shift;
868
869	0						my $typeDir = "_$type";
870	0						my $ARFF;
871							#print to files
872	0						$uniSub->printColorDebug("bold cyan", "\t\tcreating $name/$abbrev - BUCKET #$bucket $type ARFF...\n");
873	0	0					if($program_dir ne ""){
874	0						my $subdir = "_ARFF";
875	0						my $arffdir = $name . "_ARFF";
876	0						my $featdir = "_$abbrev";
877	0						make_path("$program_dir/$subdir/$arffdir/$featdir/$typeDir");
878	0	0					open($ARFF, ">", ("$program_dir/$subdir/$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) \|\| die ("OMG?!?!");
879							}else{
880	0						my $arffdir = $name . "_ARFF";
881	0						my $featdir = "_$abbrev";
882	0						make_path("$arffdir/$featdir/$typeDir");
883	0	0					open($ARFF, ">", ("$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) \|\| die ("What?!?!");
884							}
885
886							#get the attr and vector set
887	0						my @attrARFFSet = @$attr_ref;
888	0						my @vecSet = @$vec_ref;
889
890							#get format for the file
891	0						my $relation = "\@RELATION $name";
892	0						my @printAttr = makeAttrData(\@attrARFFSet);
893	0						my $entity = "\@ATTRIBUTE Entity {Yes, No}"; #set if the entity word or not
894	0						my $data = "\@DATA";
895
896							#print everything to the file
897	0						$uniSub->printDebug("\t\tprinting to file...\n");
898	0						$uniSub->print2File($ARFF, $relation);
899	0						foreach my $a(@printAttr){$uniSub->print2File($ARFF, $a);}
	0
900	0						$uniSub->print2File($ARFF, $entity);
901	0						$uniSub->print2File($ARFF, $data);
902	0						foreach my $d(@vecSet){$uniSub->print2File($ARFF, $d);}
	0
903	0						close $ARFF;
904							}
905
906							###################### VECTOR THINGIES #####################
907
908
909							#makes vectors from a set
910							# input : @txtLineSet <-- the retagged text lines to make vectors out of
911							# : @featureList <-- the list of features to make the vectors out of [e.g. (ortho, morph, text)]
912							# : @attrs <-- the attributes to use to make the vectors
913							# output : @setVectors <-- the vectors for each word in all of the lines
914							sub vectorMaker{
915	0			0	0		my $set_ref = shift;
916	0						my $feat_ref = shift;
917	0						my $attrib_ref = shift;
918	0						my @txtLineSet = @$set_ref;
919	0						my @featureList = @$feat_ref;
920	0						my %attrs = %$attrib_ref;
921
922	0						my @setVectors = ();
923							#go through each line of the set
924	0						my $setLen = @txtLineSet;
925
926	0						for(my $l = 0; $l < $setLen; $l++){
927	0						my $line = $txtLineSet[$l];
928	0						my @words = split(' ', $line);
929							#$uniSub->printArr(", ", \@words);
930							#print "\n";
931	0						my $wordLen = @words;
932							#go through each word
933	0						for(my $a = 0; $a < $wordLen; $a++){
934
935	0						$\| = 1;
936
937	0						my $wordOrig = $words[$a];
938							#make the words for comparison
939	0						my $word = $words[$a];
940	0						my $prevWord = "";
941	0						my $nextWord = "";
942
943							#show progress
944	0						my $l2 = $l + 1;
945	0						my $a2 = $a + 1;
946	0						$uniSub->printDebug("\r" . "\t\tLine - $l2/$setLen ------ Word - $a2/$wordLen ---- ");
947
948	0						my $smlword = substr($word, 0, 8);
949	0	0					if(length($word) > 8){
950	0						$smlword .= "...";
951							}
952
953	0	0					if($word =~/$entId/o){
954	0						$uniSub->printColorDebug("red", "$smlword! ");
955							}else{
956	0						$uniSub->printDebug("$smlword! ")
957							}
958
959	0						my @word_cuis = getFeature($word, "cui");
960	0						my $ncui = $word_cuis[0];
961							#$uniSub->printColorDebug("red", "\n\t\t$word - $ncui\n");
962
963							#check if it's a stopword
964	0	0	0				if(($stopwords_file and $word=~/$stopRegex/o) \|\| ($is_cui and $word_cuis[0] eq "") \|\| ($word eq "." \|\| $word eq ",")){
			0
			0
			0
			0
965							#$uniSub->printColorDebug("on_red", "\t\tSKIP!");
966	0						next;
967							}
968
969	0	0					if($a > 0){$prevWord = $words[$a - 1];}
	0
970	0	0					if($a < ($wordLen - 1)){$nextWord = $words[$a + 1];}
	0
971
972
973
974							#get rid of tag if necessary
975	0						$prevWord =~s/$entId//og;
976	0						$nextWord =~s/$entId//og;
977	0						$word =~s/$entId//og;
978
979	0						my $vec = "";
980							#use each set of attributes
981	0						foreach my $item(@featureList){
982	0						my $addVec = "";
983	0	0					if($item eq "ortho"){$addVec = orthoVec($word);}
	0	0
		0
		0
		0
		0
984	0						elsif($item eq "morph"){$addVec = morphVec($word, \@{$attrs{"morph"}});}
	0
985	0						elsif($item eq "text"){$addVec = textVec($word, $prevWord, $nextWord, \@{$attrs{"text"}});}
	0
986	0						elsif($item eq "pos"){$addVec = posVec($word, $prevWord, $nextWord, \@{$attrs{"pos"}});}
	0
987	0						elsif($item eq "cui"){$addVec = cuiVec($word, $prevWord, $nextWord, \@{$attrs{"cui"}});}
	0
988	0						elsif($item eq "sem"){$addVec = semVec($word, $prevWord, $nextWord, \@{$attrs{"sem"}});}
	0
989
990
991	0						$vec .= $addVec;
992
993							}
994
995							#convert binary to sparse if specified
996	0	0					if($sparse_matrix){
997	0						$vec = convert2Sparse($vec);
998							#$uniSub->printColorDebug("red", "$vec\n");
999							}
1000
1001							#check if the word is an entity or not
1002							#$uniSub->printColorDebug("red", "\n$wordOrig\n");
1003	0	0					$vec .= (($wordOrig =~/\b[\S]+(_e)\b/) ? "Yes " : "No ");
1004
1005							#close it if using sparse matrix
1006	0	0					if($sparse_matrix){
1007	0						$vec .= "}";
1008							}
1009
1010							#finally add the word back and add the entire vector to the set
1011	0						$vec .= "\%$word";
1012
1013	0	0					if($word ne ""){
1014	0						push(@setVectors, $vec);
1015							}
1016							}
1017							}
1018
1019	0						return @setVectors;
1020							}
1021
1022							#makes the orthographic based part of the vector
1023							# input : $word <-- the word to analyze
1024							# output : $strVec <-- the orthographic vector string
1025							sub orthoVec{
1026	0			0	0		my $word = shift;
1027
1028							## CHECKS ##
1029	0						my $strVec = "";
1030	0						my $addon = "";
1031
1032							#check if first letter capital
1033	0	0					$addon = ($word =~ /\b([A-Z])\w+\b/og ? 1 : 0);
1034	0						$strVec .= "$addon, ";
1035
1036							#check if a single letter word
1037	0	0					$addon = (length($word) == 1 ? 1 : 0);
1038	0						$strVec .= "$addon, ";
1039
1040							#check if all capital letters
1041	0	0					$addon = ($word =~ /\b[A-Z]+\b/og ? 1 : 0);
1042	0						$strVec .= "$addon, ";
1043
1044							#check if contains a digit
1045	0	0					$addon = ($word =~ /[0-9]+/og ? 1 : 0);
1046	0						$strVec .= "$addon, ";
1047
1048							#check if all digits
1049	0	0					$addon = ($word =~ /\b[0-9]+\b/og ? 1 : 0);
1050	0						$strVec .= "$addon, ";
1051
1052							#check if contains a hyphen
1053	0	0					$addon = ($word =~ /-/og ? 1 : 0);
1054	0						$strVec .= "$addon, ";
1055
1056							#check if contains punctuation
1057	0	0					$addon = ($word =~ /[^a-zA-Z0-9\s]/og ? 1 : 0);
1058	0						$strVec .= "$addon, ";
1059
1060	0						return $strVec;
1061							}
1062
1063							#makes the morphological based part of the vector
1064							# input : $word <-- the word to analyze
1065							# : @attrs <-- the set of morphological attributes to use
1066							# output : $strVec <-- the morphological vector string
1067							sub morphVec{
1068	0			0	0		my $word = shift;
1069	0						my $attrs_ref = shift;
1070	0						my @attrs = @$attrs_ref;
1071
1072	0						my $strVec = "";
1073
1074	0						my $preWord = substr($word, 0, $prefix);
1075	0						my $sufWord = substr($word, -$suffix);
1076
1077	0						foreach my $a (@attrs){
1078	0	0					if($a eq $preWord){
		0
1079	0						$strVec .= "1, ";
1080							}elsif($a eq $sufWord){
1081	0						$strVec .= "1, ";
1082							}else{
1083	0						$strVec .= "0, ";
1084							}
1085							}
1086
1087	0						return $strVec;
1088
1089							}
1090
1091							#makes the text based part of the vector
1092							# input : $w <-- the word to analyze
1093							# : $pw <-- the previous word
1094							# : $nw <-- the next word
1095							# : @attrbts <-- the set of text attributes to use
1096							# output : $strVec <-- the text vector string
1097							sub textVec{
1098	0			0	0		my $w = shift;
1099	0						my $pw = shift;
1100	0						my $nw = shift;
1101	0						my $at_ref = shift;
1102	0						my @attrbts = @$at_ref;
1103
1104	0						my $strVec = "";
1105
1106							#clean the words
1107	0						$w = $uniSub->cleanWords($w);
1108	0						$pw = $uniSub->cleanWords($pw);
1109	0						$nw = $uniSub->cleanWords($nw);
1110
1111							#check if the word is the attribute or the words adjacent it are the attribute
1112	0						foreach my $a(@attrbts){
1113
1114	0						my $pair = "";
1115	0	0					$pair .= ($w eq $a ? "1, " : "0, ");
1116	0	0	0				$pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, ");
1117	0						$strVec .= $pair;
1118							}
1119
1120	0						return $strVec;
1121							}
1122
1123							#makes the part of speech based part of the vector
1124							# input : $w <-- the word to analyze
1125							# : $pw <-- the previous word
1126							# : $nw <-- the next word
1127							# : @attrbts <-- the set of pos attributes to use
1128							# output : $strVec <-- the pos vector string
1129							sub posVec{
1130	0			0	0		my $w = shift;
1131	0						my $pw = shift;
1132	0						my $nw = shift;
1133	0						my $at_ref = shift;
1134	0						my @attrbts = @$at_ref;
1135
1136							#clean the words
1137	0						$w = $uniSub->cleanWords($w);
1138	0						$pw = $uniSub->cleanWords($pw);
1139	0						$nw = $uniSub->cleanWords($nw);
1140
1141							#alter the words to make them pos types
1142	0						$w = getFeature($w, "pos");
1143	0						$pw = getFeature($pw, "pos");
1144	0						$nw = getFeature($nw, "pos");
1145
1146	0						my $strVec = "";
1147
1148							#check if the word is the attribute or the words adjacent it are the attribute
1149	0						foreach my $a(@attrbts){
1150	0						my $pair = "";
1151	0	0					$pair .= ($w eq $a ? "1, " : "0, ");
1152	0	0	0				$pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, ");
1153	0						$strVec .= $pair;
1154							}
1155
1156	0						return $strVec;
1157							}
1158
1159							#makes the cui based part of the vector
1160							# input : $w <-- the word to analyze
1161							# : $pw <-- the previous word
1162							# : $nw <-- the next word
1163							# : @attrbts <-- the set of cui attributes to use
1164							# output : $strVec <-- the cui vector string
1165							sub cuiVec{
1166	0			0	0		my $w = shift;
1167	0						my $pw = shift;
1168	0						my $nw = shift;
1169	0						my $at_ref = shift;
1170	0						my @attrbts = @$at_ref;
1171
1172							#clean the words
1173	0						$w = $uniSub->cleanWords($w);
1174	0						$pw = $uniSub->cleanWords($pw);
1175	0						$nw = $uniSub->cleanWords($nw);
1176
1177							#alter the words to make them cui types
1178	0						my @wArr = getFeature($w, "cui");
1179	0						my @pwArr = getFeature($pw, "cui");
1180	0						my @nwArr = getFeature($nw, "cui");
1181
1182	0						my $strVec = "";
1183							#check if the word is the attribute or the words adjacent it are the attribute
1184	0						foreach my $a(@attrbts){
1185	0						my $pair = "";
1186	0	0					$pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, ");
1187	0	0	0				$pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, ");
1188	0						$strVec .= $pair;
1189							}
1190
1191	0						return $strVec;
1192							}
1193
1194							#makes the semantic based part of the vector
1195							# input : $w <-- the word to analyze
1196							# : $pw <-- the previous word
1197							# : $nw <-- the next word
1198							# : @attrbts <-- the set of sem attributes to use
1199							# output : $strVec <-- the sem vector string
1200							sub semVec{
1201	0			0	0		my $w = shift;
1202	0						my $pw = shift;
1203	0						my $nw = shift;
1204	0						my $at_ref = shift;
1205	0						my @attrbts = @$at_ref;
1206
1207							#clean the words
1208	0						$w = $uniSub->cleanWords($w);
1209	0						$pw = $uniSub->cleanWords($pw);
1210	0						$nw = $uniSub->cleanWords($nw);
1211
1212							#alter the words to make them sem types
1213	0						my @wArr = getFeature($w, "sem");
1214	0						my @pwArr = getFeature($pw, "sem");
1215	0						my @nwArr = getFeature($nw, "sem");
1216
1217	0						my $strVec = "";
1218
1219							#check if the word is the attribute or the words adjacent it are the attribute
1220	0						foreach my $a(@attrbts){
1221							#remove "sem" label
1222	0						$a = lc($a);
1223
1224	0						my $pair = "";
1225	0	0					$pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, ");
1226	0	0	0				$pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, ");
1227	0						$strVec .= $pair;
1228							}
1229	0						return $strVec;
1230							}
1231
1232							#converts a binary vector to a sparse vector
1233							sub convert2Sparse{
1234	0			0	0		my $bin_vec = shift;
1235	0						my @vals = split(", ", $bin_vec);
1236	0						my $numVals = @vals;
1237
1238	0						my $sparse_vec = "{";
1239	0						for(my $c=0;$c<$numVals;$c++){
1240	0						my $curVal = $vals[$c];
1241
1242	0	0					if(($curVal eq "1")){
1243	0						$sparse_vec .= "$c $curVal, ";
1244							#$uniSub->printColorDebug("red", "$c $curVal, ");
1245							}
1246							}
1247	0						$sparse_vec .= "$numVals, ";
1248
1249	0						return $sparse_vec;
1250							}
1251
1252
1253							###################### ATTRIBUTE BASED METHODS #####################
1254
1255							#gets the attributes based on the item
1256							# input : $feature <-- the feature type [e.g. ortho, morph, text]
1257							# : %buckets <-- the bucket key set
1258							# output : %vecARFFattr <-- the vector set of attributes and arff set of attributes
1259							sub grabAttr{
1260	0			0	0		my $name = shift;
1261	0						my $feature = shift;
1262	0						my $buckets_ref = shift;
1263	0						my %buckets = %$buckets_ref;
1264
1265	0						my %vecARFFattr = ();
1266	0	0					if($feature eq "ortho"){
		0
1267	0						my @vecSet = ();
1268	0						my @arffSet = ("first_letter_capital",
1269							"single_character",
1270							"all_capital",
1271							"has_digit",
1272							"all_digit",
1273							"has_hyphen",
1274							"has_punctuation");
1275	0						$vecARFFattr{vector} = \@vecSet;
1276	0						$vecARFFattr{arff} = \@arffSet;
1277	0						return %vecARFFattr;
1278							}elsif($feature eq "morph"){
1279	0						my %bucketAttr = ();
1280	0						my %bucketAttrARFF = ();
1281
1282							#get the attributes for each bucket
1283	0						foreach my $testBucket (@allBuckets){
1284	0						my @range = $uniSub->bully($bucketsNum, $testBucket);
1285	0						$uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature MORPHO attributes...\n");
1286
1287							#get attributes [ unique and deluxe ]
1288	0						my @attr = getMorphoAttributes(\@range, \%buckets);
1289	0						@attr = uniq(@attr); #make unique forms
1290	0						$bucketAttr{$testBucket} = \@attr;
1291
1292	0						my @attrARFF = @attr;
1293	0						foreach my $a(@attrARFF){$a .= $morphID;}
	0
1294	0						$bucketAttrARFF{$testBucket} = \@attrARFF;
1295							}
1296
1297							#add to overall
1298	0						$vecARFFattr{vector} = \%bucketAttr;
1299	0						$vecARFFattr{arff} = \%bucketAttrARFF;
1300
1301	0						return %vecARFFattr;
1302							}else{
1303	0						my %bucketAttr = ();
1304	0						my %bucketAttrARFF = ();
1305
1306							#get the attributes for each bucket
1307	0						foreach my $testBucket (@allBuckets){
1308	0						my @range = $uniSub->bully($bucketsNum, $testBucket);
1309	0						$uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature attributes...\n");
1310
1311							#get attributes [ unique and deluxe ]
1312	0						my @attr = getRangeAttributes($feature, \@range, \%buckets);
1313	0						@attr = uniq(@attr); #make unique forms
1314	0						$bucketAttr{$testBucket} = \@attr;
1315
1316	0						my @attrARFF = getAttrDelux($feature, \@attr);
1317	0						$bucketAttrARFF{$testBucket} = \@attrARFF;
1318							}
1319
1320							#add to overall
1321	0						$vecARFFattr{vector} = \%bucketAttr;
1322	0						$vecARFFattr{arff} = \%bucketAttrARFF;
1323
1324	0						return %vecARFFattr;
1325							}
1326							}
1327
1328
1329
1330							#makes an array with unique elements
1331							# input : @orig_arr <-- the original array w/ repeats
1332							# output : @new_arr <-- same array but w/o repeats
1333							sub makeUniq{
1334	0			0	0		my $orig_arr_ref = shift;
1335	0						my @orig_arr = @$orig_arr_ref;
1336
1337	0						my @new_arr = ();
1338	0						foreach my $t (@orig_arr){
1339	0	0	0				unless($uniSub->inArr($t, \@new_arr) or $t =~/\s+/o or $t =~/\b$entId\b/o or length($t) == 0){
			0
			0
1340	0						push @new_arr, $t;
1341							}
1342							}
1343	0						@new_arr = grep { $_ ne '' } @new_arr;
	0
1344	0						return @new_arr;
1345							}
1346
1347
1348							#returns the attribute values of a range of buckets
1349							# input : $type <-- the feature type [e.g. ortho, morph, text]
1350							# : @bucketRange <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm]
1351							# : %buckets <-- the bucket key set
1352							# output : @attributes <-- the set of attributes for the specific type and range
1353							sub getRangeAttributes{
1354	0			0	0		my $type = shift;
1355	0						my $bucketRange_ref = shift;
1356	0						my $buckets_ref = shift;
1357	0						my @bucketRange = @$bucketRange_ref;
1358	0						my %buckets = %$buckets_ref;
1359
1360							#collect all the necessary keys
1361	0						my @keyRing = ();
1362	0						foreach my $bucket (sort { $a <=> $b } keys %buckets){
	0
1363	0	0					if($uniSub->inArr($bucket, \@bucketRange)){
1364	0						my @keys = @{$buckets{$bucket}};
	0
1365	0						push @keyRing, @keys;
1366							}
1367							}
1368
1369							#get the tokens for each associated key
1370	0						my @bucketTokens = ();
1371	0						foreach my $key (@keyRing){
1372	0						push @bucketTokens, @{$tokenHash{$key}};
	0
1373							}
1374
1375							#get the concepts for each associated key
1376	0						my @bucketConcepts = ();
1377	0	0					if($type eq "sem"){
		0
1378	0						foreach my $key (@keyRing){
1379	0						push @bucketConcepts, $semHash{$key};
1380							}
1381							}elsif($type eq "cui"){
1382	0						foreach my $key (@keyRing){
1383	0						push @bucketConcepts, $cuiHash{$key};
1384							}
1385							}
1386
1387
1388							#get particular value from the tokens and concepts
1389	0						my @attributes = ();
1390	0	0	0				if($type eq "text" or $type eq "pos"){ #get the text attributes
		0	0
1391	0						my @tokenWords = ();
1392	0						foreach my $token(@bucketTokens){
1393	0						my $tokenText = $token->{text};
1394
1395							#add to the tokens
1396	0	0	0				if($tokenText =~ /\w+\s\w+/o){
		0	0
1397	0						my @tokenText2 = split(" ", $tokenText);
1398	0						push @tokenWords, @tokenText2;
1399							}elsif($tokenText ne "." and $tokenText ne "-" and !($tokenText =~ /[^a-zA-Z0-9]/)){
1400	0						push @tokenWords, $tokenText;
1401							}
1402
1403							#clean up the text
1404	0						foreach my $toky(@tokenWords){
1405	0						$toky = cleanToken($toky);
1406							}
1407
1408							}
1409							#gets the tokens for the attributes and vector analysis
1410	0	0					if($type eq "text"){
		0
1411	0						@attributes = @tokenWords;
1412							}elsif($type eq "pos"){
1413	0						@attributes = getTokensPOS(\@bucketTokens, \@tokenWords, \@keyRing);
1414							}
1415
1416							}
1417							#get the concept-based attributes
1418							elsif($type eq "sem" or $type eq "cui"){
1419	0						my @conWords = ();
1420	0						foreach my $conFeat(@bucketConcepts){
1421	0						my @conLine = split / /, $conFeat;
1422	0						push @conWords, @conLine;
1423							}
1424	0						@attributes = uniq (@conWords);
1425
1426							#add a semantic label for differentiation
1427	0	0					if($type eq "sem"){
1428	0						foreach my $a (@attributes){$a = uc($a);}
	0
1429							}
1430
1431							}
1432							#my $a = @attributes;
1433							#$uniSub->printColorDebug("red", "$type ATTR: #$a\n");
1434							#printArr("\n", @attributes);
1435
1436	0						return @attributes;
1437							}
1438
1439							#makes the arff version attributes - makes a copy of each attribute but with "_self" at the end
1440							# input : $f <-- the feature type (used for special features like POS and morph)
1441							# : @attrs <-- the attributes to ready for arff output
1442							# output : @attrDelux <-- the delux-arff attribute set
1443							sub getAttrDelux{
1444	0			0	0		my $f = shift;
1445	0						my $attr_ref = shift;
1446	0						my @attr = @$attr_ref;
1447
1448							#add the _self copy
1449	0						my @attrDelux = ();
1450	0						foreach my $word (@attr){
1451							#check if certain type of feature
1452	0	0					if($f eq "pos"){
1453	0						$word = ($word . "_POS");
1454							}
1455	0						$word =~s/$entId//g;
1456
1457							#add the copy and then the original
1458	0						my $copy = "$word" . "$selfId";
1459	0	0					if(!$uniSub->inArr($word, \@attrDelux)){
1460	0						push (@attrDelux, $copy);
1461	0						push(@attrDelux, $word);
1462							}
1463							}
1464	0						return @attrDelux;
1465							}
1466
1467							#returns the lines from a range of buckets
1468							# input : $type <-- the feature type [e.g. ortho, morph, text]
1469							# : @bucketRange <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm]
1470							# : %buckets <-- the bucket key set
1471							# output : @bucketLines <-- the lines for the specific type and bucket keys based on the range
1472							sub getRangeLines{
1473	0			0	0		my $type = shift;
1474	0						my $bucketRange_ref = shift;
1475	0						my $buckets_ref = shift;
1476	0						my @bucketRange = @$bucketRange_ref;
1477	0						my %buckets = %$buckets_ref;
1478
1479							#collect all the necessary keys
1480	0						my @keyRing = ();
1481	0						foreach my $bucket (sort { $a <=> $b } keys %buckets){
	0
1482	0						my @bucKeys = @{$buckets{$bucket}};
	0
1483	0	0					if($uniSub->inArr($bucket, \@bucketRange)){
1484	0						push @keyRing, @bucKeys;
1485							}
1486							}
1487
1488	0						my @bucketLines = ();
1489							#get the lines for each associated key
1490	0	0					if($type eq "text"){
		0
		0
		0
1491							#[line based]
1492	0						foreach my $key (@keyRing){
1493	0						my $line = $fileHash{$key};
1494	0						push @bucketLines, $line;
1495							}
1496							}
1497							elsif($type eq "pos"){
1498	0						foreach my $key (@keyRing){
1499	0						my $line = $posHash{$key};
1500	0						push @bucketLines, $line;
1501							}
1502							}
1503							elsif($type eq "sem"){
1504	0						foreach my $key (@keyRing){
1505	0						my $line = $semHash{$key};
1506	0						push @bucketLines, $line;
1507							}
1508							}
1509							elsif($type eq "cui"){
1510	0						foreach my $key (@keyRing){
1511	0						my $line = $cuiHash{$key};
1512	0						push @bucketLines, $line;
1513							}
1514							}
1515
1516	0						return @bucketLines;
1517							}
1518							#looks at the prefix # and suffix # and returns a substring of each word found in the bucket text set
1519							# input : @bucketRange <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm]
1520							# : %buckets <-- the bucket key set
1521							# output : @attributes <-- the morphological attribute set
1522							sub getMorphoAttributes{
1523	0			0	0		my $bucketRange_ref = shift;
1524	0						my $buckets_ref = shift;
1525	0						my @bucketRange = @$bucketRange_ref;
1526	0						my %buckets = %$buckets_ref;
1527
1528							#collect all the necessary keys
1529	0						my @keyRing = ();
1530	0						foreach my $bucket (sort { $a <=> $b } keys %buckets){
	0
1531	0	0					if($uniSub->inArr($bucket, \@bucketRange)){
1532	0						my @keys = @{$buckets{$bucket}};
	0
1533	0						push @keyRing, @keys;
1534							}
1535							}
1536
1537	0						my @bucketLines = ();
1538							#get the lines for each associated key
1539	0						foreach my $key (@keyRing){
1540	0						my $line = $fileHash{$key};
1541	0						push @bucketLines, $line;
1542							}
1543
1544							#get each word from each line
1545	0						my @wordSet = ();
1546	0						foreach my $line (@bucketLines){
1547	0						my @words = split(" ", $line);
1548	0						push(@wordSet, @words);
1549							}
1550
1551							#get the prefix and suffix from each word
1552	0						my @attributes = ();
1553	0						foreach my $word (@wordSet){
1554	0						$word =~s/$entId//g;
1555	0						push(@attributes, substr($word, 0, $prefix)); #add the word's prefix
1556	0						push(@attributes, substr($word, -$suffix)); #add the word's suffix
1557							}
1558
1559							#my $a = @attributes;
1560							#$uniSub->printColorDebug("red", "$type ATTR: #$a\n");
1561							#printArr("\n", @attributes);
1562
1563	0						return @attributes;
1564							}
1565
1566							#formats attributes for the ARFF file
1567							# input : @set <-- the attribute set
1568							# output : @attributes <-- the arff formatted attributes
1569							sub makeAttrData{
1570	0			0	0		my $set_ref = shift;
1571	0						my @set = @$set_ref;
1572
1573	0						my @attributes = ();
1574	0						foreach my $attr (@set){
1575	0						push (@attributes, "\@ATTRIBUTE $attr NUMERIC");
1576							}
1577
1578	0						return @attributes;
1579							}
1580
1581							##new stoplist function
1582							sub stop {
1583
1584	0			0	0		my $stopfile = shift;
1585
1586	0						my $stop_regex = "";
1587	0						my $stop_mode = "AND";
1588
1589	0	0					open ( STP, $stopfile ) \|\|
1590							die ("Couldn't open the stoplist file $stopfile\n");
1591
1592	0						while ( ) {
1593	0						chomp;
1594
1595	0	0					if(/\@stop.mode\s=\s(\w+)\s*$/) {
1596	0						$stop_mode=$1;
1597	0	0					if(!($stop_mode=~/^(AND\|and\|OR\|or)$/)) {
1598	0						print STDERR "Requested Stop Mode $1 is not supported.\n";
1599	0						exit;
1600							}
1601	0						next;
1602							}
1603
1604							# accepting Perl Regexs from Stopfile
1605	0						s/^\s+//;
1606	0						s/\s+$//;
1607
1608							#handling a blank lines
1609	0	0					if(/^\s*$/) { next; }
	0
1610
1611							#check if a valid Perl Regex
1612	0	0					if(!(/^\//)) {
1613	0						print STDERR "Stop token regular expression <$_> should start with '/'\n";
1614	0						exit;
1615							}
1616	0	0					if(!(/\/$/)) {
1617	0						print STDERR "Stop token regular expression <$_> should end with '/'\n";
1618	0						exit;
1619							}
1620
1621							#remove the / s from beginning and end
1622	0						s/^\///;
1623	0						s/\/$//;
1624
1625							#form a single big regex
1626	0						$stop_regex.="(".$_.")\|";
1627							}
1628
1629	0	0					if(length($stop_regex)<=0) {
1630	0						print STDERR "No valid Perl Regular Experssion found in Stop file $stopfile";
1631	0						exit;
1632							}
1633
1634	0						chop $stop_regex;
1635
1636							# making AND a default stop mode
1637	0	0					if(!defined $stop_mode) {
1638	0						$stop_mode="AND";
1639							}
1640
1641	0						close STP;
1642
1643	0						return $stop_regex;
1644							}
1645
1646							1;