File Coverage

blib/lib/NanoB2B/NER.pm
Criterion Covered Total %
statement 23 351 6.5
branch 0 106 0.0
condition 0 21 0.0
subroutine 8 18 44.4
pod 6 9 66.6
total 37 505 7.3


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # NanoB2B::NER
3             # (Last Updated $Id: NER.pm,v 0.03 2017/07/06 16:52:33 charityml Exp $)
4             #
5             # Perl module that turns labeled text lines into
6             # ARFF files based on specified features
7             # that are extracted using MetaMap
8             # and runs through WEKA to average the results
9             #
10             # Copyright (c) 2017
11             #
12             # Megan Charity, Virginia Commonwealth University
13             # charityml at vcu.edu
14             #
15             # Bridget T. McInnes, Virginia Commonwealth University
16             # btmcinnes at vcu.edu
17             #
18             # This program is free software; you can redistribute it and/or
19             # modify it under the terms of the GNU General Public License
20             # as published by the Free Software Foundation; either version 2
21             # of the License, or (at your option) any later version.
22             #
23             # This program is distributed in the hope that it will be useful,
24             # but WITHOUT ANY WARRANTY; without even the implied warranty of
25             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26             # GNU General Public License for more details.
27             #
28             # You should have received a copy of the GNU General Public License
29             # along with this program; if not, write to
30             #
31             # The Free Software Foundation, Inc.,
32             # 59 Temple Place - Suite 330,
33             # Boston, MA 02111-1307, USA.
34              
35              
36             =head1 NAME
37              
38             NanoB2B::NER - turns labeled text lines into ARFF files based on
39             specified features that are extracted using MetaMap and runs
40             through WEKA to average the results
41              
42             =head1 DESCRIPTION
43              
44             This package turns labeled text lines into ARFF files based on
45             specified features that are extracted using MetaMap and runs
46             through WEKA to average the results
47              
48             For more information please see the NanoB2B::NER.pm documentation.
49              
50             =head1 SYNOPSIS
51              
52             See the INITIALIZING THE MODULE section below for a usage example.
53              
54             =head1 ABSTRACT
55              
56             There is a critical need to automatically extract and synthesize knowledge and
57             trends in nanotechnology research from an exponentially increasing body of
58             literature. Engineered nanomaterials (ENMs), such as nanomedicines, are
59             continuously being discovered and Natural Language Processing approaches can
60             semi‐automate the cataloging of ENMs and their unique physico‐chemical
61             properties; automatically aggregate studies on their exposure and hazards;
62             and link the physicochemical properties to the measured effects.
63             The goal of this project is to develop a nanomedicine entity extraction system
64             to automatically identify nanomedicine physico-characteristics,
65             exposure and biological effects.
66              
67             =head1 INSTALL
68              
69             To install the module, run the following magic commands:
70              
71             perl Makefile.PL
72             make
73             make test
74             make install
75              
76             This will install the module in the standard location. You will, most
77             probably, require root privileges to install in standard system
78             directories. To install in a non-standard directory, specify a prefix
79             during the 'perl Makefile.PL' stage as:
80              
81             perl Makefile.PL PREFIX=/home/milk
82              
83             It is possible to modify other parameters during installation. The
84             details of these can be found in the ExtUtils::MakeMaker
85             documentation. However, it is highly recommended that you do not change
86             other parameters unless you know what you're doing.
87              
88             =head1 FUNCTION DESCRIPTIONS
89             =cut
90              
91             package NanoB2B::NER;
92              
93 1     1   446 use 5.006;
  1         3  
94 1     1   4 use strict;
  1         3  
  1         18  
95 1     1   3 use warnings FATAL => 'all';
  1         4  
  1         38  
96              
97 1     1   300 use NanoB2B::UniversalRoutines;
  1         3  
  1         27  
98 1     1   472 use NanoB2B::NER::Metaman;
  1         2  
  1         23  
99 1     1   450 use NanoB2B::NER::Arffman;
  1         2  
  1         27  
100 1     1   337 use NanoB2B::NER::Wekaman;
  1         3  
  1         24  
101 1     1   352 use NanoB2B::NER::Avgman;
  1         4  
  1         2333  
102              
103             #the instances of the modules (named 'boy' because they are expendable and constantly changing - like Robin, BOY Wonder to BatMAN)
104             my $uniSub;
105             my $metaboy;
106             my $arffboy;
107             my $wekaboy;
108             my $avgboy;
109              
110             our $VERSION = '0.07';
111              
112             #option variables
113             my $debug = 0;
114              
115             #for wcs
116             my $wcs_found = 1;
117              
118             =head1 NAME
119              
120             NanoB2B::NER - The main module that runs all of the processes for NER
121              
122             =head1 DESCRIPTION
123              
124             This package turns nanoparticle texts into ARFF
125             files and WEKA accuracy files based on the nanoparticle characteristics
126             found from pre-annotated articles
127              
128             =head1 VERSION
129              
130             Version 0.07
131              
132             =head1 INITIALIZING THE MODULE
133              
134             To create an instance of the ner module, using default values
135             for all configuration options:
136              
137             use NanoB2B::NER;
138             my %params = ();
139             $params{'dir'} = "my_directory";
140             $params{'features'} = "ortho morph text pos cui sem";
141              
142             my $nner = new NanoB2B::NER(\%params);
143              
144             =cut
145              
146              
147             # -------------------- Class methods start here --------------------
148              
149             # method to create a new NanoB2B::NER object
150             # output: $self <- an NER object
151             sub new {
152             #grab class and parameters
153 0     0 0   my $self = {};
154 0           my $class = shift;
155 0 0         return undef if(ref $class);
156 0           my $params = shift;
157              
158             #bless this object
159 0           bless $self, $class;
160 0           $self->_init($params);
161              
162             #retrieve parameters for universal-routines
163 0           my %uniParams = ();
164 0           $uniParams{'debug'} = $debug;
165 0           $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
166              
167             #return the object
168 0           return $self;
169             }
170              
171             # method to initialize the NanoB2B::NER object.
172             # input : $parameters <- reference to a hash
173             # output: none (option values are stored on $self and in module-level variables)
174             sub _init {
175 0     0     my $self = shift;
176 0           my $params = shift;
177              
178 0 0         $params = {} if(!defined $params);
179              
180             #get the parameters
181 0           my $opt_dir = $params->{'dir'};
182 0           my $opt_file = $params->{'file'};
183 0           my $opt_sortbysize = $params->{'sortBySize'};
184 0           my $opt_index = $params->{'index'};
185 0           my $opt_debug = $params->{'debug'};
186 0           my $opt_importmeta = $params->{'import_meta'};
187 0           my $opt_features = $params->{'features'};
188 0           my $opt_buckets = $params->{'buckets'};
189 0           my $opt_stopwords = $params->{'stopwords'};
190 0           my $opt_is_cui = $params->{'is_cui'};
191 0           my $opt_sparse = $params->{'sparse_matrix'};
192 0           my $opt_prefix = $params->{'prefix'};
193 0           my $opt_suffix = $params->{'suffix'};
194 0           my $opt_wcs = $params->{'wcs'};
195 0           my $opt_wekatype = $params->{'weka_type'};
196 0           my $opt_wekasize = $params->{'weka_size'};
197 0           my $opt_metamaparguments = $params->{'metamap_arguments'};
198              
199              
200             #set the global variables
201              
202             #required variables
203 0 0         if(defined $opt_dir){ #if using the entire directory
204 0           $self->{program_dir} = $opt_dir;
205             }else{
206 0           print ("***ERROR: DIRECTORY NOT DEFINED!!***\n");
207 0           exit(-1);
208             }
209 0 0         if(defined $opt_features){ #grab the features you want to use
210 0           $self->{features} = $opt_features;
211             }else{
212 0           print("***ERROR: FEATURE SET NOT DEFINED!!***\n");
213 0           exit(-1);
214             }
215              
216              
217             #not required variables
218 0 0         if(defined $opt_file){ #if using one file
219 0           $self->{program_file} = $opt_file;
220             }
221              
222 0 0         if(defined $opt_debug){ #run the programs with debug mode on
223 0           $self->{debug} = $opt_debug;
224 0           $debug = $opt_debug;
225             }else{
226 0           $debug = 0;
227             }
228              
229 0 0         if(defined $opt_stopwords){ #exclude stop words from arff vectors
230 0           $self->{stopwords} = $opt_stopwords;
231             }
232              
233 0 0         if(defined $opt_is_cui){ #if a word doesn't have a cui - don't make a vector
234 0           $self->{is_cui} = $opt_is_cui;
235             }else{
236 0           $self->{is_cui} = 0;
237             }
238              
239 0 0         if(defined $opt_sparse){ #decide to turn vector's into sparse format
240 0           $self->{sparse_matrix} = $opt_sparse;
241             }else{
242 0           $self->{sparse_matrix} = 0;
243             }
244              
245 0 0         if(defined $opt_importmeta){ #decide to import metamap or not
246 0           $self->{import_meta} = $opt_importmeta;
247             }else{
248 0           $self->{import_meta} = 0;
249             }
250              
251 0 0         if(defined $opt_prefix){ #get the word prefix character count
252 0           $self->{prefix} = $opt_prefix;
253             }else{
254 0           $self->{prefix} = 3;
255             }
256 0 0         if(defined $opt_suffix){ #get the word suffix character count
257 0           $self->{suffix} = $opt_suffix;
258             }else{
259 0           $self->{suffix} = 3;
260             }
261              
262 0 0         if(defined $opt_wcs){ #check if to do worst-case-scenario backup
263 0           $self->{wcs} = $opt_wcs;
264 0           $wcs_found = 0;
265             }
266              
267 0 0         if(defined $opt_wekatype){ #get the type of weka algorithm to run
268 0           $self->{weka_type} = $opt_wekatype;
269             }else{
270 0           $self->{weka_type} = "weka.classifiers.bayes.NaiveBayes";
271             }
272              
273 0 0         if(defined $opt_wekasize){ #set the memory allocation size for weka to run
274 0           $self->{weka_size} = $opt_wekasize;
275             }else{
276 0           $self->{weka_size} = "-Xmx2G";
277             }
278              
279 0 0         if(defined $opt_buckets){ #set the number of buckets to run for k-fold cross validation
280 0           $self->{bucketsNum} = $opt_buckets;
281             }else{
282 0           $self->{bucketsNum} = 10;
283             }
284              
285 0 0         if(defined $opt_index){ #start at a certain file given an index
286 0           $self->{fileIndex} = $opt_index;
287             }else{
288 0           $self->{fileIndex} = 1;
289             }
290              
291 0 0         if(defined $opt_sortbysize){ #sort the files by size before processing them
292 0           $self->{sortSize} = $opt_sortbysize;
293             }else{
294 0           $self->{sortSize} = 0;
295             }
296 0 0         if(defined $opt_metamaparguments){ #run metamap with specific arguments
297 0           $self->{metamap_arguments} = $opt_metamaparguments;
298             } else {
299 0           $self->{metamap_arguments} = "-q";
300             }
301              
302             #error handling?
303 0 0 0       if(defined $opt_file and $self->{fileIndex} > 1){
304 0           print("***ERROR: Cannot have an index value for a single file!***\n");
305 0           exit;
306             }
307              
308             #check for out of bounds index
309 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
310 0 0 0       my @files = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR;
  0            
311 0           my $totFiles = @files;
312 0 0         if($self->{fileIndex} > $totFiles){
313 0           print("***ERROR: Index cannot be greater than the number of files in the directory!***\n");
314 0           exit;
315             }
316              
317             }
318              
319              
320              
321              
322             ######################### THE MEATY STUFF ##########################
323              
324             =head3 nerByFile
325              
326             description:
327              
328             Processes the files in the directory given by the 'dir' parameter: each file is run through MetaMap, converted to ARFF, run through WEKA, and averaged.
329             This NER method doesn't move on to the next file until all of the steps have been run on the current file.
330              
331             input:
332              
333             None
334              
335             output:
336              
337             Metamap files, ARFF file sets, Weka file sets, and Averaged Accuracy files
338              
339             example:
340              
341             use NanoB2B::NER;
342             my %params = ();
343             $params{'dir'} = "my_directory";
344             $params{'features'} = "ortho morph text pos cui sem";
345              
346             my $nner = new NanoB2B::NER(\%params);
347             $nner->nerByFile();
348              
349             =cut
350              
351             sub nerByFile{
352 0     0 1   my $self = shift;
353              
354             #open the directory
355 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
356              
357             #if not doing a single file
358 0 0         if(!defined $self->{program_file}){
359 0 0 0       my @files = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
360              
361             #sort by size?
362 0 0         if($self->{sortSize}){
363 0           @files = sortBySize($self, \@files);
364             }
365             #ner the files individually
366 0           my $totalTags = @files;
367 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
368 0           $uniSub->printColorDebug("on_blue", "FILE #$a / $totalTags");
369 0           my $tag = $files[$a - 1];
370            
371             #metamap the files if needed
372 0 0         if(!$self->{import_meta}){
373 0           $uniSub->printColorDebug("bold cyan", "--- METAMAP ---\n");
374 0           my %paramsm = ();
375 0           $paramsm{'directory'} = $self->{program_dir};
376 0           $paramsm{'index'} = $self->{index};
377 0           $paramsm{'metamap_arguments'} = $self->{metamap_arguments};
378 0           $paramsm{'debug'} = $debug;
379 0           $metaboy = NanoB2B::NER::Metaman->new(\%paramsm);
380 0           $metaboy->meta_file($tag);
381             }
382              
383             #arff the file
384 0           $uniSub->printColorDebug("bold magenta", "--- ARFF ---\n");
385             #define arffboy with the parameters
386 0           my %paramsr = ();
387 0           $paramsr{'directory'} = $self->{program_dir};
388 0           $paramsr{'features'} = $self->{features};
389 0           $paramsr{'bucketsNum'} = $self->{bucketsNum};
390 0           $paramsr{'debug'} = $debug;
391 0           $paramsr{'prefix'} = $self->{prefix};
392 0           $paramsr{'suffix'} = $self->{suffix};
393 0           $paramsr{'index'} = $self->{index};
394 0           $paramsr{'stopwords'} = $self->{stopwords};
395 0           $paramsr{'is_cui'} = $self->{is_cui};
396 0           $paramsr{'sparse_matrix'} = $self->{sparse_matrix};
397 0 0         if(!$wcs_found){
398 0           $paramsr{'wcs'} = $self->{wcs};
399 0           $wcs_found = 1;
400             }
401 0           $arffboy = NanoB2B::NER::Arffman->new(\%paramsr);
402 0           $arffboy->arff_file($tag);
403              
404             #weka the file
405 0           $uniSub->printColorDebug("bold yellow", "--- WEKA ---\n");
406             #define wekaboy with the parameters
407 0           my %paramsw = ();
408 0           $paramsw{'directory'} = $self->{program_dir};
409 0           $paramsw{'type'} = $self->{weka_type};
410 0           $paramsw{'weka_size'} = $self->{weka_size};
411 0           $paramsw{'features'} = $self->{features};
412 0           $paramsw{'buckets'} = $self->{bucketsNum};
413 0           $paramsw{'debug'} = $debug;
414 0           $wekaboy = NanoB2B::NER::Wekaman->new(\%paramsw);
415 0           $wekaboy->weka_file($tag);
416              
417             #average the set
418 0           $uniSub->printColorDebug("bold green", "--- AVG ---\n");
419             #define avgboy with the parameters
420 0           my %paramsa = ();
421 0           $paramsa{'directory'} = $self->{program_dir};
422 0           my @a = split(/\./, $self->{weka_type});
423 0           $paramsa{'weka_dir'} = $a[$#a];
424 0           $paramsa{'features'} = $self->{features};
425 0           $paramsa{'buckets'} = $self->{bucketsNum};
426 0           $paramsa{'debug'} = $debug;
427 0           $avgboy = NanoB2B::NER::Avgman->new(\%paramsa);
428 0           $avgboy->avg_file($tag);
429              
430 0           $uniSub->printColorDebug("on_blue", "## FINISHED #$a - $tag! ##");
431             }
432             }else{
433 0           my $tag = $self->{program_file};
434 0           $uniSub->printColorDebug("on_blue", "FILE #$tag");
435            
436             #metamap the files if needed
437 0 0         if(!$self->{import_meta}){
438 0           $uniSub->printColorDebug("bold cyan", "--- METAMAP ---\n");
439 0           my %paramsm = ();
440 0           $paramsm{'directory'} = $self->{program_dir};
441 0           $paramsm{'index'} = $self->{index};
442 0           $paramsm{'metamap_arguments'} = $self->{metamap_arguments};
443 0           $paramsm{'debug'} = $debug;
444 0           $metaboy = NanoB2B::NER::Metaman->new(\%paramsm);
445 0           $metaboy->meta_file($tag);
446             }
447              
448             #arff the file
449 0           $uniSub->printColorDebug("bold magenta", "--- ARFF ---\n");
450             #define arffboy with the parameters
451 0           my %paramsr = ();
452 0           $paramsr{'directory'} = $self->{program_dir};
453 0           $paramsr{'features'} = $self->{features};
454 0           $paramsr{'bucketsNum'} = $self->{bucketsNum};
455 0           $paramsr{'debug'} = $debug;
456 0           $paramsr{'prefix'} = $self->{prefix};
457 0           $paramsr{'suffix'} = $self->{suffix};
458 0           $paramsr{'index'} = $self->{index};
459 0           $paramsr{'stopwords'} = $self->{stopwords};
460 0           $paramsr{'is_cui'} = $self->{is_cui};
461 0           $paramsr{'sparse_matrix'} = $self->{sparse_matrix};
462 0 0         if(!$wcs_found){
463 0           $paramsr{'wcs'} = $self->{wcs};
464 0           $wcs_found = 1;
465             }
466 0           $arffboy = NanoB2B::NER::Arffman->new(\%paramsr);
467 0           $arffboy->arff_file($tag);
468              
469             #weka the file
470 0           $uniSub->printColorDebug("bold yellow", "--- WEKA ---\n");
471             #define wekaboy with the parameters
472 0           my %paramsw = ();
473 0           $paramsw{'directory'} = $self->{program_dir};
474 0           $paramsw{'type'} = $self->{weka_type};
475 0           $paramsw{'weka_size'} = $self->{weka_size};
476 0           $paramsw{'features'} = $self->{features};
477 0           $paramsw{'buckets'} = $self->{bucketsNum};
478 0           $paramsw{'debug'} = $debug;
479 0           $wekaboy = NanoB2B::NER::Wekaman->new(\%paramsw);
480 0           $wekaboy->weka_file($tag);
481              
482             #average the set
483 0           $uniSub->printColorDebug("bold green", "--- AVG ---\n");
484             #define avgboy with the parameters
485 0           my %paramsa = ();
486 0           $paramsa{'directory'} = $self->{program_dir};
487 0           my @a = split(/\./, $self->{weka_type});
488 0           $paramsa{'weka_dir'} = $a[$#a];
489 0           $paramsa{'features'} = $self->{features};
490 0           $paramsa{'buckets'} = $self->{bucketsNum};
491 0           $paramsa{'debug'} = $debug;
492 0           $avgboy = NanoB2B::NER::Avgman->new(\%paramsa);
493 0           $avgboy->avg_file($tag);
494              
495 0           $uniSub->printColorDebug("on_blue", "## FINISHED #$tag! ##");
496             }
497            
498             }
499              
500              
501             =head3 nerByMethod
502              
503             description:
504              
505             Processes the files in the directory given by the 'dir' parameter: all files are run through MetaMap, then all are converted to ARFF, then all are run through WEKA, and finally all are averaged.
506             This NER method doesn't move on to the next step until all of the files have been processed by the current one.
507              
508             input:
509              
510             None
511              
512             output:
513              
514             Metamap files, ARFF file sets, Weka file sets, and Averaged Accuracy files
515              
516             example:
517              
518             use NanoB2B::NER;
519             my %params = ();
520             $params{'dir'} = "my_directory";
521             $params{'features'} = "ortho morph text pos cui sem";
522              
523             my $nner = new NanoB2B::NER(\%params);
524             $nner->nerByMethod();
525              
526             =cut
527              
528             sub nerByMethod{
529 0     0 1   my $self = shift;
530              
531             #meta the files if needed
532 0 0         if(!$self->{import_meta}){
533 0           $self->metaSet();
534             }
535            
536             #arff the files
537 0           $self->arffSet();
538              
539             #weka the files
540 0           $self->wekaSet();
541              
542             #average the files
543 0           $self->avgSet();
544             }
545              
546             =head3 metaSet
547              
548             description:
549              
550             Runs a set of files through metamap
551              
552             input:
553              
554             None
555              
556             output:
557              
558             Metamap files for every file found in the directory specified in the constructor parameters
559              
560             example:
561              
562             use NanoB2B::NER;
563             my %params = ();
564             $params{'dir'} = "my_directory";
565             $params{'features'} = "ortho morph text pos cui sem";
566              
567             my $nner = new NanoB2B::NER(\%params);
568             $nner->metaSet();
569              
570             =cut
571             sub metaSet{
572 0     0 1   my $self = shift;
573              
574 0           print "\tDIR: " . $self->{'program_dir'} . "\n";
575            
576             #open the directory
577 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
578 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
579 0           my $totalTags = @tags;
580              
581             #if only one file
582 0 0         if($self->{program_file}){
583 0           $totalTags = 1;
584             }
585              
586             #sort by size?
587 0 0         if($self->{sortSize}){
588 0           @tags = sortBySize($self, \@tags);
589             }
590              
591             #if only one file reduce it to the one
592 0 0         if(defined $self->{program_file}){
593 0           @tags = ($self->{program_file});
594             }
595              
596             #define metaboy with the parameters
597 0           my %params = ();
598 0           $params{'directory'} = $self->{program_dir};
599 0           $params{'index'} = $self->{index};
600 0           $params{'metamap_arguments'} = $self->{metamap_arguments};
601 0           $params{'debug'} = $debug;
602 0           $metaboy = NanoB2B::NER::Metaman->new(\%params);
603            
604             #run set through metamap
605 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
606 0           $uniSub->printColorDebug("bold cyan", "META FILE #$a / $totalTags\n");
607 0           my $tag = $tags[$a - 1];
608 0           $metaboy->meta_file($tag);
609 0           $uniSub->printColorDebug("bold cyan", "## FINISHED METAMAP #$a - $tag! ##\n");
610             }
611             }
612              
613             =head3 arffSet
614              
615             description:
616              
617             Turns a set of files into ARFF files based on the features specified in the constructor parameters
618              
619             input:
620              
621             None
622              
623             output:
624              
625             ARFF file sets for every file found in the directory specified in the constructor parameters
626              
627             example:
628              
629             use NanoB2B::NER;
630             my %params = ();
631             $params{'dir'} = "my_directory";
632             $params{'features'} = "ortho morph text pos cui sem";
633              
634             my $nner = new NanoB2B::NER(\%params);
635             $nner->arffSet();
636              
637             =cut
638             sub arffSet{
639 0     0 1   my $self = shift;
640              
641             #open the directory
642 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
643 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
644 0           my $totalTags = @tags;
645              
646             #if only one file
647 0 0         if($self->{program_file}){
648 0           $totalTags = 1;
649             }
650              
651             #sort by size?
652 0 0         if($self->{sortSize}){
653 0           @tags = sortBySize($self, \@tags);
654             }
655              
656             #if only one file reduce it to the one
657 0 0         if(defined $self->{program_file}){
658 0           @tags = ($self->{program_file});
659             }
660              
661             #define arffboy with the parameters
662 0           my %params = ();
663 0           $params{'directory'} = $self->{'program_dir'};
664 0           $params{'features'} = $self->{'features'};
665 0           $params{'bucketsNum'} = $self->{'bucketsNum'};
666 0           $params{'debug'} = $debug;
667 0           $params{'prefix'} = $self->{'prefix'};
668 0           $params{'suffix'} = $self->{'suffix'};
669 0           $params{'index'} = $self->{'index'};
670 0           $params{'stopwords'} = $self->{'stopwords'};
671 0           $params{'is_cui'} = $self->{'is_cui'};
672 0           $params{'sparse_matrix'} = $self->{'sparse_matrix'};
673 0           $params{'wcs'} = $self->{'wcs'};
674 0           $arffboy = NanoB2B::NER::Arffman->new(\%params);
675              
676 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
677 0           $uniSub->printColorDebug("bold magenta", "ARFF FILE #$a / $totalTags\n");
678 0           my $tag = $tags[$a - 1];
679 0 0         if($wcs_found){
680 0           $params{'wcs'} = "";
681 0           $arffboy = NanoB2B::NER::Arffman->new(\%params);
682             }else{
683 0           $wcs_found = 1;
684             }
685 0           $arffboy->arff_file($tag);
686 0           $uniSub->printColorDebug("bold magenta", "## FINISHED ARFF #$a - $tag! ##\n");
687             }
688             }
689              
690             =head3 wekaSet
691              
692             description:
693              
694             Runs a set of ARFF files through WEKA
695              
696             input:
697              
698             None
699              
700             output:
701              
702             WEKA files for every file found in the directory specified in the constructor parameters
703              
704             example:
705              
706             use NanoB2B::NER;
707             my %params = ();
708             $params{'dir'} = "my_directory";
709             $params{'features'} = "ortho morph text pos cui sem";
710              
711             my $nner = new NanoB2B::NER(\%params);
712             $nner->wekaSet();
713              
714             =cut
sub wekaSet {
    # Hands every selected data file to a freshly built Wekaman object.
    # input  : none (reads program_dir, program_file, sortSize, fileIndex,
    #          weka_type, weka_size, features and bucketsNum from $self)
    # output : none; weka_file() is called once per selected file name
    # dies   : if the program directory cannot be opened
    my $self = shift;

    my $dir = $self->{program_dir};

    #open the directory and keep every entry except '.', '..' and names
    #that start with an underscore
    opendir(my $DIR, $dir) or die "Cannot open directory '$dir': $!";
    my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_' } readdir $DIR;
    closedir($DIR);

    #if only one file was requested, reduce the set to that one;
    #otherwise optionally order the directory listing by file size
    my $single = $self->{program_file};
    if (defined $single && length $single) {
        @tags = ($single);
    }
    elsif ($self->{sortSize}) {
        @tags = sortBySize($self, \@tags);
    }

    #count only after the list is final so the loop below can never run
    #past the end of @tags
    my $totalTags = @tags;

    #define wekaboy with the parameters
    my %params = (
        'directory' => $dir,
        'type'      => $self->{weka_type},
        'weka_size' => $self->{weka_size},
        'features'  => $self->{features},
        'buckets'   => $self->{bucketsNum},
        'debug'     => $debug,
    );
    $wekaboy = NanoB2B::NER::Wekaman->new(\%params);

    #fileIndex is 1-based; anything missing or below 1 starts at the first
    #file instead of wrapping around to the end of the list
    my $first = $self->{fileIndex};
    $first = 1 if !defined $first || $first < 1;

    for my $index ($first .. $totalTags) {
        $uniSub->printColorDebug("bold yellow", "## WEKA FILE #$index / $totalTags ##\n");
        my $tag = $tags[$index - 1];
        $wekaboy->weka_file($tag);
        $uniSub->printColorDebug("bold yellow", "## FINISHED WEKA #$index - $tag! ##\n");
        sleep(1);    # NOTE(review): pause kept from the original; reason not visible here
    }
}
756              
757             =head3 avgSet
758              
759             description:
760              
761             Averages together a set of WEKA files
762              
763             input:
764              
765             None
766              
767             output:
768              
769             Average accuracy files for every file found in the directory specified in the constructor parameters
770              
771             example:
772              
773             use NanoB2B::NER;
774             my %params = ();
775             $params{'dir'} = "my_directory";
776             $params{'features'} = "ortho morph text pos cui sem";
777              
778             my $nner = new NanoB2B::NER(\%params);
779             $nner->avgSet();
780              
781             =cut
sub avgSet {
    # Hands every selected data file to a freshly built Avgman object.
    # input  : none (reads program_dir, program_file, sortSize, fileIndex,
    #          weka_type, features and bucketsNum from $self)
    # output : none; avg_file() is called once per selected file name
    # dies   : if the program directory cannot be opened
    my $self = shift;

    my $dir = $self->{program_dir};

    #open the directory and keep every entry except '.', '..' and names
    #that start with an underscore
    opendir(my $DIR, $dir) or die "Cannot open directory '$dir': $!";
    my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_' } readdir $DIR;
    closedir($DIR);

    #if only one file was requested, reduce the set to that one;
    #otherwise optionally order the directory listing by file size
    my $single = $self->{program_file};
    if (defined $single && length $single) {
        @tags = ($single);
    }
    elsif ($self->{sortSize}) {
        @tags = sortBySize($self, \@tags);
    }

    #count only after the list is final so the loop below can never run
    #past the end of @tags
    my $totalTags = @tags;

    #define avgboy with the parameters; weka_dir is the last dot-separated
    #component of weka_type
    my @type_parts = split(/\./, $self->{weka_type});
    my %params = (
        'directory' => $dir,
        'weka_dir'  => $type_parts[-1],
        'features'  => $self->{features},
        'buckets'   => $self->{bucketsNum},
        'debug'     => $debug,
    );
    $avgboy = NanoB2B::NER::Avgman->new(\%params);

    #fileIndex is 1-based; anything missing or below 1 starts at the first
    #file instead of wrapping around to the end of the list
    my $first = $self->{fileIndex};
    $first = 1 if !defined $first || $first < 1;

    for my $index ($first .. $totalTags) {
        $uniSub->printColorDebug("bold green", "AVG FILE #$index / $totalTags\n");
        my $tag = $tags[$index - 1];
        $avgboy->avg_file($tag);
        $uniSub->printColorDebug("bold green", "## FINISHED AVERAGING #$index - $tag! ##\n");
    }
}
822              
sub isArr {
    # Debug stub: dereferences the given array reference into a local copy
    # and prints "yay" followed by a newline to the currently selected handle.
    # input  : $arr_ref <-- reference to an array
    # output : whatever print returns
    my ($self, $arr_ref) = @_;

    # the copy is not used afterwards; the dereference itself is kept so a
    # bad reference fails exactly as it did before
    my @arr = @{$arr_ref};

    print "yay\n";
}
831              
#sorts a list of file names by the size of each file on disk
# input  : \@files  <-- reference to the list of file names in the folder
# output : @newSet  <-- the same file names ordered by size from smallest to largest;
#                       files of equal size are ordered by name
sub sortBySize {
    my $self      = shift;
    my $files_ref = shift;

    my $dir = $self->{program_dir};

    #look up every size once, keyed by file NAME so that two files with the
    #same size can no longer overwrite each other; -s yields nothing for an
    #empty or missing file, which is counted as size 0
    my %size_of = ();
    foreach my $file (@$files_ref) {
        $size_of{$file} = (-s "$dir/$file") || 0;
    }

    #order by size, falling back to the name so the result is deterministic
    my @newSet = sort { $size_of{$a} <=> $size_of{$b} or $a cmp $b } @$files_ref;

    return @newSet;
}
860              
861             1;
862              
863             =head1 SEE ALSO
864              
865             =head1 AUTHOR
866              
867             Megan Charity
868             Bridget T McInnes
869              
870             =head1 COPYRIGHT
871              
872             Copyright (c) 2017
873             Megan Charity, Virginia Commonwealth University
874             charityml at vcu.edu
875              
876             Bridget T. McInnes, Virginia Commonwealth University
877             btmcinnes at vcu.edu
878              
879             This program is free software; you can redistribute it and/or modify it under
880             the terms of the GNU General Public License as published by the Free Software
881             Foundation; either version 2 of the License, or (at your option) any later
882             version.
883              
884             This program is distributed in the hope that it will be useful, but WITHOUT
885             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
886             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
887              
888             You should have received a copy of the GNU General Public License along with
889             this program; if not, write to
890              
891             The Free Software Foundation, Inc.,
892             59 Temple Place - Suite 330,
893             Boston, MA 02111-1307, USA.
894              
895             =cut