File Coverage

blib/lib/NanoB2B/NER.pm
Criterion Covered Total %
statement 26 409 6.3
branch 0 136 0.0
condition 0 24 0.0
subroutine 9 19 47.3
pod 7 9 77.7
total 42 597 7.0


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # NanoB2B::NER
3             # (Last Updated $Id: NER.pm,v 0.09 2018/01/18 16:33:33 charityml Exp $)
4             #
5             # Perl module that turns labeled text lines into
6             # ARFF files based on specified features
7             # that are extracted using MetaMap
8             # and runs through WEKA to average the results
9             #
10             # Copyright (c) 2017
11             #
12             # Megan Charity, Virginia Commonwealth University
13             # charityml at vcu.edu
14             #
15             # Bridget T. McInnes, Virginia Commonwealth University
16             # btmcinnes at vcu.edu
17             #
18             # This program is free software; you can redistribute it and/or
19             # modify it under the terms of the GNU General Public License
20             # as published by the Free Software Foundation; either version 2
21             # of the License, or (at your option) any later version.
22             #
23             # This program is distributed in the hope that it will be useful,
24             # but WITHOUT ANY WARRANTY; without even the implied warranty of
25             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26             # GNU General Public License for more details.
27             #
28             # You should have received a copy of the GNU General Public License
29             # along with this program; if not, write to
30             #
31             # The Free Software Foundation, Inc.,
32             # 59 Temple Place - Suite 330,
33             # Boston, MA 02111-1307, USA.
34              
35              
36             =head1 NAME
37              
38             NanoB2B::NER - turns labeled text lines into ARFF files based on
39             specified features that are extracted using MetaMap and runs
40             through WEKA to average the results
41              
42             =head1 DESCRIPTION
43              
44             This package turns labeled text lines into ARFF files based on
45             specified features that are extracted using MetaMap and runs
46             through WEKA to average the results
47              
48             For more information please see the NanoB2B::NER.pm documentation.
49              
50             =head1 SYNOPSIS
51              
52             See the example under INITIALIZING THE MODULE below.
53              
54             =head1 ABSTRACT
55              
56             There is a critical need to automatically extract and synthesize knowledge and
57             trends in nanotechnology research from an exponentially increasing body of
58             literature. Engineered nanomaterials (ENMs), such as nanomedicines, are
59             continuously being discovered and Natural Language Processing approaches can
60             semi‐automate the cataloging of ENMs and their unique physico‐chemical
61             properties; automatically aggregate studies on their exposure and hazards;
62             and link the physicochemical properties to the measured effects.
63             The goal of this project is to develop a nanomedicine entity extraction system
64             to automatically identify nanomedicine physico-characteristics,
65             exposure and biological effects.
66              
67             =head1 INSTALL
68              
69             To install the module, run the following magic commands:
70              
71             perl Makefile.PL
72             make
73             make test
74             make install
75              
76             This will install the module in the standard location. You will, most
77             probably, require root privileges to install in standard system
78             directories. To install in a non-standard directory, specify a prefix
79             during the 'perl Makefile.PL' stage as:
80              
81             perl Makefile.PL PREFIX=/home/milk
82              
83             It is possible to modify other parameters during installation. The
84             details of these can be found in the ExtUtils::MakeMaker
85             documentation. However, it is highly recommended not to mess around
86             with other parameters unless you know what you're doing.
87              
88             =head1 FUNCTION DESCRIPTIONS
89             =cut
90              
91             package NanoB2B::NER;
92              
93 1     1   537 use 5.006;
  1         3  
94 1     1   8 use strict;
  1         2  
  1         33  
95 1     1   6 use warnings FATAL => 'all';
  1         1  
  1         47  
96              
97 1     1   363 use NanoB2B::UniversalRoutines;
  1         3  
  1         32  
98 1     1   524 use NanoB2B::NER::Metaman;
  1         3  
  1         30  
99 1     1   555 use NanoB2B::NER::Arffman;
  1         5  
  1         44  
100 1     1   565 use NanoB2B::NER::Wekaman;
  1         4  
  1         44  
101 1     1   446 use NanoB2B::NER::Avgman;
  1         2  
  1         41  
102 1     1   453 use NanoB2B::NER::Modelman;
  1         3  
  1         3640  
103              
104             #the instances of the modules (named 'boy' because they are expendable and constantly changing - like Robin, BOY Wonder to BatMAN)
105             my $uniSub;
106             my $metaboy;
107             my $arffboy;
108             my $wekaboy;
109             my $avgboy;
110             my $modelboy;
111              
112             our $VERSION = '1.01';
113              
114             #option variables
115             my $debug = 0;
116              
117             #for wcs
118             my $wcs_found = 1;
119              
120             =head1 NAME
121              
122             NanoB2B::NER - The main file that runs all of the processes for NER
123              
124             =head1 DESCRIPTION
125              
126             This package turns nanoparticle texts into ARFF
127             files and WEKA accuracy files based on the nanoparticle characteristics
128             found from pre-annotated articles
129              
130             =head1 VERSION
131              
132             Version 1.01
133              
134             =head1 INITIALIZING THE MODULE
135              
136             To create an instance of the ner module, using default values
137             for all configuration options:
138              
139             use NanoB2B::NER;
140             my %params = ();
141             $params{'dir'} = "my_directory";
142             $params{'features'} = "ortho morph text pos cui sem";
143              
144             my $nner = new NanoB2B::NER(\%params);
145              
146             =cut
147              
148              
149             # -------------------- Class methods start here --------------------
150              
151             # method to create a new NanoB2B::NER object
152             # output: $self <- an NER object
153             sub new {
154             #grab class and parameters
155 0     0 0   my $self = {};
156 0           my $class = shift;
157 0 0         return undef if(ref $class);
158 0           my $params = shift;
159              
160             #bless this object - amen
161 0           bless $self, $class;
162 0           $self->_init($params);
163              
164             #retrieve parameters for universal-routines
165 0           my %uniParams = ();
166 0           $uniParams{'debug'} = $debug;
167 0           $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
168              
169             #return the object
170 0           return $self;
171             }
172              
173             # method to initialize the NanoB2B::NER object.
174             # input : $parameters <- reference to a hash
175             # output: (module variables to use in other parameters)
176             sub _init {
177 0     0     my $self = shift;
178 0           my $params = shift;
179              
180 0 0         $params = {} if(!defined $params);
181              
182             #get the parameters
183 0           my $opt_dir = $params->{'dir'};
184 0           my $opt_file = $params->{'file'};
185 0           my $opt_sortbysize = $params->{'sortBySize'};
186 0           my $opt_index = $params->{'index'};
187 0           my $opt_debug = $params->{'debug'};
188 0           my $opt_importmeta = $params->{'import_meta'};
189 0           my $opt_features = $params->{'features'};
190 0           my $opt_buckets = $params->{'buckets'};
191 0           my $opt_stopwords = $params->{'stopwords'};
192 0           my $opt_is_cui = $params->{'is_cui'};
193 0           my $opt_sparse = $params->{'sparse_matrix'};
194 0           my $opt_prefix = $params->{'prefix'};
195 0           my $opt_suffix = $params->{'suffix'};
196 0           my $opt_wcs = $params->{'wcs'};
197 0           my $opt_wekatype = $params->{'weka_type'};
198 0           my $opt_wekasize = $params->{'weka_size'};
199 0           my $opt_metamaparguments = $params->{'metamap_arguments'};
200 0           my $opt_model = $params->{'model'};
201              
202              
203             #set the global variables
204              
205             #required variables
206 0 0         if(defined $opt_dir){ #if using the entire directory
207 0           $self->{program_dir} = $opt_dir;
208             }else{
209 0           print ("***ERROR: DIRECTORY NOT DEFINED!!***\n");
210 0           exit(-1);
211             }
212 0 0         if(defined $opt_features){ #grab the features you want to use
213 0           $self->{features} = $opt_features;
214             }else{
215 0           print("***ERROR: FEATURE SET NOT DEFINED!!***\n");
216 0           exit(-1);
217             }
218              
219              
220             #not required variables
221 0 0         if(defined $opt_file){ #if using one file
222 0           $self->{program_file} = $opt_file;
223             }
224              
225 0 0         if(defined $opt_debug){ #run the programs with debug mode on
226 0           $self->{debug} = $opt_debug;
227 0           $debug = $opt_debug;
228             }else{
229 0           $debug = 0;
230             }
231              
232 0 0         if(defined $opt_stopwords){ #exclude stop words from arff vectors
233 0           $self->{stopwords} = $opt_stopwords;
234             }
235              
236 0 0         if(defined $opt_is_cui){ #if a word doesn't have a cui - don't make a vector
237 0           $self->{is_cui} = $opt_is_cui;
238             }else{
239 0           $self->{is_cui} = 0;
240             }
241              
242 0 0         if(defined $opt_sparse){ #decide to turn vector's into sparse format
243 0           $self->{sparse_matrix} = $opt_sparse;
244             }else{
245 0           $self->{sparse_matrix} = 0;
246             }
247              
248 0 0         if(defined $opt_importmeta){ #decide to import metamap or not
249 0           $self->{import_meta} = $opt_importmeta;
250             }else{
251 0           $self->{import_meta} = 0;
252             }
253              
254 0 0         if(defined $opt_prefix){ #get the word prefix character count
255 0           $self->{prefix} = $opt_prefix;
256             }else{
257 0           $self->{prefix} = 3;
258             }
259 0 0         if(defined $opt_suffix){ #get the word suffix character count
260 0           $self->{suffix} = $opt_suffix;
261             }else{
262 0           $self->{suffix} = 3;
263             }
264              
265 0 0         if(defined $opt_wcs){ #check if to do worst-case-scenario backup
266 0           $self->{wcs} = $opt_wcs;
267 0           $wcs_found = 0;
268             }
269              
270 0 0         if(defined $opt_wekatype){ #get the type of weka algorithm to run
271 0           $self->{weka_type} = $opt_wekatype;
272             }else{
273 0           $self->{weka_type} = "weka.classifiers.bayes.NaiveBayes";
274             }
275              
276 0 0         if(defined $opt_wekasize){ #set the memory allocation size for weka to run
277 0           $self->{weka_size} = $opt_wekasize;
278             }else{
279 0           $self->{weka_size} = "-Xmx2G";
280             }
281              
282 0 0         if(defined $opt_buckets){ #set the number of buckets to run for k-fold cross validation
283 0           $self->{bucketsNum} = $opt_buckets;
284             }else{
285 0           $self->{bucketsNum} = 10;
286             }
287              
288 0 0         if(defined $opt_index){ #start at a certain file given an index
289 0           $self->{fileIndex} = $opt_index;
290             }else{
291 0           $self->{fileIndex} = 1;
292             }
293              
294 0 0         if(defined $opt_sortbysize){ #sort the files by size before processing
295 0           $self->{sortSize} = $opt_sortbysize;
296             }else{
297 0           $self->{sortSize} = 0;
298             }
299 0 0         if(defined $opt_metamaparguments){ #run metamap with specific arguments
300 0           $self->{metamap_arguments} = $opt_metamaparguments;
301             } else {
302 0           $self->{metamap_arguments} = "-q";
303             }
304              
305 0 0         if(defined $opt_model){
306 0           $self->{model} = $opt_model;
307             }else{
308 0           $self->{model} = 0;
309             }
310              
311             #error handling?
312 0 0 0       if(defined $opt_file and $self->{fileIndex} > 1){
313 0           print("***ERROR: Cannot have an index value for a single file!***\n");
314 0           exit;
315             }
316              
317             #check for out of bounds index
318 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
319 0 0 0       my @files = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR;
  0            
320 0           my $totFiles = @files;
321 0 0         if($self->{fileIndex} > $totFiles){
322 0           print("***ERROR: Index cannot be greater than the number of files in the directory!***\n");
323 0           exit;
324             }
325              
326             }
327              
328              
329              
330              
331             ######################### THE MEATY STUFF ##########################
332              
333             =head3 nerByFile
334              
335             description:
336              
337             Processes the files in the directory given by the 'dir' parameter: each file is run through MetaMap, converted to ARFF, run through WEKA, and averaged.
338             This NER method doesn't move on to the next file until every step has been run on the current file.
339              
340             input:
341              
342             None
343              
344             output:
345              
346             Metamap files, ARFF file sets, Weka file sets, and Averaged Accuracy files
347              
348             example:
349              
350             use NanoB2B::NER;
351             my %params = ();
352             $params{'dir'} = "my_directory";
353             $params{'features'} = "ortho morph text pos cui sem";
354              
355             my $nner = new NanoB2B::NER(\%params);
356             $nner->nerByFile();
357              
358             =cut
359              
360             sub nerByFile{
361 0     0 1   my $self = shift;
362              
363             #open the directory
364 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
365              
366             #if not doing a single file
367 0 0         if(!defined $self->{program_file}){
368 0 0 0       my @files = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
369              
370             #sort by size?
371 0 0         if($self->{sortSize}){
372 0           @files = sortBySize($self, \@files);
373             }
374             #ner the files individually
375 0           my $totalTags = @files;
376 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
377 0           $uniSub->printColorDebug("on_blue", "FILE #$a / $totalTags");
378 0           my $tag = $files[$a - 1];
379            
380             #metamap the files if needed
381 0 0         if(!$self->{import_meta}){
382 0           $uniSub->printColorDebug("bold cyan", "--- METAMAP ---\n");
383 0           my %paramsm = ();
384 0           $paramsm{'directory'} = $self->{program_dir};
385 0           $paramsm{'index'} = $self->{index};
386 0           $paramsm{'metamap_arguments'} = $self->{metamap_arguments};
387 0           $paramsm{'debug'} = $debug;
388 0           $metaboy = NanoB2B::NER::Metaman->new(\%paramsm);
389 0           $metaboy->meta_file($tag);
390             }
391              
392             #arff the file
393 0           $uniSub->printColorDebug("bold magenta", "--- ARFF ---\n");
394             #define arffboy with the parameters
395 0           my %paramsr = ();
396 0           $paramsr{'directory'} = $self->{program_dir};
397 0           $paramsr{'features'} = $self->{features};
398 0           $paramsr{'bucketsNum'} = $self->{bucketsNum};
399 0           $paramsr{'debug'} = $debug;
400 0           $paramsr{'prefix'} = $self->{prefix};
401 0           $paramsr{'suffix'} = $self->{suffix};
402 0           $paramsr{'index'} = $self->{index};
403 0           $paramsr{'stopwords'} = $self->{stopwords};
404 0           $paramsr{'is_cui'} = $self->{is_cui};
405 0           $paramsr{'sparse_matrix'} = $self->{sparse_matrix};
406 0 0         if(!$wcs_found){
407 0           $paramsr{'wcs'} = $self->{wcs};
408 0           $wcs_found = 1;
409             }
410 0           $arffboy = NanoB2B::NER::Arffman->new(\%paramsr);
411 0           $arffboy->arff_file($tag);
412              
413             #weka the file
414 0 0         if($self->{bucketsNum} > 1){
415 0           $uniSub->printColorDebug("bold yellow", "--- WEKA ---\n");
416             #define wekaboy with the parameters
417 0           my %paramsw = ();
418 0           $paramsw{'directory'} = $self->{program_dir};
419 0           $paramsw{'type'} = $self->{weka_type};
420 0           $paramsw{'weka_size'} = $self->{weka_size};
421 0           $paramsw{'features'} = $self->{features};
422 0           $paramsw{'buckets'} = $self->{bucketsNum};
423 0           $paramsw{'debug'} = $debug;
424 0           $wekaboy = NanoB2B::NER::Wekaman->new(\%paramsw);
425 0           $wekaboy->weka_file($tag);
426             }
427              
428             #model the file
429 0 0         if($self->{model}){
430 0           $uniSub->printColorDebug("bold red", "--- MODEL ---\n");
431 0           my %paramsmo = ();
432 0           $paramsmo{'directory'} = $self->{program_dir};
433 0           $paramsmo{'type'} = $self->{weka_type};
434 0           $paramsmo{'weka_size'} = $self->{weka_size};
435 0           $paramsmo{'features'} = $self->{features};
436 0           $paramsmo{'buckets'} = $self->{bucketsNum};
437 0           $paramsmo{'debug'} = $debug;
438 0           $modelboy = NanoB2B::NER::Modelman->new(\%paramsmo);
439 0           $modelboy->make_model_file($tag);
440             }
441              
442             #average the file
443 0 0         if($self->{bucketsNum} > 1){
444 0           $uniSub->printColorDebug("bold green", "--- AVG ---\n");
445             #define avgboy with the parameters
446 0           my %paramsa = ();
447 0           $paramsa{'directory'} = $self->{program_dir};
448 0           my @a = split(/\./, $self->{weka_type});
449 0           $paramsa{'weka_dir'} = $a[$#a];
450 0           $paramsa{'features'} = $self->{features};
451 0           $paramsa{'buckets'} = $self->{bucketsNum};
452 0           $paramsa{'debug'} = $debug;
453 0           $avgboy = NanoB2B::NER::Avgman->new(\%paramsa);
454 0           $avgboy->avg_file($tag);
455             }
456            
457              
458 0           $uniSub->printColorDebug("on_blue", "## FINISHED #$a - $tag! ##");
459             }
460             }else{
461 0           my $tag = $self->{program_file};
462 0           $uniSub->printColorDebug("on_blue", "FILE #$tag");
463            
464             #metamap the files if needed
465 0 0         if(!$self->{import_meta}){
466 0           $uniSub->printColorDebug("bold cyan", "--- METAMAP ---\n");
467 0           my %paramsm = ();
468 0           $paramsm{'directory'} = $self->{program_dir};
469 0           $paramsm{'index'} = $self->{index};
470 0           $paramsm{'metamap_arguments'} = $self->{metamap_arguments};
471 0           $paramsm{'debug'} = $debug;
472 0           $metaboy = NanoB2B::NER::Metaman->new(\%paramsm);
473 0           $metaboy->meta_file($tag);
474             }
475              
476             #arff the file
477 0           $uniSub->printColorDebug("bold magenta", "--- ARFF ---\n");
478             #define arffboy with the parameters
479 0           my %paramsr = ();
480 0           $paramsr{'directory'} = $self->{program_dir};
481 0           $paramsr{'features'} = $self->{features};
482 0           $paramsr{'bucketsNum'} = $self->{bucketsNum};
483 0           $paramsr{'debug'} = $debug;
484 0           $paramsr{'prefix'} = $self->{prefix};
485 0           $paramsr{'suffix'} = $self->{suffix};
486 0           $paramsr{'index'} = $self->{index};
487 0           $paramsr{'stopwords'} = $self->{stopwords};
488 0           $paramsr{'is_cui'} = $self->{is_cui};
489 0           $paramsr{'sparse_matrix'} = $self->{sparse_matrix};
490 0 0         if(!$wcs_found){
491 0           $paramsr{'wcs'} = $self->{wcs};
492 0           $wcs_found = 1;
493             }
494 0           $arffboy = NanoB2B::NER::Arffman->new(\%paramsr);
495 0           $arffboy->arff_file($tag);
496              
497             #weka the file
498 0 0         if($self->{bucketsNum} > 1){
499 0           $uniSub->printColorDebug("bold yellow", "--- WEKA ---\n");
500             #define wekaboy with the parameters
501 0           my %paramsw = ();
502 0           $paramsw{'directory'} = $self->{program_dir};
503 0           $paramsw{'type'} = $self->{weka_type};
504 0           $paramsw{'weka_size'} = $self->{weka_size};
505 0           $paramsw{'features'} = $self->{features};
506 0           $paramsw{'buckets'} = $self->{bucketsNum};
507 0           $paramsw{'debug'} = $debug;
508 0           $wekaboy = NanoB2B::NER::Wekaman->new(\%paramsw);
509 0           $wekaboy->weka_file($tag);
510             }
511              
512             #model the file
513 0 0         if($self->{model}){
514 0           $uniSub->printColorDebug("bold red", "--- MODEL ---\n");
515 0           my %paramsmo = ();
516 0           $paramsmo{'directory'} = $self->{program_dir};
517 0           $paramsmo{'type'} = $self->{weka_type};
518 0           $paramsmo{'weka_size'} = $self->{weka_size};
519 0           $paramsmo{'features'} = $self->{features};
520 0           $paramsmo{'buckets'} = $self->{bucketsNum};
521 0           $paramsmo{'debug'} = $debug;
522 0           $modelboy = NanoB2B::NER::Modelman->new(\%paramsmo);
523 0           $modelboy->make_model_file($tag);
524             }
525              
526             #average the file
527 0 0         if($self->{bucketsNum} > 1){
528 0           $uniSub->printColorDebug("bold green", "--- AVG ---\n");
529             #define avgboy with the parameters
530 0           my %paramsa = ();
531 0           $paramsa{'directory'} = $self->{program_dir};
532 0           my @a = split(/\./, $self->{weka_type});
533 0           $paramsa{'weka_dir'} = $a[$#a];
534 0           $paramsa{'features'} = $self->{features};
535 0           $paramsa{'buckets'} = $self->{bucketsNum};
536 0           $paramsa{'debug'} = $debug;
537 0           $avgboy = NanoB2B::NER::Avgman->new(\%paramsa);
538 0           $avgboy->avg_file($tag);
539             }
540            
541              
542 0           $uniSub->printColorDebug("on_blue", "## FINISHED #$tag! ##");
543             }
544            
545             }
546              
547              
548             =head3 nerByMethod
549              
550             description:
551              
552             Processes the files in the directory given by the 'dir' parameter: all files are run through MetaMap, then converted to ARFF, then run through WEKA, then averaged.
553             This NER method doesn't move on to the next step until all the files have been processed by the current one.
554              
555             input:
556              
557             None
558              
559             output:
560              
561             Metamap files, ARFF file sets, Weka file sets, and Averaged Accuracy files
562              
563             example:
564              
565             use NanoB2B::NER;
566             my %params = ();
567             $params{'dir'} = "my_directory";
568             $params{'features'} = "ortho morph text pos cui sem";
569              
570             my $nner = new NanoB2B::NER(\%params);
571             $nner->nerByMethod();
572              
573             =cut
574              
575             sub nerByMethod{
576 0     0 1   my $self = shift;
577              
578             #meta the files if needed
579 0 0         if(!$self->{import_meta}){
580 0           $self->metaSet();
581             }
582            
583             #arff the files
584 0           $self->arffSet();
585              
586             #weka the files
587 0 0         if($self->{bucketsNum} > 1){
588 0           $self->wekaSet();
589             }
590            
591              
592 0 0         if($self->{model}){
593 0           $self->modelSet();
594             }
595              
596             #average the files
597 0 0         if($self->{bucketsNum} > 1){
598 0           $self->avgSet();
599             }
600             }
601              
602             =head3 metaSet
603              
604             description:
605              
606             Runs a set of files through metamap
607              
608             input:
609              
610             None
611              
612             output:
613              
614             Metamap files for every file found in the directory specified in the constructor parameters
615              
616             example:
617              
618             use NanoB2B::NER;
619             my %params = ();
620             $params{'dir'} = "my_directory";
621             $params{'features'} = "ortho morph text pos cui sem";
622              
623             my $nner = new NanoB2B::NER(\%params);
624             $nner->metaSet();
625              
626             =cut
627             sub metaSet{
628 0     0 1   my $self = shift;
629              
630 0           print "\tDIR: " . $self->{'program_dir'} . "\n";
631            
632             #open the directory
633 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
634 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
635 0           my $totalTags = @tags;
636              
637             #if only file
638 0 0         if($self->{program_file}){
639 0           $totalTags = 1;
640             }
641              
642             #sort by size?
643 0 0         if($self->{sortSize}){
644 0           @tags = sortBySize($self, \@tags);
645             }
646              
647             #if only one file reduce it to the one
648 0 0         if(defined $self->{program_file}){
649 0           @tags = ($self->{program_file});
650             }
651              
652             #define metaboy with the parameters
653 0           my %params = ();
654 0           $params{'directory'} = $self->{program_dir};
655 0           $params{'index'} = $self->{index};
656 0           $params{'metamap_arguments'} = $self->{metamap_arguments};
657 0           $params{'debug'} = $debug;
658 0           $metaboy = NanoB2B::NER::Metaman->new(\%params);
659            
660             #run set through metamap
661 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
662 0           $uniSub->printColorDebug("bold cyan", "META FILE #$a / $totalTags\n");
663 0           my $tag = $tags[$a - 1];
664 0           $metaboy->meta_file($tag);
665 0           $uniSub->printColorDebug("bold cyan", "## FINISHED METAMAP #$a - $tag! ##\n");
666             }
667             }
668              
669             =head3 arffSet
670              
671             description:
672              
673             Turns a set of files into ARFF files based on the features specified in the constructor parameters
674              
675             input:
676              
677             None
678              
679             output:
680              
681             ARFF file sets for every file found in the directory specified in the constructor parameters
682              
683             example:
684              
685             use NanoB2B::NER;
686             my %params = ();
687             $params{'dir'} = "my_directory";
688             $params{'features'} = "ortho morph text pos cui sem";
689              
690             my $nner = new NanoB2B::NER(\%params);
691             $nner->arffSet();
692              
693             =cut
694             sub arffSet{
695 0     0 1   my $self = shift;
696              
697             #open the directory
698 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
699 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
700 0           my $totalTags = @tags;
701              
702             #if only one file
703 0 0         if($self->{program_file}){
704 0           $totalTags = 1;
705             }
706              
707             #sort by size?
708 0 0         if($self->{sortSize}){
709 0           @tags = sortBySize($self, \@tags);
710             }
711              
712             #if only one file reduce it to the one
713 0 0         if(defined $self->{program_file}){
714 0           @tags = ($self->{program_file});
715             }
716              
717             #define arffboy with the parameters
718 0           my %params = ();
719 0           $params{'directory'} = $self->{'program_dir'};
720 0           $params{'features'} = $self->{'features'};
721 0           $params{'bucketsNum'} = $self->{'bucketsNum'};
722 0           $params{'debug'} = $debug;
723 0           $params{'prefix'} = $self->{'prefix'};
724 0           $params{'suffix'} = $self->{'suffix'};
725 0           $params{'index'} = $self->{'index'};
726 0           $params{'stopwords'} = $self->{'stopwords'};
727 0           $params{'is_cui'} = $self->{'is_cui'};
728 0           $params{'sparse_matrix'} = $self->{'sparse_matrix'};
729 0           $params{'wcs'} = $self->{'wcs'};
730 0           $arffboy = NanoB2B::NER::Arffman->new(\%params);
731              
732 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
733 0           $uniSub->printColorDebug("bold magenta", "ARFF FILE #$a / $totalTags\n");
734 0           my $tag = $tags[$a - 1];
735 0 0         if($wcs_found){
736 0           $params{'wcs'} = "";
737 0           $arffboy = NanoB2B::NER::Arffman->new(\%params);
738             }else{
739 0           $wcs_found = 1;
740             }
741 0           $arffboy->arff_file($tag);
742 0           $uniSub->printColorDebug("bold magenta", "## FINISHED ARFF #$a - $tag! ##\n");
743             }
744             }
745              
746             =head3 wekaSet
747              
748             description:
749              
750             Runs a set of ARFF files through WEKA
751              
752             input:
753              
754             None
755              
756             output:
757              
758             WEKA files for every file found in the directory specified in the constructor parameters
759              
760             example:
761              
762             use NanoB2B::NER;
763             my %params = ();
764             $params{'dir'} = "my_directory";
765             $params{'features'} = "ortho morph text pos cui sem";
766              
767             my $nner = new NanoB2B::NER(\%params);
768             $nner->wekaSet();
769              
770             =cut
771             sub wekaSet{
772 0     0 1   my $self = shift;
773              
774             #open the directory
775 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
776 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
777 0           my $totalTags = @tags;
778              
779             #if only one file
780 0 0         if($self->{program_file}){
781 0           $totalTags = 1;
782             }
783              
784             #sort by size?
785 0 0         if($self->{sortSize}){
786 0           @tags = sortBySize($self, \@tags);
787             }
788              
789             #if only one file reduce it to the one
790 0 0         if(defined $self->{program_file}){
791 0           @tags = ($self->{program_file});
792             }
793              
794             #define wekaboy with the parameters
795 0           my %params = ();
796 0           $params{'directory'} = $self->{program_dir};
797 0           $params{'type'} = $self->{weka_type};
798 0           $params{'weka_size'} = $self->{weka_size};
799 0           $params{'features'} = $self->{features};
800 0           $params{'buckets'} = $self->{bucketsNum};
801 0           $params{'debug'} = $debug;
802 0           $wekaboy = NanoB2B::NER::Wekaman->new(\%params);
803              
804 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
805 0           $uniSub->printColorDebug("bold yellow", "## WEKA FILE #$a / $totalTags ##\n");
806 0           my $tag = $tags[$a - 1];
807 0           $wekaboy->weka_file($tag);
808 0           $uniSub->printColorDebug("bold yellow", "## FINISHED WEKA #$a - $tag! ##\n");
809 0           sleep(1);
810             }
811             }
812              
813             =head3 modelSet
814              
815             description:
816              
817             Creates WEKA models from the training ARFF files
818              
819             input:
820              
821             None
822              
823             output:
824              
825             WEKA model files for every file with training ARFF files
826              
827             example:
828              
829             use NanoB2B::NER;
830             my %params = ();
831             $params{'dir'} = "my_directory";
832             $params{'features'} = "ortho morph text pos cui sem";
833              
834             my $nner = new NanoB2B::NER(\%params);
835             $nner->modelSet();
836              
837             =cut
838             sub modelSet{
839 0     0 1   my $self = shift;
840              
841             #open the directory
842 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
843 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
844 0           my $totalTags = @tags;
845              
846             #if only one file
847 0 0         if($self->{program_file}){
848 0           $totalTags = 1;
849             }
850              
851             #sort by size?
852 0 0         if($self->{sortSize}){
853 0           @tags = sortBySize($self, \@tags);
854             }
855              
856             #if only one file reduce it to the one
857 0 0         if(defined $self->{program_file}){
858 0           @tags = ($self->{program_file});
859             }
860              
861             #define wekaboy with the parameters
862 0           my %params = ();
863 0           $params{'directory'} = $self->{program_dir};
864 0           $params{'type'} = $self->{weka_type};
865 0           $params{'weka_size'} = $self->{weka_size};
866 0           $params{'features'} = $self->{features};
867 0           $params{'buckets'} = $self->{bucketsNum};
868 0           $params{'debug'} = $debug;
869 0           $modelboy = NanoB2B::NER::Modelman->new(\%params);
870              
871 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
872 0           $uniSub->printColorDebug("bold red", "## MODEL FILE #$a / $totalTags ##\n");
873 0           my $tag = $tags[$a - 1];
874 0           $modelboy->make_model_file($tag);
875 0           $uniSub->printColorDebug("bold red", "## FINISHED MODEL #$a - $tag! ##\n");
876 0           sleep(1);
877             }
878             }
879              
880              
881             =head3 avgSet
882              
883             description:
884              
885             Averages together a set of WEKA files
886              
887             input:
888              
889             None
890              
891             output:
892              
893             Average accuracy files for every file found in the directory specified in the constructor parameters
894              
895             example:
896              
897             use NanoB2B::NER;
898             my %params = ();
899             $params{'dir'} = "my_directory";
900             $params{'features'} = "ortho morph text pos cui sem";
901              
902             my $nner = new NanoB2B::NER(\%params);
903             $nner->avgSet();
904              
905             =cut
906             sub avgSet{
907 0     0 1   my $self = shift;
908              
909             #open the directory
910 0 0         opendir (my $DIR, $self->{program_dir}) or die $!;
911 0 0 0       my @tags = grep { $_ ne '.' and $_ ne '..' and substr($_, 0, 1) ne '_'} readdir $DIR; #get each file from the directory
  0            
912 0           my $totalTags = @tags;
913              
914             #if only one file
915 0 0         if($self->{program_file}){
916 0           $totalTags = 1;
917             }
918              
919             #sort by size?
920 0 0         if($self->{sortSize}){
921 0           @tags = sortBySize($self, \@tags);
922             }
923              
924             #if only one file reduce it to the one
925 0 0         if(defined $self->{program_file}){
926 0           @tags = ($self->{program_file});
927             }
928              
929             #define avgboy with the parameters
930 0           my %params = ();
931 0           $params{'directory'} = $self->{program_dir};
932 0           my @a = split(/\./, $self->{weka_type});
933 0           $params{'weka_dir'} = $a[$#a];
934 0           $params{'features'} = $self->{features};
935 0           $params{'buckets'} = $self->{bucketsNum};
936 0           $params{'debug'} = $debug;
937 0           $avgboy = NanoB2B::NER::Avgman->new(\%params);
938              
939 0           for(my $a = $self->{fileIndex}; $a <= $totalTags; $a++){
940 0           $uniSub->printColorDebug("bold green", "AVG FILE #$a / $totalTags\n");
941 0           my $tag = $tags[$a - 1];
942 0           $avgboy->avg_file($tag);
943 0           $uniSub->printColorDebug("bold green", "## FINISHED AVERAGING #$a - $tag! ##\n");
944             }
945             }
946              
947              
948             #sorts a directory by the file size
949             # input : @files <-- get the list of files in the folder
950             # output : @newset <-- the set of files ordered by size from smallest to largest
951             sub sortBySize{
952 0     0 0   my $self = shift;
953 0           my $files_ref = shift;
954 0           my @files = @$files_ref;
955              
956 0           my %hash = ();
957 0           my @newSet = ();
958              
959 0           my $dir = $self->{program_dir};
960              
961             #create hashmap
962 0           foreach my $file (@files){
963 0           my $s = -s "$dir/$file";
964 0           $hash{$s} = $file;
965             }
966              
967             #add sorted sizes to array
968 0           foreach my $key (sort { $a <=> $b } keys %hash){
  0            
969 0           my $name = $hash{$key};
970             #printColorDebug("cyan", "$name - $key\n");
971 0           push @newSet, $name;
972             }
973              
974 0           return @newSet;
975             }
976              
977             1;
978              
979             =head1 SEE ALSO
980              
981             =head1 AUTHOR
982              
983             Megan Charity
984             Bridget T McInnes
985              
986             =head1 COPYRIGHT
987              
988             Copyright (c) 2017
989             Megan Charity, Virginia Commonwealth University
990             charityml at vcu.edu
991              
992             Bridget T. McInnes, Virginia Commonwealth University
993             btmcinnes at vcu.edu
994              
995             This program is free software; you can redistribute it and/or modify it under
996             the terms of the GNU General Public License as published by the Free Software
997             Foundation; either version 2 of the License, or (at your option) any later
998             version.
999              
1000             This program is distributed in the hope that it will be useful, but WITHOUT
1001             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1002             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
1003              
1004             You should have received a copy of the GNU General Public License along with
1005             this program; if not, write to
1006              
1007             The Free Software Foundation, Inc.,
1008             59 Temple Place - Suite 330,
1009             Boston, MA 02111-1307, USA.
1010              
1011             =cut