File Coverage

blib/lib/Data/FeatureFactory.pm
Criterion Covered Total %
statement 568 667 85.1
branch 321 416 77.1
condition 106 133 79.7
subroutine 23 24 95.8
pod 1 7 14.2
total 1019 1247 81.7


line stmt bran cond sub pod time code
1             package Data::FeatureFactory;
2              
3 2     2   132317 use strict;
  2         4  
  2         64  
4 2     2   12 use Carp;
  2         4  
  2         153  
5 2     2   11 use File::Basename;
  2         8  
  2         144  
6 2     2   10 use Scalar::Util;
  2         3  
  2         744  
7              
8             our $VERSION = '0.0404';
9             my $PATH = &{ sub { return dirname( (caller)[1] ) } };
10             my $OPEN_OPTIONS;
11             our $CURRENT_FEATURE;
12             my %KNOWN_FORMATS = map {;$_=>1} qw/binary normal numeric/;
13              
14             # check if perl can open files in utf8
15             {
16             my $fh;
17             undef $@;
18 2     2   20 eval { open $fh, '<:encoding(utf8)', $0 };
  2         4  
  2         50  
19             if ($@) {
20             $OPEN_OPTIONS = '';
21             warn qq{the open's :encoding directive not supported by your perl ($]). Files won't be opened in utf8 format.};
22             }
23             else { $OPEN_OPTIONS = ':encoding(utf8)' }
24             close $fh;
25             }
26              
27             sub new : method {
28 30     30 0 39586 my ($class, $args) = @_;
29 30 50       95 $class = ref $class if ref $class;
30 30 50       77 croak "Too many parameters to $class->new" if @_ > 2;
31 30         62 my $self = bless +{}, $class;
32            
33 30 100       80 if (defined $args) {
34 3 50       30 croak "The parameter to ${class}->new must be a hashref with options or nothing" if ref $args ne 'HASH';
35 3         8 my %accepted_option = map {;$_=>1} qw(N/A);
  3         16  
36 3         19 while (my ($k, $v) = each %$args) {
37 3 50       14 if (not exists $accepted_option{$k}) {
38 0         0 croak "Unexpected option '$k' passed to ${class}->new"
39             }
40 3 50       11 if ($k eq 'N/A') {
41 3         34 $self->{'N/A'} = "$v";
42             }
43             }
44             }
45            
46 2     2   16 no strict 'refs';
  2         5  
  2         1250  
47 30 50       35 if (not defined @{$class."::features"}) {
  30         164  
48 0         0 croak "\@${class}::features not defined";
49             }
50 30         33 our @features;
51 30         36 *features = \@{$class."::features"};
  30         88  
52 2     2   12 use strict;
  2         3  
  2         790  
53 30 50       75 if (not @features) {
54 0         0 warn "$class has empty set of features. Not much fun";
55             }
56 30         168 $self->{'features'} = [];
57 30         70 my %feat_named;
58 30         63 $self->{'feat_named'} = \%feat_named;
59 30         32 my @featkeys;
60 30         47 $self->{'featkeys'} = \@featkeys;
61 30         1116 $self->{'caller_path'} = dirname( (caller)[1] );
62            
63 30         66 my %supported_option = ( map {;$_=>1} qw(code default format label name postproc range type values values_file) );
  300         546  
64 30         97 my %accepted_option = ( map {;$_=>1} qw(cat2num cat2num_dyna num2cat num2cat_dyna num_values_fh values_ordered) );
  180         336  
65            
66             # parse the @features array
67 30         76 for my $original_feature (@features) {
68 110         395 my $feature = { %$original_feature };
69 110 100       325 if (not exists $feature->{'name'}) {
70 1         217 croak q{There was a feature without a name. Each record in the @features array must be a hashref with a 'name' field at least};
71             }
72 109         182 my $name = $feature->{'name'};
73 109 100       223 if (exists $feat_named{$name}) {
74 1         177 croak "Feature $name specified twice in \@${class}::features";
75             }
76 108         121 push @{ $self->{'features'} }, $feature;
  108         277  
77 108         285 $feat_named{$name} = $feature;
78 108         156 push @featkeys, $name;
79            
80             # Check if there aren't illegal options
81 108         306 for (keys %$feature) {
82 299 100       695 if (not exists $supported_option{$_}) {
83 2 100       8 if (exists $accepted_option{$_}) {
84 1         13 warn "Option '$_' you specified for feature '$name' is not supported. Be sure you know what you are doing"
85             }
86             else {
87 1         169 croak "Unrecognized option '$_' specified for feature '$name'";
88             }
89             }
90             }
91            
92             # Check if a postprocessing subroutine is declared
93             # If it's a CODEref, we're OK. Else try to load it.
94 107 100 100     398 if (exists $feature->{'postproc'} and ref $feature->{'postproc'} ne 'CODE') {
95 4         5 my $postproc = $feature->{'postproc'};
96 2     2   14 no strict 'refs';
  2         3  
  2         4781  
97 4         7 my $postprocsub = \&{$postproc};
  4         57  
98 4         7 undef $@;
99 4         6 eval { $postprocsub->() };
  4         35  
100 4 100       127 if ($@ =~ /Undefined subroutine/) {
    50          
101 3         13 my ($package_name) = $postproc =~ /^( (?: \w+:: )+ )/x;
102 3         4 my $ppname;
103 3 100 66     16 if (defined $package_name and length $package_name > 0) {
104 2         8 $package_name =~ s/::$//;
105 2         14 local @INC = (@INC, $self->{'caller_path'});
106 2         3 undef $@;
107 2         135 eval "require $package_name";
108 2 50       23660 if ($@) {
109 0         0 warn "Failed loading module '$package_name'";
110             }
111 2         11 $ppname = $postproc;
112             }
113             else {
114 1         4 $ppname = $class.'::'.$postproc;
115             }
116 3         7 $postprocsub = \&{$ppname};
  3         8  
117 3         5 undef $@;
118 3         7 eval { $postprocsub->() };
  3         6  
119 3 50       506 if ($@ =~ /^Undefined subroutine/) {
120 0         0 croak "Couldn't load postprocessing function '$postproc' ($@)"
121             }
122             }
123             elsif ($@ =~ /^Undefined subroutine/) {
124 0         0 croak "Couldn't load postprocessing function '$postproc' ($@)"
125             }
126 4         12 $feature->{'postproc'} = $postprocsub;
127             }
128            
129             # Check if values are specified and if they are a list of values.
130 107 100       233 if (exists $feature->{'values'}) {
131 29 100       71 if (exists $feature->{'values_file'}) {
132 1         236 croak "Values specified both explicitly and by file for '$name'"
133             }
134 28         48 my $values = $feature->{'values'};
135 28 100       105 if (ref $values eq 'HASH') { # OK, do nothing
    50          
136             }
137             elsif (ref $values eq 'ARRAY') { # Convert the list to a hash.
138 24         44 my %values = map {;$_ => 1} @$values;
  228         573  
139 24         69 $feature->{'values_ordered'} = $values;
140 24         54 $feature->{'values'} = \%values;
141             }
142             else {
143 0         0 my $type;
144 0 0       0 if (ref $values) {
145 0         0 $type = lc(ref $values).'ref';
146             }
147             else {
148 0         0 $type = lc(ref \$values);
149             }
150 0         0 croak "The values must be specified as an arrayref or hashref, not $type"
151             }
152             }
153            
154 106 100       266 if (exists $feature->{'values_file'}) {
155 4         19 my $values_fn = $feature->{'values_file'};
156 4         143 my $opened = open my $values_fh, '<'.$OPEN_OPTIONS, $values_fn;
157 4 50       21 if (not $opened) {
158 4 50       179 open $values_fh, '<'.$OPEN_OPTIONS, $self->{'caller_path'}.'/'.$values_fn
159             or croak "Couldn't open file '$values_fn' specifying values for $name"
160             }
161 4         249 my %values;
162             my @values;
163 4         154 while (<$values_fh>) {
164 104         168 chomp;
165 104         285 $values{$_} = 1;
166 104         361 push @values, $_;
167             }
168 4         369 close $values_fh;
169 4         17 $feature->{'values'} = \%values;
170 4         33 $feature->{'values_ordered'} = \@values;
171             }
172            
173 106 100       212 if (exists $feature->{'range'}) {
174 13 100       31 if (exists $feature->{'values'}) {
175 2         363 croak "Both range and values specified for feature '$name'"
176             }
177 11 100       233 $feature->{'range'} =~ /^ (.+?) \s* \.{2,} \s* (.+) $/x
178             or croak "Malformed range '$$feature{range}' of feature '$name'. Should be in format '0 .. 5'";
179 10         55 my $l = $1+0;
180 10         40 my $r = $2+0;
181 10 100       36 if (not $l < $r) {
182 1         156 croak "Invalid range '$$feature{range}' specified for feature '$name'. The left boundary must be lesser than the right one"
183             }
184            
185 9 100       46 if ($feature->{'type'} =~ /^int/i) {
    50          
186 7         29 $feature->{'values'} = {map {;$_ => 1} $l .. $r};
  37         90  
187 7         30 $feature->{'values_ordered'} = [$l .. $r];
188             }
189             elsif ($feature->{'type'} =~ /^num/i) {
190 2         5 $feature->{'range_l'} = $l;
191 2         6 $feature->{'range_r'} = $r;
192             }
193             }
194            
195 102 100       217 if (exists $feature->{'default'}) {
196 14 50 66     43 if (not exists $feature->{'values'} and not exists $feature->{'range_l'}) {
197 0         0 croak "Default value '$$feature{default}' but no values specified for feature '$name'"
198             }
199             }
200            
201 102 100       223 if (exists $feature->{'type'}) {
202 41         90 my $type = lc substr $feature->{'type'}, 0, 3;
203 41         71 my $type_OK = grep {$type eq $_} qw(boo int num cat);
  164         270  
204 41 50       85 if (not $type_OK) {
205 0         0 croak "The type of a feature, if given, should be 'integer', 'numeric', or 'categorial'"
206             }
207 41         78 $feature->{'type'} = $type;
208            
209             # check if the values comply to the type
210 41 100       183 if ($type eq 'boo') {
    100          
    100          
211 15 100       34 if (exists $feature->{'values'}) {
212 5 50       10 my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} });
  5         12  
  0         0  
213 5 100       12 if (@values > 2) {
214 1         2 my $num_values = @values;
215 1         154 croak "More than two values ($num_values) specified for feature '$name'"
216             }
217 4         5 my ($false, $true);
218             # boolify the values
219 4         7 for (@values) {
220 7 100       11 if ($_) {
221 4 100       9 if (defined $true) {
222 1         257 croak "True value (literal: '$true', '$_') for feature '$name' specified multiple times"
223             }
224 3         3 $true = $_;
225             }
226             else {
227 3 100       9 if (defined $false) {
228 1         162 croak "False value (literal: '$false', '$_') for feature '$name' specified multiple times"
229             }
230 2         2 $false = $_;
231             }
232 5 100       12 $_ = $_ ? 1 : 0;
233             }
234 2 50       6 if (exists $feature->{'values_ordered'}) {
235 2         4 $feature->{'values_ordered'} = \@values;
236             }
237 2         3 $feature->{'values'} = +{ map {;$_=>1} @values };
  3         10  
238             }
239             else {
240 10         37 $feature->{'values'} = {0 => 1, 1 => 1};
241 10         25 $feature->{'values_ordered'} = [0,1];
242             }
243 12 100       41 if (exists $feature->{'default'}) {
244 2         3 my $def = $feature->{'default'};
245 2         3 my @vals = values %{ $feature->{'values'} };
  2         6  
246 2 100       6 if (@vals > 1) {
247 1         169 croak "Default value '$def' specified for boolean feature '$name' which has both values allowed"
248             }
249 1 50 25     10 unless ($def xor $vals[0]) {
250 1 50       7 my $val = $def ? 'true' : 'false';
251 1         173 croak "Default and allowed value are both $val for feature '$name'";
252             }
253 0 0       0 $feature->{'default'} = $def ? 1 : 0;
254             }
255             }
256             elsif ($type eq 'int') {
257 14 100       59 if (exists $feature->{'values'}) {
258 7 50       23 my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} });
  7         24  
  0         0  
259             # integrify the values
260 7         15 for (@values) {
261 37         52 $_ = int $_;
262             }
263 7 50       20 if (exists $feature->{'values_ordered'}) {
264 7         16 $feature->{'values_ordered'} = \@values;
265             }
266 7         15 $feature->{'values'} = +{ map {;$_=>1} @values };
  37         81  
267             }
268 14 100       57 if (exists $feature->{'default'}) {
269 1         40 $feature->{'default'} = int $feature->{'default'};
270             }
271             }
272             elsif ($type eq 'num') {
273             # numify the features, producing warnings if used
274 9 50       30 if (exists $feature->{'values'}) {
275 0 0       0 my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} });
  0         0  
  0         0  
276 0         0 for (@values) {
277 0         0 $_ += 0;
278             }
279 0 0       0 if (exists $feature->{'values_ordered'}) {
280 0         0 $feature->{'values_ordered'} = \@values;
281             }
282 0         0 $feature->{'values'} = +{ map {;$_=>1} @values };
  0         0  
283             }
284 9 100       29 if (exists $feature->{'default'}) {
285 1         3 $feature->{'default'} += 0;
286             }
287             }
288             }
289            
290 97 100       280 if (exists $feature->{'format'}) {
291 9         21 my $format = $feature->{'format'};
292 9 100       51 if (not $format =~ /^ (?: normal | numeric | binary ) $/x) {
293 1         245 croak "Invalid format '$format' specified for feature '$name'. Please specify 'normal', 'numeric' or 'binary'"
294             }
295 8 100 66     33 if (not exists $feature->{'values'} and $format eq 'binary') {
296 1         228 croak "Feature '$name' has format: 'binary' specified but doesn't have values specified"
297             }
298             }
299            
300             # find the actual code of the feature
301 95         100 my $code;
302 2     2   14 no strict 'refs';
  2         5  
  2         12650  
303 95 100 100     180 if (exists $feature->{'code'}) {
  49 100       235  
304 46         66 $code = $feature->{'code'};
305 46 50       110 if (ref $code ne 'CODE') {
306 0         0 croak "'code' was specified for feature '$name' but it's not a coderef"
307             }
308             }
309 2         12 elsif (%{$class.'::features'} and exists ${$class.'::features'}{$name}) {
310 1         2 $code = ${$class.'::features'}{$name};
  1         4  
311 1 50       13 if (ref $code ne 'CODE') {
312 0         0 croak "Found $name in \%${class}::features but it's not a coderef"
313             }
314             }
315             else {
316 48         48 $code = *{$class.'::'.$name}{CODE};
  48         149  
317 48 50       134 if (ref $code ne 'CODE') {
318 0         0 croak "Couldn't find the code (function) for feature '$name'. Define it as a function '$name' in the '$class' package. Stopped"
319             }
320             }
321 95         241 $feature->{'code'} = $code;
322            
323 95 100       281 if (exists $feature->{'label'}) {
324 23         37 my $label = $feature->{'label'};
325 23 100       54 if (ref $label eq 'ARRAY') {
    50          
326 13         21 $feature->{'label'} = {map {;uc($_) => 1} @$label};
  31         119  
327             }
328             elsif (ref $label) {
329 0         0 croak "Label must be a string or an array of strings - feature '$name' has a ".ref($label).'ref'
330             }
331             else {
332 10         102 $feature->{'label'} = {uc($label) => 1};
333             }
334             }
335             }
336             # print map "*$_\n", map keys(%$_), @{ $self->{'features'} };
337 15         104 return $self
338             }
339              
340             sub expand_names : method {
341 194     194 0 339 my ($self, $featnames) = @_;
342 194 100 100     970 if (not ref $featnames and exists $self->{expand_names_cache}{$featnames}) {
343 105         329 return $self->{expand_names_cache}{$featnames}
344             }
345 89         107 my $orig_featnames = $featnames;
346 89         100 my @featkeys = @{ $self->{'featkeys'} };
  89         318  
347 89         125 my %feat_named = %{ $self->{'feat_named'} };
  89         501  
348            
349 89 100 66     494 if ($featnames eq 'ALL') {
    100          
    100          
350 4         10 $featnames = \@featkeys;
351             }
352             elsif (ref $featnames eq 'ARRAY') {
353             # $featnames = [@$featnames]; # make a copy
354             }
355             # features given by labels
356             elsif ($featnames !~ /[[:lower:]]/ and $featnames =~ /[[:upper:]]/) {
357 10         36 my @all_labels = split /\s+/, $featnames;
358 10         19 my @plus_labels = map {s/^\+//; $_} grep {substr($_, 0, 1) ne '-'} @all_labels;
  12         20  
  12         26  
  19         53  
359 10         52 my @minus_labels = map {substr $_, 1} grep {substr($_, 0, 1) eq '-'} @all_labels;
  7         19  
  19         43  
360             # Specifying just '-LABEL' means all but those that have LABEL
361 10 100 66     42 if (@plus_labels == 0 and @minus_labels > 0) {
362 3         7 @plus_labels = qw(ALL);
363             }
364 10 100       21 if (grep {$_ eq 'ALL'} @minus_labels) {
  7         20  
365 1         205 croak "Label 'ALL' is special and can't be used with the minus sign, as in $featnames"
366             }
367 9         15 $featnames = [];
368 9         16 for my $featkey (@featkeys) {
369 127         197 my $feature = $feat_named{ $featkey };
370 127 100       134 my $included = grep { $_ eq 'ALL' or exists $feature->{'label'}{$_} } @plus_labels;
  191         667  
371 127         140 my $excluded = grep { exists $feature->{'label'}{$_} } @minus_labels;
  79         148  
372 127 100 100     500 push @$featnames, $featkey if $included and not $excluded;
373             }
374             }
375             else {
376 25         71 $featnames = ["$featnames"];
377             }
378            
379 88 100       181 if (not ref $orig_featnames) { $self->{expand_names_cache}{$orig_featnames} = $featnames }
  38         97  
380 88         328 return $featnames
381             }
382              
383             sub evaluate : method {
384 165     165 0 35952 my ($self, $featnames, $format, @args) = @_;
385 165         252 my $class = ref $self;
386            
387 165         383 $featnames = $self->expand_names($featnames);
388 164         191 my @feats;
389 164 100       616 if (exists $self->{evaluate_featnames_cache}{"@$featnames"}) {
390 112         119 @feats = @{ $self->{evaluate_featnames_cache}{"@$featnames"} };
  112         473  
391             }
392             else {
393 52         50 my %feat_named = %{ $self->{feat_named} };
  52         259  
394 52         117 for my $featname (@$featnames) {
395 145 50       278 if (not exists $feat_named{$featname}) {
396 0         0 croak "Feature '$featname' you wish to evaluate was not found among known features (these are: @{$self->{featkeys}})"
  0         0  
397             }
398 145         277 push @feats, $feat_named{$featname};
399             }
400 52         263 $self->{evaluate_featnames_cache}{"@$featnames"} = \@feats;
401             }
402            
403 164 100       415 if (not exists $KNOWN_FORMATS{$format}) {
404 1         5 croak "Unknown format: '$format'. Please specify one of: @{[keys %KNOWN_FORMATS]}."
  1         222  
405             }
406 163         243 for my $feature (@feats) {
407 676         1656 $self->_create_mapping($feature, $format);
408             }
409            
410 161 50       382 if (@args == 0) {
411 0         0 warn 'No arguments specified for the features.';
412             }
413             ### Done argument checking.
414            
415             ### Traverse the features and evaluate them
416 161         185 my @rv;
417 161         214 for my $feature (@feats) {
418 670         1114 my $name = $feature->{'name'};
419 670         784 $CURRENT_FEATURE = $name;
420 670         1936 my $normrv = $feature->{'code'}(@args);
421 670         4354 undef $CURRENT_FEATURE;
422 670 100       1667 my $format = exists $feature->{'format'} ? $feature->{'format'} : $format;
423            
424 670 100 100     2367 if (not defined $normrv and exists $self->{'N/A'}) {
425 145         209 my $na = $self->{'N/A'};
426 145 100 100     689 if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') {
    100          
427 41         93 push @rv, $na;
428             }
429             elsif ($format eq 'binary') {
430             # take one of the vectors in cat2bin
431 45         45 my @dummy = @{ (values %{ $feature->{'cat2bin'} })[0] };
  45         47  
  45         209  
432 45 50       200 if (not @dummy) {
433 0         0 croak "Couldn't determine the length of bit vector for feature '$name',"
434             ."which was about to be evaluated in binary and returned undef"
435             }
436 45         349 push @rv, map $na, @dummy;
437             }
438             else {
439 59         137 push @rv, $na;
440             }
441             }
442             else {
443             # Normally format the value. The eval babble is there to take care of unexpected values.
444 525         703 undef $@;
445 525         696 my @val = eval { _format_value($feature, $normrv, $format, @args) };
  525         946  
446 525 100       1188 if ($@) {
447 5 100 66     36 if (ref $@ and $@->isa('Data::FeatureFactory::SoftError')) {
448 4         4 warn ${$@};
  4         28  
449             return
450 4         84 }
451             else {
452 1         6 die $@
453             }
454             }
455 520         1581 push @rv, @val;
456             }
457             }
458            
459 156         1935 return @rv[0 .. $#rv]
460             }
461              
462             sub _format_value {
463 1086     1086   2326 my ($feature, $normrv, $format, @args) = @_;
464 1086         1139 my @rv;
465 1086         1464 my $name = $feature->{'name'};
466 1086         2208 local $\; local $,;
  1086         1303  
467            
468             # convert to number if appropriate
469 1086 100       2450 if (exists $feature->{'type'}) {
470 374         651 my $type = $feature->{'type'};
471 374 100 100     1805 if ($type eq 'num' or $type eq 'int') {
472 185         299 $normrv += 0;
473             }
474 374 100       1351 if ($type eq 'int') {
475 158         169 $normrv = int $normrv;
476             }
477 374 100       884 if ($type eq 'boo') {
478 136 100       368 $normrv = $normrv ? 1 : 0;
479             }
480             }
481            
482             # check if the value is a legal one
483 1086 100       2303 if (exists $feature->{'values'}) {
    100          
484 834 100       2349 if (exists $feature->{'values'}{$normrv}) { # alles gute
    100          
485             }
486             elsif (exists $feature->{'default'}) {
487 307         530 $normrv = $feature->{'default'};
488             }
489             else {
490 4         28 die Data::FeatureFactory::SoftError->new("Feature '$name' returned unexpected value '$normrv' on arguments '@args'")
491             }
492             }
493             # check the range for numeric features
494             elsif (exists $feature->{'range_l'}) {
495 20 50       93 if (not exists $feature->{'range_r'}) {
496 0         0 die "feature '$name' has range_l but not range_r";
497             }
498 20 50       48 if ($normrv < $feature->{'range_l'}) {
499 0 0       0 if (exists $feature->{'default'}) {
500 0         0 $normrv = $feature->{'default'};
501             }
502             else {
503 0         0 die Data::FeatureFactory::SoftError->new(
504             "Feature '$name' returned an unexpected value '$normrv' below the left allowed boundary '$$feature{range_l}'"
505             )
506             }
507             }
508 20 100       48 if ($normrv > $feature->{'range_r'}) {
509 1 50       4 if (exists $feature->{'default'}) {
510 1         2 $normrv = $feature->{'default'};
511             }
512             else {
513 0         0 die Data::FeatureFactory::SoftError->new(
514             "Feature '$name' returned an unexpected value '$normrv' above the right allowed boundary '$$feature{range_r}'"
515             )
516             }
517             }
518             }
519            
520 1082 100       2612 if ($format eq 'normal') {
    100          
    50          
521 425 100       883 if (exists $feature->{'postproc'}) {
522 101         306 $normrv = $feature->{'postproc'}->($normrv);
523             }
524 424         1105 @rv = ($normrv);
525             }
526             elsif ($format eq 'numeric') {
527 414 100 66     1832 if (exists $feature->{'type'} and $feature->{'type'} =~ /^( num | int | boo )$/x) {
    100          
528 143         470 @rv = ($normrv);
529             }
530             elsif (exists $feature->{'cat2num'}) {
531 219 50       518 if (not exists $feature->{'cat2num'}{$normrv}) {
532 0         0 croak "Feature '$name' has the value '$normrv' for which there is no mapping to numbers"
533             }
534 219         568 @rv = ($feature->{'cat2num'}{$normrv});
535             }
536             else { # dynamically creating the mapping
537 52         58 my $n;
538 52 50       159 if (exists $feature->{'cat2num_dyna'}{$normrv}) {
539 52         362 $n = $feature->{'cat2num_dyna'}{$normrv};
540             }
541             else {
542 0         0 $n = ++$feature->{'num_value_max'};
543 0         0 $feature->{'cat2num_dyna'}{$normrv} = $n;
544 0         0 $feature->{'num2cat_dyna'}{$n} = $normrv;
545 0         0 my @toprint = ($normrv, $n);
546 0 0       0 if (exists $feature->{'postproc'}) {
547 0         0 my $ppd = $feature->{'postproc'}->($normrv);
548 0         0 $feature->{'pp2cat_dyna'}{$ppd} = $normrv;
549 0         0 push @toprint, $ppd;
550             }
551 0 0       0 print {$feature->{'num_values_fh'}} join("\t", @toprint)."\n"
  0         0  
552             or croak "Couldn't print the mapping of categorial value '$normrv' to numeric value '$n' for feature '$name' to a file ($!).\n"
553             . 'Please provide a list of values for the feature to avoid this'
554             }
555 52         112 @rv = ($n);
556             }
557             }
558             elsif ($format eq 'binary') {
559 243 100 100     960 if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') {
    50          
560 39         101 @rv = ($normrv);
561             }
562             elsif (not exists $feature->{'cat2bin'}{$normrv}) {
563 0         0 croak "No mapping for value '$normrv' to binary in feature '$name'"
564             }
565             else {
566 204         195 @rv = @{ $feature->{'cat2bin'}{$normrv} };
  204         1350  
567             }
568             }
569             else {
570 0         0 croak "Unrecognized format '$format'"
571             }
572             return @rv
573 1081         6099 }
574              
575             sub _values_of {
576 244     244   283 my ($feature) = @_;
577 244         240 my @values;
578 244 100       671 if (exists $feature->{'values_ordered'}) {
    50          
579 242         224 @values = @{ $feature->{'values_ordered'} };
  242         910  
580             }
581             elsif (exists $feature->{'values'}) {
582 2         3 @values = keys %{ $feature->{'values'} };
  2         13  
583             }
584             else {
585 0         0 croak "Attempted to gather the values of feature '$$feature{name}', which has none specified"
586             }
587 244 100 66     830 if (exists $feature->{'default'} and not exists $feature->{'values'}{ $feature->{'default'} }) {
588 41         63 push @values, $feature->{'default'};
589             }
590             return @values
591 244         790 }
592              
593             sub _create_mapping : method {
594 1049     1049   1482 my ($class, $feature, $format) = @_;
595 1049 50       2554 $class = ref $class if ref $class;
596 1049 100 66     2976 if (exists $feature->{'format'} and $format ne 'postprocd') {
597 211         332 $format = $feature->{'format'};
598             }
599            
600 1049 100       3597 if (lc $format eq 'normal') {
    100          
    100          
    50          
601             }
602             elsif (lc $format eq 'numeric') {
603 279 100 100     1254 return if exists $feature->{'type'} and $feature->{'type'} eq 'num';
604 262 100 100     1173 return if exists $feature->{'type'} and $feature->{'type'} eq 'int';
605 193 100 66     666 return if exists $feature->{'type'} and $feature->{'type'} eq 'boo';
606 140 100       492 return if exists $feature->{'cat2num'}; # Blindly trusting that what we have here is a sane mapping from the original values to numbers
607 38         64 my $name = $feature->{'name'};
608 38 100       125 if (not exists $feature->{'values'}) {
609 28 100       225 return if exists $feature->{'num_values_fh'};
610 3         37 warn "Categorial feature '$name' is about to be evaluated numerically but has no set of values specified";
611 3         86 (my $num_values_basename = $class.'__'.$name) =~ s/\W/_/g;
612 3         8 $num_values_basename = '.FeatureFactory.'.$num_values_basename;
613 3         56 my @filenames_to_try = (
614             $PATH.'/'.$num_values_basename,
615             $ENV{'HOME'}.'/'.$num_values_basename,
616             '/tmp/'.$num_values_basename,
617             );
618 3         7 my $num_values_fh;
619             my $opened;
620 3         7 my $num_value_max = 0;
621             FILENAME_R:
622 3         10 for my $fn (@filenames_to_try) {
623 3         277 $opened = open my $fh, '+<'.$OPEN_OPTIONS, $fn;
624 3 50       154 if ($opened) {
625 3         7 local $_; # for some reason, this is necessary to prevent crashes (Modification of read-only value) when e.g. in for(qw(a b)){ }
626 3         88 while (<$fh>) {
627 22         64 chomp;
628 22         91 my ($cat, $num, $ppd) = split /\t/;
629 22 50       54 $num_value_max = $num if $num > $num_value_max;
630 22         66 $feature->{'cat2num_dyna'}{$cat} = $num;
631 22         51 $feature->{'num2cat_dyna'}{$num} = $cat;
632 22 100       104 $feature->{'pp2cat_dyna' }{$ppd} = $cat if defined $ppd;
633             }
634 3         365 print STDERR "Saving the mapping for feature '$name' to file $fn\n";
635 3         16 $feature->{'num_values_fh'} = $fh;
636             last FILENAME_R
637 3         14 }
638             }
639             # If there's no file to recover from, try to start a new one
640 3 50       11 if (not $opened) { FILENAME_W: for my $fn (@filenames_to_try) {
  0         0  
641 0         0 $opened = open my $fh, '>'.$OPEN_OPTIONS, $fn;
642 0 0       0 if ($opened) {
643 0         0 print STDERR "Saving the mapping for feature '$name' to file $fn\n";
644 0         0 $feature->{'num_values_fh'} = $fh;
645             last FILENAME_W
646 0         0 }
647             }}
648 3 50       12 if (not $opened) {
649 0         0 delete $feature->{'num_values_fh'};
650 0         0 croak "Couldn't open a file for saving the mapping the categories of feature '$name' to numbers. "
651             . 'Please specify the values for the feature to avoid this'
652             }
653 3         16 $feature->{'num_value_max'} = $num_value_max;
654             }
655             else { # Got values specified - create a mapping
656 10         34 my @values = _values_of($feature);
657 10         29 my $n = 1;
658 10         21 for my $value (@values) {
659 111         234 $feature->{'cat2num'}{$value} = $n;
660 111         538 $feature->{'num2cat'}{$n} = $value;
661             } continue {
662 111         172 $n++;
663             }
664             }
665             }
666             elsif (lc $format eq 'binary') {
667 273 100 100     1148 return if exists $feature->{'type'} and $feature->{'type'} eq 'boo';
668 231 100       674 return if exists $feature->{'cat2bin'};
669 18         30 my $name = $feature->{'name'};
670 18 100 66     75 if (not exists $feature->{'values_ordered'} and not exists $feature->{'values'}) {
671 2         427 croak "Attempted to convert feature '$name' to binary without specifying its values";
672             }
673            
674 16         36 my @values = _values_of($feature);
675            
676 16         26 my $n = 0;
677 16         185 my @zeroes = (0) x scalar(@values);
678 16         32 for my $value (@values) {
679 121         377 my @vector = @zeroes;
680 121         187 $vector[$n] = 1;
681 121         322 $feature->{'cat2bin'}{$value} = \@vector;
682 121         1171 $feature->{'bin2cat'}{join(' ', @vector)} = $value;
683             } continue {
684 121         257 $n++;
685             }
686             }
687             elsif ($format eq 'postprocd') {
688 11 100       47 return if exists $feature->{'pp2cat'};
689 2         5 my $name = $feature->{'name'};
690 2 50       7 if (not exists $feature->{'postproc'}) {
691 0         0 croak "Feature '$name' doesn't have a postprocessing function specified - can't create mapping from postprocessed values. Stopped"
692             }
693 2         3 my $ppfun = $feature->{'postproc'};
694 2         5 my @values = _values_of($feature);
695 2         5 my %pp2cat;
696 2         13 for my $value (@values) {
697 32         56 my $ppd = $ppfun->($value);
698 32         140 $pp2cat{ $ppd } = $value;
699             }
700 2         10 $feature->{'pp2cat'} = \%pp2cat;
701             }
702             else {
703 0         0 croak "Format '$format' not recognized - please specify 'normal', 'numeric', 'binary' or 'postprocd' (should have caught this earlier)"
704             }
705             }
706              
707             sub names : method {
708 34     34 1 5541 my ($self) = @_;
709 34         44 return map $_->{'name'}, @{ $self->{'features'} }
  34         346  
710             }
711              
712             sub _vector_length { # how many bits will the binary representation of this feature have
713 268     268   321 my ($feature) = @_;
714 268 100 100     882 if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') {
715 52         89 return 1
716             }
717 216         334 return scalar _values_of($feature)
718             }
719              
720             sub _shift_value {
721 40     40   706 my ($feature, $format, $values) = @_;
722 40 50       80 if ($format ne 'binary') {
723 0         0 return shift @$values
724             }
725 40         93 my $n = _vector_length($feature);
726 40 50       75 if (@$values < $n) {
727 0         0 croak "There's not enough fields left to shift a $format value (width $n) of feature '$$feature{name}' from a length "
728             . scalar(@$values) . " list ('@$values')"
729             }
730 40         244 return splice @$values, 0, $n
731             }
732              
733             sub _init_translation {
734 32     32   62 my ($self, $names, $options) = @_;
735 32 50       380 if (ref($names) ne 'ARRAY') {
736 0         0 croak 'Names must be given by an arrayref'
737             }
738 32 50       85 if (ref($options) ne 'HASH') {
739 0         0 croak 'Options must be given by a hashref'
740             }
741            
742 32         68 my %accepted_options = map {;$_=>1} qw(
  288         546  
743             names from_format to_format from_NA to_NA FS OFS header ignore
744             );
745 32         143 for (keys %$options) {
746 118 50       276 if (not exists $accepted_options{$_}) {
747 0         0 croak "Translate does not accept option '$_'. Accepted options are: ".join(' ', keys %accepted_options).'. Stopped'
748             }
749             }
750            
751 32         75 my $from_format = $options->{'from_format'};
752 32         44 my $to_format = $options->{'to_format'};
753 32         54 for ($from_format, $to_format) {
754 64 50       314 if (! m/^(?: normal | numeric | binary )$/x) {
755 0         0 croak '{to,from}_format must be one of "normal", "numeric" or "binary"'
756             }
757             }
758            
759 32 100 100     459 if (exists $options->{'from_NA'} and exists $options->{'to_NA'}) {
    100 66        
    100 66        
    50          
    50          
    100          
760             }
761             elsif (exists $options->{'from_NA'} and exists $self->{'N/A'}) {
762 3         10 $options->{'to_NA'} = $self->{'N/A'};
763             }
764             elsif (exists $options->{'to_NA'} and exists $self->{'N/A'}) {
765 3         8 $options->{'from_NA'} = $self->{'N/A'};
766             }
767             elsif (exists $options->{'to_NA'}) {
768 0         0 $options->{'from_NA'} = undef;
769             }
770             elsif (exists $options->{'from_NA'}) {
771 0         0 croak 'from_NA specified but neither to_NA nor global N/A value specified'
772             }
773             elsif (exists $self->{'N/A'}) {
774 15         52 $options->{'from_NA'} = $options->{'to_NA'} = $self->{'N/A'};
775             }
776            
777 32 50 66     105 if (exists $options->{'header'} and not $options->{'header'}) {
778 0         0 delete $options->{'header'};
779             }
780            
781 32 100       76 if (exists $options->{'ignore'}) {
782 4         52 my $ignore = $options->{'ignore'};
783 4         9 $options->{'ignore'} = [];
784            
785 4 100       12 if (not ref $ignore) {
786 2         4 $ignore = [$ignore];
787             }
788            
789 4 50       9 if (ref($ignore) eq 'ARRAY') {
790 4         18 my $has_non_nums = grep !Scalar::Util::looks_like_number($_), @$ignore;
791 4 50       10 if ($has_non_nums) {
792 0         0 warn 'Some of the specifications of columns to ignore are non-numeric'
793             }
794 4         10 for my $idx (@$ignore) {
795 9 50       17 if ($idx < 0) {
796 0         0 croak "Negative column indices aren't currently supported. Trailing columns are ignored always. Stopped"
797             }
798 9         19 $options->{'ignore'}[ $idx ] = 1;
799             }
800             }
801             else {
802 0         0 croak 'Option "ignore" can only be a column number or an array thereof. Stopped'
803             }
804            
805             # Remove the names of the columns to ignore if the names come from a header
806 4 100       13 if (exists $options->{'header'}) {
807 2         11 for my $idx (sort {$b <=> $a} @$ignore) {
  4         6  
808 5         12 splice @$names, $idx, 1;
809             }
810             }
811             }
812            
813 32         37 my (@features, @widths);
814 32         102 my %names = map {;$_=>1} $self->names;
  237         442  
815 32         103 for my $name (@$names) {
816 181 50       385 if (not exists $names{$name}) {
817 0         0 croak "Feature '$name' not found among ".join(' ', $self->names).". Stopped"
818             }
819 181         334 my $feature = $self->{'feat_named'}{ $name };
820 181         365 $self->_create_mapping($feature, $from_format);
821 181         363 $self->_create_mapping($feature, $to_format);
822 181 100 100     694 if ($from_format eq 'normal' and exists $feature->{'postproc'}) {
823 15 100 66     84 if (exists $feature->{'values'}) {
    100          
    50          
824 11         26 $self->_create_mapping($feature, 'postprocd');
825             }
826             elsif (exists $feature->{'format'} or $to_format eq 'normal') {
827             # translating normal -> normal -- kein problem
828             }
829             elsif (join(' ', sort $from_format, $to_format) eq 'normal numeric') {
830             # translating with dynamic mapping
831             }
832             else {
833 0         0 croak "Feature '$name' is postprocessed and about to be translated from normal but has no values specified. Stopped"
834             }
835             }
836 181         229 push @features, $feature;
837 181         220 my $bin = 0;
838 181 100 100     823 if (exists $feature->{'format'} and $feature->{'format'} eq 'binary') {
    100          
    100          
839 24         30 $bin = 1;
840             }
841             elsif (exists $feature->{'format'}) {}
842             elsif ($from_format eq 'binary') {
843 36         49 $bin = 1;
844             }
845 181 100       359 my $width = $bin ? _vector_length($feature) : 1;
846 181         413 push @widths, $width;
847             }
848 32         810 return map [$names->[$_], $features[$_], $widths[$_]], 0 .. $#features
849             }
850              
851             my %x2cat = (
852             binary => 'bin2cat',
853             numeric => 'num2cat',
854             postprocd => 'pp2cat',
855             );
856              
857             sub _translate_row : method {
858 203     203   322 my ($self, $descrs, $values, $options) = @_;
859 203 50       459 if (ref($values) ne 'ARRAY') {
860 0         0 croak 'Values must be given by an arrayref'
861             }
862 203 50       385 if (@$values < @$descrs) {
863 0         0 croak "There's not enough values in the \@values array (".scalar(@$values).") to match the number of features (".scalar(@$descrs).")";
864             }
865 203         504 my ($from_format, $to_format, $from_NA, $to_NA, $ignore) = @$options{qw(
866             from_format to_format from_NA to_NA ignore)};
867            
868 203         231 my $coln = 0;
869 203         193 my @rv;
870             FEATNAME:
871 203         274 for my $descr (@$descrs) {
872 1297         2075 my ($name, $feature, $width) = @$descr;
873 1297 100       2326 if (defined $ignore) {
874 160         312 while (exists $ignore->[ $coln++ ]) {
875 80         189 push @rv, shift @$values;
876             }
877             }
878 1297 100       3112 my $from_format = exists $feature->{'format'} ? $feature->{'format'} : $from_format;
879 1297 100       2511 my $to_format = exists $feature->{'format'} ? $feature->{'format'} : $to_format;
880 1297         3458 my @value = splice @$values, 0, $width;
881 1297 50       2857 if (@value == 0) {
882 0         0 croak "Zero-width value obtained for feature '$name'"
883             }
884            
885             # Check if the value is N/A
886 1297         1735 my $is_NA = 0;
887 1297 100       2356 if (@value == 1) {
888 950         1394 my $value = $value[0];
889 950 100 100     6477 if (defined $from_NA and $value eq $from_NA) {
    50 66        
      33        
890 235         445 $is_NA = 1;
891             }
892             elsif (defined $to_NA and not defined $value and not defined $from_NA) {
893 0         0 $is_NA = 1;
894             }
895             }
896             else {
897 347 50 66     1104 if (defined $to_NA and not grep {defined $_} @value and not defined $from_NA) {
  4750 100 33     10671  
  4750   100     14748  
898 0         0 $is_NA = 1;
899             }
900             elsif (defined $from_NA and not grep {$_ ne $from_NA} @value) {
901 133         168 $is_NA = 1;
902             }
903             }
904            
905             # Append the N/A if appropriate
906 1297 100       5501 if ($is_NA) {
907 368 100       902 my $n = $to_format eq 'binary' ? _vector_length($feature) : 1;
908 368         821 push @rv, ( ($to_NA) x $n );
909             next FEATNAME
910 368         1096 }
911            
912 929 100       1605 if ($from_format eq $to_format) {
913 368         780 push @rv, @value;
914             next FEATNAME
915 368         1155 }
916             else {
917 561         660 my $catval;
918 561         1895 my $from_format = $from_format;
919 561 100 100     1820 if ($from_format eq 'normal' and exists $feature->{'postproc'}) {
920 102         122 $from_format = 'postprocd';
921             }
922 561 100 100     3579 if ($from_format eq 'normal') {
    100 66        
    100 100        
      100        
923 171         485 ($catval) = @value;
924             }
925             elsif ($from_format eq 'numeric' and exists $feature->{'type'} and $feature->{'type'} =~ /^(int|num|boo)$/) {
926 49         75 ($catval) = @value;
927             }
928             elsif ($from_format eq 'binary' and exists $feature->{'type'} and $feature->{'type'} eq 'boo') {
929 27         40 ($catval) = @value;
930             }
931             else {
932 314         562 my $transfer = $x2cat{ $from_format };
933 314 50       581 if (not defined $transfer) {
934 0         0 croak "Internal error: Unexpected value for \$from_format: '$from_format'"
935             }
936 314 100       612 if (not exists $feature->{ $transfer }) {
937 40 50       194 if (exists $feature->{ $transfer.'_dyna' }) {
938 40         142 $transfer = $transfer.'_dyna';
939             }
940             else {
941 0         0 croak "Cannot find mapping '$transfer' for feature '$name'"
942             }
943             }
944 314         1122 my $valval = join(' ', @value);
945 314 50       801 if (not exists $feature->{ $transfer }{ $valval }) {
946 0         0 my $hint = '';
947 0 0       0 if ($valval eq $feature->{'name'}) {
948 0         0 $hint = ". Maybe you forgot there was a header in your file? Stopped"
949             }
950 0         0 croak "Unexpected value '$valval' of feature '$name' for transfer '$transfer'$hint"
951             }
952 314         709 $catval = $feature->{ $transfer }{ $valval };
953             }
954            
955 561         1031 my @formatted = _format_value($feature, $catval, $to_format, 'NO_ARGS:TRANSLATING_ONLY');
956 561         2631 push @rv, @formatted;
957             }
958             }
959            
960             # Append the trailing columns
961 203         352 push @rv, @$values;
962            
963             return @rv
964 203         2688 }
965              
966             sub translate_row : method {
967 13     13 0 10819 my ($self, $names, $values, $options) = @_;
968 13         33 $names = $self->expand_names($names);
969 13         39 my @descrs = $self->_init_translation($names, $options);
970 13         49 $self->_translate_row(\@descrs, $values, $options);
971             }
972              
973             sub translate : method {
974 19     19 0 31686 my ($self, $source, $sink, $options) = @_;
975 19         52 local $\; local $,;
  19         40  
976 19 50       90 if (not defined Scalar::Util::openhandle($source)) {
977 0         0 croak 'Source must be given by an open filehandle'
978             }
979 19 50       60 if (not defined Scalar::Util::openhandle($sink)) {
980 0         0 croak 'Destination must be given by an open filehandle'
981             }
982 19 50       66 if (ref($options) ne 'HASH') {
983 0         0 croak 'Options must be given by a hashref'
984             }
985 19         124 my $ifs = $options->{'FS'};
986 19 100       69 my $ofs = exists $options->{'OFS'} ? $options->{'OFS'} : $ifs;
987 19         27 my @names;
988             my @orig_header_fields;
989 19 100 33     87 if (exists $options->{'names'}) {
    50          
990 16         20 @names = @{ $self->expand_names($options->{'names'}) };
  16         52  
991             }
992             elsif (exists $options->{'header'} and $options->{'header'}) {
993 3         40 my $row = <$source>;
994 3         8 chomp $row;
995 3         74 @names = split /(?:\Q$ifs\E)+/, $row;
996 3         13 @orig_header_fields = @names;
997             }
998             else {
999 0         0 croak 'No feature names specified for translate'
1000             }
1001            
1002 19         74 my @descrs = $self->_init_translation(\@names, $options);
1003            
1004             # Translate the header, if there's one.
1005 19 100       68 if (@orig_header_fields) {
1006 3         8 my $globbin = $options->{'to_format'} eq 'binary';
1007 3         11 my $last = pop @orig_header_fields;
1008 3         8 for my $field (@orig_header_fields) {
1009 19         25 my $nsep;
1010 19 100       60 if (not exists $self->{'feat_named'}{ $field }) {
1011 4         6 $nsep = 1;
1012             }
1013             else {
1014 15         35 my $feature = $self->{'feat_named'}{ $field };
1015 15   100     109 my $bin = (exists $feature->{'format'} and $feature->{'format'} eq 'binary' or $globbin);
1016 15 100       46 $nsep = $bin ? _vector_length($feature) : 1;
1017             }
1018 19         24 print {$sink} $field, $ofs x $nsep;
  19         92  
1019             }
1020 3         8 print {$sink} $last, "\n";
  3         6  
1021             }
1022            
1023             ROW:
1024 19         181 while (defined (my $row = <$source>)) {
1025 190         244 chomp $row;
1026 190         2131 my @values = split /$ifs/, $row;
1027 190         489 undef $@;
1028 190         252 my @translated = eval { $self->_translate_row(\@descrs, \@values, $options) };
  190         1204  
1029 190 50       780 warn("$@ (line $.)"), next ROW if $@;
1030 190         199 print {$sink} join($ofs, @translated), "\n";
  190         3737  
1031             }
1032             }
1033              
1034             sub add_label {
1035 0     0 0 0 my ($feature, @labels) = @_;
1036 0         0 @labels = map uc($_), @labels;
1037 0 0       0 if (exists $feature->{'label'}) {
1038 0 0       0 if (ref($feature->{'label'}) eq 'ARRAY') {
1039 0         0 push @{ $feature->{'label'} }, @labels;
  0         0  
1040             }
1041             else {
1042 0         0 $feature->{'label'} = [$feature->{'label'}, @labels];
1043             }
1044             }
1045             else {
1046 0         0 $feature->{'label'} = [@labels];
1047             }
1048             }
1049              
1050             {
1051             package Data::FeatureFactory::SoftError;
1052             sub new {
1053 4     4   7 my ($class, $message) = @_;
1054 4 50       9 $message = "SoftError occurred" if not defined $message;
1055 4         28 return bless \$message, $class
1056             }
1057             }
1058              
1059             1
1060              
1061             __END__