| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Algorithm::AM; |
|
2
|
10
|
|
|
10
|
|
75665
|
use strict; |
|
|
10
|
|
|
|
|
12
|
|
|
|
10
|
|
|
|
|
207
|
|
|
3
|
10
|
|
|
10
|
|
24
|
use warnings; |
|
|
10
|
|
|
|
|
10
|
|
|
|
10
|
|
|
|
|
292
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '3.10'; |
|
5
|
|
|
|
|
|
|
# ABSTRACT: Classify data with Analogical Modeling |
|
6
|
10
|
|
|
10
|
|
28
|
use feature 'state'; |
|
|
10
|
|
|
|
|
10
|
|
|
|
10
|
|
|
|
|
606
|
|
|
7
|
10
|
|
|
10
|
|
33
|
use Carp; |
|
|
10
|
|
|
|
|
20
|
|
|
|
10
|
|
|
|
|
967
|
|
|
8
|
|
|
|
|
|
|
our @CARP_NOT = qw(Algorithm::AM); |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Place this accessor here so that Class::Tiny doesn't generate |
|
11
|
|
|
|
|
|
|
# a getter/setter pair. |
|
12
|
|
|
|
|
|
|
sub training_set { |
|
13
|
389
|
|
|
389
|
1
|
297
|
my ($self) = @_; |
|
14
|
389
|
|
|
|
|
468
|
return $self->{training_set}; |
|
15
|
|
|
|
|
|
|
} |
|
16
|
|
|
|
|
|
|
|
|
17
|
10
|
|
|
|
|
52
|
use Class::Tiny qw( |
|
18
|
|
|
|
|
|
|
exclude_nulls |
|
19
|
|
|
|
|
|
|
exclude_given |
|
20
|
|
|
|
|
|
|
linear |
|
21
|
|
|
|
|
|
|
training_set |
|
22
|
|
|
|
|
|
|
), { |
|
23
|
|
|
|
|
|
|
exclude_nulls => 1, |
|
24
|
|
|
|
|
|
|
exclude_given => 1, |
|
25
|
|
|
|
|
|
|
linear => 0, |
|
26
|
10
|
|
|
10
|
|
2707
|
}; |
|
|
10
|
|
|
|
|
14209
|
|
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub BUILD { |
|
29
|
195
|
|
|
195
|
0
|
11255
|
my ($self, $args) = @_; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# check for invalid arguments |
|
32
|
195
|
|
|
|
|
212
|
my $class = ref $self; |
|
33
|
195
|
|
|
|
|
371
|
my %valid_attrs = map {$_ => 1} |
|
|
780
|
|
|
|
|
3301
|
|
|
34
|
|
|
|
|
|
|
Class::Tiny->get_all_attributes_for($class); |
|
35
|
195
|
|
|
|
|
610
|
my @invalids = grep {!$valid_attrs{$_}} sort keys %$args; |
|
|
744
|
|
|
|
|
807
|
|
|
36
|
195
|
100
|
|
|
|
342
|
if(@invalids){ |
|
37
|
1
|
|
|
|
|
10
|
croak "Invalid attributes for $class: " . join ' ', |
|
38
|
|
|
|
|
|
|
sort @invalids; |
|
39
|
|
|
|
|
|
|
} |
|
40
|
|
|
|
|
|
|
|
|
41
|
194
|
100
|
|
|
|
306
|
if(!exists $args->{training_set}){ |
|
42
|
1
|
|
|
|
|
14
|
croak "Missing required parameter 'training_set'"; |
|
43
|
|
|
|
|
|
|
} |
|
44
|
|
|
|
|
|
|
|
|
45
|
193
|
100
|
|
|
|
362
|
if('Algorithm::AM::DataSet' ne ref $args->{training_set}){ |
|
46
|
1
|
|
|
|
|
7
|
croak 'Parameter training_set should ' . |
|
47
|
|
|
|
|
|
|
'be an Algorithm::AM::DataSet'; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
192
|
|
|
|
|
286
|
$self->_initialize(); |
|
50
|
|
|
|
|
|
|
# delete $args->{training_set}; |
|
51
|
192
|
|
|
|
|
459
|
return; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
10
|
|
|
10
|
|
9145
|
use Algorithm::AM::Result; |
|
|
10
|
|
|
|
|
16
|
|
|
|
10
|
|
|
|
|
257
|
|
|
55
|
10
|
|
|
10
|
|
45
|
use Algorithm::AM::BigInt 'bigcmp'; |
|
|
10
|
|
|
|
|
10
|
|
|
|
10
|
|
|
|
|
358
|
|
|
56
|
10
|
|
|
10
|
|
3253
|
use Algorithm::AM::DataSet; |
|
|
10
|
|
|
|
|
14
|
|
|
|
10
|
|
|
|
|
471
|
|
|
57
|
10
|
|
|
10
|
|
3795
|
use Import::Into; |
|
|
10
|
|
|
|
|
19219
|
|
|
|
10
|
|
|
|
|
789
|
|
|
58
|
|
|
|
|
|
|
# Use Import::Into to export classes into caller |
|
59
|
|
|
|
|
|
|
sub import { |
|
60
|
15
|
|
|
15
|
|
42236
|
my $target = caller; |
|
61
|
15
|
|
|
|
|
93
|
Algorithm::AM::BigInt->import::into($target, 'bigcmp'); |
|
62
|
15
|
|
|
|
|
2513
|
Algorithm::AM::DataSet->import::into($target, 'dataset_from_file'); |
|
63
|
15
|
|
|
|
|
2101
|
Algorithm::AM::DataSet::Item->import::into($target, 'new_item'); |
|
64
|
15
|
|
|
|
|
2280
|
return; |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
require XSLoader; |
|
68
|
|
|
|
|
|
|
XSLoader::load(__PACKAGE__, $VERSION); |
|
69
|
|
|
|
|
|
|
|
|
70
|
10
|
|
|
10
|
|
2725
|
use Log::Any qw($log); |
|
|
10
|
|
|
|
|
57954
|
|
|
|
10
|
|
|
|
|
32
|
|
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# do all of the classification data structure initialization here, |
|
73
|
|
|
|
|
|
|
# as well as calling the XS initialization method. |
|
74
|
|
|
|
|
|
|
sub _initialize { |
|
75
|
192
|
|
|
192
|
|
165
|
my ($self) = @_; |
|
76
|
|
|
|
|
|
|
|
|
77
|
192
|
|
|
|
|
229
|
my $train = $self->training_set; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# compute sub-lattices sizes here so that lattice space can be |
|
80
|
|
|
|
|
|
|
# allocated in the _xs_initialize method. If certain features are |
|
81
|
|
|
|
|
|
|
# thrown out later, each sub-lattice can only get smaller, so |
|
82
|
|
|
|
|
|
|
# this is safe to do once here. |
|
83
|
192
|
|
|
|
|
418
|
my $lattice_sizes = _compute_lattice_sizes($train->cardinality); |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# sum is intitialized to a list of zeros |
|
86
|
192
|
|
|
|
|
342
|
@{$self->{sum}} = (0.0) x ($train->num_classes + 1); |
|
|
192
|
|
|
|
|
357
|
|
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# preemptively allocate memory |
|
89
|
|
|
|
|
|
|
# TODO: not sure what this does |
|
90
|
192
|
|
|
|
|
325
|
@{$self->{itemcontextchain}} = (0) x $train->size; |
|
|
192
|
|
|
|
|
1285
|
|
|
91
|
|
|
|
|
|
|
|
|
92
|
192
|
|
|
|
|
1056
|
$self->{$_} = {} for ( |
|
93
|
|
|
|
|
|
|
qw( |
|
94
|
|
|
|
|
|
|
itemcontextchainhead |
|
95
|
|
|
|
|
|
|
context_to_class |
|
96
|
|
|
|
|
|
|
contextsize |
|
97
|
|
|
|
|
|
|
pointers |
|
98
|
|
|
|
|
|
|
gang |
|
99
|
|
|
|
|
|
|
) |
|
100
|
|
|
|
|
|
|
); |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Initialize XS data structures |
|
103
|
|
|
|
|
|
|
# TODO: Perl crashes unless this is saved. The XS |
|
104
|
|
|
|
|
|
|
# must not be increasing the reference count |
|
105
|
192
|
|
|
|
|
323
|
$self->{save_this} = $train->_data_classes; |
|
106
|
|
|
|
|
|
|
$self->_xs_initialize( |
|
107
|
|
|
|
|
|
|
$lattice_sizes, |
|
108
|
|
|
|
|
|
|
$self->{save_this}, |
|
109
|
|
|
|
|
|
|
$self->{itemcontextchain}, |
|
110
|
|
|
|
|
|
|
$self->{itemcontextchainhead}, |
|
111
|
|
|
|
|
|
|
$self->{context_to_class}, |
|
112
|
|
|
|
|
|
|
$self->{contextsize}, |
|
113
|
|
|
|
|
|
|
$self->{pointers}, |
|
114
|
|
|
|
|
|
|
$self->{gang}, |
|
115
|
|
|
|
|
|
|
$self->{sum} |
|
116
|
192
|
|
|
|
|
948
|
); |
|
117
|
192
|
|
|
|
|
260
|
return; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub classify { |
|
121
|
195
|
|
|
195
|
1
|
3807
|
my ($self, $test_item) = @_; |
|
122
|
|
|
|
|
|
|
|
|
123
|
195
|
|
|
|
|
237
|
my $training_set = $self->training_set; |
|
124
|
195
|
100
|
|
|
|
332
|
if($training_set->cardinality != $test_item->cardinality){ |
|
125
|
1
|
|
|
|
|
2
|
croak 'Training set and test item do not have the same ' . |
|
126
|
|
|
|
|
|
|
'cardinality (' . $training_set->cardinality . ' and ' . |
|
127
|
|
|
|
|
|
|
$test_item->cardinality . ')'; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# num_feats is the number of features to be used in classification; |
|
131
|
|
|
|
|
|
|
# if we exclude nulls, then we need to minus the number of '=' |
|
132
|
|
|
|
|
|
|
# found in this test item; otherwise, it's just the number of |
|
133
|
|
|
|
|
|
|
# columns in a single item vector |
|
134
|
194
|
|
|
|
|
424
|
my $num_feats = $training_set->cardinality; |
|
135
|
|
|
|
|
|
|
|
|
136
|
194
|
100
|
|
|
|
3508
|
if($self->exclude_nulls){ |
|
137
|
1808
|
|
|
|
|
1523
|
$num_feats -= grep {$_ eq ''} @{ |
|
138
|
189
|
|
|
|
|
919
|
$test_item->features }; |
|
|
189
|
|
|
|
|
322
|
|
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# recalculate the lattice sizes with new number of active features |
|
142
|
194
|
|
|
|
|
391
|
my $lattice_sizes = _compute_lattice_sizes($num_feats); |
|
143
|
|
|
|
|
|
|
## $activeContexts = 1 << $activeVar; |
|
144
|
|
|
|
|
|
|
|
|
145
|
194
|
|
|
|
|
190
|
my $nullcontext = pack "b64", '0' x 64; |
|
146
|
|
|
|
|
|
|
|
|
147
|
194
|
|
|
|
|
129
|
my $given_excluded = 0; |
|
148
|
194
|
|
|
|
|
167
|
my $test_in_training = 0; |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# initialize classification-related variables |
|
151
|
|
|
|
|
|
|
# it is important to dereference rather than just |
|
152
|
|
|
|
|
|
|
# assigning a new one with [] or {}. This is because |
|
153
|
|
|
|
|
|
|
# the XS code has access to the existing reference, |
|
154
|
|
|
|
|
|
|
# but will be accessing the wrong variable if we |
|
155
|
|
|
|
|
|
|
# change it. |
|
156
|
194
|
|
|
|
|
136
|
%{$self->{contextsize}} = (); |
|
|
194
|
|
|
|
|
284
|
|
|
157
|
194
|
|
|
|
|
119
|
%{$self->{itemcontextchainhead}} = (); |
|
|
194
|
|
|
|
|
157
|
|
|
158
|
194
|
|
|
|
|
138
|
%{$self->{context_to_class}} = (); |
|
|
194
|
|
|
|
|
149
|
|
|
159
|
194
|
|
|
|
|
121
|
%{$self->{pointers}} = (); |
|
|
194
|
|
|
|
|
164
|
|
|
160
|
194
|
|
|
|
|
119
|
%{$self->{gang}} = (); |
|
|
194
|
|
|
|
|
158
|
|
|
161
|
194
|
|
|
|
|
129
|
@{$self->{itemcontextchain}} = (); |
|
|
194
|
|
|
|
|
591
|
|
|
162
|
|
|
|
|
|
|
# big ints are used in AM.xs; these consist of an |
|
163
|
|
|
|
|
|
|
# array of 8 unsigned longs |
|
164
|
194
|
|
|
|
|
120
|
foreach (@{$self->{sum}}) { |
|
|
194
|
|
|
|
|
258
|
|
|
165
|
753
|
|
|
|
|
710
|
$_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# calculate context labels and associated structures for |
|
169
|
|
|
|
|
|
|
# the entire data set |
|
170
|
194
|
|
|
|
|
359
|
for my $index ( 0 .. $training_set->size - 1 ) { |
|
171
|
|
|
|
|
|
|
my $context = _context_label( |
|
172
|
|
|
|
|
|
|
# Note: this must be copied to prevent infinite loop; |
|
173
|
|
|
|
|
|
|
# see todo note for _context_label |
|
174
|
30034
|
|
|
|
|
17359
|
[@{$lattice_sizes}], |
|
|
30034
|
|
|
|
|
53202
|
|
|
175
|
|
|
|
|
|
|
$training_set->get_item($index)->features, |
|
176
|
|
|
|
|
|
|
$test_item->features, |
|
177
|
|
|
|
|
|
|
$self->exclude_nulls |
|
178
|
|
|
|
|
|
|
); |
|
179
|
30034
|
|
|
|
|
58609
|
$self->{contextsize}->{$context}++; |
|
180
|
|
|
|
|
|
|
# TODO: explain itemcontextchain and itemcontextchainhead |
|
181
|
|
|
|
|
|
|
$self->{itemcontextchain}->[$index] = |
|
182
|
30034
|
|
|
|
|
27870
|
$self->{itemcontextchainhead}->{$context}; |
|
183
|
30034
|
|
|
|
|
23036
|
$self->{itemcontextchainhead}->{$context} = $index; |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# store the class for the subcontext; if there |
|
186
|
|
|
|
|
|
|
# is already a different class for this subcontext, |
|
187
|
|
|
|
|
|
|
# then store 0, signifying heterogeneity. |
|
188
|
30034
|
|
|
|
|
46738
|
my $class = $training_set->_index_for_class( |
|
189
|
|
|
|
|
|
|
$training_set->get_item($index)->class); |
|
190
|
30034
|
100
|
|
|
|
39817
|
if ( defined $self->{context_to_class}->{$context} ) { |
|
191
|
21669
|
100
|
|
|
|
32272
|
if($self->{context_to_class}->{$context} != $class){ |
|
192
|
9834
|
|
|
|
|
11489
|
$self->{context_to_class}->{$context} = 0; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
} |
|
195
|
|
|
|
|
|
|
else { |
|
196
|
8365
|
|
|
|
|
9909
|
$self->{context_to_class}->{$context} = $class; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
# $nullcontext is all 0's, which is a context label for |
|
200
|
|
|
|
|
|
|
# a training item that exactly matches the test item. Exclude |
|
201
|
|
|
|
|
|
|
# the item if required, and set a flag that the test item was |
|
202
|
|
|
|
|
|
|
# found in the training set. |
|
203
|
194
|
100
|
|
|
|
402
|
if ( exists $self->{context_to_class}->{$nullcontext} ) { |
|
204
|
182
|
|
|
|
|
133
|
$test_in_training = 1; |
|
205
|
182
|
100
|
|
|
|
2553
|
if($self->exclude_given){ |
|
206
|
177
|
|
|
|
|
908
|
delete $self->{context_to_class}->{$nullcontext}; |
|
207
|
177
|
|
|
|
|
160
|
$given_excluded = 1; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
# initialize the results object to hold all of the configuration |
|
211
|
|
|
|
|
|
|
# info. |
|
212
|
194
|
100
|
|
|
|
2422
|
my $result = Algorithm::AM::Result->new( |
|
213
|
|
|
|
|
|
|
given_excluded => $given_excluded, |
|
214
|
|
|
|
|
|
|
cardinality => $num_feats, |
|
215
|
|
|
|
|
|
|
exclude_nulls => $self->exclude_nulls, |
|
216
|
|
|
|
|
|
|
count_method => $self->linear ? 'linear' : 'squared', |
|
217
|
|
|
|
|
|
|
training_set => $training_set, |
|
218
|
|
|
|
|
|
|
test_item => $test_item, |
|
219
|
|
|
|
|
|
|
test_in_train => $test_in_training, |
|
220
|
|
|
|
|
|
|
); |
|
221
|
|
|
|
|
|
|
|
|
222
|
194
|
50
|
|
|
|
9175
|
$log->debug(${$result->config_info}) |
|
|
0
|
|
|
|
|
0
|
|
|
223
|
|
|
|
|
|
|
if($log->is_debug); |
|
224
|
|
|
|
|
|
|
|
|
225
|
194
|
|
|
|
|
7864
|
$result->start_time([ (localtime)[0..2] ]); |
|
226
|
194
|
100
|
|
|
|
3270
|
$self->_fillandcount( |
|
227
|
|
|
|
|
|
|
$lattice_sizes, $self->linear ? 1 : 0); |
|
228
|
194
|
|
|
|
|
39343
|
$result->end_time([ (localtime)[0..2] ]); |
|
229
|
|
|
|
|
|
|
|
|
230
|
194
|
100
|
|
|
|
1079
|
unless ($self->{pointers}->{'grandtotal'}) { |
|
231
|
|
|
|
|
|
|
#TODO: is this tested yet? |
|
232
|
1
|
50
|
|
|
|
4
|
if($log->is_warn){ |
|
233
|
0
|
|
|
|
|
0
|
$log->warn('No training items considered. ' . |
|
234
|
|
|
|
|
|
|
'No prediction possible.'); |
|
235
|
|
|
|
|
|
|
} |
|
236
|
1
|
|
|
|
|
8
|
return; |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
$result->_process_stats( |
|
240
|
|
|
|
|
|
|
# TODO: after refactoring to a "guts" object, |
|
241
|
|
|
|
|
|
|
# just pass that in |
|
242
|
|
|
|
|
|
|
$self->{sum}, |
|
243
|
|
|
|
|
|
|
$self->{pointers}, |
|
244
|
|
|
|
|
|
|
$self->{itemcontextchainhead}, |
|
245
|
|
|
|
|
|
|
$self->{itemcontextchain}, |
|
246
|
|
|
|
|
|
|
$self->{context_to_class}, |
|
247
|
|
|
|
|
|
|
$self->{gang}, |
|
248
|
|
|
|
|
|
|
$lattice_sizes, |
|
249
|
|
|
|
|
|
|
$self->{contextsize} |
|
250
|
193
|
|
|
|
|
550
|
); |
|
251
|
193
|
|
|
|
|
381
|
return $result; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# since we split the lattice in four, we have to decide which features |
|
255
|
|
|
|
|
|
|
# go where. Given the number of features being used, return an arrayref |
|
256
|
|
|
|
|
|
|
# containing the number of features to be used in each of the the four |
|
257
|
|
|
|
|
|
|
# lattices. |
|
258
|
|
|
|
|
|
|
sub _compute_lattice_sizes { |
|
259
|
386
|
|
|
386
|
|
323
|
my ($num_feats) = @_; |
|
260
|
|
|
|
|
|
|
|
|
261
|
10
|
|
|
10
|
|
31762
|
use integer; |
|
|
10
|
|
|
|
|
75
|
|
|
|
10
|
|
|
|
|
34
|
|
|
262
|
386
|
|
|
|
|
256
|
my @lattice_sizes; |
|
263
|
386
|
|
|
|
|
396
|
my $half = $num_feats / 2; |
|
264
|
386
|
|
|
|
|
424
|
$lattice_sizes[0] = $half / 2; |
|
265
|
386
|
|
|
|
|
335
|
$lattice_sizes[1] = $half - $lattice_sizes[0]; |
|
266
|
386
|
|
|
|
|
290
|
$half = $num_feats - $half; |
|
267
|
386
|
|
|
|
|
273
|
$lattice_sizes[2] = $half / 2; |
|
268
|
386
|
|
|
|
|
278
|
$lattice_sizes[3] = $half - $lattice_sizes[2]; |
|
269
|
386
|
|
|
|
|
433
|
return \@lattice_sizes; |
|
270
|
|
|
|
|
|
|
} |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# Create binary context labels for a training item |
|
273
|
|
|
|
|
|
|
# by comparing it with a test item. Each training item |
|
274
|
|
|
|
|
|
|
# needs one binary label for each sublattice (of which |
|
275
|
|
|
|
|
|
|
# there are currently four), but this is packed into a |
|
276
|
|
|
|
|
|
|
# single scalar representing an array of 4 shorts (this |
|
277
|
|
|
|
|
|
|
# format is used in the XS side). |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# TODO: we have to copy lattice_sizes out of $self in order to |
|
280
|
|
|
|
|
|
|
# iterate it. Otherwise it goes on forever. Why? |
|
281
|
|
|
|
|
|
|
sub _context_label { |
|
282
|
|
|
|
|
|
|
# inputs: |
|
283
|
|
|
|
|
|
|
# number of active features in each lattice, |
|
284
|
|
|
|
|
|
|
# training item features, test item features, |
|
285
|
|
|
|
|
|
|
# and boolean indicating if nulls should be excluded |
|
286
|
30034
|
|
|
30034
|
|
102508
|
my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_; |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# feature index |
|
289
|
30034
|
|
|
|
|
19962
|
my $index = 0; |
|
290
|
|
|
|
|
|
|
# the binary context labels for each separate lattice |
|
291
|
30034
|
|
|
|
|
23331
|
my @context_list = (); |
|
292
|
|
|
|
|
|
|
|
|
293
|
30034
|
|
|
|
|
26383
|
for my $a (@$lattice_sizes) { |
|
294
|
|
|
|
|
|
|
# binary context label for a single sublattice |
|
295
|
120136
|
|
|
|
|
64282
|
my $context = 0; |
|
296
|
|
|
|
|
|
|
# loop through all features in the sublattice |
|
297
|
|
|
|
|
|
|
# assign 0 if features match, 1 if they do not |
|
298
|
120136
|
|
|
|
|
129549
|
for ( ; $a ; --$a ) { |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# skip null features if indicated |
|
301
|
247274
|
100
|
|
|
|
243662
|
if($skip_nulls){ |
|
302
|
247187
|
|
|
|
|
308481
|
++$index while $test_feats->[$index] eq ''; |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
# add a 1 for mismatched variable, 0 for matched variable |
|
305
|
247274
|
|
|
|
|
173076
|
$context = ( $context << 1 ) | ( |
|
306
|
|
|
|
|
|
|
$test_feats->[$index] ne $train_feats->[$index] ); |
|
307
|
247274
|
|
|
|
|
272879
|
++$index; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
120136
|
|
|
|
|
97642
|
push @context_list, $context; |
|
310
|
|
|
|
|
|
|
} |
|
311
|
|
|
|
|
|
|
# a context label is an array of unsigned shorts in XS |
|
312
|
30034
|
|
|
|
|
41973
|
my $context = pack "S!4", @context_list; |
|
313
|
30034
|
|
|
|
|
33552
|
return $context; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
1; |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
__END__ |