| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Algorithm::AM::DataSet; |
|
2
|
10
|
|
|
10
|
|
53135
|
use strict; |
|
|
10
|
|
|
|
|
31
|
|
|
|
10
|
|
|
|
|
269
|
|
|
3
|
10
|
|
|
10
|
|
46
|
use warnings; |
|
|
10
|
|
|
|
|
19
|
|
|
|
10
|
|
|
|
|
418
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '3.11'; |
|
5
|
|
|
|
|
|
|
# ABSTRACT: Manage data used by Algorithm::AM |
|
6
|
10
|
|
|
10
|
|
49
|
use Carp; |
|
|
10
|
|
|
|
|
18
|
|
|
|
10
|
|
|
|
|
580
|
|
|
7
|
10
|
|
|
10
|
|
2690
|
use Algorithm::AM::DataSet::Item; |
|
|
10
|
|
|
|
|
26
|
|
|
|
10
|
|
|
|
|
436
|
|
|
8
|
10
|
|
|
10
|
|
5006
|
use Path::Tiny; |
|
|
10
|
|
|
|
|
84095
|
|
|
|
10
|
|
|
|
|
639
|
|
|
9
|
|
|
|
|
|
|
use Exporter::Easy ( |
|
10
|
10
|
|
|
|
|
76
|
OK => ['dataset_from_file'] |
|
11
|
10
|
|
|
10
|
|
88
|
); |
|
|
10
|
|
|
|
|
20
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
#pod =head1 SYNOPSIS |
|
14
|
|
|
|
|
|
|
#pod |
|
15
|
|
|
|
|
|
|
#pod use Algorithm::AM::DataSet 'dataset_from_file'; |
|
16
|
|
|
|
|
|
|
#pod use Algorithm::AM::DataSet::Item 'new_item'; |
|
17
|
|
|
|
|
|
|
#pod my $dataset = Algorithm::AM::DataSet->new(cardinality => 10); |
|
18
|
|
|
|
|
|
|
#pod # or |
|
19
|
|
|
|
|
|
|
#pod $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas'); |
|
20
|
|
|
|
|
|
|
#pod $dataset->add_item( |
|
21
|
|
|
|
|
|
|
#pod new_item(features => [qw(a b c d e f g h i)])); |
|
22
|
|
|
|
|
|
|
#pod my $item = $dataset->get_item(2); |
|
23
|
|
|
|
|
|
|
#pod |
|
24
|
|
|
|
|
|
|
#pod =head1 DESCRIPTION |
|
25
|
|
|
|
|
|
|
#pod |
|
26
|
|
|
|
|
|
|
#pod This package contains a list of items that can be used by |
|
27
|
|
|
|
|
|
|
#pod L or L for classification. |
|
28
|
|
|
|
|
|
|
#pod DataSets can be made one item at a time via the L method, |
|
29
|
|
|
|
|
|
|
#pod or they can be read from files via the L function. |
|
30
|
|
|
|
|
|
|
#pod |
|
31
|
|
|
|
|
|
|
#pod =head2 C |
|
32
|
|
|
|
|
|
|
#pod |
|
33
|
|
|
|
|
|
|
#pod Creates a new DataSet object. You must provide a C argument |
|
34
|
|
|
|
|
|
|
#pod indicating the number of features to be contained in each data vector. |
|
35
|
|
|
|
|
|
|
#pod You can then add items via the add_item method. Each item will contain |
|
36
|
|
|
|
|
|
|
#pod a feature vector, and also optionally a class label and a comment |
|
37
|
|
|
|
|
|
|
#pod (also called a "spec"). |
|
38
|
|
|
|
|
|
|
#pod |
|
39
|
|
|
|
|
|
|
#pod =cut |
|
40
|
|
|
|
|
|
|
sub new { |
|
41
|
50
|
|
|
50
|
1
|
30875
|
my ($class, %opts) = @_; |
|
42
|
|
|
|
|
|
|
|
|
43
|
50
|
|
|
|
|
161
|
my $new_opts = _check_opts(%opts); |
|
44
|
|
|
|
|
|
|
|
|
45
|
48
|
|
|
|
|
94
|
my $self = bless $new_opts, $class; |
|
46
|
|
|
|
|
|
|
|
|
47
|
48
|
|
|
|
|
137
|
$self->_init; |
|
48
|
|
|
|
|
|
|
|
|
49
|
48
|
|
|
|
|
137
|
return $self; |
|
50
|
|
|
|
|
|
|
} |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# check the options for validity |
|
53
|
|
|
|
|
|
|
# Return an option hash to initialize $self with |
|
54
|
|
|
|
|
|
|
# For now only 'cardinality' is allowed/required. |
|
55
|
|
|
|
|
|
|
sub _check_opts { |
|
56
|
50
|
|
|
50
|
|
114
|
my (%opts) = @_; |
|
57
|
|
|
|
|
|
|
|
|
58
|
50
|
|
|
|
|
73
|
my %final_opts; |
|
59
|
|
|
|
|
|
|
|
|
60
|
50
|
100
|
|
|
|
132
|
if(!defined $opts{cardinality}){ |
|
61
|
1
|
|
|
|
|
16
|
croak q{Failed to provide 'cardinality' parameter}; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
49
|
|
|
|
|
97
|
$final_opts{cardinality} = $opts{cardinality}; |
|
64
|
49
|
|
|
|
|
74
|
delete $opts{cardinality}; |
|
65
|
|
|
|
|
|
|
|
|
66
|
49
|
100
|
|
|
|
115
|
if(keys %opts){ |
|
67
|
|
|
|
|
|
|
# sort the keys in the error message to make testing possible |
|
68
|
1
|
|
|
|
|
14
|
croak 'Unknown parameters in DataSet constructor: ' . |
|
69
|
|
|
|
|
|
|
(join ', ', sort keys %opts); |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
48
|
|
|
|
|
96
|
return \%final_opts; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# initialize internal state |
|
76
|
|
|
|
|
|
|
sub _init { |
|
77
|
48
|
|
|
48
|
|
97
|
my ($self) = @_; |
|
78
|
|
|
|
|
|
|
# contains all of the items in the dataset |
|
79
|
48
|
|
|
|
|
121
|
$self->{items} = []; |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# map unique class labels to unique integers; |
|
82
|
|
|
|
|
|
|
# these are the indices of the class labels in class_list below; |
|
83
|
|
|
|
|
|
|
# the indices must start at 1 for AM to work, as 0 is reserved |
|
84
|
|
|
|
|
|
|
# for heterogeneity. |
|
85
|
48
|
|
|
|
|
100
|
$self->{class_num_index} = {}; |
|
86
|
|
|
|
|
|
|
# contains the list of class strings in an order that matches |
|
87
|
|
|
|
|
|
|
# the indices in class_num_index |
|
88
|
48
|
|
|
|
|
81
|
$self->{class_list} = []; |
|
89
|
|
|
|
|
|
|
# the total number of different classes contained in the data set |
|
90
|
48
|
|
|
|
|
79
|
$self->{num_classes} = 0; |
|
91
|
48
|
|
|
|
|
62
|
return; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
#pod =head2 C |
|
95
|
|
|
|
|
|
|
#pod |
|
96
|
|
|
|
|
|
|
#pod Returns the number of features contained in the feature vector of a |
|
97
|
|
|
|
|
|
|
#pod single item. |
|
98
|
|
|
|
|
|
|
#pod |
|
99
|
|
|
|
|
|
|
#pod =cut |
|
100
|
|
|
|
|
|
|
sub cardinality { |
|
101
|
881
|
|
|
881
|
1
|
1412
|
my ($self) = @_; |
|
102
|
881
|
|
|
|
|
2012
|
return $self->{cardinality}; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
#pod =head2 C |
|
106
|
|
|
|
|
|
|
#pod |
|
107
|
|
|
|
|
|
|
#pod Returns the number of items in the data set. |
|
108
|
|
|
|
|
|
|
#pod |
|
109
|
|
|
|
|
|
|
#pod =cut |
|
110
|
|
|
|
|
|
|
sub size { |
|
111
|
830
|
|
|
830
|
1
|
66730
|
my ($self) = @_; |
|
112
|
830
|
|
|
|
|
1087
|
return scalar @{$self->{items}}; |
|
|
830
|
|
|
|
|
2559
|
|
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
#pod =head2 C |
|
116
|
|
|
|
|
|
|
#pod |
|
117
|
|
|
|
|
|
|
#pod Returns the list of all unique class labels in the data set. |
|
118
|
|
|
|
|
|
|
#pod |
|
119
|
|
|
|
|
|
|
#pod =cut |
|
120
|
|
|
|
|
|
|
sub classes { |
|
121
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
|
122
|
0
|
|
|
|
|
0
|
return @{ $self->{class_list} }; |
|
|
0
|
|
|
|
|
0
|
|
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
#pod =head2 C |
|
126
|
|
|
|
|
|
|
#pod |
|
127
|
|
|
|
|
|
|
#pod Adds a new item to the data set. The input may be either an |
|
128
|
|
|
|
|
|
|
#pod L object, or the arguments to create |
|
129
|
|
|
|
|
|
|
#pod one via its constructor (features, class, comment). This method will |
|
130
|
|
|
|
|
|
|
#pod croak if the cardinality of the item does not match L. |
|
131
|
|
|
|
|
|
|
#pod |
|
132
|
|
|
|
|
|
|
#pod =cut |
|
133
|
|
|
|
|
|
|
sub add_item { |
|
134
|
276
|
|
|
276
|
1
|
754
|
my ($self, @args) = @_; |
|
135
|
276
|
|
|
|
|
311
|
my $item; |
|
136
|
276
|
100
|
|
|
|
533
|
if('Algorithm::AM::DataSet::Item' eq ref $args[0]){ |
|
137
|
219
|
|
|
|
|
267
|
$item = $args[0]; |
|
138
|
|
|
|
|
|
|
}else{ |
|
139
|
57
|
|
|
|
|
153
|
$item = Algorithm::AM::DataSet::Item->new(@args); |
|
140
|
|
|
|
|
|
|
} |
|
141
|
|
|
|
|
|
|
|
|
142
|
275
|
100
|
|
|
|
442
|
if($self->cardinality != $item->cardinality){ |
|
143
|
|
|
|
|
|
|
croak 'Expected ' . $self->cardinality . |
|
144
|
|
|
|
|
|
|
' features, but found ' . (scalar $item->cardinality) . |
|
145
|
1
|
|
|
|
|
4
|
' in ' . (join ' ', @{$item->features}) . |
|
|
1
|
|
|
|
|
3
|
|
|
146
|
|
|
|
|
|
|
' (' . $item->comment . ')'; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
|
|
149
|
274
|
100
|
|
|
|
544
|
if(defined $item->class){ |
|
150
|
270
|
|
|
|
|
464
|
$self->_update_class_vars($item->class); |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# store the new item |
|
154
|
274
|
|
|
|
|
335
|
push @{$self->{items}}, $item; |
|
|
274
|
|
|
|
|
502
|
|
|
155
|
274
|
|
|
|
|
472
|
return; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# keep track of classes; needs updating for new item |
|
159
|
|
|
|
|
|
|
sub _update_class_vars { |
|
160
|
270
|
|
|
270
|
|
397
|
my ($self, $class) = @_; |
|
161
|
|
|
|
|
|
|
|
|
162
|
270
|
100
|
|
|
|
514
|
if(!$self->{class_num_index}->{$class}){ |
|
163
|
47
|
|
|
|
|
64
|
$self->{num_classes}++; |
|
164
|
47
|
|
|
|
|
98
|
$self->{class_num_index}->{$class} = $self->{num_classes}; |
|
165
|
47
|
|
|
|
|
66
|
push @{$self->{class_list}}, $class; |
|
|
47
|
|
|
|
|
133
|
|
|
166
|
|
|
|
|
|
|
} |
|
167
|
270
|
|
|
|
|
367
|
return; |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
#pod =head2 C |
|
171
|
|
|
|
|
|
|
#pod |
|
172
|
|
|
|
|
|
|
#pod Return the item at the given index. This will be a |
|
173
|
|
|
|
|
|
|
#pod L object. |
|
174
|
|
|
|
|
|
|
#pod |
|
175
|
|
|
|
|
|
|
#pod =cut |
|
176
|
|
|
|
|
|
|
sub get_item { |
|
177
|
60306
|
|
|
60306
|
1
|
81749
|
my ($self, $index) = @_; |
|
178
|
60306
|
|
|
|
|
116519
|
return $self->{items}->[$index]; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
#pod =head2 C |
|
182
|
|
|
|
|
|
|
#pod |
|
183
|
|
|
|
|
|
|
#pod Returns the number of different classification labels contained in |
|
184
|
|
|
|
|
|
|
#pod the data set. |
|
185
|
|
|
|
|
|
|
#pod |
|
186
|
|
|
|
|
|
|
#pod =cut |
|
187
|
|
|
|
|
|
|
sub num_classes { |
|
188
|
388
|
|
|
388
|
1
|
1456
|
my ($self) = @_; |
|
189
|
388
|
|
|
|
|
1261
|
return $self->{num_classes}; |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Used by AM. Return an arrayref containing all of the |
|
193
|
|
|
|
|
|
|
# classes for the data set (ordered the same as the data set). |
|
194
|
|
|
|
|
|
|
sub _data_classes { |
|
195
|
194
|
|
|
194
|
|
312
|
my ($self) = @_; |
|
196
|
|
|
|
|
|
|
my @classes = map { |
|
197
|
30018
|
50
|
|
|
|
43835
|
defined $_->class ? |
|
198
|
|
|
|
|
|
|
$self->_index_for_class($_->class) : |
|
199
|
|
|
|
|
|
|
undef |
|
200
|
194
|
|
|
|
|
306
|
} @{$self->{items}}; |
|
|
194
|
|
|
|
|
449
|
|
|
201
|
194
|
|
|
|
|
809
|
return \@classes; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Used by AM. Return the integer mapped to the given class string. |
|
205
|
|
|
|
|
|
|
sub _index_for_class { |
|
206
|
60053
|
|
|
60053
|
|
79999
|
my ($self, $class) = @_; |
|
207
|
60053
|
|
|
|
|
103205
|
return $self->{class_num_index}->{$class}; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Used by Result, which traverses data structures from |
|
211
|
|
|
|
|
|
|
# AM's guts. |
|
212
|
|
|
|
|
|
|
sub _class_for_index { |
|
213
|
364
|
|
|
364
|
|
1673
|
my ($self, $index) = @_; |
|
214
|
364
|
|
|
|
|
1125
|
return $self->{class_list}->[$index - 1]; |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
#pod =head2 C |
|
218
|
|
|
|
|
|
|
#pod |
|
219
|
|
|
|
|
|
|
#pod This function may be exported. Given 'path' and 'format' arguments, |
|
220
|
|
|
|
|
|
|
#pod it reads a file containing a dataset and returns a new DataSet object |
|
221
|
|
|
|
|
|
|
#pod with the given data. The 'path' argument should be the path to the |
|
222
|
|
|
|
|
|
|
#pod file. The 'format' argument should be 'commas' or 'nocommas', |
|
223
|
|
|
|
|
|
|
#pod indicating one of the following formats. You may also specify 'unknown' |
|
224
|
|
|
|
|
|
|
#pod and 'null' arguments to indicate the strings meant to represent an |
|
225
|
|
|
|
|
|
|
#pod unknown class value and null feature values. By default these are |
|
226
|
|
|
|
|
|
|
#pod 'UNK' and '='. |
|
227
|
|
|
|
|
|
|
#pod |
|
228
|
|
|
|
|
|
|
#pod The 'commas' file format is shown below: |
|
229
|
|
|
|
|
|
|
#pod |
|
230
|
|
|
|
|
|
|
#pod class , f eat u re s , your comment here |
|
231
|
|
|
|
|
|
|
#pod |
|
232
|
|
|
|
|
|
|
#pod The commas separate the class label, feature values, and comments, |
|
233
|
|
|
|
|
|
|
#pod and the whitespace around the commas is optional. Each feature value |
|
234
|
|
|
|
|
|
|
#pod is separated with whitespace. |
|
235
|
|
|
|
|
|
|
#pod |
|
236
|
|
|
|
|
|
|
#pod The 'nocommas' file format is shown below: |
|
237
|
|
|
|
|
|
|
#pod |
|
238
|
|
|
|
|
|
|
#pod class features your comment here |
|
239
|
|
|
|
|
|
|
#pod |
|
240
|
|
|
|
|
|
|
#pod Here the class, feature values, and comments are separated by |
|
241
|
|
|
|
|
|
|
#pod whitespace. Each feature value must be a single character with no |
|
242
|
|
|
|
|
|
|
#pod separating characters, so here the features are f, e, a, t, u, r, |
|
243
|
|
|
|
|
|
|
#pod e, and s. |
|
244
|
|
|
|
|
|
|
#pod |
|
245
|
|
|
|
|
|
|
#pod Lines beginning with a pound character (C<#>) are ignored. |
|
246
|
|
|
|
|
|
|
#pod |
|
247
|
|
|
|
|
|
|
#pod =cut |
|
248
|
|
|
|
|
|
|
sub dataset_from_file {## no critic (RequireArgUnpacking) |
|
249
|
12
|
|
|
12
|
1
|
7779
|
my (%opts) = ( |
|
250
|
|
|
|
|
|
|
unknown => 'UNK', |
|
251
|
|
|
|
|
|
|
null => '=', |
|
252
|
|
|
|
|
|
|
@_ |
|
253
|
|
|
|
|
|
|
); |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
croak q[Failed to provide 'path' parameter] |
|
256
|
12
|
100
|
|
|
|
55
|
unless exists $opts{path}; |
|
257
|
|
|
|
|
|
|
croak q[Failed to provide 'format' parameter] |
|
258
|
11
|
100
|
|
|
|
42
|
unless exists $opts{format}; |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
my ($path, $format, $unknown, $null) = ( |
|
261
|
10
|
|
|
|
|
28
|
path($opts{path}), @opts{'format', 'unknown', 'null'}); |
|
262
|
|
|
|
|
|
|
|
|
263
|
10
|
100
|
|
|
|
240
|
croak "Could not find file $path" |
|
264
|
|
|
|
|
|
|
unless $path->exists; |
|
265
|
|
|
|
|
|
|
|
|
266
|
9
|
|
|
|
|
237
|
my ($field_sep, $feature_sep); |
|
267
|
9
|
100
|
|
|
|
33
|
if($format eq 'commas'){ |
|
|
|
100
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# class/features/comment separated by a comma |
|
269
|
4
|
|
|
|
|
20
|
$field_sep = qr{\s*,\s*}; |
|
270
|
|
|
|
|
|
|
# features separated by space |
|
271
|
4
|
|
|
|
|
10
|
$feature_sep = qr{\s+}; |
|
272
|
|
|
|
|
|
|
}elsif($format eq 'nocommas'){ |
|
273
|
|
|
|
|
|
|
# class/features/comment separated by space |
|
274
|
4
|
|
|
|
|
16
|
$field_sep = qr{\s+}; |
|
275
|
|
|
|
|
|
|
# no seps for features; each is a single character |
|
276
|
4
|
|
|
|
|
12
|
$feature_sep = qr{}; |
|
277
|
|
|
|
|
|
|
}else{ |
|
278
|
1
|
|
|
|
|
10
|
croak "Unknown value $format for format parameter " . |
|
279
|
|
|
|
|
|
|
q{(should be 'commas' or 'nocommas')}; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
|
|
282
|
8
|
50
|
|
|
|
23
|
if(!defined $unknown){ |
|
283
|
0
|
|
|
|
|
0
|
croak q[Must provide a defined value for 'unknown' parameter]; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
8
|
|
|
|
|
37
|
my $reader = _read_data_sub( |
|
287
|
|
|
|
|
|
|
$path, $unknown, $null, $field_sep, $feature_sep); |
|
288
|
8
|
|
|
|
|
18
|
my $item = $reader->(); |
|
289
|
8
|
50
|
|
|
|
32
|
if(!$item){ |
|
290
|
0
|
|
|
|
|
0
|
croak "No data found in file $path"; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
8
|
|
|
|
|
30
|
my $dataset = __PACKAGE__->new(cardinality => $item->cardinality); |
|
293
|
8
|
|
|
|
|
27
|
$dataset->add_item($item); |
|
294
|
8
|
|
|
|
|
18
|
while($item = $reader->()){ |
|
295
|
189
|
|
|
|
|
334
|
$dataset->add_item($item); |
|
296
|
|
|
|
|
|
|
} |
|
297
|
7
|
|
|
|
|
191
|
return $dataset; |
|
298
|
|
|
|
|
|
|
} |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# return a sub that returns one Item per call from the given FH, |
|
301
|
|
|
|
|
|
|
# and returns undef once the file is done being read. Throws errors |
|
302
|
|
|
|
|
|
|
# on bad file contents. |
|
303
|
|
|
|
|
|
|
# Input is file (Path::Tiny), string representing unknown class, |
|
304
|
|
|
|
|
|
|
# string representing null feature, field separator (class, |
|
305
|
|
|
|
|
|
|
# features, comment) and feature separator |
|
306
|
|
|
|
|
|
|
sub _read_data_sub { |
|
307
|
8
|
|
|
8
|
|
23
|
my ($data_file, $unknown, $null, |
|
308
|
|
|
|
|
|
|
$field_sep, $feature_sep) = @_; |
|
309
|
8
|
|
|
|
|
31
|
my $data_fh = $data_file->openr_utf8; |
|
310
|
8
|
|
|
|
|
34419
|
my $line_num = 0; |
|
311
|
|
|
|
|
|
|
return sub { |
|
312
|
205
|
|
|
205
|
|
238
|
my $line; |
|
313
|
|
|
|
|
|
|
# grab the next non-blank line from the file |
|
314
|
205
|
|
|
|
|
648
|
while($line = <$data_fh>){ |
|
315
|
201
|
|
|
|
|
337
|
$line_num++; |
|
316
|
|
|
|
|
|
|
# skip comments |
|
317
|
201
|
100
|
|
|
|
416
|
next if $line =~ m/^\s*#/; |
|
318
|
|
|
|
|
|
|
# cross-platform chomp |
|
319
|
199
|
|
|
|
|
782
|
$line =~ s/\R$//; |
|
320
|
199
|
|
|
|
|
915
|
$line =~ s/^\s+|\s+$//g; |
|
321
|
199
|
100
|
|
|
|
334
|
last if $line; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
205
|
100
|
|
|
|
333
|
return unless $line; |
|
324
|
198
|
|
|
|
|
900
|
my ($class, $feats, $comment) = split /$field_sep/, $line, 3; |
|
325
|
|
|
|
|
|
|
# the line has to have at least the class label and features |
|
326
|
198
|
100
|
|
|
|
380
|
if(!defined $feats){ |
|
327
|
1
|
|
|
|
|
6
|
croak "Couldn't read data at line $line_num in $data_file"; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
# if the class is specified as unknown, set it to undef to |
|
330
|
|
|
|
|
|
|
# indicate this to Item |
|
331
|
197
|
100
|
|
|
|
318
|
if($class eq $unknown){ |
|
332
|
4
|
|
|
|
|
9
|
undef $class; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
197
|
|
|
|
|
958
|
my @data_vars = split /$feature_sep/, $feats; |
|
336
|
|
|
|
|
|
|
# set null features to '' |
|
337
|
197
|
100
|
|
|
|
348
|
@data_vars = map {$_ eq $null ? '' : $_} @data_vars; |
|
|
1913
|
|
|
|
|
3135
|
|
|
338
|
|
|
|
|
|
|
|
|
339
|
197
|
|
|
|
|
552
|
return Algorithm::AM::DataSet::Item->new( |
|
340
|
|
|
|
|
|
|
features=> \@data_vars, |
|
341
|
|
|
|
|
|
|
class => $class, |
|
342
|
|
|
|
|
|
|
comment => $comment |
|
343
|
|
|
|
|
|
|
); |
|
344
|
8
|
|
|
|
|
55
|
}; |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
1; |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
__END__ |