File Coverage

blib/lib/Algorithm/AM/DataSet.pm

Criterion	Covered	Total	%
statement	118	123	95.9
branch	37	40	92.5
condition			n/a
subroutine	21	22	95.4
pod	8	8	100.0
total	184	193	95.3

line	stmt	bran	sub	pod	time	code
1						package Algorithm::AM::DataSet;
2	10		10		53135	use strict;
	10				31
	10				269
3	10		10		46	use warnings;
	10				19
	10				418
4						our $VERSION = '3.11';
5						# ABSTRACT: Manage data used by Algorithm::AM
6	10		10		49	use Carp;
	10				18
	10				580
7	10		10		2690	use Algorithm::AM::DataSet::Item;
	10				26
	10				436
8	10		10		5006	use Path::Tiny;
	10				84095
	10				639
9						use Exporter::Easy (
10	10				76	OK => ['dataset_from_file']
11	10		10		88	);
	10				20
12
13						#pod =head1 SYNOPSIS
14						#pod
15						#pod use Algorithm::AM::DataSet 'dataset_from_file';
16						#pod use Algorithm::AM::DataSet::Item 'new_item';
17						#pod my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
18						#pod # or
19						#pod $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');
20						#pod $dataset->add_item(
21						#pod new_item(features => [qw(a b c d e f g h i)]));
22						#pod my $item = $dataset->get_item(2);
23						#pod
24						#pod =head1 DESCRIPTION
25						#pod
26						#pod This package contains a list of items that can be used by
27						#pod L or L for classification.
28						#pod DataSets can be made one item at a time via the L method,
29						#pod or they can be read from files via the L function.
30						#pod
31						#pod =head2 C
32						#pod
33						#pod Creates a new DataSet object. You must provide a C argument
34						#pod indicating the number of features to be contained in each data vector.
35						#pod You can then add items via the add_item method. Each item will contain
36						#pod a feature vector, and also optionally a class label and a comment
37						#pod (also called a "spec").
38						#pod
39						#pod =cut
40						sub new {
41	50		50	1	30875	my ($class, %opts) = @_;
42
43	50				161	my $new_opts = _check_opts(%opts);
44
45	48				94	my $self = bless $new_opts, $class;
46
47	48				137	$self->_init;
48
49	48				137	return $self;
50						}
51
52						# check the options for validity
53						# Return an option hash to initialize $self with
54						# For now only 'cardinality' is allowed/required.
55						sub _check_opts {
56	50		50		114	my (%opts) = @_;
57
58	50				73	my %final_opts;
59
60	50	100			132	if(!defined $opts{cardinality}){
61	1				16	croak q{Failed to provide 'cardinality' parameter};
62						}
63	49				97	$final_opts{cardinality} = $opts{cardinality};
64	49				74	delete $opts{cardinality};
65
66	49	100			115	if(keys %opts){
67						# sort the keys in the error message to make testing possible
68	1				14	croak 'Unknown parameters in DataSet constructor: ' .
69						(join ', ', sort keys %opts);
70						}
71
72	48				96	return \%final_opts;
73						}
74
75						# initialize internal state
76						sub _init {
77	48		48		97	my ($self) = @_;
78						# contains all of the items in the dataset
79	48				121	$self->{items} = [];
80
81						# map unique class labels to unique integers;
82						# these are the indices of the class labels in class_list below;
83						# the indices must start at 1 for AM to work, as 0 is reserved
84						# for heterogeneity.
85	48				100	$self->{class_num_index} = {};
86						# contains the list of class strings in an order that matches
87						# the indices in class_num_index
88	48				81	$self->{class_list} = [];
89						# the total number of different classes contained in the data set
90	48				79	$self->{num_classes} = 0;
91	48				62	return;
92						}
93
94						#pod =head2 C
95						#pod
96						#pod Returns the number of features contained in the feature vector of a
97						#pod single item.
98						#pod
99						#pod =cut
100						sub cardinality {
101	881		881	1	1412	my ($self) = @_;
102	881				2012	return $self->{cardinality};
103						}
104
105						#pod =head2 C
106						#pod
107						#pod Returns the number of items in the data set.
108						#pod
109						#pod =cut
110						sub size {
111	830		830	1	66730	my ($self) = @_;
112	830				1087	return scalar @{$self->{items}};
	830				2559
113						}
114
115						#pod =head2 C
116						#pod
117						#pod Returns the list of all unique class labels in the data set.
118						#pod
119						#pod =cut
120						sub classes {
121	0		0	1	0	my ($self) = @_;
122	0				0	return @{ $self->{class_list} };
	0				0
123						}
124
125						#pod =head2 C
126						#pod
127						#pod Adds a new item to the data set. The input may be either an
128						#pod L object, or the arguments to create
129						#pod one via its constructor (features, class, comment). This method will
130						#pod croak if the cardinality of the item does not match L.
131						#pod
132						#pod =cut
133						sub add_item {
134	276		276	1	754	my ($self, @args) = @_;
135	276				311	my $item;
136	276	100			533	if('Algorithm::AM::DataSet::Item' eq ref $args[0]){
137	219				267	$item = $args[0];
138						}else{
139	57				153	$item = Algorithm::AM::DataSet::Item->new(@args);
140						}
141
142	275	100			442	if($self->cardinality != $item->cardinality){
143						croak 'Expected ' . $self->cardinality .
144						' features, but found ' . (scalar $item->cardinality) .
145	1				4	' in ' . (join ' ', @{$item->features}) .
	1				3
146						' (' . $item->comment . ')';
147						}
148
149	274	100			544	if(defined $item->class){
150	270				464	$self->_update_class_vars($item->class);
151						}
152
153						# store the new item
154	274				335	push @{$self->{items}}, $item;
	274				502
155	274				472	return;
156						}
157
158						# keep track of classes; needs updating for new item
159						sub _update_class_vars {
160	270		270		397	my ($self, $class) = @_;
161
162	270	100			514	if(!$self->{class_num_index}->{$class}){
163	47				64	$self->{num_classes}++;
164	47				98	$self->{class_num_index}->{$class} = $self->{num_classes};
165	47				66	push @{$self->{class_list}}, $class;
	47				133
166						}
167	270				367	return;
168						}
169
170						#pod =head2 C
171						#pod
172						#pod Return the item at the given index. This will be a
173						#pod L object.
174						#pod
175						#pod =cut
176						sub get_item {
177	60306		60306	1	81749	my ($self, $index) = @_;
178	60306				116519	return $self->{items}->[$index];
179						}
180
181						#pod =head2 C
182						#pod
183						#pod Returns the number of different classification labels contained in
184						#pod the data set.
185						#pod
186						#pod =cut
187						sub num_classes {
188	388		388	1	1456	my ($self) = @_;
189	388				1261	return $self->{num_classes};
190						}
191
192						# Used by AM. Return an arrayref containing all of the
193						# classes for the data set (ordered the same as the data set).
194						sub _data_classes {
195	194		194		312	my ($self) = @_;
196						my @classes = map {
197	30018	50			43835	defined $_->class ?
198						$self->_index_for_class($_->class) :
199						undef
200	194				306	} @{$self->{items}};
	194				449
201	194				809	return \@classes;
202						}
203
204						# Used by AM. Return the integer mapped to the given class string.
205						sub _index_for_class {
206	60053		60053		79999	my ($self, $class) = @_;
207	60053				103205	return $self->{class_num_index}->{$class};
208						}
209
210						# Used by Result, which traverses data structures from
211						# AM's guts.
212						sub _class_for_index {
213	364		364		1673	my ($self, $index) = @_;
214	364				1125	return $self->{class_list}->[$index - 1];
215						}
216
217						#pod =head2 C
218						#pod
219						#pod This function may be exported. Given 'path' and 'format' arguments,
220						#pod it reads a file containing a dataset and returns a new DataSet object
221						#pod with the given data. The 'path' argument should be the path to the
222						#pod file. The 'format' argument should be 'commas' or 'nocommas',
223						#pod indicating one of the following formats. You may also specify 'unknown'
224						#pod and 'null' arguments to indicate the strings meant to represent an
225						#pod unknown class value and null feature values. By default these are
226						#pod 'UNK' and '='.
227						#pod
228						#pod The 'commas' file format is shown below:
229						#pod
230						#pod class , f eat u re s , your comment here
231						#pod
232						#pod The commas separate the class label, feature values, and comments,
233						#pod and the whitespace around the commas is optional. Each feature value
234						#pod is separated with whitespace.
235						#pod
236						#pod The 'nocommas' file format is shown below:
237						#pod
238						#pod class features your comment here
239						#pod
240						#pod Here the class, feature values, and comments are separated by
241						#pod whitespace. Each feature value must be a single character with no
242						#pod separating characters, so here the features are f, e, a, t, u, r,
243						#pod e, and s.
244						#pod
245						#pod Lines beginning with a pound character (C<#>) are ignored.
246						#pod
247						#pod =cut
248						sub dataset_from_file {## no critic (RequireArgUnpacking)
249	12		12	1	7779	my (%opts) = (
250						unknown => 'UNK',
251						null => '=',
252						@_
253						);
254
255						croak q[Failed to provide 'path' parameter]
256	12	100			55	unless exists $opts{path};
257						croak q[Failed to provide 'format' parameter]
258	11	100			42	unless exists $opts{format};
259
260						my ($path, $format, $unknown, $null) = (
261	10				28	path($opts{path}), @opts{'format', 'unknown', 'null'});
262
263	10	100			240	croak "Could not find file $path"
264						unless $path->exists;
265
266	9				237	my ($field_sep, $feature_sep);
267	9	100			33	if($format eq 'commas'){
		100
268						# class/features/comment separated by a comma
269	4				20	$field_sep = qr{\s,\s};
270						# features separated by space
271	4				10	$feature_sep = qr{\s+};
272						}elsif($format eq 'nocommas'){
273						# class/features/comment separated by space
274	4				16	$field_sep = qr{\s+};
275						# no seps for features; each is a single character
276	4				12	$feature_sep = qr{};
277						}else{
278	1				10	croak "Unknown value $format for format parameter " .
279						q{(should be 'commas' or 'nocommas')};
280						}
281
282	8	50			23	if(!defined $unknown){
283	0				0	croak q[Must provide a defined value for 'unknown' parameter];
284						}
285
286	8				37	my $reader = _read_data_sub(
287						$path, $unknown, $null, $field_sep, $feature_sep);
288	8				18	my $item = $reader->();
289	8	50			32	if(!$item){
290	0				0	croak "No data found in file $path";
291						}
292	8				30	my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
293	8				27	$dataset->add_item($item);
294	8				18	while($item = $reader->()){
295	189				334	$dataset->add_item($item);
296						}
297	7				191	return $dataset;
298						}
299
300						# return a sub that returns one Item per call from the given FH,
301						# and returns undef once the file is done being read. Throws errors
302						# on bad file contents.
303						# Input is file (Path::Tiny), string representing unknown class,
304						# string representing null feature, field separator (class,
305						# features, comment) and feature separator
306						sub _read_data_sub {
307	8		8		23	my ($data_file, $unknown, $null,
308						$field_sep, $feature_sep) = @_;
309	8				31	my $data_fh = $data_file->openr_utf8;
310	8				34419	my $line_num = 0;
311						return sub {
312	205		205		238	my $line;
313						# grab the next non-blank line from the file
314	205				648	while($line = <$data_fh>){
315	201				337	$line_num++;
316						# skip comments
317	201	100			416	next if $line =~ m/^\s*#/;
318						# cross-platform chomp
319	199				782	$line =~ s/\R$//;
320	199				915	$line =~ s/^\s+\|\s+$//g;
321	199	100			334	last if $line;
322						}
323	205	100			333	return unless $line;
324	198				900	my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
325						# the line has to have at least the class label and features
326	198	100			380	if(!defined $feats){
327	1				6	croak "Couldn't read data at line $line_num in $data_file";
328						}
329						# if the class is specified as unknown, set it to undef to
330						# indicate this to Item
331	197	100			318	if($class eq $unknown){
332	4				9	undef $class;
333						}
334
335	197				958	my @data_vars = split /$feature_sep/, $feats;
336						# set null features to ''
337	197	100			348	@data_vars = map {$_ eq $null ? '' : $_} @data_vars;
	1913				3135
338
339	197				552	return Algorithm::AM::DataSet::Item->new(
340						features=> \@data_vars,
341						class => $class,
342						comment => $comment
343						);
344	8				55	};
345						}
346
347						1;
348
349						__END__