File Coverage

blib/lib/Statistics/Data.pm
Criterion Covered Total %
statement 178 340 52.3
branch 92 180 51.1
condition 16 40 40.0
subroutine 29 44 65.9
pod 25 25 100.0
total 340 629 54.0


line stmt bran cond sub pod time code
1             package Statistics::Data;
2 4     4   65520 use strict;
  4         8  
  4         155  
3 4     4   16 use warnings FATAL => 'all';
  4         4  
  4         160  
4 4     4   18 use Carp qw(croak);
  4         13  
  4         390  
5 4     4   2385 use List::AllUtils qw(all);
  4         46715  
  4         363  
6 4     4   2283 use Number::Misc qw(is_even);
  4         4320  
  4         230  
7 4     4   22 use Scalar::Util qw(looks_like_number);
  4         5  
  4         321  
8 4     4   2090 use String::Util qw(hascontent nocontent);
  4         17036  
  4         13203  
9             our $VERSION = '0.09';
10            
11             =head1 NAME
12            
13             Statistics::Data - Load, access, update, check and save one or more sequences of data for statistical analysis
14            
15             =head1 VERSION
16            
17             This is documentation for Version 0.09 of Statistics/Data.pm, released February 2015.
18            
19             =head1 SYNOPSIS
20            
21             use Statistics::Data 0.09;
22             my $dat = Statistics::Data->new();
23            
24             # managing labelled sequences:
25             $dat->load({'aname' => \@data1, 'anothername' => \@data2}); # labels are arbitrary
26             $aref = $dat->access(label => 'aname'); # gets back a copy of @data1
27             $dat->add(aname => [2, 3]); # pushes new values onto loaded copy of @data1
28             $dat->dump_list(); # print to check if both arrays are loaded and their number of elements
29             $dat->unload(label => 'anothername'); # only 'aname' data remains loaded
30             $aref = $dat->access(label => 'aname'); # $aref is a reference to a copy of @data1
31             $dat->dump_vals(label => 'aname', delim => ','); # proof in print it's back
32            
33             # managing multiple anonymous sequences:
34             $dat->load(\@data1, \@data2); # any number of anonymous arrays
35             $dat->add([2], [6]); # pushes a single value apiece onto copies of @data1 and @data2
36             $aref = $dat->access(index => 1); # returns reference to copy of @data2, with its new values
37             $dat->unload(index => 0); # only @data2 remains loaded, and its index is now 0
38            
39             # managing single anonymous sequence, over time:
40             $dat->load(1, 2, 2);
41             $dat->add(1); # loaded sequence is now 1, 2, 2, 1
42             $dat->dump_vals(); # same as: print @{$dat->access()}, "\n";
43             $dat->save_to_file(path => 'variables1.dat'); # write to disk
44             $dat->unload(); # all gone - go wild, knowing that you can ...
45             $dat->load_from_path(path => 'variables1.dat'); # back again - go to work
46             $dat->dump_vals(); # proof: same printed output as before
47             # do more work
48            
49             =head1 DESCRIPTION
50            
51             Handles data for some other statistics modules, as in loading, updating and retrieving data for analysis. Performs no actual statistical analysis itself.
52            
53             Rationale is not wanting to write the same or similar load, add, etc. methods for every statistics module, not to provide an omnibus API for Perl stat modules. It, however, encompasses much of the variety of how Perl stats modules do the basic handling their data. Used for L (and its sub-tests).
54            
55             =head1 SUBROUTINES/METHODS
56            
57             Manages caches of one or more lists of data for use by some other statistics modules. The lists are ordered arrays comprised of literal scalars (numbers, strings). They can be loaded, added to (updated), accessed or unloaded by referring to the index (order) in which they have been loaded (or previously added to), or by a particular label. The lists are cached within the class object's '_DATA' aref as an aref itself, optionally associated with a 'label'. The particular structures supported here to load, update, retrieve, unload data are specified under L. Any module that uses this one as its base can still use its own rules to select the appropriate sequence, or provide the appropriate sequence within the call to itself.
58            
59             =head2 Constructors
60            
61             =head3 new
62            
63             $dat = Statistics::Data->new();
64            
65             Returns a new Statistics::Data object.
66            
67             =cut
68            
69             sub new {
70 4     4 1 34 my $class = shift;
71 4 50       17 my $self = bless {}, ref($class) ? ref($class) : $class;
72 4         21 $self->{_DATA} = [];
73 4         12 return $self;
74             }
75            
76             =head3 clone
77            
78             $new_self = $dat->clone();
79            
80             I: B
81            
82             Returns a copy of the class object with its data loaded (if any). Note this is not a copy of any particular data but the whole blessed hash. Alternatively, use L to get all the data added to a new object, or use L to load/add particular arrays of data into another object. Nothing modified in this new object affects the original.
83            
84             =cut
85            
86             sub clone {
87 0     0 1 0 my $self = shift;
88 0         0 require Clone;
89 0         0 return Clone::clone($self);
90             }
91             *copy = \*clone;
92            
93             =head2 Setting data
94            
95             Methods to cache and uncache data into the data-object.
96            
97             =head3 load
98            
99             $dat->load(@data); # CASE 1 - can be updated/retrieved anonymously, or as index => i (load order)
100             $dat->load(\@data); # CASE 2 - same, as aref
101             $dat->load(data => \@data); # CASE 3 - updated/retrieved as label => 'data' (arbitrary name, not just 'data'); or by index (order)
102             $dat->load({ data => \@data }) # CASE 4 - same as CASE 4, as hashref
103             $dat->load(blues => \@blue_data, reds => \@red_data); # CASE 5 - same as CASE 3 but with multiple named loads
104             $dat->load({ blues => \@blue_data, reds => \@red_data }); # CASE 6 - same as CASE 5 bu as hashref
105             $dat->load(\@blue_data, \@red_data); # CASE 7 - same as CASE 2 but with multiple aref loads
106            
107             # Not supported:
108             #$dat->load(data => @data); # not OK - use CASE 3 instead
109             #$dat->load([\@blue_data, \@red_data]); # not OK - use CASE 7 instead
110             #$dat->load([ [blues => \@blue_data], [reds => \@red_data] ]); # not OK - use CASE 5 or CASE 6 instead
111             #$dat->load(blues => \@blue_data, reds => [\@red_data1, \@red_data2]); # not OK - too mixed to make sense
112            
113             I: B
114            
115             Cache a list of data as an array-reference. Each call removes previous loads, as does sending nothing. If data need to be cached without unloading previous loads, try L. Arguments with the following structures are acceptable as data, and will be Lible by either index or label as expected:
116            
117             =over 4
118            
119             =item load ARRAY
120            
121             Load an anonymous array that has no named values. For example:
122            
123             $dat->load(1, 4, 7);
124             $dat->load(@ari);
125            
126             This is loaded as a single sequence, with an undefined label, and indexed as 0. Note that trying to load a labelled dataset with an unreferenced array is wrong for it will be treated like this case - the label will be "folded" into the sequence itself.
127            
128             =item load AREF
129            
130             Load a reference to a single anonymous array that has no named values, e.g.:
131            
132             $dat->load([1, 4, 7]);
133             $dat->load(\@ari);
134            
135             This is loaded as a single sequence, with an undefined label, and indexed as 0.
136            
137             =item load ARRAY of AREF(s)
138            
139             Same as above, but note that more than one unlabelled array-reference can also be loaded at once, e.g.:
140            
141             $dat->load([1, 4, 7], [2, 5, 9]);
142             $dat->load(\@ari1, \@ari2);
143            
144             Each sequence can be accessed, using L, by specifying B => index, the latter value representing the order in which these arrays were loaded.
145            
146             =item load HASH of AREF(s)
147            
148             Load one or more labelled references to arrays, e.g.:
149            
150             $dat->load('dist1' => [1, 4, 7]);
151             $dat->load('dist1' => [1, 4, 7], 'dist2' => [2, 5, 9]);
152            
153             This loads the sequence(s) with a label attribute, so that when calling L, they can be retrieved by name, e.g., passing B
154            
155             =item load HASHREF of AREF(s)
156            
157             As above, but where the hash is referenced, e.g.:
158            
159             $dat->load({'dist1' => [1, 4, 7], 'dist2' => [2, 5, 9]});
160            
161             =back
162            
163             This means that using the following forms will produce unexpected results, if they do not actually croak, and so should not be used:
164            
165             $dat->load(data => @data); # no croak but wrong - puts "data" in @data - use \@data
166             $dat->load([\@blue_data, \@red_data]); # use unreferenced ARRAY of AREFs instead
167             $dat->load([ [blues => \@blue_data], [reds => \@red_data] ]); # treated as single AREF; use HASH of AREFs instead
168             $dat->load(blues => \@blue_data, reds => [\@red_data1, \@red_data2]); # mixed structures not supported
169            
170             =cut
171            
172             sub load
173             { # load single aref: cannot load more than one sequence; keeps a direct reference to the data: any edits creep back.
174 17     17 1 14121 my ( $self, @args ) = @_;
175 17         66 $self->unload();
176 17         43 $self->add(@args);
177 17         38 return 1;
178             }
179             *load_data = \&load;
180            
181             =head3 add
182            
183             I: B, B, B
184            
185             Same usage as above for L. Just push any value(s) or so along, or loads an entirely labelled sequence, without clobbering what's already in there (as L would). If data have not been loaded with a label, then appending data to them happens according to the order of array-refs set here, see L could even skip adding something to one previously loaded sequence by, e.g., going $dat->add([], \new_data) - adding nothing to the first loaded sequence, and initialising a second array, if none already, or appending these data to it.
186            
187             =cut
188            
189             sub add {
190 29     29 1 7010 my ( $self, @args ) = @_;
191 29         63 my $tmp = _init_data( $self, @args )
192             ; # hashref of data sequence(s) keyed by index to use for loading or adding
193 29         32 while ( my ( $i, $val ) = each %{$tmp} ) {
  62         159  
194 33 100       61 if ( defined $val->{'lab'} ) { # newly labelled data
195 16         63 $self->{_DATA}->[$i] =
196             { seq => $val->{'seq'}, lab => $val->{'lab'} };
197             }
198             else
199             { # data to be added to existing cache, or an anonymous load, indexed only
200 17         18 push @{ $self->{_DATA}->[$i]->{'seq'} }, @{ $val->{'seq'} };
  17         84  
  17         48  
201             }
202             }
203 29         73 return;
204             }
205             *add_data = \&add;
206             *append_data = \&add;
207             *update = \&add;
208            
209             =head3 unload
210            
211             $dat->unload(); # deletes all cached data, named or not
212             $dat->unload(index => integer); # deletes the aref named 'data' whatever
213             $dat->unload(label => 'a name'); # deletes the aref named 'data' whatever
214            
215             Empty, clear, clobber what's in there. Croaks if given index or label does not refer to any loaded data. This should be used whenever any already loaded or added data are no longer required ahead of another L, including via L or L.
216            
217             =cut
218            
219             sub unload {
220 19     19 1 574 my ( $self, @args ) = @_;
221 19 100       41 if ( !$args[0] ) {
222 18         34 $self->{_DATA} = [];
223             }
224             else {
225 1         2 splice @{ $self->{_DATA} }, _index_by_args( $self, @args ), 1;
  1         2  
226             }
227 19         48 return;
228             }
229            
230             =head3 share
231            
232             $dat_new->share($dat_old);
233            
234             Adds all the data from one Statistics::Data object to another. Changes in the new copies do not affect the originals.
235            
236             =cut
237            
238             sub share {
239 1     1 1 4 my ( $self, $other ) = @_;
240 1         4 _add_from_object_aref( $self, $other->{_DATA} );
241 1         1 return 1;
242             }
243            
244             =head3 load_from_file
245            
246             $dat->load_from_file(path => 'medata.csv', format => 'xml|csv');
247             $dat->load_from_file(path => 'mysequences.csv', serializer => 'XML::Simple', compress => 1, secret => '123'); # serialization options
248            
249             Loads data from a file, assuming there are data in the given path that have been saved in the format used in L. Basically a wrapper to L method in L; cf. for options; and then to L. If the data retreived are actually to be added to any data already cached via a previous load or add, define the optional parameter B => 1.
250            
251             =cut
252            
253             sub load_from_file {
254 0     0 1 0 my ( $self, @args ) = @_;
255 0 0       0 my $args = ref $args[0] ? $args[0] : {@args};
256 0 0 0     0 croak 'There is no path for loading data'
257             if nocontent( $args->{'path'} )
258             || !-e $args->{'path'}; # is filepath valid?
259 0         0 require Data::Serializer;
260 0         0 my $serializer = Data::Serializer->new( %{$args} );
  0         0  
261 0         0 my $href = $serializer->retrieve( $args->{'path'} );
262 0 0       0 $self->unload() unless $args->{'keep'};
263 0         0 _add_from_object_aref( $self, $href->{DATA} );
264 0         0 return 1;
265             }
266            
267             =head2 Getting data
268            
269             To retrieve what has been previously loaded, simply call L, specifying the "label" or "index" that was used to load/add the data - i.e., when loaded as a hashref or an arrayref, respectively; specifying the list by B
270            
271             For retrieving more than one previously loaded dataset, use one of the "get" methods, choosing between getting back a hash- or an array-ref, or to get back a single list, as by L, after all. These "get" methods only support retrieving data loaded as hashrefs for now; use L to get back index-specific loads. They might be folded within L later on.
272            
273             =head3 access
274            
275             $aref = $dat->access(); #returns the first and/or only sequence loaded, if any
276             $aref = $dat->access(index => integer); #returns the ith sequence loaded
277             $aref = $dat->access(label => 'a_name'); # returns a particular named cache of data
278            
279             I: B
280            
281             Returns one referenced array being previously loaded/added to data by the given B (in a flat-list load) or B
282            
283             =cut
284            
285             sub access {
286 24     24 1 7123 my ( $self, @args ) = @_;
287 24         67 return $self->{_DATA}->[ _index_by_args( $self, @args ) ]->{'seq'};
288             }
289             *read = \&access; # legacy only
290            
291             =head3 get_hoa, get_hoa_by_lab
292            
293             $href = $data->get_hoa(lab => [qw/fez boa/]); # retrieve 1 or more named data
294             $href = $data->get_hoa(); # retrieve all named data
295            
296             Returns a hashref of arefs, where the keys are the names of the data, as previously given in a load, and the values are arefs of the list of data that has been loaded for that name.
297            
298             The optional argument B should be a reference to a list of one or more data that have been given as keys in a hash-wise L. Any elements in this list that have not been used as names in a load are ignored. If none of the names has been used, an empty list is returned. If there is no B argument, then all of the loaded data are returned as a hashref of arefs; if there were no named data, this a reference to an empty hash.
299            
300             This is useful in a module like L that needs to continuously cross-refer to multiple variables to make a single calculation while also being able to distinguish them by some meaningful key other than simply an index number.
301            
302             For working with numerical data in particular, see the following two methods.
303            
304             =cut
305            
306             sub get_hoa_by_lab {
307 0     0 1 0 my ( $self, %args ) = @_;
308 0 0 0     0 $args{'lab'} = [ $args{'lab'} ]
309             if hascontent( $args{'lab'} )
310             and not ref $args{'lab'};
311 0         0 my %data = ();
312 0 0       0 if ( !ref $args{'lab'} ) { # get all data
313 0         0 for my $i ( 0 .. $self->ndata() - 1 ) {
314 0 0       0 if ( hascontent( $self->{_DATA}->[$i]->{'lab'} ) ) {
315 0         0 $data{ $self->{_DATA}->[$i]->{'lab'} } =
316             $self->{_DATA}->[$i]->{'seq'};
317             }
318             }
319             }
320             else { # get named data
321 0         0 for my $i ( 0 .. scalar @{ $args{'lab'} } - 1 )
  0         0  
322             { # assume ref eq 'ARRAY'
323 0         0 my $j = _seq_index_by_label( $self, $args{'lab'}->[$i] )
324             ; # is name loaded with data?
325 0 0       0 if ( defined $j ) {
326 0         0 $data{ $args{'lab'}->[$i] } = $self->{_DATA}->[$j]->{'seq'};
327             } # else ignore the given name
328             }
329             }
330 0 0       0 return wantarray ? %data : \%data;
331             }
332             *get_hoa = \&get_hoa_by_lab;
333            
334             =head3 get_hoa_by_lab_numonly_indep
335            
336             $hoa = $dat->get_hoa_by_lab_numonly_indep(); # same as get_hoa but each list culled of NaNs
337            
338             Returns the variables given in the argument B (an aref of strings) culled on any empty or non-numeric values. This is done by treating each variable indpendently, with culls on one "list" not creating a cull on any other. This is the type of data useful for an independent ANOVA.
339            
340             =cut
341            
342             sub get_hoa_by_lab_numonly_indep {
343 0     0 1 0 my ( $self, %args ) = @_;
344 0         0 return _cull_hoa_indep( scalar $self->get_hoa_by_lab(%args),
345             \$self->{'purged'} );
346             }
347            
348             =head3 get_hoa_by_lab_numonly_across
349            
350             $hoa = $dat->get_hoa_by_lab_numonly_across(); # same as get_hoa but each list culled of NaNs at same i across lists
351            
352             Returns hashref of previously loaded variable data (as arefs) culled of an empty or non-numerical values whereby even a valid value in one list is culled if it is at an index that is invalid in another list. This is the type of data useful for a dependent ANOVA.
353            
354             =cut
355            
356             sub get_hoa_by_lab_numonly_across {
357 0     0 1 0 my ( $self, %args ) = @_;
358 0         0 return _cull_hoa_across( scalar $self->get_hoa_by_lab(%args),
359             \$self->{'purged'} );
360             }
361            
362             =head3 get_aoa, get_aoa_by_lab
363            
364             $aref_of_arefs = $dat->get_aoa_by_lab();
365            
366             Returns a reference to an array where each value is itself an array of data, as separately loaded under a different name or anonymously, in the order that they were loaded. If no B value is defined, all the loaded data are returned as a list of arefs.
367            
368             =cut
369            
370             sub get_aoa_by_lab {
371 0     0 1 0 my ( $self, %args ) = @_;
372 0 0 0     0 $args{'lab'} = [ $args{'lab'} ]
373             if hascontent( $args{'lab'} )
374             and not ref $args{'lab'};
375 0         0 my @data = ();
376 0 0       0 if ( !ref $args{'lab'} ) { # get all data
377 0         0 for my $i ( 0 .. $self->ndata() - 1 ) {
378 0         0 $data[$i] = $self->{_DATA}->[$i]->{'seq'};
379             }
380             }
381             else { # get named data
382 0         0 for my $i ( 0 .. scalar @{ $args{'lab'} } - 1 )
  0         0  
383             { # assume ref eq 'ARRAY'
384 0         0 my $j = _seq_index_by_label( $self, $args{'lab'}->[$i] )
385             ; # is name loaded with data?
386 0 0       0 if ( defined $j ) {
387 0         0 $data[$i] = $self->{_DATA}->[$j]->{'seq'};
388             } # else ignore the given name
389             }
390             }
391 0 0       0 return wantarray ? @data : \@data; # unreferenced for chance legacy for now
392             }
393             *get_aoa = \&get_aoa_by_lab;
394            
395             =head3 get_aref_by_lab
396            
397             $aref = $dat->get_aref_by_lab();
398            
399             Returns a reference to a single, previously loaded hashref of arrayed of data, as specified in the named argument B. The array is empty if no data have been loaded, or if there is none with the given B. If B is not defined, the the last-loaded data, if any, is returned (as aref).
400            
401             =cut
402            
403             sub get_aref_by_lab {
404 0     0 1 0 my ( $self, %args ) = @_;
405 0         0 my $aref = [];
406 0 0 0     0 if ( nocontent( $args{'lab'} ) && ref $self->{_DATA}->[-1]->{'seq'} ) {
407 0         0 $aref = $self->{_DATA}->[-1]->{'seq'};
408             }
409             else {
410 0         0 my $j = _seq_index_by_label( $self, $args{'lab'} )
411             ; # is name loaded with data?
412 0 0       0 if ( defined $j ) {
413 0         0 $aref = $self->{_DATA}->[$j]->{'seq'};
414             }
415             }
416 0         0 return $aref;
417             }
418            
419             =head3 ndata
420            
421             $n = $dat->ndata();
422            
423             Returns the number of loaded variables.
424            
425             =cut
426            
427             sub ndata {
428 6     6 1 1982 my $self = shift;
429 6         8 return scalar( @{ $self->{'_DATA'} } );
  6         28  
430             }
431            
432             =head3 labels
433            
434             $aref = $dat->labels();
435            
436             Returns a reference to an array of all the datanames (labels), if any.
437            
438             =cut
439            
440             sub labels {
441 0     0 1 0 my $self = shift;
442 0         0 my @names = ();
443 0         0 for ( 0 .. scalar @{ $self->{'_DATA'} } - 1 ) {
  0         0  
444 0 0       0 push @names, $self->{'_DATA'}->[$_]->{'lab'}
445             if hascontent( $self->{'_DATA'}->[$_]->{'lab'} );
446             }
447 0         0 return \@names;
448             }
449            
450             =head2 Checking data
451            
452             =head3 all_full
453            
454             $bool = $dat->all_full(\@data); # test data are valid before loading them
455             $bool = $dat->all_full(label => 'mydata'); # checking after loading/adding the data (or key in 'index')
456            
457             Checks not only if the data sequence, as named or indexed, exists, but if it is non-empty: has no empty elements, with any elements that might exist in there being checked with L.
458            
459             =cut
460            
461             sub all_full {
462 3     3 1 3988 my ( $self, @args ) = @_;
463 3 50       13 my $data = ref $args[0] ? shift @args : $self->access(@args);
464 3         5 my ( $bool, @vals ) = ();
465 3         5 foreach ( @{$data} ) {
  3         9  
466 24 100       36 $bool = nocontent($_) ? 0 : 1;
467 24 100       151 if (wantarray) {
468 5 100       13 push @vals, $_ if $bool;
469             }
470             else {
471 19 100       38 last if $bool == 0;
472             }
473             }
474 3 100       10 return wantarray ? ( \@vals, $bool ) : $bool;
475             }
476            
477             =head3 all_numeric
478            
479             $bool = $dat->all_numeric(); # test data first-loaded, if any
480             $bool = $dat->all_numeric(\@data); # test these data are valid before loading them
481             $bool = $dat->all_numeric(label => 'mydata'); # check specific data after loading/adding them by a 'label' or by their 'index' order
482             ($aref, $bool) = $dat->all_numeric([3, '', 4.7, undef, 'b']); # returns ([3, 4.7], 0); - same for any loaded data
483            
484             Given an aref of data, or reference to data previously loaded (see L), tests numeracy of each element, and return, if called in scalar context, a boolean scalar indicating if all data in this aref are defined and not empty (using C in L), and, if they have content, if these are all numerical, using C in L. Alternatively, if called in array context, returns the data (as an aref) less any values that failed this test, followed by the boolean.
485            
486             =cut
487            
488             sub all_numeric {
489 7     7 1 3055 my ( $self, @args ) = @_;
490 7         7 my ( $data, $bool ) = ();
491 7 100       13 if ( ref $args[0] eq 'ARRAY' ) {
492 3         5 $data = shift @args;
493             }
494             else {
495 4         9 $data = $self->{_DATA}->[ _index_by_args( $self, @args ) ]->{'seq'};
496             }
497 7         11 my (@vals) = ();
498 7         6 foreach ( @{$data} ) {
  7         12  
499 58 100 100     80 $bool = ( nocontent($_) or not looks_like_number($_) ) ? 0 : 1;
500 58 100       428 if (wantarray) {
501 5 100       7 push @vals, $_ if $bool;
502             }
503             else {
504 53 100       156 last if $bool == 0;
505             }
506 53         58 $data = \@vals;
507             }
508 7 100       21 return wantarray ? ( $data, $bool ) : $bool;
509             }
510             *all_numerical = \&all_numeric;
511            
512             =head3 all_proportions
513            
514             $bool = $dat->all_proportions(\@data); # test data are valid before loading them
515             $bool = $dat->all_proportions(label => 'mydata'); # checking after loading/adding the data (or key in 'index')
516            
517             Ensure data are all proportions. Sometimes, the data a module needs are all proportions, ranging from 0 to 1 inclusive. A dataset might have to be cleaned
518            
519             =cut
520            
521             sub all_proportions {
522 5     5 1 796 my ( $self, @args ) = @_;
523 5 100       15 my $data = ref $args[0] ? shift @args : $self->access(@args);
524 5         5 my ( $bool, @vals ) = ();
525 5         9 foreach ( @{$data} ) {
  5         9  
526 15 100       18 if ( nocontent($_) ) {
    100          
527 3         15 $bool = 0;
528             }
529             elsif ( looks_like_number($_) ) {
530 11 100 66     132 $bool = ( $_ < 0 || $_ > 1 ) ? 0 : 1;
531             }
532 15 100       23 if (wantarray) {
533 5 100       9 push @vals, $_ if $bool;
534             }
535             else {
536 10 100       25 last if $bool == 0;
537             }
538             }
539 5 100       12 return wantarray ? ( \@vals, $bool ) : $bool;
540             }
541            
542             =head3 all_counts
543            
544             $bool = $dat->all_counts(\@data); # test data are valid before loading them
545             $bool = $dat->all_counts(label => 'mydata'); # checking after loading/adding the data (or key in 'index')
546             ($aref, $bool) = $dat->all_counts(\@data);
547            
548             Returns true if all values in given data are real positive integers or zero, as well as satisfying "hascontent" and "looks_like_number" methods; false otherwise. Called in array context, returns aref of data culled of any values that are false on this basis, and then the boolean. For example, [2.2, 3, 4] and [-1, 3, 4] both fail, but [1, 3, 4] is true. Integer test is simply if $v == int($v).
549            
550             =cut
551            
552             sub all_counts {
553 3     3 1 13 my ( $self, @args ) = @_;
554 3 50       8 my $data = ref $args[0] ? shift @args : $self->access(@args);
555 3         4 my ( $bool, @vals ) = ();
556 3         3 foreach ( @{$data} ) {
  3         6  
557 5 50       10 if ( nocontent($_) ) {
    50          
558 0         0 $bool = 0;
559             }
560             elsif ( looks_like_number($_) ) {
561 5 100 100     65 $bool = $_ >= 0 && $_ == int($_) ? 1 : 0;
562             }
563             else {
564 0         0 $bool = 0;
565             }
566 5 50       16 if (wantarray) {
567 0 0       0 push @vals, $_ if $bool;
568             }
569             else {
570 5 100       10 last if $bool == 0;
571             }
572             }
573 3 50       8 return wantarray ? ( \@vals, $bool ) : $bool;
574             }
575            
576             =head3 all_pos
577            
578             $bool = $dat->all_pos(\@data); # test data are valid before loading them
579             $bool = $dat->all_pos(label => 'mydata'); # checking after loading/adding the data (or key in 'index')
580             ($aref, $bool) = $dat->all_pos(\@data);
581            
582             Returns true if all values in given data are greater than zero, as well as "hascontent" and "looks_like_number"; false otherwise. Called in array context, returns aref of data culled of any values that are false on this basis, and then the boolean.
583            
584             =cut
585            
586             sub all_pos {
587 2     2 1 9 my ( $self, @args ) = @_;
588 2 50       22 my $data = ref $args[0] ? shift @args : $self->access(@args);
589 2         3 my ( $bool, @vals ) = ();
590 2         3 foreach ( @{$data} ) {
  2         6  
591 5 50       15 if ( nocontent($_) ) {
    50          
592 0         0 $bool = 0;
593             }
594             elsif ( looks_like_number($_) ) {
595 5 100       52 $bool = $_ > 0 ? 1 : 0;
596             }
597 5 50       10 if (wantarray) {
598 0 0       0 push @vals, $_ if $bool;
599             }
600             else {
601 5 100       19 last if $bool == 0;
602             }
603             }
604 2 50       8 return wantarray ? ( \@vals, $bool ) : $bool;
605             }
606            
607             =head3 equal_n
608            
609             If the given data or aref of variable names all have the same number of elements, then that number is returned; otherwise 0.
610            
611             =cut
612            
613             sub equal_n {
614 0     0 1 0 my ( $self, %args ) = ( shift, @_ );
615 0 0       0 my $data =
616             $args{'data'} ? delete $args{'data'} : $self->get_hoa_by_lab(%args);
617 0 0       0 my @data = values %{$data} if ref $data eq 'HASH';
  0         0  
618 0         0 my $n = scalar @{ $data[0] };
  0         0  
619 0         0 for ( 1 .. scalar @data - 1 ) {
620 0         0 my $tmp_count = scalar @{ $data[$_] };
  0         0  
621 0 0       0 if ( $tmp_count != $n ) {
622 0         0 $n = 0;
623 0         0 last;
624             }
625             else {
626 0         0 $n = $tmp_count;
627             }
628             }
629 0         0 return $n;
630             }
631            
632             =head3 idx_anumeric
633            
634             Given an aref, returns a reference to an array of indices for a particular dataset (list) where the values are either undefined, empty or non-numerical.
635            
636             =cut
637            
638             sub idx_anumeric
639             { # List keyed by sample-names of their indices where invalid values lie
640 0     0 1 0 my ( $self, $dat ) = @_;
641 0         0 my %purge = ();
642 0         0 for my $i ( 0 .. scalar @{$dat} - 1 ) {
  0         0  
643 0 0       0 $purge{$i}++ if !looks_like_number( $dat->[$i] );
644             }
645 0         0 return \%purge;
646             }
647            
648             =head2 Dumping data
649            
650             =head3 dump_vals
651            
652             $seq->dump_vals(delim => ", "); # assumes the first (only?) loaded sequence should be dumped
653             $seq->dump_vals(index => I, delim => ", "); # dump the i'th loaded sequence
654             $seq->dump_vals(label => 'mysequence', delim => ", "); # dump the sequence loaded/added with the given "label"
655            
656             Prints to STDOUT a space-separated line (ending with "\n") of a loaded/added data's elements. Optionally, give a value for B to specify how the elements in each sequence should be separated; default is a single space.
657            
658             =cut
659            
660             sub dump_vals {
661 0     0 1 0 my ( $self, @args ) = @_;
662 0 0       0 my $args = ref $args[0] ? $args[0] : {@args};
663 0   0     0 my $delim = $args->{'delim'} || q{ };
664 0 0       0 print {*STDOUT} join( $delim, @{ $self->access($args) } ), "\n"
  0         0  
  0         0  
665             or croak 'Could not print line to STDOUT';
666 0         0 return 1;
667             }
668            
669             =head3 dump_list
670            
671             Dumps a list (using L) of the data currently loaded, without showing their actual elements. List is firstly by index, then by label (if any), then gives the number of elements in the associated sequence.
672            
673             =cut
674            
675             sub dump_list {
676 0     0 1 0 my ( $self, $i, $lim, $lab, $N, $len_lab, $len_n, $tbl, @rows, @maxlens ) =
677             (shift);
678 0         0 $lim = $self->ndata();
679 0 0       0 @maxlens = ( ( $lim > 5 ? $lim : 5 ), 5, 1 );
680 0         0 for my $i ( 0 .. $lim - 1 ) {
681 0 0       0 $lab =
682             defined $self->{_DATA}->[$i]->{lab}
683             ? $self->{_DATA}->[$i]->{lab}
684             : q{-};
685 0         0 $N = scalar @{ $self->{_DATA}->[$i]->{seq} };
  0         0  
686 0         0 $len_lab = length $lab;
687 0         0 $len_n = length $N;
688 0 0       0 $maxlens[1] = $len_lab if $len_lab > $maxlens[1];
689 0 0       0 $maxlens[2] = $len_n if $len_n > $maxlens[2];
690 0         0 $rows[$i] = [ $i, $lab, $N ];
691             }
692 0         0 require Text::SimpleTable;
693 0         0 $tbl = Text::SimpleTable->new(
694             [ $maxlens[0], 'index' ],
695             [ $maxlens[1], 'label' ],
696             [ $maxlens[2], 'N' ]
697             );
698 0         0 $tbl->row( @{$_} ) foreach @rows;
  0         0  
699 0 0       0 print {*STDOUT} $tbl->draw or croak 'Could not print list of loaded data';
  0         0  
700 0         0 return 1;
701             }
702            
703             =head3 save_to_file
704            
705             $dat->save_to_file(path => 'mysequences.csv');
706             $dat->save_to_file(path => 'mysequences.csv', serializer => 'XML::Simple', compress => 1, secret => '123'); # serialization options
707            
708             Saves the data presently loaded in the Statistics::Data object to a file, with the given B. This can be retrieved, with all the data added to the Statistics::Data object, via L. Basically a wrapper to C method in L; cf. for options.
709            
710             =cut
711            
712             sub save_to_file {
713 0     0 1 0 my ( $self, @args ) = @_;
714 0 0       0 my $args = ref $args[0] ? $args[0] : {@args};
715 0 0       0 croak 'There is no path for saving data' if nocontent( $args->{'path'} );
716 0         0 require Data::Serializer;
717 0         0 my $serializer = Data::Serializer->new( %{$args} );
  0         0  
718 0         0 $serializer->store( { DATA => $self->{_DATA} }, $args->{'path'} );
719 0         0 return 1;
720             }
721             *save = \&save_to_file;
722            
723             # PRIVATE METHODS:
724            
725             sub _cull_hoa_indep {
726 0     0   0 my $hoa = shift;
727 0         0 my $purged_n = shift;
728 0         0 my $purged = 0;
729 0         0 my %purged_data = ();
730 0         0 foreach my $name ( keys %{$hoa} ) {
  0         0  
731 0         0 my @clean = ();
732 0         0 for my $i ( 0 .. scalar( @{ $hoa->{$name} } ) - 1 ) {
  0         0  
733 0 0 0     0 if ( nocontent( $hoa->{$name}->[$i] )
734             or not looks_like_number( $hoa->{$name}->[$i] ) )
735             {
736 0         0 $purged++;
737             }
738             else {
739 0         0 push @clean, $hoa->{$name}->[$i];
740             }
741             }
742             croak
743 0 0       0 "Empty data for ANOVA following purge of invalid value(s) in list < $name >"
744             if !scalar @clean;
745 0         0 $purged_data{$name} = [@clean];
746             }
747 0 0       0 ${$purged_n} = $purged if ref $purged_n;
  0         0  
748 0         0 return \%purged_data;
749             }
750            
751             sub _cull_hoa_across {
752 0     0   0 my $hoa = shift;
753 0         0 my $purged_n = shift;
754 0         0 my ( $purged, %inv_1, %invalid_ids, %clean, %purged_data ) = ();
755            
756 0         0 foreach my $name ( keys %{$hoa} ) {
  0         0  
757 0         0 for my $i ( 0 .. scalar( @{ $hoa->{$name} } ) - 1 ) {
  0         0  
758 0 0       0 $inv_1{$name}->{$i} = 1
759             if !looks_like_number( $hoa->{$name}->[$i] );
760             }
761             }
762            
763             # List of all indices in all lists with invalid values; and copy of each group of data:
764 0         0 foreach my $name ( keys %{$hoa} ) {
  0         0  
765 0         0 $clean{$name} = $hoa->{$name}; #$self->get_aref_by_lab( lab => $name );
766 0         0 while ( my ( $key, $val ) = each %{ $inv_1{$name} } )
  0         0  
767             { #each %{ $self->{'_purge'}->{$name} } ) {
768 0         0 $invalid_ids{$key} += $val;
769             }
770             }
771 0   0     0 $purged = scalar( keys(%invalid_ids) ) || 0;
772            
773             # Purge by index (from highest to lowest):
774 0         0 my @invalid_ids = reverse( sort { $a <=> $b } keys(%invalid_ids) );
  0         0  
775 0         0 foreach my $cull (@invalid_ids) {
776 0         0 foreach my $name ( keys %clean ) {
777 0         0 splice( @{ $clean{$name} }, $cull, 1 );
  0         0  
778             }
779             }
780            
781 0         0 foreach my $c ( keys %clean ) {
782 0         0 $purged_data{$c} = $clean{$c};
783             }
784 0 0       0 ${$purged_n} = $purged if ref $purged_n;
  0         0  
785 0         0 return \%purged_data;
786             }
787            
788             sub _init_data {
789 29     29   42 my ( $self, @args ) = @_;
790 29         43 my $tmp = {};
791 29 100       62 if ( _isa_hashref_of_arefs( $args[0] ) ) { # cases 4 & 6
    100          
    100          
    50          
792 5         15 $tmp = _init_labelled_data( $self, $args[0] );
793             }
794             elsif ( _isa_hash_of_arefs(@args) ) { # cases 3 & 5
795 14         52 $tmp = _init_labelled_data( $self, {@args} );
796             }
797             elsif ( _isa_array_of_arefs(@args) ) { # case 2 & 7
798 8         15 $tmp = _init_unlabelled_data(@args);
799             }
800             elsif ( ref $args[0] ) {
801 0         0 croak 'Don\'t know how to load/add data';
802             }
803             else { # assume @args is just a list of nude strings - case 1
804 2         15 $tmp->{0} = { seq => [@args], lab => undef };
805             }
806 29         85 return $tmp;
807             }
808            
809             sub _isa_hashref_of_arefs {
810 29     29   29 my $arg = shift;
811 29 100 100     126 if ( not ref $arg or ref $arg ne 'HASH' ) {
812 24         79 return 0;
813             }
814             else {
815 5         6 return _isa_hash_of_arefs( %{$arg} );
  5         21  
816             }
817             }
818            
819             sub _isa_hash_of_arefs {
820            
821             # determines that:
822             # - scalar @args passes Number::Misc is_even, then that:
823             # - every odd indexed value 'hascontent' via String::Util
824             # - every even indexed value is aref
825 29     29   47 my @args = @_;
826 29         28 my $bool = 0;
827 29 100       80 if ( is_even( scalar @args ) )
828             { # Number::Misc method - not odd number in assignment
829 21         562 my %args = @args; # so assume is hash
830             HASHCHECK:
831 21         76 while ( my ( $lab, $val ) = each %args ) {
832 25 100 66     72 if ( hascontent($lab) && ref $val eq 'ARRAY' ) {
833 23         257 $bool = 1;
834             }
835             else {
836 2         35 $bool = 0;
837             }
838 25 100       135 last HASHCHECK if $bool == 0;
839             }
840             }
841             else {
842 8         163 $bool = 0;
843             }
844 29         83 return $bool;
845             }
846            
847             sub _isa_array_of_arefs {
848 10     10   16 my @args = @_;
849 10 100   10   74 if ( all { ref($_) eq 'ARRAY' } @args ) {
  10         29  
850 8         16 return 1;
851             }
852             else {
853 2         11 return 0;
854             }
855             }
856            
857             sub _init_labelled_data {
858 19     19   20 my ( $self, $href ) = @_;
859 19         20 my ( $i, %tmp ) = ( scalar @{ $self->{_DATA} } );
  19         42  
860 19         20 while ( my ( $lab, $seq ) = each %{$href} ) {
  42         110  
861 23         45 my $j = _seq_index_by_label( $self, $lab );
862 23 100       41 if ( defined $j )
863             { # there is already a label for these data, so don't need to define it for this init
864 7         9 $tmp{$j} = { seq => [ @{$seq} ], lab => undef };
  7         33  
865             }
866             else { # no aref labelled $lab yet: define for seq and label
867 16         16 $tmp{ $i++ } = { seq => [ @{$seq} ], lab => $lab };
  16         113  
868             }
869             }
870 19         37 return \%tmp;
871             }
872            
873             sub _init_unlabelled_data {
874 8     8   11 my @args = @_;
875 8         11 my %tmp = ();
876 8         19 for my $i ( 0 .. scalar @args - 1 ) {
877 8         7 $tmp{$i} = { seq => [ @{ $args[$i] } ], lab => undef };
  8         57  
878             }
879 8         17 return \%tmp;
880             }
881            
882             sub _index_by_args {
883 29     29   38 my ( $self, @args ) = @_;
884 29         23 my $i;
885 29 100       57 if ( !$args[0] ) {
886 10         12 $i = 0;
887             }
888             else {
889 19 50       69 my $args = ref $args[0] ? $args[0] : {@args};
890 19 50       72 if ( hascontent( $args->{'index'} ) ) {
    50          
891 0         0 $i = $args->{'index'};
892             }
893             elsif ( hascontent( $args->{'label'} ) ) {
894 19         219 $i = _seq_index_by_label( $self, $args->{'label'} );
895             }
896             else {
897 0         0 $i = 0;
898             }
899             } # print "index by args = $i\n";
900 29 50 33     204 return ( defined $i and ref $self->{_DATA}->[$i]->{'seq'} )
901             ? $i
902             : croak __PACKAGE__, ' Data for accessing need to be loaded';
903             }
904            
905             sub _seq_index_by_label {
906 42     42   48 my ( $self, $label ) = @_;
907 42         40 my ( $i, $k ) = ( 0, 0 );
908 42         41 for ( ; $i < scalar( @{ $self->{_DATA} } ) ; $i++ ) {
  46         101  
909 30 100 66     180 do { $k++; last; }
  26         24  
  26         45  
910             if $self->{_DATA}->[$i]->{lab}
911             and $self->{_DATA}->[$i]->{lab} eq $label;
912             }
913 42 100       94 return $k ? $i : undef;
914             }
915            
916             sub _add_from_object_aref {
917 1     1   1 my ( $self, $aref ) = @_;
918 1         2 foreach my $dat ( @{$aref} ) {
  1         2  
919 2 50       5 if ( hascontent( $dat->{'lab'} ) ) {
920 2         14 $self->add( $dat->{'lab'} => $dat->{'seq'} );
921             }
922             else {
923 0         0 $self->add( $dat->{'seq'} );
924             }
925             }
926 1         2 return 1;
927             }
928            
929             =head1 EXAMPLES
930            
931             B<1. Multivariate data (a tale of horny frogs)>
932            
933             In a study of how doing mental arithmetic affects arousal in self and others (i.e., how mind, body and world interact), three male frogs were maths-trained and then, as they did their calculations, were measured for pupillary dilation and perceived attractiveness. After four runs, average measures per frog can be loaded:
934            
935             $frogs->load(Names => [qw/Freddo Kermit Larry/], Pupil => [59.2, 77.7, 56.1], Attract => [3.11, 8.79, 6.99]);
936            
937             But one more frog still had to graudate from training, and data are now ready for loading:
938            
939             $frogs->add(Names => ['Sleepy'], Pupil => [83.4], Attract => [5.30]);
940             $frogs->dump_data(label => 'Pupil'); # prints "59.2 77.7 56.1 83.4" : all 4 frogs' pupil data for analysis by some module
941            
942             Say we're finished testing for now, so:
943            
944             $frogs->save_to_file(path => 'frogs.csv');
945             $frogs->unload();
946            
947             But another frog has been trained, measures taken:
948            
949             $frogs->load_from_file(path => 'frogs.csv');
950             $frogs->add(Pupil => [93], Attract => [6.47], Names => ['Jack']); # add yet another frog's data
951             $frogs->dump_data(label => 'Pupil'); # prints "59.2 77.7 56.1 83.4 93": all 5 frogs' pupil data
952            
953             Now we run another experiment, taking measures of heart-rate, and can add them to the current load of data for analysis:
954            
955             $frogs->add(Heartrate => [.70, .50, .44, .67, .66]); # add entire new sequence for all frogs
956             print "heartrate data are bung" if ! $frogs->all_proportions(label => 'Heartrate'); # validity check (could do before add)
957             $frogs->dump_list(); # see all four data-sequences now loaded, each with 5 observations (1 per frog), i.e.:
958             .-------+-----------+----.
959             | index | label | N |
960             +-------+-----------+----+
961             | 0 | Names | 5 |
962             | 1 | Attract | 5 |
963             | 2 | Pupil | 5 |
964             | 3 | Heartrate | 5 |
965             '-------+-----------+----'
966            
967             B<2. Using as a base module>
968            
969             As L, and so its sub-modules, use this module as their base, it doesn't have to do much data-managing itself:
970            
971             use Statistics::Sequences;
972             my $seq = Statistics::Sequences->new();
973             $seq->load(qw/f b f b b/); # using Statistics::Data method
974             say $seq->p_value(stat => 'runs', exact => 1); # using Statistics::Sequences::Runs method
975            
976             Or if these data were loaded directly within Statistics::Data, the data can be shared around modules that use it as a base:
977            
978             use Statistics::Data;
979             use Statistics::Sequences::Runs;
980             my $dat = Statistics::Data->new();
981             my $runs = Statistics::Sequences::Runs->new();
982             $dat->load(qw/f b f b b/);
983             $runs->pass($dat);
984             say $runs->p_value(exact => 1);
985            
986             =head1 DIAGNOSTICS
987            
988             =over 4
989            
990             =item Don't know how to load/add data
991            
992             Croaked when attempting to load or add data with an unsupported data structure where the first argument is a reference. See the examples under L for valid (and invalid) ways of sending data to them.
993            
994             =item Data for accessing need to be loaded
995            
996             Croaked when calling L, or any methods that use it internally -- viz., L and the validity checks L -- when it is called with a label for data that have not been loaded, or did not load successfully.
997            
998             =item Data for unloading need to be loaded
999            
1000             Croaked when calling L with an index or a label attribute and the data these refer to have not been loaded, or did not load successfully.
1001            
1002             =item There is no path for saving (or loading) data
1003            
1004             Croaked when calling L or L without a value for the required B argument, or if it does not exist when it's touched for a load.
1005            
1006             =back
1007            
1008             =head1 DEPENDENCIES
1009            
1010             L - used for its C method when testing loads
1011            
1012             L - used for its C method when testing loads
1013            
1014             L - used for its C and C methods
1015            
1016             L - required for L and L
1017            
1018             L - required for L
1019            
1020             L - required for L
1021            
1022             =head1 BUGS AND LIMITATIONS
1023            
1024             Some methods rely on accessing previously loaded data but should permit performing their operations on data submitted directly to them, just like, e.g., $dat->all_numeric(\@data) is ok. This is handled for now internally, but should be handled in the same way by modules using this one as its base - for at the moment they have to check for an aref to their data-manipulating methods ahead of accessing any loaded data by this module.
1025            
1026             Please report any bugs or feature requests to C, or through the web interface at L. This will notify the author, and then you'll automatically be notified of progress on your bug as any changes are made.
1027            
1028             =head1 SUPPORT
1029            
1030             You can find documentation for this module with the perldoc command.
1031            
1032             perldoc Statistics::Data
1033            
1034             You can also look for information at:
1035            
1036             =over 4
1037            
1038             =item * RT: CPAN's request tracker
1039            
1040             L
1041            
1042             =item * AnnoCPAN: Annotated CPAN documentation
1043            
1044             L
1045            
1046             =item * CPAN Ratings
1047            
1048             L
1049            
1050             =item * Search CPAN
1051            
1052             L
1053            
1054             =back
1055            
1056             =head1 AUTHOR
1057            
1058             Roderick Garton, C<< >>
1059            
1060             =head1 LICENSE AND COPYRIGHT
1061            
1062             Copyright 2009-2015 Roderick Garton
1063            
1064             This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published
1065             by the Free Software Foundation; or the Artistic License. See L for more information.
1066            
1067             =cut
1068            
1069             1; # End of Statistics::Data