File Coverage

blib/lib/Data/Freq.pm
Criterion Covered Total %
statement 77 100 77.0
branch 16 42 38.1
condition 2 15 13.3
subroutine 17 18 94.4
pod 6 6 100.0
total 118 181 65.1


line stmt bran cond sub pod time code
1 3     3   107428 use 5.006;
  3         19  
2 3     3   13 use strict;
  3         5  
  3         54  
3 3     3   11 use warnings;
  3         5  
  3         150  
4              
5             package Data::Freq;
6              
7             =head1 NAME
8              
9             Data::Freq - Collects data, counts frequency, and makes up a multi-level counting report
10              
11             =head1 VERSION
12              
13             Version 0.04
14              
15             =cut
16              
17             our $VERSION = '0.04';
18              
19             our $ROOT_VALUE = 'Total';
20              
21 3     3   13 use Carp qw(croak);
  3         4  
  3         155  
22 3     3   680 use Data::Freq::Field;
  3         7  
  3         84  
23 3     3   962 use Data::Freq::Node;
  3         6  
  3         80  
24 3     3   963 use Data::Freq::Record;
  3         6  
  3         124  
25 3     3   19 use List::Util qw(max);
  3         5  
  3         226  
26 3     3   16 use Scalar::Util qw(blessed openhandle);
  3         5  
  3         2435  
27              
28             =head1 SYNOPSIS
29              
30             use Data::Freq;
31            
32             my $data = Data::Freq->new('date');
33            
34             while (my $line = ) {
35             $data->add($line);
36             }
37            
38             $data->output();
39              
40             =head1 DESCRIPTION
41              
42             C is an object-oriented module to collect data from log files
43             or any kind of data sources, count frequency of particular patterns,
44             and generate a counting report.
45              
46             See also the command-line tool L.
47              
48             The simplest usage is to count lines of a log files in terms of a particular category
49             such as date, username, remote address, and so on.
50              
51             For more advanced usage, C is capable of aggregating counting results
52             at multiple levels.
53             For example, lines of a log file can be grouped into I first,
54             and then under each of the months, they can be further grouped into individual I,
55             where all the frequency of both months and days is summed up consistently.
56              
57             =head2 Analyzing an Apache access log
58              
59             The example below is a copy from the L section.
60              
61             my $data = Data::Freq->new('date');
62            
63             while (my $line = ) {
64             $data->add($line);
65             }
66            
67             $data->output();
68              
69             It will generate a report that looks something like this:
70              
71             123: 2012-01-01
72             456: 2012-01-02
73             789: 2012-01-03
74             ...
75              
76             where the left column shows the number of occurrences of each date.
77              
78             The date/time value is automatically extracted from the log line,
79             where the first field enclosed by a pair of brackets C<[...]>
80             is parsed as a date/time text by the C function.
81             (See L.)
82              
83             See also L.
84              
85             =head2 Multi-level counting
86              
87             The initialization parameters for the L method can be customized
88             for a multi-level analysis.
89              
90             If the field specifications are given, e.g.
91              
92             Data::Freq->new(
93             {type => 'date'}, # field spec for level 1
94             {type => 'text', pos => 2}, # field spec for level 2
95             );
96             # assuming the position 2 (third portion, 0-based)
97             # is the remote username.
98              
99             then the output will look like this:
100              
101             123: 2012-01-01
102             100: user1
103             20: user2
104             3: user3
105             456: 2012-01-02
106             400: user1
107             50: user2
108             6: user3
109             ...
110              
111             Below is another example along this line:
112              
113             Data::Freq->new('month', 'day');
114             # Level 1: 'month'
115             # Level 2: 'day'
116              
117             with the output:
118              
119             12300: 2012-01
120             123: 2012-01-01
121             456: 2012-01-02
122             789: 2012-01-03
123             ...
124             45600: 2012-02
125             456: 2012-02-01
126             789: 2012-02-02
127             ...
128              
129             See L for more details about the initialization parameters.
130              
131             =head2 Custom input
132              
133             The data source is not restricted to log files.
134             For example, a CSV file can be analyzed as below:
135              
136             my $data = Data::Freq->new({pos => 0}, {pos => 1});
137             # or more simply, Data::Freq->new(0, 1);
138            
139             open(my $csv, 'source.csv');
140            
141             while (<$csv>) {
142             $data->add([split /,/]);
143             }
144              
145             Note: the L method accepts an array ref,
146             so that the input does not have to be split by the default
147             L function.
148              
149             For more generic input data, a hash ref can also be given
150             to the L method.
151              
152             E.g.
153              
154             my $data = Data::Freq->new({key => 'x'}, {key => 'y'});
155             # Note: keys *cannot* be abbrebiated like Data::Freq->new('x', 'y')
156            
157             $data->add({x => 'foo', y => 'abc'});
158             $data->add({x => 'bar', y => 'def'});
159             $data->add({x => 'foo', y => 'ghi'});
160             $data->add({x => 'bar', y => 'jkl'});
161             ...
162              
163             In the field specifications, the value of C or C can also be an array ref,
164             where the multiple elements selected by the C or C will be C'ed
165             by a space (or the value of C<$">).
166              
167             This is useful when a log format contains a date that is not enclosed by a pair of
168             brackets C<[...]>.
169              
170             E.g.
171              
172             my $data = Data::Freq->new({type => 'date', pos => [0..3]});
173            
174             # Log4x with %d{dd MMM yyyy HH:mm:ss,SSS}
175             $data->add("01 Jan 2012 01:02:03,456 INFO - test log\n");
176            
177             # pos 0: "01"
178             # pos 1: "Jan"
179             # pos 2: "2012"
180             # pos 3: "01:02:03,456"
181              
182             As a result, "01 Jan 2012 01:02:03,456" will be parsed as a date string.
183              
184             =head2 Custom output
185              
186             The L method accepts different types of parameters as below:
187              
188             =over 4
189              
190             =item * A file handle or an instance of C
191              
192             By default, the result is printed out to C.
193             With this parameter given, it can be any other output destination.
194              
195             =item * A callback subroutine ref
196              
197             If a callback is specified, it will be invoked with a node object (L)
198             passed as an argument.
199             See L for more details about the tree structure.
200              
201             Roughly, each node represents a counting result for each line
202             in the default output format, in the depth-first order (i.e. the same order
203             as the default output lines).
204              
205             $data->output(sub {
206             my $node = shift;
207             print "Count: ", $node->count, "\n";
208             print "Value: ", $node->value, "\n";
209             print "Depth: ", $node->depth, "\n";
210             print "\n";
211             });
212              
213             =item * A hash ref of options to control output format
214              
215             $data->output({
216             with_root => 0 , # also prints total (root node)
217             transpose => 0 , # prints values before counts
218             indent => ' ', # repeats (depth - 1) times
219             separator => ': ' , # separates the count and the value
220             prefix => '' , # prepended before the count
221             no_padding => 0 , # disables padding for the count
222             });
223              
224             =item * The format option can be specified together with a file handle.
225              
226             $data->output(\*STDERR, {indent => "\t"});
227              
228             =back
229              
230             The output does not include the grand total by default.
231             If the C option is set to a true value, the total count will be printed
232             as the first line (level 0), and all the subsequent levels will be shifted to the right.
233              
234             The C option flips the order of the count and the value in each line. E.g.
235              
236             2012-01: 12300
237             2012-01-01: 123
238             2012-01-02: 456
239             2012-01-03: 789
240             ...
241             2012-02: 45600
242             2012-02-01: 456
243             2012-02-02: 789
244             ...
245              
246             The indent unit (repeated appropriate times) and the separator
247             (between the count and the value) can be customized with the respective options,
248             C and C.
249              
250             The default output format has apparent ambiguity between the indent
251             and the padding for alignment.
252              
253             For example, consider the output below:
254              
255             1200000: Level 1
256             900000: Level 2
257             900000: Level 3
258             5: Level 2
259             ...
260              
261             where the second "Level 2" appears to have a deeper indent than the "Level 3."
262              
263             Although the positions of colons (C<:>) are consistently aligned,
264             it may seem to be slightly inconsistent.
265              
266             The indent depth will be clearer if a C is added:
267              
268             $data->output({prefix => '* '});
269            
270             * 1200000: Level 1
271             * 900000: Level 2
272             * 900000: Level 3
273             * 5: Level 2
274             ...
275              
276             Alternatively, the C option can be set to a true value
277             to disable the left padding.
278              
279             $data->output({no_padding => 1});
280            
281             1200000: Level 1
282             900000: Level 2
283             900000: Level 3
284             5: Level 2
285             ...
286              
287             =head2 Field specification
288              
289             Each argument passed to the L method is passed to the L method.
290              
291             For example,
292              
293             Data::Freq->new(
294             'month',
295             'day',
296             );
297            
298             is equivalent to
299              
300             Data::Freq->new(
301             Data::Freq::Field->new('month'),
302             Data::Freq::Field->new('day'),
303             );
304              
305             and because of the way the argument is interpreted by the L class,
306             it is also equivalent to
307              
308             Data::Freq->new(
309             Data::Freq::Field->new({type => 'month'}),
310             Data::Freq::Field->new({type => 'day'}),
311             );
312              
313             =over 4
314              
315             =item * C<< type => { 'text' | 'number' | 'date' } >>
316              
317             The basic data types are C<'text'>, C<'number'>, and C<'date'>,
318             which determine how each input data is normalized for the frequency counting,
319             and how the results are sorted.
320              
321             The C<'date'> type can also be written as the format string for C function.
322             (See L.)
323              
324             Data::Freq->new('%Y-%m');
325            
326             Data::Freq->new({type => '%H'});
327              
328             If the type is simply specified as C<'date'>, the format defaults to C<'%Y-%m-%d'>.
329              
330             In addition, the keywords below can be used as synonims:
331              
332             'year' : equivalent to '%Y'
333             'month' : equivalent to '%Y-%m'
334             'day' : equivalent to '%Y-%m-%d'
335             'hour' : equivalent to '%Y-%m-%d %H'
336             'minute': equivalent to '%Y-%m-%d %H:%M'
337             'second': equivalent to '%Y-%m-%d %H:%M:%S'
338              
339             =item * C<< aggregate => { 'unique' | 'max' | 'min' | 'average' } >>
340              
341             The C parameter alters how each C is calculated,
342             where the default C is equal to the sum of all the C's for its child nodes.
343              
344             'unique' : the number of distinct child values
345             'max' : the maximum count of the child nodes
346             'min' : the minimum count of the child nodes
347             'average': the average count of the child nodes
348              
349             =item * C<< sort => { 'value' | 'count' | 'first' | 'last' } >>
350              
351             The C parameter is used as the key by which the group of records
352             will be sorted for the output.
353              
354             'value': sort by the normalized value
355             'count': sort by the frequency count
356             'first': sort by the first occurrence in the input
357             'last' : sort by the last occurrence in the input
358              
359             =item * C<< order => { 'asc' | 'desc' } >>
360              
361             The C parameter controls the sorting in the either ascending or descending order.
362              
363             =item * C<< pos => { 0, 1, 2, -1, -2, ... } >>
364              
365             If the C parameter is given or an integer value (or a list of integers) is given
366             without a parameter name, the value whose frequency is counted will be selected
367             at the indices from an array ref input or a text split
368             by the L function.
369              
370             =item * C<< key => { any key(s) for input hash refs } >>
371              
372             If the C parameter is given, it is assumed that the input is a hash ref,
373             where the value whose frequency is counted will be selected by the specified key(s).
374              
375             =item * C<< convert => sub {...} >>
376              
377             If the C parameter is set to a subroutine ref,
378             it is invoked to convert the value to a normalized form for frequency counting.
379              
380             The subroutine is expected to take one string argument and return a converted string.
381              
382             =back
383              
384             If the C parameter is either C or C,
385             the results are sorted by C in the descending order by default
386             (i.e. the most frequent value first).
387              
388             For the C type, the C parameter defaults to C,
389             and the C parameter defaults to C
390             (i.e. the time-line order).
391              
392             =head2 Frequency tree
393              
394             Once all the data have been collected with the L method,
395             a C has been constructed internally.
396              
397             Suppose the C instance is initialized with the two fields as below:
398              
399             my $field1 = Data::Freq::Field->new({type => 'month'});
400             my $field2 = Data::Freq::Field->new({type => 'text', pos => 2});
401             my $data = Data::Freq->new($field1, $field2);
402             ...
403              
404             a result tree that looks like below will be constructed as each data record is added:
405              
406             Depth 0 Depth 1 Depth 2
407             $field1 $field2
408              
409             {432: root}--+--{123: "2012-01"}--+--{10: "user1"}
410             | +--{ 8: "user2"}
411             | +--{ 7: "user3"}
412             | ...
413             +--{135: "2012-02"}--+--{11: "user3"}
414             | +--{ 9: "user2"}
415             | ...
416             ...
417              
418             In the diagram, a node is represented by a pair of braces C<{...}>,
419             and each integer value is the total number of occurrences of the node value,
420             under its parent category.
421              
422             The root node maintains the grand total of records that have been added.
423              
424             The tree structure can be recursively visited by the L method.
425              
426             Below is an example to generate a HTML:
427              
428             print qq(
    \n);
429            
430             $data->traverse(sub {
431             my ($node, $children, $recurse) = @_;
432            
433             my ($count, $value) = ($node->count, $node->value);
434             # HTML-escape $value if necessary
435            
436             print qq(
  • $count: $value);
  • 437            
    438             if (@$children > 0) {
    439             print qq(\n
      \n);
    440            
    441             for my $child (@$children) {
    442             $recurse->($child); # invoke recursion
    443             }
    444            
    445             print qq(\n);
    446             }
    447            
    448             print qq(\n);
    449             });
    450            
    451             print qq(\n);
    452              
    453             =head1 METHODS
    454              
    455             =head2 new
    456              
    457             Usage:
    458              
    459             Data::Freq->new($field1, $field2, ...);
    460              
    461             Constructs a C object.
    462              
    463             The arguments C<$field1>, C<$field2>, etc. are instances of L,
    464             or any valid arguments that can be passed to L.
    465              
    466             The actual data to be analyzed need to be added by the L method one by one.
    467              
    468             The C object maintains the counting results, based on the specified fields.
    469             The first field (C<$field1>) is used to group the added data into the major category.
    470             The next subsequent field (C<$field2>) is for the sub-category under each major group.
    471             Any more subsequent fields are interpreted recursively as sub-sub-category, etc.
    472              
    473             If no fields are given to the L method, one field of the C type will be assumed.
    474              
    475             =cut
    476              
    477             sub new {
    478 13     13 1 33188 my $class = shift;
    479            
    480 13         25 my $fields = eval {[map {
    481 13 50 33     48 blessed($_) && $_->isa('Data::Freq::Field') ?
      17 100       116  
    482             $_ : Data::Freq::Field->new($_)
    483             } (@_ ? (@_) : ('text'))]};
    484            
    485 13 50       33 croak $@ if $@;
    486            
    487 13         65 return bless {
    488             root => Data::Freq::Node->new($ROOT_VALUE),
    489             fields => $fields,
    490             }, $class;
    491             }
    492              
    493             =head2 add
    494              
    495             Usage:
    496              
    497             $data->add("A record");
    498            
    499             $data->add("A log line text\n");
    500            
    501             $data->add(['Already', 'split', 'data']);
    502            
    503             $data->add({key1 => 'data1', key2 => 'data2', ...});
    504              
    505             Adds a record that increments the counting by 1.
    506              
    507             The interpretation of the input depends on the type of fields specified in the L method.
    508             See L.
    509              
    510             =cut
    511              
    512             sub add {
    513 164     164 1 472 my $self = shift;
    514            
    515 164         214 for my $input (@_) {
    516 164         322 my $record = Data::Freq::Record->new($input);
    517            
    518 164         248 my $node = $self->root;
    519 164         203 $node->{count}++;
    520            
    521 164         169 for my $field (@{$self->fields}) {
      164         206  
    522 233         419 my $value = $field->evaluate_record($record);
    523 233 50       352 last unless defined $value;
    524 233         374 $node = $node->add_subnode($value);
    525             }
    526             }
    527            
    528 164         239 return $self;
    529             }
    530              
    531             =head2 output
    532              
    533             Usage:
    534              
    535             # I/O
    536             $data->output(); # print results (default format)
    537             $data->output(\*OUT); # print results to open handle
    538             $data->output($io); # print results to IO::* object
    539            
    540             # Callback
    541             $data->output(sub {
    542             my $node = shift;
    543             # $node is a Data::Freq::Node instance
    544             });
    545            
    546             # Options
    547             $data->output({
    548             with_root => 0 , # if true, prints total at root
    549             transpose => 0 , # if true, prints values before counts
    550             indent => ' ', # repeats (depth - 1) times
    551             separator => ': ', # separates the count and the value
    552             prefix => '' , # prepended before the count
    553             no_padding => 0 , # if true, disables padding for the count
    554             });
    555            
    556             # Combination
    557             $data->output(\*STDERR, {opt => ...});
    558             $data->output($open_fh, {opt => ...});
    559              
    560             Generates a report of the counting results.
    561              
    562             If no arguments are given, default format results are printed out to C.
    563             Any open handle or an instance of C can be passed as the output destination.
    564              
    565             If the argument is a subroutine ref, it is regarded as a callback
    566             that will be called for each node of the I in the depth-first order.
    567             (See L for details.)
    568              
    569             The following arguments are passed to the callback:
    570              
    571             =over 4
    572              
    573             =item * $node: Data::Freq::Node
    574              
    575             The current node (L)
    576              
    577             =item * $children: [$child_node1, $child_node2, ...]
    578              
    579             An array ref to the list of child nodes, sorted based on the field
    580              
    581             Note: C<< $node->children >> is a hash ref (unsorted) of a raw counting data.
    582              
    583             =back
    584              
    585             =cut
    586              
    587             sub output {
    588 4     4 1 45 my $self = shift;
    589 4         7 my ($fh, $callback, $opt);
    590            
    591 4         11 for (@_) {
    592 4 50       22 if (openhandle($_)) {
        50          
    593 0         0 $fh = $_;
    594             } elsif (ref $_ eq 'HASH') {
    595 0         0 $opt = $_;
    596             } else {
    597 4         6 $callback = $_;
    598             }
    599             }
    600            
    601 4   50     21 $opt ||= {};
    602            
    603 4 50       12 my $indent = defined $opt->{indent} ? $opt->{indent} : ' ';
    604 4 50       11 my $prefix = defined $opt->{prefix} ? $opt->{prefix} : '' ;
    605 4 50       9 my $separator = defined $opt->{separator} ? $opt->{separator} : ': ';
    606 4 50       9 my $with_root = $opt->{with_root} ? 1 : 0;
    607 4 50       8 my $no_padding = $opt->{no_padding} ? 1 : 0;
    608 4 50       7 my $transpose = $opt->{transpose} ? 1 : 0;
    609            
    610 4 50       9 if (!$callback) {
    611 0 0 0     0 my $maxlen = $with_root ? length($self->root->count) : length($self->root->max || '');
    612 0   0     0 $fh ||= \*STDOUT;
    613            
    614             $callback = sub {
    615 0     0   0 my ($node, $children, $field, $subfield) = @_;
    616            
    617 0 0 0     0 if ($with_root || $node->depth > 0) {
    618 0 0       0 print $fh $indent x ($node->depth - ($with_root ? 0 : 1));
    619 0         0 print $fh $prefix;
    620            
    621 0         0 my $value = $node->value;
    622 0         0 my $count;
    623            
    624 0 0 0     0 if ($field and my $aggregate = $field->aggregate) {
    625 0         0 $count = $node->$aggregate;
    626             } else {
    627 0         0 $count = $node->count;
    628             }
    629            
    630 0 0       0 if ($transpose) {
        0          
    631 0         0 print $fh $value;
    632             } elsif ($no_padding) {
    633 0         0 print $fh $count;
    634             } else {
    635 0         0 printf $fh '%'.$maxlen.'d', $count;
    636             }
    637            
    638 0         0 print $fh $separator;
    639            
    640 0 0       0 if ($transpose) {
    641 0         0 print $fh $count;
    642             } else {
    643 0         0 print $fh $value;
    644             }
    645            
    646 0         0 print $fh "\n";
    647             }
    648 0         0 };
    649             }
    650            
    651             $self->traverse(sub {
    652 24     24   34 my ($node, $children, $recurse, $field) = @_;
    653 24         56 $callback->($node, $children, $field);
    654 24         87 $recurse->($_) foreach @$children;
    655 4         28 });
    656             }
    657              
    658             =head2 traverse
    659              
    660             Usage:
    661              
    662             $data->traverse(sub {
    663             my ($node, $children, $recurse) = @_;
    664            
    665             # Do something with $node before its child nodes
    666            
    667             # $children is a sorted list of child nodes,
    668             # based on the field specification
    669             for my $child (@$children) {
    670             $recurse->($child); # invoke recursion
    671             }
    672            
    673             # Do something with $node after its child nodes
    674             });
    675              
    676             Provides a way to traverse the result tree with more control than the L method.
    677              
    678             A callback must be passed as an argument, and will ba called with the following arguments:
    679              
    680             =over 4
    681              
    682             =item * $node: Data::Freq::Node
    683              
    684             The current node (L)
    685              
    686             =item * $children: [$child_node1, $child_node2, ...]
    687              
    688             An array ref to the list of child nodes, sorted based on the field
    689              
    690             Note: C<< $node->children >> is a hash ref (unsorted) of a raw counting data.
    691              
    692             =item * $recurse: sub ($a_child_node)
    693              
    694             A subroutine ref, with which the resursion is invoked at a desired time
    695              
    696             =back
    697              
    698             When the L method is called,
    699             the root node is passed as the C<$node> parameter first.
    700             Until the C<$recurse> subroutine is explicitly invoked for the child nodes,
    701             B recursion will be invoked automatically.
    702              
    703             =cut
    704              
    705             sub traverse {
    706 4     4 1 8 my $self = shift;
    707 4         6 my $callback = shift;
    708            
    709 4         7 my $fields = $self->fields;
    710 4         5 my $recurse; # separate declaration for closure access
    711            
    712             $recurse = sub {
    713 24     24   26 my $node = shift;
    714 24         26 my $children = [];
    715 24         44 my $field = $fields->[$node->depth];
    716 24         42 my $subfield = $fields->[$node->depth + 1];
    717            
    718 24 100       40 if ($field) {
    719 6         5 $children = [values %{$node->children}];
      6         12  
    720 6         18 $children = $field->select_nodes($children, $subfield);
    721             }
    722            
    723 24         53 $callback->($node, $children, $recurse, $field, $subfield);
    724 4         22 };
    725            
    726 4         36 $recurse->($self->root);
    727             }
    728              
    729             =head2 root
    730              
    731             Returns the root node of the I. (See L for details.)
    732              
    733             The root node is created during the L method call,
    734             and maintains the total number of added records and a reference to its direct child nodes
    735             for the first field.
    736              
    737             =head2 fields
    738              
    739             Returns the array ref to the list of fields (L).
    740              
    741             The returned array should B be modified.
    742              
    743             =cut
    744              
    745 177     177 1 258 sub root {shift->{root }}
    746 168     168 1 237 sub fields {shift->{fields}}
    747              
    748             =head1 AUTHOR
    749              
    750             Mahiro Ando, C<< >>
    751              
    752             =head1 BUGS
    753              
    754             Please report any bugs or feature requests to C, or through
    755             the web interface at L. I will be notified, and then you'll
    756             automatically be notified of progress on your bug as I make changes.
    757              
    758             =head1 SUPPORT
    759              
    760             You can find documentation for this module with the perldoc command.
    761              
    762             perldoc Data::Freq
    763              
    764             You can also look for information at:
    765              
    766             =over 4
    767              
    768             =item * RT: CPAN's request tracker (report bugs here)
    769              
    770             L
    771              
    772             =item * AnnoCPAN: Annotated CPAN documentation
    773              
    774             L
    775              
    776             =item * CPAN Ratings
    777              
    778             L
    779              
    780             =item * Search CPAN
    781              
    782             L
    783              
    784             =back
    785              
    786             =head1 ACKNOWLEDGEMENTS
    787              
    788             =head1 LICENSE AND COPYRIGHT
    789              
    790             Copyright 2012 Mahiro Ando.
    791              
    792             This program is free software; you can redistribute it and/or modify it
    793             under the terms of either: the GNU General Public License as published
    794             by the Free Software Foundation; or the Artistic License.
    795              
    796             See http://dev.perl.org/licenses/ for more information.
    797              
    798             =cut
    799              
    800             1; # End of Data::Freq