File Coverage

blib/lib/ELFF/Parser.pm
Criterion Covered Total %
statement 39 41 95.1
branch 8 10 80.0
condition 2 3 66.6
subroutine 5 5 100.0
pod 2 2 100.0
total 56 61 91.8


line stmt bran cond sub pod time code
1             package ELFF::Parser;
2              
3             # ELFF-Parser is a perl module for parsing ELFF formatted log files.
4             #
5             # Copyright (C) 2007-2010 Mark Warren
6             #
7             # This library is free software; you can redistribute it and/or
8             # modify it under the terms of the GNU Lesser General Public
9             # License as published by the Free Software Foundation; either
10             # version 2.1 of the License, or (at your option) any later version.
11             #
12             # This library is distributed in the hope that it will be useful,
13             # but WITHOUT ANY WARRANTY; without even the implied warranty of
14             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15             # Lesser General Public License for more details.
16             #
17             # You should have received a copy of the GNU Lesser General Public
18             # License along with this library; if not, write to the Free Software
19             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20              
21              
22             =pod
23              
24             =head1 NAME
25              
26             ELFF::Parser - parse ELFF formatted log files
27              
28             =head1 SYNOPSIS
29              
30             use ELFF::Parser;
31              
32             $p = new ELFF::Parser();
33             while(<>) {
34             $res = $p->parse_line($_);
35              
36             if($res->{directive} && $res->{directive} eq 'Start-Date') {
37             print "Log starts at $res->{value}\n";
38             }
39             elsif($res->{href}) {
40             print $res->{href}{'rs-bytes'}, "\n";
41             }
42             elsif($res->{aref}) {
43             print "Detected log format change, or no fields directive\n";
44             foreach my $field (@{$res->{aref}}) {
45             print " found field: $field\n";
46             }
47             print "\n";
48             }
49             else {
50             print STDERR "Failed to parse log line\n";
51             }
52             }
53              
54             =head1 DESCRIPTION
55              
56             C<ELFF::Parser> parses ELFF formatted logs. For a description of ELFF
57             (Extended Log File Format), see http://www.w3.org/TR/WD-logfile.html. In
58             brief, ELFF log files consist of directives (meta-data about the logs)
59             and logs. C<ELFF::Parser> parses both, extracting log format information
60             from the directives and using it to build hashes for each log entry.
61             If log format information isn't available or becomes invalidated (see
62             the L</ELFF PROBLEMS> section below), C<ELFF::Parser> will return
63             arrays for each log entry instead of hashes.
64              
65             =head1 CONSTRUCTOR
66              
67             =over 4
68              
69             =item $ep = new ELFF::Parser()
70              
71             Creates a new C<ELFF::Parser> object.
72              
73             =back
74              
75             =head1 METHODS
76              
77             =over 4
78              
79             =item $res = $ep->parse_line($line)
80              
81             Parse an ELFF log line. The returned result will be a hash reference that
82             contains different information depending on the state of the object and
83             the type of line parsed (i.e. directive or log entry).
84              
85             If the line is a directive, the returned hash will have the following
86             keys:
87              
88             $res->{directive} the name of the directive
89             $res->{value} the value of the directive
90              
91             If the line is a Fields directive, the result will contain a 'fields'
92             key as well, which is an array reference containing the fields.
93              
94             foreach my $field (@{$res->{fields}}) {
95             print "Found field $field\n";
96             }
97              
98             Since C<ELFF::Parser> builds hashes for you for each log entry, you
99             generally don't need to worry about the fields.
100              
101             If the line is a log entry, and the C<ELFF::Parser> object has parsed
102             a fields directive already, the result hash will contain an 'href'
103             key whose value is a hash reference containing the log entry data.
104              
105             print "client to proxy bytes: ", $res->{href}{'cs-bytes'}, "\n";
106              
107             If no fields directive has been parsed, or C<ELFF::Parser> detects a
108             change in log format (see the L</ELFF PROBLEMS> section below), the result
109             hash will instead contain an 'aref' key holding an array reference:
110              
111             foreach my $field (@{$res->{aref}}) {
112             print "data: ", $field, "\n";
113             }
114              
115             If C<ELFF::Parser> detects a malformed line, it will return undef.
116              
117             =back
118              
119             =head1 ELFF PROBLEMS
120              
121             There is one particularly annoying thing about ELFF log files, which is
122             that the ELFF standard doesn't require that a new Fields directive be
123             inserted into the log file when the log format changes. Because of this,
124             if the log format changes in the middle of a log file, there is very
125             little that a parser can do to detect the change. All reporting software
126             that I have seen simply ignores logs as soon as a change in format
127             is detected (i.e. when errors are encountered extracting statistics
128             from the logs). This is a shortcoming in the ELFF standard, and I'm
129             afraid that C<ELFF::Parser> doesn't handle the problem much better.
130             C<ELFF::Parser> detects log format changes by checking the number of
131             fields in each log entry. If the number of fields in a log entry differs
132             from the number of fields specified in the Fields directive, C<ELFF::Parser>
133             will invalidate the format and start returning arrays of fields for
134             each message instead of hashes. This way, the log data is still
135             available to you, and you can attempt to recover from the problem
136             yourself. However, if the number of fields in the log messages
137             doesn't change when the log format changes (e.g. when fields are
138             re-ordered, or when the same number of fields is added and removed),
139             C<ELFF::Parser> will not detect the format change.
140              
141             Thankfully, log formats usually don't change on their own, so
142             administrators can modify their procedures such that the impact
143             of this shortcoming is minimized (e.g. rotate the log file
144             immediately after changing the log format to force a new fields
145             directive to be logged).
146              
147             =head1 HOMEPAGE
148              
149             L
150              
151             =head1 BUGS
152              
153             None that I know of, but please let me know if you find one. Please
154             report bugs via the SourceForge tracker.
155              
156             =head1 AUTHOR
157              
158             Copyright (c) 2007 Mark Warren
159              
160             =head1 LICENSE AND DISCLAIMER
161              
162             This software is distributed under the terms of the GNU Lesser General
163             Public License.
164              
165             THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
166             AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
167             IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
168             ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
169             LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
170             CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
171             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
172             INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
173             CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
174             ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
175             POSSIBILITY OF SUCH DAMAGE.
176              
177             =cut
178              
179 3     3   24005 use 5.00;
  3         12  
  3         136  
180 3     3   20 use strict;
  3         9  
  3         98  
181 3     3   17 use Carp;
  3         12  
  3         1855  
182              
183             our $VERSION = '0.92';
184              
185              
186             sub new {
187 1     1 1 265 my $class = shift;
188              
189 1         4 my $self = {
190             # we use number of fields to detect log format changes. it's
191             # not perfect, but we don't understand the log content, so this
192             # is the best that we can do
193             'fields' => 0,
194              
195             # revmap is used to figure out the name of each field as we
196             # build the result hash in parse_line
197             'revmap' => {},
198             };
199              
200 1         4 return bless $self, $class;
201             }
202              
203             sub parse_line {
204 15     15 1 7179 my ($self, $line) = @_;
205 15         22 chomp($line);
206              
207 15         23 my $res = {};
208              
209             # if the line is a directive, handle it here
210 15 100 66     80 if($line && substr($line, 0, 1) eq '#') {
211             # some vendors put whitespace between # and the directive name, remove it
212 9         22 $line =~ s/^#\s+/#/;
213              
214 9         37 @$res{('directive', 'value')} = split(/\s+/, $line, 2);
215 9         46 $res->{directive} =~ s/(?:^#|:$)//g;
216              
217 9 100       24 if($res->{directive} eq 'Fields') {
218 5         28 $self->{revmap} = tokenize($res->{value});
219 5         8 $self->{fields} = $#{$self->{revmap}};
  5         9  
220 5         5 @{$res->{fields}} = @{$self->{revmap}};
  5         17  
  5         8  
221             }
222              
223 9         23 return $res;
224             }
225              
226             # not a directive, regular log
227              
228 6         22 my $flds = tokenize($line);
229 6 50       13 return undef unless $flds;
230              
231             # no field names - return array
232 6 50       14 unless($self->{revmap}) {
233 0         0 $res->{aref} = $flds;
234 0         0 return $res;
235             }
236              
237             # change in format, invalidate fields and return array
238 6 100       15 if($#$flds != $self->{fields}) {
239 1         3 $self->{revmap} = undef;
240 1         3 $res->{aref} = $flds;
241 1         3 return $res;
242             }
243              
244             # return href
245 5         6 my %href;
246 5         6 @href{@{$self->{revmap}}} = @$flds;
  5         30  
247 5         12 $res->{href} = \%href;
248              
249 5         22 return $res;
250             }
251              
252              
253             require XSLoader;
254             XSLoader::load('ELFF::Parser', $VERSION);
255              
256              
257             1;