File Coverage

blib/lib/Data/Record.pm
Criterion Covered Total %
statement 113 114 99.1
branch 59 66 89.3
condition 2 3 66.6
subroutine 16 16 100.0
pod 8 8 100.0
total 198 207 95.6


line stmt bran cond sub pod time code
1             package Data::Record;
2              
3 3     3   95368 use warnings;
  3         6  
  3         110  
4 3     3   19 use strict;
  3         6  
  3         183  
5              
6             =head1 NAME
7              
8             Data::Record - "split" on steroids
9              
10             =head1 VERSION
11              
12             Version 0.02
13              
14             =cut
15              
16             our $VERSION = '0.02';
17 3     3   18 use constant NOT_FOUND => -1;
  3         9  
  3         224  
18 3     3   17 use constant ALL_RECORDS => -1;
  3         5  
  3         154  
19 3     3   15 use constant TRIM_RECORDS => 0;
  3         5  
  3         5175  
20              
21             =head1 SYNOPSIS
22              
23             use Regexp::Common;
24             use Data::Record;
25             my $record = Data::Record->new({
26             split => "\n",
27             unless => $RE{quoted},
28             });
29             my @data = $record->records($data);
30              
31             =head1 DESCRIPTION
32              
33             Sometimes we need data split into records and a simple split on the input
34             record separator (C<$/>) or some other value fails because the values we're
35             splitting on may allowed in other parts of the data. Perhaps they're quoted.
36             Perhaps they're embedded in other data which should not be split up.
37              
38             This module allows you to specify what you wish to split the data on, but also
39             speficy an "unless" regular expression. If the text in question matches the
40             "unless" regex, it will not be split there. This allows us to do things like
41             split on newlines unless newlines are embedded in quotes.
42              
43             =head1 METHODS
44              
45             =head2 new
46              
47             Common usage:
48              
49             my $record = Data::Record->new({
50             split => qr/$split/,
51             unless => qr/$unless/,
52             });
53              
54             Advanced usage:
55              
56             my $record = Data::Record->new({
57             split => qr/$split/,
58             unless => qr/$unless/, # optional
59             token => $token, # optional
60             chomp => 0, # optional
61             limit => $limit, # optional (do not use with trim)
62             trim => 1, # optional (do not use with limit)
63             fields => {
64             split => ',',
65             unless => $RE{quoted}, # from Regexp::Common
66             }
67             });
68              
69             The constructor takes a hashref of key/value pairs to set the behavior of data
70             records to be created.
71              
72             =over 4
73            
74             =item * split
75              
76             This is the value to split the data on. It may be either a regular expression
77             or a string.
78              
79             Defaults to the current input record separator (C<$/>).
80              
81             =item * unless
82              
83             Data will be split into records matching the split value I they also
84             match this value. No default.
85              
86             If you do not have an C value, use of this module is overkill.
87              
88             =item * token
89              
90             You will probably never need to set this value.
91              
92             Internally, this module attempts to find a token which does not match any text
93             found in the data to be split and also does not match the split value. This is
94             necessary because we mask the data we don't want to split using this token.
95             This allows us to split the resulting text.
96              
97             In the unlikely event that the module cannot find a token which is not in the
98             text, you may set the token value yourself to some string value. Do not set it
99             to a regular expression.
100              
101             =item * chomp
102              
103             By default, the split value is discarded (chomped) from each record. Set this
104             to a true value to keep the split value on each record. This differs slightly
105             from how it's done with split and capturing parentheses:
106              
107             split /(\,)/, '3,4,5';
108              
109             Ordinarily, this results in the following list:
110              
111             ( 3, ',', 4, ',', 5 )
112              
113             This module assumes you want those values I the preceding record. By
114             setting chomp to false, you get the following list:
115              
116             ( '3,', '4,' 5 )
117              
118             =item * limit
119              
120             The default split behavior is similar to this:
121              
122             split $split_regex, $data;
123              
124             Setting C will cause the behavior to act like this:
125              
126             split $split_regex, $data, $limit
127              
128             See C for more information about the behavior of C.
129              
130             You may not set both C and C in the constructor.
131              
132             =item * trim
133              
134             By default, we return all records. This means that due to the nature of split
135             and how we're doing things, we sometimes get a trailing null record. However,
136             setting this value causes the module to behave as if we had done this:
137              
138             split $split_regex, $data, 0;
139              
140             When C is called with a zero as the third argument, trailing null values
141             are discarded. See C for more information.
142              
143             You may not set both C and C in the constructor.
144              
145             B: This does I trim white space around returned records.
146              
147             =item * fields
148              
149             By default, individual records are returned as strings. If you set C,
150             you pass in a hashref of arguments that are identical to what C would take
151             and resulting records are returned as array references processed by a new
152             C instance.
153              
154             Example: a quick CSV parser which assumes that commas and newlines may both be
155             in quotes:
156              
157             # four lines, but there are only three records! (newline in quotes)
158             $data = <<'END_DATA';
159             1,2,"programmer, perl",4,5
160             1,2,"programmer,
161             perl",4,5
162             1,2,3,4,5
163             END_DATA
164            
165             $record = $RECORD->new({
166             split => "\n",
167             unless => $quoted,
168             trim => 1,
169             fields => {
170             split => ",",
171             unless => $quoted,
172             }
173             });
174             my @records = $record->records($data);
175             foreach my $fields (@records) {
176             foreach my $field = (@$fields);
177             # do something
178             }
179             }
180              
181             Note that above example will not remove the quotes from individual fields.
182              
183             =back
184              
185             =cut
186              
187             sub new {
188 5     5 1 2991 my ( $class, $value_of ) = @_;
189 5         23 my %value_of = %$value_of;
190              
191             # XXX fix this later after we have the core working
192 5         20 my $self = bless {}, $class;
193              
194 5 100       18 unless ( exists $value_of{split} ) {
195 1         3 $value_of{split} = $/;
196             }
197 5 50       15 $self->split( $value_of{split} )->unless( $value_of{unless} )
    100          
198             ->chomp( exists $value_of{chomp} ? $value_of{chomp} : 1 )
199             ->limit( exists $value_of{limit} ? $value_of{limit} : ALL_RECORDS );
200 5 50       19 $self->token( $value_of{token} ) if exists $value_of{token};
201 5 100       13 if ( exists $value_of{trim} ) {
202 2 100       10 $self->_croak("You may not specify 'trim' if 'limit' is specified")
203             if exists $value_of{limit};
204 1         3 $self->trim(1);
205             }
206 4 100       13 $self->_fields( $value_of{fields} ) if exists $value_of{fields};
207 4         19 return $self;
208             }
209              
210             ##############################################################################
211              
212             =head2 split
213              
214             my $split = $record->split;
215             $record->split($on_value);
216              
217             Getter/setter for split value. May be a regular expression or a scalar value.
218              
219             =cut
220              
221             sub split {
222 30     30 1 39 my $self = shift;
223 30 100       148 return $self->{split} unless @_;
224              
225 6         8 my $split = shift;
226 6 100       88 $split = qr/\Q$split\E/ unless 'Regexp' eq ref $split;
227 6         19 $self->{split} = $split;
228 6         24 return $self;
229             }
230              
231             ##############################################################################
232              
233             =head2 unless
234              
235             my $unless = $self->unless;
236             $self->unless($is_value);
237              
238             Getter/setter for unless value. May be a regular expression or a scalar value.
239              
240             =cut
241              
242             sub unless {
243 16     16 1 22 my $self = shift;
244 16 100       58 return $self->{unless} unless @_;
245              
246 6         9 my $unless = shift;
247 6 100       17 $unless = '' unless defined $unless;
248 6 100 66     53 $unless = qr/\Q$unless\E/
249             unless 'Regexp' eq ref $unless
250             || 'Regexp::Common' eq ref $unless;
251 6         15 $self->{unless} = $unless;
252 6         28 return $self;
253             }
254              
255             ##############################################################################
256              
257             =head2 chomp
258              
259             my $chomp = $record->chomp;
260             $record->chomp(0);
261              
262             Getter/setter for boolean chomp value.
263              
264             =cut
265              
266             sub chomp {
267 28     28 1 38 my $self = shift;
268 28 100       137 return $self->{chomp} unless @_;
269              
270 7         13 $self->{chomp} = shift;
271 7         30 return $self;
272             }
273              
274             ##############################################################################
275              
276             =head2 limit
277              
278             my $limit = $record->limit;
279             $record->limit(3);
280              
281             Getter/setter for integer limit value.
282              
283             =cut
284              
285             sub limit {
286 19     19 1 78 my $self = shift;
287 19 100       106 return $self->{limit} unless @_;
288              
289 8         11 my $limit = shift;
290 8 100       50 unless ( $limit =~ /^-?\d+$/ ) {
291 2         12 $self->_croak("limit must be an integer value, not ($limit)");
292             }
293 6         12 $self->{limit} = $limit;
294 6         11 return $self;
295             }
296              
297             ##############################################################################
298              
299             =head2 trim
300              
301             my $trim = $record->trim;
302             $record->trim(1);
303              
304             Getter/setter for boolean limit value. Setting this value will cause any
305             previous C value to be overwritten.
306              
307             =cut
308              
309             sub trim {
310 2     2 1 4 my $self = shift;
311 2 50       7 return $self->{trim} unless @_;
312              
313 2         3 my $limit = shift;
314 2 50       9 $self->{limit} = $limit ? TRIM_RECORDS : ALL_RECORDS;
315             }
316              
317             ##############################################################################
318              
319             =head2 token
320              
321             my $token = $record->token;
322             $record->token($string_not_found_in_text);
323              
324             Getter/setter for token value. Token must be a string that does not match the
325             split value and is not found in the text.
326              
327             You can return the current token value if you have set it in your code. If you
328             rely on this module to create a token (this is the normal behavior), it is not
329             available via this method until C is called.
330              
331             Setting the token to an undefined value causes L to try and find
332             a token itself.
333              
334             If the token matches the split value, this method will croak when you attempt
335             to set the token.
336              
337             If the token is found in the data, the C method will croak when it is
338             called.
339              
340             =cut
341              
342             sub token {
343 28     28 1 3717 my $self = shift;
344 28 100       99 return $self->{token} unless @_;
345              
346 14         17 my $token = shift;
347 14 100       30 if ( defined $token ) {
348 13 100       30 if ( $token =~ $self->split ) {
349 1         4 $self->_croak(
350 1         3 "Token ($token) must not match the split value (@{[$self->split]})"
351             );
352             }
353             }
354 13         28 $self->{token} = $token;
355 13         29 return $self;
356             }
357              
358             ##############################################################################
359              
360             =head2 records
361              
362             my @records = $record->records($data);
363              
364             Returns C<@records> for C<$data> based upon current split criteria.
365              
366             =cut
367              
368             sub records {
369 12     12 1 91 my ( $self, $data ) = @_;
370 12         27 my $token = $self->_create_token($data);
371 10         15 my @values;
372 10 50       20 if ( defined( my $unless = $self->unless ) ) {
373 10         15 my $index = 0;
374 10         271 $data =~ s{($unless)}
375             {
376 15         39 $values[$index] = $1;
377 15         82 $token . $index++ . $token;
378             }gex;
379              
380             #main::diag($data);
381             }
382 10         28 my $split = $self->split;
383 10 100       25 $split = $self->chomp ? $split : qr/($split)/;
384              
385             # if they have a numeric split value, we don't want to split tokens
386 10         51 my $token_re = qr/\Q$token\E/;
387 10 100       90 $split = qr/(?
388             if 0 =~ $split;
389 10         35 my @records = split $split, $data, $self->limit;
390 10 100       25 unless ( $self->chomp ) {
391 3         4 my @new_records;
392 3         11 while ( defined( my $record = shift @records ) ) {
393 9 50       17 if (@records) {
394 9         21 $record = join '', $record, shift @records;
395             }
396 9         29 push @new_records, $record;
397             }
398 3         8 @records = @new_records;
399             }
400              
401 10         21 foreach my $record (@records) {
402 38 100       101 unless ( NOT_FOUND eq index $record, $token ) {
403 15         117 $record =~ s{$token_re(\d+)$token_re}{$values[$1]}gex;
  15         64  
404             }
405             }
406 10 100       28 if ( my $field = $self->_fields ) {
407 1         9 $_ = [ $field->records($_) ] foreach @records;
408             }
409 10         94 return @records;
410             }
411              
412             sub _fields {
413 11     11   19 my $self = shift;
414 11 100       46 return $self->{fields} unless @_;
415              
416 1         8 my $fields = ref($self)->new(shift);
417 1 50       5 if ( defined( my $token = $self->token ) ) {
418 0         0 $fields->token($token);
419             }
420 1         3 $self->{fields} = $fields;
421 1         2 return $self;
422             }
423              
424             my @tokens = map { $_ x 6 } qw( ~ ` ? " { } ! @ $ % ^ & * - _ + = );
425              
426             sub _create_token {
427 12     12   20 my ( $self, $data ) = @_;
428 12         13 my $token;
429 12 100       26 if ( defined( $token = $self->token ) ) {
430 7 100       35 $self->_croak("Current token ($token) found in data")
431             unless NOT_FOUND eq index $data, $token;
432             }
433              
434 11         20 foreach my $curr_token (@tokens) {
435 27 100       73 if ( NOT_FOUND eq index $data, $curr_token ) {
436 10         12 $token = $curr_token;
437 10         20 $self->token($token);
438 10         15 last;
439             }
440             }
441 11 100       28 if ( defined $token ) {
442 10         23 return $token;
443             }
444              
445 1         5 my $tried = join ", ", @tokens;
446 1         6 $self->_croak(
447             "Could not determine a unique token for data. Tried ($tried)");
448             }
449              
450             sub _croak {
451 6     6   10 my ( $self, $message ) = @_;
452 6         41 require Carp;
453 6         86 Carp::croak($message);
454             }
455              
456             =head1 BUGS
457              
458             It's possible to get erroneous results if the split value is C. I've
459             tried to work around this. Please let me know if there is a problem.
460              
461             =head1 CAVEATS
462              
463             This module must read I of the data at once. This can make it slow for
464             larger data sets.
465              
466             =head1 AUTHOR
467              
468             Curtis "Ovid" Poe, C<< >>
469              
470             =head1 BUGS
471              
472             Please report any bugs or feature requests to
473             C, or through the web interface at
474             L.
475             I will be notified, and then you'll automatically be notified of progress on
476             your bug as I make changes.
477              
478             =head1 ACKNOWLEDGEMENTS
479              
480             Thanks to the Monks for inspiration from
481             L.
482              
483             0.02 Thanks to Smylers and Stefano Rodighiero for catching POD errors.
484              
485             =head1 COPYRIGHT & LICENSE
486              
487             Copyright 2005 Curtis "Ovid" Poe, all rights reserved.
488              
489             This program is free software; you can redistribute it and/or modify it
490             under the same terms as Perl itself.
491              
492             =cut
493              
494             1; # End of Data::Record