File Coverage

blib/lib/Data/Record.pm

Criterion	Covered	Total	%
statement	113	114	99.1
branch	59	66	89.3
condition	2	3	66.6
subroutine	16	16	100.0
pod	8	8	100.0
total	198	207	95.6

line	stmt	bran	cond	sub	pod	time	code
1							package Data::Record;
2
3	3			3		95368	use warnings;
	3					6
	3					110
4	3			3		19	use strict;
	3					6
	3					183
5
6							=head1 NAME
7
8							Data::Record - "split" on steroids
9
10							=head1 VERSION
11
12							Version 0.02
13
14							=cut
15
16							our $VERSION = '0.02';
17	3			3		18	use constant NOT_FOUND => -1;
	3					9
	3					224
18	3			3		17	use constant ALL_RECORDS => -1;
	3					5
	3					154
19	3			3		15	use constant TRIM_RECORDS => 0;
	3					5
	3					5175
20
21							=head1 SYNOPSIS
22
23							use Regexp::Common;
24							use Data::Record;
25							my $record = Data::Record->new({
26							split => "\n",
27							unless => $RE{quoted},
28							});
29							my @data = $record->records($data);
30
31							=head1 DESCRIPTION
32
33							Sometimes we need data split into records and a simple split on the input
34							record separator (C<$/>) or some other value fails because the values we're
35							splitting on may allowed in other parts of the data. Perhaps they're quoted.
36							Perhaps they're embedded in other data which should not be split up.
37
38							This module allows you to specify what you wish to split the data on, but also
39							speficy an "unless" regular expression. If the text in question matches the
40							"unless" regex, it will not be split there. This allows us to do things like
41							split on newlines unless newlines are embedded in quotes.
42
43							=head1 METHODS
44
45							=head2 new
46
47							Common usage:
48
49							my $record = Data::Record->new({
50							split => qr/$split/,
51							unless => qr/$unless/,
52							});
53
54							Advanced usage:
55
56							my $record = Data::Record->new({
57							split => qr/$split/,
58							unless => qr/$unless/, # optional
59							token => $token, # optional
60							chomp => 0, # optional
61							limit => $limit, # optional (do not use with trim)
62							trim => 1, # optional (do not use with limit)
63							fields => {
64							split => ',',
65							unless => $RE{quoted}, # from Regexp::Common
66							}
67							});
68
69							The constructor takes a hashref of key/value pairs to set the behavior of data
70							records to be created.
71
72							=over 4
73
74							=item * split
75
76							This is the value to split the data on. It may be either a regular expression
77							or a string.
78
79							Defaults to the current input record separator (C<$/>).
80
81							=item * unless
82
83							Data will be split into records matching the split value I they also
84							match this value. No default.
85
86							If you do not have an C value, use of this module is overkill.
87
88							=item * token
89
90							You will probably never need to set this value.
91
92							Internally, this module attempts to find a token which does not match any text
93							found in the data to be split and also does not match the split value. This is
94							necessary because we mask the data we don't want to split using this token.
95							This allows us to split the resulting text.
96
97							In the unlikely event that the module cannot find a token which is not in the
98							text, you may set the token value yourself to some string value. Do not set it
99							to a regular expression.
100
101							=item * chomp
102
103							By default, the split value is discarded (chomped) from each record. Set this
104							to a true value to keep the split value on each record. This differs slightly
105							from how it's done with split and capturing parentheses:
106
107							split /(\,)/, '3,4,5';
108
109							Ordinarily, this results in the following list:
110
111							( 3, ',', 4, ',', 5 )
112
113							This module assumes you want those values I the preceding record. By
114							setting chomp to false, you get the following list:
115
116							( '3,', '4,' 5 )
117
118							=item * limit
119
120							The default split behavior is similar to this:
121
122							split $split_regex, $data;
123
124							Setting C will cause the behavior to act like this:
125
126							split $split_regex, $data, $limit
127
128							See C for more information about the behavior of C.
129
130							You may not set both C and C in the constructor.
131
132							=item * trim
133
134							By default, we return all records. This means that due to the nature of split
135							and how we're doing things, we sometimes get a trailing null record. However,
136							setting this value causes the module to behave as if we had done this:
137
138							split $split_regex, $data, 0;
139
140							When C is called with a zero as the third argument, trailing null values
141							are discarded. See C for more information.
142
143							You may not set both C and C in the constructor.
144
145							B: This does I trim white space around returned records.
146
147							=item * fields
148
149							By default, individual records are returned as strings. If you set C,
150							you pass in a hashref of arguments that are identical to what C would take
151							and resulting records are returned as array references processed by a new
152							C instance.
153
154							Example: a quick CSV parser which assumes that commas and newlines may both be
155							in quotes:
156
157							# four lines, but there are only three records! (newline in quotes)
158							$data = <<'END_DATA';
159							1,2,"programmer, perl",4,5
160							1,2,"programmer,
161							perl",4,5
162							1,2,3,4,5
163							END_DATA
164
165							$record = $RECORD->new({
166							split => "\n",
167							unless => $quoted,
168							trim => 1,
169							fields => {
170							split => ",",
171							unless => $quoted,
172							}
173							});
174							my @records = $record->records($data);
175							foreach my $fields (@records) {
176							foreach my $field = (@$fields);
177							# do something
178							}
179							}
180
181							Note that above example will not remove the quotes from individual fields.
182
183							=back
184
185							=cut
186
187							sub new {
188	5			5	1	2991	my ( $class, $value_of ) = @_;
189	5					23	my %value_of = %$value_of;
190
191							# XXX fix this later after we have the core working
192	5					20	my $self = bless {}, $class;
193
194	5	100				18	unless ( exists $value_of{split} ) {
195	1					3	$value_of{split} = $/;
196							}
197	5	50				15	$self->split( $value_of{split} )->unless( $value_of{unless} )
		100
198							->chomp( exists $value_of{chomp} ? $value_of{chomp} : 1 )
199							->limit( exists $value_of{limit} ? $value_of{limit} : ALL_RECORDS );
200	5	50				19	$self->token( $value_of{token} ) if exists $value_of{token};
201	5	100				13	if ( exists $value_of{trim} ) {
202	2	100				10	$self->_croak("You may not specify 'trim' if 'limit' is specified")
203							if exists $value_of{limit};
204	1					3	$self->trim(1);
205							}
206	4	100				13	$self->_fields( $value_of{fields} ) if exists $value_of{fields};
207	4					19	return $self;
208							}
209
210							##############################################################################
211
212							=head2 split
213
214							my $split = $record->split;
215							$record->split($on_value);
216
217							Getter/setter for split value. May be a regular expression or a scalar value.
218
219							=cut
220
221							sub split {
222	30			30	1	39	my $self = shift;
223	30	100				148	return $self->{split} unless @_;
224
225	6					8	my $split = shift;
226	6	100				88	$split = qr/\Q$split\E/ unless 'Regexp' eq ref $split;
227	6					19	$self->{split} = $split;
228	6					24	return $self;
229							}
230
231							##############################################################################
232
233							=head2 unless
234
235							my $unless = $self->unless;
236							$self->unless($is_value);
237
238							Getter/setter for unless value. May be a regular expression or a scalar value.
239
240							=cut
241
242							sub unless {
243	16			16	1	22	my $self = shift;
244	16	100				58	return $self->{unless} unless @_;
245
246	6					9	my $unless = shift;
247	6	100				17	$unless = '' unless defined $unless;
248	6	100	66			53	$unless = qr/\Q$unless\E/
249							unless 'Regexp' eq ref $unless
250							\|\| 'Regexp::Common' eq ref $unless;
251	6					15	$self->{unless} = $unless;
252	6					28	return $self;
253							}
254
255							##############################################################################
256
257							=head2 chomp
258
259							my $chomp = $record->chomp;
260							$record->chomp(0);
261
262							Getter/setter for boolean chomp value.
263
264							=cut
265
266							sub chomp {
267	28			28	1	38	my $self = shift;
268	28	100				137	return $self->{chomp} unless @_;
269
270	7					13	$self->{chomp} = shift;
271	7					30	return $self;
272							}
273
274							##############################################################################
275
276							=head2 limit
277
278							my $limit = $record->limit;
279							$record->limit(3);
280
281							Getter/setter for integer limit value.
282
283							=cut
284
285							sub limit {
286	19			19	1	78	my $self = shift;
287	19	100				106	return $self->{limit} unless @_;
288
289	8					11	my $limit = shift;
290	8	100				50	unless ( $limit =~ /^-?\d+$/ ) {
291	2					12	$self->_croak("limit must be an integer value, not ($limit)");
292							}
293	6					12	$self->{limit} = $limit;
294	6					11	return $self;
295							}
296
297							##############################################################################
298
299							=head2 trim
300
301							my $trim = $record->trim;
302							$record->trim(1);
303
304							Getter/setter for boolean limit value. Setting this value will cause any
305							previous C value to be overwritten.
306
307							=cut
308
309							sub trim {
310	2			2	1	4	my $self = shift;
311	2	50				7	return $self->{trim} unless @_;
312
313	2					3	my $limit = shift;
314	2	50				9	$self->{limit} = $limit ? TRIM_RECORDS : ALL_RECORDS;
315							}
316
317							##############################################################################
318
319							=head2 token
320
321							my $token = $record->token;
322							$record->token($string_not_found_in_text);
323
324							Getter/setter for token value. Token must be a string that does not match the
325							split value and is not found in the text.
326
327							You can return the current token value if you have set it in your code. If you
328							rely on this module to create a token (this is the normal behavior), it is not
329							available via this method until C is called.
330
331							Setting the token to an undefined value causes L to try and find
332							a token itself.
333
334							If the token matches the split value, this method will croak when you attempt
335							to set the token.
336
337							If the token is found in the data, the C method will croak when it is
338							called.
339
340							=cut
341
342							sub token {
343	28			28	1	3717	my $self = shift;
344	28	100				99	return $self->{token} unless @_;
345
346	14					17	my $token = shift;
347	14	100				30	if ( defined $token ) {
348	13	100				30	if ( $token =~ $self->split ) {
349	1					4	$self->_croak(
350	1					3	"Token ($token) must not match the split value (@{[$self->split]})"
351							);
352							}
353							}
354	13					28	$self->{token} = $token;
355	13					29	return $self;
356							}
357
358							##############################################################################
359
360							=head2 records
361
362							my @records = $record->records($data);
363
364							Returns C<@records> for C<$data> based upon current split criteria.
365
366							=cut
367
368							sub records {
369	12			12	1	91	my ( $self, $data ) = @_;
370	12					27	my $token = $self->_create_token($data);
371	10					15	my @values;
372	10	50				20	if ( defined( my $unless = $self->unless ) ) {
373	10					15	my $index = 0;
374	10					271	$data =~ s{($unless)}
375							{
376	15					39	$values[$index] = $1;
377	15					82	$token . $index++ . $token;
378							}gex;
379
380							#main::diag($data);
381							}
382	10					28	my $split = $self->split;
383	10	100				25	$split = $self->chomp ? $split : qr/($split)/;
384
385							# if they have a numeric split value, we don't want to split tokens
386	10					51	my $token_re = qr/\Q$token\E/;
387	10	100				90	$split = qr/(?
388							if 0 =~ $split;
389	10					35	my @records = split $split, $data, $self->limit;
390	10	100				25	unless ( $self->chomp ) {
391	3					4	my @new_records;
392	3					11	while ( defined( my $record = shift @records ) ) {
393	9	50				17	if (@records) {
394	9					21	$record = join '', $record, shift @records;
395							}
396	9					29	push @new_records, $record;
397							}
398	3					8	@records = @new_records;
399							}
400
401	10					21	foreach my $record (@records) {
402	38	100				101	unless ( NOT_FOUND eq index $record, $token ) {
403	15					117	$record =~ s{$token_re(\d+)$token_re}{$values[$1]}gex;
	15					64
404							}
405							}
406	10	100				28	if ( my $field = $self->_fields ) {
407	1					9	$_ = [ $field->records($_) ] foreach @records;
408							}
409	10					94	return @records;
410							}
411
412							sub _fields {
413	11			11		19	my $self = shift;
414	11	100				46	return $self->{fields} unless @_;
415
416	1					8	my $fields = ref($self)->new(shift);
417	1	50				5	if ( defined( my $token = $self->token ) ) {
418	0					0	$fields->token($token);
419							}
420	1					3	$self->{fields} = $fields;
421	1					2	return $self;
422							}
423
424							my @tokens = map { $_ x 6 } qw( ~ ` ? " { } ! @ $ % ^ & * - _ + = );
425
426							sub _create_token {
427	12			12		20	my ( $self, $data ) = @_;
428	12					13	my $token;
429	12	100				26	if ( defined( $token = $self->token ) ) {
430	7	100				35	$self->_croak("Current token ($token) found in data")
431							unless NOT_FOUND eq index $data, $token;
432							}
433
434	11					20	foreach my $curr_token (@tokens) {
435	27	100				73	if ( NOT_FOUND eq index $data, $curr_token ) {
436	10					12	$token = $curr_token;
437	10					20	$self->token($token);
438	10					15	last;
439							}
440							}
441	11	100				28	if ( defined $token ) {
442	10					23	return $token;
443							}
444
445	1					5	my $tried = join ", ", @tokens;
446	1					6	$self->_croak(
447							"Could not determine a unique token for data. Tried ($tried)");
448							}
449
450							sub _croak {
451	6			6		10	my ( $self, $message ) = @_;
452	6					41	require Carp;
453	6					86	Carp::croak($message);
454							}
455
456							=head1 BUGS
457
458							It's possible to get erroneous results if the split value is C. I've
459							tried to work around this. Please let me know if there is a problem.
460
461							=head1 CAVEATS
462
463							This module must read I of the data at once. This can make it slow for
464							larger data sets.
465
466							=head1 AUTHOR
467
468							Curtis "Ovid" Poe, C<< >>
469
470							=head1 BUGS
471
472							Please report any bugs or feature requests to
473							C, or through the web interface at
474							L.
475							I will be notified, and then you'll automatically be notified of progress on
476							your bug as I make changes.
477
478							=head1 ACKNOWLEDGEMENTS
479
480							Thanks to the Monks for inspiration from
481							L.
482
483							0.02 Thanks to Smylers and Stefano Rodighiero for catching POD errors.
484
485							=head1 COPYRIGHT & LICENSE
486
487							Copyright 2005 Curtis "Ovid" Poe, all rights reserved.
488
489							This program is free software; you can redistribute it and/or modify it
490							under the same terms as Perl itself.
491
492							=cut
493
494							1; # End of Data::Record