File Coverage

blib/lib/Text/RecordParser.pm

Criterion	Covered	Total	%
statement	276	279	98.9
branch	168	180	93.3
condition	29	37	78.3
subroutine	31	31	100.0
pod	23	23	100.0
total	527	550	95.8

line	stmt	bran	cond	sub	pod	time	code
1							package Text::RecordParser;
2
3							=head1 NAME
4
5							Text::RecordParser - read record-oriented files
6
7							=head1 SYNOPSIS
8
9							use Text::RecordParser;
10
11							# use default record (\n) and field (,) separators
12							my $p = Text::RecordParser->new( $file );
13
14							# or be explicit
15							my $p = Text::RecordParser->new({
16							filename => $file,
17							field_separator => "\t",
18							});
19
20							$p->filename('foo.csv');
21
22							# Split records on two newlines
23							$p->record_separator("\n\n");
24
25							# Split fields on tabs
26							$p->field_separator("\t");
27
28							# Skip lines beginning with hashes
29							$p->comment( qr/^#/ );
30
31							# Trim whitespace
32							$p->trim(1);
33
34							# Use the fields in the first line as column names
35							$p->bind_header;
36
37							# Get a list of the header fields (in order)
38							my @columns = $p->field_list;
39
40							# Extract a particular field from the next row
41							my ( $name, $age ) = $p->extract( qw[name age] );
42
43							# Return all the fields from the next row
44							my @fields = $p->fetchrow_array;
45
46							# Define a field alias
47							$p->set_field_alias( name => 'handle' );
48
49							# Return all the fields from the next row as a hashref
50							my $record = $p->fetchrow_hashref;
51							print $record->{'name'};
52							# or
53							print $record->{'handle'};
54
55							# Return the record as an object with fields as accessors
56							my $object = $p->fetchrow_object;
57							print $object->name; # or $object->handle;
58
59							# Get all data as arrayref of arrayrefs
60							my $data = $p->fetchall_arrayref;
61
62							# Get all data as arrayref of hashrefs
63							my $data = $p->fetchall_arrayref( { Columns => {} } );
64
65							# Get all data as hashref of hashrefs
66							my $data = $p->fetchall_hashref('name');
67
68							=head1 DESCRIPTION
69
70							This module is for reading record-oriented data in a delimited text
71							file. The most common example have records separated by newlines and
72							fields separated by commas or tabs, but this module aims to provide a
73							consistent interface for handling sequential records in a file however
74							they may be delimited. Typically this data lists the fields in the
75							first line of the file, in which case you should call C
76							to bind the field name (or not, and it will be called implicitly). If
77							the first line contains data, you can still bind your own field names
78							via C. Either way, you can then use many methods to get
79							at the data as arrays or hashes.
80
81							=head1 METHODS
82
83							=cut
84
85	12			12		227334	use strict;
	12					18
	12					371
86	12			12		36	use warnings;
	12					17
	12					262
87	12			12		4370	use version;
	12					14735
	12					52
88	12			12		994	use Carp qw( croak );
	12					16
	12					624
89	12			12		4928	use IO::Scalar;
	12					101251
	12					469
90	12			12		5376	use List::MoreUtils qw( uniq );
	12					85408
	12					82
91	12			12		6647	use Readonly;
	12					6131
	12					570
92	12			12		5049	use Text::ParseWords qw( parse_line );
	12					9986
	12					26520
93
94							our $VERSION = version->new('1.6.4');
95
96							Readonly my $COMMA => q{,};
97							Readonly my $EMPTY_STR => q{};
98							Readonly my $NEW_LINE => qq{\n};
99							Readonly my $PIPE => q{\|};
100
101							# ----------------------------------------------------------------
102							sub new {
103
104							=pod
105
106							=head2 new
107
108							This is the object constructor. It takes a hash (or hashref) of
109							arguments. Each argument can also be set through the method of the
110							same name.
111
112							=over 4
113
114							=item * filename
115
116							The path to the file being read. If the filename is passed and the fh
117							is not, then it will open a filehandle on that file and sets C
118							accordingly.
119
120							=item * comment
121
122							A compiled regular expression identifying comment lines that should
123							be skipped.
124
125							=item * data
126
127							The data to read.
128
129							=item * fh
130
131							The filehandle of the file to read.
132
133							=item * field_separator \| fs
134
135							The field separator (default is comma).
136
137							=item * record_separator \| rs
138
139							The record separator (default is newline).
140
141							=item * field_filter
142
143							A callback applied to all the fields as they are read.
144
145							=item * header_filter
146
147							A callback applied to the column names.
148
149							=item * trim
150
151							Boolean to enable trimming of leading and trailing whitespace from fields
152							(useful if splitting on whitespace only).
153
154							=back
155
156							See methods for each argument name for more information.
157
158							Alternately, if you supply a single argument to C, it will be
159							treated as the C argument.
160
161							=cut
162
163	58			58	1	24908	my $class = shift;
164
165	58	100	100			427	my $args
		100
166							= defined $_[0] && UNIVERSAL::isa( $_[0], 'HASH' ) ? shift
167							: scalar @_ == 1 ? { filename => shift }
168							: { @_ };
169
170	58					93	my $self = bless {}, $class;
171
172	58	100				117	if ( my $fs = $args->{'fs'} ) {
173	1					2	$args->{'field_separator'} = $fs;
174	1					2	delete $args->{'fs'};
175							}
176
177	58	100				117	if ( my $rs = $args->{'rs'} ) {
178	1					2	$args->{'record_separator'} = $rs;
179	1					2	delete $args->{'rs'};
180							}
181
182	58					83	my $data_handles = 0;
183	58					97	for my $arg (
184							qw[ filename fh header_filter field_filter trim
185							field_separator record_separator data comment
186							]
187							) {
188	522	100				718	next if !defined $args->{ $arg };
189
190	56	100				212	if ( $arg =~ / \A (filename\|fh\|data) \Z /xms ) {
191	37					41	$data_handles++;
192							}
193
194	56					127	$self->$arg( $args->{ $arg } );
195							}
196
197	58	100				108	if ( $data_handles > 1 ) {
198	1					17	croak
199							'Passed too many arguments to read the data. '.
200							'Please choose only one of "filename," "fh," or "data."'
201							;
202							}
203
204	57					157	return $self;
205							}
206
207							# ----------------------------------------------------------------
208							sub bind_fields {
209
210							=pod
211
212							=head2 bind_fields
213
214							$p->bind_fields( qw[ name rank serial_number ] );
215
216							Takes an array of field names and memorizes the field positions for
217							later use. If the input file has no header line but you still wish to
218							retrieve the fields by name (or even if you want to call
219							C and then give your own field names), simply pass in the
220							an array of field names you wish to use.
221
222							Pass in an empty array reference to unset:
223
224							$p->bind_field( [] ); # unsets fields
225
226							=cut
227
228	42			42	1	2696	my $self = shift;
229
230							# using an empty arrayref to unset
231	42	100	66			139	if ( ref $_[0] eq 'ARRAY' && !@{ $_[0] } ) {
	1	100				3
232	1					1	$self->{'field_pos_ordered'} = [];
233	1					3	$self->{'field_pos'} = {};
234	1					2	$self->{'fields_bound'} = 0;
235							}
236							elsif ( @_ ) {
237	40					65	my @fields = @_;
238	40					102	$self->{'field_pos_ordered'} = [ @fields ];
239
240	40					35	my %field_pos;
241	40					75	for my $i ( 0 .. $#fields ) {
242	183	50				213	next unless $fields[ $i ];
243	183					216	$field_pos{ $fields[ $i ] } = $i;
244							}
245
246	40					66	$self->{'field_pos'} = \%field_pos;
247	40					69	$self->{'fields_bound'} = 1;
248							}
249							else {
250	1					15	croak 'Bind fields called without field list';
251							}
252
253	41					46	return 1;
254							}
255
256							# ----------------------------------------------------------------
257							sub bind_header {
258
259							=pod
260
261							=head2 bind_header
262
263							$p->bind_header;
264							my $name = $p->extract('name');
265
266							Takes the fields from the next row under the cursor and assigns the field
267							names to the values. Usually you would call this immediately after
268							opening the file in order to bind the field names in the first row.
269
270							=cut
271
272	37			37	1	110	my $self = shift;
273
274	37	100				68	if ( my @columns = $self->fetchrow_array ) {
275	34	100				65	if ( my $filter = $self->header_filter ) {
276	2					5	for my $i ( 0 .. $#columns ) {
277	12					26	$columns[ $i ] = $filter->( $columns[ $i ] );
278							}
279							}
280
281	34					225	$self->bind_fields( @columns );
282							}
283							else {
284	2					5	croak q[Can't find columns in file '], $self->filename, q['];
285							}
286
287	34					61	return 1;
288							}
289
290							# ----------------------------------------------------------------
291							sub comment {
292
293							=pod
294
295							=head2 comment
296
297							$p->comment( qr/^#/ ); # Perl-style comments
298							$p->comment( qr/^--/ ); # SQL-style comments
299
300							Takes a regex to apply to a record to see if it looks like a comment
301							to skip.
302
303							=cut
304
305	93			93	1	108	my $self = shift;
306
307	93	100				129	if ( my $arg = shift ) {
308	3	100				7	if ( ref $arg ne 'Regexp' ) {
309	1					20	croak q[Argument to comment doesn't look like a regex];
310							}
311
312	2					18	$self->{'comment'} = $arg;
313							}
314
315	92	100				240	return defined $self->{'comment'} ? $self->{'comment'} : $EMPTY_STR;
316							}
317
318							# ----------------------------------------------------------------
319							sub data {
320
321							=pod
322
323							=head2 data
324
325							$p->data( $string );
326							$p->data( \$string );
327							$p->data( @lines );
328							$p->data( [$line1, $line2, $line3] );
329							$p->data( IO::File->new('
330
331							Allows a scalar, scalar reference, glob, array, or array reference as
332							the thing to read instead of a file handle.
333
334							It's not advised to pass a filehandle to C as it will read the
335							entire contents of the file rather than one line at a time if you set
336							it via C.
337
338							=cut
339
340	10			10	1	1290	my $self = shift;
341	10					34	my $data;
342
343	10	100				16	if (@_) {
344	9					10	my $arg = shift;
345
346	9	100	66			69	if ( UNIVERSAL::isa( $arg, 'SCALAR' ) ) {
		100
		100
		100
347	1					2	$data = $$arg;
348							}
349							elsif ( UNIVERSAL::isa( $arg, 'ARRAY' ) ) {
350	1					2	$data = join $EMPTY_STR, @$arg;
351							}
352							elsif ( UNIVERSAL::isa( $arg, 'GLOB' ) ) {
353	1					4	local $/;
354	1					25	$data = <$arg>;
355							}
356							elsif ( !ref($arg) && @_ ) {
357	2					7	$data = join $EMPTY_STR, $arg, @_;
358							}
359							else {
360	4					5	$data = $arg;
361							}
362							}
363							else {
364	1					7	croak 'Data called without any arguments';
365							}
366
367	9	100				24	if ( $data ) {
368	8					30	my $fh = IO::Scalar->new( \$data );
369	8					326	$self->fh( $fh );
370							}
371							else {
372	1					9	croak 'No usable data';
373							}
374
375	8					17	return 1;
376							}
377
378							# ----------------------------------------------------------------
379							sub extract {
380
381							=pod
382
383							=head2 extract
384
385							my ( $foo, $bar, $baz ) = $p->extract( qw[ foo bar baz ] );
386
387							Extracts a list of fields out of the last row read. The field names
388							must correspond to the field names bound either via C or
389							C.
390
391							=cut
392
393	6			6	1	931	my $self = shift;
394	6	100				13	my @fields = @_ or return;
395	5					9	my %allowed = map { $_, 1 } $self->field_list;
	21					25
396	4	100				8	my $record = $self->fetchrow_hashref or return;
397
398	3					3	my @data;
399	3					4	foreach my $field ( @fields ) {
400	4	100				5	if ( $allowed{ $field } ) {
401	3					4	push @data, $record->{ $field };
402							}
403							else {
404	1					3	croak "Invalid field $field for file "
405							. $self->filename
406							. $NEW_LINE
407							. 'Valid fields are: '
408							. join(', ', $self->field_list)
409							. $NEW_LINE
410							;
411							}
412							}
413
414	2	100				9	return scalar @data == 1 ? $data[0] : @data;
415							}
416
417							# ----------------------------------------------------------------
418							sub fetchrow_array {
419
420							=pod
421
422							=head2 fetchrow_array
423
424							my @values = $p->fetchrow_array;
425
426							Reads a row from the file and returns an array or array reference
427							of the fields.
428
429							=cut
430
431	91			91	1	2286	my $self = shift;
432	91	100				105	my $fh = $self->fh or croak 'No filename or file handle';
433	90					166	my $comment = $self->comment;
434	90					292	local $/ = $self->record_separator;
435
436	90					476	my $line;
437	90					67	my $line_no = 0;
438	90					66	for ( ;; ) {
439	101					70	$line_no++;
440	101	100				565	defined( $line = <$fh> ) or return;
441	91					412	chomp $line;
442	91	100	100			174	next if $comment and $line =~ $comment;
443	87					161	$line =~ s/(?
444	87	100				135	last if $line;
445							}
446
447	80					117	my $separator = $self->field_separator;
448	80	100				178	$separator eq $PIPE and $separator = '\\|';
449	80	50				412	my @fields = map { defined $_ && $_ =~ s/\\'/'/g; $_ } (
	391	100				7341
	391					409
450							( ref $separator eq 'Regexp' )
451							? parse_line( $separator, 0, $line )
452							: parse_line( $separator, 1, $line )
453							);
454
455	80	50				138	if ( !@fields ) {
456	0					0	croak "Error reading line number $line_no:\n$line";
457							}
458
459	80	100				164	if ( my $filter = $self->field_filter ) {
460	4					6	@fields = map { $filter->( $_ ) } @fields;
	24					68
461							}
462
463	80	100				464	if ( $self->trim ) {
464	7	50				6	@fields = map { defined $_ && s/^\s+\|\s+$//g; $_ } @fields;
	29					68
	29					28
465							}
466
467	80					67	while ( my ( $position, $callback ) = each %{ $self->field_compute } ) {
	87					140
468	7	100				21	next if $position !~ m/^\d+$/;
469	3					10	$fields[ $position ] = $callback->( $fields[ $position ], \@fields );
470							}
471
472	80	100				455	return wantarray ? @fields : \@fields;
473							}
474
475							# ----------------------------------------------------------------
476							sub fetchrow_hashref {
477
478							=pod
479
480							=head2 fetchrow_hashref
481
482							my $record = $p->fetchrow_hashref;
483							print "Name = ", $record->{'name'}, "\n";
484
485							Reads a line of the file and returns it as a hash reference. The keys
486							of the hashref are the field names bound via C or
487							C. If you do not bind fields prior to calling this method,
488							the C method will be implicitly called for you.
489
490							=cut
491
492	38			38	1	4939	my $self = shift;
493	38	50				52	my @fields = $self->field_list or return;
494	38	100				69	my @row = $self->fetchrow_array or return;
495
496	33					35	my $i = 0;
497	33					23	my %return;
498	33					101	for my $field ( @fields ) {
499	156	50				196	next unless defined $row[ $i ];
500	156					174	$return{ $field } = $row[ $i++ ];
501	156	100				177	if ( my @aliases = $self->get_field_aliases( $field ) ) {
502	2					8	$return{ $_ } = $return{ $field } for @aliases;
503							}
504							}
505
506	33					33	while ( my ( $position, $callback ) = each %{ $self->field_compute } ) {
	37					88
507	4					11	$return{ $position } = $callback->( $return{ $position }, \%return );
508							}
509
510	33					104	return \%return;
511							}
512
513							# ----------------------------------------------------------------
514							sub fetchrow_object {
515
516							=pod
517
518							=head2 fetchrow_object
519
520							while ( my $object = $p->fetchrow_object ) {
521							my $id = $object->id;
522							my $name = $object->naem; # <-- this will throw a runtime error
523							}
524
525							This will return the next data record as a Text::RecordParser::Object
526							object that has read-only accessor methods of the field names and any
527							aliases. This allows you to enforce field names, further helping
528							ensure that your code is reading the input file correctly. That is,
529							if you are using the "fetchrow_hashref" method to read each line, you
530							may misspell the hash key and introduce a bug in your code. With this
531							method, Perl will throw an error if you attempt to read a field not
532							defined in the file's headers. Additionally, any defined field
533							aliases will be created as additional accessor methods.
534
535							=cut
536
537	4			4	1	674	my $self = shift;
538	4	100				7	my $row = $self->fetchrow_hashref or return;
539	3	50				5	my @fields = $self->field_list or return;
540
541	3					3	push @fields, map { $self->get_field_aliases( $_ ) } @fields;
	10					12
542
543	3					15	return Text::RecordParser::Object->new( \@fields, $row );
544							}
545
546							# ----------------------------------------------------------------
547							sub fetchall_arrayref {
548
549							=pod
550
551							=head2 fetchall_arrayref
552
553							my $records = $p->fetchall_arrayref;
554							for my $record ( @$records ) {
555							print "Name = ", $record->[0], "\n";
556							}
557
558							my $records = $p->fetchall_arrayref( { Columns => {} } );
559							for my $record ( @$records ) {
560							print "Name = ", $record->{'name'}, "\n";
561							}
562
563							Like DBI's fetchall_arrayref, returns an arrayref of arrayrefs. Also
564							accepts optional "{ Columns => {} }" argument to return an arrayref of
565							hashrefs.
566
567							=cut
568
569	4			4	1	14	my $self = shift;
570	1					3	my %args
571	4	100	100			24	= defined $_[0] && ref $_[0] eq 'HASH' ? %{ shift() }
		100
572							: @_ % 2 == 0 ? @_
573							: ();
574
575	4	100				10	my $method = ref $args{'Columns'} eq 'HASH'
576							? 'fetchrow_hashref' : 'fetchrow_array';
577
578	4					4	my @return;
579	4					8	while ( my $record = $self->$method() ) {
580	9					18	push @return, $record;
581							}
582
583	4					12	return \@return;
584							}
585
586							# ----------------------------------------------------------------
587							sub fetchall_hashref {
588
589							=pod
590
591							=head2 fetchall_hashref
592
593							my $records = $p->fetchall_hashref('id');
594							for my $id ( keys %$records ) {
595							my $record = $records->{ $id };
596							print "Name = ", $record->{'name'}, "\n";
597							}
598
599							Like DBI's fetchall_hashref, this returns a hash reference of hash
600							references. The keys of the top-level hashref are the field values
601							of the field argument you supply. The field name you supply can be
602							a field created by a C.
603
604							=cut
605
606	3			3	1	40	my $self = shift;
607	3		50			6	my $key_field = shift(@_) \|\| return croak('No key field');
608	3					4	my @fields = $self->field_list;
609
610	3					3	my ( %return, $field_ok );
611	3					4	while ( my $record = $self->fetchrow_hashref ) {
612	5	100				7	if ( !$field_ok ) {
613	3	100				7	if ( !exists $record->{ $key_field } ) {
614	1					17	croak "Invalid key field: '$key_field'";
615							}
616
617	2					3	$field_ok = 1;
618							}
619
620	4					9	$return{ $record->{ $key_field } } = $record;
621							}
622
623	2					5	return \%return;
624							}
625
626							# ----------------------------------------------------------------
627							sub fh {
628
629							=pod
630
631							=head2 fh
632
633							open my $fh, '<', $file or die $!;
634							$p->fh( $fh );
635
636							Gets or sets the filehandle of the file being read.
637
638							=cut
639
640	148			148	1	1465	my ( $self, $arg ) = @_;
641
642	148	100				228	if ( defined $arg ) {
643	13	100				32	if ( ! UNIVERSAL::isa( $arg, 'GLOB' ) ) {
644	1					8	croak q[Argument to fh doesn't look like a filehandle];
645							}
646
647	12	100				20	if ( defined $self->{'fh'} ) {
648	3	100				19	close $self->{'fh'} or croak "Can't close existing filehandle: $!";
649							}
650
651	11					22	$self->{'fh'} = $arg;
652	11					27	$self->{'filename'} = $EMPTY_STR;
653							}
654
655	146	100				276	if ( !defined $self->{'fh'} ) {
656	73	100				112	if ( my $file = $self->filename ) {
657	33	100				913	open my $fh, '<', $file or croak "Cannot read '$file': $!";
658	32					74	$self->{'fh'} = $fh;
659							}
660							}
661
662	145					557	return $self->{'fh'};
663							}
664
665							# ----------------------------------------------------------------
666							sub field_compute {
667
668							=pod
669
670							=head2 field_compute
671
672							A callback applied to the fields identified by position (or field
673							name if C or C was called).
674
675							The callback will be passed two arguments:
676
677							=over 4
678
679							=item 1
680
681							The current field
682
683							=item 2
684
685							A reference to all the other fields, either as an array or hash
686							reference, depending on the method which you called.
687
688							=back
689
690							If data looks like this:
691
692							parent children
693							Mike Greg,Peter,Bobby
694							Carol Marcia,Jane,Cindy
695
696							You could split the "children" field into an array reference with the
697							values like so:
698
699							$p->field_compute( 'children', sub { [ split /,/, shift() ] } );
700
701							The field position or name doesn't actually have to exist, which means
702							you could create new, computed fields on-the-fly. E.g., if you data
703							looks like this:
704
705							1,3,5
706							32,4,1
707							9,5,4
708
709							You could write a field_compute like this:
710
711							$p->field_compute( 3,
712							sub {
713							my ( $cur, $others ) = @_;
714							my $sum;
715							$sum += $_ for @$others;
716							return $sum;
717							}
718							);
719
720							Field "3" will be created as the sum of the other fields. This allows
721							you to further write:
722
723							my $data = $p->fetchall_arrayref;
724							for my $rec ( @$data ) {
725							print "$rec->[0] + $rec->[1] + $rec->[2] = $rec->[3]\n";
726							}
727
728							Prints:
729
730							1 + 3 + 5 = 9
731							32 + 4 + 1 = 37
732							9 + 5 + 4 = 18
733
734							=cut
735
736	130			130	1	710	my $self = shift;
737
738	130	100				175	if ( @_ ) {
739	6					8	my ( $position, $callback ) = @_;
740
741	6	100				20	if ( $position !~ /\S+/ ) {
742	1					28	croak 'No usable field name or position';
743							}
744
745	5	100				12	if ( ref $callback ne 'CODE' ) {
746	1					29	croak 'Callback not code reference';
747							}
748
749	4					8	$self->{'field_computes'}{ $position } = $callback;
750							}
751
752	128		100			526	return $self->{'field_computes'} \|\| {};
753							}
754
755							# ----------------------------------------------------------------
756							sub field_filter {
757
758							=pod
759
760							=head2 field_filter
761
762							$p->field_filter( sub { $_ = shift; uc(lc($_)) } );
763
764							A callback which is applied to each field. The callback will be
765							passed the current value of the field. Whatever is passed back will
766							become the new value of the field. The above example capitalizes
767							field values. To unset the filter, pass in the empty string.
768
769							=cut
770
771	89			89	1	1248	my ( $self, $filter ) = @_;
772
773	89	100				128	if ( defined $filter ) {
774	6	100				11	if ( $filter eq $EMPTY_STR ) {
		100
775	1					5	$self->{'field_filter'} = $EMPTY_STR; # allows nullification
776							}
777							elsif ( ref $filter eq 'CODE' ) {
778	4					21	$self->{'field_filter'} = $filter;
779							}
780							else {
781	1					14	croak q[Argument to field_filter doesn't look like code];
782							}
783							}
784
785	88		66			282	return $self->{'field_filter'} \|\| $EMPTY_STR;
786							}
787
788							# ----------------------------------------------------------------
789							sub field_list {
790
791							=pod
792
793							=head2 field_list
794
795							$p->bind_fields( qw[ foo bar baz ] );
796							my @fields = $p->field_list;
797							print join ', ', @fields; # prints "foo, bar, baz"
798
799							Returns the fields bound via C (or C).
800
801							=cut
802
803	65			65	1	514	my $self = shift;
804
805	65	100				115	if ( !$self->{'fields_bound'} ) {
806	12					28	$self->bind_header;
807							}
808
809	63	50				123	if ( ref $self->{'field_pos_ordered'} eq 'ARRAY' ) {
810	63					37	return @{ $self->{'field_pos_ordered'} };
	63					198
811							}
812							else {
813	0					0	croak 'No fields. Call "bind_fields" or "bind_header" first.';
814							}
815							}
816
817							# ----------------------------------------------------------------
818							sub field_positions {
819
820							=pod
821
822							=head2 field_positions
823
824							my %positions = $p->field_positions;
825
826							Returns a hash of the fields and their positions bound via
827							C (or C). Mostly for internal use.
828
829							=cut
830
831	6			6	1	24	my $self = shift;
832
833	6	100				17	if ( ref $self->{'field_pos'} eq 'HASH' ) {
834	3					4	return %{ $self->{'field_pos'} };
	3					15
835							}
836							else {
837	3					9	return;
838							}
839							}
840
841							# ----------------------------------------------------------------
842							sub field_separator {
843
844							=pod
845
846							=head2 field_separator
847
848							$p->field_separator("\t"); # splits fields on tabs
849							$p->field_separator('::'); # splits fields on double colons
850							$p->field_separator(qr/\s+/); # splits fields on whitespace
851							my $sep = $p->field_separator; # returns the current separator
852
853							Gets and sets the token to use as the field delimiter. Regular
854							expressions can be specified using qr//. If not specified, it will
855							take a guess based on the filename extension ("comma" for ".txt,"
856							".dat," or ".csv"; "tab" for ".tab"). The default is a comma.
857
858							=cut
859
860	104			104	1	1600	my $self = shift;
861
862	104	100				148	if ( @_ ) {
863	18					32	$self->{'field_separator'} = shift;
864							}
865
866	104	100				179	if ( !$self->{'field_separator'} ) {
867	36					23	my $guess;
868	36	100				53	if ( my $filename = $self->filename ) {
869	22	100				105	if ( $filename =~ /\.(csv\|txt\|dat)$/ ) {
		50
870	21					22	$guess = q{,};
871							}
872							elsif ( $filename =~ /\.tab$/ ) {
873	0					0	$guess = qq{\t};
874							}
875							}
876
877	36	100				132	if ( $guess ) {
878	21					33	$self->{'field_separator'} = $guess;
879							}
880							}
881
882	104		66			223	return $self->{'field_separator'} \|\| $COMMA;
883							}
884
885							# ----------------------------------------------------------------
886							sub filename {
887
888							=pod
889
890							=head2 filename
891
892							$p->filename('/path/to/file.dat');
893
894							Gets or sets the complete path to the file to be read. If a file is
895							already opened, then the handle on it will be closed and a new one
896							opened on the new file.
897
898							=cut
899
900	160			160	1	2373	my $self = shift;
901
902	160	100				215	if ( my $filename = shift ) {
903	42	100	66			988	if ( -d $filename ) {
		100
904	1					16	croak "Cannot use directory '$filename' as input source";
905							}
906							elsif ( -f _ and -r _ ) {
907	40	100				117	if ( my $fh = $self->fh ) {
908	2	100				11	if ( !close($fh) ) {
909	1					13	croak "Can't close previously opened filehandle: $!\n";
910							}
911
912	1					2	$self->{'fh'} = undef;
913	1					2	$self->bind_fields([]);
914							}
915
916	39					92	$self->{'filename'} = $filename;
917							}
918							else {
919	1					14	croak
920							"Cannot use '$filename' as input source: ",
921							'file does not exist or is not readable.'
922							;
923							}
924							}
925
926	157		100			493	return $self->{'filename'} \|\| $EMPTY_STR;
927							}
928
929							# ----------------------------------------------------------------
930							sub get_field_aliases {
931
932							=pod
933
934							=head2 get_field_aliases
935
936							my @aliases = $p->get_field_aliases('name');
937
938							Allows you to define alternate names for fields, e.g., sometimes your
939							input file calls city "town" or "township," sometimes a file uses "Moniker"
940							instead of "name."
941
942							=cut
943
944	167			167	1	132	my $self = shift;
945	167	50				201	my $field_name = shift or return;
946
947	167	100				207	if ( !$self->{'field_alias'} ) {
948	160					273	return;
949							}
950
951	7	100				4	return @{ $self->{'field_alias'}{ $field_name } \|\| [] };
	7					25
952							}
953
954							# ----------------------------------------------------------------
955							sub header_filter {
956
957							=pod
958
959							=head2 header_filter
960
961							$p->header_filter( sub { $_ = shift; s/\s+/_/g; lc $_ } );
962
963							A callback applied to column header names. The callback will be
964							passed the current value of the header. Whatever is returned will
965							become the new value of the header. The above example collapses
966							spaces into a single underscore and lowercases the letters. To unset
967							a filter, pass in the empty string.
968
969							=cut
970
971	43			43	1	1343	my ( $self, $filter ) = @_;
972
973	43	100				75	if ( defined $filter ) {
974	6	100				14	if ( $filter eq $EMPTY_STR ) {
		100
975	1					5	$self->{'header_filter'} = $EMPTY_STR; # allows nullification
976							}
977							elsif ( ref $filter eq 'CODE' ) {
978	4					27	$self->{'header_filter'} = $filter;
979
980	4	100				12	if ( my %field_pos = $self->field_positions ) {
981	2					1	my @new_order;
982	2					6	while ( my ( $field, $order ) = each %field_pos ) {
983	6					47	my $xform = $filter->( $field );
984	6					22	$new_order[ $order ] = $xform;
985							}
986
987	2					3	$self->bind_fields( @new_order );
988							}
989							}
990							else{
991	1					22	croak q[Argument to field_filter doesn't look like code];
992							}
993							}
994
995	42		66			143	return $self->{'header_filter'} \|\| $EMPTY_STR;
996							}
997
998							# ----------------------------------------------------------------
999							sub record_separator {
1000
1001							=pod
1002
1003							=head2 record_separator
1004
1005							$p->record_separator("\n//\n");
1006							$p->field_separator("\n");
1007
1008							Gets and sets the token to use as the record separator. The default is
1009							a newline ("\n").
1010
1011							The above example would read a file that looks like this:
1012
1013							field1
1014							field2
1015							field3
1016							//
1017							data1
1018							data2
1019							data3
1020							//
1021
1022							=cut
1023
1024	101			101	1	610	my $self = shift;
1025
1026	101	100				175	if ( @_ ) {
1027	7					11	$self->{'record_separator'} = shift;
1028							}
1029
1030	101		66			288	return $self->{'record_separator'} \|\| $NEW_LINE;
1031							}
1032
1033							# ----------------------------------------------------------------
1034							sub set_field_alias {
1035
1036							=pod
1037
1038							=head2 set_field_alias
1039
1040							$p->set_field_alias({
1041							name => 'Moniker,handle', # comma-separated string
1042							city => [ qw( town township ) ], # or anonymous arrayref
1043							});
1044
1045							Allows you to define alternate names for fields, e.g., sometimes your
1046							input file calls city "town" or "township," sometimes a file uses "Moniker"
1047							instead of "name."
1048
1049							=cut
1050
1051	1			1	1	25	my $self = shift;
1052	1	50				4	my %args = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;
	1					3
1053	1					7	my %is_field = map { $_, 1 } $self->field_list;
	6					7
1054
1055							ARG:
1056	1					5	while ( my ( $field_name, $aliases ) = each %args ) {
1057	2	100				3	if ( ref $aliases ne 'ARRAY' ) {
1058	1					3	$aliases = [ split(/,/, $aliases) ];
1059							}
1060
1061	2	100				4	if ( !$is_field{ $field_name } ) {
1062	1					2	push @$aliases, $field_name;
1063	1					1	( $field_name ) = grep { $is_field{ $_ } } @$aliases;
	3					4
1064	1	50				2	next ARG unless $field_name;
1065							}
1066
1067	4					10	$self->{'field_alias'}{ $field_name } = [
1068	2					10	grep { $_ ne $field_name } uniq( @$aliases )
1069							];
1070							}
1071
1072	1					3	return 1;
1073							}
1074
1075							# ----------------------------------------------------------------
1076							sub trim {
1077
1078							=pod
1079
1080							=head2 trim
1081
1082							my $trim_value = $p->trim(1);
1083
1084							Provide "true" argument to remove leading and trailing whitespace from
1085							fields. Use a "false" argument to disable.
1086
1087							=cut
1088
1089	89			89	1	309	my ( $self, $arg ) = @_;
1090
1091	89	100				170	if ( defined $arg ) {
1092	6	100				13	$self->{'trim'} = $arg ? 1 : 0;
1093							}
1094
1095	89					145	return $self->{'trim'};
1096							}
1097
1098							1;
1099
1100							# ----------------------------------------------------------------
1101							# I must Create a System, or be enslav'd by another Man's;
1102							# I will not Reason and Compare; my business is to Create.
1103							# -- William Blake, "Jerusalem"
1104							# ----------------------------------------------------------------
1105
1106							=pod
1107
1108							=head1 AUTHOR
1109
1110							Ken Youens-Clark Ekclark@cpan.orgE
1111
1112							=head1 SOURCE
1113
1114							http://github.com/kyclark/text-recordparser
1115
1116							=head1 CREDITS
1117
1118							Thanks to the following:
1119
1120							=over 4
1121
1122							=item * Benjamin Tilly
1123
1124							For Text::xSV, the inspirado for this module
1125
1126							=item * Tim Bunce et al.
1127
1128							For DBI, from which many of the methods were shamelessly stolen
1129
1130							=item * Tom Aldcroft
1131
1132							For contributing code to make it easy to parse whitespace-delimited data
1133
1134							=item * Liya Ren
1135
1136							For catching the column-ordering error when parsing with "no-headers"
1137
1138							=item * Sharon Wei
1139
1140							For catching bug in C that sets up infinite loops
1141
1142							=item * Lars Thegler
1143
1144							For bug report on missing "script_files" arg in Build.PL
1145
1146							=back
1147
1148							=head1 BUGS
1149
1150							None known. Please use http://rt.cpan.org/ for reporting bugs.
1151
1152							=head1 LICENSE AND COPYRIGHT
1153
1154							Copyright (C) 2006-10 Ken Youens-Clark. All rights reserved.
1155
1156							This program is free software; you can redistribute it and/or modify
1157							it under the terms of the GNU General Public License as published by
1158							the Free Software Foundation; version 2.
1159
1160							This program is distributed in the hope that it will be useful, but
1161							WITHOUT ANY WARRANTY; without even the implied warranty of
1162							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1163							General Public License for more details.
1164
1165							=cut