File Coverage

blib/lib/ETL/Pipeline/Input/DelimitedText.pm
Criterion     Covered   Total       %
statement          17      17   100.0
branch            n/a
condition         n/a
subroutine          6       6   100.0
pod               n/a
total              23      23   100.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::DelimitedText - Input source for CSV, tab, or pipe
6             delimited files
7              
8             =head1 SYNOPSIS
9              
10             use ETL::Pipeline;
11             ETL::Pipeline->new( {
12             input => ['DelimitedText', iname => qr/\.csv$/i],
13             mapping => {First => 'Header1', Second => 'Header2'},
14             output => ['UnitTest']
15             } )->process;
16              
17             =head1 DESCRIPTION
18              
19             B<ETL::Pipeline::Input::DelimitedText> defines an input source for reading
20             CSV (comma separated values), tab delimited, or pipe delimited files. It
21             uses L<Text::CSV> for parsing.
22              
23             B<ETL::Pipeline::Input::DelimitedText> expects a standard CSV file. Hand-built
24             exporters often forget quote marks, use invalid characters, or fail to escape
25             embedded quotes. If you have trouble with a file, experiment with the options
26             to L<Text::CSV>.
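
               For example, here is a minimal sketch that loosens the parser for a sloppy
               export. All of these are standard L<Text::CSV> attributes; pick the ones
               that match your file.

               # A sketch, not a recipe - tolerate binary data and sloppy quoting.
               $etl->input( 'DelimitedText',
               iname => qr/\.csv$/i,
               binary => 1,
               allow_loose_quotes => 1,
               allow_loose_escapes => 1
               );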
27              
28             =cut
29              
30             package ETL::Pipeline::Input::DelimitedText;
31 2     2   14 use Moose;
  2         3  
  2         20  
32              
33 2     2   13799 use 5.014000;
  2         8  
34 2     2   12 use warnings;
  2         4  
  2         109  
35              
36 2     2   13 use Carp;
  2         5  
  2         178  
37 2     2   1736 use Text::CSV;
  2         26755  
  2         1009  
38              
39              
40             our $VERSION = '3.00';
41              
42              
43             =head1 METHODS & ATTRIBUTES
44              
45             =head2 Arguments for L<ETL::Pipeline/input>
46              
47             B<ETL::Pipeline::Input::DelimitedText> consumes the
48             L<ETL::Pipeline::Input::File> and L<ETL::Pipeline::Input::File::Table> roles.
49             It supports all of the attributes from both roles.
50              
51             In addition, B<ETL::Pipeline::Input::DelimitedText> accepts any of the options
52             for L<Text::CSV>. See L<Text::CSV> for the complete list.
53              
54             # Pipe delimited, allowing embedded new lines.
55             $etl->input( 'DelimitedText',
56             iname => qr/\.dat$/i,
57             sep_char => '|',
58             binary => 1
59             );
60              
61             =cut
62              
63             sub BUILD {
64             my $self = shift;
65             my $arguments = shift;
66              
67             my %options;
68             foreach my $key (Text::CSV::known_attributes) {
69             $options{$key} = $arguments->{$key} if exists $arguments->{$key};
70             }
71             $self->_csv_options( \%options );
72             }
73              
74              
75             =head3 skipping
76              
77             Optional. If you pass a code reference as B<skipping>, this input source calls
78             it with one line of plain text. The text is B<not> parsed into fields. I assume
79             that you're skipping report headers, not formatted data.
80              
81             If you pass an integer, the input source completely skips over that many lines.
82             It reads and discards the lines without parsing.
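
               Here is a short sketch of both forms. The regular expression is only an
               example - match whatever marks the end of your report header.

               # Skip a fixed, three line report header.
               $etl->input( 'DelimitedText', iname => qr/\.csv$/i, skipping => 3 );

               # Keep skipping until the real column headings appear.
               $etl->input( 'DelimitedText',
               iname => qr/\.csv$/i,
               skipping => sub { $_[0] !~ m/^Header1,/ }
               );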
83              
84             =head2 Methods
85              
86             =head3 run
87              
88             This is the main loop. It opens the file, reads records, and closes it when
89             done. This is the place to look if there are problems.
90              
91             L<ETL::Pipeline> automatically calls this method.
92              
93             =cut
94              
95             sub run {
96             my ($self, $etl) = @_;
97              
98             my $csv = Text::CSV->new( $self->_csv_options );
99             my $path = $self->path;
100              
101             # Open the file.
102             my $handle = $path->openr();
103             croak "Cannot read '$path'" unless defined $handle;
104              
105             # Skip over report headers. These are not data. They are extra rows put
106             # there for report formats. The data starts after these rows.
107             my $line;
108             my $skip = $self->skipping;
109             if (ref( $skip ) eq 'CODE') {
110             while (!$handle->eof) {
111             $line = $handle->getline;
112             last if !$skip->( $line );
113             }
114             } elsif ($skip > 0) {
115             $handle->getline foreach (1 .. $skip);
116             }
117              
118             # Load field names.
119             unless ($self->no_column_names) {
120             my $fields;
121             if (defined $line) {
122             $fields = $csv->parse( $line ) ? [$csv->fields()] : undef;
123             $line = undef;
124             } else {
125             $fields = $csv->getline( $handle );
126             }
127              
128             if (defined $fields) {
129             my @names;
130             while (my ($index, $value) = each @$fields) {
131             push @names, {$value => $index};
132             }
133             $etl->aliases( @names );
134             }
135             }
136              
137             # Read and process each line.
138             while (!$csv->eof) {
139             my $fields;
140             if (defined $line) {
141             $fields = $csv->parse( $line ) ? [$csv->fields()] : undef;
142             $line = undef;
143             } else {
144             $fields = $csv->getline( $handle );
145             }
146              
147             if (defined $fields) {
148             $etl->record( $fields );
149             } elsif (!$csv->eof) {
150             my $at = $csv->record_number;
151             my ($code, $message, $position) = $csv->error_diag;
152             croak "CSV file '$path', error $code: $message at character $position (record $at)";
153             }
154             }
155              
156             # Close the file when done.
157             close $handle;
158             }
159              
160              
161             #-------------------------------------------------------------------------------
162             # Internal methods and attributes
163              
164             # Text::CSV options passed into the object constructor. I either needed to
165             # store the options or a Text::CSV object. I chose to store the options. The
166             # "run" method uses them to create a Text::CSV object.
167             has '_csv_options' => (
168             is => 'rw',
169             isa => 'HashRef[Any]',
170             );
171              
172              
173             =head1 SEE ALSO
174              
175             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>,
176             L<ETL::Pipeline::Input::File::Table>, L<Text::CSV>
177              
178             =cut
179              
180             with 'ETL::Pipeline::Input';
181             with 'ETL::Pipeline::Input::File';
182             with 'ETL::Pipeline::Input::File::Table';
183              
184              
185             =head1 AUTHOR
186              
187             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
188              
189             =head1 LICENSE
190              
191             Copyright 2021 (c) Vanderbilt University Medical Center
192              
193             This program is free software; you can redistribute it and/or modify it under
194             the same terms as Perl itself.
195              
196             =cut
197              
198 2     2   24 no Moose;
  2         6  
  2         31  
199             __PACKAGE__->meta->make_immutable;