File Coverage

blib/lib/ETL/Pipeline/Input/Tabular.pm
Criterion Covered Total %
statement 24 28 85.7
branch 4 6 66.6
condition n/a
subroutine 7 8 87.5
pod 2 2 100.0
total 37 44 84.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::Tabular - Sequential input in rows and columns
6              
7             =head1 SYNOPSIS
8              
9             # In the input source...
10             use Moose;
11             with 'ETL::Pipeline::Input::Tabular';
12             ...
13              
14             =head1 DESCRIPTION
15              
16             B<ETL::Pipeline::Input::Tabular> provides a common interface where the data
17             is in a table or columns. Spreadsheets and CSV files are considered I<tabular>.
18              
19             While B<ETL::Pipeline::Input::Tabular> works with any sequential input source,
20             L<ETL::Pipeline::Input::File>s would be the most common.
21              
22             =cut
23              
24             package ETL::Pipeline::Input::Tabular;
25 3     3   2924 use Moose::Role;
  3         5  
  3         19  
26              
27 3     3   10738 use 5.014000;
  3         8  
28 3     3   1392 use List::AllUtils qw/indexes/;
  3         14086  
  3         215  
29 3     3   18 use String::Util qw/hascontent trim/;
  3         4  
  3         1665  
30              
31              
32             our $VERSION = '1.00';
33              
34              
35             =head1 METHODS & ATTRIBUTES
36              
37             =head2 Arguments for L<ETL::Pipeline/input>
38              
39             =head3 no_column_names
40              
41             By default, B<ETL::Pipeline::Input::Tabular> assumes that the first data row
42             has column names (headers) and not real data. If your data does not have column
43             names, set this boolean flag to B<true>.
44              
45             $etl->input( 'Excel', no_column_names => 1 );
46              
47             =cut
48              
49             has 'no_column_names' => (
50             default => 0,
51             is => 'ro',
52             isa => 'Bool',
53             );
54              
55              
56             =head3 skipping
57              
58             B<skipping> jumps over a certain number of records in the beginning of the
59             file. Report formats often contain extra headers - even before the column
60             names. B<skipping> ignores those and starts processing at the data.
61              
62             B<skipping> accepts either an integer or code reference. An integer represents
63             the number of rows/records to ignore. For a code reference, the code discards
64             records until the subroutine returns a I<true> value.
65              
66             # Bypass the first three rows.
67             $etl->input( 'Excel', skipping => 3 );
68            
69             # Bypass until we find something in column 'C'.
70             $etl->input( 'Excel', skipping => sub { hascontent( $_->get( 'C' ) ) } );
71              
72             =cut
73              
74             has 'skipping' => (
75             default => 0,
76             is => 'ro',
77             isa => 'CodeRef|Int',
78             );
79              
80              
81             # This block of code implements both "skipping" and "no_column_names".
82             after 'configure' => sub {
83             my $self = shift @_;
84              
85             # "skipping"
86             my $headers = $self->skipping;
87             if (ref( $headers ) eq 'CODE') {
88             do {
89             $self->next_record;
90             } until $self->pipeline->execute_code_ref( $headers );
91             $self->_cached( 1 );
92             } else { $self->next_record foreach (1 .. $headers); }
93              
94             # "no_column_names"
95             $self->get_column_names unless $self->no_column_names;
96             };
97              
98              
99             # This attribute indicates if the next record has been cached in memory. When
100             # processing variable length report headers, I can't tell they end until I read
101             # the next line. If the next line is where your data starts, then I can't just
102             # throw it away. This attribute tells the code to process the current record in
103             # memory instead of reading one from disk.
104             #
105             # The code automatically adjusts the record count down, so that we don't count
106             # this record twice.
107             has '_cached' => (
108             default => 0,
109             is => 'rw',
110             isa => 'Bool',
111             trigger => \&_trigger_cached,
112             );
113              
114              
115             around 'next_record' => sub {
116             my ($original, $self, @arguments) = @_;
117              
118             if ($self->_cached) {
119             $self->_cached( 0 );
120             return 1;
121             } else { return $original->( $self, @arguments ); }
122             };
123              
124              
125             sub _trigger_cached {
126 1     1   2 my ($self, $new, $old) = @_;
127 1 50       37 $self->decrement_record_number if $new;
128             }
129              
130              
131             =head2 Other Methods & Attributes
132              
133             =head3 get_column_names
134              
135             This method reads the column name row, parses it, and sets L</column_names>.
136             B<ETL::Pipeline::Input::TabularFile> knows nothing about the internal storage
137             of individual records. It relies on the implementing class for that ability.
138             That's where B<get_column_names> comes into play.
139              
140             B<get_column_names> should call L</add_column> for each column name.
141              
142             sub get_column_names {
143             my ($self) = @_;
144             $self->next_record;
145             # Loop through all of the fields...
146             $self->add_column( $value, $field );
147             }
148              
149             =cut
150              
151             requires 'get_column_names';
152              
153              
154             =head3 column_names
155              
156             B<column_names> holds a list of the column names as read from the file. The
157             list is kept in file order. Duplicate names are allowed. B<column_names> is
158             filled when L</get_column_names> calls the L</add_column> method.
159              
160             When L<ETL::Pipeline/mapping> calls L<ETL::Pipeline::Input/get>, this role
161             intercepts the call. The role translates column names or regular expressions
162             into actual field names. L<ETL::Pipeline::Input/get> returns a list of values
163             from all fields that match.
164              
165             =cut
166              
167             has 'column_names' => (
168             default => sub { [] },
169             handles => {
170             _add_column_name => 'push',
171             _clear_column_names => 'clear',
172             columns => 'elements',
173             _get_column_name => 'get',
174             number_of_columns => 'count',
175             },
176             init_arg => undef,
177             is => 'ro',
178             isa => 'ArrayRef[Str]',
179             traits => [qw/Array/],
180             );
181              
182              
183             # This private hash is used for non-numeric field names.
184             has '_column_mapping' => (
185             default => sub { {} },
186             handles => {
187             _clear_column_mapping => 'clear',
188             column_mapped => 'exists',
189             _get_field_names => 'get',
190             _set_column_mapping => 'set',
191             },
192             init_arg => undef,
193             is => 'ro',
194             isa => 'HashRef[ArrayRef[Any]]',
195             traits => [qw/Hash/],
196             );
197              
198              
199             around 'get' => sub {
200             my ($original, $self, $field, @arguments) = @_;
201            
202             # Find the first match based on order fields appear in the file.
203             my @matches;
204             if (ref( $field ) eq 'Regexp') {
205             @matches = indexes { m/$field/ } $self->columns;
206             } else {
207             @matches = indexes { $_ eq $field } $self->columns;
208             }
209              
210             # See if this column name maps to a field. If it doesn't, the index
211             # number is the real field name.
212             my @real_field;
213             foreach my $index (@matches) {
214             my $column = $self->_get_column_name( $index );
215             if ($self->column_mapped( $column )) {
216             push @real_field, @{$self->_get_field_names( $column )};
217             } else {
218             push @real_field, $index;
219             }
220             }
221              
222             # Call the real "get" method with the translated field name.
223             if (scalar( @real_field ) == 0) {
224             if (ref( $field ) eq 'Regexp') {
225             return ();
226             } else {
227             return $original->( $self, $field, @arguments );
228             }
229             } else {
230             return map { $original->( $self, $_, @arguments ) } @real_field;
231             }
232             };
233              
234              
235             =head3 add_column
236              
237             L</get_column_names> calls this method once for every column name.
238             B<add_column> puts the column name into L</column_names>.
239              
240             L</get_column_names> passes in the column name as the first parameter and the
241             field name as the second. The field name is optional.
242             L<ETL::Pipeline::Input/get> will use the L</column_names> index as the field
243             name by default.
244              
245             # Add column names for fields 0 and 1. No field name means that "get" uses
246             # the index numbers - 0 and 1.
247             $self->add_column( 'First' );
248             $self->add_column( 'Second' );
249            
250             # Add column names for fields 'A' and 'B'. Always pass the field name if
251             # it's a string.
252             $self->add_column( 'First', 'A' );
253             $self->add_column( 'Second', 'B' );
254              
255             B<Note:> B<add_column> trims leading and trailing whitespace from column names.
256              
257             =cut
258              
259             sub add_column {
260 32     32 1 261 my $self = shift;
261 32         66 my $name = trim( shift );
262            
263 32         1317 $self->_add_column_name( $name );
264              
265             # Always return the first field with a given name.
266 32 100       74 if (scalar( @_ ) > 0) {
267 26         29 my $field = shift;
268 26         848 my $mapping = $self->_get_field_names( $name );
269 26 50       40 if (defined $mapping) {
270 0         0 push @$mapping, $field;
271             } else {
272 26         833 $self->_set_column_mapping( $name, [$field] );
273             }
274             }
275             }
276              
277              
278             =head3 reset_column_names
279              
280             This method wipes out the existing column names. It can be used from
281             L</get_column_names>.
282              
283             $self->reset_column_names;
284              
285             =cut
286              
287             sub reset_column_names {
288 0     0 1   my ($self) = @_;
289 0           $self->_clear_column_mapping;
290 0           $self->_clear_column_names;
291             }
292              
293              
294             =head1 SEE ALSO
295              
296             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>
297              
298             =head1 AUTHOR
299              
300             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
301              
302             =head1 LICENSE
303              
304             Copyright 2016 (c) Vanderbilt University Medical Center
305              
306             This program is free software; you can redistribute it and/or modify it under
307             the same terms as Perl itself.
308              
309             =cut
310              
311 3     3   15 no Moose;
  3         3  
  3         21  
312              
313             # Required by Perl to load the module.
314             1;