File Coverage

blib/lib/ETL/Pipeline/Input.pm
Criterion Covered Total %
statement 11 11 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 15 15 100.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input - Role for ETL::Pipeline input sources
6              
7             =head1 SYNOPSIS
8              
9             use Moose;
10             with 'ETL::Pipeline::Input';
11              
12             sub next_record {
13             # Add code to read your data here
14             ...
15             }
16              
17             =head1 DESCRIPTION
18              
19             L<ETL::Pipeline> reads data from an input source, transforms it, and writes
20             the information to an output destination. This role defines the required
21             methods and attributes for input sources. Every input source B<must> implement
22             B<ETL::Pipeline::Input>.
23              
24             L<ETL::Pipeline> works by calling the methods defined in this role. The role
25             presents a common interface. It works as a shim, tying file parsing modules
26             with L<ETL::Pipeline>. For example, CSV files are parsed with the L<Text::CSV>
27             module. L<ETL::Pipeline::Input::DelimitedText> wraps around L<Text::CSV>.
28             L<ETL::Pipeline::Input::DelimitedText> implements this role by calling
29             L<Text::CSV>.
30              
31             =head2 Adding a new input source
32              
33             Out of the box, L<ETL::Pipeline> provides input sources for Microsoft Excel and
34             CSV (comma-separated values) files. To add your own formats...
35              
36             =over
37              
38             =item 1. Create a Perl module. Name it C<ETL::Pipeline::Input::...>.
39              
40             =item 2. Make it a Moose object: C<use Moose;>.
41              
42             =item 3. Include the role: C<with 'ETL::Pipeline::Input';>.
43              
44             =item 4. Add the L</next_record> method: C<sub next_record { ... }>.
45              
46             =item 5. Add the L</configure> method: C<sub configure { ... }>.
47              
48             =item 6. Add the L</finish> method: C<sub finish { ... }>.
49              
50             =back
51              
52             Ta-da! Your input source is ready to use:
53              
54             $etl->input( 'YourNewSource' );
55              
56             =head2 Does B<ETL::Pipeline::Input> only work with files?
57              
58             No. B<ETL::Pipeline::Input> works for any source of data, such as SQL queries,
59             CSV files, or network sockets. Write a L</next_record> method using whatever
60             method suits your needs.
61              
62             This documentation refers to files because that is what I use the most. Don't
63             let that fool you! B<ETL::Pipeline::Input> was designed to work seamlessly with
64             files and non-files alike.
65              
66             =cut
67              
68             package ETL::Pipeline::Input;
69 7     7   5708 use Moose::Role;
  7         9  
  7         49  
70              
71 7     7   14783 use 5.014000;
  7         16  
72 7     7   31 use String::Util qw/trim/;
  7         8  
  7         2127  
73              
74              
75             our $VERSION = '2.00';
76              
77              
78             =head1 METHODS & ATTRIBUTES
79              
80             =head3 pipeline
81              
82             B<pipeline> returns the L<ETL::Pipeline> object using this input source. You
83             can access information about the pipeline inside the methods.
84              
85             L<ETL::Pipeline/input> automatically sets this attribute.
86              
87             =cut
88              
89             has 'pipeline' => (
90             is => 'ro',
91             isa => 'ETL::Pipeline',
92             required => 1,
93             );
94              
95              
96             =head2 Arguments for L<ETL::Pipeline/input>
97              
98             =head3 debug
99              
100             While we expect perfect data, things go wrong. B<debug> lets
101             L<ETL::Pipeline/process> peek into the raw data one record at a time. I use
102             this when tracking down random problems in the middle of a 3,000-row
103             spreadsheet.
104              
105             L<ETL::Pipeline/process> executes this code reference for every record.
106             L<ETL::Pipeline/process> ignores the return value.
107              
108             The code reference receives the current L<ETL::Pipeline> as its first parameter
109             and in C<$_>.
110              
111             $etl->input( 'UnitTest', debug => sub { print $_->get( 'A' ) } );
112              
113             =cut
114              
115             has 'debug' => (
116             is => 'rw',
117             isa => 'Maybe[CodeRef]',
118             );
119              
120              
121             =head3 filter
122              
123             B<filter> does extra processing on the file data. The default filter trims
124             leading and trailing whitespace. You can use your own filter to handle special
125             values like "N/A" or "NULL".
126              
127             Assign a code reference to B<filter>. Unlike the other code references,
128             B<filter> does not have access to the L<ETL::Pipeline> object. The filter
129             receives two array references as parameters. The first array holds the values
130             for filtering. The second array holds the arguments passed to L</get>.
131              
132             The filter returns a list of filtered values. The results should be in the
133             same order as the values found in the input.
134              
135             $etl->input( 'UnitTest', filter => sub {
136             my ($values, $arguments) = @_;
137             map { $_ eq 'NA' ? '' : $_ } @$values;
138             } );
139              
140             =cut
141              
142             has 'filter' => (
143             default => sub { sub {
144             my ($values, $arguments) = @_;
145             return map { trim( $_ ) } @$values;
146             } },
147             is => 'rw',
148             isa => 'CodeRef',
149             );
150              
151             around 'get' => sub {
152             my ($original, $self, @arguments) = @_;
153              
154             my @values = $original->( $self, @arguments );
155             return $self->filter->( \@values, \@arguments );
156             };
157              
158              
159             =head3 skip_if
160              
161             B<skip_if> accepts a code reference. L<ETL::Pipeline/process> executes this
162             code for every input record. If this code returns I<true>,
163             L<ETL::Pipeline/process> discards the record with no further processing.
164              
165             Use B<skip_if> to bypass bad data.
166              
167             The code reference receives the current L<ETL::Pipeline> as its first parameter
168             and in C<$_>.
169              
170             I<Note:> B<skip_if> only works on data records. It is not applied to column
171             headers.
172              
173             $etl->input( 'UnitTest', skip_if => sub { $_->get( 'A' ) eq 'DELETED' } );
174              
175             =cut
176              
177             has 'skip_if' => (
178             is => 'rw',
179             isa => 'Maybe[CodeRef]',
180             );
181              
182              
183             =head3 stop_if
184              
185             Normally, L<ETL::Pipeline> goes until the end of the file. This code reference
186             stops processing early. If the code reference returns I<true>, L<ETL::Pipeline>
187             shuts down, just as if it reached the end of the file.
188              
189             I use this with report formats that have grand totals at the end. The totals
190             aren't real data.
191              
192             The code reference receives the current L<ETL::Pipeline> as its first parameter
193             and in C<$_>.
194              
195             $etl->input( 'UnitTest', stop_if => sub { $_->get( 'A' ) eq 'Totals' } );
196              
197             =cut
198              
199             has 'stop_if' => (
200             is => 'rw',
201             isa => 'Maybe[CodeRef]',
202             );
203              
204              
205             =head2 Called from L<ETL::Pipeline/process>
206              
207             =head3 next_record
208              
209             B<next_record> reads the next single record from the input source.
210             L<ETL::Pipeline/process> calls this method inside of a loop. B<next_record>
211             returns a boolean flag. A I<true> value means success getting the record. A
212             I<false> value indicates the end of the input - no more records.
213              
214             The implementing class must define this method.
215              
216             while ($input->next_record) {
217             ...
218             }
219              
220             =cut
221              
222             requires 'next_record';
223              
224              
225             =head3 get
226              
227             B<get> returns a list of values from matching fields from the current record.
228             B<ETL::Pipeline::Input> does not define how L</next_record> stores its data
229             internally. You should use the format that best suits your needs. For example,
230             L<ETL::Pipeline::Input::Excel> uses a L<Spreadsheet::XLSX> object. Its B<get>
231             accesses object methods to retrieve fields.
232              
233             L<ETL::Pipeline/process> passes in the value from L<ETL::Pipeline/mapping>.
234             That can be a scalar value (string), regular expression, or array reference.
235             B<get> returns a list of values from matching fields. L<ETL::Pipeline/process>
236             passes that list directly to L<ETL::Pipeline::Output/set>.
237              
238             B<Note:> B<ETL::Pipeline::Input> automatically passes the return values through
239             L</filter>. You should not call L</filter> from inside of the B<get> method.
240              
241             The implementing class must define this method.
242              
243             # Retrieve one field named 'A'.
244             $etl->get( 'A' );
245            
246             # Retrieve the field from the column 'ID Num'.
247             $etl->get( qr/id\s*num/i );
248            
249             # A list is used to build composite field names.
250             $etl->get( '/root', '/first' );
251              
252             B<NOTE:> B<get> returns a list - not an individual value. Even if only one
253             field matches, B<get> still returns a list. Calling it in scalar context
254             returns the number of elements in the list - not a value. Keep this in mind
255             when calling B<get> from L</stop_if> or L</skip_if>.
256              
257             =cut
258              
259             requires 'get';
260              
261              
262             =head3 configure
263              
264             B<configure> prepares the input source. It can open files, make database
265             connections, or anything else required before reading the first record.
266              
267             Why not do this in the class constructor? Some roles add automatic
268             configuration. Those roles use the usual Moose method modifiers, which would
269             not work with the constructor.
270              
271             This B<configure> - for the input source - is called I<before> the
272             L<ETL::Pipeline::Output/configure> of the output destination. This method
273             should not rely on the configuration of the output destination.
274              
275             The implementing class must define this method.
276              
277             $input->configure;
278              
279             =cut
280              
281             requires 'configure';
282              
283              
284             =head3 finish
285              
286             B<finish> shuts down the input source. It can close files, disconnect
287             from the database, or anything else required to cleanly terminate the input.
288              
289             Why not do this in the class destructor? Some roles add automatic functionality
290             via Moose method modifiers. This would not work with a destructor.
291              
292             This B<finish> - for the input source - is called I<after> the
293             L<ETL::Pipeline::Output/finish> of the output destination. This method should
294             not rely on the configuration of the output destination.
295              
296             The implementing class must define this method.
297              
298             $input->finish;
299              
300             =cut
301              
302             requires 'finish';
303              
304              
305             =head2 Other Methods & Attributes
306              
307             =head3 record_number
308              
309             The B<record_number> attribute tells you how many total records have been read
310             by L</next_record>. The count includes headers and L</skip_if> records.
311              
312             The first record is always B<1>.
313              
314             B<ETL::Pipeline::Input> automatically increments the counter after
315             L</next_record>. The L</next_record> method should not change B<record_number>.
316              
317             =head3 decrement_record_number
318              
319             This method decreases L</record_number> by one. It can be used to I<back out>
320             header records from the count.
321              
322             $input->decrement_record_number;
323              
324             =head3 increment_record_number
325              
326             This method increases L</record_number> by one.
327              
328             $input->increment_record_number;
329              
330             =cut
331              
332             has 'record_number' => (
333             default => '0',
334             handles => {
335             decrement_record_number => 'dec',
336             increment_record_number => 'inc',
337             },
338             is => 'ro',
339             isa => 'Int',
340             traits => [qw/Counter/],
341             );
342              
343             around 'next_record' => sub {
344             my $original = shift;
345             my $self = shift;
346              
347             my $result = $self->$original( @_ );
348             $self->increment_record_number if $result;
349             return $result;
350             };
351              
352              
353             =head1 SEE ALSO
354              
355             L<ETL::Pipeline>, L<ETL::Pipeline::Output>
356              
357             =head1 AUTHOR
358              
359             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
360              
361             =head1 LICENSE
362              
363             Copyright 2016 (c) Vanderbilt University Medical Center
364              
365             This program is free software; you can redistribute it and/or modify it under
366             the same terms as Perl itself.
367              
368             =cut
369              
370 7     7   33 no Moose;
  7         12  
  7         40  
371              
372             # Required by Perl to load the module.
373             1;