File Coverage

blib/lib/ETL/Pipeline/Input.pm

Criterion	Covered	Total	%
statement	11	11	100.0
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	15	15	100.0

line	stmt	sub	time	code
1				=pod
2
3				=head1 NAME
4
5				ETL::Pipeline::Input - Role for ETL::Pipeline input sources
6
7				=head1 SYNOPSIS
8
9				use Moose;
10				with 'ETL::Pipeline::Input';
11
12				sub next_record {
13				# Add code to read your data here
14				...
15				}
16
17				=head1 DESCRIPTION
18
19				L<ETL::Pipeline> reads data from an input source, transforms it, and writes
20				the information to an output destination. This role defines the required
21				methods and attributes for input sources. Every input source B<must> implement
22				B<ETL::Pipeline::Input>.
23
24				L<ETL::Pipeline> works by calling the methods defined in this role. The role
25				presents a common interface. It works as a shim, tying file parsing modules
26				with L<ETL::Pipeline>. For example, CSV files are parsed with the L<Text::CSV>
27				module. L<ETL::Pipeline::Input::DelimitedText> wraps around L<Text::CSV>.
28				L<ETL::Pipeline::Input::DelimitedText> implements this role by calling
29				L<Text::CSV>.
30
31				=head2 Adding a new input source
32
33				Out of the box, L<ETL::Pipeline> provides input sources for Microsoft Excel and
34				CSV (comma separated values) files. To add your own formats...
35
36				=over
37
38				=item 1. Create a Perl module. Name it C<ETL::Pipeline::Input::...>.
39
40				=item 2. Make it a Moose object: C<use Moose;>.
41
42				=item 3. Include the role: C<with 'ETL::Pipeline::Input';>.
43
44				=item 4. Add the L</next_record> and L</get> methods: C<sub next_record { ... }>, C<sub get { ... }>.
45
46				=item 5. Add the L</configure> method: C<sub configure { ... }>.
47
48				=item 6. Add the L</finish> method: C<sub finish { ... }>.
49
50				=back
51
52				Ta-da! Your input source is ready to use:
53
54				$etl->input( 'YourNewSource' );
55
56				=head2 Does B<ETL::Pipeline::Input> only work with files?
57
58				No. B<ETL::Pipeline::Input> works for any source of data, such as SQL queries,
59				CSV files, or network sockets. Write a L</next_record> method using whatever
60				method suits your needs.
61
62				This documentation refers to files because that is what I use the most. Don't
63				let that fool you! B<ETL::Pipeline::Input> was designed to work seamlessly with
64				files and non-files alike.
65
66				=cut
67
68				package ETL::Pipeline::Input;
69	7	7	5708	use Moose::Role;
	7		9
	7		49
70
71	7	7	14783	use 5.014000;
	7		16
72	7	7	31	use String::Util qw/trim/;
	7		8
	7		2127
73
74
75				our $VERSION = '2.00';
76
77
78				=head1 METHODS & ATTRIBUTES
79
80				=head3 pipeline
81
82				B<pipeline> returns the L<ETL::Pipeline> object using this input source. You
83				can access information about the pipeline inside the methods.
84
85				L<ETL::Pipeline/input> automatically sets this attribute.
86
87				=cut
88
89				has 'pipeline' => (
90				is => 'ro',
91				isa => 'ETL::Pipeline',
92				required => 1,
93				);
94
95
96				=head2 Arguments for L<ETL::Pipeline/input>
97
98				=head3 debug
99
100				While we expect perfect data, things go wrong. B<debug> lets
101				L<ETL::Pipeline/process> peek into the raw data one record at a time. I use
102				this when tracking down random problems in the middle of a 3,000 row
103				spreadsheet.
104
105				L<ETL::Pipeline/process> executes this code reference for every record.
106				L<ETL::Pipeline/process> ignores the return value.
107
108				The code reference receives the current L<ETL::Pipeline> as its first parameter
109				and in C<$_>.
110
111				$etl->input( 'UnitTest', debug => sub { print $_->get( 'A' ) } );
112
113				=cut
114
115				has 'debug' => (
116				is => 'rw',
117				isa => 'Maybe[CodeRef]',
118				);
119
120
121				=head3 filter
122
123				B<filter> does extra processing on the file data. The default filter trims
124				leading and trailing whitespace. You can use your own filter to handle special
125				values like "N/A" or "NULL".
126
127				Assign a code reference to B<filter>. Unlike the other code references,
128				B<filter> does not have access to the L<ETL::Pipeline> object. The filter
129				receives two array references as parameters. The first array holds the values
130				for filtering. The second array holds the arguments passed to L</get>.
131
132				The filter returns a list of filtered values. The results should be in the
133				same order as the values found in the input.
134
135				$etl->input( 'UnitTest', filter => sub {
136				my ($values, $arguments) = @_;
137				map { $_ eq 'NA' ? '' : $_ } @$values;
138				} );
139
140				=cut
141
142				has 'filter' => (
143				default => sub { sub {
144				my ($values, $arguments) = @_;
145				return map { trim( $_ ) } @$values;
146				} },
147				is => 'rw',
148				isa => 'CodeRef',
149				);
150
151				around 'get' => sub {
152				my ($original, $self, @arguments) = @_;
153
154				my @values = $original->( $self, @arguments );
155				return $self->filter->( \@values, \@arguments );
156				};
157
158
159				=head3 skip_if
160
161				B<skip_if> accepts a code reference. L<ETL::Pipeline/process> executes this
162				code for every input record. If this code returns I<true>,
163				L<ETL::Pipeline/process> discards the record with no further processing.
164
165				Use B<skip_if> to bypass bad data.
166
167				The code reference receives the current L<ETL::Pipeline> as its first parameter
168				and in C<$_>.
169
170				I<Note:> B<skip_if> only works on data records. It is not applied to column
171				headers.
172
173				$etl->input( 'UnitTest', skip_if => sub { $_->get( 'A' ) eq 'DELETED' } );
174
175				=cut
176
177				has 'skip_if' => (
178				is => 'rw',
179				isa => 'Maybe[CodeRef]',
180				);
181
182
183				=head3 stop_if
184
185				Normally, L<ETL::Pipeline> goes until the end of the file. This code reference
186				stops processing early. If the code reference returns I<true>, L<ETL::Pipeline>
187				shuts down, just as if it reached the end of the file.
188
189				I use this with report formats that have grand totals at the end. The totals
190				aren't real data.
191
192				The code reference receives the current L<ETL::Pipeline> as its first parameter
193				and in C<$_>.
194
195				$etl->input( 'UnitTest', stop_if => sub { $_->get( 'A' ) eq 'Totals' } );
196
197				=cut
198
199				has 'stop_if' => (
200				is => 'rw',
201				isa => 'Maybe[CodeRef]',
202				);
203
204
205				=head2 Called from L<ETL::Pipeline/process>
206
207				=head3 next_record
208
209				B<next_record> reads the next single record from the input source.
210				L<ETL::Pipeline/process> calls this method inside of a loop. B<next_record>
211				returns a boolean flag. A I<true> value means success getting the record. A
212				I<false> value indicates the end of the input - no more records.
213
214				The implementing class must define this method.
215
216				while ($input->next_record) {
217				...
218				}
219
220				=cut
221
222				requires 'next_record';
223
224
225				=head3 get
226
227				B<get> returns a list of values from matching fields from the current record.
228				B<ETL::Pipeline::Input> does not define how L</next_record> stores its data
229				internally. You should use the format that best suits your needs. For example,
230				L<ETL::Pipeline::Input::Excel> uses a L<Spreadsheet::XLSX> object. Its B<get>
231				accesses object methods to retrieve fields.
232
233				L<ETL::Pipeline/process> passes in the value from L<ETL::Pipeline/mapping>.
234				That can be a scalar value (string), regular expression, or array reference.
235				B<get> returns a list of values from matching fields. L<ETL::Pipeline/process>
236				passes that list directly to L<ETL::Pipeline::Output/set>.
237
238				B<Note:> B<ETL::Pipeline::Input> automatically passes the return values through
239				L</filter>. You should not call L</filter> from inside of the B<get> method.
240
241				The implementing class must define this method.
242
243				# Retrieve one field named 'A'.
244				$etl->get( 'A' );
245
246				# Retrieve the field from the column 'ID Num'.
247				$etl->get( qr/id\s*num/i );
248
249				# A list is used to build composite field names.
250				$etl->get( '/root', '/first' );
251
252				B<NOTE:> B<get> returns a list - not an individual value. Even if only one
253				field matches, B<get> still returns a list. Calling it in scalar context
254				returns the number of elements in the list - not a value. Keep this in mind
255				when calling B<get> from L</stop_if> or L</skip_if>.
256
257				=cut
258
259				requires 'get';
260
261
262				=head3 configure
263
264				B<configure> prepares the input source. It can open files, make database
265				connections, or anything else required before reading the first record.
266
267				Why not do this in the class constructor? Some roles add automatic
268				configuration. Those roles use the usual Moose method modifiers, which would
269				not work with the constructor.
270
271				This B<configure> - for the input source - is called I<before> the
272				L<ETL::Pipeline::Output/configure> of the output destination. This method
273				should not rely on the configuration of the output destination.
274
275				The implementing class must define this method.
276
277				$input->configure;
278
279				=cut
280
281				requires 'configure';
282
283
284				=head3 finish
285
286				B<finish> shuts down the input source. It can close files, disconnect
287				from the database, or anything else required to cleanly terminate the input.
288
289				Why not do this in the class destructor? Some roles add automatic functionality
290				via Moose method modifiers. This would not work with a destructor.
291
292				This B<finish> - for the input source - is called I<after> the
293				L<ETL::Pipeline::Output/finish> of the output destination. This method should
294				not rely on the configuration of the output destination.
295
296				The implementing class must define this method.
297
298				$input->finish;
299
300				=cut
301
302				requires 'finish';
303
304
305				=head2 Other Methods & Attributes
306
307				=head3 record_number
308
309				The B<record_number> attribute tells you how many total records have been read
310				by L</next_record>. The count includes headers and L</skip_if> records.
311
312				The first record is always B<1>.
313
314				B<ETL::Pipeline::Input> automatically increments the counter after
315				L</next_record>. The L</next_record> method should not change B<record_number>.
316
317				=head3 decrement_record_number
318
319				This method decreases L</record_number> by one. It can be used to I<back out>
320				header records from the count.
321
322				$input->decrement_record_number;
323
324				=head3 increment_record_number
325
326				This method increases L</record_number> by one.
327
328				$input->increment_record_number;
329
330				=cut
331
332				has 'record_number' => (
333				default => '0',
334				handles => {
335				decrement_record_number => 'dec',
336				increment_record_number => 'inc',
337				},
338				is => 'ro',
339				isa => 'Int',
340				traits => [qw/Counter/],
341				);
342
343				around 'next_record' => sub {
344				my $original = shift;
345				my $self = shift;
346
347				my $result = $self->$original( @_ );
348				$self->increment_record_number if $result;
349				return $result;
350				};
351
352
353				=head1 SEE ALSO
354
355				L<ETL::Pipeline>, L<ETL::Pipeline::Output>
356
357				=head1 AUTHOR
358
359				Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
360
361				=head1 LICENSE
362
363				Copyright 2016 (c) Vanderbilt University Medical Center
364
365				This program is free software; you can redistribute it and/or modify it under
366				the same terms as Perl itself.
367
368				=cut
369
370	7	7	33	no Moose;
	7		12
	7		40
371
372				# Required by Perl to load the module.
373				1;