File Coverage

blib/lib/ETL/Pipeline/Input/File.pm
Criterion Covered Total %
statement 17 17 100.0
branch n/a
condition n/a
subroutine 7 7 100.0
pod 0 1 0.0
total 24 25 96.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::File - Role for file based input sources
6              
7             =head1 SYNOPSIS
8              
9             # In the input source...
10             use Moose;
11             with 'ETL::Pipeline::Input';
12             with 'ETL::Pipeline::Input::File';
13             ...
14              
15             # In the ETL::Pipeline script...
16             ETL::Pipeline->new( {
17             work_in => {root => 'C:\Data', iname => qr/Ficticious/},
18             input => ['Excel', iname => qr/\.xlsx?$/ ],
19             mapping => {Name => 'A', Address => 'B', ID => 'C' },
20             constants => {Type => 1, Information => 'Demographic' },
21             output => ['SQL', table => 'NewData' ],
22             } )->process;
23              
24             # Or with a specific file...
25             ETL::Pipeline->new( {
26             work_in => {root => 'C:\Data', iname => qr/Ficticious/},
27             input => ['Excel', iname => 'ExportedData.xlsx' ],
28             mapping => {Name => 'A', Address => 'B', ID => 'C' },
29             constants => {Type => 1, Information => 'Demographic' },
30             output => ['SQL', table => 'NewData' ],
31             } )->process;
32              
33             =head1 DESCRIPTION
34              
35             This role adds functionality and attributes common to all file based input
36             sources. It is a quick and easy way to create new sources with the ability
37             to search directories. This is useful when the file name changes.
38              
39             B<ETL::Pipeline::Input::File> works with a single source file. To process an
40             entire directory of files, use L<ETL::Pipeline::Input::File::List> instead.
41              
42             =cut
43              
44             package ETL::Pipeline::Input::File;
45              
46 4     4   3122 use 5.014000;
  4         17  
47              
48 4     4   23 use Carp;
  4         8  
  4         296  
49 4     4   20 use Moose::Role;
  4         8  
  4         38  
50 4     4   19991 use MooseX::Types::Path::Class qw/File/;
  4         9  
  4         85  
51 4     4   4948 use Path::Class::Rule;
  4         10  
  4         1950  
52              
53              
54             our $VERSION = '3.00';
55              
56              
57             =head1 METHODS & ATTRIBUTES
58              
59             =head2 Arguments for L<ETL::Pipeline/input>
60              
61             B<ETL::Pipeline::Input::File> accepts any of the tests provided by
62             L<Path::Iterator::Rule>. The value of the argument is passed directly into the
63             test. For boolean tests (e.g. readable, exists, etc.), pass an C<undef> value.
64              
65             B<ETL::Pipeline::Input::File> automatically applies the C<file> filter. Do not
66             pass C<file> through L<ETL::Pipeline/input>.
67              
68             C<iname> is the most common one that I use. It matches the file name, supports
69             wildcards and regular expressions, and is case insensitive.
70              
71             # Search using a regular expression...
72             $etl->input( 'Excel', iname => qr/\.xlsx$/ );
73              
74             # Search using a file glob...
75             $etl->input( 'Excel', iname => '*.xlsx' );
76              
77             The code throws an error if no files match the criteria. Only the first match
78             is used. If you want to match more than one file, use
79             L<ETL::Pipeline::Input::File::List> instead.
80              
81             =cut
82              
83             # A role cannot count on the consuming class defining BUILD, so this empty
84             # BUILD exists only as a hook point. A BUILD in the consuming class overrides
85             # this one. Either way, the "after 'BUILD'" method modifier below still runs
86             # and extracts the search criteria from the constructor arguments.
87             # https://www.perlmonks.org/?node_id=837369
88       9 0   sub BUILD {}
89              
90             after 'BUILD' => sub {
91             my ($self, $arguments) = @_;
92             # Save every argument that names a Path::Class::Rule test, except "file".
93             # "keys" (unlike "each") never resumes from a stale hash iterator position.
94             foreach my $name (keys %$arguments) {
95             $self->_add_criteria( $name, $arguments->{$name} )
96             if $name ne 'file' && Path::Class::Rule->can( $name );
97             }
98             };
99              
100              
101             # Execute the actual search AFTER everything is set in stone. This lets a script
102             # create the input source before it calls "work_in".
103             before 'run' => sub {
104             my ($self, $etl) = @_;
105             # An explicit "path" wins; otherwise search "data_in" with the saved criteria.
106             if (defined $self->path) {
107             $self->_set_path( $self->path->absolute( $etl->data_in ) )
108             if $self->path->is_relative;
109             } else {
110             # Build the search rule from the criteria passed to the constructor.
111             my $rule = Path::Class::Rule->new->file;
112             foreach my $pair ($self->_search_criteria) {
113             my ($name, $value) = @$pair;
114             # The BUILD modifier only stores names that Path::Class::Rule->can, so call
115             # the test directly. A block eval traps errors without compiling a string.
116             eval { $rule = $rule->$name( $value ) };
117             croak $@ unless $@ eq '';
118             }
119             my @matches = $rule->all( $etl->data_in );
120              
121             # Find the first file that matches all of the criteria.
122             if (scalar( @matches ) < 1) {
123             croak 'No files matched the search criteria';
124             } elsif (!-r $matches[0]) {
125             croak "You do not have permission to read '$matches[0]'";
126             } else {
127             $self->_set_path( $matches[0] );
128             $self->source( $matches[0]->relative( $etl->work_in )->stringify );
129             $etl->status( 'INFO', 'File name' );
130             }
131             }
132             };
133              
134              
135             =head3 path
136              
137             Optional. When passed to L<ETL::Pipeline/input>, this file becomes the input
138             source. No search or matching is performed. If you specify a relative path, it
139             is relative to L<ETL::Pipeline/data_in>.
140              
141             Once the file search has run, this attribute holds the file that matched the
142             search criteria. Your input source class should use it as the file name.
143              
144             # File inside of "data_in"...
145             $etl->input( 'Excel', path => 'Data.xlsx' );
146              
147             # Absolute path name...
148             $etl->input( 'Excel', path => 'C:\Data.xlsx' );
149              
150             # Inside the input source class...
151             open my $io, '<', $self->path;
152              
153             =cut
154              
155             has 'path' => (         # the input file: given explicitly or found by the search
156             coerce => 1,            # coerce the supplied value into the "File" type
157             is => 'ro',
158             isa => File,            # type imported from MooseX::Types::Path::Class
159             writer => '_set_path',  # private setter; the "before 'run'" modifier stores the resolved file
160             );
161              
162              
163             =head3 skipping
164              
165             Optional. B<skipping> jumps over a certain number of rows/lines at the beginning
166             of the file. Report formats often contain extra headers - even before the column
167             names. B<skipping> ignores those and starts processing at the data.
168              
169             B<Note:> B<skipping> is applied I<before> reading column names.
170              
171             B<skipping> accepts either an integer or code reference. An integer represents
172             the number of rows/records to ignore. For a code reference, the code discards
173             records until the subroutine returns a I<true> value.
174              
175             # Bypass the first three rows.
176             $etl->input( 'Excel', skipping => 3 );
177              
178             # Bypass until we find something in column 'C'.
179             $etl->input( 'Excel', skipping => sub { hascontent( $_->get( 'C' ) ) } );
180              
181             The exact nature of the I<record> depends on the input file. For example,
182             Excel files will send a data row as a hash. But a CSV file would send a single
183             line of plain text with no parsing. See the input source to find out exactly
184             what it sends.
185              
186             If your input source implements B<skipping>, you can pass whatever parameters
187             you want. For consistency, I recommend passing the raw data. If you are jumping
188             over report headers, they may not be formatted.
189              
190             =cut
191              
192             has 'skipping' => (     # leading records to ignore; nothing else in this role reads it
193             default => 0,           # 0 = skip no records, going by the POD's integer meaning
194             is => 'ro',
195             isa => 'CodeRef|Int',   # per the POD: a record count, or a callback that ends the skipping
196             );
197              
198              
199             #-------------------------------------------------------------------------------
200             # Internal methods and attributes
201              
202             # Search criteria for the file list. I capture the criteria from the constructor
203             # but don't build the iterator until the loop kicks off. Since the search
204             # depends on "data_in", this allows the user to setup the pipeline in whatever
205             # order they want and it will do the right thing.
206             has '_criteria' => (    # test name => argument, filled in by the "after 'BUILD'" modifier
207             default => sub { {} },  # a fresh, empty hash for every object
208             handles => {_add_criteria => 'set', _search_criteria => 'kv'},  # kv = list of [name, value] pairs
209             is => 'ro',
210             isa => 'HashRef[Any]',
211             traits => [qw/Hash/],   # native Hash trait supplies the "set" and "kv" delegates
212             );
213              
214              
215             =head1 SEE ALSO
216              
217             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File::List>,
218             L<Path::Iterator::Rule>
219              
220             =head1 AUTHOR
221              
222             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
223              
224             =head1 LICENSE
225              
226             Copyright 2021 (c) Vanderbilt University Medical Center
227              
228             This program is free software; you can redistribute it and/or modify it under
229             the same terms as Perl itself.
230              
231             =cut
232              
233 4     4   30 no Moose::Role;  # the keywords came from Moose::Role, so it must be the one to remove them
  4         8  
  4         36  
234              
235             # Required by Perl to load the module.
236             1;