File Coverage

blib/lib/ETL/Pipeline/Input/File.pm
Criterion Covered Total %
statement 36 38 94.7
branch 7 10 70.0
condition 1 3 33.3
subroutine 9 9 100.0
pod n/a
total 53 60 88.3


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::File - Role for file based input sources
6              
7             =head1 SYNOPSIS
8              
9             # In the input source...
10             use Moose;
11             with 'ETL::Pipeline::Input::File';
12             ...
13              
14             # In the ETL::Pipeline script...
15             ETL::Pipeline->new( {
16             work_in => {search => 'C:\Data', find => qr/Ficticious/},
17             input => ['Excel', matching => qr/\.xlsx?$/ ],
18             mapping => {Name => 'A', Address => 'B', ID => 'C' },
19             constants => {Type => 1, Information => 'Demographic' },
20             output => ['SQL', table => 'NewData' ],
21             } )->process;
22              
23             # Or with a specific file...
24             ETL::Pipeline->new( {
25             work_in => {search => 'C:\Data', find => qr/Ficticious/},
26             input => ['Excel', file => 'ExportedData.xlsx' ],
27             mapping => {Name => 'A', Address => 'B', ID => 'C' },
28             constants => {Type => 1, Information => 'Demographic' },
29             output => ['SQL', table => 'NewData' ],
30             } )->process;
31              
32             =head1 DESCRIPTION
33              
34             B<ETL::Pipeline::Input::File> provides methods and attributes common to
35             file based input sources. It makes file searches available for any file
36             format. With B<ETL::Pipeline::Input::File>, you can...
37              
38             =over
39              
40             =item Specify the exact path to the file.
41              
42             =item Or search the file system for a matching name.
43              
44             =back
45              
46             For setting an exact path, see the L</file> attribute. For searches, see the
47             L</matching> attribute.
48              
49             =head2 File vs. DataFile
50              
51             L<ETL::Pipeline::Input::DataFile> extends B<ETL::Pipeline::Input::File>.
52             This role, B<ETL::Pipeline::Input::File>, makes no assumptions about the file
53             format. It works with CSV text files, MS Access databases, spreadsheets, XML, or
54             any other format found on disk.
55              
56             L<ETL::Pipeline::Input::DataFile> assumes that each record is stored on one
57             row and that the data is divided into fields (columns).
58              
59             =cut
60              
61             package ETL::Pipeline::Input::File;
62 4     4   12637 use Moose::Role;
  4         7  
  4         37  
63              
64 4     4   16131 use 5.014000;
  4         10  
65 4     4   18 use Carp;
  4         6  
  4         323  
66 4     4   21 use MooseX::Types::Path::Class qw/Dir File/;
  4         4  
  4         60  
67 4     4   4373 use Path::Class::Rule;
  4         6  
  4         129  
68 4     4   17 use String::Util qw/hascontent/;
  4         6  
  4         1173  
69              
70              
71             our $VERSION = '2.00';
72              
73              
74             =head1 METHODS & ATTRIBUTES
75              
76             =head2 Arguments for L<ETL::Pipeline/input>
77              
78             =head3 matching
79              
80             B<matching> locates the first file that matches the given pattern. The
81             pattern can be a glob or regular expression. B<matching> sets L</file>
82             to the first file that matches. Search patterns are case insensitive.
83              
84             # Search using a regular expression...
85             $etl->input( 'Excel', matching => qr/\.xlsx$/i );
86            
87             # Search using a file glob...
88             $etl->input( 'Excel', matching => '*.xlsx' );
89              
90             For very weird cases, B<matching> also accepts a code reference.
91             B<matching> executes the subroutine against the file names. B<matching>
92             sets L</file> to the first file where the subroutine returns a true
93             value.
94              
95             B<matching> passes two parameters into the subroutine...
96              
97             =over
98              
99             =item The L<ETL::Pipeline> object
100              
101             =item The L<Path::Class::File> object
102              
103             =back
104              
105             # File larger than 2K...
106             $etl->input( 'Excel', matching => sub {
107             my ($etl, $file) = @_;
108             return (!$file->is_dir && $file->size > 2048 ? 1 : 0);
109             } );
110              
111             B<matching> searches inside the L<ETL::Pipeline/data_in> directory.
112              
113             =cut
114              
115             has 'matching' => (
116             is => 'ro',
117             isa => 'Maybe[CodeRef|RegexpRef|Str]',
118             );
119              
120              
121             =head3 file
122              
123             B<file> holds a L<Path::Class::File> object pointing to the input file.
124             If L<ETL::Pipeline/input> does not set B<file>, then the L</matching>
125             attribute searches the file system for a match. If
126             L<ETL::Pipeline/input> sets B<file>, then L</matching> is ignored.
127              
128             B<file> is relative to L<ETL::Pipeline/data_in>, unless you set it to an
129             absolute path name. With L</matching>, the search is always limited to
130             L<ETL::Pipeline/data_in>.
131              
132             # File inside of "data_in"...
133             $etl->input( 'Excel', file => 'Data.xlsx' );
134            
135             # Absolute path name...
136             $etl->input( 'Excel', file => 'C:\Data.xlsx' );
137              
138             =cut
139              
140             has 'file' => (
141             builder => '_build_file',
142             coerce => 1,
143             is => 'ro',
144             isa => File,
145             lazy => 1,
146             trigger => \&_trigger_file,
147             writer => '_set_file',
148             );
149              
150              
151             sub _build_file {
152 9     9   10 my $self = shift;
153              
154 9         82 my $rule = Path::Class::Rule->new;
155 9         251 my $pattern = $self->matching;
156 9         218 my $pipeline = $self->pipeline;
157              
158 9 100       25 if (ref( $pattern ) eq 'CODE') {
159 1         3 my $search = $rule->iter( $pipeline->data_in );
160 1         123 while (my $file = $search->()) {
161 2 100       787 return $file if $pipeline->execute_code_ref( $pattern, $file );
162             }
163 0         0 croak 'No file matched for "input"';
164 0         0 return undef;
165             } else {
166 8         28 $rule->file;
167 8 50       257 $rule->iname( $pattern ) if defined $pattern;
168 8         1372 my $search = $rule->iter( $pipeline->data_in );
169              
170 8         1601 my $file = $search->();
171 8 50       10468 croak 'No file matched for "input"' unless defined $file;
172 8         404 return $file;
173             }
174             }
175              
176              
177             sub _trigger_file {
178 2     2   24 my ($self, $old, $new) = @_;
179 2 50 33     48 $self->_set_file( $new->absolute( $self->pipeline->data_in ) )
180             if defined( $new ) && $new->is_relative;
181             }
182              
183              
184             =head1 SEE ALSO
185              
186             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::TabularFile>
187              
188             =head1 AUTHOR
189              
190             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
191              
192             =head1 LICENSE
193              
194             Copyright 2016 (c) Vanderbilt University Medical Center
195              
196             This program is free software; you can redistribute it and/or modify it under
197             the same terms as Perl itself.
198              
199             =cut
200              
201 4     4   20 no Moose;
  4         4  
  4         25  
202              
203             # Required by Perl to load the module.
204             1;