File Coverage

blib/lib/ETL/Pipeline/Input/XmlFiles.pm
Criterion Covered Total %
statement 21 23 91.3
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 29 31 93.5


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::XmlFiles - Records in individual XML files
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['XmlFiles', from => 'Documents'],
12             mapping => {Name => '/Root/Name', Address => '/Root/Address'},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::XmlFiles> defines an input source that reads multiple
19             XML files from a directory. Each XML file contains exactly one record. Fields
20             are accessed with the full XML path.
21              
22             =cut
23              
24             package ETL::Pipeline::Input::XmlFiles;
25 1     1   4 use Moose;
  1         1  
  1         7  
26              
27 1     1   4951 use 5.014000;
  1         3  
28 1     1   3 use warnings;
  1         1  
  1         20  
29              
30 1     1   4 use Carp;
  1         1  
  1         68  
31 1     1   4 use MooseX::Types::Path::Class qw/Dir File/;
  1         1  
  1         14  
32 1     1   1021 use Path::Class qw//;
  1         1  
  1         13  
33 1     1   3 use Path::Class::Rule;
  1         2  
  1         24  
34 1     1   449 use XML::XPath;
  0            
  0            
35              
36              
37             our $VERSION = '2.00';
38              
39              
40             =head1 METHODS & ATTRIBUTES
41              
42             =head2 Arguments for L<ETL::Pipeline/input>
43              
44             =head3 from
45              
46             B<from> tells B<ETL::Pipeline::Input::XmlFiles> where to find the data files.
47             By default, B<ETL::Pipeline::Input::XmlFiles> looks in
48             L<ETL::Pipeline/data_in>. B<from> tells the code to look in another place.
49              
50             If B<from> is a regular expression, the code finds the first directory whose
51             name matches. If B<from> is a relative path, it is expected to reside under
52             L<ETL::Pipeline/data_in>. An absolute path is exact.
53              
54             =cut
55              
# Backing storage for the source directory. The slot is "bare" and has no
# init_arg, so the constructor never fills it directly; the only generated
# accessors are the private reader and writer named here.
has 'from' => (
    is       => 'bare',
    isa      => Dir,
    init_arg => undef,
    writer   => '_set_from',
    reader   => '_get_from',
);
63              
64              
# Combined getter/setter for the source directory.
#
# Called without an argument, it simply returns the stored directory. Called
# with one argument, it stores a new directory first and then returns it:
#   * A compiled regular expression (qr//) selects the first directory that
#     a Path::Class::Rule search (iname test, max depth 1) returns when
#     started at the pipeline's data_in. Croaks when the search is empty.
#   * Anything else is turned into a Path::Class directory. A relative path
#     is made absolute against the pipeline's data_in.
sub from {
    my ($self, @args) = @_;

    # Plain read access - nothing to store.
    return $self->_get_from unless @args;

    my $target = $args[0];

    # Path (string or object): resolve relative locations, then store.
    if (ref( $target ) ne 'Regexp') {
        my $dir = Path::Class::dir( $target );
        $dir = $dir->absolute( $self->pipeline->data_in ) if $dir->is_relative;
        $self->_set_from( $dir );
        return $self->_get_from;
    }

    # Regular expression: take the first directory the search produces.
    my $rule     = Path::Class::Rule->new->iname( $target )->max_depth( 1 )->directory;
    my $next_dir = $rule->iter( $self->pipeline->data_in );
    my $found    = $next_dir->();
    croak 'No matching directories' unless defined $found;

    $self->_set_from( $found );
    return $self->_get_from;
}
89              
90              
91             =head3 ...
92              
93             B<ETL::Pipeline::Input::XmlFiles> accepts any of the tests provided by
94             L<Path::Iterator::Rule>. The value of the argument is passed directly into the
95             test. For boolean tests (e.g. readable, exists, etc.), pass an C<undef> value.
96              
97             B<ETL::Pipeline::Input::XmlFiles> automatically applies the C<file> and
98             C<iname> filters. Do not pass C<file> through L<ETL::Pipeline/input>. You may
99             pass in C<name> or C<iname> to override the default filter of B<*.xml>.
100              
101             =cut
102              
# Moose construction hook: resolves the source directory and prepares the
# file iterator.
#
# $arguments is the hash reference of constructor arguments:
#   * "from" is handed to the "from" method; '.' is used when it is undefined.
#   * "file" and every key that names an attribute of this class are skipped.
#   * Each remaining key is called as a method on a Path::Class::Rule object,
#     with the key's value as the only argument.
#   * Unless "name" or "iname" was passed, the search is limited to '*.xml'.
#     The "file" test is always applied.
#
# Croaks when a criterion name is not a plain identifier, or when calling the
# criterion on the rule object dies (for example, an unknown test name).
sub BUILD {
    my ($self, $arguments) = @_;

    # Set the top level directory.
    $self->from( $arguments->{from} // '.' );

    # Configure the file search.
    my @criteria = grep {
        $_ ne 'file' && !$self->meta->has_attribute( $_ )
    } keys %$arguments;

    my $search = Path::Class::Rule->new;
    foreach my $name (@criteria) {
        # The name used to be interpolated into a string eval, which would
        # compile arbitrary text from the argument list as Perl code. Accept
        # identifiers only and dispatch with a dynamic method call instead.
        croak "Invalid file search criterion '$name'"
            unless $name =~ m/\A[A-Za-z_][A-Za-z0-9_]*\z/;

        my $value   = $arguments->{$name};
        my $applied = eval { $search->$name( $value ); 1 };
        croak( $@ || "Unable to apply the file search criterion '$name'" )
            unless $applied;
    }

    $search->iname( '*.xml' )
        unless exists( $arguments->{name} ) || exists( $arguments->{iname} );
    $search->file;
    $self->_set_iterator( $search->iter( $self->from ) );
}
128              
129              
130             =head2 Called from L<ETL::Pipeline/process>
131              
132             =head3 get
133              
134             B<get> returns a list of values from matching nodes. The field name is an
135             I<XPath>. See L<http://www.w3schools.com/xpath/xpath_functions.asp> for more
136             information on XPaths.
137              
138             XML lends itself to recursive records. What happens when you need two fields
139             under the same subnode? For example, a I<person involved> can have both a
140             I<name> and a I<role>. The names and roles go together. How do you B<get> them
141             together?
142              
143             B<get> supports subnodes as additional parameters. Pass the top node as the
144             first parameter. Pass the subnode names in subsequent parameters. The values
145             are returned in the same order as the parameters. B<get> returns C<undef> for
146             any nonexistent subnodes.
147              
148             Here are some examples...
149              
150             # Return a single value from a single field.
151             $etl->get( '/Root/Name' );
152             'John Doe'
153            
154             # Return a list from multiple fields with the same name.
155             $etl->get( '/Root/PersonInvolved/Name' );
156             ('John Doe', 'Jane Doe')
157            
158             # Return a list from subnodes.
159             $etl->get( '/Root/PersonInvolved', 'Name' );
160             ('John Doe', 'Jane Doe')
161            
162             # Return a list of related fields from subnodes.
163             $etl->get( '/Root/PersonInvolved', 'Name', 'Role' );
164             (['John Doe', 'Husband'], ['Jane Doe', 'Wife'])
165              
166             In the L<ETL::Pipeline/mapping>, those examples look like this...
167              
168             {Name => '/Root/Name'}
169             {Name => '/Root/PersonInvolved/Name'}
170             {Name => ['/Root/PersonInvolved', 'Name']}
171             {Name => ['/Root/PersonInvolved', 'Name', 'Role']}
172              
173             =cut
174              
# Evaluates the XPath in $top against the current document and returns the
# matching values.
#
#   * When the result is not an XML::XPath::NodeSet, its "value" is returned.
#   * With no subnodes, returns the string value of every matching node.
#   * With one subnode, returns one string per matching node: the string
#     value of that subnode path evaluated relative to the node.
#   * With several subnodes, returns one array reference per matching node,
#     holding the subnode string values in the order they were requested.
sub get {
    my ($self, $top, @subnodes) = @_;
    my $engine = $self->xpath;
    my $found  = $engine->find( $top );

    # Non-node-set results carry a single value.
    return $found->value unless $found->isa( 'XML::XPath::NodeSet' );

    my @nodes = $found->get_nodelist;

    if (!@subnodes) {
        return map { $_->string_value } @nodes;
    }

    if (@subnodes == 1) {
        my $only = $subnodes[0];
        return map { $engine->find( $only, $_ )->string_value } @nodes;
    }

    # Several subnodes: keep the values of each node grouped together.
    return map {
        my $node = $_;
        [ map { $engine->find( $_, $node )->string_value } @subnodes ];
    } @nodes;
}
204              
205              
206             =head3 next_record
207              
208             This method parses the next file in the folder.
209              
210             B<ETL::Pipeline::Input::XmlFiles> builds a list of file names when it first
211             starts. B<next_record> iterates over this in-memory list. It will not parse
212             any new files saved into the folder.
213              
214             =cut
215              
# Advances to the next file produced by the iterator.
#
# Returns 0 when the iterator yields nothing more. Otherwise the file is
# remembered in "file", an XML::XPath object created for it is stored in
# "xpath", and 1 is returned. Croaks if XML::XPath->new returns undef.
sub next_record {
    my $self = shift;

    my $file = $self->_next_file;
    return 0 unless defined $file;

    $self->_set_file( $file );

    # Stringify the file object before handing it to the parser.
    my $document = XML::XPath->new( filename => "$file" );
    croak "Unable to parse the XML in '$file'" unless defined $document;
    $self->_set_xpath( $document );

    return 1;
}
230              
231              
232             =head3 configure
233              
234             B<configure> doesn't actually do anything. But it is required by
235             L<ETL::Pipeline/process>.
236              
237             =cut
238              
# Intentionally empty: there is nothing to set up before the first record.
sub configure {
    return;
}
240              
241              
242             =head3 finish
243              
244             B<finish> doesn't actually do anything. But it is required by
245             L<ETL::Pipeline/process>.
246              
247             =cut
248              
# Intentionally empty: there is nothing to clean up after the last record.
sub finish {
    return;
}
250              
251              
252             =head2 Other Methods & Attributes
253              
254             =head3 exists
255              
256             The B<exists> method tells you whether the given path exists or not. It returns
257             a boolean value. B<True> means that the given node exists in this XML file.
258             B<False> means that it does not.
259              
260             B<exists> accepts an XPath string as the only parameter. You can learn more
261             about XPath here: L<http://www.w3schools.com/xpath/xpath_functions.asp>.
262              
263             =cut
264              
# Reports whether the given XPath matches at least one node in the current
# document. Returns 1 when "findnodes" produces any node, 0 otherwise.
sub exists {
    my $self    = shift;
    my ($query) = @_;

    # Count the nodes without keeping them; findnodes runs in list context.
    my $hits = () = $self->xpath->findnodes( $query );
    return $hits ? 1 : 0;
}
271              
272              
273             =head3 file
274              
275             The B<file> attribute holds a L<Path::Class::File> object for the current XML
276             file. You can use it for accessing the file name or directory.
277              
278             B<file> is automatically set by L</next_record>.
279              
280             =cut
281              
# The file currently being processed. Read-only from the outside and not
# settable through the constructor; next_record updates it with _set_file.
has 'file' => (
    is       => 'ro',
    isa      => File,
    writer   => '_set_file',
    init_arg => undef,
);
288              
289              
290             =head3 iterator
291              
292             L<Path::Class::Rule> creates an iterator that returns each file in turn.
293             B<iterator> holds it for L</next_record>.
294              
295             =cut
296              
# Code reference that hands out the files one at a time. BUILD stores it with
# _set_iterator; _next_file is delegated to the Code trait's "execute".
has 'iterator' => (
    is      => 'ro',
    isa     => 'CodeRef',
    traits  => ['Code'],
    handles => { _next_file => 'execute' },
    writer  => '_set_iterator',
);
304              
305              
306             =head3 xpath
307              
308             The B<xpath> attribute holds the current L<XML::XPath> object. It is
309             automatically set by the L</next_record> method.
310              
311             =cut
312              
# XML::XPath object for the current file. Read-only from the outside and not
# settable through the constructor; next_record replaces it with _set_xpath.
has 'xpath' => (
    is       => 'ro',
    isa      => 'XML::XPath',
    writer   => '_set_xpath',
    init_arg => undef,
);
319              
320              
321             =head1 SEE ALSO
322              
323             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::XML>,
324             L<Path::Class::File>, L<Path::Class::Rule>, L<Path::Iterator::Rule>,
325             L<XML::XPath>
326              
327             =cut
328              
# Compose the input source role after the methods above have been defined.
with( 'ETL::Pipeline::Input' );
330              
331              
332             =head1 AUTHOR
333              
334             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
335              
336             =head1 LICENSE
337              
338             Copyright 2016 (c) Vanderbilt University Medical Center
339              
340             This program is free software; you can redistribute it and/or modify it under
341             the same terms as Perl itself.
342              
343             =cut
344              
# Remove the Moose keywords from this package, then make the class immutable.
# The make_immutable call is also the last statement of the file.
no Moose;
__PACKAGE__->meta->make_immutable();