line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=pod |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
ETL::Pipeline::Input::XmlFiles - Records in individual XML files |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use ETL::Pipeline; |
10
|
|
|
|
|
|
|
ETL::Pipeline->new( { |
11
|
|
|
|
|
|
|
input => ['XmlFiles', from => 'Documents'], |
12
|
|
|
|
|
|
|
mapping => {Name => '/Root/Name', Address => '/Root/Address'}, |
13
|
|
|
|
|
|
|
output => ['UnitTest'] |
14
|
|
|
|
|
|
|
} )->process; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::XmlFiles> defines an input source that reads multiple |
19
|
|
|
|
|
|
|
XML files from a directory. Each XML file contains exactly one record. Fields |
20
|
|
|
|
|
|
|
are accessed with the full XML path. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=cut |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package ETL::Pipeline::Input::XmlFiles; |
25
|
1
|
|
|
1
|
|
4
|
use Moose; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
7
|
|
26
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
4951
|
use 5.014000; |
|
1
|
|
|
|
|
3
|
|
28
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
20
|
|
29
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
|
4
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
68
|
|
31
|
1
|
|
|
1
|
|
4
|
use MooseX::Types::Path::Class qw/Dir File/; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
14
|
|
32
|
1
|
|
|
1
|
|
1021
|
use Path::Class qw//; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
13
|
|
33
|
1
|
|
|
1
|
|
3
|
use Path::Class::Rule; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
34
|
1
|
|
|
1
|
|
449
|
use XML::XPath; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
our $VERSION = '2.00'; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 METHODS & ATTRIBUTES |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 Arguments for L<ETL::Pipeline/input> |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head3 from |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
B<from> tells B<ETL::Pipeline::Input::XmlFiles> where to find the data files. |
47
|
|
|
|
|
|
|
By default, B<ETL::Pipeline::Input::XmlFiles> looks in |
48
|
|
|
|
|
|
|
L<ETL::Pipeline/data_in>. B<from> tells the code to look in another place. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
If B<from> is a regular expression, the code finds the first directory whose |
51
|
|
|
|
|
|
|
name matches. If B<from> is a relative path, it is expected to reside under |
52
|
|
|
|
|
|
|
L<ETL::Pipeline/data_in>. An absolute path is exact. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=cut |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Private storage for the directory that is searched for XML files. It cannot
# be supplied through the constructor (init_arg is undef) and "bare" creates no
# public accessor; the value is reached through the private reader and writer.
has 'from' => (
    isa      => Dir,
    is       => 'bare',
    init_arg => undef,
    writer   => '_set_from',
    reader   => '_get_from',
);
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Accessor for the directory that holds the XML files.
#
# With no arguments, returns the stored directory. With one argument, resolves
# that argument into a directory, stores it, and returns it:
#   * Regexp reference - the first directory directly underneath the
#     pipeline's "data_in" whose name matches (matched through the "iname"
#     rule). Croaks with 'No matching directories' when nothing matches.
#   * Anything else - treated as a path. A relative path is anchored at the
#     pipeline's "data_in"; an absolute path is used exactly as given.
sub from {
    my $self = shift;

    if (scalar( @_ ) > 0) {
        my $new = shift;
        if (ref( $new ) eq 'Regexp') {
            # Depth 0 is the starting directory itself. Pinning the search to
            # depth 1 on both ends limits it to the immediate children of
            # "data_in", so "data_in" is never returned merely because its own
            # name happens to match the pattern.
            my $next_match = Path::Class::Rule->new
                ->iname( $new )
                ->min_depth( 1 )
                ->max_depth( 1 )
                ->directory
                ->iter( $self->pipeline->data_in );

            # Only the first hit matters; the iterator yields undef when the
            # search comes up empty.
            my $match = $next_match->();
            croak 'No matching directories' unless defined $match;
            $self->_set_from( $match );
        } else {
            my $folder = Path::Class::dir( $new );
            $folder = $folder->absolute( $self->pipeline->data_in )
                if $folder->is_relative;
            $self->_set_from( $folder );
        }
    }

    return $self->_get_from;
}
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head3 ... |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::XmlFiles> accepts any of the tests provided by |
94
|
|
|
|
|
|
|
L<Path::Iterator::Rule>. The value of the argument is passed directly into the |
95
|
|
|
|
|
|
|
test. For boolean tests (e.g. readable, exists, etc.), pass an C<undef> value. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::XmlFiles> automatically applies the C<file> and |
98
|
|
|
|
|
|
|
C<iname> filters. Do not pass C<file> through L<ETL::Pipeline/input>. You may |
99
|
|
|
|
|
|
|
pass in C<name> or C<iname> to override the default filter of B<*.xml>. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=cut |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Finishes construction: resolves the source directory and builds the file
# iterator.
#
# Parameters:
#   $self      - the object under construction.
#   $arguments - hash reference of the arguments given to the constructor.
#
# Every argument that is neither "file" nor the name of an attribute of this
# class is treated as the name of a search rule and called as a method on the
# Path::Class::Rule object, with the argument's value as the only parameter.
# Croaks when a rule cannot be applied (for example, an unknown rule name).
sub BUILD {
    my $self = shift;
    my $arguments = shift;

    # Set the top level directory. Without an explicit "from", search "."
    # (which the "from" method resolves like any other relative path).
    $self->from( $arguments->{from} // '.' );

    # Configure the file search.
    my @criteria = grep {
        $_ ne 'file'
        && !$self->meta->has_attribute( $_ )
    } keys %$arguments;

    my $search = Path::Class::Rule->new;
    foreach my $name (@criteria) {
        my $value = $arguments->{$name};

        # Call the rule by name as a method. The name is never compiled as
        # Perl source, so a malformed or hostile argument name can only fail
        # the method lookup; it cannot run arbitrary code.
        eval { $search->$name( $value ); 1 }
            or croak( $@ || "Unable to apply the search rule '$name'" );
    }

    # Default to XML files unless the caller chose their own name filter.
    $search->iname( '*.xml' )
        unless exists( $arguments->{name} ) || exists( $arguments->{iname} );

    # Only ever return files, never directories.
    $search->file;
    $self->_set_iterator( $search->iter( $self->from ) );
}
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head2 Called from L<ETL::Pipeline/process> |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head3 get |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
B<get> returns a list of values from matching nodes. The field name is an |
135
|
|
|
|
|
|
|
I<XPath>. See L<http://www.w3schools.com/xpath/xpath_functions.asp> for more |
136
|
|
|
|
|
|
|
information on XPaths. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
XML lends itself to recursive records. What happens when you need two fields |
139
|
|
|
|
|
|
|
under the same subnode? For example, a I<person involved> can have both a |
140
|
|
|
|
|
|
|
I<name> and a I<role>. The names and roles go together. How do you B<get> them |
141
|
|
|
|
|
|
|
together? |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
B<get> supports subnodes as additional parameters. Pass the top node as the |
144
|
|
|
|
|
|
|
first parameter. Pass the subnode names in subsequent parameters. The values |
145
|
|
|
|
|
|
|
are returned in the same order as the parameters. B<get> returns C<undef> for |
146
|
|
|
|
|
|
|
any non-existent subnodes. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Here are some examples... |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# Return a single value from a single field. |
151
|
|
|
|
|
|
|
$etl->get( '/Root/Name' ); |
152
|
|
|
|
|
|
|
'John Doe' |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Return a list from multiple fields with the same name. |
155
|
|
|
|
|
|
|
$etl->get( '/Root/PersonInvolved/Name' ); |
156
|
|
|
|
|
|
|
('John Doe', 'Jane Doe') |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Return a list from subnodes. |
159
|
|
|
|
|
|
|
$etl->get( '/Root/PersonInvolved', 'Name' ); |
160
|
|
|
|
|
|
|
('John Doe', 'Jane Doe') |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Return a list of related fields from subnodes. |
163
|
|
|
|
|
|
|
$etl->get( '/Root/PersonInvolved', 'Name', 'Role' ); |
164
|
|
|
|
|
|
|
(['John Doe', 'Husband'], ['Jane Doe', 'Wife']) |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
In the L<ETL::Pipeline/mapping>, those examples look like this... |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
{Name => '/Root/Name'} |
169
|
|
|
|
|
|
|
{Name => '/Root/PersonInvolved/Name'} |
170
|
|
|
|
|
|
|
{Name => ['/Root/PersonInvolved', 'Name']} |
171
|
|
|
|
|
|
|
{Name => ['/Root/PersonInvolved', 'Name', 'Role']} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=cut |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# Returns the values found at an XPath in the current XML file.
#
# Parameters:
#   $top      - XPath evaluated against the whole document.
#   @subnodes - optional XPaths evaluated relative to each node matched by
#               $top.
#
# Returns:
#   * When $top does not evaluate to a node set, the "value" of the result.
#   * No @subnodes: the string value of every matching node.
#   * One subnode: for every matching node, the string value of that sub-path.
#   * Several subnodes: for every matching node, an array reference holding
#     the string value of each sub-path, in the order the paths were given.
sub get {
    my ($self, $top, @subnodes) = @_;
    my $xpath = $self->xpath;

    my $match = $xpath->find( $top );
    return $match->value unless $match->isa( 'XML::XPath::NodeSet' );

    my @nodes = $match->get_nodelist;

    # Evaluates one sub-path relative to one of the matched nodes.
    my $value_of = sub {
        my ($path, $node) = @_;
        my $data = $xpath->find( $path, $node );
        return $data->string_value;
    };

    my $requested = scalar( @subnodes );
    if ($requested == 0) {
        return map { $_->string_value } @nodes;
    }
    if ($requested == 1) {
        # A single sub-path yields a flat list, one value per matched node.
        my $only = $subnodes[0];
        return map { $value_of->( $only, $_ ) } @nodes;
    }

    # Several sub-paths: keep the values from the same node grouped together.
    return map {
        my $node = $_;
        [ map { $value_of->( $_, $node ) } @subnodes ];
    } @nodes;
}
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head3 next_record |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
This method parses the next file in the folder. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
B<ETL::Pipeline::Input::XmlFiles> builds a list of file names when it first |
211
|
|
|
|
|
|
|
starts. B<next_record> iterates over this in-memory list. It will not parse |
212
|
|
|
|
|
|
|
any new files saved into the folder. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=cut |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# Advances to the next XML file produced by the file iterator.
#
# Returns 1 after making that file the current record: its path is stored in
# the "file" attribute and a new XML::XPath object for it in the "xpath"
# attribute. Returns 0 when the iterator has no more files.
sub next_record {
    my ($self) = @_;

    # An undefined value from the iterator means there is nothing left.
    my $object = $self->_next_file;
    return 0 unless defined $object;

    $self->_set_file( $object );

    # The path object is stringified because XML::XPath is given a file name.
    # NOTE(review): XML::XPath appears to defer parsing until the first query,
    # so this check may never fire for malformed XML - confirm.
    my $parser = XML::XPath->new( filename => "$object" );
    croak "Unable to parse the XML in '$object'" unless defined $parser;
    $self->_set_xpath( $parser );

    return 1;
}
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=head3 configure |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
B<configure> doesn't actually do anything. But it is required by |
235
|
|
|
|
|
|
|
L<ETL::Pipeline/process>. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=cut |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# Intentionally a no-op: this input source needs no setup, but the method has
# to exist for the pipeline to call.
sub configure {
    return;
}
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=head3 finish |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
B<finish> doesn't actually do anything. But it is required by |
245
|
|
|
|
|
|
|
L<ETL::Pipeline/process>. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=cut |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# Intentionally a no-op: this input source has nothing to clean up, but the
# method has to exist for the pipeline to call.
sub finish {
    return;
}
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=head2 Other Methods & Attributes |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head3 exists |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
The B<exists> method tells you whether the given path exists or not. It returns |
257
|
|
|
|
|
|
|
a boolean value. B<True> means that the given node exists in this XML file. |
258
|
|
|
|
|
|
|
B<False> means that it does not. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
B<exists> accepts an XPath string as the only parameter. You can learn more |
261
|
|
|
|
|
|
|
about XPath here: L<http://www.w3schools.com/xpath/xpath_functions.asp>. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=cut |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Reports whether an XPath matches anything in the current XML file.
#
# Parameters:
#   $xpath_string - the XPath to look for.
#
# Returns 1 when at least one node matches and 0 when none do.
sub exists {
    my ($self, $xpath_string) = @_;

    # Assigning to an empty list keeps the search in list context and leaves
    # just the number of matching nodes.
    my $found = () = $self->xpath->findnodes( $xpath_string );
    return $found ? 1 : 0;
}
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head3 file |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
The B<file> attribute holds a L<Path::Class::File> object for the current XML |
276
|
|
|
|
|
|
|
file. You can use it for accessing the file name or directory. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
B<file> is automatically set by L</next_record>. |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=cut |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# The XML file currently being read. Read-only to callers and not settable
# through the constructor; it is changed through the private writer.
has 'file' => (
    isa      => File,
    is       => 'ro',
    init_arg => undef,
    writer   => '_set_file',
);
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=head3 iterator |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
L<Path::Class::Rule> creates an iterator that returns each file in turn. |
293
|
|
|
|
|
|
|
B<iterator> holds it for L</next_record>. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=cut |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Code reference that yields the files to read. The native "Code" trait
# delegates the private "_next_file" method to "execute", i.e. calling
# "_next_file" invokes the stored code reference.
has 'iterator' => (
    traits  => [qw/Code/],
    isa     => 'CodeRef',
    is      => 'ro',
    writer  => '_set_iterator',
    handles => {_next_file => 'execute'},
);
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head3 xpath |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
The B<xpath> attribute holds the current L<XML::XPath> object. It is |
309
|
|
|
|
|
|
|
automatically set by the L</next_record> method. |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=cut |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# The XML::XPath object for the current file. Read-only to callers and not
# settable through the constructor; it is changed through the private writer.
has 'xpath' => (
    isa      => 'XML::XPath',
    is       => 'ro',
    init_arg => undef,
    writer   => '_set_xpath',
);
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=head1 SEE ALSO |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::XML>, |
324
|
|
|
|
|
|
|
L<Path::Class::File>, L<Path::Class::Rule>, L<Path::Iterator::Rule>, |
325
|
|
|
|
|
|
|
L<XML::XPath> |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=cut |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
with 'ETL::Pipeline::Input'; |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head1 AUTHOR |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu> |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head1 LICENSE |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
Copyright 2016 (c) Vanderbilt University Medical Center |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under |
341
|
|
|
|
|
|
|
the same terms as Perl itself. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=cut |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
no Moose; |
346
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |