File Coverage

blib/lib/ETL/Pipeline/Input/JsonFiles.pm
Criterion Covered Total %
statement 29 29 100.0
branch 3 4 75.0
condition 2 3 66.6
subroutine 8 8 100.0
pod 1 1 100.0
total 43 45 95.5


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::JsonFiles - Process JSON content from individual files
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['JsonFiles', iname => qr/\.json$/i, records_at => '/json'],
12             mapping => {First => '/File/A', Second => '/File/Patient'},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::JsonFiles> defines an input source that reads one or
19             more records from one or more JSON files. Typically each file holds a single
20             record, but the class also handles files that contain multiple records.
21              
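               For example, a hypothetical input file matching the SYNOPSIS above might
               look like this (the key names are purely illustrative):

                  {
                     "json": [
                        {"File": {"A": "first value",  "Patient": "John Doe"}},
                        {"File": {"A": "second value", "Patient": "Jane Doe"}}
                     ]
                  }

               With C<records_at> set to C</json>, each element of that array becomes one
               record, and the mapping paths C</File/A> and C</File/Patient> are resolved
               against each record in turn.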
22             =cut
23              
24             package ETL::Pipeline::Input::JsonFiles;
25              
26 1     1   26 use 5.014000;
  1         3  
27 1     1   5 use warnings;
  1         1  
  1         32  
28              
29 1     1   4 use Carp;
  1         2  
  1         74  
30 1     1   6 use Data::DPath qw/dpath/;
  1         2  
  1         10  
31 1     1   1202 use JSON;
  1         7892  
  1         7  
32 1     1   142 use Moose;
  1         2  
  1         11  
33              
34              
35             our $VERSION = '2.00';
36              
37              
38             =head1 METHODS & ATTRIBUTES
39              
40             =head2 Arguments for L<ETL::Pipeline/input>
41              
42             =head3 records_at
43              
44             Optional. The path to the record nodes, such as C</json/Record>. The last
45             step in the path names the root node of each individual record. The default
46             is C</> - the entire file is treated as a single record.
47              
48             You might use this attribute in two cases...
49              
50             =over
51              
52             =item 1. Multiple records per file. This is the top of each record, like in L<ETL::Pipeline::Input::Xml>.
53              
54             =item 2. Shorthand to leave off extra nodes from every path. There is one record per file, but you don't want extra path parts at the beginning of every field.
55              
56             =back
57              
58             This can be any value accepted by L<Data::DPath>.
59              
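               For illustration, here are two hypothetical layouts and matching settings
               (the key names are invented for the example):

                  # Case 1 - several records in one file, stored under a single key:
                  #   { "Records": [ {"Name": "..."}, {"Name": "..."} ] }
                  input   => ['JsonFiles', iname => qr/\.json$/i, records_at => '/Records'],
                  mapping => {Name => '/Name'},

                  # Case 2 - one record per file, nested below wrapper nodes:
                  #   { "Envelope": { "Body": {"Name": "..."} } }
                  input   => ['JsonFiles', iname => qr/\.json$/i, records_at => '/Envelope/Body'],
                  mapping => {Name => '/Name'},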
60             =cut
61              
62             has 'records_at' => (
63             default => '/',
64             is => 'ro',
65             isa => 'Str',
66             );
67              
68              
69             =head3 skipping
70              
71             Not used. This attribute is ignored. JSON files follow a strict format with no
72             header rows to skip - everything in the file is parsed as data.
73              
74             =head2 Methods
75              
76             =head3 run
77              
78             This is the main loop. It opens the file, reads records, and closes it when
79             done. This is the place to look if there are problems.
80              
81             L<ETL::Pipeline> automatically calls this method.
82              
83             =cut
84              
85             sub run {
86 3     3 1 9 my ($self, $etl) = @_;
87              
88 3         49 my $parser = JSON->new->utf8;
89 3         14 while (my $path = $self->next_path( $etl )) {
90 6         41 my $text = $path->slurp; # Force scalar context, otherwise slurp breaks it into lines.
91 6         1295 my $json = $parser->decode( $text );
92 6 50       19 croak "JSON file '$path', unable to parse" unless defined $json;
93              
94             # Find the node that is an array of records. This comes from the
95             # "records_at" attribute.
96             #
97             # I assume that records are field/value pairs - a Perl hash. So if the
98             # DPath matches an array, it is an array of records. I need to
99             # de-reference that list to get to the actual records.
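               # For illustration (hypothetical data): dpath( '/Records' ) against
               # {Records => [{Name => 'A'}, {Name => 'B'}]} returns one array reference,
               # so the records come from inside it. dpath( '/' ) against a plain hash
               # returns that hash itself, and it is the only record.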
100 6         193 my @matches = dpath( $self->records_at )->match( $json );
101 6 100 66     1133 my $list = (scalar( @matches ) == 1 && ref( $matches[0] ) eq 'ARRAY') ? $matches[0] : \@matches;
102              
103             # Process each record. And that's it. The record is a Perl data
104             # structure corresponding with the JSON structure.
105 6         30 $etl->record( $_ ) foreach (@$list);
106             }
107             }
108              
109              
110             =head1 SEE ALSO
111              
112             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File::List>,
113             L<JSON>
114              
115             =cut
116              
117             with 'ETL::Pipeline::Input';
118             with 'ETL::Pipeline::Input::File::List';
119              
120              
121             =head1 AUTHOR
122              
123             Robert Wohlfarth <robert.j.wohlfarth@vumc.org>
124              
125             =head1 LICENSE
126              
127             Copyright 2021 (c) Vanderbilt University Medical Center
128              
129             This program is free software; you can redistribute it and/or modify it under
130             the same terms as Perl itself.
131              
132             =cut
133              
134 1     1   6621 no Moose;
  1         2  
  1         5  
135             __PACKAGE__->meta->make_immutable;