File Coverage

blib/lib/ETL/Pipeline/Input/Excel.pm
Criterion Covered Total %
statement 32 32 100.0
branch n/a
condition n/a
subroutine 12 12 100.0
pod 2 2 100.0
total 46 46 100.0


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             ETL::Pipeline::Input::Excel - Input source for Microsoft Excel spreadsheets
6              
7             =head1 SYNOPSIS
8              
9             use ETL::Pipeline;
10             ETL::Pipeline->new( {
11             input => ['Excel', matching => qr/\.xlsx$/i],
12             mapping => {First => 'A', Second => qr/ID\s*Num/i},
13             output => ['UnitTest']
14             } )->process;
15              
16             =head1 DESCRIPTION
17              
18             B<ETL::Pipeline::Input::Excel> defines an input source for reading MS Excel
19             spreadsheets. It uses L<Spreadsheet::XLSX> or L<Spreadsheet::ParseExcel>,
20             depending on the file type (XLSX or XLS).
21              
22             =cut
23              
24 1     1   20 use 5.014000;
  1         2  
25 1     1   3 use warnings;
  1         1  
  1         32  
26              
27             package ETL::Pipeline::Input::Excel;
28 1     1   3 use Moose;
  1         1  
  1         8  
29              
30 1     1   4776 use Carp;
  1         1  
  1         58  
31 1     1   546 use List::AllUtils qw/first/;
  1         6930  
  1         66  
32 1     1   741 use Spreadsheet::ParseExcel;
  1         37563  
  1         30  
33 1     1   8 use Spreadsheet::ParseExcel::Utility qw/int2col/;
  1         1  
  1         44  
34 1     1   538 use Spreadsheet::XLSX;
  1         52649  
  1         43  
35 1     1   9 use String::Util qw/hascontent/;
  1         1  
  1         618  
36              
37              
38             our $VERSION = '2.00';
39              
40              
41             =head1 METHODS & ATTRIBUTES
42              
43             =head2 Arguments for L<ETL::Pipeline/input>
44              
45             B<ETL::Pipeline::Input::DelimitedText> implements L<ETL::Pipeline::Input::File>
46             and L<ETL::Pipeline::Input::TabularFile>. It supports all of the attributes
47             from these roles.
48              
49             =head3 return_blank_rows
50              
51             L<Spreadsheet::XLSX> or L<Spreadsheet::ParseExcel> can't identify merged rows.
52             Merged rows simply appear as blanks. So by default,
53             B<ETL::Pipeline::Input::Excel> skips over blank rows. Merged rows look like one
54             record. When counting headers, do not count empty rows.
55              
56             This boolean attribute overrides the default behaviour.
57             B<ETL::Pipeline::Input::Excel> returns blank rows as an empty record.
58              
59             =cut
60              
61             has 'return_blank_rows' => (
62             default => 0,
63             is => 'rw', # Allow callbacks to change this flag!
64             isa => 'Bool',
65             );
66              
67              
68             =head3 worksheet
69              
70             B<worksheet> reads data from a specific worksheet. By default,
71             B<ETL::Pipeline::Input::Excel> uses the first worksheet.
72              
73             B<worksheet> accepts a string or regular expression. As a string, B<worksheet>
74             looks for an exact match. As a regular expression, B<worksheet> finds the first
75             worksheet whose name matches the regular expression. Note that B<worksheet>
76             stops looking once it finds the first mach.
77              
78             B<ETL::Pipeline::Input::Excel> throws an error if it cannot find a worksheet
79             with a matching name.
80              
81             =cut
82              
83             has 'worksheet' => (
84             is => 'ro',
85             isa => 'Maybe[RegexpRef|Str]',
86             );
87              
88              
89             =head3 password
90              
91             B<password> works with encrypted files. B<ETL::Pipeline::Input::Excel> decrypts
92             the file automatically.
93              
94             B<Warning:> B<password> only works with Excel 2003 file (XLS). Encrypted XLSX
95             files always fail. L<Spreadsheet::XLSX> does not support encryption.
96              
97             =cut
98              
99             has 'password' => (
100             is => 'ro',
101             isa => 'Maybe[Str]',
102             );
103              
104              
105             =head2 Called from L<ETL::Pipeline/process>
106              
107             =head3 get
108              
109             B<get> retrieves one field from the current record. B<get> accepts one
110             parameter. That parameter can be an index number, a column name, or a regular
111             expression to match against column names.
112              
113             $etl->get( 0 );
114             $etl->get( 'A' );
115             $etl->get( 'First' );
116             $etl->get( qr/\bfirst\b/i );
117              
118             =cut
119              
120             sub get {
121             my ($self, $column) = @_;
122              
123             return undef unless $column =~ m/^\d+$/;
124              
125             my $row = $self->row;
126             my $cell = $self->tab->{Cells}->[$row][$column];
127             return defined( $cell ) ? $cell->value : undef;
128             }
129              
130              
131             =head3 next_record
132              
133             Read one record from the file for processing. B<next_record> returns a boolean.
134             I<True> means success. I<False> means it reached the end of the file.
135              
136             while ($input->next_record) {
137             ...
138             }
139              
140             =cut
141              
142             sub next_record {
143             my ($self) = @_;
144              
145             # If the last read grabbed the last row, then we've reached the end. This
146             # read will fail.
147             my $row = $self->row;
148             my $last_row = $self->tab->{MaxRow};
149             return 0 if $row == $last_row;
150              
151             # We're still in the data, so retrieve this row.
152             my $empty = 1;
153             my $cells = $self->tab->{Cells};
154              
155             # Skip blank rows, but don't loop forever.
156             while ($row <= $last_row && $empty) {
157             $row++;
158             if ($self->return_blank_rows) {
159             $empty = 0;
160             } else {
161             $empty = 1;
162             foreach my $column ($self->tab->{MinCol} .. $self->tab->{MaxCol}) {
163             if (hascontent( $cells->[$row][$column]->value )) {
164             $empty = 0;
165             last;
166             }
167             }
168             }
169             }
170             $self->row( $row );
171              
172             # If it's an emtpy row, then we reached the end of the data.
173             return ($empty ? 0 : 1);
174             }
175              
176              
177             =head3 get_column_names
178              
179             B<get_column_names> reads the field names from the first row in the file.
180             L</get> can match field names using regular expressions.
181              
182             =cut
183              
184             sub get_column_names {
185 2     2 1 4 my ($self) = @_;
186            
187 2         8 $self->next_record;
188             $self->add_column( $self->get( $_ ), $_ )
189 2         46 foreach ($self->tab->{MinCol} .. $self->tab->{MaxCol});
190             }
191              
192              
193             =head3 configure
194              
195             B<configure> opens the MS Excel spread sheet for reading. It creates the
196             correct worksheet object for XLS versus XLSX. XLS and XLSX files are different
197             formats. B<ETL::Pipeline::Input::Excel> uses the correct module for this
198             specific file.
199              
200             Both Excel parsers use coulmn numbers, starting with zero. B<configure>
201             automatically creates aliases for the column letters.
202              
203             =cut
204              
205             sub configure {
206             my ($self) = @_;
207              
208             # Create the correct worksheet objects based on the file format.
209             my $path = $self->file->stringify;
210             my $workbook;
211            
212             if ($path =~ m/\.xls$/i) {
213             my $excel = Spreadsheet::ParseExcel->new( Password => $self->password );
214             $workbook = $excel->parse( $path );
215             croak "Unable to open the Excel file $path" unless defined $workbook;
216             } else {
217             $workbook = Spreadsheet::XLSX->new( $path );
218             croak "Unable to open the Excel file $path" unless defined $workbook;
219             }
220              
221             # Find the worksheet with data...
222             my $name = $self->worksheet;
223             my $worksheet;
224             if (hascontent( $name )) {
225             if (ref( $name ) eq 'Regexp') {
226             $worksheet = first { $_->get_name() =~ m/$name/ } $workbook->worksheets();
227             } else {
228             $worksheet = $workbook->worksheet( $name );
229             }
230             croak "No workseets match '$name'" unless defined $worksheet;
231             } else {
232             $worksheet = $workbook->worksheet( 0 );
233             croak "'$path' has no worksheets" unless defined $worksheet;
234             }
235             $self->tab( $worksheet );
236              
237             # Convert the column numbers into their letter designations. Do this here
238             # instead of in getl_column_names. The letters apply to every spread sheet
239             # - even if no_column_names = 1, which bypasses get_column_names.
240             $self->add_column( int2col( $_ ), $_ )
241             foreach ($worksheet->{MinCol} .. $worksheet->{MaxCol});
242              
243             # Start on the first row as defined by the spread sheet.
244             $self->row( $worksheet->{MinRow} - 1 );
245             }
246              
247              
248             =head3 finish
249              
250             B<finish> closes the file.
251              
252             =cut
253              
254       6 1   sub finish {}
255              
256              
257             =head2 Other Methods & Attributes
258              
259             =head3 row
260              
261             B<row> is the next row in the spreadsheet for reading. Because
262             B<ETL::Pipeline::Input::Excel> skips blank rows, the L</record_number> may not
263             match the row number.
264              
265             =cut
266              
267             has 'row' => (
268             is => 'rw',
269             isa => 'Int',
270             );
271              
272              
273             =head3 tab
274              
275             B<tab> holds the current worksheet object. The Excel parsers return an object
276             for the tab (worksheet) with the data. It is set by L</find_worksheet>.
277              
278             =cut
279              
280             has 'tab' => (
281             is => 'rw',
282             isa => 'Maybe[Spreadsheet::ParseExcel::Worksheet]',
283             );
284              
285              
286             =head1 SEE ALSO
287              
288             L<ETL::Pipeline>, L<ETL::Pipeline::Input>, L<ETL::Pipeline::Input::File>,
289             L<ETL::Pipeline::Input::Tabular>
290              
291             =cut
292              
293             with 'ETL::Pipeline::Input::File';
294             with 'ETL::Pipeline::Input::Tabular';
295             with 'ETL::Pipeline::Input';
296              
297              
298             =head1 AUTHOR
299              
300             Robert Wohlfarth <robert.j.wohlfarth@vanderbilt.edu>
301              
302             =head1 LICENSE
303              
304             Copyright 2016 (c) Vanderbilt University Medical Center
305              
306             This program is free software; you can redistribute it and/or modify it under
307             the same terms as Perl itself.
308              
309             =cut
310              
311 1     1   52 no Moose;
  1         2  
  1         9  
312             __PACKAGE__->meta->make_immutable;