File Coverage

blib/lib/Text/CSV/Merge.pm
Criterion Covered Total %
statement 27 59 45.7
branch 1 20 5.0
condition 0 12 0.0
subroutine 9 13 69.2
pod 2 2 100.0
total 39 106 36.7


line stmt bran cond sub pod time code
1             package Text::CSV::Merge;
2             # ABSTRACT: Merge two CSV files
3              
4 4     4   162647 use Modern::Perl '2010';
  4         29202  
  4         22  
5 4     4   2454 use Moo 1.001000;
  4         37516  
  4         23  
6 4     4   4283 use IO::File;
  4         11  
  4         483  
7 4     4   3000 use Text::CSV_XS;
  4         30387  
  4         229  
8 4     4   4974 use DBI; # for DBD::CSV
  4         49575  
  4         255  
9 4     4   2263 use Log::Dispatch;
  4         691052  
  4         133  
10 4     4   1954 use autodie;
  4         50307  
  4         22  
11 4     4   18628 use utf8;
  4         6  
  4         32  
12              
13              
14             has +logger => (
15             is => 'lazy',
16             builder => sub {
17 0     0   0 Log::Dispatch->new(
18             outputs => [
19             [ 'File', autoflush => 1, min_level => 'debug', filename => 'merge.log', newline => 1, mode => '>>' ],
20             #[ 'Screen', min_level => 'info', newline => 1 ],
21             ],
22             );
23             }
24             );
25              
26             has +csv_parser => (
27             is => 'lazy',
28             builder => sub {
29 0 0   0   0 Text::CSV_XS->new({ binary => 1, eol => $/ })
30             or die "Cannot use CSV: " . Text::CSV_XS->error_diag();
31             }
32             );
33              
34             has +dbh => (
35             is => 'lazy',
36             # must return only a code ref
37             builder => sub {
38 0 0   0   0 DBI->connect("dbi:CSV:", undef, undef, {
39             RaiseError => 1,
40             PrintError => 1,
41             f_ext => ".csv/r",
42             # Better performance with XS
43             csv_class => "Text::CSV_XS",
44             # csv_null => 1,
45             }) or die "Cannot connect: $DBI::errstr";
46             }
47             );
48              
49             has base_file => (
50             is => 'rw',
51             required => 1,
52             #allow external names to be different from class attribute
53             init_arg => 'base',
54             #validate it
55             #isa => sub {},
56             coerce => sub {
57             my $base_fh = IO::File->new( $_[0], '<' ) or die "$_[0]: $!";
58             $base_fh->binmode(":utf8");
59            
60             return $base_fh;
61             }
62             );
63              
64             has merge_file => (
65             # We use only the raw file name/path and do not create a FH here, unlike base_file().
66             is => 'rw',
67             init_arg => 'merge',
68             required => 1
69             );
70              
71             has output_file => (
72             is => 'rw',
73             init_arg => 'output',
74             # an output file name is NOT required
75             required => 0,
76             default => 'merged_output.csv',
77             coerce => sub {
78             my $output_fh = IO::File->new( $_[0], '>' ) or die "$_[0]: $!";
79             $output_fh->binmode(":utf8");
80            
81             return $output_fh;
82             }
83             );
84              
85             has columns=> (
86             is => 'rw',
87             required => 1,
88             );
89              
90             has search_field => (
91             is => 'rw',
92             required => 1,
93             init_arg => 'search'
94             );
95              
96             has first_row_is_headers => (
97             is => 'rw',
98             required => 1,
99             #validate it
100             isa => sub {
101             # @TODO: there's got to be a better way to do this!
102             die "Must be 1 or 0" unless $_[0] =~ /'1'|'0'/ || $_[0] == 1 || $_[0] == 0;
103             },
104             );
105              
106             #=method BUILD
107             #Constructor.
108             #=cut
109             #sub BUILD {
110             # my $self = shift;
111             #}
112              
113             sub merge {
114 0     0 1 0 my $self = shift;
115            
116 0         0 $self->csv_parser->column_names( $self->columns );
117            
118             # Loop through the base file to find missing data
119             #@TODO: make into $self->rows?
120 0         0 my @rows;
121            
122 0         0 while ( my $row = $self->csv_parser->getline_hr( $self->base_file ) ) {
123             # skip column names
124 0 0 0     0 next if ($. == 1 and $self->first_row_is_headers);
125              
126 0 0       0 if ( $self->csv_parser->parse($row) ) {
127             # keep a list of null columns in this row
128 0         0 my @nulls;
129              
130             # might be slightly more efficient to use while()
131 0         0 foreach my $key ( keys %{$row} ) {
  0         0  
132             # which fields is this row missing?
133 0 0 0     0 if ( $row->{$key} eq 'NULL' or $row->{$key} eq "" ) {
134 0         0 push @nulls, $key;
135              
136 0         0 $self->logger->info("Missing data in column: $key for '$row->{$self->search_field}'");
137             }
138             }
139              
140             # make a hash of arrays
141 0 0       0 if ( @nulls ) {
142             # search $merge_file for the missing data's row
143 0         0 $" = ','; # reset the list separator for array interpolation to suit SQL
144            
145             # To get the original case for the columns, specify the column
146             # names rather than using SELECT *, since it normalizes to
147             # lowercase, per:
148             # http://stackoverflow.com/questions/3350775/dbdcsv-returns-header-in-lower-case
149 0 0       0 my $sth = $self->dbh->prepare(
150 0         0 "select @{$self->columns} from $self->{merge_file} where $self->{search_field} = ?"
151             ) or die "Cannot prepare: " . $self->dbh->errstr ();
152              
153 0         0 $sth->execute($row->{$self->search_field});
154            
155 0         0 while ( my $filler = $sth->fetchrow_hashref() ) {
156 0         0 foreach my $gap ( @nulls ) {
157 0 0 0     0 if (exists $filler->{$gap} and defined $filler->{$gap} and $filler->{$gap} ne "") {
      0        
158 0         0 $self->logger->info(
159             "Found Data: '$gap' = '$filler->{$gap}' for '$row->{$self->search_field}'"
160             );
161            
162 0         0 $row->{$gap} = $filler->{$gap};
163             } else {
164 0         0 $self->logger->info(
165             "Data not Found for column: '$gap' for '$row->{$self->search_field}' $self->{merge_file}"
166             );
167             }
168             }
169             }
170            
171             # Be efficient and neat!
172 0         0 $sth->finish();
173             }
174            
175             # insert the updated row as a reference; even if not processed, the
176             # row will still appear in the final output.
177 0         0 push @rows, $row;
178             } else {
179 0         0 my $err = $self->csv_parser->error_input;
180 0         0 $self->logger->error("Failed to parse line: $err");
181             }
182             }
183              
184             # Ensure we've processed to the end of the file
185 0 0       0 $self->csv_parser->eof or $self->csv_parser->error_diag();
186              
187             # print does NOT want an actual array! Use a hash slice, instead:
188             #$self->csv_parser->print($output_fh, [ @$_{@columns} ]) for @rows;
189             #
190             # Or, here I've switched to Text::CSV_XS's specific print_hr(), which
191             # is simply missing from the PP (Pure Perl) version.
192 0         0 $self->csv_parser->print_hr($self->output_file, $_) for @rows;
193             };
194              
195             sub DEMOLISH {
196 1     1 1 1867 my $self = shift;
197              
198             ## Clean up!
199 1         17 $self->base_file->close();
200 1 50       45 $self->output_file->close() or die "output.csv: $!";
201             }
202              
203              
204             1;
205              
206             __END__
207              
208             =pod
209              
210             =head1 NAME
211              
212             Text::CSV::Merge - Merge two CSV files
213              
214             =head1 VERSION
215              
216             version 0.002
217              
218             =head1 SYNOPSIS
219              
220             my $merger = Text::CSV::Merge->new({
221             base => 'into.csv',
222             merge => 'from.csv',
223             output => 'output.csv', # optional
224             columns => [q/email name age/],
225             search => 'email',
226             first_row_is_headers => 1 # optional
227             });
228              
229             $merger->merge();
230              
231             =head1 DESCRIPTION
232              
233             The use case for this module is when one has two CSV files of largely the same structure. Yet, the 'from.csv' has data which 'into.csv' lacks.
234              
235             In this initial release, Text::CSV::Merge only fills in empty cells; it does not overwrite data in 'into.csv' which also exists in 'from.csv'.
236              
237             =head2 Subclassing
238             Text::CSV::Merge may be subclassed. In the subclass, the following attributes may be overridden:
239              
240             =over 4
241              
242             =item *
243              
244             C<csv_parser>
245              
246             =item *
247              
248             C<dbh>
249              
250             =item *
251              
252             C<logger>
253              
254             =back
255              
256             =head1 ATTRIBUTES
257              
258             =head2 logger
259             The logger for all operations in this module.
260              
261             The logger records data gaps in the base CSV file, and records which data from the merge CSV file was used fill the gaps in the base CSV file.
262              
263             =head2 csv_parser
264             The CSV parser used internally is an immutable class property.
265              
266             The internal CSV parser is the XS version of Text::CSV: Text::CSV_XS. You may use Text::CSV::PP if you wish, but using any other parser which does not duplicate Text::CSV's API will probably not work without modifying the source of this module.
267              
268             Text::CSV_XS is also used, hard-coded, as the parser for DBD::CSV. This is configurable, however, and may be made configurable by the end-user in a future release. It can be overridden in a subclass.
269              
270             =head2 dbh
271             Create reusable DBI connection to the CSV data to be merged in to base file.
272              
273             This method is overridable in a subclass. A good use of this would be to merge data into an existing CSV file from a database, or XML file. It must conform to the DBI's API, however.
274              
275             DBD::CSV is a base requirement for this module.
276              
277             =head2 base_file
278             The CSV file into which new data will be merged.
279              
280             The base file is readonly, not read-write. This prevents accidental trashing of the original data.
281              
282             =head2 merge_file
283             The CSV file used to find data to merge into C<base_file>.
284              
285             =head2 output_file
286             The output file into which the merge results are written.
287              
288             I felt it imperative not to alter the original data files. I may make this a configurable option in the future, but wold likely set its default to 'false'.
289              
290             =head2 columns
291             The columns to be merged.
292              
293             A column to be merged must exist in both C<base_file> and C<merge_file>. Other than that requirement, each file may have other columns which do not exist in the other.
294              
295             =head2 search_field
296             The column/field to match rows in C<merge_file>.
297              
298             This column must exist in both files and be identially cased.
299              
300             =head2 first_row_is_headers
301             1 if the CSV files' first row are its headers; 0 if not.
302              
303             If there are no headers, then the column names supplied by the C<columns> argument/property are applied to the columns in each file virtually, in numerical orders as they were passed in the list.
304              
305             =head1 METHODS
306              
307             =head2 merge()
308             Main method and is public.
309              
310             C<merge()> performs the actual merge of the two CSV files.
311              
312             =head2 DEMOLISH()
313             This method locally overrides a Moo built-in.
314              
315             It close out all file handles, which will only occur after a call to C<merge()>.
316              
317             =head1 SEE ALSO
318              
319             =over 4
320              
321             =item *
322              
323             L<Text::CSV_XS>
324              
325             =back
326              
327             =head1 AUTHOR
328              
329             Michael Gatto <mgatto@lisantra.com>
330              
331             =head1 COPYRIGHT AND LICENSE
332              
333             This software is copyright (c) 2013 by Michael Gatto.
334              
335             This is free software; you can redistribute it and/or modify it under
336             the same terms as the Perl 5 programming language system itself.
337              
338             =cut