File Coverage

lib/UR/DataSource/File.pm
Criterion Covered Total %
statement 510 658 77.5
branch 213 362 58.8
condition 53 101 52.4
subroutine 56 59 94.9
pod 3 10 30.0
total 835 1190 70.1


line stmt bran cond sub pod time code
1             package UR::DataSource::File;
2              
3             # NOTE! This module is deprecated. Use UR::DataSource::Filesystem instead.
4              
5             # A data source implementation for text files where the fields
6             # are delimited by commas (or anything else really). Usually,
7             # the lines in the file will be sorted by one or more columns,
8             # but it isn't strictly necessary
9             #
10             # For now, it's structured around files where the record is delimited by
11             # newlines, and the fields are delimited by qr(\s*,\s*). Those are
12             # overridable in concrete data sources by specifying record_seperator() and
13             # delimiter().
14             # FIXME - work out a way to support record-oriented data as well as line-oriented data
15              
16 19     19   535 use UR;
  19         26  
  19         123  
17 19     19   69 use strict;
  19         26  
  19         328  
18 19     19   63 use warnings;
  19         24  
  19         851  
19             our $VERSION = "0.46"; # UR $VERSION;
20              
21 19     19   72 use Fcntl qw(:DEFAULT :flock);
  19         20  
  19         7662  
22 19     19   94 use Errno qw(EINTR EAGAIN EOPNOTSUPP);
  19         27  
  19         2308  
23 19     19   77 use File::Temp;
  19         30  
  19         1116  
24 19     19   79 use File::Basename;
  19         31  
  19         847  
25 19     19   73 use IO::File qw();
  19         22  
  19         24865  
26              
27             our @CARP_NOT = qw( UR::Context UR::DataSource::FileMux UR::Object::Type );
28              
29             class UR::DataSource::File {
30             is => ['UR::DataSource'],
31             has => [
32             delimiter => { is => 'String', default_value => '\s*,\s*', doc => 'Delimiter between columns on the same line' },
33             record_separator => { is => 'String', default_value => "\n", doc => 'Delimiter between lines in the file' },
34             column_order => { is => 'ARRAY', doc => 'Names of the columns in the file, in order' },
35             skip_first_line => { is => 'Integer', default_value => 0, doc => 'Number of lines at the start of the file to skip' },
36             handle_class => { is => 'String', default_value => 'IO::File', doc => 'Class to use for new file handles' },
37             quick_disconnect => { is => 'Boolean', default_value => 1, doc => 'Do not hold the file handle open between requests' },
38             ],
39             has_optional => [
40             server => { is => 'String', doc => 'pathname to the data file' },
41             file_list => { is => 'ARRAY', doc => 'list of pathnames of equivalent files' },
42             sort_order => { is => 'ARRAY', doc => 'Names of the columns by which the data file is sorted' },
43             constant_values => { is => 'ARRAY', doc => 'Property names which are not in the data file(s), but are part of the objects loaded from the data source' },
44              
45            
46             # REMOVE
47             #file_cache_index => { is => 'Integer', doc => 'index into the file cache where the next read will be placed' },
48             _open_query_count => { is => 'Integer', doc => 'number of queries currently using this data source, used internally' },
49            
50             ],
51             doc => 'A data source for line-oriented files',
52             };
53              
54              
55 10     10 0 23 sub can_savepoint { 0;} # Doesn't support savepoints
56            
57             sub create_default_handle {
58 375     375 0 416 my $self = shift;
59              
60 375 50       1113 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
61 0         0 my $time = time();
62 0         0 UR::DBI->sql_fh->printf("\nFILE OPEN AT %d [%s]\n",$time, scalar(localtime($time)));
63             }
64              
65 375         1459 my $filename = $self->server;
66 375 100       11748 unless (-e $filename) {
67             # file doesn't exist
68 2         6 $filename = '/dev/null';
69             }
70              
71 375         1174 my $handle_class = $self->handle_class;
72 375         2584 my $fh = $handle_class->new($filename);
73 375 50       27407 unless($fh) {
74 0         0 $self->error_message("Can't open ".$self->server." for reading: $!");
75 0         0 return;
76             }
77              
78 375 50       1027 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
79 0         0 UR::DBI->sql_fh->printf("FILE: opened %s fileno %d\n\n",$self->server, $fh->fileno);
80             }
81              
82 375         1426 $self->is_connected(1);
83 375         793 return $fh;
84             }
85              
86             sub disconnect {
87 2     2 0 6 my $self = shift;
88              
89 2 50       4 if ($self->has_default_handle) {
90 2         6 my $fh = $self->get_default_handle;
91 2         26 flock($fh,LOCK_UN);
92 2         14 $fh->close();
93 2         192 $self->__invalidate_get_default_handle__;
94 2         8 $self->is_connected(0);
95             }
96             }
97              
98             sub _file_position {
99 1222     1222   1005 my $self = shift;
100 1222         2479 my $fh = $self->get_default_handle;
101 1222 50       3391 return $fh ? $fh->tell() : undef;
102             }
103              
104             sub prepare_for_fork {
105 2     2 0 4 my $self = shift;
106              
107             # make sure this is clear before we fork
108 2         6 $self->{'_fh_position'} = undef;
109 2 50       8 if ($self->has_default_handle) {
110 2         10 $self->{'_fh_position'} = $self->_file_position();
111 2 50       24 UR::DBI->sql_fh->printf("FILE: preparing to fork; closing file %s and noting position at %s\n",$self->server, $self->{'_fh_position'}) if $ENV{'UR_DBI_MONITOR_SQL'};
112             }
113 2         36 $self->disconnect_default_handle;
114             }
115              
116             sub finish_up_after_fork {
117 2     2 0 19 my $self = shift;
118 2 50       29 if (defined $self->{'_fh_position'}) {
119 2 50       27 UR::DBI->sql_fh->printf("FILE: resetting after fork; reopening file %s and fast-forwarding to %s\n",$self->server, $self->{'_fh_position'}) if $ENV{'UR_DBI_MONITOR_SQL'};
120 2         30 my $fh = $self->get_default_handle;
121 2         24 $fh->seek($self->{'_fh_position'},0);
122             }
123             }
124              
125             sub _regex {
126 376     376   386 my $self = shift;
127              
128 376 100       778 unless ($self->{'_regex'}) {
129 26         96 my $delimiter = $self->delimiter;
130 26         49 my $r = eval { qr($delimiter) };
  26         464  
131 26 50 33     158 if ($@ || !$r) {
132 0         0 $self->error_message("Unable to interepret delimiter '".$self->delimiter.": $@");
133 0         0 return;
134             }
135 26         68 $self->{'_regex'} = $r;
136             }
137 376         514 return $self->{'_regex'};
138             }
139              
140             # We're overriding server() so everyone else can have a single way of getting
141             # the file's pathname instead of having to know about both server and file_list
142             sub server {
143 45     45 1 51 my $self = shift;
144              
145 45 100       95 unless ($self->{'_cached_server'}) {
146 16 100       65 if ($self->__server()) {
    50          
147 15         39 $self->{'_cached_server'} = $self->__server();
148             } elsif ($self->file_list) {
149 1         2 my $files = $self->file_list;
150 1         2 my $count = scalar(@$files);
151 1         3 my $idx = $$ % $count;
152 1         3 $self->{'_cached_server'} = $files->[$idx];
153             } else {
154 0         0 die "Data source ",$self->id," didn't specify either server or file_list";
155             }
156             }
157 45         81 return $self->{'_cached_server'};
158             }
159              
160              
161             # Should be divisible by 3
162             our $MAX_CACHE_SIZE = 99;
163             # The offset cache is an arrayref containing three pieces of data:
164             # 0: If this cache slot is being used by a loading iterator
165             # 1: concatenated data from the sorted columns for comparison with where you are in the file
166             # 2: the seek position that line came from
167             sub _offset_cache {
168 1116     1116   974 my $self = shift;
169              
170 1116 100       1896 unless ($self->{'_offset_cache'}) {
171 25         58 $self->{'_offset_cache'} = [];
172             }
173 1116         1181 return $self->{'_offset_cache'};
174             }
175              
176             our %iterator_data_source;
177             our %iterator_cache_slot_refs;
178              
179             sub _allocate_offset_cache_slot {
180 372     372   381 my $self = shift;
181              
182 372         620 my $cache = $self->_offset_cache();
183 372         454 my $next = scalar(@$cache);
184             #print STDERR "_allocate_offset_cache_slot ".$self->server." current size is $next ";
185 372 100       724 if ($next > $MAX_CACHE_SIZE) {
186             #print STDERR "searching... \n";
187 282         493 my $last_offset_cache_slot = $self->{'_last_offset_cache_slot'};
188 282 100       552 if ($last_offset_cache_slot >= $MAX_CACHE_SIZE) {
189 9         15 $next = 0;
190             } else {
191 273         385 $next = $last_offset_cache_slot + 3;
192             }
193             # Search for an unused slot
194 282   33     869 while ($cache->[$next] and $next != $last_offset_cache_slot) {
195 0         0 $next += 3;
196 0 0       0 $next = 0 if ($next > $MAX_CACHE_SIZE);
197             }
198 282 50 33     1565 if ($next > $MAX_CACHE_SIZE or $next eq $last_offset_cache_slot) {
199             #print STDERR scalar(keys(%iterator_data_source))." items in iterator_data_source ".scalar(keys(%iterator_cache_slot))." in iterator_cache_slot\n";
200 0         0 Carp::carp("Unable to find an open file offset cache slot because there are too many outstanding loading iterators. Temporarily expanding the cache...");
201             # We'll let it go ahead and expand the list
202 0         0 $next = $MAX_CACHE_SIZE;
203 0         0 $MAX_CACHE_SIZE += 3;
204             }
205             }
206 372         423 $cache->[$next] = 1;
207 372         500 $cache->[$next+1] = undef;
208 372         489 $cache->[$next+2] = undef;
209              
210 372         436 $self->{'_last_offset_cache_slot'} = $next;
211             #print STDERR "using slot $next current size ".scalar(@$cache)."\n";
212 372         550 return $next;
213             }
214              
215              
216             sub _free_offset_cache_slot {
217 372     372   492 my($self, $cache_slot) = @_;
218              
219 372         671 my $cache = $self->_offset_cache();
220 372 50       767 unless ($cache_slot < scalar(@$cache)) {
221 0         0 $self->warning_message("Freeing offset cache slot past the end. Current size ".scalar(@$cache).", requested $cache_slot");
222 0         0 return;
223             }
224              
225 372 50       804 unless (defined $cache->[$cache_slot]) {
226 0         0 $self->warning_message("Freeing unused offset cache slot $cache_slot");
227 0         0 return;
228             }
229              
230 372 50 66     1018 if ($cache->[$cache_slot+1] and scalar(@{$cache->[$cache_slot+1]}) == 0) {
  169         489  
231             # There's no data in here. Must have happened when the reader went all the
232             # way to the end of the file and found nothing. Remove this entry completely
233             # because it's not helpful
234 0         0 splice(@$cache, $cache_slot,3);
235              
236             } else {
237             # There is data in here, mark it as a free slot
238 372         422 $cache->[$cache_slot] = 0;
239             }
240 372         524 return 1;
241             }
242            
243              
244              
245             sub _invalidate_cache {
246 4     4   6 my $self = shift;
247              
248 4         9 $self->{'_offset_cache'} = [];
249            
250 4         15 return 1;
251             }
252              
253             sub _generate_loading_templates_arrayref {
254 49     49   85 my($self,$old_sql_cols) = @_;
255              
256 49         215 my $columns_in_file = $self->column_order;
257 49         149 my %column_to_position_map;
258 49         152 for (my $i = 0; $i < @$columns_in_file; $i++) {
259 136         330 $column_to_position_map{$columns_in_file->[$i]} = $i;
260             }
261              
262             # strip out columns that don't exist in the file
263 49         57 my $sql_cols;
264 49         102 foreach my $column_data ( @$old_sql_cols ) {
265 212         342 my $propertys_column_name = $column_data->[1]->column_name;
266 212 100 66     567 next unless ($propertys_column_name and exists($column_to_position_map{$propertys_column_name}));
267              
268 136         185 push @$sql_cols, $column_data;
269             }
270              
271 49 50       136 unless ($sql_cols) {
272 0         0 $self->error_message("Couldn't determine column information for data source " . $self->id);
273 0         0 return;
274             }
275              
276             # reorder the requested columns to be in the same order as the file
277             my @sql_cols_with_column_name =
278 49         87 map{ [ $column_to_position_map{ $_->[1]->column_name }, $_ ] }
  136         254  
279             @$sql_cols;
280             my @sorted_sql_cols =
281 136         182 map { $_->[1] }
282 49         227 sort { $a->[0] <=> $b->[0] }
  125         205  
283             @sql_cols_with_column_name;
284 49         89 $sql_cols = \@sorted_sql_cols;
285 49         317 my $templates = $self->SUPER::_generate_loading_templates_arrayref($sql_cols);
286              
287 49 100       241 if (my $constant_values = $self->constant_values) {
288             # Find the first unused index in the loading template
289 12         37 my $next_template_slot = -1;
290 12         23 foreach my $tmpl ( @$templates ) {
291 12         11 foreach my $col ( @{$tmpl->{'column_positions'}} ) {
  12         19  
292 36 50       49 if ($col >= $next_template_slot) {
293 36         39 $next_template_slot = $col + 1;
294             }
295             }
296             }
297 12 50       30 if ($next_template_slot == -1) {
298 0         0 die "Couldn't determine last column in loading template for data source" . $self->id;
299             }
300            
301 12         19 foreach my $prop ( @$constant_values ) {
302 18         16 push @{$templates->[0]->{'column_positions'}}, $next_template_slot++;
  18         29  
303 18         15 push @{$templates->[0]->{'property_names'}}, $prop;
  18         30  
304             }
305             }
306            
307 49         220 return $templates;
308             }
309              
310             sub _things_in_list_are_numeric {
311 260     260   258 my $self = shift;
312              
313 260         241 foreach ( @{$_[0]} ) {
  260         493  
314 276 100       1125 return 0 if (! Scalar::Util::looks_like_number($_));
315             }
316 213         715 return 1;
317             }
318              
319             # Construct a closure to perform a test on the $index-th column of
320             # @$$next_candidate_row. The closures return 0 is the test is successful,
321             # -1 if unsuccessful but the file's value was less than $value, and 1
322             # if unsuccessful and greater. The iterator that churns throug the file
323             # knows that if it's comparing an ID/sorted column, and the comparator
324             # returns 1 then we've gone past the point where we can expect to ever
325             # find another successful match and we should stop looking
326             my $ALWAYS_FALSE = sub { -1 };
327             sub _comparator_for_operator_and_property {
328 359     359   618 my($self,$property,$next_candidate_row, $index, $operator,$value) = @_;
329              
330 19     19   96 no warnings 'uninitialized'; # we're handling ''/undef/null specially below where it matters
  19         23  
  19         39244  
331              
332 359 100 100     2629 if ($operator eq 'between') {
    100          
    100          
    100          
    100          
    100          
333 44 100 100     163 if ($value->[0] eq '' or $value->[1] eq '') {
334 28         40 return $ALWAYS_FALSE;
335             }
336              
337 16 50 33     38 if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
338 16 50       31 if ($value->[0] > $value->[1]) {
339             # Will never be true
340 0         0 Carp::carp "'between' comparison will never be true with values ".$value->[0]," and ".$value->[1];
341 0         0 return $ALWAYS_FALSE;
342             }
343              
344             # numeric 'between' comparison
345             return sub {
346 32 50   32   67 return -1 if ($$next_candidate_row->[$index] eq '');
347 0 0       0 if ($$next_candidate_row->[$index] < $value->[0]) {
    0          
348 0         0 return -1;
349             } elsif ($$next_candidate_row->[$index] > $value->[1]) {
350 0         0 return 1;
351             } else {
352 0         0 return 0;
353             }
354 16         77 };
355             } else {
356 0 0       0 if ($value->[0] gt $value->[1]) {
357 0         0 Carp::carp "'between' comparison will never be true with values ".$value->[0]," and ".$value->[1];
358 0         0 return $ALWAYS_FALSE;
359             }
360              
361             # A string 'between' comparison
362             return sub {
363 0 0   0   0 return -1 if ($$next_candidate_row->[$index] eq '');
364 0 0       0 if ($$next_candidate_row->[$index] lt $value->[0]) {
    0          
365 0         0 return -1;
366             } elsif ($$next_candidate_row->[$index] gt $value->[1]) {
367 0         0 return 1;
368             } else {
369 0         0 return 0;
370             }
371 0         0 };
372             }
373              
374             } elsif ($operator eq 'in') {
375 20 100       50 if (! @$value) {
376 8         14 return $ALWAYS_FALSE;
377             }
378              
379 12 100 66     51 if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
380             # Numeric 'in' comparison returns undef if we're within the range of the list
381             # but don't actually match any of the items in the list
382 8         27 @$value = sort { $a <=> $b } @$value; # sort the values first
  0         0  
383             return sub {
384 16 50   16   40 return -1 if ($$next_candidate_row->[$index] eq '');
385 0 0       0 if ($$next_candidate_row->[$index] < $value->[0]) {
    0          
386 0         0 return -1;
387             } elsif ($$next_candidate_row->[$index] > $value->[-1]) {
388 0         0 return 1;
389             } else {
390 0         0 foreach ( @$value ) {
391 0 0       0 return 0 if $$next_candidate_row->[$index] == $_;
392             }
393 0         0 return -1;
394             }
395 8         51 };
396              
397             } else {
398             # A string 'in' comparison
399 4         13 @$value = sort { $a cmp $b } @$value;
  0         0  
400             return sub {
401 8 50   8   29 if ($$next_candidate_row->[$index] lt $value->[0]) {
    50          
402 0         0 return -1;
403             } elsif ($$next_candidate_row->[$index] gt $value->[-1]) {
404 0         0 return 1;
405             } else {
406 8         18 foreach ( @$value ) {
407 8 50       19 return 0 if $$next_candidate_row->[$index] eq $_;
408             }
409 0         0 return -1;
410             }
411 4         26 };
412              
413             }
414              
415             } elsif ($operator eq 'not in') {
416 14 100       41 if (! @$value) {
417 4         6 return $ALWAYS_FALSE;
418             }
419              
420 10 100 100     38 if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
421             return sub {
422 8 50   8   18 return -1 if ($$next_candidate_row->[$index] eq '');
423 0         0 foreach ( @$value ) {
424 0 0       0 return -1 if $$next_candidate_row->[$index] == $_;
425             }
426 0         0 return 0;
427             }
428              
429 4         21 } else {
430             return sub {
431 16     16   25 foreach ( @$value ) {
432 22 100       53 return -1 if $$next_candidate_row->[$index] eq $_;
433             }
434 4         6 return 0;
435             }
436 6         37 }
437              
438             } elsif ($operator eq 'like') {
439             # 'like' is always a string comparison. In addition, we can't know if we're ahead
440             # or behind in the file's ID columns, so the only two return values are 0 and 1
441            
442 18 100       60 return $ALWAYS_FALSE if ($value eq ''); # property like NULL is always false
443              
444             # Convert SQL-type wildcards to Perl-type wildcards
445             # Convert a % to a *, and _ to ., unless they're preceeded by \ to escape them.
446             # Not that this isn't precisely correct, as \\% should really mean a literal \
447             # followed by a wildcard, but we can't be correct in all cases without including
448             # a real parser. This will catch most cases.
449              
450 14         82 $value =~ s/(?
451 14         29 $value =~ s/(?
452 14         109 my $regex = qr($value);
453             return sub {
454 32 100   32   86 return -1 if ($$next_candidate_row->[$index] eq '');
455 8 100       32 if ($$next_candidate_row->[$index] =~ $regex) {
456 2         6 return 0;
457             } else {
458 6         8 return 1;
459             }
460 14         92 };
461              
462             } elsif ($operator eq 'not like') {
463 16 100       53 return $ALWAYS_FALSE if ($value eq ''); # property like NULL is always false
464 12         69 $value =~ s/(?
465 12         29 $value =~ s/(?
466 12         90 my $regex = qr($value);
467             return sub {
468 24 50   24   72 return -1 if ($$next_candidate_row->[$index] eq '');
469 0 0       0 if ($$next_candidate_row->[$index] =~ $regex) {
470 0         0 return 1;
471             } else {
472 0         0 return 0;
473             }
474 12         82 };
475              
476              
477             # FIXME - should we only be testing the numericness of the property?
478             } elsif ($property->is_numeric and $self->_things_in_list_are_numeric([$value])) {
479             # Basic numeric comparisons
480 185 100 33     614 if ($operator eq '=') {
    100          
    100          
    100          
    100          
    50          
    100          
    50          
481             return sub {
482 1470 100   1470   2178 return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
483 1446         1873 return $$next_candidate_row->[$index] <=> $value;
484 130         769 };
485             } elsif ($operator eq '<') {
486             return sub {
487 24 50   24   47 return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
488 0 0       0 $$next_candidate_row->[$index] < $value ? 0 : 1;
489 12         56 };
490             } elsif ($operator eq '<=') {
491             return sub {
492 24 50   24   65 return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
493 0 0       0 $$next_candidate_row->[$index] <= $value ? 0 : 1;
494 12         78 };
495             } elsif ($operator eq '>') {
496             return sub {
497 24 50   24   70 return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
498 0 0       0 $$next_candidate_row->[$index] > $value ? 0 : -1;
499 12         92 };
500             } elsif ($operator eq '>=') {
501             return sub {
502 24 50   24   58 return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
503 0 0       0 $$next_candidate_row->[$index] >= $value ? 0 : -1;
504 12         77 };
505             } elsif ($operator eq 'true') {
506             return sub {
507 0 0   0   0 $$next_candidate_row->[$index] ? 0 : -1;
508 0         0 };
509             } elsif ($operator eq 'false') {
510             return sub {
511 2 50   2   5 $$next_candidate_row->[$index] ? -1 : 0;
512 1         6 };
513             } elsif ($operator eq '!=' or $operator eq 'ne') {
514             return sub {
515 12 50   12   27 return 0 if ($$next_candidate_row->[$index] eq ''); # null always != a number
516 0 0       0 $$next_candidate_row->[$index] != $value ? 0 : -1;
517             }
518 6         29 }
519              
520             } else {
521             # Basic string comparisons
522 62 100 0     254 if ($operator eq '=') {
    100          
    100          
    100          
    100          
    100          
    50          
    0          
523             return sub {
524 120 50 50 120   505 return -1 if ($$next_candidate_row->[$index] eq '' xor $value eq '');
525 120         168 return $$next_candidate_row->[$index] cmp $value;
526 25         168 };
527             } elsif ($operator eq '<') {
528             return sub {
529 16 50   16   30 $$next_candidate_row->[$index] lt $value ? 0 : 1;
530 8         38 };
531             } elsif ($operator eq '<=') {
532             return sub {
533 16 50 33 16   46 return -1 if ($$next_candidate_row->[$index] eq '' or $value eq '');
534 0 0       0 $$next_candidate_row->[$index] le $value ? 0 : 1;
535 8         44 };
536             } elsif ($operator eq '>') {
537             return sub {
538 16 50   16   45 $$next_candidate_row->[$index] gt $value ? 0 : -1;
539 8         54 };
540             } elsif ($operator eq '>=') {
541             return sub {
542 16 50 33 16   74 return -1 if ($$next_candidate_row->[$index] eq '' or $value eq '');
543 0 0       0 $$next_candidate_row->[$index] ge $value ? 0 : -1;
544 8         46 };
545             } elsif ($operator eq 'true') {
546             return sub {
547 8 50   8   17 $$next_candidate_row->[$index] ? 0 : -1;
548 4         29 };
549             } elsif ($operator eq 'false') {
550             return sub {
551 2 50   2   6 $$next_candidate_row->[$index] ? -1 : 0;
552 1         5 };
553             } elsif ($operator eq '!=' or $operator eq 'ne') {
554             return sub {
555 0 0   0   0 $$next_candidate_row->[$index] ne $value ? 0 : -1;
556             }
557 0         0 }
558             }
559             }
560            
561             my $iterator_serial = 0;
562             sub create_iterator_closure_for_rule {
563 372     372 1 381 my($self,$rule) = @_;
564              
565 372         817 my $class_name = $rule->subject_class_name;
566 372         1074 my $class_meta = $class_name->__meta__;
567 372         707 my $rule_template = $rule->template;
568              
569 372         1293 my $csv_column_order_names = $self->column_order;
570 372         1280 my $csv_column_count = scalar @$csv_column_order_names;
571              
572 372         1083 my $operators_for_properties = $rule_template->operators_for_properties();
573 372         814 my $values_for_properties = $rule->legacy_params_hash;
574 372         1053 foreach ( values %$values_for_properties ) {
575 1506 50 66     2826 if (ref eq 'HASH' and exists $_->{'value'}) {
576 188         339 $_ = $_->{'value'};
577             }
578             }
579              
580 372         968 my $sort_order_names = $self->sort_order;
581 372         1069 my %sort_column_names = map { $_ => 1 } @$sort_order_names;
  371         953  
582 372         525 my @non_sort_column_names = grep { ! exists($sort_column_names{$_}) } @$csv_column_order_names;
  1096         1712  
583              
584 372         397 my %column_name_to_index_map;
585 372         882 for (my $i = 0; $i < @$csv_column_order_names; $i++) {
586 1096         1957 $column_name_to_index_map{$csv_column_order_names->[$i]} = $i;
587             }
588              
589             # Index in the split-file-data for each sorted column in order
590 372         460 my @sort_order_column_indexes = map { $column_name_to_index_map{$_} } @$sort_order_names;
  371         769  
591              
592 372         406 my(%property_meta_for_column_name);
593 372         489 foreach my $column_name ( @$csv_column_order_names ) {
594 1096         3445 my $prop = UR::Object::Property->get(class_name => $class_name, column_name => $column_name);
595 1096         1007 our %WARNED_ABOUT_COLUMN;
596 1096 50 33     2348 unless ( $prop or $WARNED_ABOUT_COLUMN{$class_name . '::' . $column_name}++) {
597 0         0 $self->warning_message("Couldn't find a property in class $class_name that goes with column $column_name");
598 0         0 next;
599             }
600 1096         1906 $property_meta_for_column_name{$column_name} = $prop;
601             }
602              
603 372         398 my @rule_columns_in_order; # The order we should perform rule matches on - value is the index in @next_file_row to test
604             my @comparison_for_column; # closures to call to perform the match - same order as @rule_columns_in_order
605 372         393 my $last_sort_column_in_rule = -1; # Last index in @rule_columns_in_order that applies when trying "the shortcut"
606 372         348 my $looking_for_sort_columns = 1;
607              
608 372         405 my $next_candidate_row; # This will be filled in by the closure below
609 372         525 foreach my $column_name ( @$sort_order_names, @non_sort_column_names ) {
610 1096         1215 my $property_meta = $property_meta_for_column_name{$column_name};
611 1096 50       1596 unless ($property_meta) {
612 0         0 Carp::croak("Class $class_name has no property connected to column named '$column_name' in data source ".$self->id);
613             }
614 1096         2240 my $property_name = $property_meta->property_name;
615 1096 100 66     2660 if (! $operators_for_properties->{$property_name}) {
    100          
616 737         623 $looking_for_sort_columns = 0;
617 737         996 next;
618             } elsif ($looking_for_sort_columns && $sort_column_names{$column_name}) {
619 129         165 $last_sort_column_in_rule++;
620             } else {
621             # There's been a gap in the ID column list in the rule, stop looking for
622             # further ID columns
623 230         235 $looking_for_sort_columns = 0;
624             }
625              
626 359         590 push @rule_columns_in_order, $column_name_to_index_map{$column_name};
627            
628 359         482 my $operator = $operators_for_properties->{$property_name};
629 359         436 my $rule_value = $values_for_properties->{$property_name};
630            
631             my $comparison_function = $self->_comparator_for_operator_and_property($property_meta,
632             \$next_candidate_row,
633 359         1275 $column_name_to_index_map{$column_name},
634             $operator,
635             $rule_value);
636 359 50       898 unless ($comparison_function) {
637 0         0 Carp::croak("Unknown operator '$operator' in file data source filter");
638             }
639 359         583 push @comparison_for_column, $comparison_function;
640             }
641              
642 372         942 my $split_regex = $self->_regex();
643              
644             # FIXME - another performance boost might be to do some kind of binary search
645             # against the file to set the initial/next position?
646 372         385 my $file_pos = 0;
647              
648             # search in the offset cache for something helpful
649 372         761 my $offset_cache = $self->_offset_cache();
650              
651             # If the rule doesn't touch the sorted columns, then we can't use the offset cache for help :(
652 372 100       820 if ($last_sort_column_in_rule >= 0) {
653             # Starting at index 1 because we're interested in the file and seek data, not if it's in use
654             # offset 0 is the in-use flag, offset 1 is a ref to the file data and offset 2 is the file seek pos
655             SEARCH_CACHE:
656 129         378 for (my $i = 1; $i < @$offset_cache; $i+=3) {
657 3343 100 66     7703 next unless (defined($offset_cache->[$i]) && defined($offset_cache->[$i+1]));
658              
659 1205         993 $next_candidate_row = $offset_cache->[$i];
660 1205         744 my $matched = 0;
661             COMPARE_VALUES:
662 1205         1551 for (my $c = 0; $c <= $last_sort_column_in_rule; $c++) {
663 1205         1138 my $comparison = $comparison_for_column[$c]->();
664              
665 1205 100       2609 next SEARCH_CACHE if $comparison > 0;
666 15 100       38 if ($comparison < 0) {
667 10         12 $matched = 1;
668 10         16 last COMPARE_VALUES;
669             }
670             }
671             # If we made it this far, then the file data in this slot is earlier in the file
672             # than the data we're looking for. So, if the seek pos data is later than what
673             # we've found yet, use it instead
674 15 100 66     74 if ($matched and $offset_cache->[$i+1] > $file_pos) {
675 10         26 $file_pos = $offset_cache->[$i+1];
676             }
677             }
678             }
679              
680 372         388 my($monitor_start_time,$monitor_printed_first_fetch);
681 372 50       951 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
682 0         0 $monitor_start_time = Time::HiRes::time();
683 0         0 $monitor_printed_first_fetch = 0;
684 0         0 my @filters_list;
685 0         0 for (my $i = 0; $i < @rule_columns_in_order; $i++) {
686 0         0 my $column = $rule_columns_in_order[$i];
687 0         0 my $column_name = $csv_column_order_names->[$column];
688 0 0       0 my $is_sorted = $i <= $last_sort_column_in_rule ? ' (sorted)' : '';
689 0   0     0 my $operator = $operators_for_properties->{$column_name} || '=';
690 0         0 my $rule_value = $values_for_properties->{$column_name};
691 0 0       0 if (ref $rule_value eq 'ARRAY') {
692 0         0 $rule_value = '[' . join(',', @$rule_value) . ']';
693             }
694 0         0 my $filter_string = $column_name . " $operator $rule_value" . $is_sorted;
695 0         0 push @filters_list, $filter_string;
696             }
697 0         0 my $filter_list = join("\n\t", @filters_list);
698 0         0 UR::DBI->sql_fh->printf("\nFILE: %s\nFILTERS %s\n\n", $self->server, $filter_list);
699             }
700              
701 372   100     1001 $self->{'_last_read_serial'} ||= '';
702              
703 372         1046 my $record_separator = $self->record_separator;
704 372         788 my $cache_slot = $self->_allocate_offset_cache_slot();
705 372         389 my $cache_insert_counter = 100; # a "breadcrumb" will be left in the offset cache after this many lines are read
706              
707 372         311 my $lines_read = 0;
708 372         272 my $printed_first_match = 0;
709 372         343 my $lines_matched = 0;
710              
711 372         338 my $fh; # File handle we'll be reading from
712 372         344 my $this_iterator_serial = $iterator_serial++;
713             my $iterator = sub {
714              
715 594 100   594   1065 unless (ref($fh)) {
716 372         1129 $fh = $self->get_default_handle();
717             # Lock the file for reading... For more fine-grained locking we could move this to
718             # after READ_LINE_FROM_FILE: but that would slow down read operations a bit. If
719             # there ends up being a problem with lock contention, go ahead and move it before $line = <$fh>;
720             #flock($fh,LOCK_SH);
721             }
722              
723 594 50 33     1623 if ($monitor_start_time && ! $monitor_printed_first_fetch) {
724 0         0 UR::DBI->sql_fh->printf("FILE: FIRST FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
725 0         0 $monitor_printed_first_fetch = 1;
726             }
727              
728 594 100       1779 if ($self->{'_last_read_serial'} ne $this_iterator_serial) {
729 374 50       929 UR::DBI->sql_fh->printf("FILE: Resetting file position to $file_pos\n") if $ENV{'UR_DBI_MONITOR_SQL'};
730             # The last read was from a different request, reset the position
731 374         1510 $fh->seek($file_pos,0);
732 374 100       3121 if ($file_pos == 0) {
733 365         1001 my $skip = $self->skip_first_line;
734 365         1004 while ($skip-- > 0) {
735 0         0 scalar(<$fh>);
736             }
737             }
738 374         792 $file_pos = $self->_file_position();
739              
740 374         2025 $self->{'_last_read_serial'} = $this_iterator_serial;
741             }
742              
743 594         1777 local $/; # Make sure some wise guy hasn't changed this out from under us
744 594         846 $/ = $record_separator;
745              
746 594         737 my $line;
747             READ_LINE_FROM_FILE:
748 594         1307 until($line) {
749            
750             # Hack for OSX 10.5.
751             # At EOF, the getline below will return undef. Most builds of Perl
752             # will also set $! to 0 at EOF so you can distinguish between the cases
753             # of EOF (which may have actually happened a while ago because of buffering)
754             # and an actual read error. OSX 10.5's Perl does not, and so $!
755             # retains whatever value it had after the last failed syscall, likely
756             # a stat() while looking for a Perl module. This should have no effect
757             # other platforms where you can't trust $! at arbitrary points in time
758             # anyway
759 1095         1696 $! = 0;
760 1095         8610 $line = <$fh>;
761              
762 1095 100       1844 unless (defined $line) {
763 249 50       707 if ($!) {
764 0 0 0     0 redo READ_LINE_FROM_FILE if ($! == EAGAIN or $! == EINTR);
765 0         0 my $pathname = $self->server();
766 0         0 Carp::confess("getline() failed for DataSource $self pathname $pathname boolexpr $rule: $!");
767             }
768              
769             # at EOF. Close up shop and return
770             #flock($fh,LOCK_UN);
771 249         260 $fh = undef;
772              
773 249 50       486 if ($monitor_start_time) {
774 0         0 UR::DBI->sql_fh->printf("FILE: at EOF\nFILE: $lines_read lines read for this request. $lines_matched matches\nFILE: TOTAL EXECUTE-FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
775             }
776              
777 249         892 return;
778             }
779              
780 846         720 $lines_read++;
781 846         810 my $last_read_size = length($line);
782 846         974 chomp $line;
783             # FIXME - to support record-oriented files, we need some replacement for this...
784 846         4124 $next_candidate_row = [ split($split_regex, $line, $csv_column_count) ];
785 846         1321 $#{$a} = $csv_column_count-1;
  846         1209  
786              
787 846         1339 $file_pos = $self->_file_position();
788 846         3095 my $file_pos_before_read = $file_pos - $last_read_size;
789              
790             # Every so many lines read, leave a breadcrumb about what we've seen
791 846 50       1404 unless ($lines_read % $cache_insert_counter) {
792 0         0 $offset_cache->[$cache_slot+1] = $next_candidate_row;
793 0         0 $offset_cache->[$cache_slot+2] = $file_pos_before_read;
794 0         0 $self->_free_offset_cache_slot($cache_slot);
795              
796             # get a new slot
797 0         0 $cache_slot = $self->_allocate_offset_cache_slot();
798 0         0 $offset_cache->[$cache_slot+1] = $next_candidate_row;
799 0         0 $offset_cache->[$cache_slot+2] = $file_pos_before_read;
800              
801 0         0 $cache_insert_counter <<= 2; # Double the insert counter
802             }
803              
804 846         1554 for (my $i = 0; $i < @rule_columns_in_order; $i++) {
805 801         1198 my $comparison = $comparison_for_column[$i]->();
806              
807 801 100 100     2580 if ($comparison > 0 and $i <= $last_sort_column_in_rule) {
    100          
808             # We've gone past the last thing that could possibly match
809              
810 121 50       239 if ($monitor_start_time) {
811 0         0 UR::DBI->sql_fh->printf("FILE: $lines_read lines read for this request. $lines_matched matches\nFILE: TOTAL EXECUTE-FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
812             }
813              
814             #flock($fh,LOCK_UN);
815              
816             # Save the info from the last row we read
817 121         224 $offset_cache->[$cache_slot+1] = $next_candidate_row;
818 121         248 $offset_cache->[$cache_slot+2] = $file_pos_before_read;
819 121         489 return;
820            
821             } elsif ($comparison) {
822             # comparison didn't match, read another line from the file
823 501         710 redo READ_LINE_FROM_FILE;
824             }
825              
826             # That comparison worked... stay in the for() loop for other comparisons
827             }
828             # All the comparisons return '0', meaning they passed
829              
830             # Now see if the offset cache file data is different than the row we just read
831             COMPARE_TO_CACHE:
832 224         425 foreach my $column ( @sort_order_column_indexes) {
833 19     19   103 no warnings 'uninitialized';
  19         26  
  19         26062  
834 223 50       719 if ($offset_cache->[$cache_slot+1]->[$column] ne $next_candidate_row->[$column]) {
835             # They're different. Update the offset cache data
836 223         304 $offset_cache->[$cache_slot+1] = $next_candidate_row;
837 223         315 $offset_cache->[$cache_slot+2] = $file_pos_before_read;
838 223         309 last COMPARE_TO_CACHE;
839             }
840             }
841              
842 224 50 33     1031 if (! $printed_first_match and $monitor_start_time) {
843 0         0 UR::DBI->sql_fh->printf("FILE: First match after reading $lines_read lines\n");
844 0         0 $printed_first_match=1;
845             }
846 224         229 $lines_matched++;
847              
848 224         910 return $next_candidate_row;
849             }
850 372         2408 }; # end sub $iterator
851              
852 372         2380 Sub::Name::subname('UR::DataSource::File::__datasource_iterator(closure)__', $iterator);
853              
854 372   100     1026 my $count = $self->_open_query_count() || 0;
855 372         890 $self->_open_query_count($count+1);
856 372         1171 bless $iterator, 'UR::DataSource::File::Tracker';
857 372         1017 $iterator_data_source{$iterator} = $self;
858 372         646 $iterator_cache_slot_refs{$iterator} = \$cache_slot;
859            
860 372         2815 return $iterator;
861             }
862              
863              
864             sub UR::DataSource::File::Tracker::DESTROY {
865 372     372   348 my $iterator = shift;
866 372         893 my $ds = delete $iterator_data_source{$iterator};
867 372 50       781 return unless $ds; # The data source may have gone out of scope first during global destruction
868              
869 372         672 my $cache_slot_ref = delete $iterator_cache_slot_refs{$iterator};
870 372 50 33     1480 if (defined($cache_slot_ref) and defined($$cache_slot_ref)) {
871             # Mark this slot unused
872             #print STDERR "Freeing cache slot $cache_slot\n";
873             #$ds->_offset_cache->[$$cache_slot_ref] = 0;
874 372         1065 $ds->_free_offset_cache_slot($$cache_slot_ref);
875             }
876              
877 372         938 my $count = $ds->_open_query_count();
878 372         783 $ds->_open_query_count(--$count);
879              
880 372 100       938 return unless ($ds->quick_disconnect);
881 368 100 66     1469 if ($count == 0 && $ds->has_default_handle) {
882             # All open queries have supposedly been fulfilled. Close the
883             # file handle and undef it so get_default_handle() will re-open if necessary
884 366         692 my $fh = $ds->get_default_handle;
885              
886 366 50       844 UR::DBI->sql_fh->printf("FILE: CLOSING fileno ".fileno($fh)."\n") if ($ENV{'UR_DBI_MONITOR_SQL'});
887             #flock($fh,LOCK_UN);
888 366         1114 $fh->close();
889 366         13968 $ds->__invalidate_get_default_handle__;
890             }
891             }
892              
893             # Names of creation params that we should force to be listrefs
894             our %creation_param_is_list = map { $_ => 1 } qw( column_order file_list sort_order constant_values );
895             sub create_from_inline_class_data {
896 3     3 1 5 my($class, $class_data, $ds_data) = @_;
897              
898             # User didn't specify columns in the file. Assumme every property is a column, and in the same order
899 3 100       11 unless (exists $ds_data->{'column_order'}) {
900 1         299 Carp::croak "data_source has no column_order specified";
901             }
902              
903 2   33     14 $ds_data->{'server'} ||= $ds_data->{'path'} || $ds_data->{'file'};
      66        
904              
905 2         2 my %ds_creation_params;
906 2         5 foreach my $param ( qw( delimiter record_separator column_order skip_first_line server file_list sort_order constant_values ) ) {
907 16 100       27 if (exists $ds_data->{$param}) {
908 7 50 66     28 if ($creation_param_is_list{$param} and ref($ds_data->{$param}) ne 'ARRAY') {
909 0         0 $ds_creation_params{$param} = \( $ds_data->{$param} );
910             } else {
911 7         10 $ds_creation_params{$param} = $ds_data->{$param};
912             }
913             }
914             }
915            
916 2         15 my($namespace, $class_name) = ($class_data->{'class_name'} =~ m/^(\w+?)::(.*)/);
917 2         5 my $ds_id = "${namespace}::DataSource::${class_name}";
918 2         4 my $ds_type = delete $ds_data->{'is'};
919 2         13 my $ds = $ds_type->create( %ds_creation_params, id => $ds_id );
920 2         9 return $ds;
921             }
922              
923              
924             # The string used to join fields of a row together
925             #
926             # Since the 'delimiter' property is interpreted as a regex in the reading
927             # code, we'll try to be smart about making a real string from that.
928             #
929             # subclasses can override this to provide a different implementation
930             sub join_pattern {
931 4     4 0 6 my $self = shift;
932              
933 4         11 my $join_pattern = $self->delimiter;
934              
935             # make some common substitutions...
936 4 50       11 if ($join_pattern eq '\s*,\s*') {
937             # The default...
938 0         0 return ', ';
939             }
940              
941 4         9 $join_pattern =~ s/\\s*//g; # Turn 0-or-more whitespaces to nothing
942 4         4 $join_pattern =~ s/\\t/\t/; # tab
943 4         6 $join_pattern =~ s/\\s/ /; # whitespace
944            
945 4         5 return $join_pattern;
946             }
947              
948              
949              
950             sub _sync_database {
951 4     4   5 my $self = shift;
952 4         8 my %params = @_;
953              
954 4 50       11 unless (ref($self)) {
955 0 0       0 if ($self->isa("UR::Singleton")) {
956 0         0 $self = $self->_singleton_object;
957             }
958             else {
959 0         0 die "Called as a class-method on a non-singleton datasource!";
960             }
961             }
962              
963 4         15 my $read_fh = $self->get_default_handle();
964 4 50       9 unless ($read_fh) {
965 0         0 Carp::croak($self->class . ": Can't _sync_database(): Can't open file " . $self->server . " for reading: $!");
966             }
967              
968 4         10 my $original_data_file = $self->server;
969 4         152 my $original_data_dir = File::Basename::dirname($original_data_file);
970 4         7 my $use_quick_rename;
971 4 50       46 unless (-d $original_data_dir){
972 0         0 File::Path::mkpath($original_data_dir);
973             }
974 4 50       29 if (-w $original_data_dir) {
    0          
975 4         5 $use_quick_rename = 1; # We can write to the data dir
976             } elsif (! -w $original_data_file) {
977 0         0 $self->error_message("Neither the directory nor the file for $original_data_file are writable - cannot sync_database");
978 0         0 return;
979             }
980              
981              
982 4         12 my $split_regex = $self->_regex();
983 4         24 my $join_pattern = $self->join_pattern;
984 4         11 my $record_separator = $self->record_separator;
985 4         13 local $/; # Make sure some wise guy hasn't changed this out from under us
986 4         6 $/ = $record_separator;
987              
988 4         9 my $csv_column_order_names = $self->column_order;
989 4         9 my $csv_column_count = scalar(@$csv_column_order_names);
990 4         3 my %column_name_to_index_map;
991 4         14 for (my $i = 0; $i < @$csv_column_order_names; $i++) {
992 12         27 $column_name_to_index_map{$csv_column_order_names->[$i]} = $i;
993             }
994              
995 4         7 my $changed_objects = delete $params{changed_objects};
996              
997              
998             # We're going to assumme all the passed-in objects are of the same class *gulp*
999 4         14 my $class_name = $changed_objects->[0]->class;
1000 4         29 my $class_meta = UR::Object::Type->get(class_name => $class_name);
1001 15         22 my %column_name_to_property_meta = map { $_->column_name => $_ }
1002 4         35 grep { $_->column_name }
  19         29  
1003             $class_meta->all_property_metas;
1004 4         9 my @property_names_in_column_order;
1005 4         9 foreach my $column_name ( @$csv_column_order_names ) {
1006 12         13 my $prop_meta = $column_name_to_property_meta{$column_name};
1007 12 50       20 unless ($prop_meta) {
1008 0         0 die "Data source " . $self->class . " id " . $self->id .
1009             " could not resolve a $class_name property for the data source's column named $column_name";
1010             }
1011              
1012 12         19 push @property_names_in_column_order, $prop_meta->property_name;
1013             }
1014              
1015 4         6 my $insert = [];
1016 4         6 my $update = {};
1017 4         5 my $delete = {};
1018 4         7 foreach my $obj ( @$changed_objects ) {
1019 16 100       50 if ($obj->isa('UR::Object::Ghost')) {
    100          
1020             # This should be removed from the file
1021 4         6 my $original = $obj->{'db_committed'};
1022 4         4 my $line = join($join_pattern, @{$original}{@property_names_in_column_order}) . $record_separator;
  4         12  
1023 4         7 $delete->{$line} = $obj;
1024              
1025             } elsif ($obj->{'db_committed'}) {
1026             # This object is changed since it was read in the file
1027 4         6 my $original = $obj->{'db_committed'};
1028 4         4 my $original_line = join($join_pattern, @{$original}{@property_names_in_column_order}) . $record_separator;
  4         13  
1029 4         6 my $changed_line = join($join_pattern, @{$obj}{@property_names_in_column_order}) . $record_separator;
  4         7  
1030 4         9 $update->{$original_line} = $changed_line;
1031            
1032             } else {
1033             # This object is new and should be added to the file
1034 8         15 push @$insert, [ @{$obj}{@property_names_in_column_order} ];
  8         17  
1035             }
1036             }
1037              
1038 4         11 my $sort_order_names = $self->sort_order;
1039 4         14 foreach my $sort_column_name ( @$sort_order_names ) {
1040 4 50       12 unless (exists $column_name_to_index_map{$sort_column_name}) {
1041 0         0 Carp::croak("Column name '$sort_column_name' appears in the sort_order list, but not in the column_order list for data source ".$self->id);
1042             }
1043             }
1044 4         6 my $file_is_sorted = scalar(@$sort_order_names);
1045 4         7 my %column_sorts_numerically = map { $_->column_name => $_->is_numeric }
  15         28  
1046             values %column_name_to_property_meta;
1047             my $row_sort_sub = sub ($$) {
1048 20     20   15 my $comparison;
1049              
1050 20         18 foreach my $column_name ( @$sort_order_names ) {
1051 20         21 my $i = $column_name_to_index_map{$column_name};
1052 20 50       24 if ($column_sorts_numerically{$column_name}) {
1053 20         25 $comparison = $_[0]->[$i] <=> $_[1]->[$i];
1054             } else {
1055 0         0 $comparison = $_[0]->[$i] cmp $_[1]->[$i];
1056             }
1057 20 50       39 return $comparison if $comparison != 0;
1058             }
1059 0         0 return 0;
1060 4         21 };
1061 4 50 33     25 if ($sort_order_names && $file_is_sorted && scalar(@$insert)) {
      50        
1062             # the inserted things should be sorted the same way as the file
1063 4         9 my @sorted = sort $row_sort_sub @$insert;
1064 4         6 $insert = \@sorted;
1065             }
1066              
1067 4         6 my $write_fh;
1068             my $temp_file_name;
1069 4 50       7 if ($use_quick_rename) {
1070 4         28 $temp_file_name = sprintf("%s/.%d.%d" , $original_data_dir, time(), $$);
1071 4         21 $write_fh = IO::File->new($temp_file_name, O_WRONLY|O_CREAT);
1072             } else {
1073 0         0 $write_fh = File::Temp->new(UNLINK => 1);
1074 0 0       0 $temp_file_name = $write_fh->filename if ($write_fh);
1075             }
1076 4 50       504 unless ($write_fh) {
1077 0         0 Carp::croak "Can't create temporary file for writing: $!";
1078             }
1079              
1080 4         6 my $monitor_start_time;
1081 4 50       18 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1082 0         0 $monitor_start_time = Time::HiRes::time();
1083 0         0 my $time = time();
1084 0         0 UR::DBI->sql_fh->printf("\nFILE: SYNC_DATABASE AT %d [%s]. Started transaction for %s to temp file %s\n",
1085             $time, scalar(localtime($time)), $original_data_file, $temp_file_name);
1086              
1087             }
1088              
1089 4 50       29 unless (flock($read_fh,LOCK_SH)) {
1090 0 0       0 unless ($! == EOPNOTSUPP ) {
1091 0         0 Carp::croak($self->class(). ": Can't get exclusive lock for file ".$self->server.": $!");
1092             }
1093             }
1094              
1095             # write headers to the new file
1096 4         17 for (my $i = 0; $i < $self->skip_first_line; $i++) {
1097 0         0 my $line = <$read_fh>;
1098 0         0 $write_fh->print($line);
1099             }
1100              
1101            
1102 4         6 my $line;
1103             READ_A_LINE:
1104 4         5 while(1) {
1105 20 100       26 unless ($line) {
1106 15         90 $line = <$read_fh>;
1107 15 100       25 last unless defined $line;
1108             }
1109              
1110 16 100 50     38 if ($file_is_sorted && scalar(@$insert)) {
1111             # there are sorted things waiting to insert
1112 15         16 my $chomped = $line;
1113 15         14 chomp $chomped;
1114 15         51 my $row = [ split($split_regex, $chomped, $csv_column_count) ];
1115 15         23 my $comparison = $row_sort_sub->($row, $insert->[0]);
1116 15 100       29 if ($comparison > 0) {
1117             # write the object's data
1118 19     19   104 no warnings 'uninitialized'; # Some of the object's data may be undef
  19         32  
  19         3899  
1119 5         7 my $new_row = shift @$insert;
1120 5         12 my $new_line = join($join_pattern, @$new_row) . $record_separator;
1121              
1122 5 50       10 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1123 0         0 UR::DBI->sql_fh->print("INSERT >>$new_line<<\n");
1124             }
1125              
1126 5         13 $write_fh->print($new_line);
1127             # Don't undef the last line read, meaning it could still be written to the output...
1128 5         35 next READ_A_LINE;
1129             }
1130             }
1131              
1132 11 100       32 if (my $obj = delete $delete->{$line}) {
    100          
1133 2 50       8 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1134 0         0 UR::DBI->sql_fh->print("DELETE >>$line<<\n");
1135             }
1136 2         2 $line = undef;
1137 2         4 next;
1138            
1139             } elsif (my $changed = delete $update->{$line}) {
1140 4 50       11 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1141 0         0 UR::DBI->sql_fh->print("UPDATE replace >>$line<< with >>$changed<<\n");
1142             }
1143 4         8 $write_fh->print($changed);
1144 4         15 $line = undef;
1145 4         5 next;
1146            
1147             } else {
1148             # This line from the file was unchanged in the app
1149 5         10 $write_fh->print($line);
1150 5         24 $line = undef;
1151             }
1152             }
1153              
1154 4 50       12 if (keys %$delete) {
1155 0         0 $self->warning_message("There were ",scalar(keys %$delete)," deleted $class_name objects that did not match data in the file");
1156             }
1157 4 50       14 if (keys %$update) {
1158 0         0 $self->warning_message("There were ",scalar(keys %$update)," updated $class_name objects that did not match data in the file");
1159             }
1160              
1161             # finish out by writing the rest of the new data
1162 4         7 foreach my $new_row ( @$insert ) {
1163 19     19   89 no warnings 'uninitialized'; # Some of the object's data may be undef
  19         32  
  19         8317  
1164 3         11 my $new_line = join($join_pattern, @$new_row) . $record_separator;
1165 3 50       9 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1166 0         0 UR::DBI->sql_fh->print("INSERT >>$new_line<<\n");
1167             }
1168 3         7 $write_fh->print($new_line);
1169             }
1170 4         28 $write_fh->close();
1171            
1172 4 50       253 if ($use_quick_rename) {
1173 4 50       13 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1174 0         0 UR::DBI->sql_fh->print("FILE: COMMIT rename $temp_file_name over $original_data_file\n");
1175             }
1176              
1177 4 50       195 unless(rename($temp_file_name, $original_data_file)) {
1178 0         0 $self->error_message("Can't rename the temp file over the original file: $!");
1179 0         0 return;
1180             }
1181             } else {
1182             # We have to copy the data from the temp file to the original file
1183              
1184 0 0       0 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1185 0         0 UR::DBI->sql_fh->print("FILE: COMMIT write over $original_data_file in place\n");
1186             }
1187 0         0 my $new_write_fh = IO::File->new($original_data_file, O_WRONLY|O_TRUNC);
1188 0 0       0 unless ($new_write_fh) {
1189 0         0 $self->error_message("Can't open $original_data_file for writing: $!");
1190 0         0 return;
1191             }
1192              
1193 0         0 my $temp_file_fh = IO::File->new($temp_file_name);
1194 0 0       0 unless ($temp_file_fh) {
1195 0         0 $self->error_message("Can't open $temp_file_name for reading: $!");
1196 0         0 return;
1197             }
1198            
1199 0         0 while(<$temp_file_fh>) {
1200 0         0 $new_write_fh->print($_);
1201             }
1202            
1203 0         0 $new_write_fh->close();
1204             }
1205              
1206             # Because of the rename/copy process during syncing, the previously opened filehandle may
1207             # not be valid anymore. get_default_handle will reopen the file next time it's needed
1208 4         27 $self->_invalidate_cache();
1209 4         33 $self->__invalidate_get_default_handle__;
1210              
1211 4 50       9 if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1212 0         0 UR::DBI->sql_fh->printf("FILE: TOTAL COMMIT TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
1213             }
1214              
1215 4         32 flock($read_fh, LOCK_UN);
1216 4         11 $read_fh->close();
1217              
1218             # FIXME - this is ugly... With RDBMS-type data sources, they will call $dbh->commit() which
1219             # gets to UR::DBI->commit(), which calls _set_object_saved_committed for them. Since we're
1220             # not using DBI we have to do this 2-part thing ourselves. In the future, we might break
1221             # out things so the saving to the temp file goes in _sync_database(), and moving the temp
1222             # file over the original goes in commit()
1223 4 50       2092 unless ($self->_set_specified_objects_saved_uncommitted($changed_objects)) {
1224 0         0 Carp::croak("Error setting objects to a saved state after sync_database. Exiting.");
1225 0         0 return;
1226             }
1227              
1228 4         47 $self->_set_specified_objects_saved_committed($changed_objects);
1229 4         73 return 1;
1230             }
1231              
1232            
1233              
1234             sub initializer_should_create_column_name_for_class_properties {
1235 28     28 0 353 1;
1236             }
1237              
1238              
1239             1;
1240              
1241             =pod
1242              
1243             =head1 NAME
1244              
1245             UR::DataSource::File - Parent class for file-based data sources
1246              
1247             =head1 DEPRECATED
1248              
1249             This module is deprecated. Use UR::DataSource::Filesystem instead.
1250              
1251             =head1 SYNOPSIS
1252            
1253             package MyNamespace::DataSource::MyFile;
1254             class MyNamespace::DataSource::MyFile {
1255             is => ['UR::DataSource::File', 'UR::Singleton'],
1256             };
1257             sub server { '/path/to/file' }
1258             sub delimiter { "\t" }
1259             sub column_order { ['thing_id', 'thing_name', 'thing_color' ] }
1260             sub sort_order { ['thing_id'] }
1261              
1262             package main;
1263             class MyNamespace::Thing {
1264             id_by => 'thing_id',
1265             has => [ 'thing_id', 'thing_name', 'thing_color' ],
1266             data_source => 'MyNamespace::DataSource::MyFile',
1267             }
1268             my @objs = MyNamespace::Thing->get(thing_name => 'Bob');
1269              
1270             =head1 DESCRIPTION
1271              
1272             Classes which wish to retrieve their data from a regular file can use a UR::DataSource::File-based
1273             data source. The modules implementing these data sources live under the DataSource subdirectory
1274             of the application's Namespace, by convention. Besides defining a class for your data source
1275             inheriting from UR::DataSource::File, it should have the following methods, either as properties
1276             or functions in the package.
1277              
1278             =head2 Configuration
1279              
1280             These methods determine the configuration for your data source.
1281              
1282             =over 4
1283              
1284             =item server()
1285              
1286             server() should return a string representing the pathname of the file where the data is stored.
1287              
1288             =item file_list()
1289              
1290             The file_list() method should return a listref of pathnames to one or more identical files
1291             where data is stored. Use file_list() instead of server() when you want to load-balance several NFS
1292             servers, for example.
1293              
1294             You must have either server() or file_list() in your module, but not both. The existence of server()
1295             takes precedence over file_list().
1296              
1297             =item delimiter()
1298              
1299             delimiter() should return a string representing how the fields in each record are split into
1300             columns. This string is interpreted as a regex internally. The default delimiter is "\s*,\s*"
1301             meaning that the file is separated by commas.
1302              
1303             =item record_separator()
1304              
1305             record_separator() should return a string that gets stored in $/ before getline() is called on the
1306             file's filehandle. The default record_separator() is "\n" meaning that the file's records are
1307             separated by newlines.
1308              
1309             =item skip_first_line()
1310              
1311             skip_first_line() should return a boolean value. If true, the first line of the file is ignored, for
1312             example if the first line defines the columns in the file.
1313              
1314             =item column_order()
1315              
1316             column_order() should return a listref of column names in the file. column_order is required; there
1317             is no default.
1318              
1319             =item sort_order()
1320              
1321             If the data file is sorted in some way, sort_order() should return a listref of column names (which must
1322             exist in column_order()) by which the file is sorted. This gives the system a hint about how the file
1323             is structured, and is able to make shortcuts when reading the file to speed up data access. The default
1324             is to assumme the file is not sorted.
1325              
1326             =back
1327              
1328             =head1 INHERITANCE
1329              
1330             UR::DataSource
1331              
1332             =head1 SEE ALSO
1333              
1334             UR, UR::DataSource
1335              
1336             =cut