File Coverage

lib/UR/DataSource/File.pm

Criterion	Covered	Total	%
statement	510	658	77.5
branch	213	362	58.8
condition	53	101	52.4
subroutine	56	59	94.9
pod	3	10	30.0
total	835	1190	70.1

line	stmt	bran	cond	sub	pod	time	code
1							package UR::DataSource::File;
2
3							# NOTE! This module is deprecated. Use UR::DataSource::Filesystem instead.
4
5							# A data source implementation for text files where the fields
6							# are delimited by commas (or anything else really). Usually,
7							# the lines in the file will be sorted by one or more columns,
8							# but it isn't strictly necessary
9							#
10							# For now, it's structured around files where the record is delimited by
11							# newlines, and the fields are delimited by qr(\s,\s). Those are
12							# overridable in concrete data sources by specifying record_seperator() and
13							# delimiter().
14							# FIXME - work out a way to support record-oriented data as well as line-oriented data
15
16	19			19		535	use UR;
	19					26
	19					123
17	19			19		69	use strict;
	19					26
	19					328
18	19			19		63	use warnings;
	19					24
	19					851
19							our $VERSION = "0.46"; # UR $VERSION;
20
21	19			19		72	use Fcntl qw(:DEFAULT :flock);
	19					20
	19					7662
22	19			19		94	use Errno qw(EINTR EAGAIN EOPNOTSUPP);
	19					27
	19					2308
23	19			19		77	use File::Temp;
	19					30
	19					1116
24	19			19		79	use File::Basename;
	19					31
	19					847
25	19			19		73	use IO::File qw();
	19					22
	19					24865
26
27							our @CARP_NOT = qw( UR::Context UR::DataSource::FileMux UR::Object::Type );
28
29							class UR::DataSource::File {
30							is => ['UR::DataSource'],
31							has => [
32							delimiter => { is => 'String', default_value => '\s,\s', doc => 'Delimiter between columns on the same line' },
33							record_separator => { is => 'String', default_value => "\n", doc => 'Delimiter between lines in the file' },
34							column_order => { is => 'ARRAY', doc => 'Names of the columns in the file, in order' },
35							skip_first_line => { is => 'Integer', default_value => 0, doc => 'Number of lines at the start of the file to skip' },
36							handle_class => { is => 'String', default_value => 'IO::File', doc => 'Class to use for new file handles' },
37							quick_disconnect => { is => 'Boolean', default_value => 1, doc => 'Do not hold the file handle open between requests' },
38							],
39							has_optional => [
40							server => { is => 'String', doc => 'pathname to the data file' },
41							file_list => { is => 'ARRAY', doc => 'list of pathnames of equivalent files' },
42							sort_order => { is => 'ARRAY', doc => 'Names of the columns by which the data file is sorted' },
43							constant_values => { is => 'ARRAY', doc => 'Property names which are not in the data file(s), but are part of the objects loaded from the data source' },
44
45
46							# REMOVE
47							#file_cache_index => { is => 'Integer', doc => 'index into the file cache where the next read will be placed' },
48							_open_query_count => { is => 'Integer', doc => 'number of queries currently using this data source, used internally' },
49
50							],
51							doc => 'A data source for line-oriented files',
52							};
53
54
55	10			10	0	23	sub can_savepoint { 0;} # Doesn't support savepoints
56
57							sub create_default_handle {
58	375			375	0	416	my $self = shift;
59
60	375	50				1113	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
61	0					0	my $time = time();
62	0					0	UR::DBI->sql_fh->printf("\nFILE OPEN AT %d [%s]\n",$time, scalar(localtime($time)));
63							}
64
65	375					1459	my $filename = $self->server;
66	375	100				11748	unless (-e $filename) {
67							# file doesn't exist
68	2					6	$filename = '/dev/null';
69							}
70
71	375					1174	my $handle_class = $self->handle_class;
72	375					2584	my $fh = $handle_class->new($filename);
73	375	50				27407	unless($fh) {
74	0					0	$self->error_message("Can't open ".$self->server." for reading: $!");
75	0					0	return;
76							}
77
78	375	50				1027	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
79	0					0	UR::DBI->sql_fh->printf("FILE: opened %s fileno %d\n\n",$self->server, $fh->fileno);
80							}
81
82	375					1426	$self->is_connected(1);
83	375					793	return $fh;
84							}
85
86							sub disconnect {
87	2			2	0	6	my $self = shift;
88
89	2	50				4	if ($self->has_default_handle) {
90	2					6	my $fh = $self->get_default_handle;
91	2					26	flock($fh,LOCK_UN);
92	2					14	$fh->close();
93	2					192	$self->__invalidate_get_default_handle__;
94	2					8	$self->is_connected(0);
95							}
96							}
97
98							sub _file_position {
99	1222			1222		1005	my $self = shift;
100	1222					2479	my $fh = $self->get_default_handle;
101	1222	50				3391	return $fh ? $fh->tell() : undef;
102							}
103
104							sub prepare_for_fork {
105	2			2	0	4	my $self = shift;
106
107							# make sure this is clear before we fork
108	2					6	$self->{'_fh_position'} = undef;
109	2	50				8	if ($self->has_default_handle) {
110	2					10	$self->{'_fh_position'} = $self->_file_position();
111	2	50				24	UR::DBI->sql_fh->printf("FILE: preparing to fork; closing file %s and noting position at %s\n",$self->server, $self->{'_fh_position'}) if $ENV{'UR_DBI_MONITOR_SQL'};
112							}
113	2					36	$self->disconnect_default_handle;
114							}
115
116							sub finish_up_after_fork {
117	2			2	0	19	my $self = shift;
118	2	50				29	if (defined $self->{'_fh_position'}) {
119	2	50				27	UR::DBI->sql_fh->printf("FILE: resetting after fork; reopening file %s and fast-forwarding to %s\n",$self->server, $self->{'_fh_position'}) if $ENV{'UR_DBI_MONITOR_SQL'};
120	2					30	my $fh = $self->get_default_handle;
121	2					24	$fh->seek($self->{'_fh_position'},0);
122							}
123							}
124
125							sub _regex {
126	376			376		386	my $self = shift;
127
128	376	100				778	unless ($self->{'_regex'}) {
129	26					96	my $delimiter = $self->delimiter;
130	26					49	my $r = eval { qr($delimiter) };
	26					464
131	26	50	33			158	if ($@ \|\| !$r) {
132	0					0	$self->error_message("Unable to interepret delimiter '".$self->delimiter.": $@");
133	0					0	return;
134							}
135	26					68	$self->{'_regex'} = $r;
136							}
137	376					514	return $self->{'_regex'};
138							}
139
140							# We're overriding server() so everyone else can have a single way of getting
141							# the file's pathname instead of having to know about both server and file_list
142							sub server {
143	45			45	1	51	my $self = shift;
144
145	45	100				95	unless ($self->{'_cached_server'}) {
146	16	100				65	if ($self->__server()) {
		50
147	15					39	$self->{'_cached_server'} = $self->__server();
148							} elsif ($self->file_list) {
149	1					2	my $files = $self->file_list;
150	1					2	my $count = scalar(@$files);
151	1					3	my $idx = $$ % $count;
152	1					3	$self->{'_cached_server'} = $files->[$idx];
153							} else {
154	0					0	die "Data source ",$self->id," didn't specify either server or file_list";
155							}
156							}
157	45					81	return $self->{'_cached_server'};
158							}
159
160
161							# Should be divisible by 3
162							our $MAX_CACHE_SIZE = 99;
163							# The offset cache is an arrayref containing three pieces of data:
164							# 0: If this cache slot is being used by a loading iterator
165							# 1: concatenated data from the sorted columns for comparison with where you are in the file
166							# 2: the seek position that line came from
167							sub _offset_cache {
168	1116			1116		974	my $self = shift;
169
170	1116	100				1896	unless ($self->{'_offset_cache'}) {
171	25					58	$self->{'_offset_cache'} = [];
172							}
173	1116					1181	return $self->{'_offset_cache'};
174							}
175
176							our %iterator_data_source;
177							our %iterator_cache_slot_refs;
178
179							sub _allocate_offset_cache_slot {
180	372			372		381	my $self = shift;
181
182	372					620	my $cache = $self->_offset_cache();
183	372					454	my $next = scalar(@$cache);
184							#print STDERR "_allocate_offset_cache_slot ".$self->server." current size is $next ";
185	372	100				724	if ($next > $MAX_CACHE_SIZE) {
186							#print STDERR "searching... \n";
187	282					493	my $last_offset_cache_slot = $self->{'_last_offset_cache_slot'};
188	282	100				552	if ($last_offset_cache_slot >= $MAX_CACHE_SIZE) {
189	9					15	$next = 0;
190							} else {
191	273					385	$next = $last_offset_cache_slot + 3;
192							}
193							# Search for an unused slot
194	282		33			869	while ($cache->[$next] and $next != $last_offset_cache_slot) {
195	0					0	$next += 3;
196	0	0				0	$next = 0 if ($next > $MAX_CACHE_SIZE);
197							}
198	282	50	33			1565	if ($next > $MAX_CACHE_SIZE or $next eq $last_offset_cache_slot) {
199							#print STDERR scalar(keys(%iterator_data_source))." items in iterator_data_source ".scalar(keys(%iterator_cache_slot))." in iterator_cache_slot\n";
200	0					0	Carp::carp("Unable to find an open file offset cache slot because there are too many outstanding loading iterators. Temporarily expanding the cache...");
201							# We'll let it go ahead and expand the list
202	0					0	$next = $MAX_CACHE_SIZE;
203	0					0	$MAX_CACHE_SIZE += 3;
204							}
205							}
206	372					423	$cache->[$next] = 1;
207	372					500	$cache->[$next+1] = undef;
208	372					489	$cache->[$next+2] = undef;
209
210	372					436	$self->{'_last_offset_cache_slot'} = $next;
211							#print STDERR "using slot $next current size ".scalar(@$cache)."\n";
212	372					550	return $next;
213							}
214
215
216							sub _free_offset_cache_slot {
217	372			372		492	my($self, $cache_slot) = @_;
218
219	372					671	my $cache = $self->_offset_cache();
220	372	50				767	unless ($cache_slot < scalar(@$cache)) {
221	0					0	$self->warning_message("Freeing offset cache slot past the end. Current size ".scalar(@$cache).", requested $cache_slot");
222	0					0	return;
223							}
224
225	372	50				804	unless (defined $cache->[$cache_slot]) {
226	0					0	$self->warning_message("Freeing unused offset cache slot $cache_slot");
227	0					0	return;
228							}
229
230	372	50	66			1018	if ($cache->[$cache_slot+1] and scalar(@{$cache->[$cache_slot+1]}) == 0) {
	169					489
231							# There's no data in here. Must have happened when the reader went all the
232							# way to the end of the file and found nothing. Remove this entry completely
233							# because it's not helpful
234	0					0	splice(@$cache, $cache_slot,3);
235
236							} else {
237							# There is data in here, mark it as a free slot
238	372					422	$cache->[$cache_slot] = 0;
239							}
240	372					524	return 1;
241							}
242
243
244
245							sub _invalidate_cache {
246	4			4		6	my $self = shift;
247
248	4					9	$self->{'_offset_cache'} = [];
249
250	4					15	return 1;
251							}
252
253							sub _generate_loading_templates_arrayref {
254	49			49		85	my($self,$old_sql_cols) = @_;
255
256	49					215	my $columns_in_file = $self->column_order;
257	49					149	my %column_to_position_map;
258	49					152	for (my $i = 0; $i < @$columns_in_file; $i++) {
259	136					330	$column_to_position_map{$columns_in_file->[$i]} = $i;
260							}
261
262							# strip out columns that don't exist in the file
263	49					57	my $sql_cols;
264	49					102	foreach my $column_data ( @$old_sql_cols ) {
265	212					342	my $propertys_column_name = $column_data->[1]->column_name;
266	212	100	66			567	next unless ($propertys_column_name and exists($column_to_position_map{$propertys_column_name}));
267
268	136					185	push @$sql_cols, $column_data;
269							}
270
271	49	50				136	unless ($sql_cols) {
272	0					0	$self->error_message("Couldn't determine column information for data source " . $self->id);
273	0					0	return;
274							}
275
276							# reorder the requested columns to be in the same order as the file
277							my @sql_cols_with_column_name =
278	49					87	map{ [ $column_to_position_map{ $_->[1]->column_name }, $_ ] }
	136					254
279							@$sql_cols;
280							my @sorted_sql_cols =
281	136					182	map { $_->[1] }
282	49					227	sort { $a->[0] <=> $b->[0] }
	125					205
283							@sql_cols_with_column_name;
284	49					89	$sql_cols = \@sorted_sql_cols;
285	49					317	my $templates = $self->SUPER::_generate_loading_templates_arrayref($sql_cols);
286
287	49	100				241	if (my $constant_values = $self->constant_values) {
288							# Find the first unused index in the loading template
289	12					37	my $next_template_slot = -1;
290	12					23	foreach my $tmpl ( @$templates ) {
291	12					11	foreach my $col ( @{$tmpl->{'column_positions'}} ) {
	12					19
292	36	50				49	if ($col >= $next_template_slot) {
293	36					39	$next_template_slot = $col + 1;
294							}
295							}
296							}
297	12	50				30	if ($next_template_slot == -1) {
298	0					0	die "Couldn't determine last column in loading template for data source" . $self->id;
299							}
300
301	12					19	foreach my $prop ( @$constant_values ) {
302	18					16	push @{$templates->[0]->{'column_positions'}}, $next_template_slot++;
	18					29
303	18					15	push @{$templates->[0]->{'property_names'}}, $prop;
	18					30
304							}
305							}
306
307	49					220	return $templates;
308							}
309
310							sub _things_in_list_are_numeric {
311	260			260		258	my $self = shift;
312
313	260					241	foreach ( @{$_[0]} ) {
	260					493
314	276	100				1125	return 0 if (! Scalar::Util::looks_like_number($_));
315							}
316	213					715	return 1;
317							}
318
319							# Construct a closure to perform a test on the $index-th column of
320							# @$$next_candidate_row. The closures return 0 is the test is successful,
321							# -1 if unsuccessful but the file's value was less than $value, and 1
322							# if unsuccessful and greater. The iterator that churns throug the file
323							# knows that if it's comparing an ID/sorted column, and the comparator
324							# returns 1 then we've gone past the point where we can expect to ever
325							# find another successful match and we should stop looking
326							my $ALWAYS_FALSE = sub { -1 };
327							sub _comparator_for_operator_and_property {
328	359			359		618	my($self,$property,$next_candidate_row, $index, $operator,$value) = @_;
329
330	19			19		96	no warnings 'uninitialized'; # we're handling ''/undef/null specially below where it matters
	19					23
	19					39244
331
332	359	100	100			2629	if ($operator eq 'between') {
		100
		100
		100
		100
		100
333	44	100	100			163	if ($value->[0] eq '' or $value->[1] eq '') {
334	28					40	return $ALWAYS_FALSE;
335							}
336
337	16	50	33			38	if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
338	16	50				31	if ($value->[0] > $value->[1]) {
339							# Will never be true
340	0					0	Carp::carp "'between' comparison will never be true with values ".$value->[0]," and ".$value->[1];
341	0					0	return $ALWAYS_FALSE;
342							}
343
344							# numeric 'between' comparison
345							return sub {
346	32	50		32		67	return -1 if ($$next_candidate_row->[$index] eq '');
347	0	0				0	if ($$next_candidate_row->[$index] < $value->[0]) {
		0
348	0					0	return -1;
349							} elsif ($$next_candidate_row->[$index] > $value->[1]) {
350	0					0	return 1;
351							} else {
352	0					0	return 0;
353							}
354	16					77	};
355							} else {
356	0	0				0	if ($value->[0] gt $value->[1]) {
357	0					0	Carp::carp "'between' comparison will never be true with values ".$value->[0]," and ".$value->[1];
358	0					0	return $ALWAYS_FALSE;
359							}
360
361							# A string 'between' comparison
362							return sub {
363	0	0		0		0	return -1 if ($$next_candidate_row->[$index] eq '');
364	0	0				0	if ($$next_candidate_row->[$index] lt $value->[0]) {
		0
365	0					0	return -1;
366							} elsif ($$next_candidate_row->[$index] gt $value->[1]) {
367	0					0	return 1;
368							} else {
369	0					0	return 0;
370							}
371	0					0	};
372							}
373
374							} elsif ($operator eq 'in') {
375	20	100				50	if (! @$value) {
376	8					14	return $ALWAYS_FALSE;
377							}
378
379	12	100	66			51	if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
380							# Numeric 'in' comparison returns undef if we're within the range of the list
381							# but don't actually match any of the items in the list
382	8					27	@$value = sort { $a <=> $b } @$value; # sort the values first
	0					0
383							return sub {
384	16	50		16		40	return -1 if ($$next_candidate_row->[$index] eq '');
385	0	0				0	if ($$next_candidate_row->[$index] < $value->[0]) {
		0
386	0					0	return -1;
387							} elsif ($$next_candidate_row->[$index] > $value->[-1]) {
388	0					0	return 1;
389							} else {
390	0					0	foreach ( @$value ) {
391	0	0				0	return 0 if $$next_candidate_row->[$index] == $_;
392							}
393	0					0	return -1;
394							}
395	8					51	};
396
397							} else {
398							# A string 'in' comparison
399	4					13	@$value = sort { $a cmp $b } @$value;
	0					0
400							return sub {
401	8	50		8		29	if ($$next_candidate_row->[$index] lt $value->[0]) {
		50
402	0					0	return -1;
403							} elsif ($$next_candidate_row->[$index] gt $value->[-1]) {
404	0					0	return 1;
405							} else {
406	8					18	foreach ( @$value ) {
407	8	50				19	return 0 if $$next_candidate_row->[$index] eq $_;
408							}
409	0					0	return -1;
410							}
411	4					26	};
412
413							}
414
415							} elsif ($operator eq 'not in') {
416	14	100				41	if (! @$value) {
417	4					6	return $ALWAYS_FALSE;
418							}
419
420	10	100	100			38	if ($property->is_numeric and $self->_things_in_list_are_numeric($value)) {
421							return sub {
422	8	50		8		18	return -1 if ($$next_candidate_row->[$index] eq '');
423	0					0	foreach ( @$value ) {
424	0	0				0	return -1 if $$next_candidate_row->[$index] == $_;
425							}
426	0					0	return 0;
427							}
428
429	4					21	} else {
430							return sub {
431	16			16		25	foreach ( @$value ) {
432	22	100				53	return -1 if $$next_candidate_row->[$index] eq $_;
433							}
434	4					6	return 0;
435							}
436	6					37	}
437
438							} elsif ($operator eq 'like') {
439							# 'like' is always a string comparison. In addition, we can't know if we're ahead
440							# or behind in the file's ID columns, so the only two return values are 0 and 1
441
442	18	100				60	return $ALWAYS_FALSE if ($value eq ''); # property like NULL is always false
443
444							# Convert SQL-type wildcards to Perl-type wildcards
445							# Convert a % to a *, and _ to ., unless they're preceeded by \ to escape them.
446							# Not that this isn't precisely correct, as \\% should really mean a literal \
447							# followed by a wildcard, but we can't be correct in all cases without including
448							# a real parser. This will catch most cases.
449
450	14					82	$value =~ s/(?
451	14					29	$value =~ s/(?
452	14					109	my $regex = qr($value);
453							return sub {
454	32	100		32		86	return -1 if ($$next_candidate_row->[$index] eq '');
455	8	100				32	if ($$next_candidate_row->[$index] =~ $regex) {
456	2					6	return 0;
457							} else {
458	6					8	return 1;
459							}
460	14					92	};
461
462							} elsif ($operator eq 'not like') {
463	16	100				53	return $ALWAYS_FALSE if ($value eq ''); # property like NULL is always false
464	12					69	$value =~ s/(?
465	12					29	$value =~ s/(?
466	12					90	my $regex = qr($value);
467							return sub {
468	24	50		24		72	return -1 if ($$next_candidate_row->[$index] eq '');
469	0	0				0	if ($$next_candidate_row->[$index] =~ $regex) {
470	0					0	return 1;
471							} else {
472	0					0	return 0;
473							}
474	12					82	};
475
476
477							# FIXME - should we only be testing the numericness of the property?
478							} elsif ($property->is_numeric and $self->_things_in_list_are_numeric([$value])) {
479							# Basic numeric comparisons
480	185	100	33			614	if ($operator eq '=') {
		100
		100
		100
		100
		50
		100
		50
481							return sub {
482	1470	100		1470		2178	return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
483	1446					1873	return $$next_candidate_row->[$index] <=> $value;
484	130					769	};
485							} elsif ($operator eq '<') {
486							return sub {
487	24	50		24		47	return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
488	0	0				0	$$next_candidate_row->[$index] < $value ? 0 : 1;
489	12					56	};
490							} elsif ($operator eq '<=') {
491							return sub {
492	24	50		24		65	return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
493	0	0				0	$$next_candidate_row->[$index] <= $value ? 0 : 1;
494	12					78	};
495							} elsif ($operator eq '>') {
496							return sub {
497	24	50		24		70	return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
498	0	0				0	$$next_candidate_row->[$index] > $value ? 0 : -1;
499	12					92	};
500							} elsif ($operator eq '>=') {
501							return sub {
502	24	50		24		58	return -1 if ($$next_candidate_row->[$index] eq ''); # null always != a number
503	0	0				0	$$next_candidate_row->[$index] >= $value ? 0 : -1;
504	12					77	};
505							} elsif ($operator eq 'true') {
506							return sub {
507	0	0		0		0	$$next_candidate_row->[$index] ? 0 : -1;
508	0					0	};
509							} elsif ($operator eq 'false') {
510							return sub {
511	2	50		2		5	$$next_candidate_row->[$index] ? -1 : 0;
512	1					6	};
513							} elsif ($operator eq '!=' or $operator eq 'ne') {
514							return sub {
515	12	50		12		27	return 0 if ($$next_candidate_row->[$index] eq ''); # null always != a number
516	0	0				0	$$next_candidate_row->[$index] != $value ? 0 : -1;
517							}
518	6					29	}
519
520							} else {
521							# Basic string comparisons
522	62	100	0			254	if ($operator eq '=') {
		100
		100
		100
		100
		100
		50
		0
523							return sub {
524	120	50	50	120		505	return -1 if ($$next_candidate_row->[$index] eq '' xor $value eq '');
525	120					168	return $$next_candidate_row->[$index] cmp $value;
526	25					168	};
527							} elsif ($operator eq '<') {
528							return sub {
529	16	50		16		30	$$next_candidate_row->[$index] lt $value ? 0 : 1;
530	8					38	};
531							} elsif ($operator eq '<=') {
532							return sub {
533	16	50	33	16		46	return -1 if ($$next_candidate_row->[$index] eq '' or $value eq '');
534	0	0				0	$$next_candidate_row->[$index] le $value ? 0 : 1;
535	8					44	};
536							} elsif ($operator eq '>') {
537							return sub {
538	16	50		16		45	$$next_candidate_row->[$index] gt $value ? 0 : -1;
539	8					54	};
540							} elsif ($operator eq '>=') {
541							return sub {
542	16	50	33	16		74	return -1 if ($$next_candidate_row->[$index] eq '' or $value eq '');
543	0	0				0	$$next_candidate_row->[$index] ge $value ? 0 : -1;
544	8					46	};
545							} elsif ($operator eq 'true') {
546							return sub {
547	8	50		8		17	$$next_candidate_row->[$index] ? 0 : -1;
548	4					29	};
549							} elsif ($operator eq 'false') {
550							return sub {
551	2	50		2		6	$$next_candidate_row->[$index] ? -1 : 0;
552	1					5	};
553							} elsif ($operator eq '!=' or $operator eq 'ne') {
554							return sub {
555	0	0		0		0	$$next_candidate_row->[$index] ne $value ? 0 : -1;
556							}
557	0					0	}
558							}
559							}
560
561							my $iterator_serial = 0;
562							sub create_iterator_closure_for_rule {
563	372			372	1	381	my($self,$rule) = @_;
564
565	372					817	my $class_name = $rule->subject_class_name;
566	372					1074	my $class_meta = $class_name->__meta__;
567	372					707	my $rule_template = $rule->template;
568
569	372					1293	my $csv_column_order_names = $self->column_order;
570	372					1280	my $csv_column_count = scalar @$csv_column_order_names;
571
572	372					1083	my $operators_for_properties = $rule_template->operators_for_properties();
573	372					814	my $values_for_properties = $rule->legacy_params_hash;
574	372					1053	foreach ( values %$values_for_properties ) {
575	1506	50	66			2826	if (ref eq 'HASH' and exists $_->{'value'}) {
576	188					339	$_ = $_->{'value'};
577							}
578							}
579
580	372					968	my $sort_order_names = $self->sort_order;
581	372					1069	my %sort_column_names = map { $_ => 1 } @$sort_order_names;
	371					953
582	372					525	my @non_sort_column_names = grep { ! exists($sort_column_names{$_}) } @$csv_column_order_names;
	1096					1712
583
584	372					397	my %column_name_to_index_map;
585	372					882	for (my $i = 0; $i < @$csv_column_order_names; $i++) {
586	1096					1957	$column_name_to_index_map{$csv_column_order_names->[$i]} = $i;
587							}
588
589							# Index in the split-file-data for each sorted column in order
590	372					460	my @sort_order_column_indexes = map { $column_name_to_index_map{$_} } @$sort_order_names;
	371					769
591
592	372					406	my(%property_meta_for_column_name);
593	372					489	foreach my $column_name ( @$csv_column_order_names ) {
594	1096					3445	my $prop = UR::Object::Property->get(class_name => $class_name, column_name => $column_name);
595	1096					1007	our %WARNED_ABOUT_COLUMN;
596	1096	50	33			2348	unless ( $prop or $WARNED_ABOUT_COLUMN{$class_name . '::' . $column_name}++) {
597	0					0	$self->warning_message("Couldn't find a property in class $class_name that goes with column $column_name");
598	0					0	next;
599							}
600	1096					1906	$property_meta_for_column_name{$column_name} = $prop;
601							}
602
603	372					398	my @rule_columns_in_order; # The order we should perform rule matches on - value is the index in @next_file_row to test
604							my @comparison_for_column; # closures to call to perform the match - same order as @rule_columns_in_order
605	372					393	my $last_sort_column_in_rule = -1; # Last index in @rule_columns_in_order that applies when trying "the shortcut"
606	372					348	my $looking_for_sort_columns = 1;
607
608	372					405	my $next_candidate_row; # This will be filled in by the closure below
609	372					525	foreach my $column_name ( @$sort_order_names, @non_sort_column_names ) {
610	1096					1215	my $property_meta = $property_meta_for_column_name{$column_name};
611	1096	50				1596	unless ($property_meta) {
612	0					0	Carp::croak("Class $class_name has no property connected to column named '$column_name' in data source ".$self->id);
613							}
614	1096					2240	my $property_name = $property_meta->property_name;
615	1096	100	66			2660	if (! $operators_for_properties->{$property_name}) {
		100
616	737					623	$looking_for_sort_columns = 0;
617	737					996	next;
618							} elsif ($looking_for_sort_columns && $sort_column_names{$column_name}) {
619	129					165	$last_sort_column_in_rule++;
620							} else {
621							# There's been a gap in the ID column list in the rule, stop looking for
622							# further ID columns
623	230					235	$looking_for_sort_columns = 0;
624							}
625
626	359					590	push @rule_columns_in_order, $column_name_to_index_map{$column_name};
627
628	359					482	my $operator = $operators_for_properties->{$property_name};
629	359					436	my $rule_value = $values_for_properties->{$property_name};
630
631							my $comparison_function = $self->_comparator_for_operator_and_property($property_meta,
632							\$next_candidate_row,
633	359					1275	$column_name_to_index_map{$column_name},
634							$operator,
635							$rule_value);
636	359	50				898	unless ($comparison_function) {
637	0					0	Carp::croak("Unknown operator '$operator' in file data source filter");
638							}
639	359					583	push @comparison_for_column, $comparison_function;
640							}
641
642	372					942	my $split_regex = $self->_regex();
643
644							# FIXME - another performance boost might be to do some kind of binary search
645							# against the file to set the initial/next position?
646	372					385	my $file_pos = 0;
647
648							# search in the offset cache for something helpful
649	372					761	my $offset_cache = $self->_offset_cache();
650
651							# If the rule doesn't touch the sorted columns, then we can't use the offset cache for help :(
652	372	100				820	if ($last_sort_column_in_rule >= 0) {
653							# Starting at index 1 because we're interested in the file and seek data, not if it's in use
654							# offset 0 is the in-use flag, offset 1 is a ref to the file data and offset 2 is the file seek pos
655							SEARCH_CACHE:
656	129					378	for (my $i = 1; $i < @$offset_cache; $i+=3) {
657	3343	100	66			7703	next unless (defined($offset_cache->[$i]) && defined($offset_cache->[$i+1]));
658
659	1205					993	$next_candidate_row = $offset_cache->[$i];
660	1205					744	my $matched = 0;
661							COMPARE_VALUES:
662	1205					1551	for (my $c = 0; $c <= $last_sort_column_in_rule; $c++) {
663	1205					1138	my $comparison = $comparison_for_column[$c]->();
664
665	1205	100				2609	next SEARCH_CACHE if $comparison > 0;
666	15	100				38	if ($comparison < 0) {
667	10					12	$matched = 1;
668	10					16	last COMPARE_VALUES;
669							}
670							}
671							# If we made it this far, then the file data in this slot is earlier in the file
672							# than the data we're looking for. So, if the seek pos data is later than what
673							# we've found yet, use it instead
674	15	100	66			74	if ($matched and $offset_cache->[$i+1] > $file_pos) {
675	10					26	$file_pos = $offset_cache->[$i+1];
676							}
677							}
678							}
679
680	372					388	my($monitor_start_time,$monitor_printed_first_fetch);
681	372	50				951	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
682	0					0	$monitor_start_time = Time::HiRes::time();
683	0					0	$monitor_printed_first_fetch = 0;
684	0					0	my @filters_list;
685	0					0	for (my $i = 0; $i < @rule_columns_in_order; $i++) {
686	0					0	my $column = $rule_columns_in_order[$i];
687	0					0	my $column_name = $csv_column_order_names->[$column];
688	0	0				0	my $is_sorted = $i <= $last_sort_column_in_rule ? ' (sorted)' : '';
689	0		0			0	my $operator = $operators_for_properties->{$column_name} \|\| '=';
690	0					0	my $rule_value = $values_for_properties->{$column_name};
691	0	0				0	if (ref $rule_value eq 'ARRAY') {
692	0					0	$rule_value = '[' . join(',', @$rule_value) . ']';
693							}
694	0					0	my $filter_string = $column_name . " $operator $rule_value" . $is_sorted;
695	0					0	push @filters_list, $filter_string;
696							}
697	0					0	my $filter_list = join("\n\t", @filters_list);
698	0					0	UR::DBI->sql_fh->printf("\nFILE: %s\nFILTERS %s\n\n", $self->server, $filter_list);
699							}
700
701	372		100			1001	$self->{'_last_read_serial'} \|\|= '';
702
703	372					1046	my $record_separator = $self->record_separator;
704	372					788	my $cache_slot = $self->_allocate_offset_cache_slot();
705	372					389	my $cache_insert_counter = 100; # a "breadcrumb" will be left in the offset cache after this many lines are read
706
707	372					311	my $lines_read = 0;
708	372					272	my $printed_first_match = 0;
709	372					343	my $lines_matched = 0;
710
711	372					338	my $fh; # File handle we'll be reading from
712	372					344	my $this_iterator_serial = $iterator_serial++;
713							my $iterator = sub {
714
715	594	100		594		1065	unless (ref($fh)) {
716	372					1129	$fh = $self->get_default_handle();
717							# Lock the file for reading... For more fine-grained locking we could move this to
718							# after READ_LINE_FROM_FILE: but that would slow down read operations a bit. If
719							# there ends up being a problem with lock contention, go ahead and move it before $line = <$fh>;
720							#flock($fh,LOCK_SH);
721							}
722
723	594	50	33			1623	if ($monitor_start_time && ! $monitor_printed_first_fetch) {
724	0					0	UR::DBI->sql_fh->printf("FILE: FIRST FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
725	0					0	$monitor_printed_first_fetch = 1;
726							}
727
728	594	100				1779	if ($self->{'_last_read_serial'} ne $this_iterator_serial) {
729	374	50				929	UR::DBI->sql_fh->printf("FILE: Resetting file position to $file_pos\n") if $ENV{'UR_DBI_MONITOR_SQL'};
730							# The last read was from a different request, reset the position
731	374					1510	$fh->seek($file_pos,0);
732	374	100				3121	if ($file_pos == 0) {
733	365					1001	my $skip = $self->skip_first_line;
734	365					1004	while ($skip-- > 0) {
735	0					0	scalar(<$fh>);
736							}
737							}
738	374					792	$file_pos = $self->_file_position();
739
740	374					2025	$self->{'_last_read_serial'} = $this_iterator_serial;
741							}
742
743	594					1777	local $/; # Make sure some wise guy hasn't changed this out from under us
744	594					846	$/ = $record_separator;
745
746	594					737	my $line;
747							READ_LINE_FROM_FILE:
748	594					1307	until($line) {
749
750							# Hack for OSX 10.5.
751							# At EOF, the getline below will return undef. Most builds of Perl
752							# will also set $! to 0 at EOF so you can distinguish between the cases
753							# of EOF (which may have actually happened a while ago because of buffering)
754							# and an actual read error. OSX 10.5's Perl does not, and so $!
755							# retains whatever value it had after the last failed syscall, likely
756							# a stat() while looking for a Perl module. This should have no effect
757							# other platforms where you can't trust $! at arbitrary points in time
758							# anyway
759	1095					1696	$! = 0;
760	1095					8610	$line = <$fh>;
761
762	1095	100				1844	unless (defined $line) {
763	249	50				707	if ($!) {
764	0	0	0			0	redo READ_LINE_FROM_FILE if ($! == EAGAIN or $! == EINTR);
765	0					0	my $pathname = $self->server();
766	0					0	Carp::confess("getline() failed for DataSource $self pathname $pathname boolexpr $rule: $!");
767							}
768
769							# at EOF. Close up shop and return
770							#flock($fh,LOCK_UN);
771	249					260	$fh = undef;
772
773	249	50				486	if ($monitor_start_time) {
774	0					0	UR::DBI->sql_fh->printf("FILE: at EOF\nFILE: $lines_read lines read for this request. $lines_matched matches\nFILE: TOTAL EXECUTE-FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
775							}
776
777	249					892	return;
778							}
779
780	846					720	$lines_read++;
781	846					810	my $last_read_size = length($line);
782	846					974	chomp $line;
783							# FIXME - to support record-oriented files, we need some replacement for this...
784	846					4124	$next_candidate_row = [ split($split_regex, $line, $csv_column_count) ];
785	846					1321	$#{$a} = $csv_column_count-1;
	846					1209
786
787	846					1339	$file_pos = $self->_file_position();
788	846					3095	my $file_pos_before_read = $file_pos - $last_read_size;
789
790							# Every so many lines read, leave a breadcrumb about what we've seen
791	846	50				1404	unless ($lines_read % $cache_insert_counter) {
792	0					0	$offset_cache->[$cache_slot+1] = $next_candidate_row;
793	0					0	$offset_cache->[$cache_slot+2] = $file_pos_before_read;
794	0					0	$self->_free_offset_cache_slot($cache_slot);
795
796							# get a new slot
797	0					0	$cache_slot = $self->_allocate_offset_cache_slot();
798	0					0	$offset_cache->[$cache_slot+1] = $next_candidate_row;
799	0					0	$offset_cache->[$cache_slot+2] = $file_pos_before_read;
800
801	0					0	$cache_insert_counter <<= 2; # Double the insert counter
802							}
803
804	846					1554	for (my $i = 0; $i < @rule_columns_in_order; $i++) {
805	801					1198	my $comparison = $comparison_for_column[$i]->();
806
807	801	100	100			2580	if ($comparison > 0 and $i <= $last_sort_column_in_rule) {
		100
808							# We've gone past the last thing that could possibly match
809
810	121	50				239	if ($monitor_start_time) {
811	0					0	UR::DBI->sql_fh->printf("FILE: $lines_read lines read for this request. $lines_matched matches\nFILE: TOTAL EXECUTE-FETCH TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
812							}
813
814							#flock($fh,LOCK_UN);
815
816							# Save the info from the last row we read
817	121					224	$offset_cache->[$cache_slot+1] = $next_candidate_row;
818	121					248	$offset_cache->[$cache_slot+2] = $file_pos_before_read;
819	121					489	return;
820
821							} elsif ($comparison) {
822							# comparison didn't match, read another line from the file
823	501					710	redo READ_LINE_FROM_FILE;
824							}
825
826							# That comparison worked... stay in the for() loop for other comparisons
827							}
828							# All the comparisons return '0', meaning they passed
829
830							# Now see if the offset cache file data is different than the row we just read
831							COMPARE_TO_CACHE:
832	224					425	foreach my $column ( @sort_order_column_indexes) {
833	19			19		103	no warnings 'uninitialized';
	19					26
	19					26062
834	223	50				719	if ($offset_cache->[$cache_slot+1]->[$column] ne $next_candidate_row->[$column]) {
835							# They're different. Update the offset cache data
836	223					304	$offset_cache->[$cache_slot+1] = $next_candidate_row;
837	223					315	$offset_cache->[$cache_slot+2] = $file_pos_before_read;
838	223					309	last COMPARE_TO_CACHE;
839							}
840							}
841
842	224	50	33			1031	if (! $printed_first_match and $monitor_start_time) {
843	0					0	UR::DBI->sql_fh->printf("FILE: First match after reading $lines_read lines\n");
844	0					0	$printed_first_match=1;
845							}
846	224					229	$lines_matched++;
847
848	224					910	return $next_candidate_row;
849							}
850	372					2408	}; # end sub $iterator
851
852	372					2380	Sub::Name::subname('UR::DataSource::File::__datasource_iterator(closure)__', $iterator);
853
854	372		100			1026	my $count = $self->_open_query_count() \|\| 0;
855	372					890	$self->_open_query_count($count+1);
856	372					1171	bless $iterator, 'UR::DataSource::File::Tracker';
857	372					1017	$iterator_data_source{$iterator} = $self;
858	372					646	$iterator_cache_slot_refs{$iterator} = \$cache_slot;
859
860	372					2815	return $iterator;
861							}
862
863
864							sub UR::DataSource::File::Tracker::DESTROY {
865	372			372		348	my $iterator = shift;
866	372					893	my $ds = delete $iterator_data_source{$iterator};
867	372	50				781	return unless $ds; # The data source may have gone out of scope first during global destruction
868
869	372					672	my $cache_slot_ref = delete $iterator_cache_slot_refs{$iterator};
870	372	50	33			1480	if (defined($cache_slot_ref) and defined($$cache_slot_ref)) {
871							# Mark this slot unused
872							#print STDERR "Freeing cache slot $cache_slot\n";
873							#$ds->_offset_cache->[$$cache_slot_ref] = 0;
874	372					1065	$ds->_free_offset_cache_slot($$cache_slot_ref);
875							}
876
877	372					938	my $count = $ds->_open_query_count();
878	372					783	$ds->_open_query_count(--$count);
879
880	372	100				938	return unless ($ds->quick_disconnect);
881	368	100	66			1469	if ($count == 0 && $ds->has_default_handle) {
882							# All open queries have supposedly been fulfilled. Close the
883							# file handle and undef it so get_default_handle() will re-open if necessary
884	366					692	my $fh = $ds->get_default_handle;
885
886	366	50				844	UR::DBI->sql_fh->printf("FILE: CLOSING fileno ".fileno($fh)."\n") if ($ENV{'UR_DBI_MONITOR_SQL'});
887							#flock($fh,LOCK_UN);
888	366					1114	$fh->close();
889	366					13968	$ds->__invalidate_get_default_handle__;
890							}
891							}
892
893							# Names of creation params that we should force to be listrefs
894							our %creation_param_is_list = map { $_ => 1 } qw( column_order file_list sort_order constant_values );
895							sub create_from_inline_class_data {
896	3			3	1	5	my($class, $class_data, $ds_data) = @_;
897
898							# User didn't specify columns in the file. Assumme every property is a column, and in the same order
899	3	100				11	unless (exists $ds_data->{'column_order'}) {
900	1					299	Carp::croak "data_source has no column_order specified";
901							}
902
903	2		33			14	$ds_data->{'server'} \|\|= $ds_data->{'path'} \|\| $ds_data->{'file'};
			66
904
905	2					2	my %ds_creation_params;
906	2					5	foreach my $param ( qw( delimiter record_separator column_order skip_first_line server file_list sort_order constant_values ) ) {
907	16	100				27	if (exists $ds_data->{$param}) {
908	7	50	66			28	if ($creation_param_is_list{$param} and ref($ds_data->{$param}) ne 'ARRAY') {
909	0					0	$ds_creation_params{$param} = \( $ds_data->{$param} );
910							} else {
911	7					10	$ds_creation_params{$param} = $ds_data->{$param};
912							}
913							}
914							}
915
916	2					15	my($namespace, $class_name) = ($class_data->{'class_name'} =~ m/^(\w+?)::(.*)/);
917	2					5	my $ds_id = "${namespace}::DataSource::${class_name}";
918	2					4	my $ds_type = delete $ds_data->{'is'};
919	2					13	my $ds = $ds_type->create( %ds_creation_params, id => $ds_id );
920	2					9	return $ds;
921							}
922
923
924							# The string used to join fields of a row together
925							#
926							# Since the 'delimiter' property is interpreted as a regex in the reading
927							# code, we'll try to be smart about making a real string from that.
928							#
929							# subclasses can override this to provide a different implementation
930							sub join_pattern {
931	4			4	0	6	my $self = shift;
932
933	4					11	my $join_pattern = $self->delimiter;
934
935							# make some common substitutions...
936	4	50				11	if ($join_pattern eq '\s,\s') {
937							# The default...
938	0					0	return ', ';
939							}
940
941	4					9	$join_pattern =~ s/\\s*//g; # Turn 0-or-more whitespaces to nothing
942	4					4	$join_pattern =~ s/\\t/\t/; # tab
943	4					6	$join_pattern =~ s/\\s/ /; # whitespace
944
945	4					5	return $join_pattern;
946							}
947
948
949
950							sub _sync_database {
951	4			4		5	my $self = shift;
952	4					8	my %params = @_;
953
954	4	50				11	unless (ref($self)) {
955	0	0				0	if ($self->isa("UR::Singleton")) {
956	0					0	$self = $self->_singleton_object;
957							}
958							else {
959	0					0	die "Called as a class-method on a non-singleton datasource!";
960							}
961							}
962
963	4					15	my $read_fh = $self->get_default_handle();
964	4	50				9	unless ($read_fh) {
965	0					0	Carp::croak($self->class . ": Can't _sync_database(): Can't open file " . $self->server . " for reading: $!");
966							}
967
968	4					10	my $original_data_file = $self->server;
969	4					152	my $original_data_dir = File::Basename::dirname($original_data_file);
970	4					7	my $use_quick_rename;
971	4	50				46	unless (-d $original_data_dir){
972	0					0	File::Path::mkpath($original_data_dir);
973							}
974	4	50				29	if (-w $original_data_dir) {
		0
975	4					5	$use_quick_rename = 1; # We can write to the data dir
976							} elsif (! -w $original_data_file) {
977	0					0	$self->error_message("Neither the directory nor the file for $original_data_file are writable - cannot sync_database");
978	0					0	return;
979							}
980
981
982	4					12	my $split_regex = $self->_regex();
983	4					24	my $join_pattern = $self->join_pattern;
984	4					11	my $record_separator = $self->record_separator;
985	4					13	local $/; # Make sure some wise guy hasn't changed this out from under us
986	4					6	$/ = $record_separator;
987
988	4					9	my $csv_column_order_names = $self->column_order;
989	4					9	my $csv_column_count = scalar(@$csv_column_order_names);
990	4					3	my %column_name_to_index_map;
991	4					14	for (my $i = 0; $i < @$csv_column_order_names; $i++) {
992	12					27	$column_name_to_index_map{$csv_column_order_names->[$i]} = $i;
993							}
994
995	4					7	my $changed_objects = delete $params{changed_objects};
996
997
998							# We're going to assumme all the passed-in objects are of the same class gulp
999	4					14	my $class_name = $changed_objects->[0]->class;
1000	4					29	my $class_meta = UR::Object::Type->get(class_name => $class_name);
1001	15					22	my %column_name_to_property_meta = map { $_->column_name => $_ }
1002	4					35	grep { $_->column_name }
	19					29
1003							$class_meta->all_property_metas;
1004	4					9	my @property_names_in_column_order;
1005	4					9	foreach my $column_name ( @$csv_column_order_names ) {
1006	12					13	my $prop_meta = $column_name_to_property_meta{$column_name};
1007	12	50				20	unless ($prop_meta) {
1008	0					0	die "Data source " . $self->class . " id " . $self->id .
1009							" could not resolve a $class_name property for the data source's column named $column_name";
1010							}
1011
1012	12					19	push @property_names_in_column_order, $prop_meta->property_name;
1013							}
1014
1015	4					6	my $insert = [];
1016	4					6	my $update = {};
1017	4					5	my $delete = {};
1018	4					7	foreach my $obj ( @$changed_objects ) {
1019	16	100				50	if ($obj->isa('UR::Object::Ghost')) {
		100
1020							# This should be removed from the file
1021	4					6	my $original = $obj->{'db_committed'};
1022	4					4	my $line = join($join_pattern, @{$original}{@property_names_in_column_order}) . $record_separator;
	4					12
1023	4					7	$delete->{$line} = $obj;
1024
1025							} elsif ($obj->{'db_committed'}) {
1026							# This object is changed since it was read in the file
1027	4					6	my $original = $obj->{'db_committed'};
1028	4					4	my $original_line = join($join_pattern, @{$original}{@property_names_in_column_order}) . $record_separator;
	4					13
1029	4					6	my $changed_line = join($join_pattern, @{$obj}{@property_names_in_column_order}) . $record_separator;
	4					7
1030	4					9	$update->{$original_line} = $changed_line;
1031
1032							} else {
1033							# This object is new and should be added to the file
1034	8					15	push @$insert, [ @{$obj}{@property_names_in_column_order} ];
	8					17
1035							}
1036							}
1037
1038	4					11	my $sort_order_names = $self->sort_order;
1039	4					14	foreach my $sort_column_name ( @$sort_order_names ) {
1040	4	50				12	unless (exists $column_name_to_index_map{$sort_column_name}) {
1041	0					0	Carp::croak("Column name '$sort_column_name' appears in the sort_order list, but not in the column_order list for data source ".$self->id);
1042							}
1043							}
1044	4					6	my $file_is_sorted = scalar(@$sort_order_names);
1045	4					7	my %column_sorts_numerically = map { $_->column_name => $_->is_numeric }
	15					28
1046							values %column_name_to_property_meta;
1047							my $row_sort_sub = sub ($$) {
1048	20			20		15	my $comparison;
1049
1050	20					18	foreach my $column_name ( @$sort_order_names ) {
1051	20					21	my $i = $column_name_to_index_map{$column_name};
1052	20	50				24	if ($column_sorts_numerically{$column_name}) {
1053	20					25	$comparison = $_[0]->[$i] <=> $_[1]->[$i];
1054							} else {
1055	0					0	$comparison = $_[0]->[$i] cmp $_[1]->[$i];
1056							}
1057	20	50				39	return $comparison if $comparison != 0;
1058							}
1059	0					0	return 0;
1060	4					21	};
1061	4	50	33			25	if ($sort_order_names && $file_is_sorted && scalar(@$insert)) {
			50
1062							# the inserted things should be sorted the same way as the file
1063	4					9	my @sorted = sort $row_sort_sub @$insert;
1064	4					6	$insert = \@sorted;
1065							}
1066
1067	4					6	my $write_fh;
1068							my $temp_file_name;
1069	4	50				7	if ($use_quick_rename) {
1070	4					28	$temp_file_name = sprintf("%s/.%d.%d" , $original_data_dir, time(), $$);
1071	4					21	$write_fh = IO::File->new($temp_file_name, O_WRONLY\|O_CREAT);
1072							} else {
1073	0					0	$write_fh = File::Temp->new(UNLINK => 1);
1074	0	0				0	$temp_file_name = $write_fh->filename if ($write_fh);
1075							}
1076	4	50				504	unless ($write_fh) {
1077	0					0	Carp::croak "Can't create temporary file for writing: $!";
1078							}
1079
1080	4					6	my $monitor_start_time;
1081	4	50				18	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1082	0					0	$monitor_start_time = Time::HiRes::time();
1083	0					0	my $time = time();
1084	0					0	UR::DBI->sql_fh->printf("\nFILE: SYNC_DATABASE AT %d [%s]. Started transaction for %s to temp file %s\n",
1085							$time, scalar(localtime($time)), $original_data_file, $temp_file_name);
1086
1087							}
1088
1089	4	50				29	unless (flock($read_fh,LOCK_SH)) {
1090	0	0				0	unless ($! == EOPNOTSUPP ) {
1091	0					0	Carp::croak($self->class(). ": Can't get exclusive lock for file ".$self->server.": $!");
1092							}
1093							}
1094
1095							# write headers to the new file
1096	4					17	for (my $i = 0; $i < $self->skip_first_line; $i++) {
1097	0					0	my $line = <$read_fh>;
1098	0					0	$write_fh->print($line);
1099							}
1100
1101
1102	4					6	my $line;
1103							READ_A_LINE:
1104	4					5	while(1) {
1105	20	100				26	unless ($line) {
1106	15					90	$line = <$read_fh>;
1107	15	100				25	last unless defined $line;
1108							}
1109
1110	16	100	50			38	if ($file_is_sorted && scalar(@$insert)) {
1111							# there are sorted things waiting to insert
1112	15					16	my $chomped = $line;
1113	15					14	chomp $chomped;
1114	15					51	my $row = [ split($split_regex, $chomped, $csv_column_count) ];
1115	15					23	my $comparison = $row_sort_sub->($row, $insert->[0]);
1116	15	100				29	if ($comparison > 0) {
1117							# write the object's data
1118	19			19		104	no warnings 'uninitialized'; # Some of the object's data may be undef
	19					32
	19					3899
1119	5					7	my $new_row = shift @$insert;
1120	5					12	my $new_line = join($join_pattern, @$new_row) . $record_separator;
1121
1122	5	50				10	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1123	0					0	UR::DBI->sql_fh->print("INSERT >>$new_line<<\n");
1124							}
1125
1126	5					13	$write_fh->print($new_line);
1127							# Don't undef the last line read, meaning it could still be written to the output...
1128	5					35	next READ_A_LINE;
1129							}
1130							}
1131
1132	11	100				32	if (my $obj = delete $delete->{$line}) {
		100
1133	2	50				8	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1134	0					0	UR::DBI->sql_fh->print("DELETE >>$line<<\n");
1135							}
1136	2					2	$line = undef;
1137	2					4	next;
1138
1139							} elsif (my $changed = delete $update->{$line}) {
1140	4	50				11	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1141	0					0	UR::DBI->sql_fh->print("UPDATE replace >>$line<< with >>$changed<<\n");
1142							}
1143	4					8	$write_fh->print($changed);
1144	4					15	$line = undef;
1145	4					5	next;
1146
1147							} else {
1148							# This line from the file was unchanged in the app
1149	5					10	$write_fh->print($line);
1150	5					24	$line = undef;
1151							}
1152							}
1153
1154	4	50				12	if (keys %$delete) {
1155	0					0	$self->warning_message("There were ",scalar(keys %$delete)," deleted $class_name objects that did not match data in the file");
1156							}
1157	4	50				14	if (keys %$update) {
1158	0					0	$self->warning_message("There were ",scalar(keys %$update)," updated $class_name objects that did not match data in the file");
1159							}
1160
1161							# finish out by writing the rest of the new data
1162	4					7	foreach my $new_row ( @$insert ) {
1163	19			19		89	no warnings 'uninitialized'; # Some of the object's data may be undef
	19					32
	19					8317
1164	3					11	my $new_line = join($join_pattern, @$new_row) . $record_separator;
1165	3	50				9	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1166	0					0	UR::DBI->sql_fh->print("INSERT >>$new_line<<\n");
1167							}
1168	3					7	$write_fh->print($new_line);
1169							}
1170	4					28	$write_fh->close();
1171
1172	4	50				253	if ($use_quick_rename) {
1173	4	50				13	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1174	0					0	UR::DBI->sql_fh->print("FILE: COMMIT rename $temp_file_name over $original_data_file\n");
1175							}
1176
1177	4	50				195	unless(rename($temp_file_name, $original_data_file)) {
1178	0					0	$self->error_message("Can't rename the temp file over the original file: $!");
1179	0					0	return;
1180							}
1181							} else {
1182							# We have to copy the data from the temp file to the original file
1183
1184	0	0				0	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1185	0					0	UR::DBI->sql_fh->print("FILE: COMMIT write over $original_data_file in place\n");
1186							}
1187	0					0	my $new_write_fh = IO::File->new($original_data_file, O_WRONLY\|O_TRUNC);
1188	0	0				0	unless ($new_write_fh) {
1189	0					0	$self->error_message("Can't open $original_data_file for writing: $!");
1190	0					0	return;
1191							}
1192
1193	0					0	my $temp_file_fh = IO::File->new($temp_file_name);
1194	0	0				0	unless ($temp_file_fh) {
1195	0					0	$self->error_message("Can't open $temp_file_name for reading: $!");
1196	0					0	return;
1197							}
1198
1199	0					0	while(<$temp_file_fh>) {
1200	0					0	$new_write_fh->print($_);
1201							}
1202
1203	0					0	$new_write_fh->close();
1204							}
1205
1206							# Because of the rename/copy process during syncing, the previously opened filehandle may
1207							# not be valid anymore. get_default_handle will reopen the file next time it's needed
1208	4					27	$self->_invalidate_cache();
1209	4					33	$self->__invalidate_get_default_handle__;
1210
1211	4	50				9	if ($ENV{'UR_DBI_MONITOR_SQL'}) {
1212	0					0	UR::DBI->sql_fh->printf("FILE: TOTAL COMMIT TIME: %.4f s\n", Time::HiRes::time() - $monitor_start_time);
1213							}
1214
1215	4					32	flock($read_fh, LOCK_UN);
1216	4					11	$read_fh->close();
1217
1218							# FIXME - this is ugly... With RDBMS-type data sources, they will call $dbh->commit() which
1219							# gets to UR::DBI->commit(), which calls _set_object_saved_committed for them. Since we're
1220							# not using DBI we have to do this 2-part thing ourselves. In the future, we might break
1221							# out things so the saving to the temp file goes in _sync_database(), and moving the temp
1222							# file over the original goes in commit()
1223	4	50				2092	unless ($self->_set_specified_objects_saved_uncommitted($changed_objects)) {
1224	0					0	Carp::croak("Error setting objects to a saved state after sync_database. Exiting.");
1225	0					0	return;
1226							}
1227
1228	4					47	$self->_set_specified_objects_saved_committed($changed_objects);
1229	4					73	return 1;
1230							}
1231
1232
1233
1234							sub initializer_should_create_column_name_for_class_properties {
1235	28			28	0	353	1;
1236							}
1237
1238
1239							1;
1240
1241							=pod
1242
1243							=head1 NAME
1244
1245							UR::DataSource::File - Parent class for file-based data sources
1246
1247							=head1 DEPRECATED
1248
1249							This module is deprecated. Use UR::DataSource::Filesystem instead.
1250
1251							=head1 SYNOPSIS
1252
1253							package MyNamespace::DataSource::MyFile;
1254							class MyNamespace::DataSource::MyFile {
1255							is => ['UR::DataSource::File', 'UR::Singleton'],
1256							};
1257							sub server { '/path/to/file' }
1258							sub delimiter { "\t" }
1259							sub column_order { ['thing_id', 'thing_name', 'thing_color' ] }
1260							sub sort_order { ['thing_id'] }
1261
1262							package main;
1263							class MyNamespace::Thing {
1264							id_by => 'thing_id',
1265							has => [ 'thing_id', 'thing_name', 'thing_color' ],
1266							data_source => 'MyNamespace::DataSource::MyFile',
1267							}
1268							my @objs = MyNamespace::Thing->get(thing_name => 'Bob');
1269
1270							=head1 DESCRIPTION
1271
1272							Classes which wish to retrieve their data from a regular file can use a UR::DataSource::File-based
1273							data source. The modules implementing these data sources live under the DataSource subdirectory
1274							of the application's Namespace, by convention. Besides defining a class for your data source
1275							inheriting from UR::DataSource::File, it should have the following methods, either as properties
1276							or functions in the package.
1277
1278							=head2 Configuration
1279
1280							These methods determine the configuration for your data source.
1281
1282							=over 4
1283
1284							=item server()
1285
1286							server() should return a string representing the pathname of the file where the data is stored.
1287
1288							=item file_list()
1289
1290							The file_list() method should return a listref of pathnames to one or more identical files
1291							where data is stored. Use file_list() instead of server() when you want to load-balance several NFS
1292							servers, for example.
1293
1294							You must have either server() or file_list() in your module, but not both. The existence of server()
1295							takes precedence over file_list().
1296
1297							=item delimiter()
1298
1299							delimiter() should return a string representing how the fields in each record are split into
1300							columns. This string is interpreted as a regex internally. The default delimiter is "\s,\s"
1301							meaning that the file is separated by commas.
1302
1303							=item record_separator()
1304
1305							record_separator() should return a string that gets stored in $/ before getline() is called on the
1306							file's filehandle. The default record_separator() is "\n" meaning that the file's records are
1307							separated by newlines.
1308
1309							=item skip_first_line()
1310
1311							skip_first_line() should return a boolean value. If true, the first line of the file is ignored, for
1312							example if the first line defines the columns in the file.
1313
1314							=item column_order()
1315
1316							column_order() should return a listref of column names in the file. column_order is required; there
1317							is no default.
1318
1319							=item sort_order()
1320
1321							If the data file is sorted in some way, sort_order() should return a listref of column names (which must
1322							exist in column_order()) by which the file is sorted. This gives the system a hint about how the file
1323							is structured, and is able to make shortcuts when reading the file to speed up data access. The default
1324							is to assumme the file is not sorted.
1325
1326							=back
1327
1328							=head1 INHERITANCE
1329
1330							UR::DataSource
1331
1332							=head1 SEE ALSO
1333
1334							UR, UR::DataSource
1335
1336							=cut