File Coverage

blib/lib/Text/TabTable.pm

Criterion	Covered	Total	%
statement	40	492	8.1
branch	0	186	0.0
condition	0	48	0.0
subroutine	14	57	24.5
pod	0	18	0.0
total	54	801	6.7

line	stmt	bran	cond	sub	pod	time	code
1							# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2							# Copyright (c) 2002-2003 Vivendi Universal Net USA
3							#
4							# May be copied under the same terms as perl itself.
5							# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6
7
8							# Database-like operations on tab-delimited files.
9							#
10							# Given two files:
11							# band_data.tab with fields band_id, band_name, and band_status
12							# song_data.tab with fields song_id, band_id, song_title
13							#
14							# The following sequence is more or less equivalent to
15							#
16							# SELECT song_id, band_data.band_id AS band_id,
17							# song_title, band_name,
18							# int(band_id/1000) AS band_dir
19							# FROM song_data INNER JOIN band_data ON song_data.band_id=band_data.band_id
20							# WHERE band_status = 'APPROVED'
21							# ORDER BY band_name
22							# INTO TABLE songband
23							#
24							#
25							# $band_data = Text::TabTable->import_headered("band_data.tab") ;
26							# $song_data = Text::TabTable->import_headered("song_data.tab") ;
27							# $joined = $song_data->join($band_data, "band_id", "band_id", "INNER") ;
28							# $selected = $joined->select(
29							# [
30							# 'song_id',
31							# ['band_data.band_id', 'band_id'],
32							# 'song_title',
33							# 'band_name',
34							# [ sub { int($_[0] / 1000) }, "band_dir", ["band_id"]],
35							# ],
36							#
37							# sub { $_[0]->band_status eq 'APPROVED' },
38							# ) ;
39							# $out = $selected->order("band_name") ;
40							# $out->export_headered("songband.tab") ;
41							#
42							##############################################################################
43							#
44							# You can speed up LEFT and INNER joins on primary keys if you create an index
45							# for the primary key column on the right-side table using
46							#
47							# $righttable->build_primary_index("band_id") ;
48							# $newtable = $lefttable->join($righttable, "band_id", "band_id", "LEFT") ;
49							#
50							# If both tables are already sorted by the primary key because order() was
51							# previously used, this will be slower.
52							#
53							# The index will not be used for RIGHT joins.
54
55							package Text::TabTable ;
56
57	1			1		749	use strict ;
	1					2
	1					29
58	1			1		4	use Carp ;
	1					2
	1					162
59	1			1		1122	use Data::Dumper ;
	1					11085
	1					68
60	1			1		10	use Fcntl qw(O_WRONLY O_EXCL O_CREAT) ;
	1					2
	1					62
61
62	1			1		5	use vars qw($SORT $JOIN $VERBOSE $TMPDIR $VERSION) ;
	1					3
	1					13886
63
64							$VERSION = "1.02" ;
65
66							$TMPDIR = "." ;
67							$SORT = "/bin/sort" ;
68							$JOIN = "/usr/bin/join" ;
69
70							$VERBOSE=$ENV{TABTABLE_VERBOSE} ;
71
72							####
73							# Constructor. Takes a tab delimited file with a field name
74							# header line and returns a TabTable object. Parses the header line
75							# and creates a temporary file without the header line.
76							####
77							sub import_headered
78							{
79	0			0	0		my ($package,$fname) = @_ ;
80	0						my $newf = _make_tempfile() ;
81
82	0	0					carp "importing $fname" if $VERBOSE ;
83	0	0					open(F, $fname) \|\| return undef ;
84	0	0					open(NEWF, ">$newf") \|\| return undef ;
85	0						my $header = ;
86	0						my $buf ;
87
88							# copy the unheadered version of the file to a new file.
89	0						while( read(F, $buf, 2048) ) {
90	0						print NEWF $buf ;
91							}
92	0						close F ;
93	0						close NEWF ;
94
95	0						chomp $header ;
96	0						my @fieldnames = split(/\t/, $header) ;
97	0						my @fields = map { Text::TabTable::Field->new($_) } @fieldnames ;
	0
98
99	0						my $name = $fname ;
100	0						$name =~ s/\..*$// ; # remove extensions
101	0						$name =~ s@.*\/@@ ; # remove path
102
103	0						my $self = {
104							filename => $newf,
105							fieldlist => Text::TabTable::FieldList->new(@fields),
106							name => $name,
107							} ;
108	0						bless $self, $package ;
109							}
110
111							####
112							# Alternate constructor. Takes a tab delimited file without a field name
113							# header line, plus the field names, and returns a TabTable object. This
114							# saves time because it doesn't require making a tempfile without the header.
115							####
116							sub import_unheadered
117							{
118	0			0	0		my ($package,$fname, @fieldnames) = @_ ;
119	0						my $newf = _make_tempfile() ;
120
121	0	0					carp "importing $fname (unheadered)" if $VERBOSE ;
122	0	0	0				return undef if !-f $fname \|\| !-r $fname ;
123
124	0						my @fields = map { Text::TabTable::Field->new($_) } @fieldnames ;
	0
125
126	0						my $name = $fname ;
127	0						$name =~ s/\..*$// ; # remove extensions
128	0						$name =~ s@.*\/@@ ; # remove path
129
130	0						my $self = {
131							filename => $fname,
132							dontdelete => 1, # so we know it's not a tempfile.
133							fieldlist => Text::TabTable::FieldList->new(@fields),
134							name => $name,
135							} ;
136	0						bless $self, $package ;
137							}
138
139
140							####
141							# Undoes the escaping done by MediaExtractor.
142							####
143							sub unescape
144							{
145	0			0	0		my ($str) = @_ ;
146
147	0						my $x = $str ;
148	0						$str =~ s/\\\\/\xff/g ;
149	0						$str =~ s/\\n/\n/g ;
150	0						$str =~ s/\\t/\t/g ;
151	0						$str =~ s/\xff/\\/g ;
152
153	0						return $str ;
154							}
155
156							####
157							# This is the same escaping done by MediaExtractor.
158							####
159							sub escape
160							{
161	0			0	0		my ($str) = @_ ;
162
163	0						$str =~ s/\\/\\\\/g;
164	0						$str =~ s/\t/\\t/g;
165	0						$str =~ s/\n/\\n/g;
166
167
168	0						return $str ;
169							}
170
171							####
172							# Writes out a table as a file with a header.
173							####
174							sub export_headered
175							{
176	0			0	0		my ($self, $filename) = @_ ;
177	0	0					carp "exporting $self->{name} to $filename" if $VERBOSE ;
178
179	0	0					open(F, ">$filename") \|\| croak "$filename: $!\n" ;
180	0						print F $self->{fieldlist}->as_string(), "\n" ;
181	0						close F ;
182	0						system "cat $self->{filename} >> $filename" ;
183							}
184
185							####
186							# Writes out a table as a file without a header.
187							####
188							sub export_unheadered
189							{
190	0			0	0		my ($self, $filename) = @_ ;
191	0	0					carp "exporting $self->{name} to $filename" if $VERBOSE ;
192	0						system "cp $self->{filename} $filename" ;
193	0	0					if ($?) {
194	0						unlink $filename ;
195	0						croak "can't export to $filename: $!" ;
196							}
197							}
198
199							####
200							# Returns a new table that has only one of each value in the specified
201							# column.
202							####
203							sub uniq
204							{
205	0			0	0		my ($table, $colname) = @_ ;
206
207	0						my $colnum = $table->{fieldlist}->find_colnum($colname) ;
208	0	0					croak "no field $colname in table" if !$colname ;
209
210	0	0	0				if (!$table->{sorted_column} \|\| $table->{sorted_column} != $colnum) {
211	0						my $name = $table->name() ;
212	0						$table = $table->order($colname) ;
213	0						$table->name($name) ;
214							}
215
216	0	0					carp "uniquing $table->{name} by $colname" if $VERBOSE ;
217
218	0						my $newf = _make_tempfile() ;
219
220	0	0					open(OLDF, "<$table->{filename}") \|\| die ;
221	0	0					open(NEWF, ">$newf") \|\| croak "$newf: $!\n" ;
222
223	0						my $oldval = undef ;
224	0						while () {
225	0						chomp ;
226	0						my @f = split(/\t/, $_, -1) ;
227	0	0	0				if ($oldval ne $f[$colnum-1] \|\| !defined $oldval) {
228	0						print NEWF $_, "\n" ;
229	0						$oldval = $f[$colnum-1] ;
230							}
231							}
232
233	0						close(OLDF) ;
234	0						close(NEWF) ;
235
236	0						my $newtable = {
237							filename => $newf,
238							fieldlist => $table->{fieldlist}->deepcopy(),
239							sorted_colnum => $colnum,
240							name => $table->name(),
241							} ;
242
243	0	0					if (!defined wantarray ) {
244	0						carp "Warning: Useless uniq in void context." ;
245							}
246
247	0						bless $newtable, ref $table ;
248							}
249
250							####
251							# Export to a cdb file.
252							# There will be a special key "FIELDNAMES" whose value is a tab
253							# separated list of the names of the fields.
254							# The rest of the cdb file will be of the form key => tab-delimited-values.
255							#
256							# The key must be unique; however as a special case multiple blank keys
257							# are allowed to be present; only the first one is used. This is a hack,
258							# but is too good an optimization to pass up.
259							####
260							sub export_cdb
261							{
262	0			0	0		my ($self, $filename, $colname) = @_ ;
263	0						require CDB_File ;
264	0	0					carp "exporting $self->{name} to cdb $filename" if $VERBOSE ;
265
266	0	0					my $t = CDB_File->new($filename, "$filename.new$$") or croak "$filename: $!" ;
267
268	0						$t->insert("FIELDNAMES", $self->{fieldlist}->as_string()) ;
269
270	0	0					open(F, "< $self->{filename}") \|\| die "$self->{filename}: $!" ;
271	0						my $colnum = $self->{fieldlist}->find_colnum($colname) ;
272	0						$colnum-- ;
273
274							# Create a regex to skip over the columns before the key column and
275							# collect the key column in $1.
276	0						my $regex = '^' . ('[^\t]\t' x $colnum) . '([^\t])' ;
277	0						$regex = qr($regex) ;
278	0						my $didblankkey = 0 ;
279	0						while () {
280	0						chomp ;
281	0	0					/$regex/ \|\| die ;
282	0						my $key = $1 ;
283
284	0	0	0				next if $key eq '' && $didblankkey++ ;
285
286	0						$t->insert($key, $_) ;
287							}
288
289	0						$t->finish() ;
290	0						close(F) ;
291							}
292
293							####
294							# Returns the named column of the table as an array.
295							####
296							sub export_column
297							{
298	0			0	0		my ($table, $colname) = @_ ;
299	0	0					carp "exporting $table->{name} column $colname" if $VERBOSE ;
300
301	0						my $colnum = $table->{fieldlist}->find_colnum($colname) ;
302
303	0	0					if (!defined $colnum) {
304	0						croak "no column $colname" ;
305							}
306
307	0						my @arr ;
308	0	0					open(CUT, "cut -f$colnum $table->{filename}\|") \|\| die ;
309	0						while(defined ($_=)) {
310	0						chomp ;
311	0						push @arr, $_ ;
312							}
313	0						close CUT ;
314
315	0						return @arr ;
316							}
317
318							####
319							# Returns a new TabTable that is sorted by the requested column. Dies
320							# if no such column. You can specify -descending=>1 or -numeric =>1
321							# after the fieldname.
322							#
323							# The sort is stable, so you can sort on multiple fields by doing
324							# multiple sorts, with the most important one last.
325							####
326							sub order
327							{
328	0			0	0		my ($self, $fieldname, %args) = @_ ;
329	0	0					carp "sorting $self->{name} by $fieldname" if $VERBOSE ;
330
331	0						my $newf = _make_tempfile() ;
332
333							# This is a flag that gets turned off if the sort is not alphabetic
334							# and ascending. In that case, the sort order is not correct for the
335							# join() method, and so join() would have to re-sort.
336	0						my $joinable_sort = 1 ;
337
338	0						my $colnum = $self->{fieldlist}->find_colnum($fieldname) ;
339	0	0					if (!$colnum) {
340	0						unlink $newf ;
341	0						croak "No such field $fieldname" ;
342							}
343
344	0						my @sortargs = ("-s",
345							"-T$TMPDIR",
346							"-t\t",
347							"-k$colnum,$colnum",
348							"-o$newf",
349							$self->{filename}
350							) ;
351	0	0					if ($args{-descending}) {
352	0						unshift @sortargs, "-r" ;
353	0						$joinable_sort = 0 ;
354							}
355	0	0					if ($args{-numeric}) {
356	0						unshift @sortargs, "-n" ;
357	0						$joinable_sort = 0 ;
358							}
359
360	0						system $SORT, @sortargs ;
361
362	0	0					if ($?) {
363	0						unlink $newf ;
364	0						croak "sort error" ;
365							}
366
367	0	0					my $newtable = {
368							filename => $newf,
369							fieldlist => $self->{fieldlist}->deepcopy(),
370							sorted_colnum => $joinable_sort ? $colnum : undef,
371							name => $self->name(),
372							} ;
373
374	0	0					if (!defined wantarray ) {
375	0						carp "Warning: Useless order in void context." ;
376							}
377
378	0						bless $newtable, ref $self ;
379							}
380
381							####
382							# Returns a new table with two columns. The first column will contain the
383							# unique values of the specified field, the second column will contain the
384							# number of occurrences of that value.
385							####
386							sub groupby_and_count
387							{
388	0			0	0		my ($table, $fieldname, $newfieldname, %args) = @_ ;
389	0						my $colnum = $table->{fieldlist}->find_colnum($fieldname) ;
390	0	0					if (!$colnum) {
391	0						croak "No such field $fieldname" ;
392							}
393							# Create a temporary table that is sorted by the specified column.
394	0						my $sortedtable = $table->order($fieldname, %args);
395
396							# Taken from uniq(), One pass through the file counting the number
397							# of times the specified column appears and creating a new file
398							# with the specified column and count.
399	0						my $newf = _make_tempfile() ;
400	0	0					open(OLDF, "<$sortedtable->{filename}") \|\| die ;
401	0	0					open(NEWF, ">$newf") \|\| croak "$newf: $!\n" ;
402
403	0						my $count = 0 ;
404	0						my $oldval = undef ;
405	0						while () {
406	0						chomp ;
407	0						my @f = split(/\t/, $_, -1) ;
408	0	0					if (!defined $oldval) {
		0
409	0						$oldval = $f[$colnum-1] ;
410	0						$count = 1 ;
411							} elsif ($oldval ne $f[$colnum-1]) {
412	0						print NEWF $oldval, "\t", $count, "\n" ;
413	0						$oldval = $f[$colnum-1] ;
414	0						$count = 1 ;
415							} else {
416	0						$count++ ;
417							}
418							}
419	0	0					if (defined $oldval) {
420	0						print NEWF $oldval, "\t", $count, "\n" ;
421							}
422
423	0						close(OLDF) ;
424	0						close(NEWF) ;
425
426	0						my @newfieldnames = ($fieldname, $newfieldname) ;
427	0						my @newfields = map { Text::TabTable::Field->new($_) } @newfieldnames ;
	0
428	0						my $newtable = {
429							filename => $newf,
430							fieldlist => Text::TabTable::FieldList->new(@newfields),
431							sorted_colnum => 1,
432							name => $table->name(),
433							} ;
434
435	0						bless $newtable, ref $table ;
436							}
437
438							####
439							# Takes a list of pairs of oldname=>newname and changes the names of fields
440							# of the table. This wipes out the old field names entirely.
441							sub rename_fields
442							{
443	0			0	0		my ($table, @renames) = @_ ;
444
445	0						my $oldname ;
446							my $newname ;
447	0						my @fields = $table->{fieldlist}->fields() ;
448	0		0				while ( ($oldname = shift(@renames)) && ($newname = shift(@renames)) ) {
449	0						my $colnum = $table->{fieldlist}->find_colnum($oldname) ;
450	0						$fields[$colnum-1]->set_name($newname) ;
451							}
452							}
453
454							####
455							# Gets or sets the name of the table.
456							####
457							sub name
458							{
459	0			0	0		my ($self, $name) = @_ ;
460	0	0					if (defined $name) {
461	0						$self->{name} = $name ;
462							}
463	0						return $self->{name} ;
464							}
465
466							####
467							# takes a table and exports it as a cdb file, creating a primary key
468							# index. This can make joins go faster if this table is on the right side
469							# of the join, since neither table has to be sorted, and building a cdb
470							# is generally faster than sorting (~O(n) instead of O(nLogn).
471							####
472							sub build_primary_index
473							{
474	0			0	0		my ($self, $colname) = @_ ;
475	0						my $newf = _make_tempfile() ;
476
477	0						$self->export_cdb($newf, $colname) ;
478	0						my $colnum = $self->{fieldlist}->find_colnum($colname) ;
479
480	0						$self->{cdb}{$colnum} = $newf ;
481							}
482
483							####
484							# Returns a new table created by joining the two tables on a specified
485							# column. the $side parameter can be specified as LEFT or RIGHT to
486							# create LEFT/RIGHT joins, or can be INNER, OUTER, or undef.
487							# $leftfield/$rightfield are the field names to be used in the two tables
488							# for joining.
489							#
490							# If the right table has a primary index on the join column (created
491							# by build_primary_index()), and it's either a left or inner join,
492							# a simpler join algorithm will be used that does not require sorting.
493							#
494							# Both tables must have names. Tables get names either by setting them
495							# with the name() method, or from the filename in the import_headered
496							# method.
497							####
498							sub join
499							{
500	0			0	0		my ($lefttable, $righttable, $leftfield, $rightfield, $side) = @_ ;
501
502	0	0	0				if (!$lefttable->name() \|\| !$righttable->name()) {
503	0						croak "both tables must have name()s" ;
504							}
505
506	0						my $leftcol = $lefttable->{fieldlist}->find_colnum($leftfield) ;
507	0	0					croak "no field $leftfield in left table" if !$leftfield ;
508	0						my $rightcol = $righttable->{fieldlist}->find_colnum($rightfield) ;
509	0	0					croak "no field $rightfield in right table" if !$rightfield ;
510
511	0	0	0				if ($righttable->{cdb}{$rightcol} && $side ne 'RIGHT' && $side ne 'OUTER') {
			0
512	0	0					if ($VERBOSE) {
513	0						carp "index joining $lefttable->{name} with $righttable->{name}" ;
514							}
515	0						return $lefttable->_join_using_index($righttable,
516							$leftcol, $rightcol, $side) ;
517							}
518
519							# tables must be sorted by field.
520	0	0	0				if (!$lefttable->{sorted_colnum} \|\| $lefttable->{sorted_colnum} ne $leftcol) {
521	0						$lefttable = $lefttable->order($leftfield) ;
522							}
523	0	0	0				if (!$righttable->{sorted_colnum} \|\| $righttable->{sorted_colnum} ne $rightcol) {
524	0						$righttable = $righttable->order($rightfield) ;
525							}
526
527	0	0					carp "joining $lefttable->{name} with $righttable->{name}" if $VERBOSE ;
528
529
530							# create a format string for join(1).
531							# Looks like
532							# 1.1,1.2,1.3,1.4, ... ,2.1,2.2,2.3, ...
533
534	0						my $format =
535	0						join(",", map { "1.$_" } 1..$lefttable->{fieldlist}->fieldcount())
536							. "," .
537	0						join(",", map { "2.$_" } 1..$righttable->{fieldlist}->fieldcount()) ;
538
539
540
541	0						my $command = "$JOIN -1 $leftcol -2 $rightcol -o $format -t '\t' " ;
542
543	0	0	0				if ($side eq 'LEFT') {
		0
		0
		0
544	0						$command .= "-a 1 "
545							} elsif ($side eq 'RIGHT') {
546	0						$command .= "-a 2 " ;
547							} elsif ($side eq 'OUTER') {
548	0						$command .= "-a 1 -a 2 " ;
549							} elsif (defined $side && $side ne 'INNER') {
550	0						croak "invalid side argument" ;
551							}
552
553	0						$command .= $lefttable->{filename} . " " ;
554	0						$command .= $righttable->{filename} . " " ;
555
556	0						my $newf = _make_tempfile() ;
557	0						$command .= "> $newf" ;
558
559	0						system $command ;
560
561	0	0					croak "join failed" if $? ;
562
563
564							# We've now joined the files, so we just have to create a fieldlist
565							# for the new table.
566
567	0						my $leftlistcopy = $lefttable->{fieldlist}->deepcopy ;
568	0						foreach my $field ($leftlistcopy->fields) {
569	0						$field->add_name( $lefttable->name . "." . $field->name() ) ;
570							}
571	0						my $rightlistcopy = $righttable->{fieldlist}->deepcopy ;
572	0						foreach my $field ($rightlistcopy->fields) {
573	0						$field->add_name( $righttable->name . "." . $field->name() ) ;
574							}
575
576							# we've now got copies of the two fieldlists, with new aliases for
577							# the field names of the form tablename.fieldname. Construct
578							# a final field list from these two lists.
579
580	0						my @fields = ($leftlistcopy->fields, $rightlistcopy->fields) ;
581
582	0						my $newtable = {
583							name => $lefttable->{name},
584							filename => $newf,
585							fieldlist => Text::TabTable::FieldList->new(@fields),
586							sorted_colnum => $leftcol,
587							} ;
588
589	0	0					if (!defined wantarray ) {
590	0						carp "Warning: Useless join in void context." ;
591							}
592	0						bless $newtable, ref $lefttable ;
593							}
594
595							####
596							# called by ->join() to perform a join when there is an appropriate cdb index
597							# present on the right side table and it's not a right join.
598							#
599							# The column numbers passed in are 1-based.
600							####
601							sub _join_using_index
602							{
603	0			0			my ($lefttable, $righttable, $leftcol, $rightcol, $side) = @_ ;
604
605	0						my $isleftjoin = $side eq 'LEFT' ;
606	0						my $emptyright ;
607	0	0					if ($isleftjoin) {
608	0						$emptyright = "\t" x ($righttable->{fieldlist}->fieldcount() - 1) ;
609							}
610
611	0	0					open(LEFTF, $lefttable->{filename}) \|\| croak ;
612	0						require CDB_File ;
613
614	0						my $newf = _make_tempfile() ;
615	0	0					open(NEWF, ">$newf") \|\| croak "$newf: $!" ;
616
617	0						my %right ;
618
619	0	0					tie (%right, 'CDB_File', $righttable->{cdb}{$rightcol}) \|\| die ;
620
621							# create a regex that will extract the join field from a tab delimited
622							# line.
623	0						my $regex = '^' . ('[^\t]\t' x ($leftcol-1)) . '([^\t])' ;
624	0						$regex = qr($regex) ;
625
626	0						my $leftfieldcount = $lefttable->{fieldlist}->fieldcount() ;
627	0						my $rightfieldcount = $righttable->{fieldlist}->fieldcount() ;
628
629	0						while () {
630	0						chomp ;
631	0						_add_missing_tabs(\$_, $leftfieldcount) ;
632	0	0					/$regex/ \|\| die "malformed temp file in line $_" ;
633	0						my $key = $1 ;
634
635
636	0	0					if (exists $right{$key}) {
637							# found a match. Print a complete line.
638	0						my $val = $right{$key} ;
639	0						_add_missing_tabs(\$val, $rightfieldcount) ;
640	0						print NEWF CORE::join("\t", $_, $val), "\n" ;
641							} else {
642							# didn't match. print a line if it's a left join, otherwise skip it.
643	0	0					if ($isleftjoin) {
644	0						print NEWF CORE::join("\t", $_, $emptyright), "\n" ;
645							}
646							}
647							}
648
649	0						untie %right ;
650
651	0						close LEFTF ;
652	0						close NEWF ;
653
654							# We've now joined the files, so we just have to create a fieldlist
655							# for the new table.
656
657	0						my $leftlistcopy = $lefttable->{fieldlist}->deepcopy ;
658	0						foreach my $field ($leftlistcopy->fields) {
659	0						$field->add_name( $lefttable->name . "." . $field->name() ) ;
660							}
661	0						my $rightlistcopy = $righttable->{fieldlist}->deepcopy ;
662	0						foreach my $field ($rightlistcopy->fields) {
663	0						$field->add_name( $righttable->name . "." . $field->name() ) ;
664							}
665
666							# we've now got copies of the two fieldlists, with new aliases for
667							# the field names of the form tablename.fieldname. Construct
668							# a final field list from these two lists.
669
670	0						my @fields = ($leftlistcopy->fields, $rightlistcopy->fields) ;
671
672	0						my $newtable = {
673							name => $lefttable->{name},
674							filename => $newf,
675							fieldlist => Text::TabTable::FieldList->new(@fields),
676							} ;
677
678	0	0					if (!defined wantarray ) {
679	0						carp "Warning: Useless join in void context." ;
680							}
681	0						bless $newtable, ref $lefttable ;
682
683							}
684
685							# Given a ref to a string that's supposed to have n columns, make sure there are
686							# n-1 tabs by adding more at the end.
687							sub _add_missing_tabs
688							{
689	0			0			my ($strref, $n) = @_ ;
690
691	0						my $tabcount = ($$strref =~ tr/\t/\t/) ;
692
693	0	0					if ($tabcount < $n-1) {
694	0						$$strref .= "\t" x ( $n-1-$tabcount ) ;
695							}
696							}
697
698
699							####
700							# processes a table and creates a new one with different stuff.
701							#
702							# parameters:
703							# table is a Text::TabTable object.
704							#
705							# fieldspecs is a listref containing items of any of the following forms
706							# fieldname ( a simple scalar )
707							# [fieldname, newfieldname] ( for "cd_table.id as cd_id")
708							# [sub {...}, newfieldname, [list of fieldnames]]
709							# (for calculated fields. The sub receives values for the
710							# listed fields as parameters, and returns the new value)
711							# fieldspecs can also be a simple "*", which returns all fields unchanged.
712							#
713							# wheresub is an optional subref. It is passed an object with getvalue,
714							# setvalue, and autoloaded field-name-named methods to get and set values
715							# of fields by name. It is expected to return a true value if the
716							# row should be included in the output.
717							####
718							sub select
719							{
720	0			0	0		my ($table, $fieldspecs, $wheresub) = @_ ;
721
722	0	0					carp "selecting from $table->{name}" if $VERBOSE ;
723
724	0						my $newtable = { name => $table->{name} } ;
725
726							# this gets set to zero if there is a where clause or calculated columns.
727							# Otherwise it just runs /bin/cut to pick the right columns.
728	0						my $cut_ok = 1 ;
729
730							# create a field list for the new table based on the selected fields.
731							# also create an array saying how to calculate each output field.
732	0						my @fieldrules ;
733	0	0	0				if (!ref $fieldspecs && ($fieldspecs eq '*' \|\| !defined $fieldspecs)) {
			0
734							# simple case. Just copy the fieldlist.
735	0						undef $fieldspecs ;
736	0						$newtable->{fieldlist} = $table->{fieldlist}->deepcopy() ;
737
738							# we don't need any rules in this case; input = output
739							} else {
740							# make a new fieldlist, and rules.
741
742							# @fields is the list of Field objects being built.
743	0						my @fields ;
744
745	0						foreach my $fieldspec (@$fieldspecs) {
746	0	0					if (!ref $fieldspec) {
		0
		0
747							# a simple scalar, representing a field name.
748	0						my $colnum = $table->{fieldlist}->find_colnum($fieldspec) ;
749	0	0					if (!$colnum) {
750	0						croak "no field $fieldspec in table" ;
751							}
752							# find_colnum returns 1-based column numbers.
753	0						push @fieldrules, $colnum - 1 ;
754	0						push @fields, Text::TabTable::Field->new($fieldspec) ;
755							} elsif (@$fieldspec == 2) {
756							# a field name to look up and what to call it in the output table.
757
758	0						my $colnum = $table->{fieldlist}->find_colnum($fieldspec->[0]) ;
759	0	0					if (!$colnum) {
760	0						croak "no field $fieldspec->[0] in table" ;
761							}
762							# find_colnum returns 1-based column numbers.
763	0						push @fieldrules, $colnum - 1 ;
764	0						push @fields, Text::TabTable::Field->new($fieldspec->[1]) ;
765							} elsif (@$fieldspec == 3) {
766							# A subref, a new column name, and a list of columns to pass to
767							# the subref.
768	0						my @paramcols ;
769
770							# since we're doing a calculated column, we have to use perl instead
771							# of /bin/cut.
772	0						$cut_ok = 0 ;
773
774	0						foreach my $fieldname (@{$fieldspec->[2]}) {
	0
775	0						my $colnum = $table->{fieldlist}->find_colnum($fieldname) ;
776	0	0					if (!$colnum) {
777	0						croak "no field $fieldname in table" ;
778							}
779	0						push @paramcols, $colnum-1 ;
780							}
781							# create a rule consiting of the subref followed by a listref of
782							# what columns to get parameters from.
783	0						push @fieldrules, [ $fieldspec->[0], \@paramcols ] ;
784
785							# fieldname is the new name passed in.
786	0						push @fields, Text::TabTable::Field->new($fieldspec->[1]) ;
787							} else {
788	0						croak "bad fieldspec" ;
789							}
790							}
791
792	0						$newtable->{fieldlist} = Text::TabTable::FieldList->new(@fields) ;
793							}
794
795	0						$newtable->{filename} = _make_tempfile() ;
796
797							# build a hash saying which field is in which position in the input.
798	0						my %fieldloc ;
799	0						$table->_build_fieldloc(\%fieldloc) ;
800
801							# cut won't reorder columns, which angers me. So if the columns aren't
802							# sorted, don't use cut.
803	0						my $test_unsort = CORE::join(" ", @fieldrules) ;
804	0						my $test_sort = CORE::join(" ", sort {$a <=> $b } @fieldrules) ;
	0
805	0	0					if ($test_unsort ne $test_sort) {
806	0						$cut_ok = 0 ;
807							}
808
809							# now $newtable->{fieldlist} contains the table names. We're done with that.
810							# $newtable->{filename} contains the name of the file to be created.
811							# @fieldrules tells us how to create each output column.
812							# %fieldloc says which column number a field name can be found in.
813							# so it's time to start processing.
814
815
816	0	0	0				if ($cut_ok && !$wheresub && $fieldspecs) {
			0
817							# there aren't any calculated columns, and there's no where clause,
818							# so we can just use cut to pick the columns they wanted.
819
820							# @fieldrules has zero-based column numbers. Make one-based.
821	0						my @cutfields = map { $_ + 1 } @fieldrules ;
	0
822
823	0	0					carp "...selecting using cut" if $VERBOSE ;
824	0						system "cut -f" . CORE::join(',', @cutfields) .
825							" $table->{filename} > $newtable->{filename}" ;
826	0	0					if ($?) {
827	0						unlink $newtable->{filename} ;
828	0						croak "cut error in select" ;
829							}
830							} else {
831
832							# process the file using perl.
833
834	0	0					open(INFILE, $table->{filename}) \|\| croak "can't open table file" ;
835	0	0					open(OUTFILE, ">$newtable->{filename}") \|\| croak "can't open output table file" ;
836
837	0						while() {
838	0						chomp ;
839	0						my @values = split(/\t/, $_, 999999) ;
840
841							# run the where clause subroutine, if any, and skip if it says to.
842	0	0					if ($wheresub) {
843	0						my $rowdata = bless([\%fieldloc, \@values], 'Text::TabTable::DataRow') ;
844	0	0					next if !&$wheresub($rowdata) ;
845							}
846
847	0	0					if (!$fieldspecs) {
848							# select *. Just print them out.
849	0						print OUTFILE CORE::join("\t", @values), "\n" ;
850							} else {
851	0						my @outvals ;
852
853							# use the @fieldrules to create @outvals from @values.
854	0						foreach my $rule (@fieldrules) {
855	0	0					if (!ref $rule) {
856	0						push @outvals, $values[$rule] ;
857							} else {
858							# it's an arrayref containing a subref and a bunch of column
859							# numbers. Call the subroutine with the values pointed to
860							# by those column numbers and use the return value as the
861							# output field value.
862	0						my @params = map { $values[$_] } @{$rule->[1]} ;
	0
	0
863	0						my $subref = $rule->[0] ;
864	0						push @outvals, scalar(&$subref(@params)) ;
865							}
866							}
867
868	0						print OUTFILE CORE::join("\t", @outvals), "\n" ;
869							}
870							}
871
872	0						close OUTFILE ;
873	0						close INFILE ;
874							}
875
876	0	0					if (!defined wantarray ) {
877	0						carp "Warning: select used in void context." ;
878							}
879
880	0						return bless $newtable, ref $table ;
881							}
882
883							####
884							# Fills in a hash with a mapping from field name to column number.
885							####
886							sub _build_fieldloc
887							{
888	0			0			my ($table, $hr_fieldloc) = @_ ;
889	0						my $pos = 0 ;
890	0						foreach my $field ($table->{fieldlist}->fields()) {
891	0						foreach my $fieldname ($field->names()) {
892	0	0					$hr_fieldloc->{$fieldname} = $pos if !exists $hr_fieldloc->{$fieldname} ;
893							}
894	0						$pos++ ;
895							}
896							}
897
898							####
899							# Runs through the rows of a tab table, calling a subroutine for each
900							# line. The subroutine has the same calling convention as the where
901							# part of a select() call.
902							#
903							# This is like a select(undef,sub {}) but does not return a new table.
904							####
905							sub iterate
906							{
907	0			0	0		my ($table, $wheresub) = @_ ;
908	0	0					carp "iterating over $table->{name}" if $VERBOSE ;
909
910	0	0					open(INFILE, $table->{filename}) \|\| croak "can't open table file" ;
911
912	0	0					die if !$wheresub ;
913
914							# build a hash saying which field is in which position in the input.
915	0						my %fieldloc ;
916	0						$table->_build_fieldloc(\%fieldloc) ;
917
918	0						while() {
919	0						chomp ;
920	0						my @values = split(/\t/, $_, 999999) ;
921
922	0						my $rowdata = bless([\%fieldloc, \@values], 'Text::TabTable::DataRow') ;
923	0						&$wheresub($rowdata) ;
924							}
925
926	0						close INFILE ;
927							}
928
929							####
930							# Returns an object with a next() method, which gives one row object each
931							# time next() is called.
932							#
933							# If -unescape=>1, the tab/backslash/newline escaping will be removed.
934							####
935							sub make_iterator
936							{
937	0			0	0		my ($table, %args) = @_ ;
938	0	0					carp "iterating over $table->{name}" if $VERBOSE ;
939
940	0	0					if ($args{-unescape}) {
941	0						return Text::TabTable::Iterator::Unescaping->new($table) ;
942							} else {
943	0						return Text::TabTable::Iterator->new($table) ;
944							}
945							}
946
947							sub DESTROY
948							{
949	0			0			my $self = shift ;
950	0	0					if (!$self->{dontdelete}) {
951	0						unlink $self->{filename} ;
952							}
953
954	0						foreach my $cdbfile ( values %{$self->{cdb}} ) {
	0
955	0						unlink $cdbfile ;
956							}
957							}
958
959							####
960							# Creates a temporary file and returns its filename.
961							####
962	1			1		15	use vars qw(@TEMPFILES) ;
	1					2
	1					388
963							sub _make_tempfile
964							{
965	0			0			my $watchdog = 0 ;
966	0						while ($watchdog++ < 1000) {
967	0						my $fname = "$TMPDIR/$$." . int(rand(9999999)) ;
968	0						my $status = sysopen TEMPF, $fname, O_CREAT \| O_WRONLY \| O_EXCL, 0666 ;
969	0						close(TEMPF) ;
970	0	0					if (defined $status) {
971	0						push @TEMPFILES, $fname ;
972	0						return $fname ;
973							}
974							}
975	0						die "couldn't create a temporary file\n" ;
976							}
977
978							END {
979							# delete any tempfiles that didn't get deleted. This shouldn't happen.
980	1			1		308	foreach my $file (@TEMPFILES) {
981	0						unlink $file ;
982							}
983							}
984
985
986							###############################################################################
987							# Text::TabTable::Field
988							###############################################################################
989							package Text::TabTable::Field ;
990
991							sub new
992							{
993	0			0			my ($package, $fieldname) = @_ ;
994	0						my $self = { names => [$fieldname] } ;
995	0						bless $self, $package ;
996							}
997
998							sub name
999							{
1000	0			0			return $_[0]->{names}->[0] ;
1001							}
1002
1003							####
1004							# Return all the names for this field.
1005							sub names
1006							{
1007	0			0			return @{$_[0]->{names}} ;
	0
1008							}
1009
1010							sub has_name
1011							{
1012	0			0			my $self = shift ;
1013	0						my $name = shift ;
1014	0	0					if (grep( $_ eq $name, @{$self->{names}})) {
	0
1015	0						return 1 ;
1016							} else {
1017	0						return 0 ;
1018							}
1019							}
1020
1021							####
1022							# Add an alias name to a field.
1023							####
1024							sub add_name
1025							{
1026	0			0			my ($self, @names) = @_ ;
1027	0						push @{$self->{names}}, @names ;
	0
1028							}
1029
1030							####
1031							# sets the name of the field, wiping out all previous aliases.
1032							####
1033							sub set_name
1034							{
1035	0			0			my ($self, $name) = @_ ;
1036	0						$self->{names} = [$name] ;
1037							}
1038
1039							###############################################################################
1040							# Text::TabTable::FieldList
1041							#
1042							# Represents the list of Fields on a table.
1043							###############################################################################
1044							package Text::TabTable::FieldList ;
1045
1046	1			1		5	use Data::Dumper ;
	1					2
	1					113
1047
1048							sub new
1049							{
1050	0			0			my ($package, @fields) = @_ ;
1051
1052	0						bless { fields => \@fields }, $package ;
1053							}
1054
1055							sub deepcopy
1056							{
1057	0			0			my ($self) = @_ ;
1058
1059	1			1		8	no strict ;
	1					2
	1					292
1060	0						my $newent = eval Dumper($self) ;
1061							}
1062
1063							####
1064							# Returns a 1-based column number for the given field, or undef
1065							# if not present.
1066							####
1067							sub find_colnum
1068							{
1069	0			0			my ($self, $fieldname) = @_ ;
1070	0						for (my $i = 0 ; $i < @{$self->{fields}} ; $i++) {
	0
1071	0	0					if ($self->{fields}->[$i]->has_name($fieldname)) {
1072	0						return $i+1 ;
1073							}
1074							}
1075	0						return undef ;
1076							}
1077
1078							####
1079							# Return the number of fields.
1080							####
1081							sub fieldcount
1082							{
1083	0			0			my $self = shift ;
1084	0						return scalar(@{$self->{fields}}) ;
	0
1085							}
1086
1087							####
1088							# Return field names as a tab-delimited string.
1089							####
1090							sub as_string
1091							{
1092	0			0			my $self = shift ;
1093	0						my @names = map {$_->name} @{$self->{fields}} ;
	0
	0
1094	0						return join("\t", @names) ;
1095							}
1096
1097							sub fields
1098							{
1099	0			0			return @{$_[0]->{fields}} ;
	0
1100							}
1101
1102							###############################################################################
1103							# Text::TabTable::Iterator
1104							# Text::TabTable::Iterator::Unescaping
1105							#
1106							# A thing that returns one row at a time from a table.
1107							# The ::Unescaping version will run Text::TabTable::Unescape on all the
1108							# data first.
1109							###############################################################################
1110							package Text::TabTable::Iterator ;
1111
1112							@Text::TabTable::Iterator::Unescaping::ISA = ('MP3Com::TabTable::Iterator') ;
1113
1114	1			1		6	use strict ;
	1					2
	1					33
1115	1			1		5	use Carp ;
	1					1
	1					355
1116
1117							sub new
1118							{
1119	0			0			my ($package, $table) = @_ ;
1120
1121	0						require IO::File ;
1122
1123	0						my %fieldloc ;
1124	0						$table->_build_fieldloc(\%fieldloc) ;
1125
1126	0		0				my $fh = IO::File->new("<$table->{filename}") \|\| croak ;
1127
1128	0						my $self = {
1129							fieldloc => \%fieldloc,
1130							fh => $fh
1131							} ;
1132	0						bless $self, $package ;
1133							}
1134
1135							sub next
1136							{
1137	0			0			my ($self) = @_ ;
1138	0						my $line = $self->{fh}->getline() ;
1139
1140	0	0					if (!$line) {
1141	0						return undef ;
1142	0						delete $self->{fh} ;
1143							}
1144	0						chomp $line ;
1145	0						my @values = split(/\t/, $line, -1) ;
1146
1147	0						return bless([$self->{fieldloc}, \@values], 'Text::TabTable::DataRow') ;
1148							}
1149
1150							sub Text::TabTable::Iterator::Unescaping::next
1151							{
1152	0			0			my ($self) = @_ ;
1153
1154							# get a row and unescape the data in it.
1155
1156	0						my $row = $self->Text::TabTable::Iterator::next() ;
1157	0	0					return undef if !$row ;
1158
1159	0						foreach my $val (@{$row->[1]}) {
	0
1160	0						$val = Text::TabTable::unescape($val) ;
1161							}
1162
1163	0						return $row ;
1164							}
1165
1166							###############################################################################
1167							# Text::TabTable::DataRow
1168							#
1169							# Represents a row of data from a TabTable.
1170							###############################################################################
1171							package Text::TabTable::DataRow ;
1172
1173	1			1		5	use vars qw($AUTOLOAD) ;
	1					2
	1					55
1174	1			1		4	use Carp ;
	1					2
	1					44
1175
1176	1			1		5	use strict ;
	1					2
	1					300
1177
1178							# This constructor is not actually used by select(); it blesses the
1179							# right structure itself for speed purposes.
1180							sub new
1181							{
1182	0			0			my ($package, $name2colhash, $values) = @_ ;
1183	0						bless [$name2colhash, $values], $package ;
1184							}
1185
1186							sub getvalue
1187							{
1188	0			0			my $self = shift ;
1189	0						my $name = shift ;
1190	0						return $self->[1][ $self->[0]{$name} ] ;
1191							}
1192
1193							sub setvalue
1194							{
1195	0			0			my $self = shift ;
1196	0						my $name = shift ;
1197	0						my $newval = shift ;
1198	0						$self->[1][$self->[0]{$name}] = $newval ;
1199							}
1200
1201							# to save work for autoload.
1202	0			0			sub DESTROY {} ;
1203
1204							####
1205							# implements field-named methods for getting values.
1206							####
1207							sub AUTOLOAD
1208							{
1209	0			0			my $self = shift ;
1210
1211	0						my $name = $AUTOLOAD ;
1212	0						$name =~ s/.*:// ;
1213
1214	0	0					if (!exists $self->[0]{$name}) {
1215	0						croak "No $name field in table" ;
1216							}
1217
1218							# create a function to calculate it.
1219	0						eval <
1220							sub $name {
1221							return \$_[0]->[1][ \$_[0]->[0]{'$name'} ] ;
1222							}
1223							EOT
1224
1225	0						return $self->$name() ;
1226							}
1227
1228
1229							1;