File Coverage

blib/lib/Text/Shoebox.pm
Criterion Covered Total %
statement 124 160 77.5
branch 66 126 52.3
condition 10 42 23.8
subroutine 8 9 88.8
pod 4 4 100.0
total 212 341 62.1


line stmt bran cond sub pod time code
1              
2             # Time-stamp: "2004-04-03 20:20:51 ADT"
3             require 5;
4             package Text::Shoebox;
5 4     4   13572 use strict;
  4         8  
  4         155  
6 4     4   4489 use integer; # we don't need noninteger math in here
  4         195  
  4         25  
7 4     4   144 use Carp qw(carp croak);
  4         14  
  4         306  
8 4     4   24 use vars qw(@ISA @EXPORT $Debug $VERSION %p);
  4         13  
  4         12959  
9             require Exporter;
10             require UNIVERSAL;
11             @ISA = qw(Exporter);
12             @EXPORT = qw(read_sf write_sf are_hw_keys_uniform are_hw_values_unique);
13              
14             $Debug = 0 unless defined $Debug;
15             $VERSION = "1.02";
16              
17             =head1 NAME
18              
19             Text::Shoebox - read and write SIL Shoebox Standard Format (.sf) files
20              
21             =head1 SYNOPSIS
22              
23             use Text::Shoebox;
24             my $lex = [];
25             foreach my $file (@ARGV) {
26             read_sf(
27             from_file => $file, into => $lex,
28             ) or warn "read from $file failed\n";
29             }
30             print scalar(@$lex), " entries read.\n";
31            
32             die "hw field-names differ\n"
33             unless are_hw_keys_uniform($lex);
34             warn "hw field-values aren't unique\n"
35             unless are_hw_values_unique($lex);
36            
37             write_sf(from => $lex, to_file => "merged.sf")
38             or die "Couldn't write to merged.sf: $!";
39              
40             =head1 DESCRIPTION
41              
42             The Summer Institute of Linguistics (C) makes a
43             piece of free software called "the Linguist's Shoebox", or just
44             "Shoebox" for short. It's a simple database program generally used
45             for making lexicon databases (altho it can also be used for databases
46             of field notes, etc.).
47              
48             Shoebox can export its databases to SF (Standard Format) files, a
49             simple text format. Reading and writing those SF files is what this
50             Perl module, Text::Shoebox, is for. (I have heard that Standard Format
51             predates Shoebox quite a bit, and is used by other programs. If you
52             use SF files with something other than Shoebox, I'd be interested in
53             hearing about it, particularly about whether such files and
54             Text::Shoebox are happily compatible.)
55              
56             =head1 OBJECT-ORIENTED INTERFACE
57              
58             This module provides a functional interface. If you want an
59             object-oriented interface, with a bit more convenience, then see
60             the classes L and L.
61              
62              
63             =head1 FUNCTIONS
64              
65             =over
66              
67             =item $lex_lol = read_sf(...options...)
68              
69             Reads entries in Standard Format from the source specified. If no
70             entries were read, returns false. Otherwise, returns a reference to
71             the array that the entries were added to (which will be a new array,
72             unless the "into" option is set). If there's an I/O error while reading
73             (like if you specify an unreadable file), then this routine dies.
74              
75             The options are:
76              
77             =over
78              
79             =item from_file => STRING
80              
81             This specifies that the source of the SF data is a file, whose
82             filespec is given.
83              
84             =item from_handle => FILEHANDLE
85              
86             This specifies that the source of the SF data is a given filehandle.
87             (Examples of filehandle values: a global filehandle passed either
88             like C<*MYFH{IO}> or C<*MYFH>; or an object value from an IO class like
89             IO::Socket or IO::Handle.)
90              
91             The filehandle isn't closed when all its data is read.
92              
93             =item rs => STRING
94              
95             This specifies that the given string should be used as the record
96             separator (newline string) for the data source being read from.
97              
98             If the SF source is specified by a "from_file" option, and you don't
99             specify an "rs" option, then Text::Shoebox will try guessing the line
100             format of the file by reading the first 2K of the file and looking for
101             a CRLF ("\cm\cj"), an LF ("\cj"), or a CR ("\cm"). If you need to
102             stop it from trying to guess, just stipulate an "rs" value of C<$/>.
103              
104             If the SF source is specified by a "from_handle" option, and you don't
105             specify an "rs" option, then Text::Shoebox will just use the value in
106             the Perl variable C<$/> (the global RS value).
107              
108             =item into => ARRAYREF
109              
110             If this option is stipulated, then entries read will be pushed to the
111             end of the array specified. Otherwise the entries will be put into a
112             new array.
113              
114             =back
115              
116             Example use:
117              
118             use Text::Shoebox;
119             my $lexicon = read_sf(from_file => 'foo.sf')
120             || die "No entries?";
121             print scalar(@$lexicon), " entries read.\n";
122             print "First entry has ",
123             @{ $lexicon->[0] } / 2 , " fields.\n";
124              
125             =cut
126              
127             sub read_sf {
128 21     21 1 735 my(%options) = @_;
129              
130 21         26 my($target);
131 21 100       56 if(exists $options{'into'} ) {
132 12         25 $target = $options{'into'};
133             } else {
134 9         15 $target = [];
135             }
136              
137 21         92 local $/ = $/;
138 21         26 my($fh, $to_close);
139 21 50       77 if( exists $options{'from_handle'}) {
    50          
140 0         0 $fh = $options{'from_handle'};
141 0 0       0 $/ = $options{'rs'} if exists $options{'rs'};
142             # otherwise use default $/ value
143             } elsif(exists $options{'from_file'}) {
144 21         53 local *IN;
145 21         35 my $from_file = $options{'from_file'};
146 21 50       957 open(IN, "<$from_file") or croak "Can't read-open $from_file: $!";
147 21         56 binmode(IN);
148 21         41 $fh = *IN{IO};
149 21         31 $to_close = 1;
150              
151 21 100       51 if(exists $options{'rs'}) {
152 11         37 $/ = $options{'rs'};
153             } else {
154 10         13 my $chunk;
155 10         297 read($fh, $chunk, 2048); # should be plenty long enough!
156 10         77 seek($fh, 0,0); # rewind
157            
158             # All the NL formats I know about...
159 10 50 33     136 if(defined $chunk and $chunk =~ m<(\cm\cj|\cm|\cj)>s) {
160 10         50 $/ = $1;
161             } else {
162 0 0       0 print "Couldn't set \$/ for some reason.\n" if $Debug;
163             # Otherwise let it stand.
164             }
165             }
166             } else {
167 0         0 croak "read_sf needs an option specifying input source";
168             }
169              
170 21 50       50 print "\$/ is ", unpack("H*", $/), "\n" if $Debug;
171              
172             #my $lines_so_far = 0;
173 21         28 my $line; # current line
174             my $hw_field; # set from the first real field name we see
175 0         0 my @new_entries; # to fill up with new things from this file
176 21         25 my $Debug = $Debug; # lexical for speed
177 21         26 my $last_field_was_comment = 0;
178              
179 21         248 while(defined($line = <$fh>)) {
180 210         328 chomp($line);
181             #next if !defined($hw_field) and
182             ## ++$lines_so_far == 1 and
183 210 50 66     757 if(length $line > 1 and substr($line,0,2) eq '\_') {
184 0         0 $last_field_was_comment = 1;
185 0         0 next;
186             }
187              
188 210 100       551 if($line =~ m<^\\(\S+) ?(.*)>s) { # It's a normal "\foo val" line...
189             # This is the typical line in typical .sf files
190              
191             # That RE matches either "\foo" or "\foo bar..."
192             # (Because the \S+ stops either at end-of-string, or at ' '.
193             # Note that in either case, $2 is never undef.
194              
195 105 50       197 print "<$line> parses as <$1> = <$2>\n" if $Debug > 1;
196 105         108 $last_field_was_comment = 0;
197 105 100       172 if(@new_entries) {
198 84 100       162 if($1 eq $hw_field) { # it's a non-initial new entry
199             # A new entry means that the preceding entry's last value got
200             # one too many \n's at the end. So chop it.
201             # (Assumes "\n" is a single byte long; safe, I hope.)
202 21 50       69 chop($new_entries[-1][-1])
203             if substr($new_entries[-1][-1], -1, 1) eq "\n";
204              
205             # Start off a new entry
206 21         116 push @new_entries, [$1, $2];
207             } else {
208 63         61 push @{$new_entries[-1]}, $1, $2;
  63         380  
209             # This is all that happens to typical lines:
210             # Just tack more items to the end of the last entry.
211             }
212             } else { # No entries seen yet
213 21         42 $hw_field = $1;
214             # First field ever seen (ignoring _sh).
215             # That must be the headword field! Note it as such.
216              
217             # Now start off a new entry (the first, it so happens)
218 21         145 push @new_entries, [$1, $2];
219             }
220              
221             } else { # It's a continuation line...
222 105 50       190 next if $last_field_was_comment; # just toss this.
223              
224 105 50       159 print "<$line> is a continuation line.\n" if $Debug > 1;
225 105 50       163 if(@new_entries) { # expected case!
226 105         122 $line =~ s<^ \\><\\>s;
227             # Continuations starting with '\' get a leading space put on
228             # the front them -- so take it off. (Even tho it could have
229             # originated as a real ' \'.)
230            
231 105         533 $new_entries[-1][-1] .= "\n" . $line;
232             # So, yes, newline in the file ($/) turns into "\n".
233             # Tack this line onto the end of the last value in the last new entry
234            
235             } else { # most unexpected -- continuation of nothing!
236 0 0       0 warn "line \"$line\" is a continuation, but of what?"
237             if $line =~ m<\S>s;
238             # (but forgive such things if they're pure whitespace)
239             }
240             } # end of continuation line
241             } # end while() over the lines
242            
243 21 50       402 close($fh) if $to_close;
244            
245 21 50       56 print "All read: ", scalar(@new_entries), " entries read.\n" if $Debug;
246            
247 21 50       84 return 0 unless @new_entries;
248            
249 21         42 push @$target, @new_entries;
250            
251 21         117 return $target;
252             }
253              
254             #--------------------------------------------------------------------------
255              
256             =item write_sf(...options...)
257              
258             This writes the given lexicon, in Standard Format, to the destination
259             specified. If all entries were written, returns true; otherwise (in
260             case of an IO error), returns false, in which case you should
261             check C<$!>. Note that this routine I die in the case of
262             an I/O error, so you should always check the return value of this
263             function, as with:
264              
265             write_sf(...) || die "Couldn't write: $!";
266              
267             The options are:
268              
269             =over
270              
271             =item from => ARRAYREF
272              
273             This option must be present, to specify the lexicon that you want to
274             write out.
275              
276             =item to_file => STRING
277              
278             This specifies that the SF data is to be written to the file
279             specified. (Note that the file is opened in overwrite mode, not
280             append mode.)
281              
282             =item to_handle => FILEHANDLE
283              
284             This specifies that the destination for the SF data is the given
285             filehandle.
286              
287             The filehandle isn't closed when all the data is written to it.
288              
289             =item rs => STRING
290              
291             This specifies that the given string should be used as the record
292             separator (newline string) for the SF data written.
293              
294             If not specified, defaults to "\n".
295              
296             =back
297              
298             =cut
299              
300             sub write_sf {
301 11     11 1 1183 my(%options) = @_;
302 11         15 my $from;
303 11 50       28 if(exists $options{'from'}) {
304 11         21 $from = $options{'from'};
305             } else {
306 0 0 0     0 croak("'from' should be a reference")
307             unless defined $from and ref $from;
308             }
309              
310 11         15 my($fh, $to_close);
311 11 50       39 if(exists $options{'to_handle'}) {
    50          
312 0         0 $fh = $options{'to_handle'};
313 0 0       0 print "Writing to $fh from object $from\n" if $Debug;
314             } elsif(exists $options{'to_file'}) {
315             # passed a filespec
316 11         28 local *OUT;
317 11         20 my $dest = $options{'to_file'};
318 11 50       27 print "Writing to $dest from object $from\n" if $Debug;
319 11 50       1313 open(OUT, ">$dest") or return 0;
320 11         30 $fh = *OUT{IO};
321 11         51 binmode($fh);
322             } else {
323 0         0 croak "write_sf needs either a to_handle or a to_file option";
324             }
325              
326 11         28 my $nl;
327 11 100       36 if(exists $options{'rs'}) {
328 9         17 $nl = $options{'rs'};
329             # Some sanity checks:
330 9 50       24 croak "rs can't be undef" unless defined $nl;
331 9 50       21 croak "rs can't be empty-string" unless length $nl;
332 9 50       22 croak "rs can't be a reference" if ref $nl;
333             } else {
334 2         5 $nl = "\n";
335             }
336              
337 11         24 my $qnl = quotemeta $nl;
338              
339 11         28 my $nl_is_weird = 0;
340 11 100       63 $nl_is_weird = 1 unless $nl =~ m<^[\cm\cj]+$>s;
341              
342 11         18 my $am_first_entry = 1;
343 11         14 my($k, $v, $i, $i_entry, $e); # scratch vars
344              
345             Entry:
346 11         51 for($i_entry = 0; $i_entry < @$from; ++$i_entry) {
347 22 50 66     166 unless(defined(
      33        
348             $e = $from->[$i_entry] # copy the entry ref
349             ) and (
350             ref $e eq 'ARRAY'
351             or UNIVERSAL::isa($e, 'ARRAY')
352             )
353             ) {
354 0 0       0 print "Skipping $e -- not an entry\n" if $Debug;
355 0         0 Carp::cluck "Skipping $e -- not an entry";
356 0         0 next Entry;
357             }
358 22 50       57 unless(@$e) {
359 0 0       0 print "Skipping $e -- a null entry\n" if $Debug;
360 0         0 next Entry;
361             }
362              
363 22 100       37 if($am_first_entry) {
364 11         17 $am_first_entry = undef; # do nothing but turn it off.
365             } else { # print a NL before every entry except the first
366 11 50       47 return 0 unless print $fh $nl;
367             }
368              
369             Field:
370 22         64 for($i = 0; $i < @$e; $i += 2) { # iterate across keys
371 55 50 33     254 unless(defined(
372             $k = $e->[$i] # copy the key
373             ) and length $k
374             ) {
375 0         0 next Field;
376             }
377              
378 55 100       84 if($nl_is_weird) {
379 10         43 $k =~ s<$qnl><>g; # basic attempt at sanity.
380 10         15 $k =~ tr< ><>d;
381             # Up to the user to keep [\cm\cj\t] out of the keys!
382             } else {
383 45         77 $k =~ tr<\cm\cj\t ><>d; # basic sanity for any sane NL value
384             }
385              
386 55 50       683 unless(length $k) {
387 0 0       0 carp "Key field in lexicon->[ $i_entry ][ $i ] is null!\n" if $Debug;
388 0         0 next Field;
389             }
390              
391 55 50       147 if(defined(
392             $v = $e->[1 + $i] # copy value
393             )) {
394 55 100       108 if(length $v) {
395 44         67 $v =~ s<\n\\><\n \\>g;
396 44 100       126 $v =~ s<\n><$nl>g if $nl ne "\n"; # swap NL
397             }
398             } else {
399 0         0 $v = '';
400             }
401              
402 55 100       412 return 0 unless # return if there's an error in printing
    50          
403             length($v) ? (print $fh "\\", $k, ' ', $v, $nl) # "\foo bar" + NL
404             : (print $fh "\\", $k, $nl) # "\foo" + NL
405             ;
406             }
407             }
408 11 50       39 close($fh) if $to_close;
409 11         644 return 1;
410             }
411              
412             #--------------------------------------------------------------------------
413              
414             =item are_hw_keys_uniform($lol)
415              
416             This function returns true iff all the entries in the lexicon have the
417             same key for their headword fields (i.e., the first field per record).
418             This will always be true if you read the lexicon from one file; but if
419             you read it from several, it's possible that the different files have
420             different keys marking headword fields.
421              
422             =cut
423              
424             sub are_hw_keys_uniform {
425 1 50   1 1 188 carp('Wrong number of arguments to are_hw_keys_uniform'), return 0
426             unless @_ == 1;
427 1         3 my $lex = $_[0];
428 1 50 0     8 $Debug && carp('Argument to are_hw_keys_uniform isn\'t a listref'), return 0
      33        
429             unless defined $lex and ref $lex eq 'ARRAY';
430 1 50 0     3 $Debug && carp('Empty lexicon to are_hw_keys_uniform'), return 0
431             unless @$lex;
432              
433 1         3 my($hw_key, $e, $i);
434 1         6 for(my $i = 0; $i < @$lex; ++$i) {
435 2 50       3 next unless @{$e = $lex->[$i]}; # just skip null entries, I guess.
  2         6  
436 2 50 0     7 $Debug && carp("Entry $i has an undef headword"), return 0
437             unless defined $e->[0];
438 2 100       4 if(defined($hw_key)) {
439 1 50       6 if($e->[0] ne $hw_key) {
440 0 0       0 carp("Entry $i\'s hw key \"" . $e->[0] .
441             "\" differs from previous hw key \"$hw_key\"") if $Debug;
442 0         0 return 0;
443             }
444             } else {
445 1         4 $hw_key = $e->[0];
446             }
447             }
448 1 50 0     3 $Debug && carp("Entry $i\'s hw key \"" . $e->[0]), return 0
449             unless defined $hw_key;
450            
451 1         4 return 1; # all fine.
452             }
453              
454             #--------------------------------------------------------------------------
455              
456             =item are_hw_values_unique($lex_lol)
457              
458             This function returns true iff all the headword values in all non-null
459             entries in the lexicon $lol are unique -- i.e., if no two (or more)
460             entries have the same values for their headword fields. I don't know
461             if uniqueness is a requirement for SF lexicons that you'd want to
462             import into Shoebox, but some tasks you put lexicons to might require
463             it.
464              
465             =cut
466              
467             sub are_hw_values_unique {
468 2     2 1 4 my %seen;
469 2         14 foreach my $e (@{$_[0]}) {
  2         5  
470 5 50 66     38 return 0 if @$e and $seen{ defined($e->[1]) ? $e->[1] : '' }++;
    100          
471             }
472 1         5 return 1; # no duplicates found
473             }
474              
475             #--------------------------------------------------------------------------
476             %p = (
477             ( map {; (chr($_), sprintf('\x%02X',$_)) } 0.. 255 ),
478             "\a" => '\a', # ding!
479             "\b" => '\b', # BS
480             "\e" => '\e', # ESC
481             "\f" => '\f', # FF
482             "\t" => '\t', # tab
483             "\cm" => '\cm',
484             "\cj" => '\cj',
485             "\n" => '\n', # presumably overrides one of either \cm or \cj
486             '"' => '\"',
487             '\\' => '\\\\',
488             '$' => '\\$',
489             '@' => '\\@',
490             '%' => '\\%',
491             '#' => '\\#',
492             );
493              
494             sub _dump {
495 0     0     my $lol = $_[0];
496              
497 0           print "[ #", scalar(@$lol), " entries...\n";
498              
499 0           my $safe;
500 0           my $toggle = 0;
501 0           foreach my $e (@$lol) {
502 0 0 0       next unless defined $e and ref $e and UNIVERSAL::isa($e, 'ARRAY');
      0        
503 0           print " [ ";
504 0           foreach my $v (@$e) {
505 0           ($safe = $v) =~
506             s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E\xA1-\xFE])>
507 0           <$p{$1}>eg;
508 0 0         print(
509             ($toggle ^= 1) ? qq{"$safe" => } : qq{"$safe", \n }
510             );
511             }
512 0           print "],\n";
513             }
514 0           print "];\n";
515             }
516              
517             #--------------------------------------------------------------------------
518              
519             =back
520              
521             =head1 A NOTE ABOUT VALIDITY
522              
523             I make very few assumptions about what characters can be in a field
524             key in SF files. Just now, I happen to assume they can't start with
525             an underscore (lest they be considered comments), and can't contain
526             any whitespace characters.
527              
528             I make essentially no assumptions about what can be in a field value,
529             except that there can be no newline followed immediately by a
530             backslash. (Any newline-backslash sequence in turned into
531             newline-space-backslash.)
532              
533             You should be aware that Shoebox, or whatever other programs use SF
534             files, may have a I more restricted view of what can be in a
535             field key or value.
536              
537             =head1 SEE ALSO
538              
539             L
540              
541             L
542              
543             =head1 COPYRIGHT
544              
545             Copyright 2000-2004, Sean M. Burke C, all rights
546             reserved. This program is free software; you can redistribute it
547             and/or modify it under the same terms as Perl itself.
548              
549             =head1 AUTHOR
550              
551             Sean M. Burke, C
552              
553             Please contact me if you find that this module is not behaving
554             correctly. I've tested it only on Shoebox files I generate on my own.
555              
556             I hasten to point out, incidentally, that I am not in any way
557             affiliated with the Summer Institute of Linguistics.
558              
559             =cut
560              
561             1;
562              
563             __END__