File Coverage

blib/lib/Boulder/Stream.pm
Criterion Covered Total %
statement 138 202 68.3
branch 41 92 44.5
condition 17 40 42.5
subroutine 21 26 80.7
pod 4 16 25.0
total 221 376 58.7


line stmt bran cond sub pod time code
1             package Boulder::Stream;
2              
3             # CHANGE HISTORY:
4              
5             # version 1.07
6             # patches from Andy Law to quash warnings under -w switch
7              
8             # changes from 1.04 to 1.05
9             # - new() will now accept filehandle globs, IO::File, and FileHandle objects
10              
11             # changes from 1.03 to 1.04
12             # - Fixed regexp bug that broke on tags with embedded spaces -pete
13              
14             # Changes from 1.01 to 1.03
15             # - Fixed a problem in escaping the {} characters
16              
17             # Changes from 1.00 to 1.01
18             # - Added the asTable() method to Boulder::Stream
19              
20             =head1 NAME
21              
22             Boulder::Stream - Read and write tag/value data from an input stream
23              
24             =head1 SYNOPSIS
25              
26             #!/bin/perl
27             # Read a series of People records from STDIN.
28             # Add an "Eligible" tag to all those whose
29             # Age >= 35 and Friends list includes "Fred"
30             use Boulder::Stream;
31            
32             # filestream way:
33             my $stream = Boulder::Stream->newFh;
34             while ( my $record = <$stream> ) {
35             next unless $record->Age >= 35;
36             my @friends = $record->Friends;
37             next unless grep {$_ eq 'Fred'} @friends;
38              
39             $record->insert(Eligible => 'yes');
40             print $stream $record;
41             }
42              
43             # object oriented way:
44             my $stream = Boulder::Stream->new;
45             while (my $record = $stream->get ) {
46             next unless $record->Age >= 35;
47             my @friends = $record->Friends;
48             next unless grep {$_ eq 'Fred'} @friends;
49              
50             $record->insert(Eligible => 'yes');
51             print $stream $record;
52             }
53              
54              
55              
56             =head1 DESCRIPTION
57              
58             Boulder::Stream provides stream-oriented access to L IO
59             hierarchical tag/value data. It can be used in a magic tied
60             filehandle mode, as shown in the synopsis, or in object-oriented mode.
61             Using tied filehandles, L objects are read from input using the
62             standard <> operator. Stone objects printed to the tied filehandle
63             appear on the output stream in L format.
64              
65             By default, data is read from the magic ARGV filehandle (STDIN or a
66             list of files provided on the command line) and written to STDOUT.
67             This can be changed to the filehandles of your choice.
68              
69             =head2 Pass through behavior
70              
71             When using the object-oriented form of Boulder::Stream, tags which
72             aren't specifically requested by the get() method are passed through
73             to output unchanged. This allows pipes of programs to be constructed
74             easily. Most programs will want to put the tags back into the boulder
75             stream once they're finished, potentially adding their own. Of course
76             some programs will want to behave differently. For example, a
77             database query program will generate but not read a B
78             stream, while a report generator will read but not write the stream.
79              
80             This convention allows the following type of pipe to be set up:
81              
82             query_database | find_vector | find_dups | \
83             | blast_sequence | pick_primer | mail_report
84              
85             If all the programs in the pipe follow the conventions, then it will be
86             possible to interpose other programs, such as a repetitive element finder,
87             in the middle of the pipe without disturbing other components.
88              
89             =head1 SKELETON BOULDER PROGRAM
90              
91             Here is a skeleton example.
92              
93             #!/bin/perl
94             use Boulder::Stream;
95            
96             my $stream = Boulder::Stream->newFh;
97            
98             while ( my $record = <$stream> ) {
99             next unless $record->Age >= 35;
100             my @friends = $record->Friends;
101             next unless grep {$_ eq 'Fred'} @friends;
102              
103             $record->insert(Eligible => 'yes');
104             print $stream $record;
105             }
106              
107             The code starts by creating a B object to handle the
108             I/O. It reads from the stream one record at a time, returning a
109             L object. We recover the I and I tags, and
110             continue looping unless the Age is greater or equal to 35, and the
111             list of Friends contains "Fred". If these criteria match, then we
112             insert a new tag named Eligible and print the record to the stream.
113             The output may look like this:
114              
115             Name=Janice
116             Age=36
117             Eligible=yes
118             Friends=Susan
119             Friends=Fred
120             Friends=Ralph
121             =
122             Name=Ralph
123             Age=42
124             Eligible=yes
125             Friends=Janice
126             Friends=Fred
127             =
128             Name=Susan
129             Age=35
130             Eligible=yes
131             Friends=Susan
132             Friends=Fred
133             =
134              
135             Note that in this case only records that meet the criteria are echoed
136             to standard output. The object-oriented version of the program looks
137             like this:
138              
139             #!/bin/perl
140             use Boulder::Stream;
141            
142             my $stream = Boulder::Stream->new;
143            
144             while ( my $record = $stream->get('Age','Friends') ) {
145             next unless $record->Age >= 35;
146             my @friends = $record->Friends;
147             next unless grep {$_ eq 'Fred'} @friends;
148              
149             $record->insert(Eligible => 'yes');
150             $stream->put($record);
151             }
152              
153             The get() method is used to fetch Stones containing one or more of the
154             indicated tags. The put() method is used to send the result to
155             standard output. The pass-through behavior might produce a set of
156             records like this one:
157              
158             Name=Janice
159             Age=36
160             Eligible=yes
161             Friends=Susan
162             Friends=Fred
163             Friends=Ralph
164             =
165             Name=Phillip
166             Age=30
167             =
168             Name=Ralph
169             Age=42
170             Eligible=yes
171             Friends=Janice
172             Friends=Fred
173             =
174             Name=Barbara
175             Friends=Agatha
176             Friends=Janice
177             =
178             Name=Susan
179             Age=35
180             Eligible=yes
181             Friends=Susan
182             Friends=Fred
183             =
184              
185             Notice that there are now two records ("Phillip" and "Barbara") that
186             do not contain the Eligible tag.
187              
188             =head1 Boulder::Stream METHODS
189              
190             =head2 $stream = Boulder::Stream->new(*IN,*OUT)
191              
192             =head2 $stream = Boulder::Stream->new(-in=>*IN,-out=>*OUT)
193              
194             The B method creates a new B object. You can
195             provide input and output filehandles. If you leave one or both
196             undefined B will default to standard input or standard output.
197             You are free to use files, pipes, sockets, and other types of file
198             handles. You may provide the filehandle arguments as bare words,
199             globs, or glob refs. You are also free to use the named argument style
200             shown in the second heading.
201              
202             =head2 $fh = Boulder::Stream->newFh(-in=>*IN, -out=>*OUT)
203              
204             Returns a filehandle object tied to a Boulder::Stream object. Reads
205             on the filehandle perform a get(). Writes invoke a put().
206              
207             To retrieve the underlying Boulder::Stream object, call Perl's
208             built-in tied() function:
209              
210             $stream = tied $fh;
211              
212             =head2 $stone = $stream->get(@taglist)
213              
214             =head2 @stones = $stream->get(@taglist)
215              
216             Every time get() is called, it will return a new Stone object. The
217             Stone will be created from the input stream, using just the tags
218             provided in the argument list. Pass no tags to receive whatever tags
219             are present in the input stream.
220              
221             If none of the tags that you specify are in the current boulder
222             record, you will receive an empty B. At the end of the input
223             stream, you will receive B.
224              
225             If called in an array context, get() returns a list of all stones from
226             the input stream that contain one or more of the specified tags.
227              
228             =head2 $stone = $stream->read_record(@taglist)
229              
230             Identical to get(>, but the name is longer.
231              
232             =head2 $stream->put($stone)
233              
234             Write a B to the output filehandle.
235              
236             =head2 $stream->write_record($stone)
237              
238             Identical to put(), but the name is longer.
239              
240             =head2 Useful State Variables in a B
241              
242             Every Boulder::Stream has several state variables that you can adjust.
243             Fix them in this fashion:
244              
245             $a = new Boulder::Stream;
246             $a->{delim}=':';
247             $a->{record_start}='[';
248             $a->{record_end}=']';
249             $a->{passthru}=undef;
250              
251             =over 4
252              
253             =item * delim
254              
255             This is the delimiter character between tags and values, "=" by default.
256              
257             =item * record_start
258              
259             This is the start of nested record character, "{" by default.
260              
261             =item * record_end
262              
263             This is the end of nested record character, "}" by default.
264              
265             =item * passthru
266              
267             This determines whether unrecognized tags should be passed through
268             from the input stream to the output stream. This is 'true' by
269             default. Set it to undef to override this behavior.
270              
271             =back
272              
273             =head1 BUGS
274              
275             Because the delim, record_start and record_end characters in the
276             B object are used in optimized (once-compiled)
277             pattern matching, you cannot change these values once get() has once
278             been called. To change the defaults, you must create the
279             Boulder::Stream, set the characters, and only then begin reading from
280             the input stream. For the same reason, different Boulder::Stream
281             objects cannot use different delimiters.
282              
283             =head1 AUTHOR
284              
285             Lincoln D. Stein , Cold Spring Harbor Laboratory,
286             Cold Spring Harbor, NY. This module can be used and distributed on
287             the same terms as Perl itself.
288              
289             =head1 SEE ALSO
290              
291             L,
292             L, L, L, L,
293             L, L
294              
295             =cut
296              
297             require 5.004;
298 2     2   3257 use strict;
  2         4  
  2         77  
299 2     2   1849 use Stone;
  2         8  
  2         196  
300 2     2   9 use Carp;
  2         4  
  2         148  
301 2     2   1782 use Symbol();
  2         2104  
  2         52  
302              
303 2     2   12 use vars '$VERSION';
  2         4  
  2         5997  
304             $VERSION=1.07;
305              
306             # Pseudonyms and deprecated methods.
307             *get = \&read_record;
308             *next = \&read_record;
309             *put = \&write_record;
310              
311             # Call this with IN and OUT filehandles of your choice.
312             # If none specified, defaults to <>/STDOUT.
313             sub new {
314 2     2 1 155 my $package = shift;
315 2         11 my ($in,$out) = rearrange(['IN','OUT'],@_);
316              
317 2   50     10 $in = $package->to_fh($in) || \*main::ARGV;
318 2   50     5 $out = $package->to_fh($out,1) || \*main::STDOUT;
319 2         6 my $pack = caller;
320              
321 2         47 return bless {
322             'IN'=>$in,
323             'OUT'=>$out,
324             'delim'=>'=',
325             'record_stop'=>"=\n",
326             'line_end'=>"\n",
327             'subrec_start'=>"\{",
328             'subrec_end'=>"\}",
329             'binary'=>'true',
330             'passthru'=>'true'
331             },$package;
332             }
333              
334             # You are free to redefine the following magic variables:
335             # $a = new Boulder::Stream;
336             # $a->{delim} separates tag = value ['=']
337             # $a->{line_end} separates tag=value pairs [ newline ]
338             # $a->{record_stop} ends records ["=\n"]
339             # $a->{subrec_start} begins a nested record [ "{" ]
340             # $a->{subrec_end} ends a nested record [ "}" ]
341             # $a->{passthru} if true, passes unread tags -> output [ 'true' ]
342             # $a->{binary} if true, escapes and unescapes records [ 'true' ]
343              
344             # Since escaping/unescaping has some overhead, you might want to undef
345             # 'binary' in order to improve performance.
346              
347             # Read in and return a Rolling Stone record. Will return
348             # undef() when an empty record is hit. You can specify
349             # keys that you are interested in getting, as in the
350             # original boulder package.
351             sub read_one_record {
352 5     5 0 5 my($self,@keywords) = @_;
353              
354 5 50       10 return if $self->done;
355              
356 5         6 my(%interested,$key,$value);
357 5         6 grep($interested{$_}++,@keywords);
358              
359 5         7 my $out=$self->{OUT};
360 5         5 my $delim=$self->{'delim'};
361 5         6 my $subrec_start=$self->{'subrec_start'};
362 5         5 my $subrec_end=$self->{'subrec_end'};
363 5         6 my ($pebble,$found);
364              
365             # This is a small hack to ensure that we respect the
366             # record delimiters even when we don't make an
367             # intervening record write.
368 5 50 66     45 if (!$self->{WRITE} && $self->{INVOKED} && !$self->{LEVEL}
      100        
      66        
      66        
369             && $self->{'passthru'} && $self->{PASSED}) {
370 0         0 print $out ($self->{'record_stop'});
371             } else {
372 5         7 $self->{INVOKED}++; # keep track of our invocations
373             }
374              
375 5         7 undef $self->{WRITE};
376 5         5 undef $self->{PASSED};
377              
378 5         12 my $stone = new Stone();
379              
380 5         6 while (1) {
381              
382 21 100       33 last unless $_ = $self->next_pair;
383              
384 19 50       39 if (/^#/) {
385 0 0       0 print $out ("$_$self->{line_end}") if $self->{'passthru'};
386 0         0 next;
387             }
388              
389 19 50       78 if (/^\s*$delim/o) {
390 0         0 undef $self->{LEVEL};
391 0         0 last;
392             }
393              
394 19 100       43 if (/$subrec_end$/o) {
395 3 50       9 $self->{LEVEL}--,last if $self->{LEVEL};
396 0 0       0 print $out ("$_$self->{line_end}") if $self->{'passthru'};
397 0         0 next;
398             }
399              
400 16 50       103 next unless ($key,$value) = /^\s*(.+?)\s*$delim\s*(.*)/o;
401              
402 16 50 33     42 if ((!@keywords) || $interested{$key}) {
    0          
403              
404 16         16 $found++;
405 16 100       51 if ($value=~/^\s*$subrec_start/o) {
406 3         11 $self->{LEVEL}++;
407 3         10 $pebble = read_one_record($self); # call ourselves recursively
408 3 50       13 $pebble = new Stone() unless defined($pebble); # an empty record is still valid
409 3         7 $stone->insert($self->unescapekey($key)=>$pebble);
410 3         6 next;
411             }
412              
413 13         23 $stone->insert($self->unescapekey($key)=>$self->unescapeval($value));
414              
415             } elsif ($self->{'passthru'}) {
416 0         0 print $out ("$_$self->{line_end}");
417 0         0 $self->{PASSED}++; # flag that we will need to write a record delimiter
418             }
419             }
420            
421 5 50       9 return undef unless $found;
422 5         13 return $stone;
423             }
424              
425             # Write out the specified Stone record.
426             sub write_record {
427 2     2 1 4 my($self,@stone)=@_;
428 2         5 for my $stone (@stone) {
429 2         4 $self->{'WRITE'}++;
430 2         3 my $out=$self->{OUT};
431              
432             # Write out a Stone record in boulder format.
433 2         2 my ($key,$value,@value);
434 2         6 foreach $key ($stone->tags) {
435 7         17 @value = $stone->get($key);
436 7         12 $key = $self->escapekey($key);
437 7         11 foreach $value (@value) {
438 7 50       13 next unless ref $value;
439 7 100       18 if (exists $value->{'.name'}) {
440 4         7 $value = $self->escapeval($value);
441 4         13 print $out ("$key$self->{delim}$value\n");
442             } else {
443 3         7 print $out ("$key$self->{delim}$self->{subrec_start}\n");
444 3         7 _write_nested($self,1,$value);
445             }
446             }
447             }
448 2         8 print $out ("$self->{delim}\n");
449             }
450 2         6 1;
451             }
452              
453             # read_record() returns one stone if called in a scalar
454             # context and all the stones if called in an array
455             # context.
456             sub read_record {
457 3     3 1 5 my($self,@tags) = @_;
458 3 50       5 if (wantarray) {
459 0         0 my(@result,$s);
460 0         0 while (!$self->done) {
461 0         0 $s = $self->read_one_record(@tags);
462 0 0       0 push(@result,$s) if $s;
463             }
464 0         0 return @result;
465             } else {
466 3         4 my $s;
467 3         6 while (!$self->done) {
468 2         6 $s = $self->read_one_record(@tags);
469 2 50       5 return $s if $s;
470             }
471 1         3 return undef;
472             }
473             }
474              
475             # ----------------------------------------------------------------
476             # TIED INTERFACE METHODS
477             # ----------------------------------------------------------------
478              
479             # newFh() is a class method that returns a tied filehandle
480             #
481             sub newFh {
482 0     0 1 0 my $class = shift;
483 0 0       0 return unless my $self = $class->new(@_);
484 0         0 return $self->fh;
485             }
486              
487             # fh() returns a filehandle that you can read stones from
488             sub fh {
489 0     0 0 0 my $self = shift;
490 0   0     0 my $class = ref($self) || $self;
491 0         0 my $s = Symbol::gensym;
492 0         0 tie $$s,$class,$self;
493 0         0 return $s;
494             }
495              
496             sub TIEHANDLE {
497 0     0   0 my $class = shift;
498 0         0 return bless {stream => shift},$class;
499             }
500              
501             sub READLINE {
502 0     0   0 my $self = shift;
503 0         0 return $self->{stream}->read_record();
504             }
505              
506             sub PRINT {
507 0     0   0 my $self = shift;
508 0         0 $self->{stream}->write_record(@_);
509             }
510              
511             #--------------------------------------
512             # Internal (private) procedures.
513             #--------------------------------------
514             # This finds an array of key/value pairs and
515             # stashes it where we can find it.
516             sub read_next_rec {
517 2     2 0 2 my($self) = @_;
518 2         5 my($olddelim) = $/;
519              
520 2         5 $/="\n".$self->{record_stop};
521 2         3 my($in) = $self->{IN};
522              
523 2         28 my $data = <$in>;
524 2 50       6 chomp($data) if defined $data;
525              
526 2 50       8 if ($in !~ /ARGV/) {
527 2 100       11 $self->{EOF}++ if eof($in);
528             } else {
529 0 0       0 $self->{EOF}++ if eof();
530             }
531              
532 2         3 $/=$olddelim;
533 2 50       28 $self->{PAIRS}=[grep($_,split($self->{'line_end'},$data))]
534             if defined $data;
535             }
536              
537             # This returns TRUE when we've reached the end
538             # of the input stream
539             sub done {
540 8     8 0 10 my $self = shift;
541 8 100 66     20 return if defined $self->{PAIRS} && @{$self->{PAIRS}};
  3         14  
542 5         14 return $self->{EOF};
543             }
544              
545             # This returns the next key/value pair.
546             sub next_pair {
547 21     21 0 21 my $self = shift;
548 21 100       43 $self->read_next_rec unless $self->{PAIRS};
549 21 50       37 return unless $self->{PAIRS};
550 21 100       16 return shift @{$self->{PAIRS}} if @{$self->{PAIRS}};
  19         47  
  21         44  
551 2         4 undef $self->{PAIRS};
552 2         4 return undef;
553             }
554              
555             sub _write_nested {
556 3     3   4 my($self,$level,$stone) = @_;
557 3         6 my $indent = ' ' x $level;
558 3         4 my($key,$value,@value);
559 3         5 my $out = $self->{OUT};
560              
561 3         8 foreach $key ($stone->tags) {
562 8         20 @value = $stone->get($key);
563 8         16 $key = $self->escapekey($key);
564 8         12 foreach $value (@value) {
565 9 50       48 if (exists $value->{'.name'}) {
566 9         16 $value = $self->escapeval($value);
567 9         26 print $out ($indent,"$key$self->{delim}$value\n");
568             } else {
569 0         0 print $out ($indent,"$key$self->{delim}$self->{subrec_start}\n");
570 0         0 _write_nested($self,$level+1,$value);
571             }
572             }
573             }
574              
575 3         15 print $out (' ' x ($level-1),$self->{'subrec_end'},"\n");
576             }
577              
578             # Escape special characters.
579             sub escapekey {
580 15     15 0 19 my($s,$toencode)=@_;
581 15 50       29 return $toencode unless $s->{binary};
582 15         51 my $specials=" $s->{delim}$s->{subrec_start}$s->{subrec_end}$s->{line_end}$s->{record_stop}%";
583 15         47 $toencode=~s/([$specials])/uc sprintf("%%%02x",ord($1))/oge;
  0         0  
584 15         27 return $toencode;
585             }
586              
587             sub escapeval {
588 13     13 0 15 my($s,$toencode)=@_;
589 13 50       25 return $toencode unless $s->{binary};
590 13         32 my $specials="$s->{delim}$s->{subrec_start}$s->{subrec_end}$s->{line_end}$s->{record_stop}%";
591 13         42 $toencode=~s/([$specials])/uc sprintf("%%%02x",ord($1))/oge;
  0         0  
592 13         24 return $toencode;
593             }
594              
595             # Unescape special characters
596             sub unescapekey {
597 16     16 0 23 unescape(@_);
598             }
599              
600             sub unescapeval {
601 13     13 0 17 unescape(@_);
602             }
603              
604             # Unescape special characters
605             sub unescape {
606 29     29 0 34 my($s,$todecode)=@_;
607 29 50       54 return $todecode unless $s->{binary};
608 29         30 $todecode =~ s/%([0-9a-fA-F]{2})/pack("c",hex($1))/ge;
  0         0  
609 29         73 return $todecode;
610             }
611              
612             # utility routine to turn type globs, barewords, IO::File structs, etc into
613             # filehandles.
614             sub to_fh {
615 4     4 0 8 my ($pack,$thingy,$write) = @_;
616 4 50       8 return unless $thingy;
617 4 50       18 return $thingy if defined fileno($thingy);
618              
619 0         0 my $caller;
620 0         0 while (my $package = caller(++$caller)) {
621 0         0 my $qualified_thingy = Symbol::qualify_to_ref($thingy,$package);
622 0 0       0 return $qualified_thingy if defined fileno($qualified_thingy);
623             }
624            
625             # otherwise try to open it as a file
626 0         0 my $fh = Symbol::gensym();
627 0 0       0 $thingy = ">$thingy" if $write;
628 0 0       0 open ($fh,$thingy) || croak "$pack open of $thingy: $!";
629 0         0 return \*$fh;
630             }
631              
632             sub DESTROY {
633 1     1   2 my $self = shift;
634 1         2 my $out=$self->{OUT};
635 1 0 33     9 print $out ($self->{'delim'},"\n")
      33        
      0        
      0        
636             if !$self->{WRITE} && $self->{INVOKED} && !$self->{LEVEL} && $self->{'passthru'} && $self->{PASSED};
637             }
638              
639              
640             #####################################################################
641             ###################### private routines #############################
642             sub rearrange {
643 2     2 0 5 my($order,@param) = @_;
644 2 50       8 return unless @param;
645 2         4 my %param;
646              
647 2 50       8 if (ref $param[0] eq 'HASH') {
648 0         0 %param = %{$param[0]};
  0         0  
649             } else {
650 2 50 33     40 return @param unless (defined($param[0]) && substr($param[0],0,1) eq '-');
651              
652 0           my $i;
653 0           for ($i=0;$i<@param;$i+=2) {
654 0           $param[$i]=~s/^\-//; # get rid of initial - if present
655 0           $param[$i]=~tr/a-z/A-Z/; # parameters are upper case
656             }
657              
658 0           %param = @param; # convert into associative array
659             }
660            
661 0           my(@return_array);
662            
663 0           local($^W) = 0;
664 0           my($key)='';
665 0           foreach $key (@$order) {
666 0           my($value);
667 0 0         if (ref($key) eq 'ARRAY') {
668 0           foreach (@$key) {
669 0 0         last if defined($value);
670 0           $value = $param{$_};
671 0           delete $param{$_};
672             }
673             } else {
674 0           $value = $param{$key};
675 0           delete $param{$key};
676             }
677 0           push(@return_array,$value);
678             }
679 0 0         push (@return_array,{%param}) if %param;
680 0           return @return_array;
681             }
682              
683             1;
684