File Coverage

blib/lib/Data/Bvec.pm
Criterion Covered Total %
statement 102 114 89.4
branch 54 74 72.9
condition 8 13 61.5
subroutine 19 19 100.0
pod 12 12 100.0
total 195 232 84.0


line stmt bran cond sub pod time code
1             #---------------------------------------------------------------------
2             package Data::Bvec;
3              
4 13     13   334679 use 5.008003;
  13         55  
  13         517  
5 13     13   221 use strict;
  13         28  
  13         428  
6 13     13   78 use warnings;
  13         36  
  13         346  
7 13     13   74 use Carp;
  13         23  
  13         1503  
8 13     13   11571 use Math::Int2Base qw( int2base base2int );
  13         11980  
  13         1623  
9              
10             #---------------------------------------------------------------------
11              
12             =head1 NAME
13              
14             Data::Bvec - a module to manipulate integer arrays as bit vectors and
15             "compressed bit strings" (using a simple RLE).
16              
17             =head1 VERSION
18              
19             VERSION: 1.01
20              
21             =cut
22              
23             our $VERSION;
24              
25             $VERSION = '1.01';
26              
27             =head1 SYNOPSIS
28              
29             use Data::Bvec;
30              
31             my $bv = Data::Bvec::->new( nums=>[1,2,3] );
32              
33             my $vec = $bv->get_bvec(); # 01110000
34             my $bstr = $bv->get_bstr(); # '-134'
35             my $nums = $bv->get_nums(); # [1,2,3]
36              
37             ----
38              
39             use Data::Bvec qw( :all );
40              
41             my $vec = num2bit( [1,2,3] ); # 01110000
42             set_bit( $vec, 4, 1 ); # 01111000
43             my $bstr = compress bit2str $vec; # '-143'
44             my $nums = bit2num str2bit uncompress $bstr; # [1,2,3,4]
45              
46             =head1 DESCRIPTION/DISCUSSION
47              
48             This module encapsulates some simple routines for manipulating Perl bit
49             vectors (putting values in; getting values out), but its main goal is
50             to implement a simple run-length encoding scheme for bit vectors that
51             compresses them into relatively human-readable and flat-file-storable
52             strings.
53              
54             My use case was wanting to prototype a data indexing system, and I
55             wanted to ease debugging by plopping the bitstrings in a flat file that
56             I could examine directly. (Each bit in a vector represents a record in
57             the database -- true or false whether the term is in that record in the
58             field being indexed.) It has worked well enough that I haven't felt
59             the need to change how the bitstrings are stored (just where they're
60             stored).
61              
62             The initial version of the module used a different set of base-62
63             digits. In writing Math::Int2Base, I decided to normalize all the
64             bases from 2 to 62 to use 0-9,A-Z,a-z. It makes the numbers sort
65             correctly (ascii-betically == numerically), and it let me say that A
66             base-16 == A base-36 == A base-62.
67              
68             So now I'm rewriting this module to use those base conversion routines.
69              
70             =head1 EXPORTS
71              
72             Nothing is exported by default. The following may be exported
73             individually; all of them may be exported using the C<:all> tag:
74              
75             - set_bit
76             - howmany
77             - bit2str
78             - str2bit
79             - bit2num
80             - num2bit
81             - compress
82             - uncompress
83              
84             Examples:
85              
86             use Data::Bvec qw( :all );
87             use Data::Bvec qw( bit2str str2bit compress uncompress );
88              
89             However, if you only use the object methods, nothing would need to be
90             exported. See below.
91              
92             =cut
93              
94             our ( @ISA, @EXPORT_OK, %EXPORT_TAGS );
95             BEGIN {
96 13     13   79 use Exporter;
  13         26  
  13         892  
97 13     13   190 @ISA = qw( Exporter );
98 13         46 @EXPORT_OK = qw(
99             set_bit howmany
100             bit2str str2bit
101             bit2num num2bit
102             compress uncompress
103             );
104 13         25667 %EXPORT_TAGS = ( all => [ @EXPORT_OK ] );
105             }
106              
107              
108             #---------------------------------------------------------------------
109              
110             =head1 SUBROUTINES
111              
112             =head2 set_bit( $vec, $num, $zero_or_one )
113              
114             This is a shallow wrapper around Perl's vec() that simply provides
115             the third parameter (1) to that routine that says we're working with
116             a bit vector.
117              
118             Normally returns $num, if you care.
119              
120             =head3 Parameters:
121              
122             =head4 $vec
123              
124             A Perl bit vector stored in the scalar.
125              
126             =head4 $num
127              
128             The number whose bit you want to target in the bit vector.
129              
130             =head4 $zero_or_one
131              
132             The value you want to set the bit to: 0 or 1. If not defined,
133             1 is assumed.
134              
135              
136             Examples:
137              
138             my $vec = ""; # empty vector
139              
140             set_bit $vec, 1, 1; # 01000000
141             set_bit $vec, 2, 1; # 01100000
142             set_bit $vec, 3; # 01110000
143             set_bit $vec, 1, 0; # 00110000
144              
145             =cut
146              
147             sub set_bit {
148 50 100   50 1 26231 if( defined $_[2] ) { vec( $_[0], $_[1], 1 ) = $_[2] }
  41         144  
149 9         34 else { vec( $_[0], $_[1], 1 ) = 1 }
150             }
151              
152             #---------------------------------------------------------------------
153              
154             =head2 bit2str( $vec )
155              
156             This routine is a shallow wrapper around unpack() that unpacks a bit
157             vector into a string of '0's and '1's, in preparation for compression.
158              
159             =head3 Parameters:
160              
161             =head4 $vec
162              
163             A Perl bit vector.
164              
165             Example:
166              
167             my $vec = "";
168             set_bit $vec, 4, 1; # 00001000
169             my $str = bit2str $vec; # '00001000'
170              
171             =cut
172              
173 72     72 1 1913 sub bit2str { unpack "b*", $_[0] }
174              
175             #---------------------------------------------------------------------
176              
177             =head2 str2bit( $str )
178              
179             This routine is a shallow wrapper around pack() that packs a string
180             of '0's and '1's (following uncompression) into a bit vector.
181              
182             =head3 Parameters:
183              
184             =head4 $str
185              
186             A string of '0's and '1's, e.g., "00001000".
187              
188             Example:
189              
190             my $vec = str2bit '00001000';
191              
192             =cut
193              
194 16     16 1 1082 sub str2bit { pack "b*", $_[0] }
195              
196             #---------------------------------------------------------------------
197              
198             =head2 num2bit( \@integers )
199              
200             This routine accepts an array ref of integers and returns a bit
201             vector with those integer's bits turned on.
202              
203             =head3 Parameters:
204              
205             =head4 \@integers
206              
207             A reference to an array of integers.
208              
209             Examples:
210              
211             my $vec = num2bit [1,2,3]; # 01110000
212             my $vec = num2bit [3,2,1]; # 01110000
213              
214             The second example is intended to make clear that the order of the
215             integers in the array is not retained (for obvious reasons), and
216             calling bit2num( $vec ) will always return the integers in ascending
217             order (see bit2num() below).
218              
219             =cut
220              
221             sub num2bit {
222 9     9 1 1705 my $bvec = "";
223 9         17 vec( $bvec, $_, 1 ) = 1 for @{$_[0]};
  9         95  
224 9         42 $bvec; # returned
225             }
226              
227             #---------------------------------------------------------------------
228              
229             =head2 bit2num( $vec, $beg, $cnt )
230              
231             This routine accepts a bit vector and returns an array of integers
232             represented by the 1 bits.
233              
234             The parameters $beg and $cnt are to support retrieving subsets of
235             integers from a large vector -- in essence, to support "paging" through
236             the set.
237              
238             In scalar context, returns a reference to the array.
239              
240             =head3 Parameters:
241              
242             =head4 $vec
243              
244             A bit vector.
245              
246             =head4 $beg
247              
248             The first integer (where the bit is 1) to return. Unlike array
249             subscripts, the $beg positions start with 1, not 0.
250              
251             =head4 $cnt
252              
253             The maximum number of integers (including the first) to return.
254              
255             Examples:
256              
257             # 0----+----1----+----2----+----3-
258             my $vec = str2bit '01110011110001111100001111110001';
259              
260             my $set1 = bit2num $vec, 1, 5; # [ 1, 2, 3, 6, 7 ]
261             my $set2 = bit2num $vec, 6, 5; # [ 8, 9, 13, 14, 15 ]
262             my $set3 = bit2num $vec, 11, 5; # [ 16, 17, 22, 23, 24 ]
263             my $set4 = bit2num $vec, 16, 5; # [ 25, 26, 27, 31 ]
264              
265             =cut
266              
267             sub bit2num {
268 10     10 1 1159 my( $vec, $beg, $cnt ) = @_;
269              
270 10         13 my( @num, $count );
271              
272 10 100       24 if( $beg ) {
273              
274 4 50       7 if( $cnt ) {
275 4         7 my $end = $beg + $cnt - 1;
276 4         11 for( my $i = 0; $i < 8 * length $vec; ++$i ) {
277 128 100 100     558 if( vec $vec, $i, 1 and ++$count >= $beg and $count <= $end ) {
      100        
278 19         114 push @num, $i } }
279             }
280             else {
281 0         0 for( my $i = 0; $i < 8 * length $vec; ++$i ) {
282 0 0 0     0 if( vec $vec, $i, 1 and ++$count >= $beg ) {
283 0         0 push @num, $i } }
284             }
285              
286             }
287             else {
288 6         40 for( my $i = 0; $i < 8 * length $vec; ++$i ) {
289 168 100       1071 push @num, $i if vec $vec, $i, 1 }
290             }
291              
292 10 50       23 return @num if wantarray;
293 10         32 return \@num;
294              
295             }
296              
297             #---------------------------------------------------------------------
298              
299             =head2 compress( $str )
300              
301             This routine takes a string of '0's and '1's and compresses it using a
302             simple run-length encoding (RLE). It returns this "compressed bit
303             string".
304              
305             =head3 Parameters:
306              
307             =head4 $str
308              
309             A string of '0's and '1's, e.g., "01110".
310              
311             Note: the length of the string need not be a multiple of 8.
312              
313             Example:
314              
315             my $bstr;
316             $bstr = compress '01110000'; # '-134'
317             my $str = ('1'x100).('0'x30).('1'x6);
318             $bstr = compress $str; # '+@1cU6'
319              
320             =head3 Compression Scheme
321              
322             The compression scheme counts the number of consecutive '0's and '1's
323             and concatenates that count (in base-62) to the compressed bit string.
324              
325             If the first bit is '0', the compressed bit string begins with '-'. If
326             the first bit is '1', it begins with '+'. The digit following that
327             represents that many of those bits. The next digit represents that
328             many of the "other" bits, and so on. (A "digit" matches /[0-9A-Za-z]/.)
329              
330             So in the first example, '-134' means 1 '0' bit, then 3 '1' bits, then
331             4 '0' bits, i.e., '01110000'.
332              
333             The second example includes a 2-digit number, 1c base-62 (100 decimal,
334             as defined by Math::Int2base).
335              
336             Any multi-digit number is preceded by a non-digit:
337              
338             '@' for a 2-digit number
339             '#' for 3 digits
340             '$' for 4 digits
341             '%' for 5 digits, and
342             '^' for 6 digits
343              
344             (Mnemonic: look above the numbers on a qwerty keyboard. A 6-digit
345             number will accommodate 32,590,299,105 consecutive bits. If you need
346             more than that, let me know.)
347              
348             So '+@1cU6' means 1c (100) '1' bits, then U (30) '0' bits, then 6 '1'
349             bits.
350              
351             =cut
352              
353             sub compress {
354              
355             # 1st char '-' => 1st bit '0', '+' => '1'
356 10     10 1 1194 my( $first_char, @a );
357 10         22 for( $_[0] ) {
358 10 100       139 if( /^0/ ) { $first_char = '-'; @a = /(0*)(1*)/g }
  9         21  
  9         121  
359 1         3 else { $first_char = '+'; @a = /(1*)(0*)/g }
  1         11  
360             }
361              
362 10 50       33 return '' unless @a;
363              
364 10         66 pop @a while $a[-1] eq ''; # remove trailing nulls
365              
366             # return compressed format
367 65         181 $first_char . join( '',
368             map {
369 10         22 my $chars = int2base( length, 62 );
370 65         959 ( undef,'','@','#','$','%','^' )[ length $chars ] . $chars;
371             } @a );
372              
373             }
374              
375             #---------------------------------------------------------------------
376              
377             =head2 uncompress( $bstr )
378              
379             This routine uncompresses a compressed bit string (which would have
380             been compressed by the compress() routine above).
381              
382             It returns a string of '0's and '1's. This string will (normally) then
383             be converted to a bit vector using str2bit() above.
384              
385             =head3 Parameters:
386              
387             =head4 $bstr
388              
389             A compressed bit string (see compress() above).
390              
391             Example:
392              
393             my $bstr = '-134';
394             my $str = uncompress $bstr; # '01110000'
395              
396             =cut
397              
398             sub uncompress {
399 6     6 1 20 my $compressed = shift;
400              
401 6 50       21 croak "Undefined" unless defined $compressed;
402 6         11 my $ret = '';
403              
404             # examine first character to determine first bit's value
405 6 50       21 my $bit = substr( $compressed, 0, 1 ) eq '+' ? '1' : '0';
406              
407             # examine the rest of the characters, expand into 0's & 1's
408 6         589 for( my $i = 1; $i < length $compressed; ++$i ) {
409              
410 32         52 my $char = substr( $compressed, $i, 1 );
411              
412             # multi-digit number?
413 32         48 my $len = index '..@#$%^', $char; # @==2, #==3, etc.
414 32 50       54 if( $len > 1 ) {
415 0         0 $ret .= $bit x (base2int substr( $compressed, $i + 1, $len ), 62);
416 0         0 $i += $len;
417             }
418             else {
419 32         81 $ret .= $bit x (base2int $char, 62);
420             }
421              
422 32 100       706 $bit = $bit ? '0' : '1'; # toggle between 0/1
423             }
424              
425 6         23 $ret; # returned
426             }
427              
428             #---------------------------------------------------------------------
429              
430             =head2 howmany( $vec, $zero_or_one )
431              
432             This routine returns a count of the 0 or 1 bits in a bit vector.
433              
434             =head3 Parameters:
435              
436             =head4 $vec
437              
438             A bit vector.
439              
440             =head4 $zero_or_one
441              
442             The value you want a count of: 0 or 1. Defaults to 1 if not given.
443              
444             Examples:
445              
446             my $vec = str2bit '01010010';
447             my $ones_count = howmany $vec; # 3
448             my $zeros_count = howmany $vec, 0; # 5
449              
450             Note that howmany( $vec, 0 ) will include trailing zero bits.
451              
452             =cut
453              
454             sub howmany {
455 2     2 1 7 my( $bvec, $bitval ) = @_;
456              
457 2 100       6 $bitval = 1 unless defined $bitval;
458              
459 2         6 my $setbits = unpack "%32b*", $bvec;
460 2 100       6 return $setbits if $bitval;
461 1         5 return 8 * length( $bvec ) - $setbits; # includes trailing 0's
462             }
463              
464             #---------------------------------------------------------------------
465              
466             =head1 METHODS
467              
468             =head2 new()
469              
470             This constructs a Data::Bvec object. Each object represents a single
471             array of integers stored either as a bit vector, a compressed bit
472             string, or an array.
473              
474             =head3 Parameters:
475              
476             All parameters to new() are named.
477              
478             =head4 bvec=>$bit_vector
479              
480             Stores a Perl bit vector in the object.
481              
482             my $vec = str2bit '01110011110001111100001111110001';
483             my $bv = Data::Bvec::->new( bvec => $vec );
484              
485             =head4 bstr=>$compressed_bit_string
486              
487             Stores a compressed bit string in the object.
488              
489             my $bstr = compress bit2str $vec;
490             my $bv = Data::Bvec::->new( bstr => $bstr );
491              
492             =head4 nums=>\@integers
493              
494             Stores an array of integers in the object. The order of the array is
495             retained when stored.
496              
497             my $nums = bit2num $vec;
498             my $bv = Data::Bvec::->new( nums => $nums );
499              
500             =head4 bvec2nums=>$bit_vector
501              
502             Accepts a bit vector and stores it as an array of integers (as
503             $self->{nums}).
504              
505             my $bv = Data::Bvec::->new( bvec2nums => $vec );
506              
507             =head4 nums2bvec=>\@integers
508              
509             Accepts an array of integers and stores it as a bit vector (as
510             $self->{bvec}). The order of the array is not retained.
511              
512             my $bv = Data::Bvec::->new( nums2bvec => $nums );
513              
514             =head4 bvec2bstr=>$bit_vector
515              
516             Accepts a bit vector and stores it as a compressed bit string (as
517             $self->{bstr}).
518              
519             my $bv = Data::Bvec::->new( bvec2bstr => $vec );
520              
521             =head4 bstr2bvec=>$compressed_bit_string
522              
523             Accepts a compressed bit string and stores it as a bit vector (as
524             $self->{bvec}).
525              
526             my $bv = Data::Bvec::->new( bstr2bvec => $bstr );
527              
528             =head4 bstr2nums=>$compressed_bit_string
529              
530             Accepts a compressed bit string and stores it as an array of integers
531             (as $self->{nums}).
532              
533             my $bv = Data::Bvec::->new( bstr2nums => $bstr );
534              
535             =head4 nums2bstr=>\@integers
536              
537             Accepts an array of integers and stores it as a compressed bit string
538             (as $self->{bstr}). The order of the array is not retained.
539              
540             my $bv = Data::Bvec::->new( nums2bstr => $nums );
541              
542             =cut
543              
544             sub new {
545 13     13 1 931 my $class = shift;
546              
547 13 50       46 die "Too many parameters" if @_ > 2;
548 13         43 my %parm = @_;
549 13         23 my $self = {};
550              
551 13 100       43 $self->{bvec} = $parm{bvec} if defined $parm{bvec};
552 13 100       51 $self->{bstr} = $parm{bstr} if defined $parm{bstr};
553 13 100       44 $self->{nums} = $parm{nums} if defined $parm{nums};
554              
555 13 100       49 $self->{nums} = bit2num $parm{bvec2nums} if defined $parm{bvec2nums};
556 13 100       40 $self->{bvec} = num2bit $parm{nums2bvec} if defined $parm{nums2bvec};
557              
558 13 100       41 $self->{bstr} = compress bit2str $parm{bvec2bstr} if defined $parm{bvec2bstr};
559 13 100       48 $self->{bvec} = str2bit uncompress $parm{bstr2bvec} if defined $parm{bstr2bvec};
560              
561 13 100       38 $self->{nums} = bit2num str2bit uncompress $parm{bstr2nums} if defined $parm{bstr2nums};
562 13 100       41 $self->{bstr} = compress bit2str num2bit $parm{nums2bstr} if defined $parm{nums2bstr};
563              
564 13         63 bless $self, $class;
565             }
566              
567             #---------------------------------------------------------------------
568              
569             =head2 get_bvec()
570              
571             This routine takes no parameters. It returns a bit vector, regardless
572             how the integers are stored. The object is not changed.
573              
574             my $vec = $bv->get_bvec();
575              
576             =cut
577              
578             sub get_bvec {
579 5     5 1 26 my $self = shift;
580              
581 5 100       46 return $self->{bvec} if defined $self->{bvec};
582 2 50       7 return str2bit uncompress $self->{bstr} if defined $self->{bstr};
583 2 50       13 return num2bit $self->{nums} if defined $self->{nums};
584              
585             }
586              
587             #---------------------------------------------------------------------
588              
589             =head2 get_bstr()
590              
591             This routine takes no parameters. It returns a compressed bit string,
592             regardless how the integers are stored. The object is not changed.
593              
594             my $bstr = $bv->get_bstr();
595              
596             =cut
597              
598             sub get_bstr {
599 5     5 1 24 my $self = shift;
600              
601 5 100       33 return $self->{bstr} if defined $self->{bstr};
602 2 50       7 return compress bit2str $self->{bvec} if defined $self->{bvec};
603 2 50       13 return compress bit2str num2bit $self->{nums} if defined $self->{nums};
604              
605             }
606              
607             #---------------------------------------------------------------------
608              
609             =head2 get_nums( $beg, $cnt )
610              
611             This routine returns an array of integers, regardless how the integers
612             are stored. The object is not changed.
613              
614             Note that if the integers are stored as 'nums' (an array), get_nums()
615             will return them in the same order as the array. If they are stored
616             another way, they will be returned in ascending order.
617              
618             my @integers = $bv->get_nums(); # list returned in list context
619             my $ints = $bv->get_nums(); # aref returned in scalar context
620              
621             =head3 Parameters:
622              
623             =head4 $beg
624              
625             The first integer to return. Unlike array subscripts, the $beg
626             positions start with 1, not 0. If no $beg is given, 1 is assumed.
627              
628             =head4 $cnt
629              
630             The maximum number of integers (including the first) to return.
631             If no $cnt is given, the rest of the integers are returned.
632              
633             =cut
634              
635             sub get_nums {
636 6     6 1 26 my $self = shift;
637 6   50     70 my $beg = shift||1;
638 6   50     37 my $cnt = shift||'';
639              
640 6         8 my @num;
641              
642 6 50       27 if( defined $self->{nums} ) {
    0          
    0          
643 6 50       33 if ( $cnt ) { @num = @{$self->{nums}}[--$beg..$beg+$cnt-1] }
  0 50       0  
  0         0  
644 6         11 elsif( $beg == 1 ) { @num = @{$self->{nums}} }
  6         28  
645 0         0 else { @num = @{$self->{nums}}[--$beg..$#{$self->{nums}}] }
  0         0  
  0         0  
646             }
647             elsif( defined $self->{bvec} ) {
648 0         0 @num = bit2num $self->{bvec}, $beg, $cnt;
649             }
650             elsif( defined $self->{bstr} ) {
651 0         0 @num = bit2num str2bit( uncompress $self->{bstr} ), $beg, $cnt;
652             }
653              
654 6 100       54 return @num if wantarray;
655 2         11 return \@num;
656             }
657              
658              
659             1; # return true
660              
661             __END__