File Coverage

blib/lib/String/REPartition.pm
Criterion Covered Total %
statement 9 188 4.7
branch 0 76 0.0
condition 0 27 0.0
subroutine 3 10 30.0
pod 1 1 100.0
total 13 302 4.3


line stmt bran cond sub pod time code
1             # String::REPartition, a module used to partition data using a regular
2             # expression.
3              
4             package String::REPartition;
5              
6             require 5;
7 2     2   29036 use strict;
  2         4  
  2         68  
8 2     2   12 use warnings;
  2         3  
  2         57  
9 2     2   10 use Exporter;
  2         7  
  2         4936  
10             our @ISA = qw(Exporter);
11             our @EXPORT = qw(make_partition_re);
12              
13             our $VERSION = 1.6;
14              
15             my $DEBUG = 0;
16              
17             # This is the main (and only) accessor function for this module. Given a
18             # ratio and a reference to a list of strings, it will produce a regular
19             # expression that will match @{$ref} * $ratio of the words in the list,
20             # and (this is the important part) not the rest of them. For example, if
21             # $ratio is .4, the resulting regular expression will match 40% of the
22             # strings in the list, and will fail to match the remaining 60%.
23             sub make_partition_re {
24 0     0 1   my($ratio) = shift;
25 0           my($arryref) = shift;
26              
27 0           my(%lenhash) = ();
28 0           my(@words) = ();
29              
30             # Just checking inputs here.
31 0 0         warn("Checking inputs...\n") if $DEBUG;
32 0 0 0       unless ($ratio && _is_numeric($ratio) && ($ratio > 0) && ($ratio < 1)) {
      0        
      0        
33 0           return _whine ("Invalid ratio given. Must be a number between 0 and 1.");
34             }
35 0 0 0       unless ($arryref && ref($arryref) && ref($arryref) eq 'ARRAY') {
      0        
36 0           return _whine ("Invalid reference given. Must be a reference to a list or an array.");
37             }
38 0           @words = @{ $arryref };
  0            
39              
40 0           chomp @words;
41              
42             # First we build a hash recording the number of strings with each length.
43 0           foreach (@words) {
44 0           $lenhash{length($_)}++;
45             }
46 0 0         if ($DEBUG) {
47 0           print "My first length hash looks like this:\n";
48 0           foreach my $key (sort {$a <=> $b} (keys %lenhash)) {
  0            
49 0           print " $key -> $lenhash{$key}\n";
50             }
51             }
52              
53             # And then use the _make_list subroutine to examine that hash and determine
54             # a set of lengths which constitute the closest solution, given the ratio.
55 0           my(@soln) = _make_list(\%lenhash,$ratio);
56 0 0         if ($DEBUG) {
57 0           print "First solution is: ";
58 0           print join('--', @soln);
59 0           print "\n";
60             }
61              
62             # The new ratio will be the last value of the returned array. It may or
63             # may not be defined. An undef ratio implies that an exact solution has
64             # been found.
65 0           $ratio = pop(@soln);
66 0 0         if ($DEBUG) {
67 0 0         if (defined $ratio) {
68 0           print "I found the ratio $ratio and want to split $soln[-1]\n";
69             }
70             else {
71 0           print "No ratio found -- must have found an exact solution on the first try\n";
72             }
73             }
74 0 0         my($split) = pop(@soln) if defined $ratio;
75 0           my($regex) = "^(";
76              
77             # If any lengths were appropriate to go into the solution (there need not
78             # be), then we'll build the first part of the regex.
79 0 0         if (scalar @soln) {
80 0           $regex .= join('|',map {'(' . '.{' . $_ . '})'} _shrink_list(@soln));
  0            
81             }
82 0 0         print "Regex so far is $regex\n" if $DEBUG;
83 0           my($splitlen) = 0;
84 0           my(%alphahash) = ();
85 0           my(@solns) = ();
86              
87             # Now, if ratio *is* defined, that means we have to further subdivide words
88             # of one of the lengths.
89 0 0         if (defined($ratio)) {
90              
91             # This is just setting a bunch of stuff up.
92 0 0         print "Starting to re-split\n" if $DEBUG;
93 0           $splitlen = $split;
94 0           my($splitval) = $lenhash{$split};
95 0           my($letnum) = 0;
96 0           my($total) = 0;
97              
98             # We only want to play with the words of the appropriate length.
99 0           @words = grep( (length($_) == $splitlen), @words );
100              
101             # And now we will continue re-subdividing the words until we've been
102             # asked to split the sample too much.
103 0   0       until (
      0        
104             (int($splitval * $ratio) <= 1) ||
105             (int($splitval * $ratio) >= ($splitval - 1)) ||
106             ($letnum >= $splitlen)
107             ) {
108 0           %alphahash = ();
109              
110             # Here we build a hash similar to the lenhash before.
111 0           foreach my $word (@words) {
112 0           $alphahash{substr($word,$letnum,1)}++;
113             }
114              
115             # And then build the solution with the new data.
116 0           @soln = _make_list(\%alphahash,$ratio);
117 0           $ratio = pop(@soln);
118 0 0         if ($DEBUG) {
119 0 0         if (defined $ratio) {
120 0           print "I found the ratio $ratio and want to split $soln[-1]\n";
121             }
122             else {
123 0           print "No ratio found -- must have found an exact solution\n";
124             }
125             }
126             # Store the solution...
127 0 0         if ($DEBUG) {
128 0           print "Adding: " . join('--',@soln) . " to the solutions.\n";
129             }
130 0           @{$solns[$letnum]} = @soln;
  0            
131              
132             # Maybe do some stuff if we have to further subdivide...
133 0 0         if (defined($ratio)) {
134 0           $split = pop(@soln);
135 0           @words = grep( (substr($_,$letnum,1) eq $split), @words );
136 0           $splitval = $alphahash{$split};
137 0           $letnum++;
138             }
139              
140             # Otherwise, make the loop bomb out so we can get on with our lives.
141             else {
142 0           $ratio = -1;
143             }
144             }
145 0 0 0       if ($ratio >= 0 && (scalar @solns > 0)) {
146 0           pop(@{$solns[-1]});
  0            
147             }
148             }
149              
150             # Now, if we have some solutions from subdividing the remaining words,
151             # we want to incorporate that into our regex...
152 0           my($regex_annex) = "";
153 0 0         if (scalar @solns) {
154 0           my($prefix) = "";
155 0           my($templetter) = "";
156 0           foreach my $num (0..$#solns) {
157 0           $splitlen--;
158              
159             # If there are more solutions in the solution array after the one
160             # we're looking at, then the last letter isn't part of the solution
161             # but rather the letter that'll be split for the *next* solution.
162             # Thus we have to save it and store it.
163 0 0         if ($num < $#solns) {
164 0           $templetter = pop(@{$solns[$num]});
  0            
165             }
166             else {
167 0           $templetter = '';
168             }
169 0 0         if (scalar @{$solns[$num]} > 0) {
  0            
170 0           $regex_annex .= "($prefix\[" . join('',@{$solns[$num]}) . ']';
  0            
171 0 0         if ($splitlen > 0) {
172 0           $regex_annex .= '.{' . $splitlen . '}';
173             }
174 0           $regex_annex .= ')|';
175             }
176 0           $prefix .= $templetter;
177             }
178 0           chop $regex_annex;
179             }
180 0 0 0       if (length($regex) > 2 && length($regex_annex)) {
181 0           $regex .= "|";
182             }
183 0           $regex .= $regex_annex;
184 0           $regex .= ")\$";
185 0           $regex =~ s/\[\^/\[\\\^/g;
186 0           return $regex;
187             }
188              
189             # This function takes a reference to a hashtable and a ratio as its arguments.
190             # The hashtable represents the names and sizes of the buckets available to
191             # make the solution, and the ratio represents the percentage of the total
192             # of all the bucket sizes that the solution must represent.
193             # This function returns a list, representing the solution to the proplem
194             # presented to it, in the following format:
195             # The last element is the ratio by which one of the buckets must be further
196             # subdivided. If an exact solution was found, then this ratio will be
197             # undefined.
198             # If the last element is defined, the next to last element will be the name
199             # of the bucket which needs to be subdivided by the ratio indicated therein.
200             # The rest of the list returned contains the names of the buckets which will
201             # go into the solution.
202             # This explanation is a little confusing, and since the action of this function
203             # is so central to the working of this module, I'll give an example to help
204             # clear things up. Let's say the hash you pass in looks like this:
205             # { 'a' => 4, 'b' => 2, 'c' => 4 }
206             # If the ratio given is .6, then a valid return from the function will be:
207             # ('a', 'b', undef), since the combination of the 'a' and 'b' buckets adds
208             # up exactly to 60% of the total of all the buckets. However, if the ratio
209             # asked for is .5, then the return would probably be:
210             # ('a', 'b', .5), since the only way to get 50% of the total is to take
211             # all of the 'a' bucket and half of the 'b' bucket.
212             # I hope that clears things up.
213              
214             sub _make_list {
215              
216 0     0     my($hashref, $ratio) = @_;
217              
218 0           my(@values) = values(%{$hashref});
  0            
219 0           my($target,$max) = (0,0);
220              
221             # Here we figure out some attributes of the data we've been given -- what
222             # amount we're shooting for, and what the larget value is.
223 0           foreach (@values) {
224 0 0         die "Non-number found: $_\n" unless /^\d+$/;
225 0           $target += $_;
226 0 0         if ($_ > $max) { $max = $_ }
  0            
227             }
228 0           $target *= $ratio;
229 0           $target = int($target);
230              
231             # Once we have an understanding of the data we're working with, we can
232             # start trying to find a good solution. The first thing we do is
233             # try to solve the problem with no bounds -- having the third argument
234             # at $max+1 guarantees that all of the buckets will be considered
235             # for inclusion. The first returned value is a reference to a hash
236             # describing the solution and the second is a somewhat arbitrary "score"
237             # which describes how "good" that solution is. While the score is not
238             # really a good metric of anything realistic, it roughly decreases as
239             # the quality of the solution increases, and reaches 0 as the solution
240             # become perfect (requiring no further subdivison).
241 0           my($besthash, $bestscore) = _find_soln(\@values, $target, $max+1);
242              
243             # So, if on our first try, we get a score of 0, we just return the
244             # solution with an undef ratio.
245 0 0         if ($bestscore == 0) { return (_get_words($besthash, $hashref), undef) }
  0            
246 0           my($ref,$score) = ("",0);
247              
248             # If the first solution wasn't perfect, then the theory is that maybe it
249             # didn't do very well because it included a bucket it shouldn't have. So
250             # what we do is try re-making the solution a number of times, excluding
251             # a number of different buckets each time, until we either come up with a
252             # perfect solution or run out of things to try. This is almost certainly
253             # not the best way to go about things, but it works, so I'm not going to
254             # worry about it until the next version. If then. :)
255 0           foreach $max (keys %{$besthash}) {
  0            
256 0           ($ref, $score) = _find_soln(\@values, $target, $max);
257 0 0         next unless defined $ref;
258 0 0         if ($score == 0) { return (_get_words($ref, $hashref), undef) }
  0            
259 0 0         if ($score < $bestscore) { $bestscore = $score; $besthash = $ref }
  0            
  0            
260             }
261 0           return (_get_words($besthash, $hashref, $target));
262             }
263              
264             # This function takes as inputs a list of values, a target total to aim for and
265             # a maximum bucket size to use. It then constructs a suitable combination of
266             # the values it was given to get as close to the target as possible without
267             # using any values that aren't smaller than the max. It doesn't do it very
268             # well.
269             # It returns a solution hash and a score for the "goodness" of that solution.
270             sub _find_soln {
271 0     0     my($ref, $target, $max) = @_;
272 0           my($val,$sum) = (0,0);
273 0           my(%soln) = ();
274 0           my(%left) = ();
275              
276 0           foreach $val (sort {$b <=> $a} @{$ref}) {
  0            
  0            
277 0 0         next unless $val < $max;
278 0 0         if ($val + $sum <= $target) {
279 0           $sum += $val;
280 0           $soln{$val}++;
281             }
282             else {
283 0           $left{$val}++;
284             }
285             }
286 0           my($diff) = $target - $sum;
287 0           my($min) = (sort {$b <=> $a} keys %left)[-1];
  0            
288 0 0         unless (defined $min) { return undef }
  0            
289 0           my($score)= $diff * $min;
290 0           return (\%soln, $score);
291             }
292              
293             # This function takes a solution hash as returned from _find_soln, the data
294             # hash as given to _make_list and an optional target, and returns a list
295             # appropriate to be returned by _make_list. Its action is uninteresting and
296             # straightforward, so I will not waste bytes in describing it further.
297             sub _get_words {
298 0     0     my($soln, $data, $target) = @_;
299 0           my(@retarray) = ();
300 0           my($total,%left) = (0,%{$data});
  0            
301              
302 0           foreach my $solkey (keys %{$soln}) {
  0            
303 0           foreach my $num (1..$soln->{$solkey}) {
304 0           foreach my $datkey (keys %left) {
305 0 0         if ($data->{$datkey} == $solkey) {
306 0           push(@retarray, $datkey);
307 0           delete($left{$datkey});
308 0           $total += $solkey;
309 0           last;
310             }
311             }
312             }
313             }
314 0 0         if (defined $target) {
315 0           my($minkey) = "";
316 0           my($min) = (sort {$a <=> $b} values %{$data})[-1];
  0            
  0            
317 0           foreach my $leftkey (keys %left) {
318 0 0         if ($min >= $data->{$leftkey}) {
319 0           $minkey = $leftkey;
320 0           $min = $data->{$leftkey};
321             }
322             }
323 0           push(@retarray,($minkey, (($target - $total)/$min)));
324             }
325 0           return @retarray;
326             }
327              
328             # Simple test for numericity, pulled straight from the FAQ.
329             sub _is_numeric {
330 0     0     my($test) = shift;
331              
332 0 0         unless ($test =~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/) {
333 0           return undef;
334             }
335 0           return $test;
336             }
337              
338             # Used to return polite errors from the module.
339             sub _whine {
340 0     0     my($msg) = shift;
341              
342 0 0         if ($^W) {
343 0           warn("String::REPartition says: $msg\n");
344             }
345 0           return undef;
346             }
347              
348             # Given a list of numbers, sorts it and combines contiguous members into
349             # comma-separated pairs. That is, turns (1 2 3 4 7 8) into (1,4 7,8). This
350             # is useful in building a nice-looking regular expression.
351             sub _shrink_list {
352 0     0     my(@list) = sort {$a <=> $b} @_;
  0            
353 0           my($num) = 0;
354              
355 0           until ($num >= $#list) {
356 0 0         if ((split(',',$list[$num]))[-1] == ($list[$num+1]-1)) {
357 0 0         if ($list[$num] =~ /^\d+$/) {
358 0           $list[$num] .= "," . splice(@list,$num+1,1);
359             }
360             else {
361 0           substr($list[$num],index($list[$num],",")+1) = splice(@list,$num+1,1);
362             }
363             }
364             else {
365 0           $num++;
366             }
367             }
368 0           return @list;
369             }
370              
371             1;
372              
373             __END__