File Coverage

blib/lib/Lingua/TokenParse.pm
Criterion Covered Total %
statement 146 184 79.3
branch 62 120 51.6
condition 15 28 53.5
subroutine 22 28 78.5
pod 18 20 90.0
total 263 380 69.2


line stmt bran cond sub pod time code
1             # $Id: TokenParse.pm,v 1.20 2004/08/08 01:20:36 gene Exp $
2              
3             package Lingua::TokenParse;
4             $VERSION = '0.1601';
5 1     1   45583 use strict;
  1         4  
  1         93  
6 1     1   7 use warnings;
  1         3  
  1         242  
7 1     1   7 use Carp;
  1         26  
  1         267  
8 1     1   7675 use Storable;
  1         4399  
  1         71  
9 1     1   15722 use Math::BaseCalc;
  1         2179  
  1         2700  
10              
11             sub new {
12 2     2 1 14 my $proto = shift;
13 2   33     12 my $class = ref $proto || $proto;
14 2         31 my $self = {
15             verbose => 0,
16             # The word to parse!
17             word => undef,
18             # We need to use this.
19             word_length => 0,
20             # Known tokens.
21             lexicon => {},
22             # Local lexicon cache file name.
23             lexicon_file => '', # ?: 'lexicon-' . time(),
24             # All word parts.
25             parts => [],
26             # All possible parts combinations.
27             combinations => [],
28             # Scored list of the known parts combinations.
29             knowns => {},
30             # Definitions of the known and unknown fragments in knowns.
31             definitions => {},
32             # Fragment definition separator.
33             separator => ' + ',
34             # Known-but-not-defined definition output string.
35             not_defined => '.',
36             # Unknown definition output string.
37             unknown => '?',
38             # Known trimming regexp rules.
39             constraints => [],
40             @_, # slurp anything else and override defaults.
41             };
42 2         8 bless $self, $class;
43 2         8 $self->_init();
44 2         4 return $self;
45             }
46              
47             sub _init {
48 2     2   3 my $self = shift;
49 2 50       11 warn "Entering _init()\n" if $self->{verbose};
50 2 100       10 $self->word( $self->{word} ) if $self->{word};
51             # Retrieve our lexicon cache if a filename was set.
52 2         8 $self->lexicon_cache;
53             }
54              
55             sub DESTROY {
56 2     2   437 my $self = shift;
57             # Cache our lexicon if a filename has been given.
58 2 100       15 $self->lexicon_cache( $self->{lexicon_file} )
59             if $self->{lexicon_file};
60             }
61              
62             sub verbose {
63 0     0 0 0 my $self = shift;
64 0 0       0 $self->{verbose} = shift if @_;
65 0         0 return $self->{verbose};
66             }
67              
68             sub word {
69             # WORD: This method is the only place where word_length is set.
70 1     1 1 2 my $self = shift;
71 1 50       4 warn "Entering word()\n" if $self->{verbose};
72 1 50       3 if( @_ ) {
73 1         2 $self->{word} = shift;
74 1         3 $self->{word_length} = length $self->{word};
75 1 50       3 printf "\tword = %s\n\tlength = %d\n",
76             $self->{word}, $self->{word_length}
77             if $self->{verbose};
78             }
79 1         1 return $self->{word};
80             }
81              
82             sub lexicon {
83 964     964 1 2088 my $self = shift;
84 964 100       1598 if( @_ ) {
85 2 0 33     19 $self->{lexicon} = @_ == 1 && ref $_[0] eq 'HASH'
    50          
86             ? shift
87             : @_ % 2 == 0
88             ? { @_ }
89             : {};
90             }
91 964         2046 return $self->{lexicon};
92             }
93              
94             sub parts {
95 2     2 1 706 my $self = shift;
96 2 100       8 $self->{parts} = shift if @_;
97 2         7 return $self->{parts};
98             }
99              
100             sub combinations {
101 195     195 1 254 my $self = shift;
102 195 100       319 $self->{combinations} = shift if @_;
103 195         564 return $self->{combinations};
104             }
105              
106             sub knowns {
107 3     3 1 1005 my $self = shift;
108 3 100       10 $self->{knowns} = shift if @_;
109 3         24 return $self->{knowns};
110             }
111              
112             sub definitions {
113 2     2 1 5 my $self = shift;
114 2 100       7 $self->{definitions} = shift if @_;
115 2         7 return $self->{definitions};
116             }
117              
118             sub separator {
119 0     0 1 0 my $self = shift;
120 0 0       0 $self->{separator} = shift if @_;
121 0         0 return $self->{separator};
122             }
123              
124             sub not_defined {
125 0     0 1 0 my $self = shift;
126 0 0       0 $self->{not_defined} = shift if @_;
127 0         0 return $self->{not_defined};
128             }
129              
130             sub unknown {
131 0     0 1 0 my $self = shift;
132 0 0       0 $self->{unknown} = shift if @_;
133 0         0 return $self->{unknown};
134             }
135              
136             sub constraints {
137 46     46 1 46 my $self = shift;
138 46 100       85 $self->{constraints} = shift if @_;
139 46         79 return $self->{constraints};
140             }
141              
142             sub parse {
143 1     1 1 45 my $self = shift;
144 1 50       24 warn "Entering parse()\n" if $self->{verbose};
145 1 50       4 $self->word( shift ) if @_;
146 1 50       3 croak 'No word provided.' unless defined $self->{word};
147 1 50       1 croak 'No lexicon defined.' unless keys %{ $self->{lexicon} };
  1         5  
148             # Reset our data structures.
149 1         3 $self->parts([]);
150 1         5 $self->definitions({});
151 1         2 $self->combinations([]);
152 1         2 $self->knowns({});
153             # Build new ones based on the word.
154 1         4 $self->build_parts;
155 1         10 $self->build_definitions;
156 1         5 $self->build_combinations;
157 1         5 $self->build_knowns;
158             }
159              
160             sub build_parts {
161 1     1 1 1 my $self = shift;
162 1 50       6 warn "Entering build_parts()\n" if $self->{verbose};
163              
164 1         3 for my $i (0 .. $self->{word_length} - 1) {
165 9         16 for my $j (1 .. $self->{word_length} - $i) {
166 45         64 my $part = substr $self->{word}, $i, $j;
167 45         121 push @{ $self->{parts} }, $part
  45         178  
168 45         72 unless grep { $part =~ /$_/ }
169 45 50       54 @{ $self->constraints };
170             }
171             }
172              
173 1 50       4 if($self->{verbose}) {
174             # XXX This is ugly.
175 0         0 my $last = 0;
176 0         0 for my $part (@{ $self->{parts} }) {
  0         0  
177 0 0       0 print '',
    0          
178             ($last ? $last > length( $part ) ? "\n\t" : ', ' : "\t"),
179             $part;
180 0         0 $last = length $part;
181             }
182 0 0       0 print "\n" if @{ $self->{parts} };
  0         0  
183             }
184 1         3 return $self->{parts};
185             }
186              
187             # Save a known combination entry => definition table.
188             sub build_definitions {
189 1     1 1 2 my $self = shift;
190 1 50       5 warn "Entering build_definitions()\n" if $self->{verbose};
191 1         1 for my $part (@{ $self->{parts} }) {
  1         3  
192 45 100       110 $self->{definitions}{$part} = $self->{lexicon}{$part}
193             if $self->{lexicon}{$part};
194             }
195 1 50       8 warn "\t", join( "\n\t", sort keys %{ $self->definitions } ), "\n"
  0         0  
196             if $self->{verbose};
197 1         3 return $self->{definitions};
198             }
199              
200             sub build_combinations {
201 1     1 1 2 my $self = shift;
202 1 50       4 warn "Entering build_combinations()\n" if $self->{verbose};
203              
204             # field size for binary iteration (digits of precision)
205 1         4 my $y = $self->{word_length} - 1;
206             # total number of zero-based combinations
207 1         3 my $z = 2 ** $y - 1;
208             # field size for the count
209 1         3 my $lz = length $z;
210             # field size for a combination
211 1         2 my $m = $self->{word_length} + $y;
212 1 50       4 warn sprintf
213             "\tTotal combinations: %d\n\tConstrained combinations:\n",
214             $z + 1
215             if $self->{verbose};
216              
217             # Truth is a single partition character: the lowly dot.
218 1         20 my $c = Math::BaseCalc->new( digits => [ 0, '.' ] );
219              
220             # Build a word part combination for each iteration.
221 1         49 for my $n ( 0 .. $z ) {
222             # Iterate in base two.
223 256         619 my $i = $c->to_base( $n );
224              
225             # Get the binary digits as an array.
226 256         5279 my @i = split //, sprintf( '%0'.$y.'s', $i );
227              
228             # Join the character and digit arrays into a partitioned word.
229 256         355 my $t = '';
230             # ..by stepping over the characters and peeling off a digit.
231 256         788 for( split //, $self->{word} ) {
232             # Zero values become ''. Haha! Truth prevails.
233 2304   100     5558 $t .= $_ . (shift( @i ) || '');
234             }
235              
236 256 100       1470 unless( grep { $t =~ /$_/ } @{ $self->{constraints} } ) {
  256         1161  
  256         430  
237             # Preach it.
238 192 50       463 printf "\t%".$lz.'d) %0'.$y.'s => %'.$m."s\n", $n, $i, $t
239             if $self->{verbose};
240 192         166 push @{ $self->combinations }, $t;
  192         332  
241             }
242             }
243              
244 1         18 return $self->{combinations};
245             }
246              
247             sub build_knowns {
248 1     1 1 1 my $self = shift;
249 1 50       2 return unless scalar keys %{ $self->{lexicon} };
  1         6  
250 1 50       3 warn "Entering build_knowns()\n" if $self->{verbose};
251              
252             # Save the familiarity value for each "raw" combination.
253 1         2 for my $combo (@{ $self->{combinations} }) {
  1         4  
254             # Skip combinations that have already been seen.
255 192 50       435 next if exists $self->{knowns}{$combo};
256              
257 192         234 my ($sum, $frag_sum, $char_sum) = (0, 0, 0);
258              
259             # Get the bits of the combination.
260 192         585 my @chunks = split /\./, $combo;
261 192         284 for (@chunks) {
262             # XXX Uh.. Magically handle hyphens in lexicon entries.
263 960         1704 ($_, my $combo_seen) = _hyphenate($_, $self->lexicon, 0);
264              
265             # Sum the combination familiarity values.
266 960 100       2299 if ($combo_seen) {
267 108         5299 $frag_sum++;
268 108         171 $char_sum += length;
269             }
270             }
271             # XXX Huh? Why? Can $_ change or something?
272             # Stick our combination back together.
273 192         363 $combo = join '.', @chunks;
274              
275             # Save this combination and its familiarity ratios.
276 192         266 my $x = $frag_sum / @chunks;
277 192         250 my $y = $char_sum / $self->{word_length};
278 192 50       355 warn "\t$combo: [$x, $y]\n" if $self->{verbose};
279 192 100 66     531 if( $x || $y ) {
280 85         453 $self->{knowns}{$combo} = [ $x, $y ];
281             }
282             else {
283 107         269 delete $self->{knowns}{$combo};
284             }
285             }
286              
287 1         8 return $self->{knowns};
288             }
289              
290             # Reduce the number of known combinations by concatinating adjacent
291             # unknowns (and then removing any duplicates produced).
292              
293             sub learn {
294 0     0 0 0 my ($self, %args) = @_;
295             # Get the list of (partially) unknown stem combinations.
296             # Loop through each looking in %args or prompting for a definition.
297             }
298              
299             # Update the given string with its actual lexicon value and increment
300             # the seen flag.
301             sub _hyphenate {
302 960     960   1263 my ($string, $lexicon, $combo_seen) = @_;
303              
304 960 100       2960 if (exists $lexicon->{$string}) {
    50          
    50          
305 108 50       190 $combo_seen++ if defined $combo_seen;
306             }
307             elsif (exists $lexicon->{"-$string"}) {
308 0 0       0 $combo_seen++ if defined $combo_seen;
309 0         0 $string = "-$string";
310             }
311             elsif (exists $lexicon->{"$string-"}) {
312 0 0       0 $combo_seen++ if defined $combo_seen;
313 0         0 $string = "$string-";
314             }
315              
316 960 50       2651 return wantarray ? ($string, $combo_seen) : $string;
317             }
318              
319             sub output_knowns {
320 0     0 1 0 my $self = shift;
321 0         0 my @out = ();
322 0         0 my $header = <
323             Combination [frag familiarity, char familiarity]
324             Fragment definitions
325              
326             HEADER
327              
328 0 0       0 for my $known (
  0         0  
329             reverse sort {
330 0         0 $self->{knowns}{$a}[0] <=> $self->{knowns}{$b}[0] ||
331             $self->{knowns}{$a}[1] <=> $self->{knowns}{$b}[1]
332             } keys %{ $self->{knowns} }
333             ) {
334 0         0 my @definition;
335 0         0 for my $chunk (split /\./, $known) {
336 0 0       0 push @definition,
    0          
337             defined $self->{definitions}{$chunk}
338             ? $self->{definitions}{$chunk}
339             ? $self->{definitions}{$chunk}
340             : $self->{not_defined}
341             : $self->{unknown};
342             }
343              
344 0         0 push @out, sprintf qq/%s [%s]\n%s/,
345             $known,
346 0         0 join (', ', map { sprintf '%0.2f', $_ }
347 0         0 @{ $self->{knowns}{$known} }),
348             join ($self->{separator}, @definition);
349             }
350              
351 0 0       0 return wantarray ? @out : $header . join "\n\n", @out;
352             }
353              
354             # Naive, no locking read/write. If you run a production environment,
355             # you know what to do.
356             sub lexicon_cache {
357 5     5 1 475 my( $self, $file, $value ) = @_;
358 5 50       15 warn "Entering lexicon_cache()\n" if $self->{verbose};
359              
360             # Set the file and the lexicon_file attribute if we are told to.
361 5 100 100     29 if( $file && $file eq 'lexicon_file' && $value ) {
      66        
362 1         3 $self->{lexicon_file} = $value;
363 1         2 $file = $value;
364             }
365              
366             # If there is no file try to use the lexicon_file.
367 5   66     21 $file ||= $self->{lexicon_file};
368             # Otherwise, bail out!
369 5 50 0     18 warn( "No lexicon cache file set\n" ) and return
      33        
370             if $self->{verbose} && !$file;
371              
372 5 100       15 if( $file ) {
373             # Store 'em if you got 'em.
374 3 100       5 if( keys %{ $self->{lexicon} } ) {
  3         10  
375 2 50       7 warn "store( $self->{lexicon}, $file )\n" if $self->{verbose};
376 2         9 store( $self->{lexicon}, $file );
377             }
378             # ..Retrieve 'em if not.
379             else {
380 1 50 33     5 warn "retrieve( $file )\n" if $self->{verbose} && -e $file;
381 1 50       24 $self->lexicon( retrieve( $file ) ) if -e $file;
382             }
383             }
384             }
385              
386             1;
387              
388             __END__