File Coverage

blib/lib/Lingua/TokenParse.pm
Criterion Covered Total %
statement 146 183 79.7
branch 62 120 51.6
condition 15 28 53.5
subroutine 22 27 81.4
pod 19 19 100.0
total 264 377 70.0


line stmt bran cond sub pod time code
1             package Lingua::TokenParse;
2             $Lingua::TokenParse::VERSION = '0.1602';
3             our $AUTHORITY = 'cpan:GENE';
4              
5             # ABSTRACT: DEPRECATED in favor of Lingua::Word::Parser
6              
7             $VERSION = '0.1602';
8              
9 1     1   775 use strict;
  1         3  
  1         29  
10 1     1   8 use warnings;
  1         2  
  1         27  
11 1     1   8 use Carp qw(croak);
  1         2  
  1         55  
12 1     1   613 use Storable qw(retrieve store);
  1         3155  
  1         61  
13 1     1   428 use Math::BaseCalc ();
  1         1505  
  1         2256  
14              
15             sub new {
16 2     2 1 546 my $proto = shift;
17 2   33     11 my $class = ref $proto || $proto;
18 2         17 my $self = {
19             verbose => 0,
20             # The word to parse!
21             word => undef,
22             # We need to use this.
23             word_length => 0,
24             # Known tokens.
25             lexicon => {},
26             # Local lexicon cache file name.
27             lexicon_file => '', # ?: 'lexicon-' . time(),
28             # All word parts.
29             parts => [],
30             # All possible parts combinations.
31             combinations => [],
32             # Scored list of the known parts combinations.
33             knowns => {},
34             # Definitions of the known and unknown fragments in knowns.
35             definitions => {},
36             # Fragment definition separator.
37             separator => ' + ',
38             # Known-but-not-defined definition output string.
39             not_defined => '.',
40             # Unknown definition output string.
41             unknown => '?',
42             # Known trimming regexp rules.
43             constraints => [],
44             @_, # slurp anything else and override defaults.
45             };
46 2         4 bless $self, $class;
47 2         6 $self->_init();
48 2         6 return $self;
49             }
50              
51             sub _init {
52 2     2   3 my $self = shift;
53 2 50       8 warn "Entering _init()\n" if $self->{verbose};
54 2 100       7 $self->word( $self->{word} ) if $self->{word};
55             # Retrieve our lexicon cache if a filename was set.
56 2         5 $self->lexicon_cache;
57             }
58              
59             sub DESTROY {
60 2     2   739 my $self = shift;
61             # Cache our lexicon if a filename has been given.
62             $self->lexicon_cache( $self->{lexicon_file} )
63 2 100       11 if $self->{lexicon_file};
64             }
65              
66             sub verbose {
67 0     0 1 0 my $self = shift;
68 0 0       0 $self->{verbose} = shift if @_;
69 0         0 return $self->{verbose};
70             }
71              
72             sub word {
73             # WORD: This method is the only place where word_length is set.
74 1     1 1 1 my $self = shift;
75 1 50       3 warn "Entering word()\n" if $self->{verbose};
76 1 50       4 if( @_ ) {
77 1         1 $self->{word} = shift;
78 1         3 $self->{word_length} = length $self->{word};
79             printf "\tword = %s\n\tlength = %d\n",
80             $self->{word}, $self->{word_length}
81 1 50       3 if $self->{verbose};
82             }
83 1         1 return $self->{word};
84             }
85              
86             sub lexicon {
87 964     964 1 2075 my $self = shift;
88 964 100       1585 if( @_ ) {
89 2 0 33     17 $self->{lexicon} = @_ == 1 && ref $_[0] eq 'HASH'
    50          
90             ? shift
91             : @_ % 2 == 0
92             ? { @_ }
93             : {};
94             }
95 964         1641 return $self->{lexicon};
96             }
97              
98             sub parts {
99 2     2 1 360 my $self = shift;
100 2 100       7 $self->{parts} = shift if @_;
101 2         30 return $self->{parts};
102             }
103              
104             sub combinations {
105 195     195 1 288 my $self = shift;
106 195 100       341 $self->{combinations} = shift if @_;
107 195         493 return $self->{combinations};
108             }
109              
110             sub knowns {
111 3     3 1 622 my $self = shift;
112 3 100       9 $self->{knowns} = shift if @_;
113 3         19 return $self->{knowns};
114             }
115              
116             sub definitions {
117 2     2 1 4 my $self = shift;
118 2 100       7 $self->{definitions} = shift if @_;
119 2         6 return $self->{definitions};
120             }
121              
122             sub separator {
123 0     0 1 0 my $self = shift;
124 0 0       0 $self->{separator} = shift if @_;
125 0         0 return $self->{separator};
126             }
127              
128             sub not_defined {
129 0     0 1 0 my $self = shift;
130 0 0       0 $self->{not_defined} = shift if @_;
131 0         0 return $self->{not_defined};
132             }
133              
134             sub unknown {
135 0     0 1 0 my $self = shift;
136 0 0       0 $self->{unknown} = shift if @_;
137 0         0 return $self->{unknown};
138             }
139              
140             sub constraints {
141 46     46 1 418 my $self = shift;
142 46 100       81 $self->{constraints} = shift if @_;
143 46         82 return $self->{constraints};
144             }
145              
146             sub parse {
147 1     1 1 4 my $self = shift;
148 1 50       4 warn "Entering parse()\n" if $self->{verbose};
149 1 50       2 $self->word( shift ) if @_;
150 1 50       3 croak 'No word provided.' unless defined $self->{word};
151 1 50       1 croak 'No lexicon defined.' unless keys %{ $self->{lexicon} };
  1         6  
152             # Reset our data structures.
153 1         4 $self->parts([]);
154 1         3 $self->definitions({});
155 1         3 $self->combinations([]);
156 1         2 $self->knowns({});
157             # Build new ones based on the word.
158 1         3 $self->build_parts;
159 1         2 $self->build_definitions;
160 1         3 $self->build_combinations;
161 1         15 $self->build_knowns;
162             }
163              
164             sub build_parts {
165 1     1 1 1 my $self = shift;
166 1 50       3 warn "Entering build_parts()\n" if $self->{verbose};
167              
168 1         4 for my $i (0 .. $self->{word_length} - 1) {
169 9         16 for my $j (1 .. $self->{word_length} - $i) {
170 45         73 my $part = substr $self->{word}, $i, $j;
171 45         105 push @{ $self->{parts} }, $part
172 45         146 unless grep { $part =~ /$_/ }
173 45 50       59 @{ $self->constraints };
  45         68  
174             }
175             }
176              
177 1 50       3 if($self->{verbose}) {
178             # XXX This is ugly.
179 0         0 my $last = 0;
180 0         0 for my $part (@{ $self->{parts} }) {
  0         0  
181 0 0       0 print '',
    0          
182             ($last ? $last > length( $part ) ? "\n\t" : ', ' : "\t"),
183             $part;
184 0         0 $last = length $part;
185             }
186 0 0       0 print "\n" if @{ $self->{parts} };
  0         0  
187             }
188 1         2 return $self->{parts};
189             }
190              
191             # Save a known combination entry => definition table.
192             sub build_definitions {
193 1     1 1 2 my $self = shift;
194 1 50       3 warn "Entering build_definitions()\n" if $self->{verbose};
195 1         1 for my $part (@{ $self->{parts} }) {
  1         3  
196             $self->{definitions}{$part} = $self->{lexicon}{$part}
197 45 100       83 if $self->{lexicon}{$part};
198             }
199 0         0 warn "\t", join( "\n\t", sort keys %{ $self->definitions } ), "\n"
200 1 50       25 if $self->{verbose};
201 1         5 return $self->{definitions};
202             }
203              
204             sub build_combinations {
205 1     1 1 2 my $self = shift;
206 1 50       2 warn "Entering build_combinations()\n" if $self->{verbose};
207              
208             # field size for binary iteration (digits of precision)
209 1         2 my $y = $self->{word_length} - 1;
210             # total number of zero-based combinations
211 1         9 my $z = 2 ** $y - 1;
212             # field size for the count
213 1         3 my $lz = length $z;
214             # field size for a combination
215 1         2 my $m = $self->{word_length} + $y;
216             warn sprintf
217             "\tTotal combinations: %d\n\tConstrained combinations:\n",
218             $z + 1
219 1 50       2 if $self->{verbose};
220              
221             # Truth is a single partition character: the lowly dot.
222 1         8 my $c = Math::BaseCalc->new( digits => [ 0, '.' ] );
223              
224             # Build a word part combination for each iteration.
225 1         46 for my $n ( 0 .. $z ) {
226             # Iterate in base two.
227 256         587 my $i = $c->to_base( $n );
228              
229             # Get the binary digits as an array.
230 256         7306 my @i = split //, sprintf( '%0'.$y.'s', $i );
231              
232             # Join the character and digit arrays into a partitioned word.
233 256         408 my $t = '';
234             # ..by stepping over the characters and peeling off a digit.
235 256         777 for( split //, $self->{word} ) {
236             # Zero values become ''. Haha! Truth prevails.
237 2304   100     5575 $t .= $_ . (shift( @i ) || '');
238             }
239              
240 256 100       435 unless( grep { $t =~ /$_/ } @{ $self->{constraints} } ) {
  256         1165  
  256         467  
241             # Preach it.
242             printf "\t%".$lz.'d) %0'.$y.'s => %'.$m."s\n", $n, $i, $t
243 192 50       376 if $self->{verbose};
244 192         235 push @{ $self->combinations }, $t;
  192         293  
245             }
246             }
247              
248 1         10 return $self->{combinations};
249             }
250              
251             sub build_knowns {
252 1     1 1 3 my $self = shift;
253 1 50       2 return unless scalar keys %{ $self->{lexicon} };
  1         5  
254 1 50       4 warn "Entering build_knowns()\n" if $self->{verbose};
255              
256             # Save the familiarity value for each "raw" combination.
257 1         2 for my $combo (@{ $self->{combinations} }) {
  1         3  
258             # Skip combinations that have already been seen.
259 192 50       371 next if exists $self->{knowns}{$combo};
260              
261 192         296 my ($sum, $frag_sum, $char_sum) = (0, 0, 0);
262              
263             # Get the bits of the combination.
264 192         425 my @chunks = split /\./, $combo;
265 192         301 for (@chunks) {
266             # XXX Uh.. Magically handle hyphens in lexicon entries.
267 960         1449 ($_, my $combo_seen) = _hyphenate($_, $self->lexicon, 0);
268              
269             # Sum the combination familiarity values.
270 960 100       1871 if ($combo_seen) {
271 108         128 $frag_sum++;
272 108         171 $char_sum += length;
273             }
274             }
275             # XXX Huh? Why? Can $_ change or something?
276             # Stick our combination back together.
277 192         354 $combo = join '.', @chunks;
278              
279             # Save this combination and its familiarity ratios.
280 192         309 my $x = $frag_sum / @chunks;
281 192         281 my $y = $char_sum / $self->{word_length};
282 192 50       328 warn "\t$combo: [$x, $y]\n" if $self->{verbose};
283 192 100 66     471 if( $x || $y ) {
284 85         297 $self->{knowns}{$combo} = [ $x, $y ];
285             }
286             else {
287 107         230 delete $self->{knowns}{$combo};
288             }
289             }
290              
291 1         3 return $self->{knowns};
292             }
293              
294             # Reduce the number of known combinations by concatinating adjacent
295             # unknowns (and then removing any duplicates produced).
296              
297             #sub learn {
298             # my ($self, %args) = @_;
299             # Get the list of (partially) unknown stem combinations.
300             # Loop through each looking in %args or prompting for a definition.
301             #}
302              
303             # Update the given string with its actual lexicon value and increment
304             # the seen flag.
305             sub _hyphenate {
306 960     960   1514 my ($string, $lexicon, $combo_seen) = @_;
307              
308 960 100       2442 if (exists $lexicon->{$string}) {
    50          
    50          
309 108 50       195 $combo_seen++ if defined $combo_seen;
310             }
311             elsif (exists $lexicon->{"-$string"}) {
312 0 0       0 $combo_seen++ if defined $combo_seen;
313 0         0 $string = "-$string";
314             }
315             elsif (exists $lexicon->{"$string-"}) {
316 0 0       0 $combo_seen++ if defined $combo_seen;
317 0         0 $string = "$string-";
318             }
319              
320 960 50       2097 return wantarray ? ($string, $combo_seen) : $string;
321             }
322              
323             sub output_knowns {
324 0     0 1 0 my $self = shift;
325 0         0 my @out = ();
326 0         0 my $header = <
327             Combination [frag familiarity, char familiarity]
328             Fragment definitions
329              
330             HEADER
331              
332 0         0 for my $known (
333             reverse sort {
334             $self->{knowns}{$a}[0] <=> $self->{knowns}{$b}[0] ||
335 0 0       0 $self->{knowns}{$a}[1] <=> $self->{knowns}{$b}[1]
336 0         0 } keys %{ $self->{knowns} }
337             ) {
338 0         0 my @definition;
339 0         0 for my $chunk (split /\./, $known) {
340             push @definition,
341             defined $self->{definitions}{$chunk}
342             ? $self->{definitions}{$chunk}
343             ? $self->{definitions}{$chunk}
344             : $self->{not_defined}
345 0 0       0 : $self->{unknown};
    0          
346             }
347              
348             push @out, sprintf qq/%s [%s]\n%s/,
349             $known,
350 0         0 join (', ', map { sprintf '%0.2f', $_ }
351 0         0 @{ $self->{knowns}{$known} }),
352 0         0 join ($self->{separator}, @definition);
353             }
354              
355 0 0       0 return wantarray ? @out : $header . join "\n\n", @out;
356             }
357              
358             # Naive, no locking read/write. If you run a production environment,
359             # you know what to do.
360             sub lexicon_cache {
361 5     5 1 444 my( $self, $file, $value ) = @_;
362 5 50       13 warn "Entering lexicon_cache()\n" if $self->{verbose};
363              
364             # Set the file and the lexicon_file attribute if we are told to.
365 5 100 100     25 if( $file && $file eq 'lexicon_file' && $value ) {
      66        
366 1         3 $self->{lexicon_file} = $value;
367 1         3 $file = $value;
368             }
369              
370             # If there is no file try to use the lexicon_file.
371 5   66     16 $file ||= $self->{lexicon_file};
372             # Otherwise, bail out!
373             warn( "No lexicon cache file set\n" ) and return
374 5 50 0     12 if $self->{verbose} && !$file;
      33        
375              
376 5 100       16 if( $file ) {
377             # Store 'em if you got 'em.
378 3 100       3 if( keys %{ $self->{lexicon} } ) {
  3         11  
379 2 50       15 warn "store( $self->{lexicon}, $file )\n" if $self->{verbose};
380 2         9 store( $self->{lexicon}, $file );
381             }
382             # ..Retrieve 'em if not.
383             else {
384 1 50 33     5 warn "retrieve( $file )\n" if $self->{verbose} && -e $file;
385 1 50       22 $self->lexicon( retrieve( $file ) ) if -e $file;
386             }
387             }
388             }
389              
390             1;
391              
392             __END__