File Coverage

blib/lib/Text/Conversation.pm
Criterion Covered Total %
statement 221 329 67.1
branch 38 98 38.7
condition 14 30 46.6
subroutine 37 46 80.4
pod 2 3 66.6
total 312 506 61.6


line stmt bran cond sub pod time code
1             package Text::Conversation;
2              
3 1     1   7 use warnings;
  1         2  
  1         26  
4 1     1   5 use strict;
  1         2  
  1         28  
5              
6 1     1   6 use vars qw($VERSION);
  1         1  
  1         46  
7             $VERSION = '0.050';
8              
9 1     1   991 use Lingua::StopWords::EN qw(getStopWords);
  1         506  
  1         78  
10 1     1   861 use Lingua::Stem::Snowball qw(stem);
  1         2128  
  1         66  
11 1     1   936 use String::Approx qw(amatch adistr);
  1         5754  
  1         87  
12              
13 1     1   11 use constant CT_STOPWORDS => 0;
  1         2  
  1         82  
14 1     1   6 use constant CT_CONTEXT => 1;
  1         2  
  1         41  
15 1     1   6 use constant CT_NICKS => 2;
  1         2  
  1         37  
16 1     1   6 use constant CT_IDS => 3;
  1         2  
  1         38  
17 1     1   6 use constant CT_CONTEXT_MAX => 4;
  1         2  
  1         37  
18 1     1   6 use constant CT_WORDS => 5;
  1         2  
  1         45  
19 1     1   5 use constant CT_WORDS_TOTAL => 6;
  1         2  
  1         37  
20 1     1   6 use constant CT_DEBUG => 7;
  1         7  
  1         54  
21              
22 1     1   6 use constant CTX_ID => 0;
  1         2  
  1         44  
23 1     1   5 use constant CTX_NICK => 1;
  1         1  
  1         45  
24 1     1   5 use constant CTX_ADDRESSEE => 2;
  1         1  
  1         37  
25 1     1   5 use constant CTX_WORDS => 3;
  1         3  
  1         44  
26              
27 1     1   5 use constant ID_REFERENT => 0;
  1         2  
  1         39  
28 1     1   5 use constant ID_REFERERS => 1;
  1         1  
  1         41  
29 1     1   5 use constant ID_TEXT => 2;
  1         2  
  1         41  
30 1     1   5 use constant ID_NICK => 3;
  1         1  
  1         4710  
31              
32             ### Manage scrollback.
33              
34             # The next ID is package static so messages will be unique across all
35             # threaders.
36              
37             my $next_id = "a";
38              
39             sub new {
40 1     1 1 3 my ($class, %args) = @_;
41              
42 1   50     10 $args{thread_buffer} ||= 30;
43              
44 1   50     14 my $self = bless [
      50        
45             undef, # CT_STOPWORDS
46             [ ], # CT_CONTEXT
47             { }, # CT_NICKS
48             { }, # CT_IDS
49             $args{thread_buffer} || 30, # CT_CONTEXT_MAX
50             { }, # CT_WORDS
51             0, # CT_WORDS_TOTAL
52             $args{debug} || 0, # CT_DEBUG
53             ], $class;
54              
55             # Stem stopwords.
56              
57 1         6 my $stopwords = getStopWords();
58 1         263 my %stopwords;
59              
60 1         46 foreach my $stopword (keys %$stopwords) {
61             # $stopwords{$self->_word_stem($stopword)}++;
62             }
63              
64 1         15 $self->[CT_STOPWORDS] = \%stopwords;
65              
66 1         27 return $self;
67             }
68              
69             #sub hear {
70             # my ($self, $nick, $ident, $host, $text) = @_;
71             #}
72             #
73             #sub see {
74             # my ($self, $nick, $ident, $host, $text) = @_;
75             #}
76             #
77             #sub rename {
78             # my ($self, $old_nick, $new_nick, $ident, $host) = @_;
79             #}
80             #
81             #sub arrival {
82             # my ($self, $nick, $ident, $host) = @_;
83             #}
84             #
85             #sub departure {
86             # my ($self, $nick, $ident, $host) = @_;
87             #}
88              
89             sub observe {
90 3     3 1 6 my ($self, $nick, $text) = @_;
91              
92             # IRC nicks are case-insensitive.
93 3         9 $nick = $self->_nick_fix($nick);
94              
95 3 50       9 if ($self->[CT_DEBUG]) {
96 0         0 warn ">>>> <$nick> $text\n";
97             }
98              
99             # Extract non-stopwords from spoken text.
100             #
101             # TODO - Determine stopwords dynamically from observed context. Or
102             # perhaps generate stopwords from some logs.
103             #
104             # Stem the words here, so they go into the system as stems.
105              
106 3         6 my $words_text = lc($text);
107 3         13 $words_text =~ s/[^\w\s]+/ /g;
108              
109 3         4 my %my_words;
110 3         12 foreach my $word (
  15         37  
111 15         32 grep { ! exists $self->[CT_STOPWORDS]{$_} }
112 15         27 map { $self->_word_stem($_) }
113             grep { length() > 1 }
114             split /\s+/, $words_text
115             ) {
116 15         30 $my_words{$word}++;
117             }
118              
119 3         14 my @my_words = keys %my_words;
120              
121             # Find explicit addressees.
122              
123 3         8 my $addressee_text = lc($text);
124              
125 3         5 my $best_addressee = "";
126 3         4 my $best_addressee_score = 0;
127              
128             # Nickname starts the line.
129              
130 3 50 33     36 if (
      33        
131             ($addressee_text =~ /^\s*(\S+?)\s*[:,]\s+/) or
132             ($addressee_text =~ /^\s*(\S+?)-*\s+/) or
133             ($addressee_text =~ /^\s*t\s+(\S+?)\s+/)
134             ) {
135 3         6 my $test = $self->_nick_fix($1);
136 3 50       9 if ($self->[CT_DEBUG]) {
137 0         0 warn " pre($test)\n";
138             }
139 3         9 my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
140              
141             # Best addressee score is 3x because the nick is at the start.
142 3         6 $best_nick_score *= 3;
143              
144 3 50       9 if ($best_nick_score > $best_addressee_score) {
145 0 0       0 if ($self->[CT_DEBUG]) {
146 0         0 warn(
147             " found $test ",
148             "($best_nick = $best_nick_score > $best_addressee_score)\n"
149             );
150             }
151 0         0 $best_addressee = $best_nick;
152 0         0 $best_addressee_score = $best_nick_score;
153             }
154             }
155              
156             # Nickname ends the line.
157              
158 3 50       62 if ($addressee_text =~ /[\s,]*(\S+?)[.?!'")\]\}\s]*$/) {
159 3         8 my $test = $self->_nick_fix($1);
160 3 50       9 if ($self->[CT_DEBUG]) {
161 0         0 warn " post($test)\n";
162             }
163 3         9 my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
164              
165             # Best addressee score is 2x because the nick is at the end.
166 3         4 $best_nick_score *= 2;
167              
168 3 50       8 if ($best_nick_score > $best_addressee_score) {
169 0 0       0 if ($self->[CT_DEBUG]) {
170 0         0 warn(
171             " found $test ",
172             "($best_nick = $best_nick_score > $best_addressee_score)\n"
173             );
174             }
175 0         0 $best_addressee = $best_nick;
176 0         0 $best_addressee_score = $best_nick_score;
177             }
178             }
179              
180             # Nickname occurs somewhere in the middle.
181              
182 3         17 while ($addressee_text =~ m/\s*,\s*(\S+?)\s*[,!?.]\s*/g) {
183 0         0 my $test = $self->_nick_fix($1);
184 0 0       0 if ($self->[CT_DEBUG]) {
185 0         0 warn " in($test)";
186             }
187 0         0 my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
188              
189 0 0       0 if ($best_nick_score > $best_addressee_score) {
190 0 0       0 if ($self->[CT_DEBUG]) {
191 0         0 warn(
192             " found $test ",
193             "($best_nick = $best_nick_score > $best_addressee_score)\n"
194             );
195             }
196 0         0 $best_addressee = $best_nick;
197 0         0 $best_addressee_score = $best_nick_score;
198             }
199             }
200              
201 3 50       9 if ($self->[CT_DEBUG]) {
202 0         0 warn " best addressee score = $best_addressee_score\n";
203             }
204              
205             # TODO - If an implied statement goes to nobody, then perhaps it's a
206             # continuation of the last statement they said?
207              
208 3         3 my $seen_them_factor = 0;
209 3         4 my $seen_me_factor = 0;
210 3         3 my $seen_other_factor = 0;
211              
212 3         4 my $best_score = 0;
213 3         3 my $best_index;
214              
215 3         3 my $index = @{$self->[CT_CONTEXT]};
  3         5  
216 3         9 while ($index--) {
217 3         6 my $context = $self->[CT_CONTEXT][$index];
218 3         4 my $them = $context->[CTX_NICK];
219              
220             # Figure out speaker/them affinity.
221              
222 3         10 my $affinity = $self->_nick_score($nick, $them);
223              
224 3         8 my $match_factor = $self->_correlate_statements(
225             \@my_words, $context->[CTX_WORDS]
226             );
227              
228 3         4 my $distance_factor = @{$self->[CT_CONTEXT]} - $index;
  3         7  
229              
230 3         4 my $addressee_score = 0;
231 3 50       7 $addressee_score = $best_addressee_score if $them eq $best_addressee;
232              
233             # Weigh factors.
234              
235 3         3 my $weighted_addressee = $addressee_score * 30;
236 3         5 my $weighted_affinity = $affinity * 45; # half addressee
237 3         4 my $weighted_match = $match_factor * 30;
238 3         4 my $weighted_seen_them = $seen_them_factor * -3;
239 3         3 my $weighted_seen_me = $seen_me_factor * -3;
240 3         4 my $weighted_distance = $distance_factor * -1;
241              
242             # Calculate a weighted score.
243              
244 3         5 my $score = (
245             $weighted_affinity +
246             $weighted_addressee +
247             $weighted_match +
248             $weighted_seen_them +
249             $weighted_seen_me +
250             $weighted_distance
251             );
252              
253 3 100       8 if ($score > $best_score) {
254 1         1 $best_score = $score;
255 1         2 $best_index = $index;
256             }
257              
258 3         59 my $out = sprintf(
259             ( " aff(%9.3f) addr(%9.3f) match(%9.3f) " .
260             "sthem(%9.3f) sme(%9.3f) dst(%9.3f) " .
261             "score(%9.3f) best(%9.3f) "
262             ),
263             $weighted_affinity, $weighted_addressee, $weighted_match,
264             $weighted_seen_them, $weighted_seen_me, $weighted_distance,
265             $score, $best_score,
266             );
267              
268 3         11 $out .= substr(
269             $self->_id_get_text($context->[CTX_ID]), 0, 156 - length($out) - 2
270             );
271              
272 3 50       14 if ($self->[CT_DEBUG]) {
273 0         0 warn $out, "\n";
274             }
275              
276             # Serious penalties for passing people by.
277 3 50       7 if ($nick eq $them) {
278 0 0       0 if ($seen_other_factor) {
279 0         0 $seen_me_factor++;
280             }
281             }
282             else {
283 3         9 $seen_other_factor++;
284             }
285             }
286              
287 3         9 return $self->process_match(
288             $best_index, $nick, $text, \%my_words, $best_score
289             );
290             }
291              
292             sub _context_get_id {
293 1     1   2 my ($self, $index) = @_;
294 1         3 return $self->[CT_CONTEXT][$index][CTX_ID];
295             }
296              
297             sub _context_get_nick {
298 1     1   2 my ($self, $index) = @_;
299 1         3 return $self->[CT_CONTEXT][$index][CTX_NICK];
300             }
301              
302             ### Manage seen nicks.
303              
304             # Add a nickname to the database, or update the confidence between
305             # $nick and $addressee of an existing nickname.
306              
307             sub _nick_add {
308 3     3   5 my ($self, $nick, $addressee, $confidence) = @_;
309              
310             # Make sure the nick exists.
311 3 50       17 unless (exists $self->[CT_NICKS]{$nick}) {
312 3         7 $self->[CT_NICKS]{$nick} = { };
313             }
314              
315 3 50 66     13 if (
316             defined($addressee) and
317             !exists($self->[CT_NICKS]{$addressee})
318             ) {
319 0         0 $self->[CT_NICKS]{$addressee} = { };
320             }
321              
322             # Decay everybody. This is a lousy O(N**2) problem.
323 3         3 foreach my $me (keys %{$self->[CT_NICKS]}) {
  3         10  
324 6         6 foreach my $them (keys %{$self->[CT_NICKS]{$me}}) {
  6         23  
325 0         0 $self->_nick_decay_link($me, $them);
326             }
327             }
328              
329             # Average in the new confidence.
330 3 100       10 if (defined $addressee) {
331 1 50       5 if (exists $self->[CT_NICKS]{$addressee}{$nick}) {
332 0 0       0 if ($self->[CT_NICKS]{$addressee}{$nick} < $confidence) {
333 0         0 $self->[CT_NICKS]{$addressee}{$nick} = $confidence;
334             }
335             }
336             else {
337 1         3 $self->[CT_NICKS]{$addressee}{$nick} = $confidence;
338             }
339              
340 1 50       3 if (exists $self->[CT_NICKS]{$nick}{$addressee}) {
341 0 0       0 if ($self->[CT_NICKS]{$nick}{$addressee} < $confidence) {
342 0         0 $self->[CT_NICKS]{$nick}{$addressee} = $confidence;
343             }
344             }
345             else {
346 1         4 $self->[CT_NICKS]{$nick}{$addressee} = $confidence;
347             }
348             }
349             }
350              
351             sub _nick_decay_link {
352 0     0   0 my ($self, $me, $them) = @_;
353 0         0 $self->[CT_NICKS]{$me}{$them} /= 4;
354 0 0       0 if ($self->[CT_NICKS]{$me}{$them} < 1) {
355 0         0 delete $self->[CT_NICKS]{$me}{$them};
356             }
357             }
358              
359             sub _nick_del {
360 0     0   0 my $nick = shift;
361             # Nothing?
362             }
363              
364             # The score is the average of the speaker/other and other/speaker
365             # links. It must be a number from 0 through 1.
366              
367             sub _nick_score {
368 3     3   4 my ($self, $speaker, $other) = @_;
369              
370             # Speaker to other.
371              
372 3         4 my $total_speaker_to_other = 0;
373 3         3 my $speaker_to_other = 0;
374              
375 3 50 33     10 if (
376             exists($self->[CT_NICKS]{$speaker}) and
377             exists($self->[CT_NICKS]{$speaker}{$other})
378             ) {
379 0         0 $speaker_to_other = $self->[CT_NICKS]{$speaker}{$other};
380 0         0 foreach my $audience (keys %{$self->[CT_NICKS]{$speaker}}) {
  0         0  
381 0         0 $total_speaker_to_other += $self->[CT_NICKS]{$speaker}{$audience};
382             }
383             }
384              
385             # Other from speaker.
386              
387 3         12 my $total_other_to_speaker = 0;
388 3         3 my $other_to_speaker = 0;
389              
390 3 50 33     17 if (
391             exists($self->[CT_NICKS]{$other}) and
392             exists($self->[CT_NICKS]{$other}{$speaker})
393             ) {
394 0         0 $other_to_speaker = $self->[CT_NICKS]{$other}{$speaker};
395 0         0 foreach my $them (keys %{$self->[CT_NICKS]{$other}}) {
  0         0  
396 0         0 $total_other_to_speaker += $self->[CT_NICKS]{$other}{$them};
397             }
398             }
399              
400             # If the total of the totals is zero, then avoid the division by
401             # zero.
402              
403 3         5 my $total_total = $total_speaker_to_other + $total_other_to_speaker;
404 3 50       9 return 0 unless $total_total;
405              
406 0         0 return( ($speaker_to_other + $other_to_speaker) / $total_total );
407             }
408              
409             sub _nick_fix {
410 9     9   18 my ($self, $nick) = @_;
411              
412 9         15 my $fixed_nick = lc($nick);
413 9         15 $fixed_nick =~ s/^q\[(\S+)]$/$1/; # q[nick] remove the quotes
414 9         41 $fixed_nick =~ s/[^A-Za-z0-9]*$//; # remove trailing junk
415 9         22 $fixed_nick =~ s/^[^A-Za-z0-9]*//; # remove leading junk
416              
417             # If it's all junk, return it lowercased.
418 9 100       25 $fixed_nick = lc($nick) unless length $fixed_nick;
419              
420 9         19 return $fixed_nick;
421             }
422              
423             # Does a nickname exist? Return a new new nickname and a number
424             # between 0 and 1 that tells how much the given nickname matches it.
425              
426             sub _nick_exists {
427 6     6   8 my ($self, $nick) = @_;
428              
429             # No match if nothing here. Keeps amatch() from bailing.
430 6         7 my @known_nicks = keys %{$self->[CT_NICKS]};
  6         15  
431 6 100       18 return (undef, 0) unless @known_nicks;
432              
433             # Often a nickname is a shortened version of some other. Sometimes
434             # it's an extended version of it. Other times it's a bastardization
435             # of a known nickname.
436             #
437             # Find all the nicknames that begin with the specified nickname.
438             #
439             # TODO - If there are none, try string distances? Is there a better
440             # way to hash string distances with lengths?
441              
442 4         62 my @found = grep /^\Q$nick/, @known_nicks;
443              
444             # Never did find nothin'.
445 4 50       17 return (undef, 0) unless @found;
446              
447 0 0       0 if ($self->[CT_DEBUG]) {
448 0         0 warn " $nick matches (@found)\n";
449             }
450              
451             # Find the best match out of the found matches. "Best match" is a
452             # combination of string distance and ratio of entered nick to match.
453              
454 0         0 my @proximities = map { 1 - $_ } adistr($nick, @found);
  0         0  
455              
456 0 0       0 if ($self->[CT_DEBUG]) {
457 0         0 warn " $nick proximities (@proximities)\n";
458             }
459              
460 0         0 my ($best_nick, $best_score) = ("", 0);
461 0         0 while (@found) {
462 0 0       0 die unless @found == @proximities;
463 0         0 my $match = shift @found;
464 0         0 my $prox = shift @proximities;
465              
466             # Words closer to the input length score higher.
467             # Squared so it diminishes faster.
468 0         0 my $length_score = (length($nick) / length($match)) ** 2;
469              
470 0         0 my $score = $prox * $length_score;
471 0 0       0 next if $score < $best_score;
472              
473 0 0       0 if ($self->[CT_DEBUG]) {
474 0         0 warn " $prox * $length_score = $score\n";
475             }
476              
477 0         0 $best_nick = $match;
478 0         0 $best_score = $score;
479             }
480              
481 0 0       0 if ($best_nick) {
482 0 0       0 if ($self->[CT_DEBUG]) {
483 0         0 warn " $nick = $best_nick ($best_score)\n";
484             }
485 0         0 return ($best_nick, $best_score);
486             }
487              
488 0 0       0 if ($self->[CT_DEBUG]) {
489 0         0 warn " $nick not found\n";
490             }
491 0         0 return (undef, 0);
492             }
493              
494             ### Manage known IDs.
495              
496             sub _id_fully_qualified {
497 0     0   0 my ($self, $id) = @_;
498              
499 0         0 my @key;
500 0         0 while ($id) {
501 0         0 unshift @key, $id;
502 0         0 $id = $self->[CT_IDS]{$id}[ID_REFERENT];
503             }
504              
505 0         0 return join "/", @key;
506             }
507              
508             sub _id_add {
509 3     3   7 my ($self, $id, $referent, $nick, $text) = @_;
510              
511 3         12 $self->[CT_IDS]{$id} = [
512             $referent, # ID_REFERENT
513             [ ], # ID_REFERERS
514             $text, # ID_TEXT
515             $nick, # ID_NICK
516             ];
517              
518 3 100 66     13 if ($referent and exists $self->[CT_IDS]{$referent}) {
519 1         2 push @{$self->[CT_IDS]{$referent}[ID_REFERERS]}, $id;
  1         4  
520             }
521             }
522              
523             sub _id_del {
524 0     0   0 my ($self, $id) = @_;
525              
526 0         0 my $old = delete $self->[CT_IDS]{$id};
527              
528             # Fix the statement's kids to stop pointing at the parent.
529              
530 0         0 foreach my $referer (@{$old->[ID_REFERERS]}) {
  0         0  
531 0         0 $self->[CT_IDS]{$referer}[ID_REFERENT] = undef;
532             }
533             }
534              
535             sub _id_get_referent {
536 0     0   0 my ($self, $id) = @_;
537 0         0 return $self->[CT_IDS]{$id}[ID_REFERENT];
538             }
539              
540             sub _id_get_nick {
541 0     0   0 my ($self, $id) = @_;
542              
543             # XXX - Happens when someone explicitly addresses a nickname that
544             # hasn't appeared yet.
545             #
546             # Use of uninitialized value in hash element at ChatThread.pm line 352.
547              
548 0         0 return $self->[CT_IDS]{$id}[ID_NICK];
549             }
550              
551             sub _id_exists {
552 0     0   0 my ($self, $id) = @_;
553 0         0 return exists $self->[CT_IDS]{$id};
554             }
555              
556             sub _id_get_text {
557 3     3   5 my ($self, $id) = @_;
558 3         13 return $self->[CT_IDS]{$id}[ID_TEXT];
559             }
560              
561             sub _id_list {
562 0     0   0 my $self = shift;
563              
564             # We do this rather than keys of CT_IDS because it's always
565             # guaranteed to be in time order. That is, referents come before
566             # stuff that refers to them.
567 0         0 return map { $_->[CTX_ID] } @{$self->[CT_CONTEXT]};
  0         0  
  0         0  
568             }
569              
570             # Fuzzy match text.
571             #
572             # The current idea is to return a number that represents how much of
573             # @$my_words matches %$their_words; Each matching word multiplied by
574             # a per-word score that reflects the uniqueness of that word.
575              
576             sub _correlate_statements {
577 3     3   6 my ($self, $my_words, $their_words) = @_;
578              
579 3         4 my $match_factor = 0;
580 3   50     8 my $total_words = @$my_words || 1;
581              
582 3         4 foreach my $my_word (@$my_words) {
583 11 100       26 next unless exists $their_words->{$my_word};
584 2         8 $match_factor += $self->_word_get_score($my_word);
585             }
586              
587 3         8 return $match_factor / $total_words;
588             }
589              
590             ###
591              
592             sub process_match {
593 3     3 0 8 my ($self, $index, $nick, $text, $my_words, $confidence) = @_;
594              
595 3         4 my $id = $next_id++;
596              
597 3         4 my ($referent, $addressee, $print_addressee);
598              
599 3 100       8 if (defined $index) {
600 1         12 $referent = $self->_context_get_id($index);
601 1         6 $addressee = $self->_context_get_nick($index);
602              
603             # If the person refers to themselves, refer them instead to
604             # whoever they were talking to previously.
605 1 50       12 if ($addressee eq $nick) {
606 0 0       0 if ($self->_id_exists($referent)) {
607 0         0 $referent = $self->_id_get_referent($referent);
608 0         0 $addressee = $self->_id_get_nick($referent);
609             }
610             else {
611 0         0 $referent = $addressee = undef;
612             }
613             }
614             }
615              
616 3 100       8 if (defined $addressee) {
617 1         2 $print_addressee = $addressee;
618             }
619             else {
620 2         3 $print_addressee = "(nobody)";
621             }
622              
623 3 50       9 if ($self->[CT_DEBUG]) {
624 0         0 warn "<<<< ($id) $nick -> $print_addressee : $text\n";
625             }
626              
627             # XXX - _context_add() ?
628 3         4 push @{$self->[CT_CONTEXT]}, [
  3         9  
629             $id, # CTX_ID
630             $nick, # CTX_NICK
631             $addressee, # CTX_ADDRESSEE
632             $my_words, # CTX_WORDS
633             $referent, # CTX_REFERENT
634             ];
635              
636 3         9 my $debug_text = "<$nick> $text";
637              
638 3         10 $self->_nick_add($nick, $addressee, $confidence);
639 3         8 $self->_id_add($id, $referent, $nick, $debug_text);
640 3         40 $self->_words_add($my_words);
641              
642             # XXX - _context_prune() ?
643             # XXX - Deleting the words here is a cheezy way to decay word
644             # importance over time.
645 3         7 while (@{$self->[CT_CONTEXT]} > $self->[CT_CONTEXT_MAX]) {
  3         11  
646 0         0 my $old = shift @{$self->[CT_CONTEXT]};
  0         0  
647 0         0 $self->_nick_del($old->[CTX_NICK]);
648 0         0 $self->_words_del($old->[CTX_WORDS]);
649 0         0 $self->_id_del($old->[CTX_ID]);
650             }
651              
652             return (
653 3         16 $id, # new ID
654             $referent, # referent ID
655             $debug_text, # display text
656             );
657             }
658              
659             ### Manage words, for frequency and feature extraction.
660              
661             sub _word_stem {
662 15     15   27 my ($self, $word) = @_;
663 15         34 my $stem = stem("en", $word);
664 15         770 return $stem;
665             }
666              
667             sub _words_add {
668 3     3   6 my ($self, $words) = @_;
669              
670 3         8 foreach my $word (keys %$words) {
671 15         16 $self->[CT_WORDS_TOTAL] += $words->{$word};
672 15         32 $self->[CT_WORDS]{$word} += $words->{$word};
673             }
674             }
675              
676             sub _words_del {
677 0     0   0 my ($self, $words) = @_;
678              
679             # XXX - Experimenting with building a huge corpus.
680 0         0 return;
681              
682 0         0 foreach my $word (keys %$words) {
683 0         0 $self->[CT_WORDS_TOTAL] -= $words->{$word};
684 0         0 $self->[CT_WORDS]{$word} -= $words->{$word};
685 0 0       0 next if $self->[CT_WORDS]{$word} > 0;
686 0         0 delete $self->[CT_WORDS]{$word};
687             }
688             }
689              
690             # The word's score increases as its frequency decreases.
691              
692             sub _word_get_score {
693 2     2   3 my ($self, $stem) = @_;
694              
695 2   50     6 my $word_count = $self->[CT_WORDS]{$stem} || 0;
696 2   50     6 my $corpus_count = $self->[CT_WORDS_TOTAL] || 1;
697              
698 2         4 my $word_score = ($corpus_count - $word_count) / $corpus_count;
699 2 50       6 if (exists $self->[CT_STOPWORDS]{$stem}) {
700 0         0 $word_score /= 2;
701             }
702              
703 2         6 return $word_score;
704             }
705              
706             1;
707              
708             __END__