File Coverage

blib/lib/Text/Summarize.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Text::Summarize;
2 1     1   392968 use strict;
  1         4  
  1         40  
3 1     1   6 use warnings;
  1         1  
  1         30  
4 1     1   1760 use Log::Log4perl;
  1         187936  
  1         6  
5 1     1   1847 use Text::Categorize::Textrank;
  0            
  0            
6             use Data::Dump qw(dump);
7              
8             BEGIN
9             {
10             use Exporter ();
11             use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12             $VERSION = '0.50';
13             @ISA = qw(Exporter);
14             @EXPORT = qw(getSumbasicRankingOfSentences);
15             @EXPORT_OK = qw(getSumbasicRankingOfSentences);
16             %EXPORT_TAGS = ();
17             }
18              
19             #12345678901234567890123456789012345678901234
20             #Routine to compute summaries of text.
21              
22             =head1 NAME
23              
24             C<Text::Summarize> - Routine to compute summaries of text.
25              
26             =head1 SYNOPSIS
27              
28             use strict;
29             use warnings;
30             use Text::Summarize;
31             use Data::Dump qw(dump);
32             my $listOfSentences = [
33             { id => 0, listOfTokens => [qw(all people are equal)] },
34             { id => 1, listOfTokens => [qw(all men are equal)] },
35             { id => 2, listOfTokens => [qw(all are equal)] },
36             ];
37             dump getSumbasicRankingOfSentences(listOfSentences => $listOfSentences);
38              
39             =head1 DESCRIPTION
40              
41             C<Text::Summarize> contains a routine to score a list of sentences
42             for inclusion in a summary of the text using the
43             SumBasic algorithm from the report I<Beyond SumBasic: Task-Focused Summarization with Sentence Simplification and Lexical Expansion>
44             by L. Vanderwendea, H. Suzukia, C. Brocketta, and A. Nenkovab.
45              
46             =head1 ROUTINES
47              
48             =head2 C<getSumbasicRankingOfSentences>
49              
50             use Text::Summarize;
51             use Data::Dump qw(dump);
52             my $listOfSentences = [
53             { id => 0, listOfTokens => [qw(all people are equal)] },
54             { id => 1, listOfTokens => [qw(all men are equal)] },
55             { id => 2, listOfTokens => [qw(all are equal)] },
56             ];
57             dump getSumbasicRankingOfSentences(listOfSentences => $listOfSentences);
58              
59             C<getSumbasicRankingOfSentences> computes the SumBasic score of the list of sentences
60             provided. It returns an array reference containing the pairs C<[id, score]> sorted
61             in descending order of score, where C<id> is from C<listOfSentences>.
62              
63             =over
64              
65             =item C<listOfSentences>
66              
67             listOfSentences => [{id => '..', listOfTokens => [...]}, ..., {id => '..', listOfTokens => [...]}]
68              
69             C<listOfSentences> holds the list of sentences that are to be scored. Each
70             item in the list is a hash reference of the form C<{id =E<gt> '..', listOfTokens =E<gt> [...]}> where
71             C<id> is a unique identifier for the sentence and C<listOfTokens> is an array
72             reference of the list of tokens comprising the sentence.
73              
74             =item C<tokenWeight>
75              
76             tokenWeight => {}
77              
78             C<tokenWeight> is an optional hash reference that provides the weight of the tokens defined
79             in C<listOfSentences>. If C<tokenWeight> is defined, but undefined for a token in a sentence,
80             then the token's weight defaults to zero unless C<ignoreUndefinedTokens> is true,
81             in which case the token is ignored and not used to compute the average weight
82             of the sentences containing it. If C<tokenWeight> is undefined then the weights of the tokens
83             are either their frequency of occurrence in the filtered text, or their textranks if C<textRankParameters> is defined.
84              
85             =item C<ignoreUndefinedTokens>
86              
87             ignoreUndefinedTokens => 0
88              
89             If C<ignoreUndefinedTokens> is true, then any tokens for which C<tokenWeight> is
90             undefined are ignored and not used to compute the average weight of a
91             sentence; the default is false.
92              
93             =item C<tokenWeightUpdateFunction>
94              
95             tokenWeightUpdateFunction => &subroutine (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
96              
97             C<tokenWeightUpdateFunction> is an optional parameter for defining the function that updates the
98             weight of a token when it is contained in a selected sentence. Five parameters are passed to the
99             subroutine: the token's current weight (float), the token's initial weight (float), the token (string), the C<id> of the
100             selected sentence (string), and the current average weight of the tokens in the selected sentence (float).
101             The default is L<tokenWeightUpdateFunction_Squared>.
102              
103             =item C<textRankParameters>
104              
105             textRankParameters => undef
106              
107             If C<textRankParameters> is defined, then the token weights
108             are computed using L<Text::Categorize::Textrank>. The parameters to use for L<Text::Categorize::Textrank>,
109             excluding the C<listOfTokens> parameters, can be set using the hash reference defined by C<textRankParameters>.
110             For example, C<textRankParameters =E<gt> {directedGraph =E<gt> 1}> would make the textrank weights
111             be computed using a directed token graph.
112              
113             =back
114              
115             =cut
116              
117             sub getSumbasicRankingOfSentences
118             {
119             my (%Parameters) = @_;
120              
121             # get the list of sentences.
122             my $listOfSentences = $Parameters{listOfSentences} if exists $Parameters{listOfSentences};
123             return [] unless defined $listOfSentences;
124              
125             # get the original token weights.
126             my $originalTokenWeights;
127             $originalTokenWeights = $Parameters{tokenWeight} if (exists($Parameters{tokenWeight}) && defined($Parameters{tokenWeight}));
128            
129             # if textRankParameters is defined, compute the token weights via textrank.
130             if (exists($Parameters{textRankParameters}) && defined($Parameters{textRankParameters}))
131             {
132             $originalTokenWeights = _getTextRankWeightOfTokens(%Parameters, listOfSentences => $listOfSentences);
133             }
134              
135             # if $originalTokenWeights is not defined, then use the frequency of the tokens as their weight.
136             if (!defined ($originalTokenWeights))
137             {
138             $originalTokenWeights = _getFrequencyWeightOfTokens(listOfSentences => $listOfSentences);
139             }
140              
141             # get the function to update the weights of the tokens.
142             my $tokenWeightUpdateFunction = \&tokenWeightUpdateFunction_Squared;
143             $tokenWeightUpdateFunction = $Parameters{tokenWeightUpdateFunction} if exists $Parameters{tokenWeightUpdateFunction};
144              
145             # set the flag for ignoreUndefinedTokens.
146             my $ignoreUndefinedTokens = exists $Parameters{ignoreUndefinedTokens} && $Parameters{ignoreUndefinedTokens};
147              
148             # copy the weights of only the tokens that occur in the sentences.
149             # the default weight of a token is zero.
150             my %tokenWeight;
151             for (my $i = 0 ; $i < @$listOfSentences ; $i++)
152             {
153              
154             # if the sentence has no id, skip it.
155             unless (exists $listOfSentences->[$i]->{id})
156             {
157              
158             # get the list of tokens in the sentence as a string.
159             my $stringOfTokens;
160             if (exists($listOfSentences->[$i]->{listOfTokens}))
161             {
162             $stringOfTokens = join(' ', @{ $listOfSentences->[$i]->{listOfTokens} });
163             }
164              
165             # create the message to log.
166             my $logger = Log::Log4perl->get_logger();
167             my $message;
168             if (defined $stringOfTokens)
169             {
170             $message = "warning: skipping sentence number $i with tokens $stringOfTokens since it is missing an id.\n";
171             }
172             else
173             {
174             $message = "warning: skipping sentence number $i since it is missing an id and listOfTokens.\n";
175             }
176              
177             # log the message as a warning.
178             $logger->logwarn($message);
179              
180             # skip processing the sentence.
181             next;
182             }
183              
184             # get the listOfTokens of the sentence.
185             if ((exists $listOfSentences->[$i]->{listOfTokens}) && (@{ $listOfSentences->[$i]->{listOfTokens} }))
186             {
187             my $listOfTokens = $listOfSentences->[$i]->{listOfTokens};
188             foreach my $token (@$listOfTokens)
189             {
190              
191             # if the weight is already defined for the token, skip it.
192             next if exists $tokenWeight{$token};
193              
194             # if weight for token not defined, it defaults to zero if ignoreUndefinedTokens is false.
195             if (exists $originalTokenWeights->{$token})
196             {
197             $tokenWeight{$token} = $originalTokenWeights->{$token};
198             }
199             elsif (!$ignoreUndefinedTokens)
200             {
201             $tokenWeight{$token} = 0;
202             }
203             }
204             }
205             }
206              
207             # normalize the token weights to sum to one.
208             my $sum = 0;
209             while (my ($token, $weight) = each %tokenWeight) { $sum += $weight; }
210             $sum = 1 if ($sum == 0);
211             while (my ($token, $weight) = each %tokenWeight) { $tokenWeight{$token} /= $sum; }
212              
213             # keep a copy of the initial token weights.
214             my %initialTokenWeight = %tokenWeight;
215              
216             # @listOfEmptySentenceIds will hold the list of empty sentence ids.
217             my @listOfEmptySentenceIds;
218              
219             # make a copy of the list of sentences
220             my @localListOfSentences;
221             for (my $i = 0 ; $i < @$listOfSentences ; $i++)
222             {
223              
224             # if the sentence has no id, skip it.
225             next unless exists $listOfSentences->[$i]->{id};
226              
227             # copy the id of the sentence.
228             my %sentence;
229             $sentence{id} = $listOfSentences->[$i]->{id};
230              
231             # convert the list of tokens in a sentence to a hash with the key as the token and the value its occurance in the sentence.
232             if ((exists $listOfSentences->[$i]->{listOfTokens}) && (@{ $listOfSentences->[$i]->{listOfTokens} }))
233             {
234             my %tokenCount;
235             my $empty = 1;
236             foreach my $token (@{ $listOfSentences->[$i]->{listOfTokens} })
237             {
238              
239             # if the weight for the token is not defined, skip it.
240             if (exists $tokenWeight{$token})
241             {
242             ++$tokenCount{$token};
243             $empty = 0;
244             }
245             }
246              
247             # if the sentence has no defined tokens, store the id on @listOfEmptySentenceIds.
248             if ($empty)
249             {
250             push @listOfEmptySentenceIds, [ $listOfSentences->[$i]->{id}, scalar @{ $listOfSentences->[$i]->{listOfTokens} } ];
251             }
252             else
253             {
254             $sentence{tokenCounts} = \%tokenCount;
255              
256             # store the sentence in a list.
257             push @localListOfSentences, \%sentence;
258             }
259             }
260             else
261             {
262              
263             # if the sentence has no tokens, store the id on @listOfEmptySentenceIds.
264             push @listOfEmptySentenceIds, [ $listOfSentences->[$i]->{id}, scalar @{ $listOfSentences->[$i]->{listOfTokens} } ];
265             }
266             }
267              
268             # compute the average weight of each sentence and initialize its selected flag to false.
269             for (my $i = 0 ; $i < @localListOfSentences ; $i++)
270             {
271              
272             # get the pointer to the sentence.
273             my $sentence = $localListOfSentences[$i];
274              
275             # compute the weight of the sentence.
276             my $weight = 0;
277             my $tokenCountSum = 0;
278             while (my ($token, $count) = each %{ $sentence->{tokenCounts} })
279             {
280             $weight += $count * $tokenWeight{$token};
281             $tokenCountSum += $count;
282             }
283             $sentence->{size} = $tokenCountSum;
284             $sentence->{weight} = $weight / $sentence->{size};
285              
286             # initialize each sentence as not selected.
287             $sentence->{selected} = 0;
288             }
289              
290             # build the inverted index of the sentences and tokens, called tokenSentenceIndex.
291             my %tokenSentenceIndex;
292             for (my $i = 0 ; $i < @localListOfSentences ; $i++)
293             {
294              
295             # get the pointer to the sentence.
296             my $sentence = $localListOfSentences[$i];
297              
298             # get the list of tokens in the sentence.
299             foreach my $token (keys %{ $sentence->{tokenCounts} })
300             {
301              
302             # add the weightSentence pointer to the tokenSentenceIndex.
303             $tokenSentenceIndex{$token} = [] unless exists $tokenSentenceIndex{$token};
304              
305             # note we are storing the index of the sentence, not the pointer to the sentence.
306             push @{ $tokenSentenceIndex{$token} }, $i;
307             }
308             }
309              
310             # make the list of just the tokens.
311             my @listOfTokens = keys %tokenWeight;
312              
313             # @rankedListOfSentences will hold the sentences in sumbasic order.
314             my @rankedListOfSentences;
315              
316             # loop over the sentences until they have all been selected.
317             while (scalar(@rankedListOfSentences) < scalar(@localListOfSentences))
318             {
319              
320             # if there are no tokens left, exit the loop.
321             last unless @listOfTokens > 0;
322              
323             # get the token with the greatest (weight, length, -order).
324             my $maxIndex = 0;
325             my $maxToken = $listOfTokens[$maxIndex];
326             my $maxTokenWeight = $tokenWeight{$maxToken};
327             for (my $i = 1 ; $i < scalar(@listOfTokens) ; $i++)
328             {
329             my $cmp;
330             if ($maxTokenWeight < $tokenWeight{ $listOfTokens[$i] })
331             {
332              
333             # $maxTokenWeight is smaller.
334             $cmp = -1;
335             }
336             elsif ($maxTokenWeight > $tokenWeight{ $listOfTokens[$i] })
337             {
338              
339             # $maxTokenWeight is larger.
340             $cmp = 1;
341             }
342             else
343             {
344              
345             # weights are equal, compare token lengths, choose the longer one.
346             $cmp = length($maxToken) <=> length($listOfTokens[$i]);
347              
348             # if tokens have equal length, choose the one lexically smaller.
349             if ($cmp == 0) { $cmp = $listOfTokens[$i] cmp $maxToken; }
350             }
351              
352             # if the current max is smaller, replace it.
353             if ($cmp == -1)
354             {
355             $maxIndex = $i;
356             $maxToken = $listOfTokens[$maxIndex];
357             $maxTokenWeight = $tokenWeight{$maxToken};
358             }
359             }
360              
361             # copy the last token to where the max was, it may be popped off if there are no
362             # sentences left containing it.
363             $listOfTokens[$maxIndex] = $listOfTokens[-1];
364             $listOfTokens[-1] = $maxToken;
365              
366             # if there are no sentences remaining with the token, move on to the next token.
367             unless (exists $tokenSentenceIndex{$maxToken})
368             {
369             pop @listOfTokens;
370             next;
371             }
372              
373             # get the list of sentences that have the token.
374             my $listOfSentencesWithToken = $tokenSentenceIndex{$maxToken};
375              
376             # if there are no sentences remaining with the token, move on to the next token.
377             unless (scalar(@$listOfSentencesWithToken) > 0)
378             {
379             pop @listOfTokens;
380             delete $tokenSentenceIndex{$maxToken};
381             next;
382             }
383              
384             # find the sentence having the token with the highest weight not yet selected.
385             my $maxSentenceIndex;
386             my $maxSentence;
387             my @remainingListOfSentencesWithToken;
388             foreach my $sentenceIndex (@$listOfSentencesWithToken)
389             {
390              
391             # get the pointer to the sentence.
392             my $sentence = $localListOfSentences[$sentenceIndex];
393              
394             # skip the sentence if already selected.
395             next if $sentence->{selected};
396              
397             # if no sentence has been selected, just take the first valid sentence.
398             unless (defined($maxSentence))
399             {
400             $maxSentence = $sentence;
401             $maxSentenceIndex = $sentenceIndex;
402             next;
403             }
404              
405             # choose the sentence with the greater weight, or the greater size, or the lesser id.
406             my $cmp =
407             ($sentence->{weight} <=> $maxSentence->{weight})
408             || ($sentence->{size} <=> $maxSentence->{size})
409             || ($sentence->{id} cmp $maxSentence->{id});
410              
411             # store the new maximum sentence.
412             if ($cmp == 1)
413             {
414              
415             # store the previous maximum as an unselected sentence.
416             push @remainingListOfSentencesWithToken, $maxSentenceIndex;
417             $maxSentence = $sentence;
418             $maxSentenceIndex = $sentenceIndex;
419             }
420             else
421             {
422              
423             # store the current sentence as unselected.
424             push @remainingListOfSentencesWithToken, $sentenceIndex;
425             }
426             }
427              
428             # update the list of sentences with the token that were not selected for the summary.
429             if (@remainingListOfSentencesWithToken == 0)
430             {
431             delete $tokenSentenceIndex{$maxToken};
432             }
433             else
434             {
435              
436             # update the list of sentences that the token is contained in.
437             $tokenSentenceIndex{$maxToken} = \@remainingListOfSentencesWithToken;
438             }
439              
440             # if no sentence selected, then there are no unselected sentences with the
441             # token, so move on to the next token.
442             unless (defined $maxSentence)
443             {
444             pop @listOfTokens;
445             delete $tokenSentenceIndex{$maxToken};
446             next;
447             }
448              
449             # store the sentence selected and its weight.
450             $maxSentence->{selected} = 1;
451             push @rankedListOfSentences, [ $maxSentence, $maxSentence->{weight} ];
452              
453             # update the weight of all the tokens in the max sentence.
454             my @sentenceTokens = keys %{ $maxSentence->{tokenCounts} };
455             foreach my $token (@sentenceTokens)
456             {
457              
458             # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
459             $tokenWeight{$token} =
460             &$tokenWeightUpdateFunction($tokenWeight{$token}, $initialTokenWeight{$token}, $token, $maxSentence->{id}, $maxSentence->{weight});
461             }
462              
463             # get all of the sentences that share tokens with the max sentence.
464             my %sentencesToUpdate;
465             foreach my $token (@sentenceTokens)
466             {
467             next unless exists $tokenSentenceIndex{$token};
468             foreach my $sentenceIndex (@{ $tokenSentenceIndex{$token} })
469             {
470             $sentencesToUpdate{$sentenceIndex} = 1;
471             }
472             }
473             my @listOfSentencesToUpdate = keys %sentencesToUpdate;
474              
475             # recompute the weight of the sentences that have tokens whose weight changed.
476             # floating point calculations will become unstable due to rounding errors if the
477             # old weights are subtracted and the new weights added. slower, but best to
478             # recompute the average weights by summing.
479             foreach my $sentenceIndex (@listOfSentencesToUpdate)
480             {
481              
482             # get the pointer to the sentence.
483             my $sentence = $localListOfSentences[$sentenceIndex];
484              
485             # skip the sentence if it was already selected.
486             next if $sentence->{selected};
487              
488             # compute the weight of the sentence.
489             my $weight = 0;
490             while (my ($token, $count) = each %{ $sentence->{tokenCounts} })
491             {
492             $weight += $count * $tokenWeight{$token};
493             }
494             $sentence->{weight} = $weight / $sentence->{size};
495             }
496             }
497              
498             # normalize the sentence weights so they sum to one.
499             my $totalSentenceWeight = 0;
500             foreach my $sentenceWeight (@rankedListOfSentences)
501             {
502             $totalSentenceWeight += $sentenceWeight->[1];
503             }
504             $totalSentenceWeight = 1 if ($totalSentenceWeight == 0);
505              
506             foreach my $sentenceWeight (@rankedListOfSentences)
507             {
508              
509             # normalize the sentence weight.
510             $sentenceWeight = [ $sentenceWeight->[0]->{id}, $sentenceWeight->[1] / $totalSentenceWeight ];
511             }
512              
513             # add the empty sentences to the list.
514             push @rankedListOfSentences, map { [ $_->[0], 0 ] } sort { ($a->[1] <=> $b->[1]) || ($a->[0] cmp $b->[0]) } @listOfEmptySentenceIds;
515            
516             # adjust the weights to be descending (a kludge).
517             if (@rankedListOfSentences)
518             {
519             my $totalSentenceWeight = 0;
520             my $runningSum = 0;
521             for (my $i = @rankedListOfSentences - 1; $i > -1; $i--)
522             {
523             $runningSum += $rankedListOfSentences[$i]->[1];
524             $rankedListOfSentences[$i]->[1] = $runningSum;
525             $totalSentenceWeight += $rankedListOfSentences[$i]->[1];
526             }
527             $totalSentenceWeight = 1 if ($totalSentenceWeight <= 0);
528             foreach my $idWeight (@rankedListOfSentences)
529             {
530             $idWeight->[1] = abs ($idWeight->[1]/ $totalSentenceWeight);
531             }
532             }
533              
534             return \@rankedListOfSentences;
535             }
536              
537             =head2 C<tokenWeightUpdateFunction_Squared>
538              
539             Returns the token's current weight squared.
540              
541             =cut
542              
543             sub tokenWeightUpdateFunction_Squared # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
544             {
545             return $_[0] * $_[0];
546             }
547              
548             =head2 C<tokenWeightUpdateFunction_Multiplicative>
549              
550             Returns the token's current weight times its initial weight.
551              
552             =cut
553              
554             sub tokenWeightUpdateFunction_Multiplicative # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
555             {
556             return $_[0] * $_[1];
557             }
558              
559             =head2 C<tokenWeightUpdateFunction_Sentence>
560              
561             Returns the token's current weight times the average weight of the tokens in the selected sentence.
562              
563             =cut
564              
565             sub tokenWeightUpdateFunction_Sentence # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
566             {
567             return $_[0] * $_[4];
568             }
569              
570             # computes the textrank of the tokens.
571             sub _getTextRankWeightOfTokens
572             {
573             my %Parameters = @_;
574              
575             # use any textrank parameters if defined.
576             my %textRankParameters;
577             %textRankParameters = %{ $Parameters{textRankParameters} } if ((exists $Parameters{textRankParameters}) && (defined $Parameters{textRankParameters}));
578              
579             # if no sentences, return now.
580             return {} unless exists $Parameters{listOfSentences};
581             my $listOfSentences = $Parameters{listOfSentences};
582              
583             # build the list of tokens.
584             my @listOfTokens = map { ($_->{listOfTokens}) } @$listOfSentences;
585              
586             # return the textrank of each token.
587             return getTextrankOfListOfTokens(%textRankParameters, listOfTokens => \@listOfTokens);
588             }
589              
590             # computes the frequency of the tokens.
591             sub _getFrequencyWeightOfTokens
592             {
593             my %Parameters = @_;
594              
595             # if no sentences, return now.
596             return {} unless exists $Parameters{listOfSentences};
597             my $listOfSentences = $Parameters{listOfSentences};
598              
599             # compute total occurrence and frequency of the tokens.
600             my $totalOccurrence = 0;
601             my %tokenFrequency;
602              
603             foreach my $sentence (@$listOfSentences)
604             {
605             foreach my $token (@{$sentence->{listOfTokens}})
606             {
607             ++$tokenFrequency{$token};
608             ++$totalOccurrence;
609             }
610             }
611             $totalOccurrence = 1 if $totalOccurrence < 1;
612              
613             while (my ($token, undef) = each %tokenFrequency)
614             {
615             $tokenFrequency{$token} /= $totalOccurrence;
616             }
617              
618             # return the frequency of each token.
619             return \%tokenFrequency;
620             }
621              
622             =head1 INSTALLATION
623              
624             Use L<CPAN> to install the module and all its prerequisites:
625              
626             perl -MCPAN -e shell
627             >install Text::Summarize
628              
629             =head1 BUGS
630              
631             Please email bug reports or feature requests to C<bug-text-summarize@rt.cpan.org>, or through
632             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Summarize>. The author
633             will be notified and you can be automatically notified of progress on the bug fix or feature request.
634              
635             =head1 AUTHOR
636              
637             Jeff Kubina
638              
639             =head1 COPYRIGHT
640              
641             Copyright (c) 2009 Jeff Kubina. All rights reserved.
642             This program is free software; you can redistribute
643             it and/or modify it under the same terms as Perl itself.
644              
645             The full text of the license can be found in the
646             LICENSE file included with this module.
647              
648             =head1 KEYWORDS
649              
650             information processing, summary, summaries, summarization, summarize, sumbasic, textrank
651              
652             =head1 SEE ALSO
653              
654             L, L, L
655              
656             =begin html
657              
658            

The SumBasic algorithm for ranking sentences is from

659             Beyond SumBasic: Task-Focused Summarization with Sentence Simplification and Lexical Expansion
660             by L. Vanderwendea, H. Suzukia, C. Brocketta, and A. Nenkovab.

661              
662             =end html
663              
664             =cut
665              
666             1;
667              
668             # The preceding line will help the module return a true value
669