File Coverage

blib/lib/Text/Categorize/Textrank.pm
Criterion Covered Total %
statement 64 64 100.0
branch 16 26 61.5
condition 1 3 33.3
subroutine 9 9 100.0
pod 1 1 100.0
total 91 103 88.3


line stmt bran cond sub pod time code
1             package Text::Categorize::Textrank;
2              
3 1     1   36283 use strict;
  1         3  
  1         49  
4 1     1   9 use warnings;
  1         2  
  1         45  
5 1     1   3865 use Graph;
  1         182182  
  1         38  
6 1     1   978 use Graph::Centrality::Pagerank;
  1         9212  
  1         67  
7 1     1   14 use Data::Dump qw(dump);
  1         2  
  1         44  
8              
9             BEGIN {
10 1     1   4 use Exporter ();
  1         3  
  1         20  
11 1     1   5 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         2  
  1         97  
12 1     1   3 $VERSION = '0.51';
13 1         14 @ISA = qw(Exporter);
14 1         3 @EXPORT = qw(getTextrankOfListOfTokens);
15 1         2 @EXPORT_OK = qw(getTextrankOfListOfTokens);
16 1         431 %EXPORT_TAGS = ();
17             }
18              
19             #12345678901234567890123456789012345678901234
20             #Method to rank potential keywords of text.
21              
22             =head1 NAME
23              
24             C - Method to rank potential keywords of text.
25              
26             =head1 SYNOPSIS
27              
28             use strict;
29             use warnings;
30             use Text::Categorize::Textrank;
31             use Data::Dump qw(dump);
32             my $listOfTokens = [ [qw(This is the first sentence)], [qw(Here is the second sentence)] ];
33             my $hashOfTextrankValues = getTextrankOfListOfTokens(listOfTokens => $listOfTokens);
34             dump $hashOfTextrankValues;
35              
36             =head1 DESCRIPTION
37              
38             C provides a routine for ranking the words in
39             text as potential keywords. It implements a version of the textrank algorithm
40             from the report I by R. Mihalcea and P. Tarau.
41              
42             =head1 ROUTINES
43              
44             =head2 C
45              
46             The routine C returns a hash reference containing the textrank value
47             for all the tokens in the lists provided; the textrank values sum to one. The textrank of a token
48             is its pagerank in the graph obtained by joining neighboring tokens with an edge, called the
49             token graph.
50              
51             Usually, C should I be applied to all the words of the text. The
52             complete textrank algorithm first filters the words in the text to only the nouns and adjectives.
53             See L to compute the textrank of English text.
54              
55             =over
56              
57             =item C
58              
59             listOfTokens => [[...], [...], ...[...]]
60              
61             C is an array reference containing the list of tokens that are
62             to be ranked using textrank. Each list is also an array reference of tokens that should
63             correspond to the list of tokens in a sentence. For example,
64             C<[[qw(This is the first sentence)], [qw(Here is the second sentence)]]>.
65              
66             =item C
67              
68             edgeCreationSpan => 1
69              
70             For each token in the C, C is the number of successive
71             tokens used to make an edge in the token graph. For example, if
72             C is two, then given the token sequence C<"apple orange pear">
73             the edges C<[apple, orange]> and C<[apple, pear]> will be added to the token
74             graph for the token C. The default is one.
75              
76             Note that loop edges are ignored. For example,
77             if C is two, then given the token sequence C<"daba daba doo">
78             the edge C<[daba, daba]> is disguarded but the edge C<[daba, doo]> is
79             added to the token graph.
80              
81             =item C
82              
83             directedGraph => 0
84              
85             If C is true, the textranks
86             are computed from the directed token graph, if false, they
87             are computed from the undirected version of the graph. The default is false.
88              
89             =item C
90              
91             dampeningFactor => 0.85
92              
93             When computing the textranks of the token graph, the dampening factor
94             specified by C will
95             be used; it should range from zero to one. The default is 0.85.
96              
97             =begin html
98              
99             The Wikipedia article on pagerank has a good explaination of the
100             dampening factor.
 
101              
102             =end html
103              
104             =item C
105              
106             addEdgesSpanningLists => 1
107              
108             If C is true, then when building the token graph, links
109             between the tokens at the end of a list and the beginning of the next list
110             will be made. For example, for the lists C<[[qw(This is the first list)], [qw(Here is the second list)]]>
111             the edge C<[list, Here]> will be added to the token graph. The default is true.
112              
113             =item C
114              
115             tokenWeights => {}
116              
117             C is an optional hash reference that can provide a weight for a subset
118             of the tokens provided by C. If C is not defined for any token
119             in C, then each
120             token has a weight of one. If C is defined for
121             at least one node in the graph, then the default weight of any undefined
122             token is zero.
123              
124             =back
125              
126             =cut
127              
128             sub getTextrankOfListOfTokens
129             {
130 31     31 1 706944 my %Parameters = @_;
131              
132             # if there are no lists, return undef.
133 31 50       198 return undef unless exists $Parameters{listOfTokens};
134 31         81 my $listOfTokens = $Parameters{listOfTokens};
135              
136             # edgeCreationSpan holds the number of sucessive tokens linked by an edge for each token.
137             # for example, given tokens "apple orange pear", the edges [apple, orange] and [apple, pear]
138             # will be created for token apple if $edgeCreationSpan is two.
139 31         60 my $edgeCreationSpan = 1;
140 31 50       124 $edgeCreationSpan = abs $Parameters{edgeCreationSpan} if (exists ($Parameters{edgeCreationSpan}));
141 31 50       119 $edgeCreationSpan = 1 if ($edgeCreationSpan < 1);
142              
143             # set the type of graph built from the tokens to compute the textrank.
144 31         68 $Parameters{undirected} = 1;
145 31 50       128 $Parameters{undirected} = !$Parameters{directedGraph} if exists ($Parameters{directedGraph});
146              
147             # get the pagerank dampening factor.
148 31 50       122 $Parameters{dampeningFactor} = 0.85 unless exists ($Parameters{dampeningFactor});
149 31         71 $Parameters{dampeningFactor} = abs $Parameters{dampeningFactor};
150 31 50       123 $Parameters{dampeningFactor} = 1 if ($Parameters{dampeningFactor} > 1);
151              
152             # set the addEdgesSpanningLists flag.
153 31         45 my $addEdgesSpanningLists = 1;
154 31 50       81 $addEdgesSpanningLists = $Parameters{addEdgesSpanningLists} if exists $Parameters{addEdgesSpanningLists};
155              
156             # build the graph of the links between tokens.
157 31         245 my $graph = Graph->new(directed => !$Parameters{undirected});
158              
159             # if edges will span lists, make listOfTokens into just one list.
160 31 50       9083 if ($addEdgesSpanningLists)
161             {
162 31         53 my @allWords;
163 31         86 foreach my $list (@$listOfTokens)
164             {
165 362         1370 push @allWords, @$list;
166             }
167 31         126 $listOfTokens = [\@allWords];
168             }
169              
170             # add all the edges to the graph.
171 31         78 foreach my $list (@$listOfTokens)
172             {
173             # get the total tokens in the list.
174 31         86 my $totalTokens = $#$list + 1;
175 31         123 for (my $i = 0; $i < $totalTokens; $i++)
176             {
177             # since isolalted vertices are possible, add the vertex.
178             # for example, if a list had only one token in it, it would
179             # not be linked to any other token.
180 5097         960205 $graph->add_vertex ($list->[0]);
181              
182             # get the index of the last token to link to the current token in
183             # the list.
184 5097         140213 my $lastToken = $i + $edgeCreationSpan + 1;
185 5097 100       10206 $lastToken = $totalTokens if ($lastToken > $totalTokens);
186 5097         11317 for (my $j = $i + 1; $j < $lastToken; $j++)
187             {
188             # skip loop edges.
189 5066 100       15693 next if ($list->[$i] eq $list->[$j]);
190              
191             # create the edge.
192 4931         12120 my @edge = ($list->[$i], $list->[$j]);
193              
194             # if the edge exists already, add to its weight.
195 4931 100       13537 if ($graph->has_edge (@edge))
196             {
197 3306         83316 my $weight = $graph->get_edge_weight (@edge);
198 3306 50       355707 $weight = 1 unless defined $weight;
199 3306         3762 ++$weight;
200 3306         9211 $graph->add_weighted_edge (@edge, $weight);
201             }
202             else
203             {
204             # edge does not exist, default edge weight is one.
205 1625         31069 $graph->add_weighted_edge (@edge, 1);
206             }
207             }
208             }
209             }
210              
211             # get the pagerank computer.
212 31         234 my $pageRankEngine = Graph::Centrality::Pagerank->new ();
213              
214             # set the parameter for inclusion of edge weights.
215 31         6694 $Parameters{useEdgeWeights} = 1;
216              
217             # set the node weights if provided.
218 31 50 33     122 $Parameters{nodeWeights} = $Parameters{tokenWeights} if ((exists $Parameters{tokenWeights}) && (defined $Parameters{tokenWeights}));
219              
220             # compute and return the pagerank of the tokens.
221 31         210 return $pageRankEngine->getPagerankOfNodes (%Parameters, graph => $graph);
222             }
223              
224             =head1 INSTALLATION
225              
226             To install the module run the following commands:
227              
228             perl Makefile.PL
229             make
230             make test
231             make install
232              
233             If you are on a windows box you should use 'nmake' rather than 'make'.
234              
235             =head1 BUGS
236              
237             Please email bugs reports or feature requests to C, or through
238             the web interface at L. The author
239             will be notified and you can be automatically notified of progress on the bug fix or feature request.
240              
241             =head1 AUTHOR
242              
243             Jeff Kubina
244              
245             =head1 COPYRIGHT
246              
247             Copyright (c) 2009 Jeff Kubina. All rights reserved.
248             This program is free software; you can redistribute
249             it and/or modify it under the same terms as Perl itself.
250              
251             The full text of the license can be found in the
252             LICENSE file included with this module.
253              
254             =head1 KEYWORDS
255              
256             categorize, keywords, keyphrases, nlp, pagerank, textrank
257              
258             =head1 SEE ALSO
259              
260             =begin html
261              
262             This package implements the Textrank algorithm from the report
263             TextRank: Bringing Order into Texts
264             by Rada Mihalcea and Paul Tarau;
265             which is related to pagerank.
266              
267             =end html
268              
269             L, L, L, L
270              
271             =cut
272              
273             1;
274             # The preceding line will help the module return a true value