File Coverage

blib/lib/Text/Categorize/Textrank.pm

Criterion	Covered	Total	%
statement	64	64	100.0
branch	16	26	61.5
condition	1	3	33.3
subroutine	9	9	100.0
pod	1	1	100.0
total	91	103	88.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Categorize::Textrank;
2
3	1			1		36283	use strict;
	1					3
	1					49
4	1			1		9	use warnings;
	1					2
	1					45
5	1			1		3865	use Graph;
	1					182182
	1					38
6	1			1		978	use Graph::Centrality::Pagerank;
	1					9212
	1					67
7	1			1		14	use Data::Dump qw(dump);
	1					2
	1					44
8
9							BEGIN {
10	1			1		4	use Exporter ();
	1					3
	1					20
11	1			1		5	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	1					2
	1					97
12	1			1		3	$VERSION = '0.51';
13	1					14	@ISA = qw(Exporter);
14	1					3	@EXPORT = qw(getTextrankOfListOfTokens);
15	1					2	@EXPORT_OK = qw(getTextrankOfListOfTokens);
16	1					431	%EXPORT_TAGS = ();
17							}
18
19							#12345678901234567890123456789012345678901234
20							#Method to rank potential keywords of text.
21
22							=head1 NAME
23
24							C - Method to rank potential keywords of text.
25
26							=head1 SYNOPSIS
27
28							use strict;
29							use warnings;
30							use Text::Categorize::Textrank;
31							use Data::Dump qw(dump);
32							my $listOfTokens = [ [qw(This is the first sentence)], [qw(Here is the second sentence)] ];
33							my $hashOfTextrankValues = getTextrankOfListOfTokens(listOfTokens => $listOfTokens);
34							dump $hashOfTextrankValues;
35
36							=head1 DESCRIPTION
37
38							C provides a routine for ranking the words in
39							text as potential keywords. It implements a version of the textrank algorithm
40							from the report I by R. Mihalcea and P. Tarau.
41
42							=head1 ROUTINES
43
44							=head2 C
45
46							The routine C returns a hash reference containing the textrank value
47							for all the tokens in the lists provided; the textrank values sum to one. The textrank of a token
48							is its pagerank in the graph obtained by joining neighboring tokens with an edge, called the
49							token graph.
50
51							Usually, C should I be applied to all the words of the text. The
52							complete textrank algorithm first filters the words in the text to only the nouns and adjectives.
53							See L to compute the textrank of English text.
54
55							=over
56
57							=item C
58
59							listOfTokens => [[...], [...], ...[...]]
60
61							C is an array reference containing the list of tokens that are
62							to be ranked using textrank. Each list is also an array reference of tokens that should
63							correspond to the list of tokens in a sentence. For example,
64							C<[[qw(This is the first sentence)], [qw(Here is the second sentence)]]>.
65
66							=item C
67
68							edgeCreationSpan => 1
69
70							For each token in the C, C is the number of successive
71							tokens used to make an edge in the token graph. For example, if
72							C is two, then given the token sequence C<"apple orange pear">
73							the edges C<[apple, orange]> and C<[apple, pear]> will be added to the token
74							graph for the token C. The default is one.
75
76							Note that loop edges are ignored. For example,
77							if C is two, then given the token sequence C<"daba daba doo">
78							the edge C<[daba, daba]> is disguarded but the edge C<[daba, doo]> is
79							added to the token graph.
80
81							=item C
82
83							directedGraph => 0
84
85							If C is true, the textranks
86							are computed from the directed token graph, if false, they
87							are computed from the undirected version of the graph. The default is false.
88
89							=item C
90
91							dampeningFactor => 0.85
92
93							When computing the textranks of the token graph, the dampening factor
94							specified by C will
95							be used; it should range from zero to one. The default is 0.85.
96
97							=begin html
98
99							The Wikipedia article on pagerank has a good explaination of the
100							dampening factor.
101
102							=end html
103
104							=item C
105
106							addEdgesSpanningLists => 1
107
108							If C is true, then when building the token graph, links
109							between the tokens at the end of a list and the beginning of the next list
110							will be made. For example, for the lists C<[[qw(This is the first list)], [qw(Here is the second list)]]>
111							the edge C<[list, Here]> will be added to the token graph. The default is true.
112
113							=item C
114
115							tokenWeights => {}
116
117							C is an optional hash reference that can provide a weight for a subset
118							of the tokens provided by C. If C is not defined for any token
119							in C, then each
120							token has a weight of one. If C is defined for
121							at least one node in the graph, then the default weight of any undefined
122							token is zero.
123
124							=back
125
126							=cut
127
128							sub getTextrankOfListOfTokens
129							{
130	31			31	1	706944	my %Parameters = @_;
131
132							# if there are no lists, return undef.
133	31	50				198	return undef unless exists $Parameters{listOfTokens};
134	31					81	my $listOfTokens = $Parameters{listOfTokens};
135
136							# edgeCreationSpan holds the number of sucessive tokens linked by an edge for each token.
137							# for example, given tokens "apple orange pear", the edges [apple, orange] and [apple, pear]
138							# will be created for token apple if $edgeCreationSpan is two.
139	31					60	my $edgeCreationSpan = 1;
140	31	50				124	$edgeCreationSpan = abs $Parameters{edgeCreationSpan} if (exists ($Parameters{edgeCreationSpan}));
141	31	50				119	$edgeCreationSpan = 1 if ($edgeCreationSpan < 1);
142
143							# set the type of graph built from the tokens to compute the textrank.
144	31					68	$Parameters{undirected} = 1;
145	31	50				128	$Parameters{undirected} = !$Parameters{directedGraph} if exists ($Parameters{directedGraph});
146
147							# get the pagerank dampening factor.
148	31	50				122	$Parameters{dampeningFactor} = 0.85 unless exists ($Parameters{dampeningFactor});
149	31					71	$Parameters{dampeningFactor} = abs $Parameters{dampeningFactor};
150	31	50				123	$Parameters{dampeningFactor} = 1 if ($Parameters{dampeningFactor} > 1);
151
152							# set the addEdgesSpanningLists flag.
153	31					45	my $addEdgesSpanningLists = 1;
154	31	50				81	$addEdgesSpanningLists = $Parameters{addEdgesSpanningLists} if exists $Parameters{addEdgesSpanningLists};
155
156							# build the graph of the links between tokens.
157	31					245	my $graph = Graph->new(directed => !$Parameters{undirected});
158
159							# if edges will span lists, make listOfTokens into just one list.
160	31	50				9083	if ($addEdgesSpanningLists)
161							{
162	31					53	my @allWords;
163	31					86	foreach my $list (@$listOfTokens)
164							{
165	362					1370	push @allWords, @$list;
166							}
167	31					126	$listOfTokens = [\@allWords];
168							}
169
170							# add all the edges to the graph.
171	31					78	foreach my $list (@$listOfTokens)
172							{
173							# get the total tokens in the list.
174	31					86	my $totalTokens = $#$list + 1;
175	31					123	for (my $i = 0; $i < $totalTokens; $i++)
176							{
177							# since isolalted vertices are possible, add the vertex.
178							# for example, if a list had only one token in it, it would
179							# not be linked to any other token.
180	5097					960205	$graph->add_vertex ($list->[0]);
181
182							# get the index of the last token to link to the current token in
183							# the list.
184	5097					140213	my $lastToken = $i + $edgeCreationSpan + 1;
185	5097	100				10206	$lastToken = $totalTokens if ($lastToken > $totalTokens);
186	5097					11317	for (my $j = $i + 1; $j < $lastToken; $j++)
187							{
188							# skip loop edges.
189	5066	100				15693	next if ($list->[$i] eq $list->[$j]);
190
191							# create the edge.
192	4931					12120	my @edge = ($list->[$i], $list->[$j]);
193
194							# if the edge exists already, add to its weight.
195	4931	100				13537	if ($graph->has_edge (@edge))
196							{
197	3306					83316	my $weight = $graph->get_edge_weight (@edge);
198	3306	50				355707	$weight = 1 unless defined $weight;
199	3306					3762	++$weight;
200	3306					9211	$graph->add_weighted_edge (@edge, $weight);
201							}
202							else
203							{
204							# edge does not exist, default edge weight is one.
205	1625					31069	$graph->add_weighted_edge (@edge, 1);
206							}
207							}
208							}
209							}
210
211							# get the pagerank computer.
212	31					234	my $pageRankEngine = Graph::Centrality::Pagerank->new ();
213
214							# set the parameter for inclusion of edge weights.
215	31					6694	$Parameters{useEdgeWeights} = 1;
216
217							# set the node weights if provided.
218	31	50	33			122	$Parameters{nodeWeights} = $Parameters{tokenWeights} if ((exists $Parameters{tokenWeights}) && (defined $Parameters{tokenWeights}));
219
220							# compute and return the pagerank of the tokens.
221	31					210	return $pageRankEngine->getPagerankOfNodes (%Parameters, graph => $graph);
222							}
223
224							=head1 INSTALLATION
225
226							To install the module run the following commands:
227
228							perl Makefile.PL
229							make
230							make test
231							make install
232
233							If you are on a windows box you should use 'nmake' rather than 'make'.
234
235							=head1 BUGS
236
237							Please email bugs reports or feature requests to C, or through
238							the web interface at L. The author
239							will be notified and you can be automatically notified of progress on the bug fix or feature request.
240
241							=head1 AUTHOR
242
243							Jeff Kubina
244
245							=head1 COPYRIGHT
246
247							Copyright (c) 2009 Jeff Kubina. All rights reserved.
248							This program is free software; you can redistribute
249							it and/or modify it under the same terms as Perl itself.
250
251							The full text of the license can be found in the
252							LICENSE file included with this module.
253
254							=head1 KEYWORDS
255
256							categorize, keywords, keyphrases, nlp, pagerank, textrank
257
258							=head1 SEE ALSO
259
260							=begin html
261
262							This package implements the Textrank algorithm from the report
263							TextRank: Bringing Order into Texts
264							by Rada Mihalcea and Paul Tarau;
265							which is related to pagerank.
266
267							=end html
268
269							L, L, L, L
270
271							=cut
272
273							1;
274							# The preceding line will help the module return a true value