File Coverage

blib/lib/Lingua/ZH/Keywords.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				# $File: //member/autrijus/Lingua-ZH-Keywords/Keywords.pm $ $Author: autrijus $
2				# $Revision: #9 $ $Change: 3723 $ $DateTime: 2003/01/20 22:15:45 $
3
4				package Lingua::ZH::Keywords;
5				$Lingua::ZH::Keywords::VERSION = '0.04';
6
7	2	2	1529	use strict;
	2		4
	2		95
8	2	2	13	use vars qw($VERSION @ISA @EXPORT @StopWords);
	2		5
	2		173
9
10	2	2	12	use Exporter;
	2		6
	2		90
11	2	2	4700	use Lingua::ZH::TaBE ();
	0
	0
12
13				=head1 NAME
14
15				Lingua::ZH::Keywords - Extract keywords from Chinese text
16
17				=head1 SYNOPSIS
18
19				# Exports keywords() by default
20				use Lingua::ZH::Keywords;
21
22				print join(",", keywords($text)); # Prints five keywords
23				print join(",", keywords($text, 10)); # Prints ten keywords
24
25				=head1 DESCRIPTION
26
27				This is a very simple algorithm which removes stopwords from the
28				text, and then counts up what it considers to be the most important
29				B<keywords>. The C<keywords> subroutine returns a list of keywords
30				in order of relevance.
31
32				The stopwords list is accessible as C<@Lingua::ZH::Keywords::StopWords>.
33
34				If the input C<$text> is a Unicode string, the returned keywords
35				will also be Unicode strings; otherwise they are assumed to be
36				Big5-encoded bytestrings.
37
38				=cut
39
40				@ISA = qw(Exporter);
41				@EXPORT = qw(keywords);
42
43				@StopWords = qw(
44				提供相關我們可以如何因為目前如果其他我的大家沒有主要所以
45				以上這個所有有關就是他們因此但是以及是否由於對於任何什麼
46				這些現在無法成為可能不過包括必須關於這是這樣以下已經你的
47				雖然許多也是不是除了還是為了之後只要其中都是各種還有非常
48				而且這種其它不要我要他的只是各位只有的話不能這裡相當我是
49				全部很多可是或是其實那麼你們下列如此另外然後各項才能不會
50				甚至總會不得怎麼即可作為至於當然根據我想能夠之間為何不知
51				例如期間時候也有常見並且容易我有實際有人有些分別並不以後
52				使得經由重新如下在此這麼那些整個都有這次之前令人來的就會
53				上述位於那個而已使用假如於是還得是在無法何況曾經我們的
54				);
55
56				my $Tabe;
57
58				sub keywords {
59				$Tabe ||= Lingua::ZH::TaBE->new;
60
61				eval { require Encode::compat } if $] < 5.007;
62				my $is_utf8 = eval { require Encode; Encode::is_utf8($_[0]) };
63
64				my (%hist, %ref);
65				$hist{$_}++ for grep {
66				length > 2 and index($_, '一') == -1
67				} $Tabe->split(
68				$is_utf8 ? Encode::encode(big5 => $_[0]) : $_[0]
69				);
70				delete @hist{@StopWords};
71
72				my $count = $_[1] || 5;
73
74				# By occurrence, then freq, then lexical order
75				map {
76				$is_utf8 ? Encode::decode(big5 => $_) : $_
77				} grep length, (sort {
78				$hist{$b} <=> $hist{$a}
79				or
80				($ref{$b} ||= freq($b)) <=> ($ref{$a} ||= freq($a))
81				or
82				$b cmp $a
83				} keys %hist)[ 0 .. $count-1 ];
84				}
85
86				sub freq {
87				my $tsi = $Tabe->Tsi($_[0]);
88				$Tabe->TsiDB->Get($tsi);
89				return $tsi->refcount;
90				}
91
92				1;
93
94				__END__