File Coverage

blib/lib/Lingua/ZH/MMSEG.pm

Criterion	Covered	Total	%
statement	108	113	95.5
branch	30	42	71.4
condition	13	21	61.9
subroutine	10	11	90.9
pod	3	3	100.0
total	164	190	86.3

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::ZH::MMSEG;
2	3			3		121361	use strict;
	3					9
	3					202
3	3			3		22	use warnings;
	3					7
	3					107
4	3			3		1393	use utf8;
	3					18
	3					23
5	3			3		9405	use Encode qw (is_utf8);
	3					43377
	3					324
6	3			3		13646	use encoding 'utf8';
	3					11596
	3					22
7	3			3		1897	use List::Util qw(sum);
	3					43
	3					378
8
9							our $VERSION=0.4005;
10
11							require Exporter;
12							our @ISA = qw(Exporter);
13							our @EXPORT = qw(mmseg fmm word_freq);
14
15							=encoding utf8
16
17							=head1 NAME
18
19							Lingua::ZH::MMSEG Mandarin Chinese segmentation
20
21							=head1 SYNOPSIS
22
23							#!/usr/bin/perl
24							use utf8;
25							use Lingua::ZH::MMSEG;
26
27							my $zh_string="現代漢語的複合動詞可分三個結構語意關係來探討";
28
29							my @phrases = mmseg($zh_string);
30							# use MMSEG algorithm
31
32							my @phrases = fmm($zh_string);
33							# use Forward Maximum Matching algorithm
34
35							while (<>) {
36							chomp;
37							push @phrases, mmseg;
38							} # mmseg and fmm will parse $_ automaticly
39
40							print $_, word_freq($_) for @phrases;
41							# you can get phrase frequency by calling word_freq
42
43							=head1 DESCRIPTION
44
45							A problem in computational analysis of Chinese text is that there are no word
46							boundaries in conventionally printed text. Since the word is such a fundamental
47							linguistic unit, it is necessary to identify words in Chinese text so that
48							higher-level analyses can be performed.
49
50							Lingua::ZH::MMSEG implements L
51							original developed by L. The whole module is
52							rewritten in pure Perl, and the phrase library is
53							L<新酷音 forked from OpenFoundry\|http://www.openfoundry.org/of/projects/436>.
54
55							=head1 INSTALL
56
57							To install this module, just type
58
59							cpanm Lingua::ZH::MMSEG
60
61							If you don't have cpanm,
62
63							curl -LO http://bit.ly/cpanm
64							chmod +x cpanm
65							sudo cp cpanm /usr/local/bin
66
67							=head1 FUNCTIONS
68
69							=head2 mmseg
70
71							@phrases = mmseg($zh_string);
72							@phrases = mmseg;
73							# use $_ automatically
74
75							C convert a mandarin Chinese string to a sequence of phrases using
76							L algorithm. If there were any
77							english containted in the input string, it simply parse the linked ascii code as
78							one phrase. For example:
79
80							$_ = "這裡有中文Today is Wednesday.這邊又有中文 I go to school on Friday.";
81							print "$_\n" for mmseg;
82
83							這裡有
84							中文
85							Today is Wednesday.
86							這邊
87							又有
88							中文
89							I go to school on Friday.
90
91							The ascii characters are recognized by C.
92
93							=head2 fmm (Forward Maximum Matching)
94
95							@phrases = fmm($zh_string);
96							@phrases = fmm;
97							# use $_ automatically
98
99							C uses forward maximum matching (so called longest match principle) to
100							convert a mandarin Chinese string to a sequence of phrases. It uses the same
101							rule of C to deal with ascii string. The advantage of C is it has
102							lower complexity compare to C; the disadvantage is it cannot solve
103							ambiguity when there is multiple way to seperate a string.
104
105							=head2 word_freq
106
107							$freq = word_freq($phrase);
108							$freq = word_freq;
109							# use $_ automatically
110
111							C return the phrase frequency defined in L<新酷音\|http://www.openfoundry.org/of/projects/436>.
112
113							=head1 AUTHOR
114
115							Felix Ren-Chyan Chern (dryman) C<< >>
116
117							=head1 LICENSE AND COPYRIGHT
118
119							L
120
121							=cut
122
123							our %dict;
124
125							while (){
126							chomp;
127							my ($phrase,$freq) = split;
128							$dict{$phrase}=$freq;
129							}
130
131							sub word_freq {
132	0	0		0	1	0	my $string = $_[0] ? $_[0] : $_;
133	0					0	$dict{$string};
134							}
135
136							sub mmseg {
137	1	50		1	1	12	my $string = $_[0] ? $_[0] : $_;
138	1					2	my @phrases;
139	1	50				8	die unless is_utf8($string);
140	1					4	chomp ($string);
141	1					8	for my $str (split (/([ -~]+)/, $string)) {
142	1	50				8	if ($str =~ /^[ -~]/) {
143	0					0	push @phrases, $str;
144	0					0	next;
145							}
146	1					3	while($str){
147	7					20	my $word1 = &_mmseg($str);
148	7					12	push @phrases, $word1;
149	7					33	$str = substr $str, length $word1;
150							}
151							}
152	1					9	return @phrases;
153							}
154
155							sub fmm {
156	1	50		1	1	14	my $string = $_[0] ? $_[0] : $_;
157	1					3	my @phrases;
158	1	50				9	die unless is_utf8($string);
159	1					3	chomp ($string);
160	1					12	for my $str (split (/([ -~])+/, $string)) {
161	1	50				23	if ($str =~ /^[ -~]/) {
162	0					0	push @phrases, $str,
163							next;
164							}
165	1					6	while($str){
166	7					23	for (reverse (1..(length $str))) {
167	48					85	my $match = substr $str, 0, $_;
168	48	100	66			243	if (defined $dict{$match} or $_==1){
169	7					12	push @phrases, $match;
170	7					16	$str = substr $str, $_;
171	7					23	last;
172							}
173							}
174							}
175							}
176	1					10	return @phrases;
177							}
178
179
180							sub _mmseg {
181	7			7		12	my $str = shift;
182	7					12	my @chunk = &_findChunk($str);
183
184	7	50				21	return $chunk[0]->{w1} if $#chunk == 0;
185
186							# rule 1, find max length chunks
187	7					22	my @mlc_tmp = sort {$b->{len} <=> $a->{len}} @chunk;
	38					64
188	7					13	my @max_len_chunk = grep {$_->{len} == $mlc_tmp[0]->{len}} @mlc_tmp;
	31					61
189	7	100				38	return $max_len_chunk[0]->{w1} if $#max_len_chunk == 0;
190
191							# rule 2, find max avg length chunks
192	3					7	my @malc_tmp = sort {$b->{avglen} <=> $a->{avglen}} @max_len_chunk;
	4					13
193	3					5	my @max_avglen_chunk = grep {$_->{avglen} == $malc_tmp[0]->{avglen}} @malc_tmp;
	7					19
194	3	100				19	return $max_avglen_chunk[0]->{w1} if $#max_avglen_chunk == 0;
195
196							# rule 3, smallest varience
197	1					3	for (@max_len_chunk) {
198	2					11	my $avg = $_->{avglen};
199	2					7	my @word = ($_->{w1},$_->{w2},$_->{w3});
200	2	50				4	pop @word unless $word[$#word];
201	2					4	my @len = map {length $_} @word;
	6					10
202	2					2	my $varience = sqrt ((sum (map {$_2-$avg2} @len))/(scalar @len));
	6					97
203	2					7	$_->{varience} = $varience;
204							}
205	1					3	my @mvc_tmp = sort {$a->{varience} <=> $b->{varience}} @max_len_chunk;
	1					3
206	2					7	my @min_varience_chunk =
207	1					2	grep {abs($_->{varience} - $mvc_tmp[0]->{varience})<0.01} @mvc_tmp;
208	1	50				4	return $min_varience_chunk[0]->{w1} if $#min_varience_chunk == 0;
209							# rule 4, check length one word and choose max freq of it
210	1					2	for (@min_varience_chunk) {
211	2					4	my $freq = 0;
212	2	100	66			15	$freq += $dict{$_->{w1}} if length $_->{w1} == 1 and defined $dict{$_->{w1}};
213	2	100	66			12	$freq += $dict{$_->{w2}} if length $_->{w2} == 1 and defined $dict{$_->{w2}};
214	2	50	33			13	$freq += $dict{$_->{w3}} if length $_->{w3} == 1 and defined $dict{$_->{w3}};
215	2					4	$_->{freq} = $freq;
216							}
217	1					3	my @last = sort {$b->{freq} <=> $a->{freq}} @min_varience_chunk;
	1					4
218	1					8	return $last[0]->{w1};
219							}
220
221
222
223
224							sub _findChunk{
225	7			7		10	my $str = shift;
226	7					7	my $index = 0;
227	7					9	my @chunk;
228
229							my @word1;
230							# will fail if $str=""
231	7					18	for (1..(length($str) - $index)){
232	54					89	my $substr = substr $str, $index, $_;
233	54	100	66			241	push @word1, $substr if defined $dict{$substr} or $_==1;
234							}
235	7					14	foreach my $w1 (@word1){
236	13					26	my $l1 = length $w1;
237	13					16	my $index = $index + $l1;
238	13					12	my @word2;
239
240	13	100				27	if (length($str) - $index == 0){
241	1					5	push @chunk, {
242							w1 => $w1,
243							w2 => undef,
244							w3 => undef,
245							len => $l1,
246							avglen => $l1,
247							};
248	1					4	next;
249							}
250
251	12					23	for (1..(length($str) - $index)){
252	78					131	my $substr = substr $str, $index, $_;
253	78	100	66			318	push @word2, $substr if defined $dict{$substr} or $_==1;
254							}
255
256	12					22	foreach my $w2 (@word2){
257	19					28	my $l2 = length $w2;
258	19					24	my $index = $index + $l2;
259
260	19	100				43	if (length($str) - $index == 0){
261	2					10	push @chunk, {
262							w1 => $w1,
263							w2 => $w2,
264							w3 => undef,
265							len => $l1+$l2,
266							avglen => ($l1+$l2)/2,
267							};
268	2					10	next;
269							}
270
271	17					27	for (1..(length($str) - $index)){
272	101					156	my $substr = substr $str, $index, $_;
273	101	100	66			485	if (defined $dict{$substr} or $_==1){
274	28					31	my $w3 = $substr;
275	28					37	my $l3 = length $w3;
276	28					169	push @chunk, {
277							w1 => $w1,
278							w2 => $w2,
279							w3 => $w3,
280							len => $l1+$l2+$l3,
281							avglen => ($l1+$l2+$l3)/3,
282							};
283							}
284							}
285							}
286							}
287	7					29	return @chunk;
288							}
289
290							1;
291
292							__DATA__