File Coverage

blib/lib/Lingua/ZH/WordSegment.pm

Criterion	Covered	Total	%
statement	28	126	22.2
branch	8	36	22.2
condition	0	6	0.0
subroutine	5	8	62.5
pod	0	5	0.0
total	41	181	22.6

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::ZH::WordSegment;
2
3	1			1		50506	use 5.008007;
	1					4
	1					204
4	1			1		7	use strict;
	1					2
	1					39
5	1			1		6	use warnings;
	1					7
	1					2203
6
7							require Exporter;
8							#use AutoLoader qw(AUTOLOAD);
9
10							our @ISA = qw(Exporter);
11							our %EXPORT_TAGS = ( 'all' => [ qw(
12
13							) ] );
14							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
15							our @EXPORT = qw(seg seg_STDIO set_dic);
16							our $VERSION = '0.04';
17
18							my %wordFreqList=(); #Chinese word as key of the hash table, frequency is the value
19							my %longestWordListStart=(); #Chinese character as key in the hash table, length of the longest word starting with
20							# this character is the value in the table
21
22							#reading the dictionary
23							sub set_dic {
24	1			1	0	3	my ($dictFileName)=@_;
25	1					1	%wordFreqList=();
26	1					2	%longestWordListStart=();
27	1	50				5	if (defined $dictFileName) {
28	0					0	local *dictFile;
29	0	0				0	open dictFile, $dictFileName or die "Can't open file $dictFileName.";
30	0					0	load_dic(*dictFile);
31	0					0	close dictFile;
32							} else {
33	1					4	load_dic(\*DATA);
34							}
35							}
36
37							sub load_dic {
38	1			1	0	2	my ($dictFile)=@_;
39	1					7	while(<$dictFile>){
40	22747					69928	s/\x0A//;s/\x0D//;s/\x20/\t/g;
	22747					53270
	22747					31849
41	22747					31378	my $line=$_;
42	22747					91400	my @entries=split(/\t/,$line);
43	22747					47436	my ($thisChnWord,$thisFreq)=@entries;
44
45							#print "$thisChnWord,$thisFreq\n"; #for debug
46	22747					86539	$wordFreqList{$thisChnWord}=$thisFreq;
47
48	22747	100				78502	if($thisChnWord=~/[\x00-\x7F]/){
49							}else{
50	22692	50				95188	my $headChar=$1 if ($thisChnWord=~/^([\x80-\xFF].)/);
51	22692					44643	my $thisLen=length($thisChnWord)/2; #the length of the Chinese word in character
52	22692	100				75259	$longestWordListStart{$headChar}=0 if (! defined $longestWordListStart{$headChar});
53	22692	100				152366	$longestWordListStart{$headChar}=$thisLen if($longestWordListStart{$headChar}<$thisLen);
54							}
55							}
56							}
57
58							sub seg_STDIO {
59	0			0	0		while(){
60	0						s/\x0A//g;s/\x0D//g;
	0
61	0						print seg($_); print "\n";
	0
62							}
63							}
64
65							sub seg {
66	0			0	0		my ($thisSent)=@_;
67	0						my $finalResult="";
68	0						my $sentLen=length($thisSent);
69
70	0						my $partialChnString="";
71	0						my $index=0;
72	0						while($index<$sentLen){
73	0						my $thisChar=substr($thisSent, $index,1);
74
75	0	0					if($thisChar ge "\x80") { #this is half of a Chinese character
76	0						$thisChar=substr($thisSent,$index,2);
77	0						$index+=2;
78	0						$partialChnString=$partialChnString.$thisChar;
79							}else{
80	0						$index++;
81
82	0	0					if($partialChnString ne ""){
83	0						my $partialSegString=segmentZhStr($partialChnString);
84	0						$finalResult=$finalResult.$partialSegString;
85
86	0						$partialChnString="";
87	0						$partialSegString="";
88							}
89
90	0						$finalResult=$finalResult.$thisChar;
91							}
92							}
93
94							#in case of pure Chinese characters
95	0	0					if($partialChnString ne ""){
96	0						my $partialSegString=segmentZhStr($partialChnString);
97	0						$finalResult=$finalResult.$partialSegString;
98
99	0						$partialChnString="";
100	0						$partialSegString="";
101							}
102
103	0						$finalResult=~s/^\x20+//;
104	0						$finalResult=~s/\x20+\Z//;
105	0						$finalResult=~s/\x20+/\x20/g;
106
107	0						return $finalResult;
108							}
109
110							sub segmentZhStr{ #segmenting a string of Chinese characters, there should be no non-Chinese character in the string
111	0			0	0		my $inputString=$_[0];
112	0						my $result="";
113
114							#for debug
115							#print STDERR "Try to segment string $inputString\n";
116
117	0						my $lenOfString=length($inputString)/2;
118	0						my @arcTable=();
119
120							#----------------------------------------------------------
121							#step0, initialize the arcTable
122	0						for(my $i=0;$i<$lenOfString;$i++){
123	0						for(my $j=0;$j<$lenOfString;$j++){
124	0	0					if($i==$j){
125	0						$arcTable[$i][$j]=1;
126							}else{
127	0						$arcTable[$i][$j]=-1;
128							}
129							}
130							}
131
132
133							#-----------------------------------------------------------
134							#step1: search for all possible arcs in the input string
135							# and create an array for them
136
137	0						for(my $currentPos=0;$currentPos<$lenOfString;$currentPos++){ #currentPos is the index of Chinese character
138	0						my $currentChar=substr($inputString,$currentPos*2,2);
139
140							#from this position, try to find all possible words led by this character
141	0						my $possibleLen=$longestWordListStart{$currentChar};
142	0	0					$possibleLen=1 if (! defined $possibleLen);
143							#for debug
144							#print STDERR "\n$currentChar=$possibleLen\n";
145
146	0	0					if(($possibleLen+$currentPos)> ($lenOfString-1)){
147	0						$possibleLen=$lenOfString-$currentPos;
148							}
149
150	0						while($possibleLen>=2){ #all possible words with more than 2 characters
151	0						my $subString=substr($inputString,$currentPos2,$possibleLen2);
152
153							#for debug
154							#print STDERR "s=$subString\n";
155
156	0	0					if($wordFreqList{$subString}){
157							#for debug
158							#print STDERR "$subString found\n";
159
160	0						$arcTable[$currentPos][$currentPos+$possibleLen-1]=$wordFreqList{$subString};
161							}
162
163
164	0						$possibleLen--;
165							}
166
167							}
168
169							#for debug
170							#for($i=0;$i<$lenOfString;$i++){
171							# for($j=0;$j<$lenOfString;$j++){
172							# print " ",$arcTable[$i][$j];
173							# }
174							# print "\n";
175							#}
176
177
178							#--------------------------------------------------------------------------
179							#step2: from the arc table, try to find the best path as segmentation at
180							#each point use the longest possible arc
181							# Try from two directions for the search: left to right and right to left
182							# using the one with higher product of frequency of the arcs
183
184	0						my @leftRightSegLabel=();
185	0						my @rightLeftSegLabel=();
186
187							#initialize the segmentation label array
188	0						for(my $k=0;$k<$lenOfString;$k++){
189	0						$leftRightSegLabel[$k]=0;
190	0						$rightLeftSegLabel[$k]=0;
191							}
192
193							#from left to right
194							#-------------------------------
195	0						my $leftToRightFreq=0;
196	0						my $thisCharIndex=0;
197	0						my $charIndexEnd=$lenOfString-1;
198
199	0						while($thisCharIndex<$lenOfString){
200	0						my $endCharIndex=$charIndexEnd;
201	0						my $found=0;
202
203	0		0				while((!$found)&&($endCharIndex>=$thisCharIndex)){
204	0	0					if($arcTable[$thisCharIndex][$endCharIndex]!=-1){
205	0						$leftToRightFreq+=log($arcTable[$thisCharIndex][$endCharIndex]);
206	0						$found=1;
207							}
208							else{
209	0						$endCharIndex--;
210							}
211							}
212
213	0						$leftRightSegLabel[$endCharIndex]=1;
214	0						$thisCharIndex=$endCharIndex+1;
215							}
216
217							#for debug
218							#print STDERR @leftRightSegLabel,"\n $leftToRightFreq\n";
219
220							#from right to left
221							#---------------------------------
222	0						my $rightToLeftFreq=0;
223	0						$thisCharIndex=$lenOfString-1;
224
225	0						while($thisCharIndex>=0){
226	0						my $startCharIndex=0;
227	0						my $found=0;
228	0		0				while((!$found)&&($startCharIndex<=$thisCharIndex)){
229	0	0					if($arcTable[$startCharIndex][$thisCharIndex]!=-1){
230	0						$found=1;
231	0						$rightToLeftFreq+=log($arcTable[$startCharIndex][$thisCharIndex]);
232							}
233							else{
234	0						$startCharIndex++;
235							}
236							}
237
238	0						$rightLeftSegLabel[$startCharIndex]=1;
239	0						$thisCharIndex=$startCharIndex-1;
240							}
241
242							#for debug
243							#print STDERR @rightLeftSegLabel,"\n $rightToLeftFreq\n";
244
245
246							#---------------------------------------------------------------------------------
247							# Step3: create result
248	0	0					if($leftToRightFreq>$rightToLeftFreq){#using left to right solution, prefer right to left
249	0						for(my $p=0;$p<$lenOfString;$p++){
250	0						$result=$result.substr($inputString, $p*2, 2);
251
252	0	0					if($leftRightSegLabel[$p]==1){
253	0						$result=$result." ";
254							}
255							}
256							}
257							else{
258	0						for(my $p=0;$p<$lenOfString;$p++){
259	0	0					if($rightLeftSegLabel[$p]==1){
260	0						$result=$result." ";
261							}
262	0						$result=$result.substr($inputString, $p*2, 2);
263							}
264							}
265
266	0						$result=~s/^\x20+//;
267	0						$result=~s/\x20+\Z//;
268
269							#for debug
270							#print "result=$result\n";
271
272	0						return " $result ";
273							}
274
275							set_dic();
276							1;
277
278							=pod
279
280							=head1 NAME
281
282							Lingua::ZH::WordSegment - Simple Simplified Chinese Word Segmentation
283
284							=head1 SYNOPSIS
285
286							use Lingua::ZH::WordSegment;
287							print seg($str_in);
288							seg_STDIO();# Read from STDIN, and print the segmented result to STDOUT
289
290							set_dic($dictionary_file_name); #load word from the file, this is not a must
291							perl -MLingua::ZH::WordSegment -e 'seg_STDIO();' < input_file > output_file
292
293							=head1 DESCRIPTION
294
295							The default word list is extracted from People's Daily in Jan, 1998 owned by Institute of Computational Linguistics, Peking University, China
296
297							This code is mainly written by Joy, joy@cs.cmu.edu in July 4th, 2001.
298
299							This program is a perl version of left-right mandarin segmentor
300							As LDC segmenter takes a long time to build the DB files which makes the
301							the training process last too long time.
302
303							For ablation experiments, we do not need to create the DB files because the
304							specific frequency dictionary will be used only once for each slice.
305
306							The algorithm for this segmenter is to search the longest word at each point
307							from both left and right directions, and choose the one with higher frequency
308							product.
309
310							The above is Joy's original declarations.
311
312
313
314							=head1 METHODS
315
316							seg($str_in); # return the string of segmentation result.
317							seg_STDIO(); # Read from STDIN, and print the segmented result to STDOUT
318
319							set_dic($dictionary_file_name)
320							#The format of the dictionary file for each line is:
321							# "chineseWord\tFrequency\n"
322							#
323							#Notice that if you don't call set_dic,
324							#the default dictionary in GBK encoding will be loaded.
325							#The default dictionary is extracted from corpus of the People's Daily,
326							#January, 1998.
327							#Thanks to Institute of Computational Linguistics, Peking University,China.
328
329							=head1 SEE ALSO
330
331
332							=head1 AUTHORS
333
334							Rewrited by Chen Yirong E cyr.master@gmail.com E, September 21, 2006 and modified in Feb 20, 2007.
335							Original Author:
336							Joy, joy@cs.cmu.edu in July 4th, 2001
337
338							=head1 KUDOS
339
340							Many thanks to Joy who made the code available.
341							Thanks to the PKU Corpus (from Institute of Computational Linguistics, Peking University, China) to help to automatic generate the default dictionary.
342
343							=head1 COPYRIGHT
344
345							This program is free software; you can redistribute it and/or modify it
346							under the same terms as Perl itself.
347
348							See L
349
350							=cut
351
352							__DATA__