File Coverage

blib/lib/Lingua/ZH/WordSegment.pm
Criterion Covered Total %
statement 28 126 22.2
branch 8 36 22.2
condition 0 6 0.0
subroutine 5 8 62.5
pod 0 5 0.0
total 41 181 22.6


line stmt bran cond sub pod time code
1             package Lingua::ZH::WordSegment;
2            
3 1     1   50506 use 5.008007;
  1         4  
  1         204  
4 1     1   7 use strict;
  1         2  
  1         39  
5 1     1   6 use warnings;
  1         7  
  1         2203  
6            
7             require Exporter;
8             #use AutoLoader qw(AUTOLOAD);
9            
10             our @ISA = qw(Exporter);
11             our %EXPORT_TAGS = ( 'all' => [ qw(
12            
13             ) ] );
14             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
15             our @EXPORT = qw(seg seg_STDIO set_dic);
16             our $VERSION = '0.04';
17            
18             my %wordFreqList=(); #Chinese word as key of the hash table, frequency is the value
19             my %longestWordListStart=(); #Chinese character as key in the hash table, length of the longest word starting with
20             # this character is the value in the table
21            
22             #reading the dictionary
23             sub set_dic {
24 1     1 0 3 my ($dictFileName)=@_;
25 1         1 %wordFreqList=();
26 1         2 %longestWordListStart=();
27 1 50       5 if (defined $dictFileName) {
28 0         0 local *dictFile;
29 0 0       0 open dictFile, $dictFileName or die "Can't open file $dictFileName.";
30 0         0 load_dic(*dictFile);
31 0         0 close dictFile;
32             } else {
33 1         4 load_dic(\*DATA);
34             }
35             }
36            
37             sub load_dic {
38 1     1 0 2 my ($dictFile)=@_;
39 1         7 while(<$dictFile>){
40 22747         69928 s/\x0A//;s/\x0D//;s/\x20/\t/g;
  22747         53270  
  22747         31849  
41 22747         31378 my $line=$_;
42 22747         91400 my @entries=split(/\t/,$line);
43 22747         47436 my ($thisChnWord,$thisFreq)=@entries;
44            
45             #print "$thisChnWord,$thisFreq\n"; #for debug
46 22747         86539 $wordFreqList{$thisChnWord}=$thisFreq;
47            
48 22747 100       78502 if($thisChnWord=~/[\x00-\x7F]/){
49             }else{
50 22692 50       95188 my $headChar=$1 if ($thisChnWord=~/^([\x80-\xFF].)/);
51 22692         44643 my $thisLen=length($thisChnWord)/2; #the length of the Chinese word in character
52 22692 100       75259 $longestWordListStart{$headChar}=0 if (! defined $longestWordListStart{$headChar});
53 22692 100       152366 $longestWordListStart{$headChar}=$thisLen if($longestWordListStart{$headChar}<$thisLen);
54             }
55             }
56             }
57            
58             sub seg_STDIO {
59 0     0 0   while(){
60 0           s/\x0A//g;s/\x0D//g;
  0            
61 0           print seg($_); print "\n";
  0            
62             }
63             }
64            
65             sub seg {
66 0     0 0   my ($thisSent)=@_;
67 0           my $finalResult="";
68 0           my $sentLen=length($thisSent);
69            
70 0           my $partialChnString="";
71 0           my $index=0;
72 0           while($index<$sentLen){
73 0           my $thisChar=substr($thisSent, $index,1);
74            
75 0 0         if($thisChar ge "\x80") { #this is half of a Chinese character
76 0           $thisChar=substr($thisSent,$index,2);
77 0           $index+=2;
78 0           $partialChnString=$partialChnString.$thisChar;
79             }else{
80 0           $index++;
81            
82 0 0         if($partialChnString ne ""){
83 0           my $partialSegString=segmentZhStr($partialChnString);
84 0           $finalResult=$finalResult.$partialSegString;
85            
86 0           $partialChnString="";
87 0           $partialSegString="";
88             }
89            
90 0           $finalResult=$finalResult.$thisChar;
91             }
92             }
93            
94             #in case of pure Chinese characters
95 0 0         if($partialChnString ne ""){
96 0           my $partialSegString=segmentZhStr($partialChnString);
97 0           $finalResult=$finalResult.$partialSegString;
98            
99 0           $partialChnString="";
100 0           $partialSegString="";
101             }
102            
103 0           $finalResult=~s/^\x20+//;
104 0           $finalResult=~s/\x20+\Z//;
105 0           $finalResult=~s/\x20+/\x20/g;
106            
107 0           return $finalResult;
108             }
109            
110             sub segmentZhStr{ #segmenting a string of Chinese characters, there should be no non-Chinese character in the string
111 0     0 0   my $inputString=$_[0];
112 0           my $result="";
113            
114             #for debug
115             #print STDERR "Try to segment string $inputString\n";
116            
117 0           my $lenOfString=length($inputString)/2;
118 0           my @arcTable=();
119            
120             #----------------------------------------------------------
121             #step0, initialize the arcTable
122 0           for(my $i=0;$i<$lenOfString;$i++){
123 0           for(my $j=0;$j<$lenOfString;$j++){
124 0 0         if($i==$j){
125 0           $arcTable[$i][$j]=1;
126             }else{
127 0           $arcTable[$i][$j]=-1;
128             }
129             }
130             }
131            
132            
133             #-----------------------------------------------------------
134             #step1: search for all possible arcs in the input string
135             # and create an array for them
136            
137 0           for(my $currentPos=0;$currentPos<$lenOfString;$currentPos++){ #currentPos is the index of Chinese character
138 0           my $currentChar=substr($inputString,$currentPos*2,2);
139            
140             #from this position, try to find all possible words led by this character
141 0           my $possibleLen=$longestWordListStart{$currentChar};
142 0 0         $possibleLen=1 if (! defined $possibleLen);
143             #for debug
144             #print STDERR "\n$currentChar=$possibleLen\n";
145            
146 0 0         if(($possibleLen+$currentPos)> ($lenOfString-1)){
147 0           $possibleLen=$lenOfString-$currentPos;
148             }
149            
150 0           while($possibleLen>=2){ #all possible words with more than 2 characters
151 0           my $subString=substr($inputString,$currentPos*2,$possibleLen*2);
152            
153             #for debug
154             #print STDERR "s=$subString\n";
155            
156 0 0         if($wordFreqList{$subString}){
157             #for debug
158             #print STDERR "$subString found\n";
159            
160 0           $arcTable[$currentPos][$currentPos+$possibleLen-1]=$wordFreqList{$subString};
161             }
162            
163            
164 0           $possibleLen--;
165             }
166            
167             }
168            
169             #for debug
170             #for($i=0;$i<$lenOfString;$i++){
171             # for($j=0;$j<$lenOfString;$j++){
172             # print " ",$arcTable[$i][$j];
173             # }
174             # print "\n";
175             #}
176            
177            
178             #--------------------------------------------------------------------------
179             #step2: from the arc table, try to find the best path as segmentation at
180             #each point use the longest possible arc
181             # Try from two directions for the search: left to right and right to left
182             # using the one with higher product of frequency of the arcs
183            
184 0           my @leftRightSegLabel=();
185 0           my @rightLeftSegLabel=();
186            
187             #initialize the segmentation label array
188 0           for(my $k=0;$k<$lenOfString;$k++){
189 0           $leftRightSegLabel[$k]=0;
190 0           $rightLeftSegLabel[$k]=0;
191             }
192            
193             #from left to right
194             #-------------------------------
195 0           my $leftToRightFreq=0;
196 0           my $thisCharIndex=0;
197 0           my $charIndexEnd=$lenOfString-1;
198            
199 0           while($thisCharIndex<$lenOfString){
200 0           my $endCharIndex=$charIndexEnd;
201 0           my $found=0;
202            
203 0   0       while((!$found)&&($endCharIndex>=$thisCharIndex)){
204 0 0         if($arcTable[$thisCharIndex][$endCharIndex]!=-1){
205 0           $leftToRightFreq+=log($arcTable[$thisCharIndex][$endCharIndex]);
206 0           $found=1;
207             }
208             else{
209 0           $endCharIndex--;
210             }
211             }
212            
213 0           $leftRightSegLabel[$endCharIndex]=1;
214 0           $thisCharIndex=$endCharIndex+1;
215             }
216            
217             #for debug
218             #print STDERR @leftRightSegLabel,"\n $leftToRightFreq\n";
219            
220             #from right to left
221             #---------------------------------
222 0           my $rightToLeftFreq=0;
223 0           $thisCharIndex=$lenOfString-1;
224            
225 0           while($thisCharIndex>=0){
226 0           my $startCharIndex=0;
227 0           my $found=0;
228 0   0       while((!$found)&&($startCharIndex<=$thisCharIndex)){
229 0 0         if($arcTable[$startCharIndex][$thisCharIndex]!=-1){
230 0           $found=1;
231 0           $rightToLeftFreq+=log($arcTable[$startCharIndex][$thisCharIndex]);
232             }
233             else{
234 0           $startCharIndex++;
235             }
236             }
237            
238 0           $rightLeftSegLabel[$startCharIndex]=1;
239 0           $thisCharIndex=$startCharIndex-1;
240             }
241            
242             #for debug
243             #print STDERR @rightLeftSegLabel,"\n $rightToLeftFreq\n";
244            
245            
246             #---------------------------------------------------------------------------------
247             # Step3: create result
248 0 0         if($leftToRightFreq>$rightToLeftFreq){#using left to right solution, prefer right to left
249 0           for(my $p=0;$p<$lenOfString;$p++){
250 0           $result=$result.substr($inputString, $p*2, 2);
251            
252 0 0         if($leftRightSegLabel[$p]==1){
253 0           $result=$result." ";
254             }
255             }
256             }
257             else{
258 0           for(my $p=0;$p<$lenOfString;$p++){
259 0 0         if($rightLeftSegLabel[$p]==1){
260 0           $result=$result." ";
261             }
262 0           $result=$result.substr($inputString, $p*2, 2);
263             }
264             }
265            
266 0           $result=~s/^\x20+//;
267 0           $result=~s/\x20+\Z//;
268            
269             #for debug
270             #print "result=$result\n";
271            
272 0           return " $result ";
273             }
274            
275             set_dic();
276             1;
277            
278             =pod
279            
280             =head1 NAME
281            
282             Lingua::ZH::WordSegment - Simple Simplified Chinese Word Segmentation
283            
284             =head1 SYNOPSIS
285            
286             use Lingua::ZH::WordSegment;
287             print seg($str_in);
288             seg_STDIO();# Read from STDIN, and print the segmented result to STDOUT
289            
290             set_dic($dictionary_file_name); #load word from the file, this is not a must
291             perl -MLingua::ZH::WordSegment -e 'seg_STDIO();' < input_file > output_file
292            
293             =head1 DESCRIPTION
294            
295             The default word list is extracted from People's Daily in Jan, 1998 owned by Institute of Computational Linguistics, Peking University, China
296            
297             This code is mainly written by Joy, joy@cs.cmu.edu in July 4th, 2001.
298            
299             This program is a perl version of left-right mandarin segmentor
300             As LDC segmenter takes a long time to build the DB files which makes the
301             the training process last too long time.
302            
303             For ablation experiments, we do not need to create the DB files because the
304             specific frequency dictionary will be used only once for each slice.
305            
306             The algorithm for this segmenter is to search the longest word at each point
307             from both left and right directions, and choose the one with higher frequency
308             product.
309            
310             The above is Joy's original declarations.
311            
312            
313            
314             =head1 METHODS
315            
316             seg($str_in); # return the string of segmentation result.
317             seg_STDIO(); # Read from STDIN, and print the segmented result to STDOUT
318            
319             set_dic($dictionary_file_name)
320             #The format of the dictionary file for each line is:
321             # "chineseWord\tFrequency\n"
322             #
323             #Notice that if you don't call set_dic,
324             #the default dictionary in GBK encoding will be loaded.
325             #The default dictionary is extracted from corpus of the People's Daily,
326             #January, 1998.
327             #Thanks to Institute of Computational Linguistics, Peking University,China.
328            
329             =head1 SEE ALSO
330            
331            
332             =head1 AUTHORS
333            
334             Rewrited by Chen Yirong E cyr.master@gmail.com E, September 21, 2006 and modified in Feb 20, 2007.
335             Original Author:
336             Joy, joy@cs.cmu.edu in July 4th, 2001
337            
338             =head1 KUDOS
339            
340             Many thanks to Joy who made the code available.
341             Thanks to the PKU Corpus (from Institute of Computational Linguistics, Peking University, China) to help to automatic generate the default dictionary.
342            
343             =head1 COPYRIGHT
344            
345             This program is free software; you can redistribute it and/or modify it
346             under the same terms as Perl itself.
347            
348             See L
349            
350             =cut
351            
352             __DATA__