File Coverage

blib/lib/Lingua/Han/Utils.pm
Criterion Covered Total %
statement 45 45 100.0
branch 8 12 66.6
condition n/a
subroutine 11 11 100.0
pod 5 5 100.0
total 69 73 94.5


line stmt bran cond sub pod time code
1             package Lingua::Han::Utils;
2              
3 3     3   67757 use warnings;
  3         6  
  3         106  
4 3     3   19 use strict;
  3         6  
  3         112  
5 3     3   17 use base 'Exporter';
  3         8  
  3         287  
6 3     3   20 use vars qw/$VERSION @EXPORT_OK/;
  3         6  
  3         254  
7             $VERSION = '0.12';
8             @EXPORT_OK = qw/Unihan_value csplit cdecode csubstr clength/;
9              
10 3     3   3324 use Encode;
  3         39273  
  3         320  
11 3     3   9087 use Encode::Detect::CJK qw(detect);
  3         211943  
  3         1323  
12              
13             sub cdecode {
14 9     9 1 15 my $word = shift;
15 9         38 my $encoding = detect($word);
16 9 100       1653 $encoding = 'cp936' if $encoding eq 'iso-8859-1'; # hard fix
17 9         36 $word = decode($encoding, $word);
18 9         10914 return $word;
19             }
20              
21             sub Unihan_value {
22 9     9 1 31 my $word = shift;
23 9         26 $word = cdecode($word);
24 9         38 my @unihan = map { uc sprintf("%x",$_) } unpack ("U*", $word);
  21         81  
25 9 100       70 return wantarray?@unihan:(join('', @unihan));
26             }
27              
28             sub csplit {
29 3     3 1 6 my $word = shift;
30 3         10 my $encoding = detect($word);
31 3         225 my @return_words;
32 3         8 my @code = Unihan_value($word);
33 3         8 foreach my $code (@code) {
34 9         25 my $value = pack("U*", hex $code);
35 9         24 $value = encode($encoding, $value);
36 9 50       213 push @return_words, $value if ($value);
37             }
38 3 50       21 return wantarray?@return_words:(join('', @return_words));
39             }
40              
41             sub csubstr {
42 1     1 1 3 my ($word, $offset, $len) = @_;
43 1         3 my @words = csplit($word);
44 1 50       8 $len = scalar @words - $offset unless ($len);
45 1         5 @words = splice(@words, $offset, $len);
46 1 50       8 return wantarray?@words:(join('', @words));
47             }
48              
49             sub clength {
50 1     1 1 4 my $word = shift;
51 1         3 my @words = csplit($word);
52 1         6 return scalar @words;
53             }
54              
55             1;
56             __END__
57             =encoding utf8
58              
59             =head1 NAME
60              
61             Lingua::Han::Utils - The utility tools of Chinese character(HanZi)
62              
63             =head1 SYNOPSIS
64              
65             use Lingua::Han::Utils qw/Unihan_value csplit cdecode csubstr clength/;
66              
67             # cdecode
68             # the same as decode('cp936', $word) in ASCII editing mode
69             # and decode('utf8', $word) in Unicode editing mode
70             my $word = cdecode($word);
71              
72             # Unihan_value
73             # return the first field of Unihan.txt on unicode.org
74             my $word = "我";
75             my $unihan = Unihan_value($word); # return '6211'
76             my $words = "爱你";
77             my @unihan = Unihan_value($word); # return (7231, 4F60)
78             my $unihan = Unihan_value($word); # return 72314F60
79              
80             # csplit
81             # split the Chinese characters into an array
82             my $words = "我爱你";
83             my @words = csplit($words); # return ("我", "爱", "ä½ ")
84              
85             # csubstr
86             # treat the Chinese characters as one
87             # so it's the same as splice(csplit($words), $offset, $length)
88             my $words = "我爱你啊";
89             my @words = csubstr($words, 1, 2); # return ("爱", "ä½ ")
90             my @words = csubstr($words, 1); # return ("爱", "ä½ ", "å•Š")
91             my $words = csubstr($words, 1, 2); # 爱你
92              
93             # clength
94             # treat the Chinese character as one
95             my $words = "我爱你";
96             print clength($words); # 3
97              
98             =head1 EXPORT
99              
100             Nothing is exported by default.
101              
102             =head1 EXPORT_OK
103              
104             =over 4
105              
106             =item cdecode
107              
108             use L<Encode::Guess> to decode the character. It behavers like: decode('cp936', $word) under ASCII editing mode and decode('utf8', $word) under Unicode editing mode.
109              
110             =item Unihan_value
111              
112             the first field of Unihan.txt is the Unicode scalar value as U+[x]xxxx, we return the [x]xxxx.
113              
114             =item csplit
115              
116             split the Chinese characters into an array, English words can be mixed in.
117              
118             =item csubstr(WORD, OFFSET, LENGTH)
119              
120             treat the Chinese character as one word, substr it.
121              
122             (BE CAFEFUL! it's NOT lvalue, we cann't use csubstr($word, 2, 3) = $REPLACEMENT)
123              
124             if no LENGTH is specified, substr form OFFSET to END.
125              
126             =item clength
127              
128             treat the Chinese character as one word(length 1).
129              
130             =back
131              
132             =head1 DOCUMENT
133              
134             a Chinese version of document can be found @ L<http://www.fayland.org/journal/Lingua-Han-Utils.html>
135              
136             =head1 AUTHOR
137              
138             Fayland Lam, C<< <fayland at gmail.com> >>
139              
140             =head1 BUGS
141              
142             Please report any bugs or feature requests to
143             C<bug-lingua-han-utils at rt.cpan.org>, or through the web interface at
144             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Han-Utils>.
145             I will be notified, and then you'll automatically be notified of progress on
146             your bug as I make changes.
147              
148             =head1 SUPPORT
149              
150             You can find documentation for this module with the perldoc command.
151              
152             perldoc Lingua::Han::Utils
153              
154             You can also look for information at:
155              
156             =over 4
157              
158             =item * AnnoCPAN: Annotated CPAN documentation
159              
160             L<http://annocpan.org/dist/Lingua-Han-Utils>
161              
162             =item * CPAN Ratings
163              
164             L<http://cpanratings.perl.org/d/Lingua-Han-Utils>
165              
166             =item * RT: CPAN's request tracker
167              
168             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Han-Utils>
169              
170             =item * Search CPAN
171              
172             L<http://search.cpan.org/dist/Lingua-Han-Utils>
173              
174             =back
175              
176             =head1 ACKNOWLEDGEMENTS
177              
178             the wonderful L<Encode::Guess>
179              
180             =head1 COPYRIGHT & LICENSE
181              
182             Copyright 2005 Fayland Lam, all rights reserved.
183              
184             This program is free software; you can redistribute it and/or modify it
185             under the same terms as Perl itself.