File Coverage

blib/lib/Lingua/Han/Utils.pm

Criterion	Covered	Total	%
statement	45	45	100.0
branch	8	12	66.6
condition			n/a
subroutine	11	11	100.0
pod	5	5	100.0
total	69	73	94.5

line	stmt	bran	sub	pod	time	code
1						package Lingua::Han::Utils;
2
3	3		3		67757	use warnings;
	3				6
	3				106
4	3		3		19	use strict;
	3				6
	3				112
5	3		3		17	use base 'Exporter';
	3				8
	3				287
6	3		3		20	use vars qw/$VERSION @EXPORT_OK/;
	3				6
	3				254
7						$VERSION = '0.12';
8						@EXPORT_OK = qw/Unihan_value csplit cdecode csubstr clength/;
9
10	3		3		3324	use Encode;
	3				39273
	3				320
11	3		3		9087	use Encode::Detect::CJK qw(detect);
	3				211943
	3				1323
12
13						sub cdecode {
14	9		9	1	15	my $word = shift;
15	9				38	my $encoding = detect($word);
16	9	100			1653	$encoding = 'cp936' if $encoding eq 'iso-8859-1'; # hard fix
17	9				36	$word = decode($encoding, $word);
18	9				10914	return $word;
19						}
20
21						sub Unihan_value {
22	9		9	1	31	my $word = shift;
23	9				26	$word = cdecode($word);
24	9				38	my @unihan = map { uc sprintf("%x",$_) } unpack ("U*", $word);
	21				81
25	9	100			70	return wantarray?@unihan:(join('', @unihan));
26						}
27
28						sub csplit {
29	3		3	1	6	my $word = shift;
30	3				10	my $encoding = detect($word);
31	3				225	my @return_words;
32	3				8	my @code = Unihan_value($word);
33	3				8	foreach my $code (@code) {
34	9				25	my $value = pack("U*", hex $code);
35	9				24	$value = encode($encoding, $value);
36	9	50			213	push @return_words, $value if ($value);
37						}
38	3	50			21	return wantarray?@return_words:(join('', @return_words));
39						}
40
41						sub csubstr {
42	1		1	1	3	my ($word, $offset, $len) = @_;
43	1				3	my @words = csplit($word);
44	1	50			8	$len = scalar @words - $offset unless ($len);
45	1				5	@words = splice(@words, $offset, $len);
46	1	50			8	return wantarray?@words:(join('', @words));
47						}
48
49						sub clength {
50	1		1	1	4	my $word = shift;
51	1				3	my @words = csplit($word);
52	1				6	return scalar @words;
53						}
54
55						1;
56						__END__
57						=encoding utf8
58
59						=head1 NAME
60
61						Lingua::Han::Utils - The utility tools of Chinese character(HanZi)
62
63						=head1 SYNOPSIS
64
65						use Lingua::Han::Utils qw/Unihan_value csplit cdecode csubstr clength/;
66
67						# cdecode
68						# the same as decode('cp936', $word) in ASCII editing mode
69						# and decode('utf8', $word) in Unicode editing mode
70						my $word = cdecode($word);
71
72						# Unihan_value
73						# return the first field of Unihan.txt on unicode.org
74						my $word = "æˆ‘";
75						my $unihan = Unihan_value($word); # return '6211'
76						my $words = "çˆ±ä½ ";
77						my @unihan = Unihan_value($word); # return (7231, 4F60)
78						my $unihan = Unihan_value($word); # return 72314F60
79
80						# csplit
81						# split the Chinese characters into an array
82						my $words = "æˆ‘çˆ±ä½ ";
83						my @words = csplit($words); # return ("æˆ‘", "çˆ±", "ä½ ")
84
85						# csubstr
86						# treat the Chinese characters as one
87						# so it's the same as splice(csplit($words), $offset, $length)
88						my $words = "æˆ‘çˆ±ä½ å•Š";
89						my @words = csubstr($words, 1, 2); # return ("çˆ±", "ä½ ")
90						my @words = csubstr($words, 1); # return ("çˆ±", "ä½ ", "å•Š")
91						my $words = csubstr($words, 1, 2); # çˆ±ä½
92
93						# clength
94						# treat the Chinese character as one
95						my $words = "æˆ‘çˆ±ä½ ";
96						print clength($words); # 3
97
98						=head1 EXPORT
99
100						Nothing is exported by default.
101
102						=head1 EXPORT_OK
103
104						=over 4
105
106						=item cdecode
107
108						use L<Encode::Guess> to decode the character. It behavers like: decode('cp936', $word) under ASCII editing mode and decode('utf8', $word) under Unicode editing mode.
109
110						=item Unihan_value
111
112						the first field of Unihan.txt is the Unicode scalar value as U+[x]xxxx, we return the [x]xxxx.
113
114						=item csplit
115
116						split the Chinese characters into an array, English words can be mixed in.
117
118						=item csubstr(WORD, OFFSET, LENGTH)
119
120						treat the Chinese character as one word, substr it.
121
122						(BE CAFEFUL! it's NOT lvalue, we cann't use csubstr($word, 2, 3) = $REPLACEMENT)
123
124						if no LENGTH is specified, substr form OFFSET to END.
125
126						=item clength
127
128						treat the Chinese character as one word(length 1).
129
130						=back
131
132						=head1 DOCUMENT
133
134						a Chinese version of document can be found @ L<http://www.fayland.org/journal/Lingua-Han-Utils.html>
135
136						=head1 AUTHOR
137
138						Fayland Lam, C<< <fayland at gmail.com> >>
139
140						=head1 BUGS
141
142						Please report any bugs or feature requests to
143						C<bug-lingua-han-utils at rt.cpan.org>, or through the web interface at
144						L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Han-Utils>.
145						I will be notified, and then you'll automatically be notified of progress on
146						your bug as I make changes.
147
148						=head1 SUPPORT
149
150						You can find documentation for this module with the perldoc command.
151
152						perldoc Lingua::Han::Utils
153
154						You can also look for information at:
155
156						=over 4
157
158						=item * AnnoCPAN: Annotated CPAN documentation
159
160						L<http://annocpan.org/dist/Lingua-Han-Utils>
161
162						=item * CPAN Ratings
163
164						L<http://cpanratings.perl.org/d/Lingua-Han-Utils>
165
166						=item * RT: CPAN's request tracker
167
168						L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Han-Utils>
169
170						=item * Search CPAN
171
172						L<http://search.cpan.org/dist/Lingua-Han-Utils>
173
174						=back
175
176						=head1 ACKNOWLEDGEMENTS
177
178						the wonderful L<Encode::Guess>
179
180						=head1 COPYRIGHT & LICENSE
181
182						Copyright 2005 Fayland Lam, all rights reserved.
183
184						This program is free software; you can redistribute it and/or modify it
185						under the same terms as Perl itself.