File Coverage

blib/lib/Unicode/CharName.pm
Criterion Covered Total %
statement 37 37 100.0
branch 16 18 88.8
condition 12 15 80.0
subroutine 5 5 100.0
pod 0 2 0.0
total 70 77 90.9


line stmt bran cond sub pod time code
1             package Unicode::CharName;
2              
3             =head1 NAME
4              
5             Unicode::CharName - Look up Unicode character names
6              
7             =head1 SYNOPSIS
8              
9             use Unicode::CharName qw(uname ublock);
10             print uname(ord('%')), "\n";
11             print ublock(0x0300), "\n";
12              
13             =head1 DESCRIPTION
14              
15             This module provides two functions named uname() and ublock(). The
16             uname() function will return the Unicode character name for the given
17             code (a number between 0 and 0x10FFFF). Unicode character names are
18             written in upper-case ASCII letters, and are strings like:
19              
20             LATIN CAPITAL LETTER A
21             LATIN SMALL LETTER A WITH RING ABOVE
22             CJK UNIFIED IDEOGRAPH 7C80
23             HANGUL SYLLABLE PWILH
24              
25             The ublock() function will return the name of the Unicode
26             character block that the given character belongs to.
27              
28             =head1 SEE ALSO
29              
30             L
31              
32             =head1 COPYRIGHT
33              
34             Copyright 1997,2005 Gisle Aas.
35              
36             This library is free software; you can redistribute it and/or
37             modify it under the same terms as Perl itself.
38              
39             Name table extracted from the Unicode 4.1 Character
40             Database. Copyright (c) 1991-2005 Unicode, Inc. All Rights reserved.
41              
42             =cut
43              
44             require Exporter;
45             @ISA=qw(Exporter);
46             @EXPORT_OK = qw(uname ublock);
47             $VERSION = sprintf("%d.%02d", q$Revision$ =~ /(\d+)\.(\d+)/);
48              
49 1     1   1939 use strict;
  1         1  
  1         29  
50 1     1   3 use vars qw(%JAMO_SHORT_NAME %NAMES @BLOCKS);
  1         0  
  1         688  
51              
52             sub uname {
53 9     9 0 46 my $code = shift;
54 9 100       17 if ($code >= 0x4E00) {
55 5 100 66     34 if ($code <= 0x9FFF || ($code >= 0xF900 && $code <= 0xFAFF)) {
    100 66        
    100 100        
      66        
56             # CJK Ideographs
57 1         5 return sprintf "CJK UNIFIED IDEOGRAPH %04X", $code;
58             } elsif ($code >= 0xD800 && $code <= 0xF8FF) {
59             # Surrogate and private
60 2 100       3 if ($code <= 0xDFFF) {
61 1         2 return "<surrogate>";
62             } else {
63 1         2 return "<private>";
64             }
65             } elsif ($code >= 0xAC00 && $code <= 0xD7A3) {
66             # Hangul Syllables
67 1         2 my $sindex = $code - 0xAC00;
68 1         5 my $l = 0x1100 + int($sindex / (21*28));
69 1         3 my $v = 0x1161 + int(($sindex % (21*28)) / 28);
70 1         2 my $t = 0x11A7 + $sindex % 28;
71 1         1 my @s = ($l, $v, $t);
72 1 50       3 pop(@s) if $t == 0x11A7;
73             @s = map {
74 1         2 $_ = sprintf("%04X", $_);
  3         5  
75 3 50       10 $JAMO_SHORT_NAME{$_} || " U+$_ ";
76             } @s;
77 1         4 return join("", "HANGUL SYLLABLE ", @s)
78             }
79             }
80 5 100       10 _init_names() unless %NAMES;
81 5         19 $NAMES{sprintf("%04X",$code)}
82             }
83              
84             sub ublock
85             {
86 5     5 0 16 my $code = shift;
87             # XXX: could use a binary search, but I am too lazy today...
88 5         2 my $block;
89 5         6 for $block (@BLOCKS) {
90 389 100 100     896 return $block->[2] if $block->[0] <= $code && $block->[1] >= $code;
91             }
92 1         2 undef;
93             }
94              
95             @BLOCKS = (
96             # start end block name
97             [0x0000, 0x007F => 'Basic Latin'],
98             [0x0080, 0x00FF => 'Latin-1 Supplement'],
99             [0x0100, 0x017F => 'Latin Extended-A'],
100             [0x0180, 0x024F => 'Latin Extended-B'],
101             [0x0250, 0x02AF => 'IPA Extensions'],
102             [0x02B0, 0x02FF => 'Spacing Modifier Letters'],
103             [0x0300, 0x036F => 'Combining Diacritical Marks'],
104             [0x0370, 0x03FF => 'Greek and Coptic'],
105             [0x0400, 0x04FF => 'Cyrillic'],
106             [0x0500, 0x052F => 'Cyrillic Supplement'],
107             [0x0530, 0x058F => 'Armenian'],
108             [0x0590, 0x05FF => 'Hebrew'],
109             [0x0600, 0x06FF => 'Arabic'],
110             [0x0700, 0x074F => 'Syriac'],
111             [0x0750, 0x077F => 'Arabic Supplement'],
112             [0x0780, 0x07BF => 'Thaana'],
113             [0x0900, 0x097F => 'Devanagari'],
114             [0x0980, 0x09FF => 'Bengali'],
115             [0x0A00, 0x0A7F => 'Gurmukhi'],
116             [0x0A80, 0x0AFF => 'Gujarati'],
117             [0x0B00, 0x0B7F => 'Oriya'],
118             [0x0B80, 0x0BFF => 'Tamil'],
119             [0x0C00, 0x0C7F => 'Telugu'],
120             [0x0C80, 0x0CFF => 'Kannada'],
121             [0x0D00, 0x0D7F => 'Malayalam'],
122             [0x0D80, 0x0DFF => 'Sinhala'],
123             [0x0E00, 0x0E7F => 'Thai'],
124             [0x0E80, 0x0EFF => 'Lao'],
125             [0x0F00, 0x0FFF => 'Tibetan'],
126             [0x1000, 0x109F => 'Myanmar'],
127             [0x10A0, 0x10FF => 'Georgian'],
128             [0x1100, 0x11FF => 'Hangul Jamo'],
129             [0x1200, 0x137F => 'Ethiopic'],
130             [0x1380, 0x139F => 'Ethiopic Supplement'],
131             [0x13A0, 0x13FF => 'Cherokee'],
132             [0x1400, 0x167F => 'Unified Canadian Aboriginal Syllabics'],
133             [0x1680, 0x169F => 'Ogham'],
134             [0x16A0, 0x16FF => 'Runic'],
135             [0x1700, 0x171F => 'Tagalog'],
136             [0x1720, 0x173F => 'Hanunoo'],
137             [0x1740, 0x175F => 'Buhid'],
138             [0x1760, 0x177F => 'Tagbanwa'],
139             [0x1780, 0x17FF => 'Khmer'],
140             [0x1800, 0x18AF => 'Mongolian'],
141             [0x1900, 0x194F => 'Limbu'],
142             [0x1950, 0x197F => 'Tai Le'],
143             [0x1980, 0x19DF => 'New Tai Lue'],
144             [0x19E0, 0x19FF => 'Khmer Symbols'],
145             [0x1A00, 0x1A1F => 'Buginese'],
146             [0x1D00, 0x1D7F => 'Phonetic Extensions'],
147             [0x1D80, 0x1DBF => 'Phonetic Extensions Supplement'],
148             [0x1DC0, 0x1DFF => 'Combining Diacritical Marks Supplement'],
149             [0x1E00, 0x1EFF => 'Latin Extended Additional'],
150             [0x1F00, 0x1FFF => 'Greek Extended'],
151             [0x2000, 0x206F => 'General Punctuation'],
152             [0x2070, 0x209F => 'Superscripts and Subscripts'],
153             [0x20A0, 0x20CF => 'Currency Symbols'],
154             [0x20D0, 0x20FF => 'Combining Diacritical Marks for Symbols'],
155             [0x2100, 0x214F => 'Letterlike Symbols'],
156             [0x2150, 0x218F => 'Number Forms'],
157             [0x2190, 0x21FF => 'Arrows'],
158             [0x2200, 0x22FF => 'Mathematical Operators'],
159             [0x2300, 0x23FF => 'Miscellaneous Technical'],
160             [0x2400, 0x243F => 'Control Pictures'],
161             [0x2440, 0x245F => 'Optical Character Recognition'],
162             [0x2460, 0x24FF => 'Enclosed Alphanumerics'],
163             [0x2500, 0x257F => 'Box Drawing'],
164             [0x2580, 0x259F => 'Block Elements'],
165             [0x25A0, 0x25FF => 'Geometric Shapes'],
166             [0x2600, 0x26FF => 'Miscellaneous Symbols'],
167             [0x2700, 0x27BF => 'Dingbats'],
168             [0x27C0, 0x27EF => 'Miscellaneous Mathematical Symbols-A'],
169             [0x27F0, 0x27FF => 'Supplemental Arrows-A'],
170             [0x2800, 0x28FF => 'Braille Patterns'],
171             [0x2900, 0x297F => 'Supplemental Arrows-B'],
172             [0x2980, 0x29FF => 'Miscellaneous Mathematical Symbols-B'],
173             [0x2A00, 0x2AFF => 'Supplemental Mathematical Operators'],
174             [0x2B00, 0x2BFF => 'Miscellaneous Symbols and Arrows'],
175             [0x2C00, 0x2C5F => 'Glagolitic'],
176             [0x2C80, 0x2CFF => 'Coptic'],
177             [0x2D00, 0x2D2F => 'Georgian Supplement'],
178             [0x2D30, 0x2D7F => 'Tifinagh'],
179             [0x2D80, 0x2DDF => 'Ethiopic Extended'],
180             [0x2E00, 0x2E7F => 'Supplemental Punctuation'],
181             [0x2E80, 0x2EFF => 'CJK Radicals Supplement'],
182             [0x2F00, 0x2FDF => 'Kangxi Radicals'],
183             [0x2FF0, 0x2FFF => 'Ideographic Description Characters'],
184             [0x3000, 0x303F => 'CJK Symbols and Punctuation'],
185             [0x3040, 0x309F => 'Hiragana'],
186             [0x30A0, 0x30FF => 'Katakana'],
187             [0x3100, 0x312F => 'Bopomofo'],
188             [0x3130, 0x318F => 'Hangul Compatibility Jamo'],
189             [0x3190, 0x319F => 'Kanbun'],
190             [0x31A0, 0x31BF => 'Bopomofo Extended'],
191             [0x31C0, 0x31EF => 'CJK Strokes'],
192             [0x31F0, 0x31FF => 'Katakana Phonetic Extensions'],
193             [0x3200, 0x32FF => 'Enclosed CJK Letters and Months'],
194             [0x3300, 0x33FF => 'CJK Compatibility'],
195             [0x3400, 0x4DBF => 'CJK Unified Ideographs Extension A'],
196             [0x4DC0, 0x4DFF => 'Yijing Hexagram Symbols'],
197             [0x4E00, 0x9FFF => 'CJK Unified Ideographs'],
198             [0xA000, 0xA48F => 'Yi Syllables'],
199             [0xA490, 0xA4CF => 'Yi Radicals'],
200             [0xA700, 0xA71F => 'Modifier Tone Letters'],
201             [0xA800, 0xA82F => 'Syloti Nagri'],
202             [0xAC00, 0xD7AF => 'Hangul Syllables'],
203             [0xD800, 0xDB7F => 'High Surrogates'],
204             [0xDB80, 0xDBFF => 'High Private Use Surrogates'],
205             [0xDC00, 0xDFFF => 'Low Surrogates'],
206             [0xE000, 0xF8FF => 'Private Use Area'],
207             [0xF900, 0xFAFF => 'CJK Compatibility Ideographs'],
208             [0xFB00, 0xFB4F => 'Alphabetic Presentation Forms'],
209             [0xFB50, 0xFDFF => 'Arabic Presentation Forms-A'],
210             [0xFE00, 0xFE0F => 'Variation Selectors'],
211             [0xFE10, 0xFE1F => 'Vertical Forms'],
212             [0xFE20, 0xFE2F => 'Combining Half Marks'],
213             [0xFE30, 0xFE4F => 'CJK Compatibility Forms'],
214             [0xFE50, 0xFE6F => 'Small Form Variants'],
215             [0xFE70, 0xFEFF => 'Arabic Presentation Forms-B'],
216             [0xFF00, 0xFFEF => 'Halfwidth and Fullwidth Forms'],
217             [0xFFF0, 0xFFFF => 'Specials'],
218             [0x10000, 0x1007F => 'Linear B Syllabary'],
219             [0x10080, 0x100FF => 'Linear B Ideograms'],
220             [0x10100, 0x1013F => 'Aegean Numbers'],
221             [0x10140, 0x1018F => 'Ancient Greek Numbers'],
222             [0x10300, 0x1032F => 'Old Italic'],
223             [0x10330, 0x1034F => 'Gothic'],
224             [0x10380, 0x1039F => 'Ugaritic'],
225             [0x103A0, 0x103DF => 'Old Persian'],
226             [0x10400, 0x1044F => 'Deseret'],
227             [0x10450, 0x1047F => 'Shavian'],
228             [0x10480, 0x104AF => 'Osmanya'],
229             [0x10800, 0x1083F => 'Cypriot Syllabary'],
230             [0x10A00, 0x10A5F => 'Kharoshthi'],
231             [0x1D000, 0x1D0FF => 'Byzantine Musical Symbols'],
232             [0x1D100, 0x1D1FF => 'Musical Symbols'],
233             [0x1D200, 0x1D24F => 'Ancient Greek Musical Notation'],
234             [0x1D300, 0x1D35F => 'Tai Xuan Jing Symbols'],
235             [0x1D400, 0x1D7FF => 'Mathematical Alphanumeric Symbols'],
236             [0x20000, 0x2A6DF => 'CJK Unified Ideographs Extension B'],
237             [0x2F800, 0x2FA1F => 'CJK Compatibility Ideographs Supplement'],
238             [0xE0000, 0xE007F => 'Tags'],
239             [0xE0100, 0xE01EF => 'Variation Selectors Supplement'],
240             [0xF0000, 0xFFFFF => 'Supplementary Private Use Area-A'],
241             [0x100000, 0x10FFFF => 'Supplementary Private Use Area-B'],
242             );
243              
244             %JAMO_SHORT_NAME = (
245             '1100' => 'G',
246             '1101' => 'GG',
247             '1102' => 'N',
248             '1103' => 'D',
249             '1104' => 'DD',
250             '1105' => 'L',
251             '1106' => 'M',
252             '1107' => 'B',
253             '1108' => 'BB',
254             '1109' => 'S',
255             '110A' => 'SS',
256             '110B' => '',
257             '110C' => 'J',
258             '110D' => 'JJ',
259             '110E' => 'C',
260             '110F' => 'K',
261             '1110' => 'T',
262             '1111' => 'P',
263             '1112' => 'H',
264             '1161' => 'A',
265             '1162' => 'AE',
266             '1163' => 'YA',
267             '1164' => 'YAE',
268             '1165' => 'EO',
269             '1166' => 'E',
270             '1167' => 'YEO',
271             '1168' => 'YE',
272             '1169' => 'O',
273             '116A' => 'WA',
274             '116B' => 'WAE',
275             '116C' => 'OE',
276             '116D' => 'YO',
277             '116E' => 'U',
278             '116F' => 'WEO',
279             '1170' => 'WE',
280             '1171' => 'WI',
281             '1172' => 'YU',
282             '1173' => 'EU',
283             '1174' => 'YI',
284             '1175' => 'I',
285             '11A8' => 'G',
286             '11A9' => 'GG',
287             '11AA' => 'GS',
288             '11AB' => 'N',
289             '11AC' => 'NJ',
290             '11AD' => 'NH',
291             '11AE' => 'D',
292             '11AF' => 'L',
293             '11B0' => 'LG',
294             '11B1' => 'LM',
295             '11B2' => 'LB',
296             '11B3' => 'LS',
297             '11B4' => 'LT',
298             '11B5' => 'LP',
299             '11B6' => 'LH',
300             '11B7' => 'M',
301             '11B8' => 'B',
302             '11B9' => 'BS',
303             '11BA' => 'S',
304             '11BB' => 'SS',
305             '11BC' => 'NG',
306             '11BD' => 'J',
307             '11BE' => 'C',
308             '11BF' => 'K',
309             '11C0' => 'T',
310             '11C1' => 'P',
311             '11C2' => 'H',
312             );
313              
314             sub _init_names
315             {
316 1     1   78 keys %NAMES = 16351; # pre-extend the hash
317 1         2 local($_);
318 1         5 while (<DATA>) {
319 16351         9524 chop;
320 16351         15437 my($code, $name) = split(' ', $_, 2);
321 16351         28026 $NAMES{$code} = $name;
322             }
323 1         52 close(DATA);
324             }
325              
326             1;
327              
328              
329             __DATA__