File Coverage

blib/lib/Unicode/String.pm

Criterion	Covered	Total	%
statement	178	228	78.0
branch	72	110	65.4
condition	26	33	78.7
subroutine	25	30	83.3
pod	23	25	92.0
total	324	426	76.0

line	stmt	bran	cond	sub	pod	time	code
1							package Unicode::String;
2
3							# Copyright 1997-1999, Gisle Aas.
4
5	5			5		4786	use strict;
	5					5
	5					124
6	5			5		15	use vars qw($VERSION @ISA @EXPORT_OK $UTF7_OPTIONAL_DIRECT_CHARS);
	5					6
	5					259
7	5			5		15	use Carp;
	5					6
	5					680
8
9							require Exporter;
10							require DynaLoader;
11							@ISA = qw(Exporter DynaLoader);
12
13							@EXPORT_OK = qw(
14							utf16 utf16le utf16be ucs2
15							utf8
16							utf7
17							ucs4 utf32 utf32be utf32le
18							latin1
19							uchr uhex
20
21							byteswap2 byteswap4
22							);
23
24							$VERSION = '2.10';
25
26							$UTF7_OPTIONAL_DIRECT_CHARS \|\|= 1;
27
28							bootstrap Unicode::String $VERSION;
29
30	5					39	use overload '""' => \&as_string,
31							'bool' => \&as_bool,
32							'0+' => \&as_num,
33							'.=' => \&append,
34							'.' => \&concat,
35							'x' => \&repeat,
36							'=' => \©,
37	5			5		4462	'fallback' => 1;
	5					3570
38
39							my %stringify = (
40							unicode => \&utf16,
41							utf16 => \&utf16,
42							utf16be => \&utf16,
43							utf16le => \&utf16le,
44							ucs2 => \&utf16,
45							utf8 => \&utf8,
46							utf7 => \&utf7,
47							ucs4 => \&ucs4,
48							utf32 => \&ucs4,
49							utf32be => \&ucs4,
50							utf32le => \&utf32le,
51							latin1 => \&latin1,
52							'hex' => \&hex,
53							);
54
55							my $stringify_as = \&utf8;
56
57							# some aliases
58							*ucs2 = \&utf16;
59							*utf16be = \&utf16;
60							*utf32 = \&ucs4;
61							*utf32be = \&ucs4;
62							*uhex = \&hex;
63							*uchr = \&chr;
64
65							sub new
66							{
67							#_dump_arg("new", @_);
68	54			54	1	796	my $class = shift;
69	54					38	my $str;
70	54					58	my $self = bless \$str, $class;
71	54	100				83	&$stringify_as($self, shift) if @_;
72	54					59	$self;
73							}
74
75
76							sub repeat
77							{
78	1			1	1	6	my($self, $count) = @_;
79	1					1	my $class = ref($self);
80	1					3	my $str = $$self x $count;
81	1					2	bless \$str, $class;
82							}
83
84
85							sub _dump_arg
86							{
87	0			0		0	my $func = shift;
88	0					0	print "$func(";
89	0	0				0	print join(",", map { if (defined $_) {
	0					0
90	0					0	my $x = overload::StrVal($_);
91	0					0	$x =~ s/\n/\\n/g;
92	0	0				0	$x = '""' unless length $x;
93	0					0	$x;
94							} else {
95	0					0	"undef"
96							}
97							} @_);
98	0					0	print ")\n";
99							}
100
101
102							sub concat
103							{
104							#_dump_arg("concat", @_);
105	4			4	1	21	my($self, $other, $reversed) = @_;
106	4					5	my $class = ref($self);
107	4	100				10	unless (UNIVERSAL::isa($other, 'Unicode::String')) {
108	2					4	$other = Unicode::String->new($other);
109							}
110	4	100				9	my $str = $reversed ? $$other . $$self : $$self . $$other;
111	4					9	bless \$str, $class;
112							}
113
114
115							sub append
116							{
117							#_dump_arg("append", @_);
118	20			20	1	56	my($self, $other) = @_;
119	20	50				43	unless (UNIVERSAL::isa($other, 'Unicode::String')) {
120	0					0	$other = Unicode::String->new($other);
121							}
122	20					66	$$self .= $$other;
123	20					31	$self;
124							}
125
126
127							sub copy
128							{
129	1			1	1	14	my($self) = @_;
130	1					2	my $class = ref($self);
131	1					2	my $copy = $$self;
132	1					2	bless \$copy, $class;
133							}
134
135
136							sub as_string
137							{
138							#_dump_arg("as_string", @_);
139	3			3	1	16	&$stringify_as($_[0]);
140							}
141
142
143							sub as_bool
144							{
145							# This is different from perl's normal behaviour by not letting
146							# a U+0030 ("0") be false.
147	3			3	1	12	my $self = shift;
148	3	100				16	$$self ? 1 : "";
149							}
150
151
152							sub as_num
153							{
154							# Should be able to use the numeric property from Unidata
155							# in order to parse a large number of numbers. Currently we
156							# only convert it to a plain string and let perl's normal
157							# num-converter do the job.
158	1			1	1	10	my $self = shift;
159	1					2	my $str = $self->utf8;
160	1					2	$str + 0;
161							}
162
163
164							sub stringify_as
165							{
166	3			3	1	21	my $class;
167	3	50				6	if (@_ > 1) {
168	0					0	$class = shift;
169	0	0				0	$class = ref($class) if ref($class);
170							} else {
171	3					3	$class = "Unicode::String";
172							}
173	3					3	my $old = $stringify_as;
174	3	50				5	if (@_) {
175	3					2	my $as = shift;
176							croak("Don't know how to stringify as '$as'")
177	3	100				156	unless exists $stringify{$as};
178	2					4	$stringify_as = $stringify{$as};
179							}
180	2					2	$old;
181							}
182
183
184							sub utf16
185							{
186	78			78	0	255	my $self = shift;
187	78	100				105	unless (ref $self) {
188	29					37	my $u = new Unicode::String;
189	29					41	$u->utf16($self);
190	29					91	return $u;
191							}
192	49					45	my $old = $$self;
193	49	100				63	if (@_) {
194	31					31	$$self = shift;
195	31	100				50	if ((length($$self) % 2) != 0) {
196	1	50				17	warn "Uneven UTF16 data" if $^W;
197	1					82	$$self .= "\0";
198							}
199	31	100				54	if ($$self =~ /^\xFF\xFE/) {
200							# the string needs byte swapping
201	2					5	$$self = byteswap2($$self);
202							}
203							}
204	49					61	$old;
205							}
206
207
208							sub utf16le
209							{
210	3			3	1	5	my $self = shift;
211	3	100				7	unless (ref $self) {
212	2					4	my $u = new Unicode::String;
213	2					7	$u->utf16(byteswap2($self));
214	2					3	return $u;
215							}
216	1					3	my $old = byteswap2($$self);
217	1	50				6	if (@_) {
218	0					0	$self->utf16(byteswap2(shift));
219							}
220	1					3	$old;
221							}
222
223
224							sub utf32le
225							{
226	2			2	1	4	my $self = shift;
227	2	100				5	unless (ref $self) {
228	1					4	my $u = new Unicode::String;
229	1					5	$u->ucs4(byteswap4($self));
230	1					2	return $u;
231							}
232	1					8	my $old = byteswap4($self->ucs4);
233	1	50				5	if (@_) {
234	0					0	$self->ucs4(byteswap4(shift));
235							}
236	1					3	$old;
237							}
238
239
240							sub utf7 # rfc1642
241							{
242	32			32	1	547	my $self = shift;
243	32	100				46	unless (ref $self) {
244							# act as ctor
245	10					18	my $u = new Unicode::String;
246	10					22	$u->utf7($self);
247	10					18	return $u;
248							}
249	22					16	my $old;
250	22	100				51	if (defined wantarray) {
251							# encode into $old
252	12					8	$old = "";
253	12					22	pos($$self) = 0;
254	12					14	my $len = length($$self);
255	12					18	while (pos($$self) < $len) {
256	42	100	100			308	if (($UTF7_OPTIONAL_DIRECT_CHARS &&
		50	100
			66
			66
257							$$self =~ /\G((?:\0[A-Za-z0-9\'\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\\|\}\s])+)/gc)
258							\|\| $$self =~ /\G((?:\0[A-Za-z0-9\'\,\-\.\/\:\?\s])+)/gc)
259							{
260							#print "Plain ", utf16($1)->latin1, "\n";
261	19					25	$old .= utf16($1)->latin1;
262							}
263							elsif (($UTF7_OPTIONAL_DIRECT_CHARS &&
264							$$self =~ /\G((?:[^\0].\|\0[^A-Za-z0-9\'\,\-\.\/\:\?\!\"\#\$\%\&\*\;\<\=\>\@\[\]\^\_\`\{\\|\}\s])+)/gsc)
265							\|\| $$self =~ /\G((?:[^\0].\|\0[^A-Za-z0-9\'\,\-\.\/\:\?\s])+)/gsc)
266							{
267							#print "Unplain ", utf16($1)->hex, "\n";
268	23	100				37	if ($1 eq "\0+") {
269	2					4	$old .= "+-";
270							} else {
271	21					535	require MIME::Base64;
272	21					567	my $base64 = MIME::Base64::encode($1, '');
273	21					40	$base64 =~ s/=+$//;
274	21					48	$old .= "+$base64-";
275							# XXX should we determine when the final "-" is
276							# unnecessary? depends on next char not being part
277							# of the base64 char set.
278							}
279							} else {
280	0					0	die "This should not happen, pos=" . pos($$self) .
281							": " . $self->hex . "\n";
282							}
283							}
284							}
285
286	22	100				30	if (@_) {
287							# decode
288	11					10	my $len = length($_[0]);
289	11					11	$$self = "";
290	11					14	pos($_[0]) = 0;
291	11					19	while (pos($_[0]) < $len) {
292	40	100				117	if ($_[0] =~ /\G([^+]+)/gc) {
		100
		50
		0
293	18					60	$self->append(latin1($1));
294							} elsif ($_[0] =~ /\G\+-/gc) {
295	2					4	$$self .= "\0+";
296							} elsif ($_[0] =~ /\G\+([A-Za-z0-9+\/]+)-?/gc) {
297	20					22	my $base64 = $1;
298	20					20	my $pad = length($base64) % 4;
299	20	100				32	$base64 .= "=" x (4 - $pad) if $pad;
300	20					50	require MIME::Base64;
301	20					36	$$self .= MIME::Base64::decode($base64);
302	20	50				45	if ((length($$self) % 2) != 0) {
303	0	0				0	warn "Uneven UTF7 base64-data" if $^W;
304	0					0	chop($$self); # correct it
305							}
306							} elsif ($_[0] =~ /\G\+/gc) {
307	0	0				0	warn "Bad UTF7 data escape" if $^W;
308	0					0	$$self .= "\0+";
309							} else {
310	0					0	die "This should not happen " . pos($_[0]);
311							}
312							}
313							}
314	22					37	$old;
315							}
316
317
318							sub hex
319							{
320	27			27	1	330	my $self = shift;
321	27	50				51	unless (ref $self) {
322	0					0	my $u = new Unicode::String;
323	0					0	$u->hex($self);
324	0					0	return $u;
325							}
326	27					19	my $old;
327	27	50	33			134	if (defined($$self) && defined wantarray) {
328	27					53	$old = unpack("H*", $$self);
329	27					188	$old =~ s/(....)/U+$1 /g;
330	27					94	$old =~ s/\s+$//;
331							}
332	27	50				45	if (@_) {
333	0					0	my $new = shift;
334	0					0	$new =~ tr/0-9A-Fa-f//cd; # leave only hex chars
335	0	0				0	croak("Hex string length must be multiple of four")
336							unless (length($new) % 4) == 0;
337	0					0	$$self = pack("H*", $new);
338							}
339	27					141	$old;
340							}
341
342
343							sub length
344							{
345	6			6	1	1699	my $self = shift;
346	6					56	int(length($$self) / 2);
347							}
348
349							sub byteswap
350							{
351	1			1	1	62	my $self = shift;
352	1					42	byteswap2($$self);
353	1					1	$self;
354							}
355
356							sub unpack
357							{
358	0			0	1	0	my $self = shift;
359	0					0	unpack("n*", $$self)
360							}
361
362
363							sub pack
364							{
365	6			6	1	36	my $self = shift;
366	6					16	$$self = pack("n*", @_);
367	6					6	$self;
368							}
369
370
371							sub ord
372							{
373	12			12	1	53	my $self = shift;
374	12	50				16	return () unless defined $$self;
375
376	12					11	my $array = wantarray;
377	12					10	my @ret;
378							my @chars;
379	12	100				13	if ($array) {
380	4					7	@chars = CORE::unpack("n*", $$self);
381							} else {
382	8					11	@chars = CORE::unpack("n2", $$self);
383							}
384
385	12					19	while (@chars) {
386	20					15	my $first = shift(@chars);
387	20	100	100			43	if ($first >= 0xD800 && $first <= 0xDFFF) { # surrogate
388	6					5	my $second = shift(@chars);
389							#print "F=$first S=$second\n";
390	6	100	100			28	if ($first >= 0xDC00 \|\| $second < 0xDC00 \|\| $second > 0xDFFF) {
			66
391	2					209	carp(sprintf("Bad surrogate pair (U+%04x U+%04x)",
392							$first, $second));
393	2					57	unshift(@chars, $second);
394	2					4	next;
395							}
396	4					7	push(@ret, ($first-0xD800)*0x400 + ($second-0xDC00) + 0x10000);
397							} else {
398	14					11	push(@ret, $first);
399							}
400	18	100				27	last unless $array;
401							}
402	12	100				28	$array ? @ret : $ret[0];
403							}
404
405
406							sub name
407							{
408	0			0	1	0	my $self = shift;
409	0					0	require Unicode::CharName;
410	0	0				0	if (wantarray) {
411	0					0	return map { Unicode::CharName::uname($_) } $self->ord;
	0					0
412							} else {
413	0					0	return Unicode::CharName::uname(scalar($self->ord));
414							}
415							}
416
417
418							sub chr
419							{
420	8			8	1	81	my($self,$val) = @_;
421	8	100				12	unless (ref $self) {
422							# act as ctor
423	4					8	my $u = new Unicode::String;
424	4					8	return $u->uchr($self);
425							}
426	4	100				6	if ($val > 0xFFFF) {
427							# must be represented by a surrogate pair
428	2	50				4	return undef if $val > 0x10FFFF; # Unicode limit
429	2					2	$val -= 0x10000;
430	2					5	my $h = int($val / 0x400) + 0xD800;
431	2					2	my $l = ($val % 0x400) + 0xDC00;
432	2					4	$$self = CORE::pack("n2", $h, $l);
433							} else {
434	2					40	$$self = CORE::pack("n", $val);
435							}
436	4					13	$self;
437							}
438
439
440							sub substr
441							{
442	6			6	1	27	my($self, $offset, $length, $substitute) = @_;
443	6		100			12	$offset \|\|= 0;
444	6					4	$offset *= 2;
445	6					7	my $substr;
446	6	50				7	if (defined $substitute) {
447	0	0				0	unless (UNIVERSAL::isa($substitute, 'Unicode::String')) {
448	0					0	$substitute = Unicode::String->new($substitute);
449							}
450	0	0				0	if (defined $length) {
451	0					0	$substr = substr($$self, $offset, $length*2) = $$substitute;
452							} else {
453	0					0	$substr = substr($$self, $offset) = $$substitute;
454							}
455							} else {
456	6	100				7	if (defined $length) {
457	4					6	$substr = substr($$self, $offset, $length*2);
458							} else {
459	2					4	$substr = substr($$self, $offset);
460							}
461							}
462	6					14	bless \$substr, ref($self);
463							}
464
465
466							sub index
467							{
468	5			5	1	16	my($self, $other, $pos) = @_;
469	5		100			11	$pos \|\|= 0;
470	5					5	$pos *= 2;
471	5	50				7	$other = Unicode::String->new($other) unless ref($other);
472	5		100			21	$pos++ while ($pos = index($$self, $$other, $pos)) > 0 && ($pos%2) != 0;
473	5	100				8	$pos /= 2 if $pos > 0;
474	5					5	$pos;
475							}
476
477
478							sub rindex
479							{
480	0			0	0		my($self, $other, $pos) = @_;
481	0		0				$pos \|\|= 0;
482	0						die "NYI";
483							}
484
485
486							sub chop
487							{
488	0			0	1		my $self = shift;
489	0	0					if (CORE::length $$self) {
490	0						my $chop = chop($$self);
491	0						$chop = chop($$self) . $chop;
492	0						return bless \$chop, ref($self);
493							}
494	0						undef;
495							}
496
497
498							# XXX: Ideas to be implemented
499							sub scan;
500							sub reverse;
501
502							sub lc;
503							sub lcfirst;
504							sub uc;
505							sub ucfirst;
506
507							sub split;
508							sub sprintf;
509							sub study;
510							sub tr;
511
512
513							1;
514
515							__END__