File Coverage

blib/lib/Encode/IMAPUTF7.pm

Criterion	Covered	Total	%
statement	44	50	88.0
branch	13	22	59.0
condition			n/a
subroutine	7	8	87.5
pod	1	1	100.0
total	65	81	80.2

line	stmt	bran	sub	pod	time	code
1						#
2						# $Id: IMAPUTF7.pm 3398 2009-04-21 13:18:16Z makholm $
3						#
4						package Encode::IMAPUTF7;
5	2		2		84491	use strict;
	2				4
	2				74
6	2		2		11	no warnings 'redefine';
	2				4
	2				77
7	2		2		12	use base qw(Encode::Encoding);
	2				7
	2				2735
8						__PACKAGE__->Define('IMAP-UTF-7', 'imap-utf-7');
9						our $VERSION = '1.05';
10	2		2		38421	use MIME::Base64;
	2				1705
	2				137
11	2		2		15	use Encode;
	2				4
	2				1365
12
13						#
14						# Algorithms taken from Unicode::String by Gisle Aas
15						# Code directly borrowed from Encode::Unicode::UTF7 by Dan Kogai
16						#
17
18						# Directly from the definition in RFC2060:
19						# Ampersand (\x26) is represented as a special case
20						my $re_asis = qr/(?:[\x20-\x25\x27-\x7e])/; # printable US-ASCII except "&" represents itself
21						my $re_encoded = qr/(?:[^\x20-\x7e])/; # Everything else are represented by modified base64
22
23						my $e_utf16 = find_encoding("UTF-16BE");
24
25	0		0	1	0	sub needs_lines { 1 };
26
27						sub encode($$;$) {
28	6		6		1079	my ( $obj, $str, $chk ) = @_;
29	6				68	my $len = length($str);
30	6				18	pos($str) = 0;
31	6				12	my $bytes = '';
32	6				20	while ( pos($str) < $len ) {
33	739	100			3492	if ( $str =~ /\G($re_asis+)/ogc ) {
		100
		50
34	367				1114	$bytes .= $1;
35						} elsif ( $str =~ /\G&/ogc ) {
36	5				13	$bytes .= "&-";
37						} elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
38	367				642	my $s = $1;
39	367				2452	my $base64 = encode_base64( $e_utf16->encode($s), '' );
40	367				1312	$base64 =~ s/=+$//;
41	367				741	$base64 =~ s/\//,/g;
42	367				1568	$bytes .= "&$base64-";
43						} else {
44	0				0	die "This should not happen! (pos=" . pos($str) . ")";
45						}
46						}
47	6	50			16	$_[1] = '' if $chk;
48	6				74	return $bytes;
49						}
50
51						sub decode($$;$) {
52	1		1		134	my ( $obj, $bytes, $chk ) = @_;
53	1				114	my $len = length($bytes);
54	1				2	my $str = "";
55	1				3	pos($bytes) = 0;
56	1				6	while ( pos($bytes) < $len ) {
57	726	100			2566	if ( $bytes =~ /\G([^&]+)/ogc ) {
		50
		50
		0
58	363				840	$str .= $1;
59						} elsif ( $bytes =~ /\G\&-/ogc ) {
60	0				0	$str .= "&";
61						} elsif ( $bytes =~ /\G\&([A-Za-z0-9+,]+)-?/ogsc ) {
62	363				526	my $base64 = $1;
63	363				914	$base64 =~ s/,/\//g;
64	363				682	my $pad = length($base64) % 4;
65	363	100			895	$base64 .= "=" x ( 4 - $pad ) if $pad;
66	363				2632	$str .= $e_utf16->decode( decode_base64($base64) );
67						} elsif ( $bytes =~ /\G\&/ogc ) {
68	0	0			0	$^W and warn "Bad IMAP-UTF7 data escape";
69	0				0	$str .= "&";
70						} else {
71	0				0	die "This should not happen " . pos($bytes);
72						}
73						}
74	1	50			4	$_[1] = '' if $chk;
75	1				45	return $str;
76						}
77
78
79						1;
80						__END__
81
82						=head1 NAME
83
84						Encode::IMAPUTF7 - modification of UTF-7 encoding for IMAP
85
86						=head1 SYNOPSIS
87
88						use Encode qw/encode decode/;
89
90						print encode('IMAP-UTF-7', 'RÃ©pertoire');
91						print decode('IMAP-UTF-7', R&AOk-pertoire');
92
93						=head1 ABSTRACT
94
95						IMAP mailbox names are encoded in a modified UTF7 when names contains
96						international characters outside of the printable ASCII range. The
97						modified UTF-7 encoding is defined in RFC2060 (section 5.1.3).
98
99						There is another CPAN module with same purpose, Unicode::IMAPUtf7. However, it
100						works correctly only with strings, which encoded form does not
101						contain plus sign. For example, the Cyrillic string
102						\x{043f}\x{0440}\x{0435}\x{0434}\x{043b}\x{043e}\x{0433} is represented in UTF-7 as
103						+BD8EQAQ1BDQEOwQ+BDM- Note the second plus sign 4 characters before the end.
104						Unicode::IMAPUtf7 encodes the above string as +BD8EQAQ1BDQEOwQ&BDM-
105						which is not valid modified UTF-7 (the ampersand and
106						the plus are swapped). The problem is solved by the current module,
107						which is slightly modified Encode::Unicode::UTF7 and has nothing common with
108						Unicode::IMAPUtf7.
109
110						=head1 RFC2060 - section 5.1.3 - Mailbox International Naming Convention
111
112						By convention, international mailbox names are specified using a
113						modified version of the UTF-7 encoding described in [UTF-7]. The
114						purpose of these modifications is to correct the following problems
115						with UTF-7:
116
117						1) UTF-7 uses the "+" character for shifting; this conflicts with
118						the common use of "+" in mailbox names, in particular USENET
119						newsgroup names.
120
121						2) UTF-7's encoding is BASE64 which uses the "/" character; this
122						conflicts with the use of "/" as a popular hierarchy delimiter.
123
124						3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
125						the use of "\" as a popular hierarchy delimiter.
126
127						4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
128						the use of "~" in some servers as a home directory indicator.
129
130						5) UTF-7 permits multiple alternate forms to represent the same
131						string; in particular, printable US-ASCII chararacters can be
132						represented in encoded form.
133
134						In modified UTF-7, printable US-ASCII characters except for "&"
135						represent themselves; that is, characters with octet values 0x20-0x25
136						and 0x27-0x7e. The character "&" (0x26) is represented by the two-
137						octet sequence "&-".
138
139						All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
140						Unicode 16-bit octets) are represented in modified BASE64, with a
141						further modification from [UTF-7] that "," is used instead of "/".
142						Modified BASE64 MUST NOT be used to represent any printing US-ASCII
143						character which can represent itself.
144
145						"&" is used to shift to modified BASE64 and "-" to shift back to US-
146						ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
147						is, a name that ends with a Unicode 16-bit octet MUST end with a "-
148						").
149
150						For example, here is a mailbox name which mixes English, Japanese,
151						and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
152
153						=head1 REQUESTS & BUGS
154
155						Please report any requests, suggestions or bugs via the RT bug-tracking system
156						at http://rt.cpan.org/ or email to bug-Encode-IMAPUTF7@rt.cpan.org.
157
158						http://rt.cpan.org/NoAuth/Bugs.html?Dist=Encode-IMAPUTF7 is the RT queue for Encode::IMAPUTF7.
159						Please check to see if your bug has already been reported.
160
161						=head1 COPYRIGHT
162
163						Copyright 2005 Sava Chankov
164
165						Sava Chankov, sava@cpan.org
166
167						This software may be freely copied and distributed under the same
168						terms and conditions as Perl.
169
170						=head1 AUTHORS
171
172						Peter Makholm E<lt>peter@makholm.netE<gt>, current maintainer
173
174						Sava Chankov E<lt>sava@cpan.orgE<gt>, original author
175
176						=head1 SEE ALSO
177
178						perl(1), Encode.
179
180						=cut