File Coverage

blib/lib/URI/Escape.pm

Criterion	Covered	Total	%
statement	57	57	100.0
branch	22	24	91.6
condition	3	6	50.0
subroutine	10	10	100.0
pod	3	4	75.0
total	95	101	94.0

line	stmt	bran	cond	sub	pod	time	code
1							package URI::Escape;
2
3	43			43		76704	use strict;
	43					77
	43					1231
4	43			43		210	use warnings;
	43					67
	43					2037
5
6							=head1 NAME
7
8							URI::Escape - Percent-encode and percent-decode unsafe characters
9
10							=head1 SYNOPSIS
11
12							use URI::Escape;
13							$safe = uri_escape("10% is enough\n");
14							$verysafe = uri_escape("foo", "\0-\377");
15							$str = uri_unescape($safe);
16
17							=head1 DESCRIPTION
18
19							This module provides functions to percent-encode and percent-decode URI strings as
20							defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping".
21							This is the terminology used by this module, which predates the formalization of the
22							terms by the RFC by several years.
23
24							A URI consists of a restricted set of characters. The restricted set
25							of characters consists of digits, letters, and a few graphic symbols
26							chosen from those common to most of the character encodings and input
27							facilities available to Internet users. They are made up of the
28							"unreserved" and "reserved" character sets as defined in RFC 3986.
29
30							unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
31							reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@"
32							"!" / "$" / "&" / "'" / "(" / ")"
33							/ "*" / "+" / "," / ";" / "="
34
35							In addition, any byte (octet) can be represented in a URI by an escape
36							sequence: a triplet consisting of the character "%" followed by two
37							hexadecimal digits. A byte can also be represented directly by a
38							character, using the US-ASCII character for that octet.
39
40							Some of the characters are I for use as delimiters or as
41							part of certain URI components. These must be escaped if they are to
42							be treated as ordinary data. Read RFC 3986 for further details.
43
44							The functions provided (and exported by default) from this module are:
45
46							=over 4
47
48							=item uri_escape( $string )
49
50							=item uri_escape( $string, $unsafe )
51
52							Replaces each unsafe character in the $string with the corresponding
53							escape sequence and returns the result. The $string argument should
54							be a string of bytes. The uri_escape() function will croak if given a
55							characters with code above 255. Use uri_escape_utf8() if you know you
56							have such chars or/and want chars in the 128 .. 255 range treated as
57							UTF-8.
58
59							The uri_escape() function takes an optional second argument that
60							overrides the set of characters that are to be escaped. The set is
61							specified as a string that can be used in a regular expression
62							character class (between [ ]). E.g.:
63
64							"\x00-\x1f\x7f-\xff" # all control and hi-bit characters
65							"a-z" # all lower case characters
66							"^A-Za-z" # everything not a letter
67
68							The default set of characters to be escaped is all those which are
69							I part of the C character class shown above as well
70							as the reserved characters. I.e. the default is:
71
72							"^A-Za-z0-9\-\._~"
73
74							The second argument can also be specified as a regular expression object:
75
76							qr/[^A-Za-z]/
77
78							Any strings matched by this regular expression will have all of their
79							characters escaped.
80
81							=item uri_escape_utf8( $string )
82
83							=item uri_escape_utf8( $string, $unsafe )
84
85							Works like uri_escape(), but will encode chars as UTF-8 before
86							escaping them. This makes this function able to deal with characters
87							with code above 255 in $string. Note that chars in the 128 .. 255
88							range will be escaped differently by this function compared to what
89							uri_escape() would. For chars in the 0 .. 127 range there is no
90							difference.
91
92							Equivalent to:
93
94							utf8::encode($string);
95							my $uri = uri_escape($string);
96
97							Note: JavaScript has a function called escape() that produces the
98							sequence "%uXXXX" for chars in the 256 .. 65535 range. This function
99							has really nothing to do with URI escaping but some folks got confused
100							since it "does the right thing" in the 0 .. 255 range. Because of
101							this you sometimes see "URIs" with these kind of escapes. The
102							JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
103
104							=item uri_unescape($string,...)
105
106							Returns a string with each %XX sequence replaced with the actual byte
107							(octet).
108
109							This does the same as:
110
111							$string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
112
113							but does not modify the string in-place as this RE would. Using the
114							uri_unescape() function instead of the RE might make the code look
115							cleaner and is a few characters less to type.
116
117							In a simple benchmark test I did,
118							calling the function (instead of the inline RE above) if a few chars
119							were unescaped was something like 40% slower, and something like 700% slower if none were. If
120							you are going to unescape a lot of times it might be a good idea to
121							inline the RE.
122
123							If the uri_unescape() function is passed multiple strings, then each
124							one is returned unescaped.
125
126							=back
127
128							The module can also export the C<%escapes> hash, which contains the
129							mapping from all 256 bytes to the corresponding escape codes. Lookup
130							in this hash is faster than evaluating C
131							each time.
132
133							=head1 SEE ALSO
134
135							L
136
137
138							=head1 COPYRIGHT
139
140							Copyright 1995-2004 Gisle Aas.
141
142							This program is free software; you can redistribute it and/or modify
143							it under the same terms as Perl itself.
144
145							=cut
146
147	43			43		277	use Exporter 5.57 'import';
	43					984
	43					3630
148							our %escapes;
149							our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
150							our @EXPORT_OK = qw(%escapes);
151							our $VERSION = '5.20';
152
153	43			43		312	use Carp ();
	43					132
	43					14904
154
155							# Build a char->hex map
156							for (0..255) {
157							$escapes{chr($_)} = sprintf("%%%02X", $_);
158							}
159
160							my %subst; # compiled patterns
161
162							my %Unsafe = (
163							RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
164							RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
165							);
166
167							sub uri_escape {
168	29			29	1	15831	my($text, $patn) = @_;
169	29	100				126	return undef unless defined $text;
170	28					56	my $re;
171	28	100				96	if (defined $patn){
172	20	100				127	if (ref $patn eq 'Regexp') {
173	2					52	$text =~ s{($patn)}{
174	4		33			43	join('', map +($escapes{$_} \|\| _fail_hi($_)), split //, "$1")
175							}ge;
176	2					17	return $text;
177							}
178	18					74	$re = $subst{$patn};
179	18	100				97	if (!defined $re) {
180	15					48	$re = $patn;
181							# we need to escape the [] characters, except for those used in
182							# posix classes. if they are prefixed by a backslash, allow them
183							# through unmodified.
184	15					184	$re =~ s{(\[:\w+:\])\|(\\)?([\[\]]\|\\\z)}{
185	13	50				148	defined $1 ? $1 : defined $2 ? "$2$3" : "\\$3"
		100
186							}ge;
187	15	100				50	eval {
188							# disable the warnings here, since they will trigger later
189							# when used, and we only want them to appear once per call,
190							# but every time the same pattern is used.
191	43			43		313	no warnings 'regexp';
	43					82
	43					22549
192	15					661	$re = $subst{$patn} = qr{[$re]};
193	14					87	1;
194							} or Carp::croak("uri_escape: $@");
195							}
196							}
197							else {
198	8					19	$re = $Unsafe{RFC3986};
199							}
200	25	100				609	$text =~ s/($re)/$escapes{$1} \|\| _fail_hi($1)/ge;
	257					1111
201	24					234	$text;
202							}
203
204							sub _fail_hi {
205	1			1		4	my $chr = shift;
206	1					123	Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr));
207							}
208
209							sub uri_escape_utf8 {
210	2			2	1	5	my $text = shift;
211	2	50				6	return undef unless defined $text;
212	2					8	utf8::encode($text);
213	2					5	return uri_escape($text, @_);
214							}
215
216							sub uri_unescape {
217							# Note from RFC1630: "Sequences which start with a percent sign
218							# but are not followed by two hexadecimal characters are reserved
219							# for future extension"
220	1073			1073	1	1653	my $str = shift;
221	1073	100	66			2255	if (@_ && wantarray) {
222							# not executed for the common case of a single argument
223	1					4	my @str = ($str, @_); # need to copy
224	1					3	for (@str) {
225	3					13	s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	3					13
226							}
227	1					8	return @str;
228							}
229	1072	100				2392	$str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str;
	892					2270
230	1072					3167	$str;
231							}
232
233							# XXX FIXME escape_char is buggy as it assigns meaning to the string's storage format.
234							sub escape_char {
235							# Old versions of utf8::is_utf8() didn't properly handle magical vars (e.g. $1).
236							# The following forces a fetch to occur beforehand.
237	602			602	0	1113	my $dummy = substr($_[0], 0, 0);
238
239	602	100				1362	if (utf8::is_utf8($_[0])) {
240	134					187	my $s = shift;
241	134					227	utf8::encode($s);
242	134					203	unshift(@_, $s);
243							}
244
245	602					2715	return join '', @URI::Escape::escapes{split //, $_[0]};
246							}
247
248							1;