File Coverage

blib/lib/URI/Escape.pm
Criterion Covered Total %
statement 57 57 100.0
branch 22 24 91.6
condition 3 6 50.0
subroutine 10 10 100.0
pod 3 4 75.0
total 95 101 94.0


line stmt bran cond sub pod time code
1             package URI::Escape;
2              
3 43     43   76704 use strict;
  43         77  
  43         1231  
4 43     43   210 use warnings;
  43         67  
  43         2037  
5              
6             =head1 NAME
7              
8             URI::Escape - Percent-encode and percent-decode unsafe characters
9              
10             =head1 SYNOPSIS
11              
12             use URI::Escape;
13             $safe = uri_escape("10% is enough\n");
14             $verysafe = uri_escape("foo", "\0-\377");
15             $str = uri_unescape($safe);
16              
17             =head1 DESCRIPTION
18              
19             This module provides functions to percent-encode and percent-decode URI strings as
20             defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping".
21             This is the terminology used by this module, which predates the formalization of the
22             terms by the RFC by several years.
23              
24             A URI consists of a restricted set of characters. The restricted set
25             of characters consists of digits, letters, and a few graphic symbols
26             chosen from those common to most of the character encodings and input
27             facilities available to Internet users. They are made up of the
28             "unreserved" and "reserved" character sets as defined in RFC 3986.
29              
30             unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
31             reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@"
32             "!" / "$" / "&" / "'" / "(" / ")"
33             / "*" / "+" / "," / ";" / "="
34              
35             In addition, any byte (octet) can be represented in a URI by an escape
36             sequence: a triplet consisting of the character "%" followed by two
37             hexadecimal digits. A byte can also be represented directly by a
38             character, using the US-ASCII character for that octet.
39              
40             Some of the characters are I for use as delimiters or as
41             part of certain URI components. These must be escaped if they are to
42             be treated as ordinary data. Read RFC 3986 for further details.
43              
44             The functions provided (and exported by default) from this module are:
45              
46             =over 4
47              
48             =item uri_escape( $string )
49              
50             =item uri_escape( $string, $unsafe )
51              
52             Replaces each unsafe character in the $string with the corresponding
53             escape sequence and returns the result. The $string argument should
54             be a string of bytes. The uri_escape() function will croak if given a
55             characters with code above 255. Use uri_escape_utf8() if you know you
56             have such chars or/and want chars in the 128 .. 255 range treated as
57             UTF-8.
58              
59             The uri_escape() function takes an optional second argument that
60             overrides the set of characters that are to be escaped. The set is
61             specified as a string that can be used in a regular expression
62             character class (between [ ]). E.g.:
63              
64             "\x00-\x1f\x7f-\xff" # all control and hi-bit characters
65             "a-z" # all lower case characters
66             "^A-Za-z" # everything not a letter
67              
68             The default set of characters to be escaped is all those which are
69             I part of the C character class shown above as well
70             as the reserved characters. I.e. the default is:
71              
72             "^A-Za-z0-9\-\._~"
73              
74             The second argument can also be specified as a regular expression object:
75              
76             qr/[^A-Za-z]/
77              
78             Any strings matched by this regular expression will have all of their
79             characters escaped.
80              
81             =item uri_escape_utf8( $string )
82              
83             =item uri_escape_utf8( $string, $unsafe )
84              
85             Works like uri_escape(), but will encode chars as UTF-8 before
86             escaping them. This makes this function able to deal with characters
87             with code above 255 in $string. Note that chars in the 128 .. 255
88             range will be escaped differently by this function compared to what
89             uri_escape() would. For chars in the 0 .. 127 range there is no
90             difference.
91              
92             Equivalent to:
93              
94             utf8::encode($string);
95             my $uri = uri_escape($string);
96              
97             Note: JavaScript has a function called escape() that produces the
98             sequence "%uXXXX" for chars in the 256 .. 65535 range. This function
99             has really nothing to do with URI escaping but some folks got confused
100             since it "does the right thing" in the 0 .. 255 range. Because of
101             this you sometimes see "URIs" with these kind of escapes. The
102             JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
103              
104             =item uri_unescape($string,...)
105              
106             Returns a string with each %XX sequence replaced with the actual byte
107             (octet).
108              
109             This does the same as:
110              
111             $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
112              
113             but does not modify the string in-place as this RE would. Using the
114             uri_unescape() function instead of the RE might make the code look
115             cleaner and is a few characters less to type.
116              
117             In a simple benchmark test I did,
118             calling the function (instead of the inline RE above) if a few chars
119             were unescaped was something like 40% slower, and something like 700% slower if none were. If
120             you are going to unescape a lot of times it might be a good idea to
121             inline the RE.
122              
123             If the uri_unescape() function is passed multiple strings, then each
124             one is returned unescaped.
125              
126             =back
127              
128             The module can also export the C<%escapes> hash, which contains the
129             mapping from all 256 bytes to the corresponding escape codes. Lookup
130             in this hash is faster than evaluating C
131             each time.
132              
133             =head1 SEE ALSO
134              
135             L
136              
137              
138             =head1 COPYRIGHT
139              
140             Copyright 1995-2004 Gisle Aas.
141              
142             This program is free software; you can redistribute it and/or modify
143             it under the same terms as Perl itself.
144              
145             =cut
146              
147 43     43   277 use Exporter 5.57 'import';
  43         984  
  43         3630  
148             our %escapes;
149             our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
150             our @EXPORT_OK = qw(%escapes);
151             our $VERSION = '5.20';
152              
153 43     43   312 use Carp ();
  43         132  
  43         14904  
154              
155             # Build a char->hex map
156             for (0..255) {
157             $escapes{chr($_)} = sprintf("%%%02X", $_);
158             }
159              
160             my %subst; # compiled patterns
161              
162             my %Unsafe = (
163             RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
164             RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
165             );
166              
167             sub uri_escape {
168 29     29 1 15831 my($text, $patn) = @_;
169 29 100       126 return undef unless defined $text;
170 28         56 my $re;
171 28 100       96 if (defined $patn){
172 20 100       127 if (ref $patn eq 'Regexp') {
173 2         52 $text =~ s{($patn)}{
174 4   33     43 join('', map +($escapes{$_} || _fail_hi($_)), split //, "$1")
175             }ge;
176 2         17 return $text;
177             }
178 18         74 $re = $subst{$patn};
179 18 100       97 if (!defined $re) {
180 15         48 $re = $patn;
181             # we need to escape the [] characters, except for those used in
182             # posix classes. if they are prefixed by a backslash, allow them
183             # through unmodified.
184 15         184 $re =~ s{(\[:\w+:\])|(\\)?([\[\]]|\\\z)}{
185 13 50       148 defined $1 ? $1 : defined $2 ? "$2$3" : "\\$3"
    100          
186             }ge;
187 15 100       50 eval {
188             # disable the warnings here, since they will trigger later
189             # when used, and we only want them to appear once per call,
190             # but every time the same pattern is used.
191 43     43   313 no warnings 'regexp';
  43         82  
  43         22549  
192 15         661 $re = $subst{$patn} = qr{[$re]};
193 14         87 1;
194             } or Carp::croak("uri_escape: $@");
195             }
196             }
197             else {
198 8         19 $re = $Unsafe{RFC3986};
199             }
200 25 100       609 $text =~ s/($re)/$escapes{$1} || _fail_hi($1)/ge;
  257         1111  
201 24         234 $text;
202             }
203              
204             sub _fail_hi {
205 1     1   4 my $chr = shift;
206 1         123 Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr));
207             }
208              
209             sub uri_escape_utf8 {
210 2     2 1 5 my $text = shift;
211 2 50       6 return undef unless defined $text;
212 2         8 utf8::encode($text);
213 2         5 return uri_escape($text, @_);
214             }
215              
216             sub uri_unescape {
217             # Note from RFC1630: "Sequences which start with a percent sign
218             # but are not followed by two hexadecimal characters are reserved
219             # for future extension"
220 1073     1073 1 1653 my $str = shift;
221 1073 100 66     2255 if (@_ && wantarray) {
222             # not executed for the common case of a single argument
223 1         4 my @str = ($str, @_); # need to copy
224 1         3 for (@str) {
225 3         13 s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
  3         13  
226             }
227 1         8 return @str;
228             }
229 1072 100       2392 $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str;
  892         2270  
230 1072         3167 $str;
231             }
232              
233             # XXX FIXME escape_char is buggy as it assigns meaning to the string's storage format.
234             sub escape_char {
235             # Old versions of utf8::is_utf8() didn't properly handle magical vars (e.g. $1).
236             # The following forces a fetch to occur beforehand.
237 602     602 0 1113 my $dummy = substr($_[0], 0, 0);
238              
239 602 100       1362 if (utf8::is_utf8($_[0])) {
240 134         187 my $s = shift;
241 134         227 utf8::encode($s);
242 134         203 unshift(@_, $s);
243             }
244              
245 602         2715 return join '', @URI::Escape::escapes{split //, $_[0]};
246             }
247              
248             1;