File Coverage

blib/lib/utf8/all.pm
Criterion Covered Total %
statement 96 98 97.9
branch 39 52 75.0
condition 9 12 75.0
subroutine 16 16 100.0
pod n/a
total 160 178 89.8


line stmt bran cond sub pod time code
1             package utf8::all;
2 21     21   254190 use strict;
  21         46  
  21         519  
3 21     21   96 use warnings;
  21         39  
  21         485  
4 21     21   410 use 5.010; # state
  21         73  
5              
6             # ABSTRACT: turn on Unicode - all of it
7             our $VERSION = '0.023'; # VERSION
8              
9             #pod =head1 SYNOPSIS
10             #pod
11             #pod use utf8::all; # Turn on UTF-8, all of it.
12             #pod
13             #pod open my $in, '<', 'contains-utf8'; # UTF-8 already turned on here
14             #pod print length 'føø bār'; # 7 UTF-8 characters
15             #pod my $utf8_arg = shift @ARGV; # @ARGV is UTF-8 too (only for main)
16             #pod
17             #pod =head1 DESCRIPTION
18             #pod
19             #pod The C pragma tells the Perl parser to allow UTF-8 in the
20             #pod program text in the current lexical scope. This also means that you
21             #pod can now use literal Unicode characters as part of strings, variable
22             #pod names, and regular expressions.
23             #pod
24             #pod C goes further:
25             #pod
26             #pod =over 4
27             #pod
28             #pod =item *
29             #pod
30             #pod L|charnames> are imported so C<\N{...}> sequences can be
31             #pod used to compile Unicode characters based on names.
32             #pod
33             #pod =item *
34             #pod
35             #pod On Perl C or higher, the C is
36             #pod enabled.
37             #pod
38             #pod =item *
39             #pod
40             #pod C and C are enabled on Perl
41             #pod C<5.16.0> and higher.
42             #pod
43             #pod =item *
44             #pod
45             #pod Filehandles are opened with UTF-8 encoding turned on by default
46             #pod (including C, C, and C when C is
47             #pod used from the C
package). Meaning that they automatically
48             #pod convert UTF-8 octets to characters and vice versa. If you I
49             #pod want UTF-8 for a particular filehandle, you'll have to set C
50             #pod $filehandle>.
51             #pod
52             #pod =item *
53             #pod
54             #pod C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when
55             #pod C is used from the C
package). This is similar to the
56             #pod behaviour of the C<-CA> perl command-line switch (see L).
57             #pod
58             #pod =item *
59             #pod
60             #pod C, C, C (including the C and
61             #pod backtick operators), and L|perlfunc/glob> (including the C<<
62             #pod <> >> operator) now all work with and return Unicode characters
63             #pod instead of (UTF-8) octets (again only when C is used from
64             #pod the C
package).
65             #pod
66             #pod =back
67             #pod
68             #pod =head2 Lexical Scope
69             #pod
70             #pod The pragma is lexically-scoped, so you can do the following if you had
71             #pod some reason to:
72             #pod
73             #pod {
74             #pod use utf8::all;
75             #pod open my $out, '>', 'outfile';
76             #pod my $utf8_str = 'føø bār';
77             #pod print length $utf8_str, "\n"; # 7
78             #pod print $out $utf8_str; # out as utf8
79             #pod }
80             #pod open my $in, '<', 'outfile'; # in as raw
81             #pod my $text = do { local $/; <$in>};
82             #pod print length $text, "\n"; # 10, not 7!
83             #pod
84             #pod Instead of lexical scoping, you can also use C to turn
85             #pod off the effects.
86             #pod
87             #pod Note that the effect on C<@ARGV> and the C, C, and
88             #pod C file handles is always global and can not be undone!
89             #pod
90             #pod =head2 Enabling/Disabling Global Features
91             #pod
92             #pod As described above, the default behaviour of C is to
93             #pod convert C<@ARGV> and to open the C, C, and C
94             #pod file handles with UTF-8 encoding, and override the C and
95             #pod C functions and C operators when C is used
96             #pod from the C
package.
97             #pod
98             #pod If you want to disable these features even when C is used
99             #pod from the C
package, add the option C (or
100             #pod C) to the use line. E.g.:
101             #pod
102             #pod use utf8::all 'NO-GLOBAL';
103             #pod
104             #pod If on the other hand you want to enable these global effects even when
105             #pod C was used from another package than C
, use the
106             #pod option C on the use line:
107             #pod
108             #pod use utf8::all 'GLOBAL';
109             #pod
110             #pod =head2 UTF-8 Errors
111             #pod
112             #pod C will handle invalid code points (i.e., utf-8 that does
113             #pod not map to a valid unicode "character"), as a fatal error.
114             #pod
115             #pod For C, C, and C, one can change this
116             #pod behaviour by setting the attribute L.
117             #pod
118             #pod =head1 COMPATIBILITY
119             #pod
120             #pod The filesystems of Dos, Windows, and OS/2 do not (fully) support
121             #pod UTF-8. The C and C functions and C operators
122             #pod will therefore not be replaced on these systems.
123             #pod
124             #pod =head1 SEE ALSO
125             #pod
126             #pod =over 4
127             #pod
128             #pod =item *
129             #pod
130             #pod L for fully utf-8 aware File::Find functions.
131             #pod
132             #pod =item *
133             #pod
134             #pod L for fully utf-8 aware Cwd functions.
135             #pod
136             #pod =back
137             #pod
138             #pod =cut
139              
140 21     21   8071 use Import::Into;
  21         45065  
  21         664  
141 21     21   7981 use parent qw(Encode charnames utf8 open warnings feature);
  21         4795  
  21         135  
142 21     21   731705 use Symbol qw(qualify_to_ref);
  21         12629  
  21         1503  
143 21     21   151 use Config;
  21         49  
  21         1332  
144              
145             # Holds the pointers to the original version of redefined functions
146             state %_orig_functions;
147              
148             # Current (i.e., this) package
149             my $current_package = __PACKAGE__;
150              
151             require Carp;
152             $Carp::Internal{$current_package}++; # To get warnings reported at correct caller level
153              
154             #pod =attr $utf8::all::UTF8_CHECK
155             #pod
156             #pod By default C marks decoding errors as fatal (default value
157             #pod for this setting is C). If you want, you can change this by
158             #pod setting C<$utf8::all::UTF8_CHECK>. The value C reports
159             #pod the encoding errors as warnings, and C will completely
160             #pod ignore them. Please see L for details. Note: C is
161             #pod I enforced.
162             #pod
163             #pod Important: Only controls the handling of decoding errors in C,
164             #pod C, and C.
165             #pod
166             #pod =cut
167              
168 21     21   125 use Encode ();
  21         46  
  21         335  
169 21     21   8313 use PerlIO::utf8_strict;
  21         8230  
  21         5530  
170              
171             our $UTF8_CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC; # Die on encoding errors
172              
173             # UTF-8 Encoding object
174             my $_UTF8 = Encode::find_encoding('UTF-8');
175              
176             sub import {
177             # Enable features/pragmas in calling package
178 29     29   1544 my $target = caller;
179              
180             # Enable global effects be default only when imported from main package
181 29         103 my $no_global = $target ne 'main';
182              
183             # Override global?
184 29 100 66     160 if (defined $_[1] && $_[1] =~ /^(?:(NO-)?GLOBAL|LEXICAL-ONLY)$/i) {
185 2         6 $no_global = $_[1] !~ /^GLOBAL$/i;
186 2         6 splice(@_, 1, 1); # Remove option from import's arguments
187             }
188              
189 29         211 'utf8'->import::into($target);
190 29         6288 'open'->import::into($target, 'IO' => ':utf8_strict');
191              
192             # use open ':std' only works with some encodings.
193 29         6509 state $have_encoded_std = 0;
194 29 100 100     241 unless ($no_global || $have_encoded_std++) {
195 18         134 binmode STDERR, ':utf8_strict';
196 18         75 binmode STDOUT, ':utf8_strict';
197 18         77 binmode STDIN, ':utf8_strict';
198             }
199              
200 29         164 'charnames'->import::into($target, qw{:full :short});
201 29         9682 'warnings'->import::into($target, qw{FATAL utf8});
202 29 50       6058 'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0;
203 29 50       7039 'feature'->import::into($target, qw{unicode_eval fc}) if $^V >= v5.16.0;
204              
205 29 100 66     5699 unless ($no_global || $^O =~ /MSWin32|cygwin|dos|os2/) {
206 21     21   214 no strict qw(refs); ## no critic (TestingAndDebugging::ProhibitNoStrict)
  21         48  
  21         739  
207 21     21   110 no warnings qw(redefine);
  21         51  
  21         12200  
208              
209             # Replace readdir with utf8 aware version
210 25         71 *{$target . '::readdir'} = \&_utf8_readdir;
  25         131  
211              
212             # Replace readdir with utf8 aware version
213 25         74 *{$target . '::readlink'} = \&_utf8_readlink;
  25         97  
214              
215             # Replace glob with utf8 aware version
216 25         52 *{$target . '::glob'} = \&_utf8_glob;
  25         88  
217              
218             # Set compiler hint to encode/decode in the redefined functions
219 25         101 $^H{'utf8::all'} = 1;
220             }
221              
222             # Make @ARGV utf-8 when, unless perl was launched with the -CA
223             # flag as this already has @ARGV decoded automatically. -CA is
224             # active if the the fifth bit (32) of the ${^UNICODE} variable is
225             # set. (see perlrun on the -C command switch for details about
226             # ${^UNICODE})
227 29 100 66     189 unless ($no_global || (${^UNICODE} & 32)) {
228 25         54 state $have_encoded_argv = 0;
229 25 100       104 if (!$have_encoded_argv++) {
230 18 50       99 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
231 18 50       86 $_ = ($_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_) for @ARGV;
232             }
233             }
234              
235 29         10254 return;
236             }
237              
238             sub unimport { ## no critic (Subroutines::ProhibitBuiltinHomonyms)
239             # Disable features/pragmas in calling package
240             # Note: Does NOT undo the effect on @ARGV,
241             # nor on the STDIN, STDOUT, and STDERR file handles!
242             # These effects are always "global".
243              
244 1     1   8 my $target = caller;
245 1         5 'utf8'->unimport::out_of($target);
246 1         156 'open'->import::into($target, qw{IO :bytes});
247              
248 1 50       193 unless ($^O =~ /MSWin32|cygwin|dos|os2/) {
249 1         3 $^H{'utf8::all'} = 0; # Reset compiler hint
250             }
251              
252 1         1489 return;
253             }
254              
255             sub _utf8_readdir(*) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
256 6     6   17478 my $pre_handle = shift;
257 6         73 my $hints = (caller 0)[10];
258 6 100       30 my $handle = ref($pre_handle) ? $pre_handle : qualify_to_ref($pre_handle, caller);
259 6 100       64 if (not $hints->{'utf8::all'}) {
260 1         11 return CORE::readdir($handle);
261             } else {
262 5 50       30 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
263 5 100       12 if (wantarray) {
264 4 50       50 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::readdir($handle);
  16         152  
265             } else {
266 1         5 my $r = CORE::readdir($handle);
267 1 50       9 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
268             }
269             }
270             }
271              
272             sub _utf8_readlink(_) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
273 4     4   2456 my $arg = shift;
274 4         27 my $hints = (caller 0)[10];
275 4 50       131 if (not $hints->{'utf8::all'}) {
276 0         0 return CORE::readlink($arg);
277             } else {
278 4 100       22 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
279 4 50       43 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
280 3         93 my $r = CORE::readlink($arg);
281 3 100       47 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
282             }
283             }
284              
285             sub _utf8_glob {
286 9     9   3441 my $arg = $_[0]; # Making this a lexical somehow is important!
287 9         66 my $hints = (caller 0)[10];
288 9 50       173 if (not $hints->{'utf8::all'}) {
289 0         0 return CORE::glob($arg);
290             } else {
291 9 100       41 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
292 9 50       73 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
293 8 100       81 if (wantarray) {
294 3 50       101 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::glob($arg);
  6         68  
295             } else {
296 5         80 my $r = CORE::glob($arg);
297 5 100       40 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
298             }
299             }
300             }
301              
302             #pod =head1 INTERACTION WITH AUTODIE
303             #pod
304             #pod If you use L, which is a great idea, you need to use at least
305             #pod version B<2.12>, released on L
306             #pod 2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>.
307             #pod Otherwise, autodie obliterates the IO layers set by the L
308             #pod pragma. See L
309             #pod #54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L
310             #pod #7|https://github.com/doherty/utf8-all/issues/7>.
311             #pod
312             #pod =cut
313              
314             1;
315              
316             __END__