File Coverage

blib/lib/utf8/all.pm
Criterion Covered Total %
statement 91 93 97.8
branch 34 48 70.8
condition 3 3 100.0
subroutine 16 16 100.0
pod n/a
total 144 160 90.0


line stmt bran cond sub pod time code
1             package utf8::all;
2 18     18   645601 use strict;
  18         40  
  18         477  
3 18     18   91 use warnings;
  18         37  
  18         656  
4 18     18   339 use 5.010; # state
  18         63  
5              
6             # ABSTRACT: turn on Unicode - all of it
7             our $VERSION = '0.022'; # VERSION
8              
9             #pod =head1 SYNOPSIS
10             #pod
11             #pod use utf8::all; # Turn on UTF-8, all of it.
12             #pod
13             #pod open my $in, '<', 'contains-utf8'; # UTF-8 already turned on here
14             #pod print length 'føø bār'; # 7 UTF-8 characters
15             #pod my $utf8_arg = shift @ARGV; # @ARGV is UTF-8 too (only for main)
16             #pod
17             #pod =head1 DESCRIPTION
18             #pod
19             #pod The C pragma tells the Perl parser to allow UTF-8 in the
20             #pod program text in the current lexical scope. This also means that you
21             #pod can now use literal Unicode characters as part of strings, variable
22             #pod names, and regular expressions.
23             #pod
24             #pod C goes further:
25             #pod
26             #pod =over 4
27             #pod
28             #pod =item *
29             #pod
30             #pod L|charnames> are imported so C<\N{...}> sequences can be
31             #pod used to compile Unicode characters based on names.
32             #pod
33             #pod =item *
34             #pod
35             #pod On Perl C or higher, the C is
36             #pod enabled.
37             #pod
38             #pod =item *
39             #pod
40             #pod C and C are enabled on Perl
41             #pod C<5.16.0> and higher.
42             #pod
43             #pod =item *
44             #pod
45             #pod Filehandles are opened with UTF-8 encoding turned on by default
46             #pod (including STDIN, STDOUT, STDERR). Meaning that they automatically
47             #pod convert UTF-8 octets to characters and vice versa. If you I
48             #pod want UTF-8 for a particular filehandle, you'll have to set C
49             #pod $filehandle>.
50             #pod
51             #pod =item *
52             #pod
53             #pod C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when
54             #pod C is used from the main package). This is similar to the
55             #pod behaviour of the C<-CA> perl command-line switch (see L).
56             #pod
57             #pod =item *
58             #pod
59             #pod C, C, C (including the C and
60             #pod backtick operators), and L|perlfunc/glob> (including the C<< <>
61             #pod >> operator) now all work with and return Unicode characters instead
62             #pod of (UTF-8) octets.
63             #pod
64             #pod =back
65             #pod
66             #pod =head2 Lexical Scope
67             #pod
68             #pod The pragma is lexically-scoped, so you can do the following if you had
69             #pod some reason to:
70             #pod
71             #pod {
72             #pod use utf8::all;
73             #pod open my $out, '>', 'outfile';
74             #pod my $utf8_str = 'føø bār';
75             #pod print length $utf8_str, "\n"; # 7
76             #pod print $out $utf8_str; # out as utf8
77             #pod }
78             #pod open my $in, '<', 'outfile'; # in as raw
79             #pod my $text = do { local $/; <$in>};
80             #pod print length $text, "\n"; # 10, not 7!
81             #pod
82             #pod Instead of lexical scoping, you can also use C to turn
83             #pod off the effects.
84             #pod
85             #pod Note that the effect on C<@ARGV> and the C, C, and
86             #pod C file handles is always global!
87             #pod
88             #pod =head2 UTF-8 Errors
89             #pod
90             #pod C will handle invalid code points (i.e., utf-8 that does
91             #pod not map to a valid unicode "character"), as a fatal error.
92             #pod
93             #pod For C, C, and C, one can change this
94             #pod behaviour by setting the attribute L.
95             #pod
96             #pod =head1 COMPATIBILITY
97             #pod
98             #pod The filesystems of Dos, Windows, and OS/2 do not (fully) support
99             #pod UTF-8. The C and C functions and C operators
100             #pod will therefore not be replaced on these systems.
101             #pod
102             #pod =head1 SEE ALSO
103             #pod
104             #pod =over 4
105             #pod
106             #pod =item *
107             #pod
108             #pod L for fully utf-8 aware File::Find functions.
109             #pod
110             #pod =item *
111             #pod
112             #pod L for fully utf-8 aware Cwd functions.
113             #pod
114             #pod =back
115             #pod
116             #pod =cut
117              
118 18     18   7139 use Import::Into;
  18         38561  
  18         518  
119 18     18   6127 use parent qw(Encode charnames utf8 open warnings feature);
  18         4039  
  18         152  
120 18     18   2236111 use Symbol qw(qualify_to_ref);
  18         12015  
  18         1384  
121 18     18   139 use Config;
  18         50  
  18         1181  
122              
123             # Holds the pointers to the original version of redefined functions
124             state %_orig_functions;
125              
126             # Current (i.e., this) package
127             my $current_package = __PACKAGE__;
128              
129             require Carp;
130             $Carp::Internal{$current_package}++; # To get warnings reported at correct caller level
131              
132             #pod =attr $utf8::all::UTF8_CHECK
133             #pod
134             #pod By default C marks decoding errors as fatal (default value
135             #pod for this setting is C). If you want, you can change this by
136             #pod setting C<$utf8::all::UTF8_CHECK>. The value C reports
137             #pod the encoding errors as warnings, and C will completely
138             #pod ignore them. Please see L for details. Note: C is
139             #pod I enforced.
140             #pod
141             #pod Important: Only controls the handling of decoding errors in C,
142             #pod C, and C.
143             #pod
144             #pod =cut
145              
146 18     18   118 use Encode ();
  18         43  
  18         335  
147 18     18   7346 use PerlIO::utf8_strict;
  18         7482  
  18         3903  
148              
149             our $UTF8_CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC; # Die on encoding errors
150              
151             # UTF-8 Encoding object
152             my $_UTF8 = Encode::find_encoding('UTF-8');
153              
154             sub import {
155             # Enable features/pragmas in calling package
156 26     26   1343 my $target = caller;
157              
158 26         229 'utf8'->import::into($target);
159 26         6280 'open'->import::into($target, 'IO' => ':utf8_strict');
160              
161             # use open ':std' only works with some encodings.
162 26         6613 state $have_encoded_std = 0;
163 26 100       203 if (!$have_encoded_std++) {
164 18         146 binmode STDERR, ':utf8_strict';
165 18         89 binmode STDOUT, ':utf8_strict';
166 18         88 binmode STDIN, ':utf8_strict';
167             }
168              
169 26         164 'charnames'->import::into($target, qw{:full :short});
170 26         10362 'warnings'->import::into($target, qw{FATAL utf8});
171 26 50       6083 'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0;
172 26 50       7578 'feature'->import::into($target, qw{unicode_eval fc}) if $^V >= v5.16.0;
173              
174 26 50       5891 unless ($^O =~ /MSWin32|cygwin|dos|os2/) {
175 18     18   194 no strict qw(refs); ## no critic (TestingAndDebugging::ProhibitNoStrict)
  18         44  
  18         665  
176 18     18   98 no warnings qw(redefine);
  18         42  
  18         10913  
177              
178             # Replace readdir with utf8 aware version
179 26         86 *{$target . '::readdir'} = \&_utf8_readdir;
  26         169  
180              
181             # Replace readdir with utf8 aware version
182 26         74 *{$target . '::readlink'} = \&_utf8_readlink;
  26         148  
183              
184             # Replace glob with utf8 aware version
185 26         90 *{$target . '::glob'} = \&_utf8_glob;
  26         170  
186              
187             # Set compiler hint to encode/decode in the redefined functions
188 26         124 $^H{'utf8::all'} = 1;
189             }
190              
191             # Make @ARGV utf-8 when called from the main package, unless perl was launched
192             # with the -CA flag as this already has @ARGV decoded automatically.
193             # -CA is active if the the fifth bit (32) of the ${^UNICODE} variable is set.
194             # (see perlrun on the -C command switch for details about ${^UNICODE})
195 26 50       137 if (!(${^UNICODE} & 32)) {
196 26         68 state $have_encoded_argv = 0;
197 26 100 100     215 if ($target eq 'main' && !$have_encoded_argv++) {
198 17 50       116 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
199 17 50       95 $_ = ($_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_) for @ARGV;
200             }
201             }
202              
203 26         9787 return;
204             }
205              
206             sub unimport { ## no critic (Subroutines::ProhibitBuiltinHomonyms)
207             # Disable features/pragmas in calling package
208             # Note: Does NOT undo the effect on @ARGV,
209             # nor on the STDIN, STDOUT, and STDERR file handles!
210             # These effects are always "global".
211              
212 1     1   8 my $target = caller;
213 1         6 'utf8'->unimport::out_of($target);
214 1         162 'open'->import::into($target, qw{IO :bytes});
215              
216 1         192 $^H{'utf8::all'} = 0; # Reset compiler hint
217              
218 1         1535 return;
219             }
220              
221             sub _utf8_readdir(*) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
222 6     6   24951 my $pre_handle = shift;
223 6         74 my $hints = (caller 0)[10];
224 6 100       32 my $handle = ref($pre_handle) ? $pre_handle : qualify_to_ref($pre_handle, caller);
225 6 100       64 if (not $hints->{'utf8::all'}) {
226 1         10 return CORE::readdir($handle);
227             } else {
228 5 50       35 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
229 5 100       17 if (wantarray) {
230 4 50       64 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::readdir($handle);
  16         193  
231             } else {
232 1         4 my $r = CORE::readdir($handle);
233 1 50       8 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
234             }
235             }
236             }
237              
238             sub _utf8_readlink(_) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
239 4     4   2662 my $arg = shift;
240 4         28 my $hints = (caller 0)[10];
241 4 50       124 if (not $hints->{'utf8::all'}) {
242 0         0 return CORE::readlink($arg);
243             } else {
244 4 100       25 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
245 4 50       41 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
246 3         88 my $r = CORE::readlink($arg);
247 3 100       36 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
248             }
249             }
250              
251             sub _utf8_glob {
252 9     9   3183 my $arg = $_[0]; # Making this a lexical somehow is important!
253 9         49 my $hints = (caller 0)[10];
254 9 50       147 if (not $hints->{'utf8::all'}) {
255 0         0 return CORE::glob($arg);
256             } else {
257 9 100       35 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
258 9 50       67 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
259 8 100       69 if (wantarray) {
260 3 50       69 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::glob($arg);
  6         55  
261             } else {
262 5         78 my $r = CORE::glob($arg);
263 5 100       36 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
264             }
265             }
266             }
267              
268             #pod =head1 INTERACTION WITH AUTODIE
269             #pod
270             #pod If you use L, which is a great idea, you need to use at least
271             #pod version B<2.12>, released on L
272             #pod 2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>.
273             #pod Otherwise, autodie obliterates the IO layers set by the L
274             #pod pragma. See L
275             #pod #54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L
276             #pod #7|https://github.com/doherty/utf8-all/issues/7>.
277             #pod
278             #pod =cut
279              
280             1;
281              
282             __END__