File Coverage

blib/lib/HTML/Entities.pm
Criterion Covered Total %
statement 25 26 96.1
branch 9 12 75.0
condition 2 3 66.6
subroutine 4 5 80.0
pod 3 4 75.0
total 43 50 86.0


line stmt bran cond sub pod time code
1             package HTML::Entities;
2              
3             =encoding utf8
4              
5             =head1 NAME
6              
7             HTML::Entities - Encode or decode strings with HTML entities
8              
9             =head1 SYNOPSIS
10              
11             use HTML::Entities;
12              
13             $a = "Våre norske tegn bør æres";
14             decode_entities($a);
15             encode_entities($a, "\200-\377");
16              
17             For example, this:
18              
19             $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20             print encode_entities($input), "\n"
21              
22             Prints this out:
23              
24             vis-à-vis Beyoncé's naïve
25             papier-mâché résumé
26              
27             =head1 DESCRIPTION
28              
29             This module deals with encoding and decoding of strings with HTML
30             character entities. The module provides the following functions:
31              
32             =over 4
33              
34             =item decode_entities( $string, ... )
35              
36             This routine replaces HTML entities found in the $string with the
37             corresponding Unicode character. Unrecognized entities are left alone.
38              
39             If multiple strings are provided as argument they are each decoded
40             separately and the same number of strings are returned.
41              
42             If called in void context the arguments are decoded in-place.
43              
44             This routine is exported by default.
45              
46             =item _decode_entities( $string, \%entity2char )
47              
48             =item _decode_entities( $string, \%entity2char, $expand_prefix )
49              
50             This will in-place replace HTML entities in $string. The %entity2char
51             hash must be provided. Named entities not found in the %entity2char
52             hash are left alone. Numeric entities are expanded unless their value
53             overflow.
54              
55             The keys in %entity2char are the entity names to be expanded and their
56             values are what they should expand into. The values do not have to be
57             single character strings. If a key has ";" as suffix,
58             then occurrences in $string are only expanded if properly terminated
59             with ";". Entities without ";" will be expanded regardless of how
60             they are terminated for compatibility with how common browsers treat
61             entities in the Latin-1 range.
62              
63             If $expand_prefix is TRUE then entities without trailing ";" in
64             %entity2char will even be expanded as a prefix of a longer
65             unrecognized name. The longest matching name in %entity2char will be
66             used. This is mainly present for compatibility with an MSIE
67             misfeature.
68              
69             $string = "foo bar";
70             _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
71             print $string; # will print "foo bar"
72              
73             This routine is exported by default.
74              
75             =item encode_entities( $string )
76              
77             =item encode_entities( $string, $unsafe_chars )
78              
79             This routine replaces unsafe characters in $string with their entity
80             representation. A second argument can be given to specify which characters to
81             consider unsafe. The unsafe characters is specified using the regular
82             expression character class syntax (what you find within brackets in regular
83             expressions).
84              
85             The default set of characters to encode are control chars, high-bit chars, and
86             the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
87             for example, would encode I the C<< < >>, C<< & >>, C<< > >>, and C<< "
88             >> characters:
89              
90             $encoded = encode_entities($input, '<>&"');
91              
92             and this would only encode non-plain ASCII:
93              
94             $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
95              
96             This routine is exported by default.
97              
98             =item encode_entities_numeric( $string )
99              
100             =item encode_entities_numeric( $string, $unsafe_chars )
101              
102             This routine works just like encode_entities, except that the replacement
103             entities are always C<&#xI;> and never C<&I;>. For
104             example, C returns "rôle", but
105             C returns "rôle".
106              
107             This routine is I exported by default. But you can always
108             export it with C
109             or even C
110              
111             =back
112              
113             All these routines modify the string passed as the first argument, if
114             called in a void context. In scalar and array contexts, the encoded or
115             decoded string is returned (without changing the input string).
116              
117             If you prefer not to import these routines into your namespace, you can
118             call them as:
119              
120             use HTML::Entities ();
121             $decoded = HTML::Entities::decode($a);
122             $encoded = HTML::Entities::encode($a);
123             $encoded = HTML::Entities::encode_numeric($a);
124              
125             The module can also export the %char2entity and the %entity2char
126             hashes, which contain the mapping from all characters to the
127             corresponding entities (and vice versa, respectively).
128              
129             =head1 COPYRIGHT
130              
131             Copyright 1995-2006 Gisle Aas. All rights reserved.
132              
133             This library is free software; you can redistribute it and/or
134             modify it under the same terms as Perl itself.
135              
136             =cut
137              
138 49     49   3548 use strict;
  49         116  
  49         82328  
139             our $VERSION = '3.80';
140             our (%entity2char, %char2entity);
141              
142             require 5.004;
143             require Exporter;
144             our @ISA = qw(Exporter);
145              
146             our @EXPORT = qw(encode_entities decode_entities _decode_entities);
147             our @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
148              
149 0     0 1 0 sub Version { $VERSION; }
150              
151             require HTML::Parser; # for fast XS implemented decode_entities
152              
153              
154             %entity2char = (
155             # Some normal chars that have special meaning in SGML context
156             amp => '&', # ampersand
157             'gt' => '>', # greater than
158             'lt' => '<', # less than
159             quot => '"', # double quote
160             apos => "'", # single quote
161              
162             # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
163             AElig => chr(198), # capital AE diphthong (ligature)
164             Aacute => chr(193), # capital A, acute accent
165             Acirc => chr(194), # capital A, circumflex accent
166             Agrave => chr(192), # capital A, grave accent
167             Aring => chr(197), # capital A, ring
168             Atilde => chr(195), # capital A, tilde
169             Auml => chr(196), # capital A, dieresis or umlaut mark
170             Ccedil => chr(199), # capital C, cedilla
171             ETH => chr(208), # capital Eth, Icelandic
172             Eacute => chr(201), # capital E, acute accent
173             Ecirc => chr(202), # capital E, circumflex accent
174             Egrave => chr(200), # capital E, grave accent
175             Euml => chr(203), # capital E, dieresis or umlaut mark
176             Iacute => chr(205), # capital I, acute accent
177             Icirc => chr(206), # capital I, circumflex accent
178             Igrave => chr(204), # capital I, grave accent
179             Iuml => chr(207), # capital I, dieresis or umlaut mark
180             Ntilde => chr(209), # capital N, tilde
181             Oacute => chr(211), # capital O, acute accent
182             Ocirc => chr(212), # capital O, circumflex accent
183             Ograve => chr(210), # capital O, grave accent
184             Oslash => chr(216), # capital O, slash
185             Otilde => chr(213), # capital O, tilde
186             Ouml => chr(214), # capital O, dieresis or umlaut mark
187             THORN => chr(222), # capital THORN, Icelandic
188             Uacute => chr(218), # capital U, acute accent
189             Ucirc => chr(219), # capital U, circumflex accent
190             Ugrave => chr(217), # capital U, grave accent
191             Uuml => chr(220), # capital U, dieresis or umlaut mark
192             Yacute => chr(221), # capital Y, acute accent
193             aacute => chr(225), # small a, acute accent
194             acirc => chr(226), # small a, circumflex accent
195             aelig => chr(230), # small ae diphthong (ligature)
196             agrave => chr(224), # small a, grave accent
197             aring => chr(229), # small a, ring
198             atilde => chr(227), # small a, tilde
199             auml => chr(228), # small a, dieresis or umlaut mark
200             ccedil => chr(231), # small c, cedilla
201             eacute => chr(233), # small e, acute accent
202             ecirc => chr(234), # small e, circumflex accent
203             egrave => chr(232), # small e, grave accent
204             eth => chr(240), # small eth, Icelandic
205             euml => chr(235), # small e, dieresis or umlaut mark
206             iacute => chr(237), # small i, acute accent
207             icirc => chr(238), # small i, circumflex accent
208             igrave => chr(236), # small i, grave accent
209             iuml => chr(239), # small i, dieresis or umlaut mark
210             ntilde => chr(241), # small n, tilde
211             oacute => chr(243), # small o, acute accent
212             ocirc => chr(244), # small o, circumflex accent
213             ograve => chr(242), # small o, grave accent
214             oslash => chr(248), # small o, slash
215             otilde => chr(245), # small o, tilde
216             ouml => chr(246), # small o, dieresis or umlaut mark
217             szlig => chr(223), # small sharp s, German (sz ligature)
218             thorn => chr(254), # small thorn, Icelandic
219             uacute => chr(250), # small u, acute accent
220             ucirc => chr(251), # small u, circumflex accent
221             ugrave => chr(249), # small u, grave accent
222             uuml => chr(252), # small u, dieresis or umlaut mark
223             yacute => chr(253), # small y, acute accent
224             yuml => chr(255), # small y, dieresis or umlaut mark
225              
226             # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
227             copy => chr(169), # copyright sign
228             reg => chr(174), # registered sign
229             nbsp => chr(160), # non breaking space
230              
231             # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
232             iexcl => chr(161),
233             cent => chr(162),
234             pound => chr(163),
235             curren => chr(164),
236             yen => chr(165),
237             brvbar => chr(166),
238             sect => chr(167),
239             uml => chr(168),
240             ordf => chr(170),
241             laquo => chr(171),
242             'not' => chr(172), # not is a keyword in perl
243             shy => chr(173),
244             macr => chr(175),
245             deg => chr(176),
246             plusmn => chr(177),
247             sup1 => chr(185),
248             sup2 => chr(178),
249             sup3 => chr(179),
250             acute => chr(180),
251             micro => chr(181),
252             para => chr(182),
253             middot => chr(183),
254             cedil => chr(184),
255             ordm => chr(186),
256             raquo => chr(187),
257             frac14 => chr(188),
258             frac12 => chr(189),
259             frac34 => chr(190),
260             iquest => chr(191),
261             'times' => chr(215), # times is a keyword in perl
262             divide => chr(247),
263              
264             ( $] > 5.007 ? (
265             'OElig;' => chr(338),
266             'oelig;' => chr(339),
267             'Scaron;' => chr(352),
268             'scaron;' => chr(353),
269             'Yuml;' => chr(376),
270             'fnof;' => chr(402),
271             'circ;' => chr(710),
272             'tilde;' => chr(732),
273             'Alpha;' => chr(913),
274             'Beta;' => chr(914),
275             'Gamma;' => chr(915),
276             'Delta;' => chr(916),
277             'Epsilon;' => chr(917),
278             'Zeta;' => chr(918),
279             'Eta;' => chr(919),
280             'Theta;' => chr(920),
281             'Iota;' => chr(921),
282             'Kappa;' => chr(922),
283             'Lambda;' => chr(923),
284             'Mu;' => chr(924),
285             'Nu;' => chr(925),
286             'Xi;' => chr(926),
287             'Omicron;' => chr(927),
288             'Pi;' => chr(928),
289             'Rho;' => chr(929),
290             'Sigma;' => chr(931),
291             'Tau;' => chr(932),
292             'Upsilon;' => chr(933),
293             'Phi;' => chr(934),
294             'Chi;' => chr(935),
295             'Psi;' => chr(936),
296             'Omega;' => chr(937),
297             'alpha;' => chr(945),
298             'beta;' => chr(946),
299             'gamma;' => chr(947),
300             'delta;' => chr(948),
301             'epsilon;' => chr(949),
302             'zeta;' => chr(950),
303             'eta;' => chr(951),
304             'theta;' => chr(952),
305             'iota;' => chr(953),
306             'kappa;' => chr(954),
307             'lambda;' => chr(955),
308             'mu;' => chr(956),
309             'nu;' => chr(957),
310             'xi;' => chr(958),
311             'omicron;' => chr(959),
312             'pi;' => chr(960),
313             'rho;' => chr(961),
314             'sigmaf;' => chr(962),
315             'sigma;' => chr(963),
316             'tau;' => chr(964),
317             'upsilon;' => chr(965),
318             'phi;' => chr(966),
319             'chi;' => chr(967),
320             'psi;' => chr(968),
321             'omega;' => chr(969),
322             'thetasym;' => chr(977),
323             'upsih;' => chr(978),
324             'piv;' => chr(982),
325             'ensp;' => chr(8194),
326             'emsp;' => chr(8195),
327             'thinsp;' => chr(8201),
328             'zwnj;' => chr(8204),
329             'zwj;' => chr(8205),
330             'lrm;' => chr(8206),
331             'rlm;' => chr(8207),
332             'ndash;' => chr(8211),
333             'mdash;' => chr(8212),
334             'lsquo;' => chr(8216),
335             'rsquo;' => chr(8217),
336             'sbquo;' => chr(8218),
337             'ldquo;' => chr(8220),
338             'rdquo;' => chr(8221),
339             'bdquo;' => chr(8222),
340             'dagger;' => chr(8224),
341             'Dagger;' => chr(8225),
342             'bull;' => chr(8226),
343             'hellip;' => chr(8230),
344             'permil;' => chr(8240),
345             'prime;' => chr(8242),
346             'Prime;' => chr(8243),
347             'lsaquo;' => chr(8249),
348             'rsaquo;' => chr(8250),
349             'oline;' => chr(8254),
350             'frasl;' => chr(8260),
351             'euro;' => chr(8364),
352             'image;' => chr(8465),
353             'weierp;' => chr(8472),
354             'real;' => chr(8476),
355             'trade;' => chr(8482),
356             'alefsym;' => chr(8501),
357             'larr;' => chr(8592),
358             'uarr;' => chr(8593),
359             'rarr;' => chr(8594),
360             'darr;' => chr(8595),
361             'harr;' => chr(8596),
362             'crarr;' => chr(8629),
363             'lArr;' => chr(8656),
364             'uArr;' => chr(8657),
365             'rArr;' => chr(8658),
366             'dArr;' => chr(8659),
367             'hArr;' => chr(8660),
368             'forall;' => chr(8704),
369             'part;' => chr(8706),
370             'exist;' => chr(8707),
371             'empty;' => chr(8709),
372             'nabla;' => chr(8711),
373             'isin;' => chr(8712),
374             'notin;' => chr(8713),
375             'ni;' => chr(8715),
376             'prod;' => chr(8719),
377             'sum;' => chr(8721),
378             'minus;' => chr(8722),
379             'lowast;' => chr(8727),
380             'radic;' => chr(8730),
381             'prop;' => chr(8733),
382             'infin;' => chr(8734),
383             'ang;' => chr(8736),
384             'and;' => chr(8743),
385             'or;' => chr(8744),
386             'cap;' => chr(8745),
387             'cup;' => chr(8746),
388             'int;' => chr(8747),
389             'there4;' => chr(8756),
390             'sim;' => chr(8764),
391             'cong;' => chr(8773),
392             'asymp;' => chr(8776),
393             'ne;' => chr(8800),
394             'equiv;' => chr(8801),
395             'le;' => chr(8804),
396             'ge;' => chr(8805),
397             'sub;' => chr(8834),
398             'sup;' => chr(8835),
399             'nsub;' => chr(8836),
400             'sube;' => chr(8838),
401             'supe;' => chr(8839),
402             'oplus;' => chr(8853),
403             'otimes;' => chr(8855),
404             'perp;' => chr(8869),
405             'sdot;' => chr(8901),
406             'lceil;' => chr(8968),
407             'rceil;' => chr(8969),
408             'lfloor;' => chr(8970),
409             'rfloor;' => chr(8971),
410             'lang;' => chr(9001),
411             'rang;' => chr(9002),
412             'loz;' => chr(9674),
413             'spades;' => chr(9824),
414             'clubs;' => chr(9827),
415             'hearts;' => chr(9829),
416             'diams;' => chr(9830),
417             ) : ())
418             );
419              
420              
421             # Make the opposite mapping
422             while (my($entity, $char) = each(%entity2char)) {
423             $entity =~ s/;\z//;
424             $char2entity{$char} = "&$entity;";
425             }
426             delete $char2entity{"'"}; # only one-way decoding
427              
428             # Fill in missing entities
429             for (0 .. 255) {
430             next if exists $char2entity{chr($_)};
431             $char2entity{chr($_)} = "&#$_;";
432             }
433              
434             my %subst; # compiled encoding regexps
435              
436             sub encode_entities
437             {
438 17 50   17 1 17016 return undef unless defined $_[0];
439 17         28 my $ref;
440 17 100       37 if (defined wantarray) {
441 14         24 my $x = $_[0];
442 14         27 $ref = \$x; # copy
443             } else {
444 3         6 $ref = \$_[0]; # modify in-place
445             }
446 17 100 66     57 if (defined $_[1] and length $_[1]) {
447 6 50       17 unless (exists $subst{$_[1]}) {
448             # Because we can't compile regex we fake it with a cached sub
449 6         9 my $chars = $_[1];
450 6         29 $chars =~ s,(?
451 6         17 $chars =~ s,(?
452 6         17 my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
453 6         692 $subst{$_[1]} = eval $code;
454 6 50       25 die( $@ . " while trying to turn range: \"$_[1]\"\n "
455             . "into code: $code\n "
456             ) if $@;
457             }
458 6         13 &{$subst{$_[1]}}($$ref);
  6         119  
459             } else {
460             # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
461 11 100       59 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
  197         648  
462             }
463 17         76 $$ref;
464             }
465              
466             sub encode_entities_numeric {
467 2     2 1 542 local %char2entity;
468 2         6 return &encode_entities; # a goto &encode_entities wouldn't work
469             }
470              
471              
472             sub num_entity {
473 25     25 0 113 sprintf "&#x%X;", ord($_[0]);
474             }
475              
476             # Set up aliases
477             *encode = \&encode_entities;
478             *encode_numeric = \&encode_entities_numeric;
479             *encode_numerically = \&encode_entities_numeric;
480             *decode = \&decode_entities;
481              
482             1;