File Coverage

blib/lib/HTML/Entities.pm

Criterion	Covered	Total	%
statement	25	26	96.1
branch	9	12	75.0
condition	2	3	66.6
subroutine	4	5	80.0
pod	3	4	75.0
total	43	50	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Entities;
2
3							=encoding utf8
4
5							=head1 NAME
6
7							HTML::Entities - Encode or decode strings with HTML entities
8
9							=head1 SYNOPSIS
10
11							use HTML::Entities;
12
13							$a = "Våre norske tegn bør æres";
14							decode_entities($a);
15							encode_entities($a, "\200-\377");
16
17							For example, this:
18
19							$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20							print encode_entities($input), "\n"
21
22							Prints this out:
23
24							vis-à-vis Beyoncé's naïve
25							papier-mâché résumé
26
27							=head1 DESCRIPTION
28
29							This module deals with encoding and decoding of strings with HTML
30							character entities. The module provides the following functions:
31
32							=over 4
33
34							=item decode_entities( $string, ... )
35
36							This routine replaces HTML entities found in the $string with the
37							corresponding Unicode character. Unrecognized entities are left alone.
38
39							If multiple strings are provided as argument they are each decoded
40							separately and the same number of strings are returned.
41
42							If called in void context the arguments are decoded in-place.
43
44							This routine is exported by default.
45
46							=item _decode_entities( $string, \%entity2char )
47
48							=item _decode_entities( $string, \%entity2char, $expand_prefix )
49
50							This will in-place replace HTML entities in $string. The %entity2char
51							hash must be provided. Named entities not found in the %entity2char
52							hash are left alone. Numeric entities are expanded unless their value
53							overflow.
54
55							The keys in %entity2char are the entity names to be expanded and their
56							values are what they should expand into. The values do not have to be
57							single character strings. If a key has ";" as suffix,
58							then occurrences in $string are only expanded if properly terminated
59							with ";". Entities without ";" will be expanded regardless of how
60							they are terminated for compatibility with how common browsers treat
61							entities in the Latin-1 range.
62
63							If $expand_prefix is TRUE then entities without trailing ";" in
64							%entity2char will even be expanded as a prefix of a longer
65							unrecognized name. The longest matching name in %entity2char will be
66							used. This is mainly present for compatibility with an MSIE
67							misfeature.
68
69							$string = "foo bar";
70							_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
71							print $string; # will print "foo bar"
72
73							This routine is exported by default.
74
75							=item encode_entities( $string )
76
77							=item encode_entities( $string, $unsafe_chars )
78
79							This routine replaces unsafe characters in $string with their entity
80							representation. A second argument can be given to specify which characters to
81							consider unsafe. The unsafe characters is specified using the regular
82							expression character class syntax (what you find within brackets in regular
83							expressions).
84
85							The default set of characters to encode are control chars, high-bit chars, and
86							the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
87							for example, would encode I the C<< < >>, C<< & >>, C<< > >>, and C<< "
88							>> characters:
89
90							$encoded = encode_entities($input, '<>&"');
91
92							and this would only encode non-plain ASCII:
93
94							$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
95
96							This routine is exported by default.
97
98							=item encode_entities_numeric( $string )
99
100							=item encode_entities_numeric( $string, $unsafe_chars )
101
102							This routine works just like encode_entities, except that the replacement
103							entities are always C<&#xI;> and never C<&I;>. For
104							example, C returns "rôle", but
105							C returns "rôle".
106
107							This routine is I exported by default. But you can always
108							export it with C
109							or even C
110
111							=back
112
113							All these routines modify the string passed as the first argument, if
114							called in a void context. In scalar and array contexts, the encoded or
115							decoded string is returned (without changing the input string).
116
117							If you prefer not to import these routines into your namespace, you can
118							call them as:
119
120							use HTML::Entities ();
121							$decoded = HTML::Entities::decode($a);
122							$encoded = HTML::Entities::encode($a);
123							$encoded = HTML::Entities::encode_numeric($a);
124
125							The module can also export the %char2entity and the %entity2char
126							hashes, which contain the mapping from all characters to the
127							corresponding entities (and vice versa, respectively).
128
129							=head1 COPYRIGHT
130
131							Copyright 1995-2006 Gisle Aas. All rights reserved.
132
133							This library is free software; you can redistribute it and/or
134							modify it under the same terms as Perl itself.
135
136							=cut
137
138	49			49		3548	use strict;
	49					116
	49					82328
139							our $VERSION = '3.80';
140							our (%entity2char, %char2entity);
141
142							require 5.004;
143							require Exporter;
144							our @ISA = qw(Exporter);
145
146							our @EXPORT = qw(encode_entities decode_entities _decode_entities);
147							our @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
148
149	0			0	1	0	sub Version { $VERSION; }
150
151							require HTML::Parser; # for fast XS implemented decode_entities
152
153
154							%entity2char = (
155							# Some normal chars that have special meaning in SGML context
156							amp => '&', # ampersand
157							'gt' => '>', # greater than
158							'lt' => '<', # less than
159							quot => '"', # double quote
160							apos => "'", # single quote
161
162							# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
163							AElig => chr(198), # capital AE diphthong (ligature)
164							Aacute => chr(193), # capital A, acute accent
165							Acirc => chr(194), # capital A, circumflex accent
166							Agrave => chr(192), # capital A, grave accent
167							Aring => chr(197), # capital A, ring
168							Atilde => chr(195), # capital A, tilde
169							Auml => chr(196), # capital A, dieresis or umlaut mark
170							Ccedil => chr(199), # capital C, cedilla
171							ETH => chr(208), # capital Eth, Icelandic
172							Eacute => chr(201), # capital E, acute accent
173							Ecirc => chr(202), # capital E, circumflex accent
174							Egrave => chr(200), # capital E, grave accent
175							Euml => chr(203), # capital E, dieresis or umlaut mark
176							Iacute => chr(205), # capital I, acute accent
177							Icirc => chr(206), # capital I, circumflex accent
178							Igrave => chr(204), # capital I, grave accent
179							Iuml => chr(207), # capital I, dieresis or umlaut mark
180							Ntilde => chr(209), # capital N, tilde
181							Oacute => chr(211), # capital O, acute accent
182							Ocirc => chr(212), # capital O, circumflex accent
183							Ograve => chr(210), # capital O, grave accent
184							Oslash => chr(216), # capital O, slash
185							Otilde => chr(213), # capital O, tilde
186							Ouml => chr(214), # capital O, dieresis or umlaut mark
187							THORN => chr(222), # capital THORN, Icelandic
188							Uacute => chr(218), # capital U, acute accent
189							Ucirc => chr(219), # capital U, circumflex accent
190							Ugrave => chr(217), # capital U, grave accent
191							Uuml => chr(220), # capital U, dieresis or umlaut mark
192							Yacute => chr(221), # capital Y, acute accent
193							aacute => chr(225), # small a, acute accent
194							acirc => chr(226), # small a, circumflex accent
195							aelig => chr(230), # small ae diphthong (ligature)
196							agrave => chr(224), # small a, grave accent
197							aring => chr(229), # small a, ring
198							atilde => chr(227), # small a, tilde
199							auml => chr(228), # small a, dieresis or umlaut mark
200							ccedil => chr(231), # small c, cedilla
201							eacute => chr(233), # small e, acute accent
202							ecirc => chr(234), # small e, circumflex accent
203							egrave => chr(232), # small e, grave accent
204							eth => chr(240), # small eth, Icelandic
205							euml => chr(235), # small e, dieresis or umlaut mark
206							iacute => chr(237), # small i, acute accent
207							icirc => chr(238), # small i, circumflex accent
208							igrave => chr(236), # small i, grave accent
209							iuml => chr(239), # small i, dieresis or umlaut mark
210							ntilde => chr(241), # small n, tilde
211							oacute => chr(243), # small o, acute accent
212							ocirc => chr(244), # small o, circumflex accent
213							ograve => chr(242), # small o, grave accent
214							oslash => chr(248), # small o, slash
215							otilde => chr(245), # small o, tilde
216							ouml => chr(246), # small o, dieresis or umlaut mark
217							szlig => chr(223), # small sharp s, German (sz ligature)
218							thorn => chr(254), # small thorn, Icelandic
219							uacute => chr(250), # small u, acute accent
220							ucirc => chr(251), # small u, circumflex accent
221							ugrave => chr(249), # small u, grave accent
222							uuml => chr(252), # small u, dieresis or umlaut mark
223							yacute => chr(253), # small y, acute accent
224							yuml => chr(255), # small y, dieresis or umlaut mark
225
226							# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
227							copy => chr(169), # copyright sign
228							reg => chr(174), # registered sign
229							nbsp => chr(160), # non breaking space
230
231							# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
232							iexcl => chr(161),
233							cent => chr(162),
234							pound => chr(163),
235							curren => chr(164),
236							yen => chr(165),
237							brvbar => chr(166),
238							sect => chr(167),
239							uml => chr(168),
240							ordf => chr(170),
241							laquo => chr(171),
242							'not' => chr(172), # not is a keyword in perl
243							shy => chr(173),
244							macr => chr(175),
245							deg => chr(176),
246							plusmn => chr(177),
247							sup1 => chr(185),
248							sup2 => chr(178),
249							sup3 => chr(179),
250							acute => chr(180),
251							micro => chr(181),
252							para => chr(182),
253							middot => chr(183),
254							cedil => chr(184),
255							ordm => chr(186),
256							raquo => chr(187),
257							frac14 => chr(188),
258							frac12 => chr(189),
259							frac34 => chr(190),
260							iquest => chr(191),
261							'times' => chr(215), # times is a keyword in perl
262							divide => chr(247),
263
264							( $] > 5.007 ? (
265							'OElig;' => chr(338),
266							'oelig;' => chr(339),
267							'Scaron;' => chr(352),
268							'scaron;' => chr(353),
269							'Yuml;' => chr(376),
270							'fnof;' => chr(402),
271							'circ;' => chr(710),
272							'tilde;' => chr(732),
273							'Alpha;' => chr(913),
274							'Beta;' => chr(914),
275							'Gamma;' => chr(915),
276							'Delta;' => chr(916),
277							'Epsilon;' => chr(917),
278							'Zeta;' => chr(918),
279							'Eta;' => chr(919),
280							'Theta;' => chr(920),
281							'Iota;' => chr(921),
282							'Kappa;' => chr(922),
283							'Lambda;' => chr(923),
284							'Mu;' => chr(924),
285							'Nu;' => chr(925),
286							'Xi;' => chr(926),
287							'Omicron;' => chr(927),
288							'Pi;' => chr(928),
289							'Rho;' => chr(929),
290							'Sigma;' => chr(931),
291							'Tau;' => chr(932),
292							'Upsilon;' => chr(933),
293							'Phi;' => chr(934),
294							'Chi;' => chr(935),
295							'Psi;' => chr(936),
296							'Omega;' => chr(937),
297							'alpha;' => chr(945),
298							'beta;' => chr(946),
299							'gamma;' => chr(947),
300							'delta;' => chr(948),
301							'epsilon;' => chr(949),
302							'zeta;' => chr(950),
303							'eta;' => chr(951),
304							'theta;' => chr(952),
305							'iota;' => chr(953),
306							'kappa;' => chr(954),
307							'lambda;' => chr(955),
308							'mu;' => chr(956),
309							'nu;' => chr(957),
310							'xi;' => chr(958),
311							'omicron;' => chr(959),
312							'pi;' => chr(960),
313							'rho;' => chr(961),
314							'sigmaf;' => chr(962),
315							'sigma;' => chr(963),
316							'tau;' => chr(964),
317							'upsilon;' => chr(965),
318							'phi;' => chr(966),
319							'chi;' => chr(967),
320							'psi;' => chr(968),
321							'omega;' => chr(969),
322							'thetasym;' => chr(977),
323							'upsih;' => chr(978),
324							'piv;' => chr(982),
325							'ensp;' => chr(8194),
326							'emsp;' => chr(8195),
327							'thinsp;' => chr(8201),
328							'zwnj;' => chr(8204),
329							'zwj;' => chr(8205),
330							'lrm;' => chr(8206),
331							'rlm;' => chr(8207),
332							'ndash;' => chr(8211),
333							'mdash;' => chr(8212),
334							'lsquo;' => chr(8216),
335							'rsquo;' => chr(8217),
336							'sbquo;' => chr(8218),
337							'ldquo;' => chr(8220),
338							'rdquo;' => chr(8221),
339							'bdquo;' => chr(8222),
340							'dagger;' => chr(8224),
341							'Dagger;' => chr(8225),
342							'bull;' => chr(8226),
343							'hellip;' => chr(8230),
344							'permil;' => chr(8240),
345							'prime;' => chr(8242),
346							'Prime;' => chr(8243),
347							'lsaquo;' => chr(8249),
348							'rsaquo;' => chr(8250),
349							'oline;' => chr(8254),
350							'frasl;' => chr(8260),
351							'euro;' => chr(8364),
352							'image;' => chr(8465),
353							'weierp;' => chr(8472),
354							'real;' => chr(8476),
355							'trade;' => chr(8482),
356							'alefsym;' => chr(8501),
357							'larr;' => chr(8592),
358							'uarr;' => chr(8593),
359							'rarr;' => chr(8594),
360							'darr;' => chr(8595),
361							'harr;' => chr(8596),
362							'crarr;' => chr(8629),
363							'lArr;' => chr(8656),
364							'uArr;' => chr(8657),
365							'rArr;' => chr(8658),
366							'dArr;' => chr(8659),
367							'hArr;' => chr(8660),
368							'forall;' => chr(8704),
369							'part;' => chr(8706),
370							'exist;' => chr(8707),
371							'empty;' => chr(8709),
372							'nabla;' => chr(8711),
373							'isin;' => chr(8712),
374							'notin;' => chr(8713),
375							'ni;' => chr(8715),
376							'prod;' => chr(8719),
377							'sum;' => chr(8721),
378							'minus;' => chr(8722),
379							'lowast;' => chr(8727),
380							'radic;' => chr(8730),
381							'prop;' => chr(8733),
382							'infin;' => chr(8734),
383							'ang;' => chr(8736),
384							'and;' => chr(8743),
385							'or;' => chr(8744),
386							'cap;' => chr(8745),
387							'cup;' => chr(8746),
388							'int;' => chr(8747),
389							'there4;' => chr(8756),
390							'sim;' => chr(8764),
391							'cong;' => chr(8773),
392							'asymp;' => chr(8776),
393							'ne;' => chr(8800),
394							'equiv;' => chr(8801),
395							'le;' => chr(8804),
396							'ge;' => chr(8805),
397							'sub;' => chr(8834),
398							'sup;' => chr(8835),
399							'nsub;' => chr(8836),
400							'sube;' => chr(8838),
401							'supe;' => chr(8839),
402							'oplus;' => chr(8853),
403							'otimes;' => chr(8855),
404							'perp;' => chr(8869),
405							'sdot;' => chr(8901),
406							'lceil;' => chr(8968),
407							'rceil;' => chr(8969),
408							'lfloor;' => chr(8970),
409							'rfloor;' => chr(8971),
410							'lang;' => chr(9001),
411							'rang;' => chr(9002),
412							'loz;' => chr(9674),
413							'spades;' => chr(9824),
414							'clubs;' => chr(9827),
415							'hearts;' => chr(9829),
416							'diams;' => chr(9830),
417							) : ())
418							);
419
420
421							# Make the opposite mapping
422							while (my($entity, $char) = each(%entity2char)) {
423							$entity =~ s/;\z//;
424							$char2entity{$char} = "&$entity;";
425							}
426							delete $char2entity{"'"}; # only one-way decoding
427
428							# Fill in missing entities
429							for (0 .. 255) {
430							next if exists $char2entity{chr($_)};
431							$char2entity{chr($_)} = "&#$_;";
432							}
433
434							my %subst; # compiled encoding regexps
435
436							sub encode_entities
437							{
438	17	50		17	1	17016	return undef unless defined $_[0];
439	17					28	my $ref;
440	17	100				37	if (defined wantarray) {
441	14					24	my $x = $_[0];
442	14					27	$ref = \$x; # copy
443							} else {
444	3					6	$ref = \$_[0]; # modify in-place
445							}
446	17	100	66			57	if (defined $_[1] and length $_[1]) {
447	6	50				17	unless (exists $subst{$_[1]}) {
448							# Because we can't compile regex we fake it with a cached sub
449	6					9	my $chars = $_[1];
450	6					29	$chars =~ s,(?
451	6					17	$chars =~ s,(?
452	6					17	my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} \|\| num_entity(\$1)/ge; }";
453	6					692	$subst{$_[1]} = eval $code;
454	6	50				25	die( $@ . " while trying to turn range: \"$_[1]\"\n "
455							. "into code: $code\n "
456							) if $@;
457							}
458	6					13	&{$subst{$_[1]}}($$ref);
	6					119
459							} else {
460							# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
461	11	100				59	$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} \|\| num_entity($1)/ge;
	197					648
462							}
463	17					76	$$ref;
464							}
465
466							sub encode_entities_numeric {
467	2			2	1	542	local %char2entity;
468	2					6	return &encode_entities; # a goto &encode_entities wouldn't work
469							}
470
471
472							sub num_entity {
473	25			25	0	113	sprintf "&#x%X;", ord($_[0]);
474							}
475
476							# Set up aliases
477							*encode = \&encode_entities;
478							*encode_numeric = \&encode_entities_numeric;
479							*encode_numerically = \&encode_entities_numeric;
480							*decode = \&decode_entities;
481
482							1;