File Coverage

lib/MKDoc/XML/Decode/XHTML.pm
Criterion Covered Total %
statement 10 10 100.0
branch 1 2 50.0
condition n/a
subroutine 3 3 100.0
pod 0 1 0.0
total 14 16 87.5


line stmt bran cond sub pod time code
1             package MKDoc::XML::Decode::XHTML;
2 7     7   37 use warnings;
  7         12  
  7         326  
3 7     7   31 use strict;
  7         100  
  7         11587  
4              
5              
6             # Portions (c) International Organization for Standardization 1986:
7             # Permission to copy in any form is granted for use with conforming SGML
8             # systems and applications as defined in ISO 8879, provided this notice is
9             # included in all copies.
10             our %ENTITY_2_CHAR = (
11            
12             # Latin1 characters
13             'nbsp' => chr(160),
14             'iexcl' => chr(161),
15             'cent' => chr(162),
16             'pound' => chr(163),
17             'curren' => chr(164),
18             'yen' => chr(165),
19             'brvbar' => chr(166),
20             'sect' => chr(167),
21             'uml' => chr(168),
22             'copy' => chr(169),
23             'ordf' => chr(170),
24             'laquo' => chr(171),
25             'not' => chr(172),
26             'shy' => chr(173),
27             'reg' => chr(174),
28             'macr' => chr(175),
29             'deg' => chr(176),
30             'plusmn' => chr(177),
31             'sup2' => chr(178),
32             'sup3' => chr(179),
33             'acute' => chr(180),
34             'micro' => chr(181),
35             'para' => chr(182),
36             'middot' => chr(183),
37             'cedil' => chr(184),
38             'sup1' => chr(185),
39             'ordm' => chr(186),
40             'raquo' => chr(187),
41             'frac14' => chr(188),
42             'frac12' => chr(189),
43             'frac34' => chr(190),
44             'iquest' => chr(191),
45             'Agrave' => chr(192),
46             'Aacute' => chr(193),
47             'Acirc' => chr(194),
48             'Atilde' => chr(195),
49             'Auml' => chr(196),
50             'Aring' => chr(197),
51             'AElig' => chr(198),
52             'Ccedil' => chr(199),
53             'Egrave' => chr(200),
54             'Eacute' => chr(201),
55             'Ecirc' => chr(202),
56             'Euml' => chr(203),
57             'Igrave' => chr(204),
58             'Iacute' => chr(205),
59             'Icirc' => chr(206),
60             'Iuml' => chr(207),
61             'ETH' => chr(208),
62             'Ntilde' => chr(209),
63             'Ograve' => chr(210),
64             'Oacute' => chr(211),
65             'Ocirc' => chr(212),
66             'Otilde' => chr(213),
67             'Ouml' => chr(214),
68             'times' => chr(215),
69             'Oslash' => chr(216),
70             'Ugrave' => chr(217),
71             'Uacute' => chr(218),
72             'Ucirc' => chr(219),
73             'Uuml' => chr(220),
74             'Yacute' => chr(221),
75             'THORN' => chr(222),
76             'szlig' => chr(223),
77             'agrave' => chr(224),
78             'aacute' => chr(225),
79             'acirc' => chr(226),
80             'atilde' => chr(227),
81             'auml' => chr(228),
82             'aring' => chr(229),
83             'aelig' => chr(230),
84             'ccedil' => chr(231),
85             'egrave' => chr(232),
86             'eacute' => chr(233),
87             'ecirc' => chr(234),
88             'euml' => chr(235),
89             'igrave' => chr(236),
90             'iacute' => chr(237),
91             'icirc' => chr(238),
92             'iuml' => chr(239),
93             'eth' => chr(240),
94             'ntilde' => chr(241),
95             'ograve' => chr(242),
96             'oacute' => chr(243),
97             'ocirc' => chr(244),
98             'otilde' => chr(245),
99             'ouml' => chr(246),
100             'divide' => chr(247),
101             'oslash' => chr(248),
102             'ugrave' => chr(249),
103             'uacute' => chr(250),
104             'ucirc' => chr(251),
105             'uuml' => chr(252),
106             'yacute' => chr(253),
107             'thorn' => chr(254),
108             'yuml' => chr(255),
109            
110             # C0 Controls and Basic Latin
111             # 'quot' => chr(34),
112             # 'amp' => chr(38),
113             # 'apos' => chr(39),
114             # 'lt' => chr(60),
115             # 'gt' => chr(62),
116            
117             # Latin Extended-A
118             'OElig' => chr(338),
119             'oelig' => chr(339),
120             'Scaron' => chr(352),
121             'scaron' => chr(353),
122             'Yuml' => chr(376),
123            
124             # Spacing Modifier Letters
125             'circ' => chr(710),
126             'tilde' => chr(732),
127            
128             # General Punctuation
129             # * lsaquo is proposed but not yet ISO standardized
130             # * rsaquo is proposed but not yet ISO standardized
131             'ensp' => chr(8194),
132             'emsp' => chr(8195),
133             'thinsp' => chr(8201),
134             'zwnj' => chr(8204),
135             'zwj' => chr(8205),
136             'lrm' => chr(8206),
137             'rlm' => chr(8207),
138             'ndash' => chr(8211),
139             'mdash' => chr(8212),
140             'lsquo' => chr(8216),
141             'rsquo' => chr(8217),
142             'sbquo' => chr(8218),
143             'ldquo' => chr(8220),
144             'rdquo' => chr(8221),
145             'bdquo' => chr(8222),
146             'dagger' => chr(8224),
147             'Dagger' => chr(8225),
148             'permil' => chr(8240),
149             'lsaquo' => chr(8249),
150             'rsaquo' => chr(8250),
151             'euro' => chr(8364),
152            
153             # Mathematical, Greek and Symbolic characters for HTML
154             # Latin Extended-B
155             'fnof' => chr(402),
156            
157             # Greek
158             # * there is no Sigmaf, and no U+03A2 character either
159             'Alpha' => chr(913),
160             'Beta' => chr(914),
161             'Gamma' => chr(915),
162             'Delta' => chr(916),
163             'Epsilon' => chr(917),
164             'Zeta' => chr(918),
165             'Eta' => chr(919),
166             'Theta' => chr(920),
167             'Iota' => chr(921),
168             'Kappa' => chr(922),
169             'Lambda' => chr(923),
170             'Mu' => chr(924),
171             'Nu' => chr(925),
172             'Xi' => chr(926),
173             'Omicron' => chr(927),
174             'Pi' => chr(928),
175             'Rho' => chr(929),
176             'Sigma' => chr(931),
177             'Tau' => chr(932),
178             'Upsilon' => chr(933),
179             'Phi' => chr(934),
180             'Chi' => chr(935),
181             'Psi' => chr(936),
182             'Omega' => chr(937),
183             'alpha' => chr(945),
184             'beta' => chr(946),
185             'gamma' => chr(947),
186             'delta' => chr(948),
187             'epsilon' => chr(949),
188             'zeta' => chr(950),
189             'eta' => chr(951),
190             'theta' => chr(952),
191             'iota' => chr(953),
192             'kappa' => chr(954),
193             'lambda' => chr(955),
194             'mu' => chr(956),
195             'nu' => chr(957),
196             'xi' => chr(958),
197             'omicron' => chr(959),
198             'pi' => chr(960),
199             'rho' => chr(961),
200             'sigmaf' => chr(962),
201             'sigma' => chr(963),
202             'tau' => chr(964),
203             'upsilon' => chr(965),
204             'phi' => chr(966),
205             'chi' => chr(967),
206             'psi' => chr(968),
207             'omega' => chr(969),
208             'thetasym' => chr(977),
209             'upsih' => chr(978),
210             'piv' => chr(982),
211            
212             # General Punctuation
213             # * bullet is NOT the same as bullet operator, U+2219
214             'bull' => chr(8226),
215             'hellip' => chr(8230),
216             'prime' => chr(8242),
217             'Prime' => chr(8243),
218             'oline' => chr(8254),
219             'frasl' => chr(8260),
220            
221             # Letterlike Symbols
222             # * alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters
223             'weierp' => chr(8472),
224             'image' => chr(8465),
225             'real' => chr(8476),
226             'trade' => chr(8482),
227             'alefsym' => chr(8501),
228            
229             # Arrows
230             # * Unicode does not say that lArr is the same as the 'is implied by' arrow but also
231             # does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests
232             # * Unicode does not say rArr is the 'implies' character but does not have another
233             # character with this function so ? rArr can be used for 'implies' as ISOtech suggests
234             'larr' => chr(8592),
235             'uarr' => chr(8593),
236             'rarr' => chr(8594),
237             'darr' => chr(8595),
238             'harr' => chr(8596),
239             'crarr' => chr(8629),
240             'lArr' => chr(8656),
241             'uArr' => chr(8657),
242             'rArr' => chr(8658),
243             'dArr' => chr(8659),
244             'hArr' => chr(8660),
245            
246             # Mathematical Operators
247             # * should there be a more memorable name than 'ni'?
248             # * prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
249             # * sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
250             # * sim: tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both
251             # * note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included.
252             # Should it be, for symmetry? It is in ISOamsn
253             # * sdot: dot operator is NOT the same character as U+00B7 middle dot
254             'forall' => chr(8704),
255             'part' => chr(8706),
256             'exist' => chr(8707),
257             'empty' => chr(8709),
258             'nabla' => chr(8711),
259             'isin' => chr(8712),
260             'notin' => chr(8713),
261             'ni' => chr(8715),
262             'prod' => chr(8719),
263             'sum' => chr(8721),
264             'minus' => chr(8722),
265             'lowast' => chr(8727),
266             'radic' => chr(8730),
267             'prop' => chr(8733),
268             'infin' => chr(8734),
269             'ang' => chr(8736),
270             'and' => chr(8743),
271             'or' => chr(8744),
272             'cap' => chr(8745),
273             'cup' => chr(8746),
274             'int' => chr(8747),
275             'there4' => chr(8756),
276             'sim' => chr(8764),
277             'cong' => chr(8773),
278             'asymp' => chr(8776),
279             'ne' => chr(8800),
280             'equiv' => chr(8801),
281             'le' => chr(8804),
282             'ge' => chr(8805),
283             'sub' => chr(8834),
284             'sup' => chr(8835),
285             'nsub' => chr(8836),
286             'sube' => chr(8838),
287             'supe' => chr(8839),
288             'oplus' => chr(8853),
289             'otimes' => chr(8855),
290             'perp' => chr(8869),
291             'sdot' => chr(8901),
292            
293             # Miscellaneous Technical
294             # * lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
295             # * rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
296             'lceil' => chr(8968),
297             'rceil' => chr(8969),
298             'lfloor' => chr(8970),
299             'rfloor' => chr(8971),
300             'lang' => chr(9001),
301             'rang' => chr(9002),
302            
303             # Geometric Shapes
304             'loz' => chr(9674),
305            
306             # Miscellaneous Symbols
307             # * black here seems to mean filled as opposed to hollow
308             'spades' => chr(9824),
309             'clubs' => chr(9827),
310             'hearts' => chr(9829),
311             'diams' => chr(9830),
312             );
313              
314              
315             sub process
316             {
317 6 50   6 0 17 (@_ == 2) or warn "MKDoc::XML::Encode::process() should be called with two arguments";
318 6         7 my $class = shift;
319 6         8 my $stuff = shift;
320 6         21 return $ENTITY_2_CHAR{$stuff};
321             }
322              
323              
324             1;