File Coverage

blib/lib/HTML/Tagset.pm
Criterion Covered Total %
statement 9 9 100.0
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 12 12 100.0


line stmt bran cond sub pod time code
1             package HTML::Tagset;
2              
3 2     2   397873 use strict;
  2         5  
  2         83  
4              
5             =head1 NAME
6              
7             HTML::Tagset - data tables useful in parsing HTML
8              
9             =head1 VERSION
10              
11             Version 3.20
12              
13             =cut
14              
15 2     2   11 use vars qw( $VERSION );
  2         4  
  2         167  
16              
17             $VERSION = '3.20';
18              
19             =head1 SYNOPSIS
20              
21             use HTML::Tagset;
22             # Then use any of the items in the HTML::Tagset package
23             # as need arises
24              
25             =head1 DESCRIPTION
26              
27             This module contains several data tables useful in various kinds of
28             HTML parsing operations.
29              
30             Note that all tag names used are lowercase.
31              
32             In the following documentation, a "hashset" is a hash being used as a
33             set -- the hash conveys that its keys are there, and the actual values
34             associated with the keys are not significant. (But what values are
35             there, are always true.)
36              
37             =cut
38              
39 2         1535 use vars qw(
40             $VERSION
41             %emptyElement %optionalEndTag %linkElements %boolean_attr
42             %isHeadElement %isBodyElement %isPhraseMarkup
43             %is_Possible_Strict_P_Content
44             %isHeadOrBodyElement
45             %isList %isTableElement %isFormElement
46             %isKnown %canTighten
47             @p_closure_barriers
48             %isCDATA_Parent
49 2     2   11 );
  2         7  
50              
51             =head1 VARIABLES
52              
53             Note that none of these variables are exported.
54              
55             =head2 hashset %HTML::Tagset::emptyElement
56              
57             This hashset has as keys the tag-names (GIs) of elements that cannot
58             have content. (For example, "base", "br", "hr".) So
59             C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
60             C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
61              
62             =cut
63              
64             %emptyElement = map {; $_ => 1 } qw(base link meta isindex
65             img br hr wbr
66             input area param
67             embed bgsound spacer
68             basefont col frame
69             ~comment ~literal
70             ~declaration ~pi
71             );
72             # The "~"-initial names are for pseudo-elements used by HTML::Entities
73             # and TreeBuilder
74              
75             =head2 hashset %HTML::Tagset::optionalEndTag
76              
77             This hashset lists tag-names for elements that can have content, but whose
78             end-tags are generally, "safely", omissible. Example:
79             C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
80              
81             =cut
82              
83             %optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td);
84              
85             =head2 hash %HTML::Tagset::linkElements
86              
87             Keys in this hash are tagnames for elements that might contain
88             links, and the value for each is a reference to an array of the names
89             of attributes whose values can be links.
90              
91             =cut
92              
93             %linkElements =
94             (
95             'a' => ['href'],
96             'applet' => ['archive', 'codebase', 'code'],
97             'area' => ['href'],
98             'base' => ['href'],
99             'bgsound' => ['src'],
100             'blockquote' => ['cite'],
101             'body' => ['background'],
102             'del' => ['cite'],
103             'embed' => ['pluginspage', 'src'],
104             'form' => ['action'],
105             'frame' => ['src', 'longdesc'],
106             'iframe' => ['src', 'longdesc'],
107             'ilayer' => ['background'],
108             'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
109             'input' => ['src', 'usemap'],
110             'ins' => ['cite'],
111             'isindex' => ['action'],
112             'head' => ['profile'],
113             'layer' => ['background', 'src'],
114             'link' => ['href'],
115             'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'],
116             'q' => ['cite'],
117             'script' => ['src', 'for'],
118             'table' => ['background'],
119             'td' => ['background'],
120             'th' => ['background'],
121             'tr' => ['background'],
122             'xmp' => ['href'],
123             );
124              
125             =head2 hash %HTML::Tagset::boolean_attr
126              
127             This hash (not hashset) lists what attributes of what elements can be
128             printed without showing the value (for example, the "noshade" attribute
129             of "hr" elements). For elements with only one such attribute, its value
130             is simply that attribute name. For elements with many such attributes,
131             the value is a reference to a hashset containing all such attributes.
132              
133             =cut
134              
135             %boolean_attr = (
136             # TODO: make these all hashes
137             'area' => 'nohref',
138             'dir' => 'compact',
139             'dl' => 'compact',
140             'hr' => 'noshade',
141             'img' => 'ismap',
142             'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
143             'menu' => 'compact',
144             'ol' => 'compact',
145             'option' => 'selected',
146             'select' => 'multiple',
147             'td' => 'nowrap',
148             'th' => 'nowrap',
149             'ul' => 'compact',
150             );
151              
152             #==========================================================================
153             # List of all elements from Extensible HTML version 1.0 Transitional DTD:
154             #
155             # a abbr acronym address applet area b base basefont bdo big
156             # blockquote body br button caption center cite code col colgroup
157             # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
158             # head hr html i iframe img input ins isindex kbd label legend li
159             # link map menu meta noframes noscript object ol optgroup option p
160             # param pre q s samp script select small span strike strong style
161             # sub sup table tbody td textarea tfoot th thead title tr tt u ul
162             # var
163             #
164             # Varia from Mozilla source internal table of tags:
165             # Implemented:
166             # xmp listing wbr nobr frame frameset noframes ilayer
167             # layer nolayer spacer embed multicol
168             # But these are unimplemented:
169             # sound?? keygen?? server??
170             # Also seen here and there:
171             # marquee?? app?? (both unimplemented)
172             #==========================================================================
173              
174             =head2 hashset %HTML::Tagset::isPhraseMarkup
175              
176             This hashset contains all phrasal-level elements.
177              
178             =cut
179              
180             %isPhraseMarkup = map {; $_ => 1 } qw(
181             span abbr acronym q sub sup
182             cite code em kbd samp strong var dfn strike
183             b i u s tt small big
184             a img br
185             wbr nobr blink
186             font basefont bdo
187             spacer embed noembed
188             ); # had: center, hr, table
189              
190              
191             =head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
192              
193             This hashset contains all phrasal-level elements that can be content of a
194             P element, for a strict model of HTML.
195              
196             =cut
197              
198             %is_Possible_Strict_P_Content = (
199             %isPhraseMarkup,
200             %isFormElement,
201             map {; $_ => 1} qw( object script map )
202             # I've no idea why there's these latter exceptions.
203             # I'm just following the HTML4.01 DTD.
204             );
205              
206             #from html4 strict:
207             #
208             #
209             #
210             # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
211             #
212             #
213             # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
214             #
215             #
216             #
217             #
218             #
219              
220             =head2 hashset %HTML::Tagset::isHeadElement
221              
222             This hashset contains all elements that should be
223             present only in the 'head' element of an HTML document.
224              
225             =cut
226              
227             %isHeadElement = map {; $_ => 1 }
228             qw(title base link meta isindex script style object bgsound);
229              
230             =head2 hashset %HTML::Tagset::isList
231              
232             This hashset contains all elements that can contain "li" elements.
233              
234             =cut
235              
236             %isList = map {; $_ => 1 } qw(ul ol dir menu);
237              
238             =head2 hashset %HTML::Tagset::isTableElement
239              
240             This hashset contains all elements that are to be found only in/under
241             a "table" element.
242              
243             =cut
244              
245             %isTableElement = map {; $_ => 1 }
246             qw(tr td th thead tbody tfoot caption col colgroup);
247              
248             =head2 hashset %HTML::Tagset::isFormElement
249              
250             This hashset contains all elements that are to be found only in/under
251             a "form" element.
252              
253             =cut
254              
255             %isFormElement = map {; $_ => 1 }
256             qw(input select option optgroup textarea button label);
257              
258             =head2 hashset %HTML::Tagset::isBodyElement
259              
260             This hashset contains all elements that are to be found only in/under
261             the "body" element of an HTML document.
262              
263             =cut
264              
265             %isBodyElement = map {; $_ => 1 } qw(
266             h1 h2 h3 h4 h5 h6
267             p div pre plaintext address blockquote
268             xmp listing
269             center
270              
271             multicol
272             iframe ilayer nolayer
273             bgsound
274              
275             hr
276             ol ul dir menu li
277             dl dt dd
278             ins del
279            
280             fieldset legend
281            
282             map area
283             applet param object
284             isindex script noscript
285             table
286             center
287             form
288             ),
289             keys %isFormElement,
290             keys %isPhraseMarkup, # And everything phrasal
291             keys %isTableElement,
292             ;
293              
294              
295             =head2 hashset %HTML::Tagset::isHeadOrBodyElement
296              
297             This hashset includes all elements that I notice can fall either in
298             the head or in the body.
299              
300             =cut
301              
302             %isHeadOrBodyElement = map {; $_ => 1 }
303             qw(script isindex style object map area param noscript bgsound);
304             # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
305              
306              
307             =head2 hashset %HTML::Tagset::isKnown
308              
309             This hashset lists all known HTML elements.
310              
311             =cut
312              
313             %isKnown = (%isHeadElement, %isBodyElement,
314             map{; $_=>1 }
315             qw( head body html
316             frame frameset noframes
317             ~comment ~pi ~directive ~literal
318             ));
319             # that should be all known tags ever ever
320              
321              
322             =head2 hashset %HTML::Tagset::canTighten
323              
324             This hashset lists elements that might have ignorable whitespace as
325             children or siblings.
326              
327             =cut
328              
329             %canTighten = %isKnown;
330             delete @canTighten{
331             keys(%isPhraseMarkup), 'input', 'select',
332             'xmp', 'listing', 'plaintext', 'pre',
333             };
334             # xmp, listing, plaintext, and pre are untightenable, and
335             # in a really special way.
336             @canTighten{'hr','br'} = (1,1);
337             # exceptional 'phrasal' things that ARE subject to tightening.
338              
339             # The one case where I can think of my tightening rules failing is:
340             #

foo bar

baz quux ...
341             # ^-- that would get deleted.
342             # But that's pretty gruesome code anyhow. You gets what you pays for.
343              
344             #==========================================================================
345              
346             =head2 array @HTML::Tagset::p_closure_barriers
347              
348             This array has a meaning that I have only seen a need for in
349             C<HTML::TreeBuilder>, but I include it here on the off chance that someone
350             might find it of use:
351              
352             When we see a "E<lt>pE<gt>" token, we go look up the lineage for a p
353             element we might have to minimize. At first sight, we might say that
354             if there's a p anywhere in the lineage of this new p, it should be
355             closed. But that's wrong. Consider this document:
356              
357            
358            
359             foo
360            
361            
362            

foo

363            
364            
365            
366             foo
367            

bar

368            
369            
370            
371            

372            
373            
374              
375             The second p is quite legally inside a much higher p.
376              
377             My formalization of the reason why this is legal, but this:
378              
379            

foo

bar

380              
381             isn't, is that something about the table constitutes a "barrier" to
382             the application of the rule about what p must minimize.
383              
384             So C<@HTML::Tagset::p_closure_barriers> is the list of all such
385             barrier-tags.
386              
387             =cut
388              
389             @p_closure_barriers = qw(
390             li blockquote
391             ul ol menu dir
392             dl dt dd
393             td th tr table caption
394             div
395             );
396              
397             # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
398             # monkey business of barriers to minimization!
399              
400             =head2 hashset %isCDATA_Parent
401              
402             This hashset includes all elements whose content is CDATA.
403              
404             =cut
405              
406             %isCDATA_Parent = map {; $_ => 1 }
407             qw(script style xmp listing plaintext);
408              
409             # TODO: there's nothing else that takes CDATA children, right?
410              
411             # As the HTML3 DTD (Raggett 1995-04-24) noted:
412             # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
413             # and derive from very early versions of HTML. They require non-
414             # standard parsers and will cause problems for processing
415             # documents with standard SGML tools.
416              
417              
418             =head1 CAVEATS
419              
420             You may find it useful to alter the behavior of modules (like
421             C<HTML::TreeBuilder> or C<HTML::Element>) that use C<HTML::Tagset>'s
422             data tables by altering the data tables themselves. You are welcome
423             to try, but be careful; and be aware that different modules may or may
424             not react differently to the data tables being changed.
425              
426             Note that it may be inappropriate to use these tables for I<producing>
427             HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
428             for all elements that can appear either in the head or in the body,
429             such as "script". That doesn't mean that I am saying your code that
430             produces HTML should feel free to put script elements in either place!
431             If you are producing programs that spit out HTML, you should be
432             I<very> familiar with the DTDs for HTML or XHTML (available at
433             C<http://www.w3.org/>), and you should slavishly obey them, not
434             the data tables in this document.
435              
436             =head1 SEE ALSO
437              
438             L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
439              
440             =head1 COPYRIGHT & LICENSE
441              
442             Copyright 1995-2000 Gisle Aas.
443              
444             Copyright 2000-2005 Sean M. Burke.
445              
446             Copyright 2005-2008 Andy Lester.
447              
448             This program is free software; you can redistribute it and/or modify it
449             under the same terms as Perl itself.
450              
451             =head1 ACKNOWLEDGEMENTS
452              
453             Most of the code/data in this module was adapted from code written
454             by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
455             C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
456              
457             =head1 AUTHOR
458              
459             Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
460              
461             =head1 BUGS
462              
463             Please report any bugs or feature requests to
464             C<bug-html-tagset at rt.cpan.org>, or through the web interface at
465             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
466             be notified, and then you'll automatically be notified of progress on
467             your bug as I make changes.
468              
469             =cut
470              
471             1;