File Coverage

blib/lib/HTML/DOMbo.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1              
2             require 5;
3             # Time-stamp: "2005-01-04 21:16:40 AST"
4             package HTML::DOMbo;
5 1     1   7744 use strict;
  1         2  
  1         41  
6 1     1   4 use vars qw($VERSION);
  1         2  
  1         51  
7             $VERSION = '3.10';
8              
9 1     1   1460 use HTML::Element (); # just for sanity's sake
  0            
  0            
10             use XML::DOM; # import all the nice constants
11             use Carp ();
12              
13             BEGIN { eval('sub DEBUG () {0}') unless defined &DEBUG; }
14              
15             #---------------------------------------------------------------------------
16              
17             # Types of things to handle:
18             # UNKNOWN_NODE (0) The node type is unknown (not part of DOM)
19             #
20             # ELEMENT_NODE (1) The node is an Element.
21             # ATTRIBUTE_NODE (2) The node is an Attr.
22             # TEXT_NODE (3) The node is a Text node.
23             # CDATA_SECTION_NODE (4) The node is a CDATASection.
24             # ENTITY_REFERENCE_NODE (5) The node is an EntityReference.
25             # ENTITY_NODE (6) The node is an Entity.
26             # PROCESSING_INSTRUCTION_NODE (7) The node is a ProcessingInstruction.
27             # COMMENT_NODE (8) The node is a Comment.
28             # DOCUMENT_NODE (9) The node is a Document.
29             # DOCUMENT_TYPE_NODE (10) The node is a DocumentType.
30             # DOCUMENT_FRAGMENT_NODE (11) The node is a DocumentFragment.
31             # NOTATION_NODE (12) The node is a Notation.
32             #
33             # ELEMENT_DECL_NODE (13) The node is an ElementDecl (not part of DOM)
34             # ATT_DEF_NODE (14) The node is an AttDef (not part of DOM)
35             # XML_DECL_NODE (15) The node is an XMLDecl (not part of DOM)
36             # ATTLIST_DECL_NODE (16) The node is an AttlistDecl (not part of DOM)
37              
38             sub XML::DOM::Node::to_XML_Element {
39             require XML::Element;
40             if(@_ < 2) {
41             $_[0]->to_HTML_Element('XML::Element');
42             } else {
43             shift->to_HTML_Element(@_); # just an alias, then
44             }
45             }
46              
47             sub XML::DOM::Node::to_HTML_Element { # recursive method
48             my $in = $_[0];
49             my $element_class = ref($_[1] || '') || $_[1] || 'HTML::Element';
50              
51             #print "Input object $in\n";
52              
53             Carp::croak "What DOM node?" unless ref $in;
54             Carp::croak "$in isn't a DOM node" unless $in->can('getNodeType');
55            
56             my $type = $in->getNodeType;
57            
58             if($type == DOCUMENT_FRAGMENT_NODE) {
59             my(@c) = $type->getChildNodes;
60             if(wantarray) {
61             if(@c == 0) {
62             return();
63             } elsif(@c > 1) {
64             return map $_->to_HTML_Element($element_class), @c;
65             }
66             # else fall thru
67             } else {
68             if(@c == 0) {
69             return undef; # empty fragment!
70             } elsif(@c == 1) {
71             $in = $c[0];
72             $type = $in->getNodeType; #update
73             }
74             # else fall thru
75             }
76             }
77            
78             if($type == DOCUMENT_NODE) {
79             $in = $in->getDocumentElement()
80             || Carp::croak "Document has no DocumentElement?"; # sanity
81             $type = $in->getNodeType; #update
82             }
83            
84             my $out;
85             if($type == ELEMENT_NODE) {
86             # What did we ever do to deserve such a bungled mess as the DOM?
87             # The whole DOM looks like it was drafted in crayon-scrabbled Java
88             # pseudocode after a night spent huffing glue while listening to
89             # Def Leppard.
90             #
91             # Just look at this steaming mess of code it takes to get all an object's
92             # attributes!
93             my(@attrs);
94             my $attr_map = $in->getAttributes;
95             my $i;
96             my $this_attr;
97             if($attr_map and $i = $attr_map->getLength) {
98             for(my $j = 0; $j < $i; ++$j) {
99             $this_attr = $attr_map->item($j);
100             #print " <",$this_attr->getName,'><', $this_attr->getValue, ">\n";
101             push @attrs, $this_attr->getName, $this_attr->getValue;
102             }
103             }
104             $out = $element_class->new($in->getTagName(), @attrs);
105            
106             } elsif($type == TEXT_NODE
107             or $type == CDATA_SECTION_NODE
108             ) {
109             $out = $in->getNodeValue; # yes, just text!
110              
111             } elsif($type == ENTITY_REFERENCE_NODE) {
112             $out = $in->getData; # yes, just text!
113             $out = '' unless defined $out; # sanity
114             } elsif($type == COMMENT_NODE) {
115             $out = $element_class->new('~comment', 'text', $in->getData);
116            
117             } elsif($type == PROCESSING_INSTRUCTION_NODE) {
118             $out = $element_class->new('~pi',
119             'text', join(' ', $in->getTarget, $in->getData)
120             );
121            
122             } elsif($type == DOCUMENT_FRAGMENT_NODE) {
123             # a fake-o div.
124             $out = $element_class->new('div', '_implicit' => 1);
125            
126             } else {
127             #Carp::croak "I don't know how to handle objects like $in ($type)";
128             print "I don't know how to handle objects like $in ($type)" if $^W;
129             return;
130             }
131             #TODO: Declarations?
132            
133             # Now attach children
134             foreach my $c ($in->getChildNodes) {
135             die "Trying to put children on a CDATA, Text, or EntityReference node!"
136             unless ref $out;
137             # Sanity. But could entity references be children of Text?
138             $out->push_content( $c->to_HTML_Element ); # RECURSE!
139             }
140            
141             return $out;
142             }
143              
144             #---------------------------------------------------------------------------
145              
146             sub HTML::Element::to_XML_DOM { # recursive method
147             my($in, $doc) = @_;
148             Carp::croak "What element?" unless $in and ref $in;
149             $doc ||= XML::DOM::Document->new();
150              
151             my $out;
152             my $tag = $in->tag;
153             # Make a DOM clone of this node:
154             {
155             # Consider the different kinds of HTML::Element objects,
156             # which are distinguished not by their class, but by their
157             # "tag" (GI) attribute:
158              
159             DEBUG && print "+ $tag\n";
160             die "No tag for $in?" unless defined $tag and length $tag;
161             # enforce minimal sanity
162             my($k,$v); # scratch
163              
164             if($tag eq '~literal') {
165             Carp::croak "Can't put a ~literal into a DOM tree";
166             # No, it's not the same as a CDATA. ~literals are a hack.
167            
168             } elsif($tag eq '~declaration') {
169             # Might as well ignore?
170              
171             } elsif($tag eq '~pi') {
172             $k = $in->attr('text');
173             if($k =~ m<^\s*(\S+)\s+(.*)$>s) {
174             $out = $doc->createProcessingInstruction($1,$2);
175             } elsif($k =~ m<^\s*(\S+)>s) { # minimal sanity?
176             $out = $doc->createProcessingInstruction($1,'');
177             } else {
178             return; # give up
179             }
180              
181             } elsif($tag eq '~comment') {
182             $k = $in->attr('text');
183             $k = join(' ', @$k) if ref($k) eq 'ARRAY'; # never used?
184             $out = $doc->createComment($k);
185              
186             } else {
187             # It's a normal element!
188              
189             $out = $doc->createElement($tag);
190             # An exception will be thrown there if $tag isn't a legal
191             # XML element name.
192             my @attrs = $in->all_external_attr();
193             while(@attrs) {
194             ($k,$v) = splice @attrs,0,2;
195             next if $k eq '/'; # hack.
196             DEBUG && print " attr <$k><$v>\n";
197             $out->setAttribute($k,$v);
198             # An exception will be thrown there if $k isn't a legal
199             # attribute name.
200             }
201             }
202             }
203              
204             # Now, recursively, make and attach children.
205             {
206             my $new_c; #scratch
207             foreach my $c ($in->content_list) {
208             if(ref($c)) {
209             $new_c = $c->to_XML_DOM($doc) || next;
210             } else {
211             $new_c = $doc->createTextNode($c);
212             }
213             $out->appendChild($new_c); # and attach
214             }
215             # Could conceivably throw an exception if you've done
216             # something bone stupid like put a child under a
217             # comment node.
218             }
219            
220             DEBUG && print "- $tag\n";
221             return $out;
222             }
223              
224             #---------------------------------------------------------------------------
225              
226             1;
227              
228             __END__