line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package XHTML::Util; |
2
|
6
|
|
|
6
|
|
164010
|
use strict; |
|
6
|
|
|
|
|
18
|
|
|
6
|
|
|
|
|
203
|
|
3
|
6
|
|
|
6
|
|
33
|
use warnings; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
186
|
|
4
|
6
|
|
|
6
|
|
31
|
no warnings "uninitialized"; |
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
294
|
|
5
|
|
|
|
|
|
|
our $VERSION = "0.04"; |
6
|
6
|
|
|
6
|
|
7314
|
use Encode; |
|
6
|
|
|
|
|
95137
|
|
|
6
|
|
|
|
|
621
|
|
7
|
6
|
|
|
6
|
|
138
|
use Carp; # By verbosity? |
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
469
|
|
8
|
6
|
|
|
6
|
|
34
|
use Scalar::Util "blessed"; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
654
|
|
9
|
6
|
|
|
6
|
|
6924
|
use HTML::Tagset 3.02 (); |
|
6
|
|
|
|
|
12986
|
|
|
6
|
|
|
|
|
205
|
|
10
|
6
|
|
|
6
|
|
6673
|
use HTML::Entities; |
|
6
|
|
|
|
|
57376
|
|
|
6
|
|
|
|
|
728
|
|
11
|
6
|
|
|
6
|
|
8845
|
use XML::LibXML; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use HTML::Selector::XPath (); |
13
|
|
|
|
|
|
|
use HTML::TokeParser::Simple; |
14
|
|
|
|
|
|
|
# LWP::Simple, external styles |
15
|
|
|
|
|
|
|
use CSS::Tiny; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $isKnown = \%HTML::Tagset::isKnown; |
18
|
|
|
|
|
|
|
my $emptyElement = \%HTML::Tagset::emptyElement; |
19
|
|
|
|
|
|
|
#my $canTighten = \%HTML::Tagset::canTighten; |
20
|
|
|
|
|
|
|
#my $isHeadElement = \%HTML::Tagset::isHeadElement; |
21
|
|
|
|
|
|
|
my $isBodyElement = \%HTML::Tagset::isBodyElement; |
22
|
|
|
|
|
|
|
my $isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; |
23
|
|
|
|
|
|
|
#my $isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; |
24
|
|
|
|
|
|
|
#my $isList = \%HTML::Tagset::isList; |
25
|
|
|
|
|
|
|
#my $isTableElement = \%HTML::Tagset::isTableElement; |
26
|
|
|
|
|
|
|
my $isFormElement = \%HTML::Tagset::isFormElement; |
27
|
|
|
|
|
|
|
#my $p_closure_barriers = \@HTML::Tagset::p_closure_barriers; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Accommodate HTML::TokeParser's idea of a "tag." |
30
|
|
|
|
|
|
|
for my $t ( keys %{$emptyElement} ) { $isKnown->{"$t/"} = 1 } |
31
|
|
|
|
|
|
|
my $isBlockLevel = { map {; $_ => 1 } |
32
|
|
|
|
|
|
|
grep { ! ( $isPhraseMarkup->{$_} || $isFormElement->{$_} ) } |
33
|
|
|
|
|
|
|
keys %{$isBodyElement} |
34
|
|
|
|
|
|
|
}; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub new { |
37
|
|
|
|
|
|
|
my $class = shift; |
38
|
|
|
|
|
|
|
my $self = bless {}, $class; |
39
|
|
|
|
|
|
|
$self; |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub strip_tags { |
43
|
|
|
|
|
|
|
my $self = shift; |
44
|
|
|
|
|
|
|
my $content = shift; |
45
|
|
|
|
|
|
|
my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); |
46
|
|
|
|
|
|
|
carp "No selector was given to strip_tags" and return $content unless $xpath; |
47
|
|
|
|
|
|
|
my $root = blessed($content) =~ /\AXML::LibXML::/ ? |
48
|
|
|
|
|
|
|
$content : $self->_fragment_to_body_node($content); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
my $doc = $root->getOwnerDocument; |
51
|
|
|
|
|
|
|
for my $node ( $root->findnodes($xpath) ) |
52
|
|
|
|
|
|
|
{ |
53
|
|
|
|
|
|
|
my $fragment = $doc->createDocumentFragment; |
54
|
|
|
|
|
|
|
for my $n ( $node->childNodes ) |
55
|
|
|
|
|
|
|
{ |
56
|
|
|
|
|
|
|
$fragment->appendChild($n); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
$node->replaceNode($fragment); |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
my $out = ""; |
61
|
|
|
|
|
|
|
$out .= $_->serialize(1) for $root->childNodes; |
62
|
|
|
|
|
|
|
_trim($out); |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub _trim { |
66
|
|
|
|
|
|
|
s/\A\s+|\s+\z//g for @_; |
67
|
|
|
|
|
|
|
wantarray ? @_ : $_[0]; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub remove { # Synonymous for remove_nodes, all gone. |
71
|
|
|
|
|
|
|
my $self = shift; |
72
|
|
|
|
|
|
|
# my $content = shift; |
73
|
|
|
|
|
|
|
my $content = $self->_sanitize_fragment(shift) or return; |
74
|
|
|
|
|
|
|
my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); |
75
|
|
|
|
|
|
|
carp "No selector was given to strip_tags" and return $content unless $xpath; |
76
|
|
|
|
|
|
|
my $root = blessed($content) =~ /\AXML::LibXML::/ ? |
77
|
|
|
|
|
|
|
$content : $self->_fragment_to_body_node($content); |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
$_->parentNode->removeChild($_) for $root->findnodes($xpath); |
80
|
|
|
|
|
|
|
my $out = ""; |
81
|
|
|
|
|
|
|
$out .= $_->serialize(1) for $root->childNodes; |
82
|
|
|
|
|
|
|
_trim($out); |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# No... ? requires object->call shuffling to work : sub enpara_tag { +shift->{enpara_tag} = shift || "p"; } |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
sub enpara { |
88
|
|
|
|
|
|
|
my $self = shift; |
89
|
|
|
|
|
|
|
my $content = $self->_sanitize_fragment(shift) or return; |
90
|
|
|
|
|
|
|
my $selector = shift; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
my $root = blessed($content) eq 'XML::LibXML::Element' ? |
93
|
|
|
|
|
|
|
$content : $self->_fragment_to_body_node($content); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
$root->normalize; |
96
|
|
|
|
|
|
|
my $doc = $root->getOwnerDocument; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
if ( my $xpath = HTML::Selector::XPath::selector_to_xpath($selector) ) |
99
|
|
|
|
|
|
|
{ |
100
|
|
|
|
|
|
|
NODE: |
101
|
|
|
|
|
|
|
for my $designated_enpara ( $root->findnodes($xpath) ) |
102
|
|
|
|
|
|
|
{ |
103
|
|
|
|
|
|
|
next unless $designated_enpara->nodeType == 1; |
104
|
|
|
|
|
|
|
if ( $designated_enpara->nodeName eq 'pre' ) # I don't think so, honky. |
105
|
|
|
|
|
|
|
{ |
106
|
|
|
|
|
|
|
# Expand or leave it alone? or ->validate it...? |
107
|
|
|
|
|
|
|
carp "It makes no sense to enpara within a ; skipping"; |
108
|
|
|
|
|
|
|
next NODE; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
next unless $isBlockLevel->{$designated_enpara->nodeName}; |
111
|
|
|
|
|
|
|
_enpara_this_nodes_content($designated_enpara, $doc); |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
_enpara_this_nodes_content($root, $doc); |
115
|
|
|
|
|
|
|
my $out = ""; |
116
|
|
|
|
|
|
|
$out .= $_->serialize(1) for $root->childNodes; |
117
|
|
|
|
|
|
|
_trim($out); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub _enpara_this_nodes_content { |
121
|
|
|
|
|
|
|
my ( $parent, $doc ) = @_; |
122
|
|
|
|
|
|
|
my $lastChild = $parent->lastChild; |
123
|
|
|
|
|
|
|
my @naked_block; |
124
|
|
|
|
|
|
|
for my $node ( $parent->childNodes ) |
125
|
|
|
|
|
|
|
{ |
126
|
|
|
|
|
|
|
if ( $isBlockLevel->{$node->nodeName} |
127
|
|
|
|
|
|
|
or |
128
|
|
|
|
|
|
|
$node->nodeName eq "a" # special case block level, so IGNORE |
129
|
|
|
|
|
|
|
and |
130
|
|
|
|
|
|
|
grep { $_->nodeName eq "img" } $node->childNodes |
131
|
|
|
|
|
|
|
) |
132
|
|
|
|
|
|
|
{ |
133
|
|
|
|
|
|
|
next unless @naked_block; # nothing to enblock |
134
|
|
|
|
|
|
|
my $p = $doc->createElement("p"); |
135
|
|
|
|
|
|
|
$p->setAttribute("enpara","enpara"); |
136
|
|
|
|
|
|
|
$p->appendChild($_) for @naked_block; |
137
|
|
|
|
|
|
|
$parent->insertBefore( $p, $node ) |
138
|
|
|
|
|
|
|
if $p->textContent =~ /\S/; |
139
|
|
|
|
|
|
|
@naked_block = (); |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
elsif ( $node->nodeType == 3 |
142
|
|
|
|
|
|
|
and |
143
|
|
|
|
|
|
|
$node->nodeValue =~ /(?:[^\S\n]*\n){2,}/ |
144
|
|
|
|
|
|
|
) |
145
|
|
|
|
|
|
|
{ |
146
|
|
|
|
|
|
|
my $text = $node->nodeValue; |
147
|
|
|
|
|
|
|
my @text_part = map { $doc->createTextNode($_) } |
148
|
|
|
|
|
|
|
split /([^\S\n]*\n){2,}/, $text; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
my @new_node; |
151
|
|
|
|
|
|
|
for ( my $x = 0; $x < @text_part; $x++ ) |
152
|
|
|
|
|
|
|
{ |
153
|
|
|
|
|
|
|
if ( $text_part[$x]->nodeValue =~ /\S/ ) |
154
|
|
|
|
|
|
|
{ |
155
|
|
|
|
|
|
|
push @naked_block, $text_part[$x]; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
else # it's a blank newline node so _STOP_ |
158
|
|
|
|
|
|
|
{ |
159
|
|
|
|
|
|
|
next unless @naked_block; |
160
|
|
|
|
|
|
|
my $p = $doc->createElement("p"); |
161
|
|
|
|
|
|
|
$p->setAttribute("enpara","enpara"); |
162
|
|
|
|
|
|
|
$p->appendChild($_) for @naked_block; |
163
|
|
|
|
|
|
|
@naked_block = (); |
164
|
|
|
|
|
|
|
push @new_node, $p; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
if ( @new_node ) |
168
|
|
|
|
|
|
|
{ |
169
|
|
|
|
|
|
|
$parent->insertAfter($new_node[0], $node); |
170
|
|
|
|
|
|
|
for ( my $x = 1; $x < @new_node; $x++ ) |
171
|
|
|
|
|
|
|
{ |
172
|
|
|
|
|
|
|
$parent->insertAfter($new_node[$x], $new_node[$x-1]); |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
$node->unbindNode; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
else |
178
|
|
|
|
|
|
|
{ |
179
|
|
|
|
|
|
|
push @naked_block, $node; # if $node->nodeValue =~ /\S/; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
if ( $node->isSameNode( $lastChild ) |
183
|
|
|
|
|
|
|
and @naked_block ) |
184
|
|
|
|
|
|
|
{ |
185
|
|
|
|
|
|
|
my $p = $doc->createElement("p"); |
186
|
|
|
|
|
|
|
$p->setAttribute("enpara","enpara"); |
187
|
|
|
|
|
|
|
$p->appendChild($_) for ( @naked_block ); |
188
|
|
|
|
|
|
|
$parent->appendChild($p) if $p->textContent =~ /\S/; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
my $newline = $doc->createTextNode("\n"); |
193
|
|
|
|
|
|
|
my $br = $doc->createElement("br"); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
for my $p ( $parent->findnodes('//p[@enpara="enpara"]') ) |
196
|
|
|
|
|
|
|
{ |
197
|
|
|
|
|
|
|
$p->removeAttribute("enpara"); |
198
|
|
|
|
|
|
|
$parent->insertBefore( $newline->cloneNode, $p ); |
199
|
|
|
|
|
|
|
$parent->insertAfter( $newline->cloneNode, $p ); |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
my $frag = $doc->createDocumentFragment(); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
my @kids = $p->childNodes(); |
204
|
|
|
|
|
|
|
for ( my $i = 0; $i < @kids; $i++ ) |
205
|
|
|
|
|
|
|
{ |
206
|
|
|
|
|
|
|
my $kid = $kids[$i]; |
207
|
|
|
|
|
|
|
next unless $kid->nodeName eq "#text"; |
208
|
|
|
|
|
|
|
my $text = $kid->nodeValue; |
209
|
|
|
|
|
|
|
$text =~ s/\A\r?\n// if $i == 0; |
210
|
|
|
|
|
|
|
$text =~ s/\r?\n\z// if $i == $#kids; |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
my @lines = map { $doc->createTextNode($_) } |
213
|
|
|
|
|
|
|
split /(\r?\n)/, $text; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
for ( my $i = 0; $i < @lines; $i++ ) |
216
|
|
|
|
|
|
|
{ |
217
|
|
|
|
|
|
|
$frag->appendChild($lines[$i]); |
218
|
|
|
|
|
|
|
unless ( $i == $#lines |
219
|
|
|
|
|
|
|
or |
220
|
|
|
|
|
|
|
$lines[$i]->nodeValue =~ /\A\r?\n\z/ ) |
221
|
|
|
|
|
|
|
{ |
222
|
|
|
|
|
|
|
$frag->appendChild($br->cloneNode); |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
$kid->replaceNode($frag); |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
sub traverse { # traverse("/*") -> callback |
231
|
|
|
|
|
|
|
my ( $self, $selector, $callback ) = @_; |
232
|
|
|
|
|
|
|
croak "not implemented"; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub translate_tags { |
236
|
|
|
|
|
|
|
croak "not implemented"; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub remove_style { # (* or [list]) |
240
|
|
|
|
|
|
|
# just calls remove with args |
241
|
|
|
|
|
|
|
croak "not implemented"; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub inline_stylesheets { # (names/paths) / external sheets allowed. |
245
|
|
|
|
|
|
|
croak "not implemented"; |
246
|
|
|
|
|
|
|
my $self = shift; |
247
|
|
|
|
|
|
|
my $thing = shift; |
248
|
|
|
|
|
|
|
# :before and :after stuff is still missing |
249
|
|
|
|
|
|
|
# ?? |