File Coverage

blib/lib/HTML/Detergent.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package HTML::Detergent;
2              
3 1     1   25417 use 5.010;
  1         4  
  1         39  
4 1     1   5 use strict;
  1         2  
  1         35  
5 1     1   6 use warnings FATAL => 'all';
  1         7  
  1         49  
6              
7 1     1   979 use Moose;
  1         772658  
  1         11  
8 1     1   10012 use namespace::autoclean;
  1         1622  
  1         6  
9              
10 1     1   58 use Scalar::Util ();
  1         2  
  1         17  
11 1     1   1136 use URI ();
  1         5596  
  1         26  
12 1     1   448 use XML::LibXML ();
  0            
  0            
13             use XML::LibXSLT ();
14             use XML::LibXML::LazyBuilder qw(DOM E);
15              
16             use HTML::Detergent::Config qw(Config);
17              
18             has config => (
19             is => 'ro',
20             isa => Config,
21             coerce => 1,
22             required => 1,
23             );
24              
25             has xml_parser => (
26             is => 'ro',
27             isa => 'XML::LibXML',
28             default => sub { XML::LibXML->new(recover => 2) },
29             lazy => 1,
30             );
31              
32             # has html_parser => (
33             # is => 'ro',
34             # isa => 'HTML::HTML5::Parser',
35             # default => sub { require HTML::HTML5::Parser; HTML::HTML5::Parser->new },
36             # lazy => 1,
37             # );
38              
39             has xpc => (
40             is => 'ro',
41             isa => 'XML::LibXML::XPathContext',
42             default => sub {
43             my $xpc = XML::LibXML::XPathContext->new;
44             $xpc->registerNs('html' => 'http://www.w3.org/1999/xhtml');
45             return $xpc;
46             },
47             );
48              
49              
50              
51             # XXX huh
52             my %LINKS = (
53             head => { profile => 1, },
54             script => { src => 1, },
55             a => { href => 1, },
56             area => { href => 1, },
57             link => { href => 1, },
58             form => { action => 1, },
59             blockquote => { cite => 1, },
60             q => { cite => 1, },
61             ins => { cite => 1, },
62             del => { cite => 1, },
63             frame => { src => 1,
64             longdesc => 1, },
65             iframe => { src => 1,
66             longdesc => 1, },
67             img => { src => 1,
68             longdesc => 1,
69             usemap => 1, },
70             object => { data => 1,
71             classid => 1,
72             codebase => 1,
73             usemap => 1, },
74             input => { src => 1,
75             usemap => 1, },
76             );
77              
78             # turn aforementioned into a single xpath statement
79             my $LINKXP = join('|', map {
80             sprintf('//html:%s[%s]', $_, join('|', map { "\@$_" } keys %{$LINKS{$_}} ))
81             } keys %LINKS);
82              
83             =head1 NAME
84              
85             HTML::Detergent - Clean the gunk off an HTML document
86              
87             =head1 VERSION
88              
89             Version 0.06
90              
91             =cut
92              
93             our $VERSION = '0.06';
94              
95             =head1 SYNOPSIS
96              
97             use HTML::Detergent;
98              
99             my $scrubber = HTML::Detergent->new($config);
100              
101             # $input can be a string, GLOB reference, or XML::LibXML::Document
102              
103             my $doc = $scrubber->process($input, $uri);
104              
105             =head1 DESCRIPTION
106              
107             L<HTML::Detergent> is for isolating the main content of an HTML page,
108             stripping it of navigation, visual design, and other ancillary content.
109              
110             The main purpose of this module is to aid in the migration of web
111             content from one content management system to another. It is also
112             useful for preparing HTML resources for automated content inventories.
113              
114             The module currently has no heuristics for determining the main
115             content of a page. It works instead by assuming prior knowledge of the
116             layout, given in the configuration by an XPath expression that
117             uniquely isolates the container node. That node is then lifted into a
118             new document, along with the contents of the C<E<lt>headE<gt>>, and
119             returned by the L</process> method. To accommodate multiple layouts on
120             a site, the module can be initialized to match multiple XPath
121             expressions. If further processing is necessary, an expression can be
122             associated with an XSLT stylesheet, which is assumed to produce an
123             entire document, thus overriding the default behaviour.
124              
125             After the new document is generated and before it is returned by
126             L</process>, it is possible to inject C<E<lt>linkE<gt>> and
127             C<E<lt>metaE<gt>> elements into the C<E<lt>headE<gt>>. This enables
128             the inclusion of metadata and the re-association of the main content
129             with links that represent aspects of the page which have been removed
130             (e.g. navigation, copyright statement, etc.). In addition, if the
131             page's URI is supplied to the L</process> method, the
132             C<E<lt>baseE<gt>> element is either added or rewritten to reflect it,
133             and the URI attributes in the body are rewritten relative to the base.
134             Otherwise they are left alone.
135              
136             The document returned is an L<XML::LibXML::Document> object using the
137             XHTML namespace, C<http://www.w3.org/1999/xhtml>, but does not profess
138             to validate against any particular schema. If DTD declarations
139             (including the empty C<E<lt>!DOCTYPE htmlE<gt>> recommended in HTML5)
140             are desired, they can be added on afterward. Likewise, the object can
141             be converted from XML into HTML using L<XML::LibXML::Document/toStringHTML>.
142              
143             =head1 METHODS
144              
145             =head2 new %CONFIG | \%CONFIG | $CONFIG
146              
147             Initialize the processor, either with a list of configuration
148             parameters, a HASH reference thereof, or an L<HTML::Detergent::Config>
149             object. Below are the valid parameters:
150              
151             =over 4
152              
153             =item match
154              
155             This is an ARRAY reference of XPath expressions to try against the
156             document, in order of preference. Entries optionally may be
157             two-element ARRAY references themselves, the second element being a
158             URL where an XSLT stylesheet may be found.
159              
160             match => [ '/some/xpath/expression',
161             [ '/other/expr', '/url/of/transform.xsl' ],
162             ],
163              
164             =item link
165              
166             This is a HASH reference where the keys correspond to C<rel>
167             attributes and the values to C<href> attributes of C<E<lt>linkE<gt>>
168             elements. If the values are ARRAY references, they will be processed
169             in document order. C<rel> attributes will be sorted lexically. If a
170             callback is supplied instead, the caller expects a result of the same
171             form.
172              
173             link => { rel1 => 'href1', rel2 => [ qw(href2 href3) ] },
174              
175             # or
176              
177             link => \&_link_cb,
178              
179             =item meta
180              
181             This is a HASH reference where the keys correspond to C<name>
182             attributes and the values to C<content> attributes of
183             C<E<lt>metaE<gt>> elements. If the values are ARRAY references, they
184             will be processed in document order. C<name> attributes will be sorted
185             lexically. If a callback is supplied instead, the caller expects a
186             result of the same form.
187              
188             meta => { name1 => 'content1',
189             name2 => [ qw(content2 content3) ] },
190              
191             # or
192              
193             meta => \&_meta_cb,
194              
195             =item callback
196              
197             These callbacks will be passed into the internal L<XML::LibXSLT>
198             processor. See L<XML::LibXML::InputCallback> for details.
199              
200             callback => [ \&_match_cb, \&_open_cb, \&_read_cb, \&_close_cb ],
201              
202             # or
203              
204             callback => $icb, # isa XML::LibXML::InputCallback
205              
206             =back
207              
208             =cut
209              
210             around BUILDARGS => sub {
211             my $orig = shift;
212             my $class = shift;
213              
214             return $class->$orig(config => $_[0]) if ref $_[0];
215              
216             my %p = @_;
217             $class->$orig(config => \%p);
218             };
219              
220             my %SHEET;
221              
222             sub BUILD {
223             my $self = shift;
224              
225             my $xslt = XML::LibXSLT->new;
226             my $icb = $self->config->callback;
227             $xslt->input_callbacks($icb) if $icb;
228              
229             # cache stylesheets
230             for my $uri ($self->config->stylesheets) {
231             $SHEET{$uri} ||= $xslt->parse_stylesheet_file($uri);
232             #$SHEET{$uri} ||= $sheet;
233             }
234             }
235              
236             =head2 process $INPUT [, $URI, $CONFIG ]
237              
238             Processes C<$INPUT>, which may be a string, GLOB reference, or
239             L<XML::LibXML::Document> object. Returns an L<XML::LibXML::Document>
240             object with the changes mentioned in the L</DESCRIPTION>.
241              
242             =cut
243              
244             sub process {
245             my ($self, $input, $uri, @rest) = @_;
246              
247             if (my $ref = ref $input) {
248             if (Scalar::Util::reftype($input) eq 'GLOB') {
249             require HTML::HTML5::Parser;
250             my $p = HTML::HTML5::Parser->new(no_cache => 1);
251             $input = eval { $p->parse_fh($input) };
252             Carp::croak("Failed to parse X(HT)ML input: $@") if $@;
253             }
254             elsif (Scalar::Util::blessed($input)
255             and $input->isa('XML::LibXML::Document')) {
256             # do nothing
257             }
258             else {
259             Carp::croak("Don't know what to do with reference type $ref");
260             }
261             }
262             else {
263             require HTML::HTML5::Parser;
264             my $p = HTML::HTML5::Parser->new(no_cache => 1);
265             $input = eval { $p->parse_string($input) };
266             Carp::croak("Failed to parse X(HT)ML input: $@") if $@;
267             }
268              
269             my $xpc = $self->xpc;
270             my $doc;
271             for my $xpath ($self->config->matches) {
272             #warn $xpath;
273              
274             #warn substr($input->toString, 0, 100);
275              
276             my @body = $xpc->findnodes($xpath, $input);
277             #warn scalar @body;
278             @body or next;
279              
280             if (my $uri = $self->config->stylesheet($xpath)) {
281             #warn $uri;
282             $doc = $SHEET{$uri}->transform($input);
283             }
284             else {
285             my @head = map { $_->cloneNode(1) }
286             $xpc->findnodes('/html:html/html:head/*', $input);
287              
288             @body = map { $_->cloneNode(1) } @body;
289              
290             $doc = DOM E html => { xmlns => 'http://www.w3.org/1999/xhtml' },
291             (E head => {}, @head), (E body => {}, @body);
292             }
293              
294             last if $doc;
295             }
296              
297             return $input unless $doc;
298              
299             # don't do this if not an HTML doc
300             if (my ($head) = $xpc->findnodes('/html:html/html:head', $doc)) {
301             my ($firstl) = $xpc->findnodes('html:link', $head);
302             my $links = $self->config->links;
303             for my $k (keys %$links) {
304             for my $v (@{$links->{$k}}) {
305             # XXX abridge this
306             my $link = $doc->createElementNS
307             ('http://www.w3.org/1999/xhtml', 'link');
308             $link->setAttribute(rel => $k);
309             $link->setAttribute(href => $v);
310             if ($firstl) {
311             $head->insertBefore($link, $firstl);
312             }
313             else {
314             $head->appendChild($link);
315             }
316             }
317             }
318              
319             my $meta = $self->config->metadata;
320             my ($firstm) = $xpc->findnodes('html:link', $head);
321             for my $k (keys %$meta) {
322             for my $v (@{$meta->{$k}}) {
323             # XXX abridge this
324             my $meta = $doc->createElementNS
325             ('http://www.w3.org/1999/xhtml', 'meta');
326             $meta->setAttribute(name => $k);
327             $meta->setAttribute(content => $v);
328             if ($firstm) {
329             $head->insertBefore($meta, $firstm);
330             }
331             else {
332             $head->appendChild($meta);
333             }
334             }
335             }
336              
337             # XXX should this even be part of it?
338              
339             # do base set/url rewrite
340             if (defined $uri) {
341             # make sure it's a URI object
342             $uri = URI->new($uri) unless ref $uri;
343              
344             # start in this position
345             my $olduri = $uri;
346              
347             # try to find a <base> element
348             my ($base) = $xpc->findnodes('html:base[1]', $head);
349             if ($base) {
350             # rewrite the old base
351             $olduri = URI->new($base->getAttribute('href'));
352             }
353             else {
354             # make a new one if none found
355             $base = $doc->createElementNS
356             ('http://www.w3.org/1999/xhtml', 'base');
357             $head->appendChild($base);
358             }
359              
360             # set base to the new URI
361             $base->setAttribute(href => $uri);
362              
363             # now traverse the document looking for URI-like attributes
364             for my $node ($xpc->findnodes($LINKXP, $doc)) {
365             my $t = $LINKS{$node->localName};
366             for my $u (keys %$t) {
367             if (defined (my $a = $node->getAttribute($u))) {
368             # absolute against the old uri, relative to
369             # the new one
370             $a = URI->new_abs($a, $olduri);
371             $a = $a->rel($uri);
372             $node->setAttribute($u, $a);
373             }
374             }
375             }
376             }
377             }
378              
379             $doc;
380             }
381              
382              
383             =head1 AUTHOR
384              
385             Dorian Taylor, C<< <dorian at cpan.org> >>
386              
387             =head1 BUGS
388              
389             Please report any bugs or feature requests to C<bug-html-detergent at
390             rt.cpan.org>, or through the web interface at
391             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Detergent>. I
392             will be notified, and then you'll automatically be notified of
393             progress on your bug as I make changes.
394              
395             =head1 SUPPORT
396              
397             You can find documentation for this module with the perldoc command.
398              
399             perldoc HTML::Detergent
400              
401             You can also look for information at:
402              
403             =over 4
404              
405             =item * RT: CPAN's request tracker (report bugs here)
406              
407             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Detergent>
408              
409             =item * AnnoCPAN: Annotated CPAN documentation
410              
411             L<http://annocpan.org/dist/HTML-Detergent>
412              
413             =item * CPAN Ratings
414              
415             L<http://cpanratings.perl.org/d/HTML-Detergent>
416              
417             =item * Search CPAN
418              
419             L<http://search.cpan.org/dist/HTML-Detergent/>
420              
421             =back
422              
423             =head1 SEE ALSO
424              
425             =over 4
426              
427             =item L<XML::LibXML>
428              
429             =item L<XML::LibXSLT>
430              
431             =item L<HTML::HTML5::Parser>
432              
433             =back
434              
435             =head1 LICENSE AND COPYRIGHT
436              
437             Copyright 2013 Dorian Taylor.
438              
439             Licensed under the Apache License, Version 2.0 (the "License"); you
440             may not use this file except in compliance with the License. You may
441             obtain a copy of the License at
442             L<http://www.apache.org/licenses/LICENSE-2.0>
443              
444             Unless required by applicable law or agreed to in writing, software
445             distributed under the License is distributed on an "AS IS" BASIS,
446             WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
447             implied. See the License for the specific language governing
448             permissions and limitations under the License.
449              
450             =cut
451              
452             __PACKAGE__->meta->make_immutable;
453             no Moose;
454              
455             1; # End of HTML::Detergent