File Coverage

blib/lib/HTML/Detergent.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package HTML::Detergent;
2
3	1	1	25417	use 5.010;
	1		4
	1		39
4	1	1	5	use strict;
	1		2
	1		35
5	1	1	6	use warnings FATAL => 'all';
	1		7
	1		49
6
7	1	1	979	use Moose;
	1		772658
	1		11
8	1	1	10012	use namespace::autoclean;
	1		1622
	1		6
9
10	1	1	58	use Scalar::Util ();
	1		2
	1		17
11	1	1	1136	use URI ();
	1		5596
	1		26
12	1	1	448	use XML::LibXML ();
	0
	0
13				use XML::LibXSLT ();
14				use XML::LibXML::LazyBuilder qw(DOM E);
15
16				use HTML::Detergent::Config qw(Config);
17
18				has config => (
19				is => 'ro',
20				isa => Config,
21				coerce => 1,
22				required => 1,
23				);
24
25				has xml_parser => (
26				is => 'ro',
27				isa => 'XML::LibXML',
28				default => sub { XML::LibXML->new(recover => 2) },
29				lazy => 1,
30				);
31
32				# has html_parser => (
33				# is => 'ro',
34				# isa => 'HTML::HTML5::Parser',
35				# default => sub { require HTML::HTML5::Parser; HTML::HTML5::Parser->new },
36				# lazy => 1,
37				# );
38
39				has xpc => (
40				is => 'ro',
41				isa => 'XML::LibXML::XPathContext',
42				default => sub {
43				my $xpc = XML::LibXML::XPathContext->new;
44				$xpc->registerNs('html' => 'http://www.w3.org/1999/xhtml');
45				return $xpc;
46				},
47				);
48
49
50
51				# XXX huh
52				my %LINKS = (
53				head => { profile => 1, },
54				script => { src => 1, },
55				a => { href => 1, },
56				area => { href => 1, },
57				link => { href => 1, },
58				form => { action => 1, },
59				blockquote => { cite => 1, },
60				q => { cite => 1, },
61				ins => { cite => 1, },
62				del => { cite => 1, },
63				frame => { src => 1,
64				longdesc => 1, },
65				iframe => { src => 1,
66				longdesc => 1, },
67				img => { src => 1,
68				longdesc => 1,
69				usemap => 1, },
70				object => { data => 1,
71				classid => 1,
72				codebase => 1,
73				usemap => 1, },
74				input => { src => 1,
75				usemap => 1, },
76				);
77
78				# turn aforementioned into a single xpath statement
79				my $LINKXP = join('\|', map {
80				sprintf('//html:%s[%s]', $_, join('\|', map { "\@$_" } keys %{$LINKS{$_}} ))
81				} keys %LINKS);
82
83				=head1 NAME
84
85				HTML::Detergent - Clean the gunk off an HTML document
86
87				=head1 VERSION
88
89				Version 0.06
90
91				=cut
92
93				our $VERSION = '0.06';
94
95				=head1 SYNOPSIS
96
97				use HTML::Detergent;
98
99				my $scrubber = HTML::Detergent->new($config);
100
101				# $input can be a string, GLOB reference, or XML::LibXML::Document
102
103				my $doc = $scrubber->process($input, $uri);
104
105				=head1 DESCRIPTION
106
107				L<HTML::Detergent> is for isolating the main content of an HTML page,
108				stripping it of navigation, visual design, and other ancillary content.
109
110				The main purpose of this module is to aid in the migration of web
111				content from one content management system to another. It is also
112				useful for preparing HTML resources for automated content inventories.
113
114				The module currently has no heuristics for determining the main
115				content of a page. It works instead by assuming prior knowledge of the
116				layout, given in the configuration by an XPath expression that
117				uniquely isolates the container node. That node is then lifted into a
118				new document, along with the contents of the C<E<lt>headE<gt>>, and
119				returned by the L</process> method. To accommodate multiple layouts on
120				a site, the module can be initialized to match multiple XPath
121				expressions. If further processing is necessary, an expression can be
122				associated with an XSLT stylesheet, which is assumed to produce an
123				entire document, thus overriding the default behaviour.
124
125				After the new document is generated and before it is returned by
126				L</process>, it is possible to inject C<E<lt>linkE<gt>> and
127				C<E<lt>metaE<gt>> elements into the C<E<lt>headE<gt>>. This enables
128				the inclusion of metadata and the re-association of the main content
129				with links that represent aspects of the page which have been removed
130				(e.g. navigation, copyright statement, etc.). In addition, if the
131				page's URI is supplied to the L</process> method, the
132				C<E<lt>baseE<gt>> element is either added or rewritten to reflect it,
133				and the URI attributes in the body are rewritten relative to the base.
134				Otherwise they are left alone.
135
136				The document returned is an L<XML::LibXML::Document> object using the
137				XHTML namespace, C<http://www.w3.org/1999/xhtml>, but does not profess
138				to validate against any particular schema. If DTD declarations
139				(including the empty C<E<lt>!DOCTYPE htmlE<gt>> recommended in HTML5)
140				are desired, they can be added on afterward. Likewise, the object can
141				be converted from XML into HTML using L<XML::LibXML::Document/toStringHTML>.
142
143				=head1 METHODS
144
145				=head2 new %CONFIG \| \%CONFIG \| $CONFIG
146
147				Initialize the processor, either with a list of configuration
148				parameters, a HASH reference thereof, or an L<HTML::Detergent::Config>
149				object. Below are the valid parameters:
150
151				=over 4
152
153				=item match
154
155				This is an ARRAY reference of XPath expressions to try against the
156				document, in order of preference. Entries optionally may be
157				two-element ARRAY references themselves, the second element being a
158				URL where an XSLT stylesheet may be found.
159
160				match => [ '/some/xpath/expression',
161				[ '/other/expr', '/url/of/transform.xsl' ],
162				],
163
164				=item link
165
166				This is a HASH reference where the keys correspond to C<rel>
167				attributes and the values to C<href> attributes of C<E<lt>linkE<gt>>
168				elements. If the values are ARRAY references, they will be processed
169				in document order. C<rel> attributes will be sorted lexically. If a
170				callback is supplied instead, the caller expects a result of the same
171				form.
172
173				link => { rel1 => 'href1', rel2 => [ qw(href2 href3) ] },
174
175				# or
176
177				link => \&_link_cb,
178
179				=item meta
180
181				This is a HASH reference where the keys correspond to C<name>
182				attributes and the values to C<content> attributes of
183				C<E<lt>metaE<gt>> elements. If the values are ARRAY references, they
184				will be processed in document order. C<name> attributes will be sorted
185				lexically. If a callback is supplied instead, the caller expects a
186				result of the same form.
187
188				meta => { name1 => 'content1',
189				name2 => [ qw(content2 content3) ] },
190
191				# or
192
193				meta => \&_meta_cb,
194
195				=item callback
196
197				These callbacks will be passed into the internal L<XML::LibXSLT>
198				processor. See L<XML::LibXML::InputCallback> for details.
199
200				callback => [ \&_match_cb, \&_open_cb, \&_read_cb, \&_close_cb ],
201
202				# or
203
204				callback => $icb, # isa XML::LibXML::InputCallback
205
206				=back
207
208				=cut
209
210				around BUILDARGS => sub {
211				my $orig = shift;
212				my $class = shift;
213
214				return $class->$orig(config => $_[0]) if ref $_[0];
215
216				my %p = @_;
217				$class->$orig(config => \%p);
218				};
219
220				my %SHEET;
221
222				sub BUILD {
223				my $self = shift;
224
225				my $xslt = XML::LibXSLT->new;
226				my $icb = $self->config->callback;
227				$xslt->input_callbacks($icb) if $icb;
228
229				# cache stylesheets
230				for my $uri ($self->config->stylesheets) {
231				$SHEET{$uri} \|\|= $xslt->parse_stylesheet_file($uri);
232				#$SHEET{$uri} \|\|= $sheet;
233				}
234				}
235
236				=head2 process $INPUT [, $URI, $CONFIG ]
237
238				Processes C<$INPUT>, which may be a string, GLOB reference, or
239				L<XML::LibXML::Document> object. Returns an L<XML::LibXML::Document>
240				object with the changes mentioned in the L</DESCRIPTION>.
241
242				=cut
243
244				sub process {
245				my ($self, $input, $uri, @rest) = @_;
246
247				if (my $ref = ref $input) {
248				if (Scalar::Util::reftype($input) eq 'GLOB') {
249				require HTML::HTML5::Parser;
250				my $p = HTML::HTML5::Parser->new(no_cache => 1);
251				$input = eval { $p->parse_fh($input) };
252				Carp::croak("Failed to parse X(HT)ML input: $@") if $@;
253				}
254				elsif (Scalar::Util::blessed($input)
255				and $input->isa('XML::LibXML::Document')) {
256				# do nothing
257				}
258				else {
259				Carp::croak("Don't know what to do with reference type $ref");
260				}
261				}
262				else {
263				require HTML::HTML5::Parser;
264				my $p = HTML::HTML5::Parser->new(no_cache => 1);
265				$input = eval { $p->parse_string($input) };
266				Carp::croak("Failed to parse X(HT)ML input: $@") if $@;
267				}
268
269				my $xpc = $self->xpc;
270				my $doc;
271				for my $xpath ($self->config->matches) {
272				#warn $xpath;
273
274				#warn substr($input->toString, 0, 100);
275
276				my @body = $xpc->findnodes($xpath, $input);
277				#warn scalar @body;
278				@body or next;
279
280				if (my $uri = $self->config->stylesheet($xpath)) {
281				#warn $uri;
282				$doc = $SHEET{$uri}->transform($input);
283				}
284				else {
285				my @head = map { $_->cloneNode(1) }
286				$xpc->findnodes('/html:html/html:head/*', $input);
287
288				@body = map { $_->cloneNode(1) } @body;
289
290				$doc = DOM E html => { xmlns => 'http://www.w3.org/1999/xhtml' },
291				(E head => {}, @head), (E body => {}, @body);
292				}
293
294				last if $doc;
295				}
296
297				return $input unless $doc;
298
299				# don't do this if not an HTML doc
300				if (my ($head) = $xpc->findnodes('/html:html/html:head', $doc)) {
301				my ($firstl) = $xpc->findnodes('html:link', $head);
302				my $links = $self->config->links;
303				for my $k (keys %$links) {
304				for my $v (@{$links->{$k}}) {
305				# XXX abridge this
306				my $link = $doc->createElementNS
307				('http://www.w3.org/1999/xhtml', 'link');
308				$link->setAttribute(rel => $k);
309				$link->setAttribute(href => $v);
310				if ($firstl) {
311				$head->insertBefore($link, $firstl);
312				}
313				else {
314				$head->appendChild($link);
315				}
316				}
317				}
318
319				my $meta = $self->config->metadata;
320				my ($firstm) = $xpc->findnodes('html:link', $head);
321				for my $k (keys %$meta) {
322				for my $v (@{$meta->{$k}}) {
323				# XXX abridge this
324				my $meta = $doc->createElementNS
325				('http://www.w3.org/1999/xhtml', 'meta');
326				$meta->setAttribute(name => $k);
327				$meta->setAttribute(content => $v);
328				if ($firstm) {
329				$head->insertBefore($meta, $firstm);
330				}
331				else {
332				$head->appendChild($meta);
333				}
334				}
335				}
336
337				# XXX should this even be part of it?
338
339				# do base set/url rewrite
340				if (defined $uri) {
341				# make sure it's a URI object
342				$uri = URI->new($uri) unless ref $uri;
343
344				# start in this position
345				my $olduri = $uri;
346
347				# try to find a <base> element
348				my ($base) = $xpc->findnodes('html:base[1]', $head);
349				if ($base) {
350				# rewrite the old base
351				$olduri = URI->new($base->getAttribute('href'));
352				}
353				else {
354				# make a new one if none found
355				$base = $doc->createElementNS
356				('http://www.w3.org/1999/xhtml', 'base');
357				$head->appendChild($base);
358				}
359
360				# set base to the new URI
361				$base->setAttribute(href => $uri);
362
363				# now traverse the document looking for URI-like attributes
364				for my $node ($xpc->findnodes($LINKXP, $doc)) {
365				my $t = $LINKS{$node->localName};
366				for my $u (keys %$t) {
367				if (defined (my $a = $node->getAttribute($u))) {
368				# absolute against the old uri, relative to
369				# the new one
370				$a = URI->new_abs($a, $olduri);
371				$a = $a->rel($uri);
372				$node->setAttribute($u, $a);
373				}
374				}
375				}
376				}
377				}
378
379				$doc;
380				}
381
382
383				=head1 AUTHOR
384
385				Dorian Taylor, C<< <dorian at cpan.org> >>
386
387				=head1 BUGS
388
389				Please report any bugs or feature requests to C<bug-html-detergent at
390				rt.cpan.org>, or through the web interface at
391				L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Detergent>. I
392				will be notified, and then you'll automatically be notified of
393				progress on your bug as I make changes.
394
395				=head1 SUPPORT
396
397				You can find documentation for this module with the perldoc command.
398
399				perldoc HTML::Detergent
400
401				You can also look for information at:
402
403				=over 4
404
405				=item * RT: CPAN's request tracker (report bugs here)
406
407				L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Detergent>
408
409				=item * AnnoCPAN: Annotated CPAN documentation
410
411				L<http://annocpan.org/dist/HTML-Detergent>
412
413				=item * CPAN Ratings
414
415				L<http://cpanratings.perl.org/d/HTML-Detergent>
416
417				=item * Search CPAN
418
419				L<http://search.cpan.org/dist/HTML-Detergent/>
420
421				=back
422
423				=head1 SEE ALSO
424
425				=over 4
426
427				=item L<XML::LibXML>
428
429				=item L<XML::LibXSLT>
430
431				=item L<HTML::HTML5::Parser>
432
433				=back
434
435				=head1 LICENSE AND COPYRIGHT
436
437				Copyright 2013 Dorian Taylor.
438
439				Licensed under the Apache License, Version 2.0 (the "License"); you
440				may not use this file except in compliance with the License. You may
441				obtain a copy of the License at
442				L<http://www.apache.org/licenses/LICENSE-2.0>
443
444				Unless required by applicable law or agreed to in writing, software
445				distributed under the License is distributed on an "AS IS" BASIS,
446				WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
447				implied. See the License for the specific language governing
448				permissions and limitations under the License.
449
450				=cut
451
452				__PACKAGE__->meta->make_immutable;
453				no Moose;
454
455				1; # End of HTML::Detergent