File Coverage

blib/lib/HTML/HTML5/ToText.pm
Criterion Covered Total %
statement 12 14 85.7
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 17 19 89.4


line stmt bran cond sub pod time code
1             package HTML::HTML5::ToText;
2              
3 1     1   30004 use 5.010;
  1         4  
  1         36  
4 1     1   927 use common::sense;
  1         10  
  1         6  
5 1     1   1170 use utf8;
  1         16  
  1         6  
6              
7             BEGIN { # package metadata is assigned at compile time
8 1     1   69 $HTML::HTML5::ToText::AUTHORITY = 'cpan:TOBYINK';
9 1         21 $HTML::HTML5::ToText::VERSION = '0.004';
10             }
11              
12 1     1   600 use Moose;
  0            
  0            
13             with 'MooseX::Traits'; # role behind with_traits/new_with_traits described in the POD
14              
15             has '+_trait_namespace' => ( # override the attribute inherited from the role
16             default => join('::', __PACKAGE__, 'Trait'), # i.e. "HTML::HTML5::ToText::Trait"
17             );
18              
19             use HTML::HTML5::Parser;
20             use XML::LibXML::PrettyPrint;
21              
22             BEGIN # install one upper-case method (A, DIV, BR, ...) per known HTML element name
23             {
24             my @noshow = qw[base basefont bgsound meta param script style];
25             my @empty = qw[br canvas col command embed frame hr
26             img is index isindex keygen link]; # 'isindex' added; 'is'/'index' (presumably a typo for it) kept so IS/INDEX stay defined
27             my @inline = qw[a abbr area b bdi bdo big button cite code dfn em font i
28             input kbd label mark meter nobr progress q rp rt ruby s
29             samp small span strike strong sub sup time tt u var wbr];
30             my @block = qw[address applet article aside audio blockquote body caption
31             center colgroup datalist del dir div dd details dl dt
32             fieldset figcaption figure footer form frameset h1 h2 h3
33             h4 h5 h6 head header hgroup html iframe ins legend li
34             listing map marquee menu nav noembed noframes noscript
35             object ol optgroup option p pre select section source summary
36             table tbody td tfoot th thead title tr track ul video];
37             my @dispatch = ([_inline => \@inline], [_block => \@block], # handler => element names,
38             [_empty => \@empty], [_noshow => \@noshow]); # installed in the same order as before
39             no strict 'refs'; # the glob assignment below uses symbolic references
40             foreach my $pair (@dispatch)
41             {
42             my ($handler, $names) = @$pair;
43             foreach my $name (@$names)
44             {
45             # Capture the lexical $name: the global $_ would be read at call time, not at definition time.
46             *{ uc $name } = sub { (shift)->$handler($name, @_) };
47             }
48             }
49             }
50              
51             sub process # Render $node as plain text; works as a class or an object method.
52             {
53             my ($self, $node, $no_clone) = @_;
54             $self = $self->new unless ref $self; # called on the class name: build an instance first
55            
56             if ($node->nodeName eq '#document')
57             {
58             $node = $node->documentElement; # a document is processed via its root element
59             }
60            
61             unless ($no_clone)
62             {
63             $node = $node->cloneNode(1); # work on a copy so the whitespace stripping below does not touch the caller's tree
64             }
65            
66             if ($node->isa('XML::LibXML::Element'))
67             {
68             XML::LibXML::PrettyPrint->new_for_html->strip_whitespace($node);
69             my $elem = uc $node->nodeName; # element handlers are named after the upper-cased tag
70             my $str = $self->$elem($node);
71             $str =~ s{ (^\n+) | (\n+$) }{}gx;
72             return "$str\n"; # leading/trailing newlines were stripped above; a single one is re-appended
73             }
74             elsif ($node->nodeName eq '#text')
75             {
76             return $node->data; # a bare text node is returned as-is
77             }
78             } # no explicit return for any other node type
79              
80             sub process_string # Parse an HTML string (arguments: string, URI) and hand the DOM to process().
81             {
82             shift->process(
83             HTML::HTML5::Parser->load_html(string => shift, URI => shift),
84             'no_clone', # the DOM was parsed just for this call, so process() need not clone it
85             );
86             }
87              
88             sub textnode # Text-node counterpart of the element methods; %args is accepted but unused here.
89             {
90             my ($self, $node, %args) = @_;
91             return $node->data; # the node's character data, unchanged
92             }
93              
94             sub _inline # Handler for inline elements: concatenates the rendered children with nothing added.
95             {
96             my ($self, $func, $node, %args) = @_; # $func is received but not used in this method
97            
98             my $return = '';
99             foreach my $kid ($node->childNodes)
100             {
101             if ($kid->nodeName eq '#text')
102             {
103             $return .= $self->textnode($kid, %args);
104             }
105             elsif ($kid->isa('XML::LibXML::Element'))
106             {
107             my $elem = uc $kid->nodeName; # dispatch to the child's own upper-case element method
108             $return .= $self->$elem($kid, %args);
109             }
110             } # children that are neither text nor elements contribute nothing
111            
112             $return;
113             }
114              
115             sub _block # Handler for block elements: rendered children wrapped in a leading and a trailing newline.
116             {
117             my ($self, $func, $node, %args) = @_; # $func is received but not used in this method
118            
119             my $return = "\n";
120             foreach my $kid ($node->childNodes)
121             {
122             if ($kid->nodeName eq '#text')
123             {
124             $return .= $self->textnode($kid, %args);
125             }
126             elsif ($kid->isa('XML::LibXML::Element'))
127             {
128             my $elem = uc $kid->nodeName; # dispatch to the child's own upper-case element method
129             my $str = $self->$elem($kid, %args);
130            
131             if ($str =~ m{^\n} and not $kid->previousSibling) # first child: its leading newline would double ours
132             {
133             $str =~ s{^\n}{};
134             }
135            
136             if ($str =~ m{\n$} and not $kid->nextSibling) # last child: its trailing newline would double ours
137             {
138             $str =~ s{\n$}{};
139             }
140            
141             $return .= $str;
142             }
143             }
144             $return .= "\n";
145            
146             $return;
147             }
148              
149             sub _empty # Handler for the elements listed in @empty: contributes no text.
150             {
151             return '';
152             }
153              
154             sub _noshow # Handler for the elements listed in @noshow: contributes no text.
155             {
156             return '';
157             }
158              
159             around BR => sub { "\n" }; # the wrapper never calls the wrapped method: BR yields a newline
160             around HR => sub { "\n" . ("-" x 8) . "\n" }; # HR yields a rule of eight hyphens on its own line
161              
162             __PACKAGE__
163             __END__
164              
165             =head1 NAME
166              
167             HTML::HTML5::ToText - convert HTML to plain text
168              
169             =head1 SYNOPSIS
170              
171             my $dom = HTML::HTML5::Parser->load_html(IO => \*STDIN);
172             print HTML::HTML5::ToText
173             ->with_traits(qw/ShowLinks ShowImages RenderTables/)
174             ->new()
175             ->process($dom);
176              
177             =head1 DESCRIPTION
178              
179             The L<HTML::HTML5::ToText> module itself produces a pretty boring conversion
180             of HTML to text, but thanks to L<Moose> and L<MooseX::Traits> it can easily
181             be composed with "traits" that improve the output.
182              
183             =head2 Compositor
184              
185             =over
186              
187             =item C<< with_traits(@traits) >>
188              
189             This class method creates a new class that composes C<HTML::HTML5::ToText>
190             with each trait given, returning the name of that class. That class will
191             be a subclass of C<HTML::HTML5::ToText>.
192              
193             Traits are taken to be in the "HTML::HTML5::ToText::Trait" namespace
194             unless overridden by prefixing the trait with "+".
195              
196             =back
197              
198             =head2 Constructors
199              
200             =over
201              
202             =item * C<< new(%attrs) >>
203              
204             Creates a new instance of the class.
205              
206             =item * C<< new_with_traits(traits => \@traits, %attrs) >>
207              
208             Shortcut for:
209              
210             HTML::HTML5::ToText->with_traits(@traits)->new(%attrs)
211              
212             =back
213              
214             =head2 Attributes
215              
216             As per usual for Moose classes, accessor methods are provided for each
217             attribute, and attributes may be set in the constructor.
218              
219             C<HTML::HTML5::ToText> does not actually provide any attributes, but
220             some traits may.
221              
222             =head2 Methods
223              
224             =over
225              
226             =item * C<< process($node) >>
227              
228             Processes an L<XML::LibXML::Node> and returns a string. May be called as a
229             class or object method.
230              
231             Because C<process> likes to perform some alterations to the DOM tree, as a
232             first stage it makes a clone of the DOM tree (so that it can leave the
233             original intact). If you don't care about any changes to the tree, and want
234             to save a bit of CPU, then you can suppress the cloning by passing a true
235             value as a second argument to C<process>.
236              
237             HTML::HTML5::ToText->process($node, 'no_clone')
238              
239             =item * C<< process_string($string) >>
240              
241             As per C<process>, but first parses the string with L<HTML::HTML5::Parser>.
242             The second argument (for cloning) does not exist as cloning is not needed in
243             this case.
244              
245             =back
246              
247             There are also methods named (in upper-case) after every element defined in
248             HTML5: C<< STRONG($node) >>, C<< DL($node) >>, C<< IMG($node) >> and so on,
249             which C<< process($node) >> delegates to; and a C<< textnode($node) >>
250             method which is the equivalent for text nodes. These are the methods which
251             traits tend to modify.
252              
253             =head1 EXTENDING
254              
255             L<MooseX::Traits> makes it pretty easy to cleanly extend this module. Say
256             for example, we want to add the feature where the HTML C<< <del> >> element
257             is output as the empty string. (The default behaviour treats it rather like
258             C<< <div> >>.)
259              
260             {
261             package Local::SkipDEL;
262             use Moose::Role;
263             override DEL => sub { '' };
264             }
265            
266             print HTML::HTML5::ToText
267             -> with_traits(qw/ShowLinks ShowImages +Local::SkipDEL/)
268             -> process_string($html);
269              
270             Or maybe we want to force C<< <big> >> elements into uppercase?
271              
272             {
273             package Local::Embiggen;
274             use Moose::Role;
275             around BIG => sub
276             {
277             my ($orig, $self, $elem) = @_;
278             return uc $self->$orig($elem);
279             };
280             }
281            
282             print HTML::HTML5::ToText
283             -> with_traits(qw/+Local::Embiggen/)
284             -> process_string($html);
285              
286             Share your examples of extending HTML::HTML5::ToText at
287             L<https://bitbucket.org/tobyink/p5-html-html5-totext/wiki/Extending>.
288              
289             =head1 BUGS
290              
291             Please report any bugs to
292             L<http://rt.cpan.org/Dist/Display.html?Queue=HTML-HTML5-ToText>.
293              
294             =head1 SEE ALSO
295              
296             L<HTML::HTML5::Parser>,
297             L<HTML::HTML5::Table>.
298              
299             L<HTML::HTML5::ToText::Trait::RenderTables>,
300             L<HTML::HTML5::ToText::Trait::ShowImages>,
301             L<HTML::HTML5::ToText::Trait::ShowLinks>,
302             L<HTML::HTML5::ToText::Trait::TextFormatting>.
303              
304             =head2 Similar Modules on CPAN
305              
306             =over
307              
308             =item * L<HTML::FormatText>
309              
310             About 15 years old, and still maintained, this falls into the "mature"
311             category. This module is based on L<HTML::Tree>, so its HTML parser may
312             not behave as closely to modern browsers as HTML::HTML5::Parser's parsing,
313             but its conversion to text seems somewhat better than HTML::HTML5::ToText's
314             default output (i.e. with no traits applied).
315              
316             At the time of writing, its bug queue on rt.cpan.org lists eight bugs, some
317             quite serious. However, since being taken over by its latest maintainer,
318             there seems to be progress being made on them.
319              
320             Fairly extensible, but not in the mix-and-match traits way allowed by
321             HTML::HTML5::ToText.
322              
323             =item * L<HTML::FormatText::WithLinks>
324              
325             An extension of HTML::FormatText.
326              
327             =item * L<HTML::FormatText::WithLinks::AndTables>
328              
329             An extension of HTML::FormatText::WithLinks.
330              
331             The code that deals with tables is pretty crude compared with
332             HTML::HTML5::ToText::Trait::RenderTables. It doesn't support C<colspan>,
333             C<rowspan>, or the C<< <th> >> element.
334              
335             =item * L<LEOCHARRE::HTML::Text>
336              
337             Very basic conversion; basically just tag stripping using regular expressions.
338              
339             =item * L<HTML::FormatExternal>
340              
341             Passes HTML through external command-line tools such as `lynx`. Obviously
342             this has limited portability.
343              
344             =back
345              
346             =head1 AUTHOR
347              
348             Toby Inkster E<lt>tobyink@cpan.orgE<gt>.
349              
350             =head1 THANKS
351              
352             Everyone behind Moose. No way I could have done all this in a few hours
353             without Moose's strange brand of meta-programming!
354              
355             =head1 COPYRIGHT AND LICENCE
356              
357             This software is copyright (c) 2012-2013 by Toby Inkster.
358              
359             This is free software; you can redistribute it and/or modify it under
360             the same terms as the Perl 5 programming language system itself.
361              
362             =head1 DISCLAIMER OF WARRANTIES
363              
364             THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
365             WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
366             MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
367