File Coverage

blib/lib/HTML/Truncate.pm
Criterion Covered Total %
statement 151 174 86.7
branch 59 84 70.2
condition 6 15 40.0
subroutine 20 24 83.3
pod 11 11 100.0
total 247 308 80.1


line stmt bran cond sub pod time code
1             package HTML::Truncate;
2              
3 9     9   303047 use 5.008;
  9         33  
  9         372  
4 9     9   50 use strict;
  9         18  
  9         271  
5 9     9   64 use warnings;
  9         22  
  9         339  
6 9     9   52 no warnings "uninitialized";
  9         17  
  9         353  
7              
8 9     9   70189 use HTML::TokeParser;
  9         298846  
  9         370  
9 9     9   80 use HTML::Tagset ();
  9         81  
  9         167  
10 9     9   55 use HTML::Entities ();
  9         1295  
  9         159  
11 9     9   60 use Carp;
  9         18  
  9         3361  
12 9     9   94 use List::Util qw( first );
  9         15  
  9         27969  
13              
14             =head1 NAME
15              
16             HTML::Truncate - (beta software) truncate HTML by percentage or character count while preserving well-formedness.
17              
18             =head1 VERSION
19              
20             0.20
21              
22             =cut
23              
24             our $VERSION = "0.20";
25              
26             =head1 ABSTRACT
27              
28             When working with text it is common to want to truncate strings to make them fit a desired context. E.g., you might have a menu that is only 100px wide and prefer text doesn't wrap so you'd truncate it around 15-30 characters, depending on preference and typeface size. This is trivial with plain text using L but with HTML it is somewhat difficult because whitespace has fluid significance and open tags that are not properly closed destroy well-formedness and can wreck an entire layout.
29              
30             L attempts to account for those two problems by padding truncation for spacing and entities and closing any tags that remain open at the point of truncation.
31              
32             =head1 SYNOPSIS
33              
34             use strict;
35             use HTML::Truncate;
36              
37             my $html = '

We have to test something.

';
38             my $readmore = '... [readmore]';
39              
40             my $html_truncate = HTML::Truncate->new();
41             $html_truncate->chars(20);
42             $html_truncate->ellipsis($readmore);
43             print $html_truncate->truncate($html);
44              
45             # or
46              
47             use Encode;
48             my $ht = HTML::Truncate->new( utf8_mode => 1,
49             chars => 1_000,
50             );
51             print Encode::encode_utf8( $ht->truncate($html) );
52              
53             =head1 XHTML
54              
55             This module is designed to work with XHTML-style nested tags. More
56             below.
57              
58             =head1 WHITESPACE AND ENTITIES
59              
60             Repeated natural whitespace (i.e., "\s+" and not "   ") in HTML
61             -- with rare exception (pre tags or user defined styles) -- is not
62             meaningful. Therefore it is normalized when truncating. Entities are
63             also normalized. The following is only counted 14 chars long.
64              
65             \n

\nthis is ‘text’\n\n

66             ^^^^^^^12345----678--9------01234------^^^^^^^^
67              
68             =head1 METHODS
69              
70             =over 4
71              
72             =item B
73              
74             Can take all the methods as hash style args. "percent" and "chars" are
75             incompatible so don't use them both. Whichever is set most recently
76             will erase the other.
77              
78             my $ht = HTML::Truncate->new(utf8_mode => 1,
79             chars => 500, # default is 100
80             );
81              
82             =cut
83              
84             our %skip = ( head => 1,
85             script => 1,
86             form => 1,
87             iframe => 1,
88             object => 1,
89             embed => 1,
90             title => 1,
91             style => 1,
92             base => 1,
93             link => 1,
94             meta => 1,
95             );
96              
97              
98             sub new {
99 9     9 1 12972 my $class = shift;
100              
101 9         137 my $self = bless
102             {
103             _chars => 100,
104             _percent => undef,
105             _cleanly => qr/[\s[:punct:]]+\z/,
106             _on_space => undef,
107             _utf8_mode => undef,
108             _ellipsis => '…',
109             _raw_html => '',
110             _repair => undef,
111             _skip_tags => \%skip,
112             }, $class;
113              
114 9         62 while ( my ( $k, $v ) = splice(@_, 0, 2) )
115             {
116 2 50       7 croak "No such method or attribute '$k'" unless exists $self->{"_$k"};
117 2         6 $self->$k($v);
118             }
119 9         65 return $self;
120             }
121              
122             =item B
123              
124             Set/get, true/false. If C is set, C is also
125             set in the underlying L, entities will be transformed
126             with L and the default ellipsis will be a
127             literal ellipsis and not the default of C<…>.
128              
129             =cut
130              
131             sub utf8_mode {
132 193     193 1 620 my $self = shift;
133 193 100       348 if ( @_ )
134             {
135 5         9 $self->{_utf8_mode} = shift;
136 5         21 return 1; # say we did it, even if setting untrue value
137             }
138             else
139             {
140 188         661 return $self->{_utf8_mode};
141             }
142             }
143              
144             =item B
145              
146             Set/get. The number of characters remaining after truncation,
147             B the L.
148              
149             Entities are counted as single characters. E.g., C<©> is one
150             character for truncation counts.
151              
152             Default is "100." Side-effect: clears any L that has been
153             set.
154              
155             =cut
156              
157             sub chars {
158 93     93 1 40751 my ( $self, $chars ) = @_;
159 93 100       287 return $self->{_chars} unless defined $chars;
160 87 50       281 $chars > 0 or croak "You must truncate to at least 1 character";
161 87 50       417 $chars =~ /^(?:[1-9][_\d]*|0)$/
162             or croak "Specified chars must be a number";
163 87         146 $self->{_percent} = undef; # no conflict allowed
164 87         244 $self->{_chars} = $chars;
165             }
166              
167             =item B
168              
169             Set/get. A percentage to keep while truncating the rest. For a
170             document of 1,000 chars, percent('15%') and chars(150) would be
171             equivalent. The actual amount of character that the percent represents
172             cannot be known until the given HTML is parsed.
173              
174             Side-effect: clears any L that has been set.
175              
176             =cut
177              
178             sub percent {
179 1     1 1 4 my ( $self, $percent ) = @_;
180              
181 1 50 33     9 return unless $self->{_percent} or $percent;
182              
183 1 50       4 return sprintf("%d%%", 100 * $self->{_percent})
184             unless $percent;
185              
186 1         6 my ( $temp_percent ) = $percent =~ /^(100|[1-9]?[0-9])\%$/;
187              
188 1 50 33     11 $temp_percent and $temp_percent != 0
189             or croak "Specified percent is invalid '$percent' -- 1\% - 100\%";
190              
191 1         3 $self->{_chars} = undef; # no conflict allowed
192 1         8 $self->{_percent} = $1 / 100;
193             }
194              
195             =item B
196              
197             Set/get. Ellipsis in this case means --
198              
199             The omission of a word or phrase necessary for a complete
200             syntactical construction but not necessary for understanding.
201             http://www.answers.com/topic/ellipsis
202              
203             What it will probably mean in most real applications is "read more."
204             The default is C<…> which if the utf8 flag is true will render
205             as a literal ellipsis, C.
206              
207             The reason the default is C<…> and not "..." is this is meant
208             for use in HTML environments, not plain text, and "..." (dot-dot-dot)
209             is not typographically correct or equivalent to a real horizontal
210             ellipsis character.
211              
212             =cut
213              
214             sub ellipsis {
215 94     94 1 144 my $self = shift;
216 94 100       260 if ( @_ )
    100          
217             {
218 5         37 $self->{_ellipsis} = shift;
219             }
220             elsif ( $self->utf8_mode() )
221             {
222 6         76 return HTML::Entities::decode($self->{_ellipsis});
223             }
224             else
225             {
226 83         178 return $self->{_ellipsis};
227             }
228             }
229              
230             =item B
231              
232             It returns the truncated XHTML if asked for a return value.
233              
234             my $truncated = $ht->truncate($html);
235              
236             It will truncate the string in place if no return value is expected
237             (L is not defined).
238              
239             $ht->truncate($html);
240             print $html;
241              
242             Also can be called with inline arguments-
243              
244             print $ht->truncate( $html,
245             $chars_or_percent,
246             $ellipsis );
247              
248             No arguments are strictly required. Without HTML to operate upon it
249             returns undef. The two optional arguments may be preset with the
250             methods L (or L) and L.
251              
252             Valid nesting of tags is required (alla XHTML). Therefore some old
253             HTML habits like EpE without a E/pE are not supported
254             and may cause a fatal error. See L for help with badly formed
255             HTML.
256              
257             Certain tags are omitted by default from the truncated output.
258              
259             =over 4
260              
261             =item * Skipped tags
262              
263             These will not be included in truncated output by default.
264              
265             ...
...
266             ...
267            
268              
269             =item * Tags allowed to self-close
270              
271             See L in L.
272              
273             =back
274              
275             =cut
276              
277             sub _chars_or_percent {
278 0     0   0 my ( $self, $which ) = @_;
279 0 0       0 if ( $which =~ /\%\z/ )
280             {
281 0         0 $self->percent($which);
282             }
283             else
284             {
285 0         0 $self->chars($which);
286             }
287             }
288              
289             sub truncate {
290 97     97 1 4684 my $self = shift;
291 97         188 $self->{_raw_html} = \$_[0];
292 97 50       209 shift || return;
293              
294 97 50       200 $self->_chars_or_percent(+shift) if @_;
295 97 50       236 $self->ellipsis(+shift) if @_;
296              
297 97         90 my @tag_q;
298 97         122 my $renew = "";
299 97         154 my $total = 0;
300 97         98 my $previous_token;
301             my $next_token;
302              
303             # my $tmp_ellipsis = $self->ellipsis;
304             # $tmp_ellipsis =~ s/<\w[^>]+>//g; # Naive html strip.
305             # HTML::Entities::encode($tmp_ellipsis);
306 97         561 my $chars = $self->{_chars};# + length $tmp_ellipsis;
307              
308 97         382 my $p = HTML::TokeParser->new( $self->{_raw_html} );
309 97         13011 $p->unbroken_text(1);
310 97         225 $p->utf8_mode( $self->utf8_mode );
311              
312             TOKEN:
313 97         343 while ( my $token = $p->get_token() )
314             {
315 617         7916 my @nexttoken;
316             NEXT_TOKEN:
317 617         1441 while ( my $next = $p->get_token() )
318             {
319 1439         8921 push @nexttoken, $next;
320 1439 100       4485 if ( $next->[0] eq 'S' )
321             {
322 489         504 $next_token = $next;
323 489         684 last NEXT_TOKEN;
324             }
325             }
326 617         2420 $p->unget_token(@nexttoken);
327 617 100       3613 $previous_token = $token if $token->[0] eq 'E';
328              
329             # print " Queue: ", join ":", @tag_q; print $/;
330             # print "Previous: $previous_token->[1]\n";
331             # print " IN: $token->[1]\n";
332             # print " Next: $next_token->[1]\n\n";
333              
334 617 100       1519 if ( $token->[0] eq 'S' )
    100          
    50          
335             {
336             # _callback_for...? 321
337 253         451 ( my $real_tag = $token->[1] ) =~ s,/\z,,;
338 253 50       598 next TOKEN if $self->{_skip_tags}{$real_tag};
339 253 100       812 push @tag_q, $token->[1] unless $HTML::Tagset::emptyElement{$real_tag};
340 253         1239 $renew .= $token->[-1];
341             }
342             elsif ( $token->[0] eq 'E' )
343             {
344 116 50       293 next TOKEN if $self->{_skip_tags}{$token->[1]};
345 116         167 my $open = pop @tag_q;
346 116         208 my $close = $token->[1];
347 116 100       227 unless ( $open eq $close )
348             {
349 8 50       17 if ( $self->{_repair} )
350             {
351 8         8 my @unmatched;
352 8 100       21 push @unmatched, $open if $open;
353 8         19 while ( my $temp = pop @tag_q )
354             {
355 8 100       15 if ( $temp eq $close )
356             {
357 5         14 while ( my $add = shift @unmatched )
358             {
359 8         23 $renew .= "";
360             }
361 5         9 $renew .= "";
362 5         24 next TOKEN;
363             }
364             else
365             {
366 3         9 push @unmatched, $temp;
367             }
368             }
369 3         3 push @tag_q, reverse @unmatched;
370 3         12 next TOKEN; # silently drop unmatched close tags
371             }
372             else
373             {
374 0         0 my $nearby = substr($renew,
375             length($renew) - 15,
376             15);
377 0         0 croak qq|<$open> closed by near "$nearby"|;
378             }
379             }
380 108         401 $renew .= $token->[-1];
381             }
382             elsif ( $token->[0] eq 'T' )
383             {
384 248 50       498 next TOKEN if $token->[2]; # DATA
385             # my $txt = HTML::Entities::decode($token->[1]);
386 248         313 my $txt = $token->[1];
387 248         261 my $current_length = 0;
388 248 100   416   1171 unless ( first { $_ eq 'pre' } @tag_q ) # We're not somewhere inside a
 
  416         749  
389             {
390 244         837 $txt =~ s/\s+/ /g;
391              
392 244 100 66     853 if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow
393             and
394             ! $HTML::Tagset::isPhraseMarkup{$previous_token->[1]}
395             )
396             {
397 10         36 $txt =~ s/\A +//;
398             }
399              
400 244 100 66     669 if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow
401             and
402             ! $HTML::Tagset::isPhraseMarkup{$next_token->[1]}
403             )
404             {
405 20         52 $txt =~ s/ +\z//;
406             }
407 244         427 $current_length = _count_visual_chars($txt);
408             }
409             else
410             {
411 4         5 $current_length = length($txt);
412             }
413              
414 248         603 $total += $current_length;
415              
416 248 100       410 if ( $total >= $chars )
417             {
418 85         98 $total -= $current_length;
419              
420 85         101 my $chars_to_keep = $chars - $total;
421 85         105 my $keep = "";
422 85 100       178 if ( $self->on_space )
423             {
424 26         727 ( $keep ) = $txt =~ /\A(.{0,$chars_to_keep}\s?)(?=\s|\z)/;
425 26         90 $keep =~ s/\s+\z//;
426             }
427             else
428             {
429 59         117 $keep = substr($txt, 0, $chars_to_keep);
430             }
431              
432 85 100       202 if ( my $cleaner = $self->cleanly )
433             {
434 56         241 $keep =~ s/$cleaner//;
435             }
436              
437 85 100       194 if ( $keep )
438             {
439             # $renew .= $self->utf8_mode ?
440             # $keep : HTML::Entities::encode($keep);
441 68         108 $renew .= $keep;
442             }
443              
444 85         194 $renew .= $self->ellipsis();
445 85         199 last TOKEN;
446             }
447             else
448             {
449 163         791 $renew .= $token->[1];
450             }
451             }
452             } # TOKEN block ends
453              
454 97         303 $renew .= join('', map {""} reverse @tag_q);
  125         349  
455              
456 97 50       194 if ( defined wantarray )
457             {
458 97         1323 return $renew;
459             }
460             else
461             {
462 0         0 ${$self->{_raw_html}} = $renew;
  0         0  
463             }
464             }
465              
466             =item B
467              
468             Put one or more new tags into the list of those to be omitted from
469             truncated output. An example of when you might like to use this is if
470             you're thumb-nailing articles and they start with C<<

title

>>
471             or such before the article body. The heading level would be absurd
472             with a list of excerpts so you could drop it completely this way--
473              
474             $ht->add_skip_tags( 'h1' );
475              
476             =cut
477              
478             sub add_skip_tags {
479 0     0 1 0 my $self = shift;
480 0         0 for ( @_ )
481             {
482 0 0       0 croak "Args to add_skip_tags must be scalar tag names, not references"
483             if ref $_;
484 0         0 $self->{_skip_tags}{$_} = 1;
485             }
486             }
487              
488             =item B
489              
490             Takes tags out of the current list to be omitted from truncated output.
491              
492             =cut
493              
494             sub dont_skip_tags {
495 0     0 1 0 my $self = shift;
496 0         0 for ( @_ )
497             {
498 0 0       0 croak "Args to dont_skip_tags must be scalar tag names, not references"
499             if ref $_;
500 0 0       0 carp "$_ was not set to be skipped"
501             unless delete $self->{_skip_tags}{$_};
502             }
503             }
504              
505             =item B
506              
507             Set/get, true/false. If true, will attempt to repair unclosed HTML
508             tags by adding close-tags as late as possible (eg. C<<
509             foobar >> becomes C<< foobar >>). Unmatched
510             close tags are dropped (C<< foobar >> becomes C<< foobar >>).
511              
512             =cut
513              
514             sub repair {
515 8     8 1 15 my $self = shift;
516 8 100       22 if ( @_ )
517             {
518 3         8 $self->{_repair} = shift;
519 3         7 return 1; # say we did it, even if untrue value
520             }
521             else
522             {
523 5         24 return $self->{_repair};
524             }
525             }
526              
527             sub _load_chars_from_percent {
528 0     0   0 my $self = shift;
529 0         0 my $p = HTML::TokeParser->new( $self->{_raw_html} );
530 0         0 my $txt_length = 0;
531              
532             CHARS:
533 0         0 while ( my $token = $p->get_token )
534             {
535             # don't check padding b/c we're going by a document average
536 0 0 0     0 next unless $token->[0] eq 'T' and not $token->[2]; # Not data.
537 0         0 $txt_length += _count_visual_chars( $token->[1] );
538             }
539 0         0 $self->chars( int( $txt_length * $self->{_percent} ) );
540             }
541              
542             sub _count_visual_chars { # private function
543 244     244   1068 my $to_count = HTML::Entities::decode_entities(+shift);
544 244         442 $to_count =~ s/\s\s+/ /g;
545 244         314 $to_count =~ s/[^[:print:]]+//g;
546             # my $count = () =
547             # $to_count =~
548             # /\&\#\d+;|\&[[:alpha:]]{2,5};|\S|\s+/g;
549             # return $count;
550 244         414 return length($to_count);
551             }
552              
553             # Need to put hooks for these or not? 321
554             #sub _default_image_callback {
555             # sub {
556             # '[image]'
557             # }
558             #}
559              
560             =item B
561              
562             This will make the truncation back up to the first space it finds so
563             it doesn't truncate in the the middle of a word. L runs
564             before L if both are set.
565              
566             =cut
567              
568             sub on_space {
569 86     86 1 113 my $self = shift;
570 86 100       160 if ( @_ )
571             {
572 1         2 $self->{_on_space} = shift;
573 1         7 return 1; # say we did it, even if setting untrue value
574             }
575             else
576             {
577 85         214 return $self->{_on_space};
578             }
579             }
580              
581              
582             =item B
583              
584             Set/get -- a regular expression. This is on by default and the default
585             cleaning regular expression is C. It
586             will make the truncation strip any trailing spacing and punctuation so
587             you don't get things like "The End...." or "What? ..." You can cancel
588             it with C<$ht-Ecleanly(undef)> or provide your own regular
589             expression.
590              
591             =cut
592              
593             sub cleanly {
594 87     87 1 167 my $self = shift;
595 87 100       158 if ( @_ )
596             {
597 2         5 $self->{_cleanly} = shift;
598 2         18 return 1; # say we did it, even if setting untrue value
599             }
600             else
601             {
602 85         298 return $self->{_cleanly};
603             }
604             }
605              
606             =back
607              
608             =head1 COOKBOOK (well, a recipe)
609              
610             =head2 Template Toolkit filter
611              
612             For excerpting HTML in your Templates. Note the L which
613             is set to drop any images from the truncated output.
614              
615             use Template;
616             use HTML::Truncate;
617              
618             my %config =
619             (
620             FILTERS => {
621             truncate_html => [ \&truncate_html_filter_factory, 1 ],
622             },
623             );
624              
625             my $tt = Template->new(\%config) or die $Template::ERROR;
626              
627             # ... etc ...
628              
629             sub truncate_html_filter_factory {
630             my ( $context, $len, $ellipsis ) = @_;
631             $len = 32 unless $len;
632             $ellipsis = chr(8230) unless defined $ellipsis;
633             my $ht = HTML::Truncate->new();
634             $ht->add_skip_tags(qw( img ));
635             return sub {
636             my $html = shift || return '';
637             return $ht->truncate( $html, $len, $ellipsis );
638             }
639             }
640              
641             Then in your templates you can do things like this:
642              
643             [% FOR item IN search_results %]
644            
645             [% item.title %]
646             [% item.body | truncate_html(200) %]
647            
648             [% END %]
649              
650             See also L.
651              
652             =head1 AUTHOR
653              
654             Ashley Pond V, C<< >>.
655              
656             =head1 LIMITATIONS
657              
658             There may be places where this will break down right now. I'll pad out possible edge cases as I find them or they are sent to me via the CPAN bug ticket system.
659              
660             =head2 This is not an HTML filter
661              
662             Although this happens to do some crude HTML filtering to achieve its end, it is not a fully featured filter. If you are looking for one, check out L and L.
663              
664             =head1 BUGS, FEEDBACK, PATCHES
665              
666             Please report any bugs or feature requests to
667             C, or through the web interface at
668             L. I
669             will get the ticket, and then you'll automatically be notified of
670             progress as I make changes.
671              
672             =head2 TO DO
673              
674             Write a couple more tests (percent and skip stuff) then take out beta notice. Try to make the 5.6 stuff work without decode...? Try a C method?
675              
676             Write an L based version to load when possible...? Or make that part of L?
677              
678             =head1 THANKS TO
679              
680             Kevin Riggle for the L functionality; patch, Pod, and tests.
681              
682             Lorenzo Iannuzzi for the L functionality.
683              
684             =head1 SEE ALSO
685              
686             L, L, the "truncate" filter in L