File Coverage

blib/lib/Mail/Pyzor/Digest/StripHtml.pm
Criterion Covered Total %
statement 9 40 22.5
branch 0 10 0.0
condition n/a
subroutine 3 7 42.8
pod 1 1 100.0
total 13 58 22.4


line stmt bran cond sub pod time code
1             package Mail::Pyzor::Digest::StripHtml;
2              
3             # Copyright 2018 cPanel, LLC.
4             # All rights reserved.
5             # http://cpanel.net
6             #
7             # This is free software; you can redistribute it and/or modify it under the
8             # Apache 2.0 license.
9              
10 1     1   206894 use strict;
  1         11  
  1         30  
11 1     1   7 use warnings;
  1         2  
  1         33  
12              
13             =encoding utf-8
14              
15             =head1 NAME
16              
17             Mail::Pyzor::Digest::StripHtml
18              
19             =head1 SYNOPSIS
20              
21             my $stripped = Mail::Pyzor::Digest::StripHtml::strip($html);
22              
23             =head1 DESCRIPTION
24              
25             This module attempts to duplicate pyzor’s HTML-stripping logic.
26              
27             =head1 ACCURACY
28              
29             This library cannot achieve 100%, bug-for-bug parity with pyzor
30             because to do so would require duplicating Python’s own HTML parsing
31             library. Since that library’s output has changed over time, and those
32             changes in turn affect pyzor, it’s literally impossible to arrive at
33             a single, fully-compatible reimplementation.
34              
35             That said, all known divergences between pyzor and this library involve
36             invalid HTML as input.
37              
38             Please open bug reports for any divergences you identify, particularly
39             if the input is valid HTML.
40              
41             =cut
42              
43             #----------------------------------------------------------------------
44              
45 1     1   630 use HTML::Parser ();
  1         6009  
  1         509  
46              
47             #----------------------------------------------------------------------
48              
49             =head1 FUNCTIONS
50              
51             =head2 $stripped = strip( $HTML )
52              
53             Give it some HTML, and it’ll give back the stripped text.
54              
55             In B, the stripping consists of removing tags as well as
56             CscriptE> and CstyleE> elements; however, it also
57             removes HTML entities.
58              
59             This tries very hard to duplicate pyzor’s behavior with invalid HTML.
60              
61             =cut
62              
63             sub strip {
64 0     0 1   my ($html) = @_;
65              
66 0           $html =~ s<\A\s+><>;
67 0           $html =~ s<\s+\z><>;
68              
69 0           my $p = HTML::Parser->new( api_version => 3 );
70              
71 0           my @pieces;
72              
73 0           my $accumulate = 1;
74              
75             $p->handler(
76             start => sub {
77 0     0     my ($tagname) = @_;
78              
79 0 0         $accumulate = 0 if $tagname eq 'script';
80 0 0         $accumulate = 0 if $tagname eq 'style';
81              
82 0           return;
83             },
84 0           'tagname',
85             );
86              
87             $p->handler(
88             end => sub {
89 0     0     $accumulate = 1;
90 0           return;
91             }
92 0           );
93              
94             $p->handler(
95             text => sub {
96 0     0     my ($copy) = @_;
97              
98 0 0         return if !$accumulate;
99              
100             # pyzor’s HTML parser discards HTML entities. On top of that,
101             # we need to match, as closely as possible, pyzor’s handling of
102             # invalid HTML entities … which is a function of Python’s
103             # standard HTML parsing library. This will probably never be
104             # fully compatible with the pyzor, but we can get it close.
105              
106             # The original is:
107             #
108             # re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
109             #
110             # The parsing loop then “backs up” one byte if the last
111             # character isn’t a “;”. We use a look-ahead assertion to
112             # mimic that behavior.
113 0           $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
114              
115             # The original is:
116             #
117             # re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
118             #
119             # We again use a look-ahead assertion to mimic Python.
120 0           $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
121              
122             # Python’s HTMLParser aborts its parsing loop when it encounters
123             # an invalid numeric reference.
124 0           $copy =~ s<\&\#
125             (?:
126             [^0-9xX] # anything but the expected first char
127             |
128             [0-9]+[a-fA-F] # hex within decimal
129             |
130             [xX][^0-9a-fA-F]
131             )
132             (.*)
133             ><
134 0 0         ( -1 == index($1, ';') ) ? q<> : '&#'
135             >exs;
136              
137             # Python’s HTMLParser treats invalid entities as incomplete
138 0           $copy =~ s<(\&\#?)><$1 >gx;
139              
140 0           $copy =~ s<\A\s+><>;
141 0           $copy =~ s<\s+\z><>;
142              
143 0 0         push @pieces, \$copy if length $copy;
144             },
145 0           'text,tagname',
146             );
147              
148 0           $p->parse($html);
149 0           $p->eof();
150              
151 0           my $payload = join( q< >, map { $$_ } @pieces );
  0            
152              
153             # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
154             # plain spaces.
155 0           $payload =~ s<[^\S\x{a0}]+>< >g;
156              
157 0           return $payload;
158             }
159              
160             1;