File Coverage

blib/lib/Mail/Pyzor/Digest/StripHtml.pm
Criterion Covered Total %
statement 9 40 22.5
branch 0 10 0.0
condition n/a
subroutine 3 7 42.8
pod 1 1 100.0
total 13 58 22.4


line stmt bran cond sub pod time code
1             package Mail::Pyzor::Digest::StripHtml;
2              
3             # Copyright 2018 cPanel, LLC.
4             # All rights reserved.
5             # http://cpanel.net
6             #
7             # <@LICENSE>
8             # Licensed to the Apache Software Foundation (ASF) under one or more
9             # contributor license agreements. See the NOTICE file distributed with
10             # this work for additional information regarding copyright ownership.
11             # The ASF licenses this file to you under the Apache License, Version 2.0
12             # (the "License"); you may not use this file except in compliance with
13             # the License. You may obtain a copy of the License at:
14             #
15             # http://www.apache.org/licenses/LICENSE-2.0
16             #
17             # Unless required by applicable law or agreed to in writing, software
18             # distributed under the License is distributed on an "AS IS" BASIS,
19             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20             # See the License for the specific language governing permissions and
21             # limitations under the License.
22             #
23             #
24              
25 1     1   179787 use strict;
  1         9  
  1         27  
26 1     1   4 use warnings;
  1         2  
  1         38  
27              
28             =encoding utf-8
29              
30             =head1 NAME
31              
32             Mail::Pyzor::Digest::StripHtml
33              
34             =head1 SYNOPSIS
35              
36             my $stripped = Mail::Pyzor::Digest::StripHtml::strip($html);
37              
38             =head1 DESCRIPTION
39              
40             This module attempts to duplicate pyzor’s HTML-stripping logic.
41              
42             =head1 ACCURACY
43              
44             This library cannot achieve 100%, bug-for-bug parity with pyzor
45             because to do so would require duplicating Python’s own HTML parsing
46             library. Since that library’s output has changed over time, and those
47             changes in turn affect pyzor, it’s literally impossible to arrive at
48             a single, fully-compatible reimplementation.
49              
50             That said, all known divergences between pyzor and this library involve
51             invalid HTML as input.
52              
53             Please open bug reports for any divergences you identify, particularly
54             if the input is valid HTML.
55              
56             =cut
57              
58             #----------------------------------------------------------------------
59              
60 1     1   497 use HTML::Parser ();
  1         4836  
  1         456  
61              
62             our $VERSION = '0.06_01'; # TRIAL
63             $VERSION =~ tr/_//d;
64              
65             #----------------------------------------------------------------------
66              
67             =head1 FUNCTIONS
68              
69             =head2 $stripped = strip( $HTML )
70              
71             Give it some HTML, and it’ll give back the stripped text.
72              
73             In B<general>, the stripping consists of removing tags as well as
74             C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
75             removes HTML entities.
76              
77             This tries very hard to duplicate pyzor’s behavior with invalid HTML.
78              
79             =cut
80              
81             sub strip {
82 0     0 1   my ($html) = @_;
83              
84 0           $html =~ s<\A\s+><>;
85 0           $html =~ s<\s+\z><>;
86              
87 0           my $p = HTML::Parser->new( api_version => 3 );
88              
89 0           my @pieces;
90              
91 0           my $accumulate = 1;
92              
93             $p->handler(
94             start => sub {
95 0     0     my ($tagname) = @_;
96              
97 0 0         $accumulate = 0 if $tagname eq 'script';
98 0 0         $accumulate = 0 if $tagname eq 'style';
99              
100 0           return;
101             },
102 0           'tagname',
103             );
104              
105             $p->handler(
106             end => sub {
107 0     0     $accumulate = 1;
108 0           return;
109             }
110 0           );
111              
112             $p->handler(
113             text => sub {
114 0     0     my ($copy) = @_;
115              
116 0 0         return if !$accumulate;
117              
118             # pyzor’s HTML parser discards HTML entities. On top of that,
119             # we need to match, as closely as possible, pyzor’s handling of
120             # invalid HTML entities … which is a function of Python’s
121             # standard HTML parsing library. This will probably never be
122             # fully compatible with the pyzor, but we can get it close.
123              
124             # The original is:
125             #
126             # re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
127             #
128             # The parsing loop then “backs up” one byte if the last
129             # character isn’t a “;”. We use a look-ahead assertion to
130             # mimic that behavior.
131 0           $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
132              
133             # The original is:
134             #
135             # re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
136             #
137             # We again use a look-ahead assertion to mimic Python.
138 0           $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
139              
140             # Python’s HTMLParser aborts its parsing loop when it encounters
141             # an invalid numeric reference.
142 0           $copy =~ s<\&\#
143             (?:
144             [^0-9xX] # anything but the expected first char
145             |
146             [0-9]+[a-fA-F] # hex within decimal
147             |
148             [xX][^0-9a-fA-F]
149             )
150             (.*)
151             ><
152 0 0         ( -1 == index($1, ';') ) ? q<> : '&#'
153             >exs;
154              
155             # Python’s HTMLParser treats invalid entities as incomplete
156 0           $copy =~ s<(\&\#?)><$1 >gx;
157              
158 0           $copy =~ s<\A\s+><>;
159 0           $copy =~ s<\s+\z><>;
160              
161 0 0         push @pieces, \$copy if length $copy;
162             },
163 0           'text,tagname',
164             );
165              
166 0           $p->parse($html);
167 0           $p->eof();
168              
169 0           my $payload = join( q< >, map { $$_ } @pieces );
  0            
170              
171             # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
172             # plain spaces.
173 0           $payload =~ s<[^\S\x{a0}]+>< >g;
174              
175 0           return $payload;
176             }
177              
178             1;