File Coverage

blib/lib/Mail/Pyzor/Digest/Pieces.pm
Criterion Covered Total %
statement 36 77 46.7
branch 4 14 28.5
condition 3 9 33.3
subroutine 10 14 71.4
pod 6 6 100.0
total 59 120 49.1


line stmt bran cond sub pod time code
1             package Mail::Pyzor::Digest::Pieces;
2              
3             # Copyright 2018 cPanel, LLC.
4             # All rights reserved.
5             # http://cpanel.net
6             #
7             # <@LICENSE>
8             # Licensed to the Apache Software Foundation (ASF) under one or more
9             # contributor license agreements. See the NOTICE file distributed with
10             # this work for additional information regarding copyright ownership.
11             # The ASF licenses this file to you under the Apache License, Version 2.0
12             # (the "License"); you may not use this file except in compliance with
13             # the License. You may obtain a copy of the License at:
14             #
15             # http://www.apache.org/licenses/LICENSE-2.0
16             #
17             # Unless required by applicable law or agreed to in writing, software
18             # distributed under the License is distributed on an "AS IS" BASIS,
19             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20             # See the License for the specific language governing permissions and
21             # limitations under the License.
22             #
23             #
24              
25 2     2   212942 use strict;
  2         9  
  2         50  
26 2     2   8 use warnings;
  2         4  
  2         69  
27              
28             =encoding utf-8
29              
30             =head1 NAME
31              
32             Mail::Pyzor::Digest::Pieces
33              
34             =head1 DESCRIPTION
35              
36             This module houses backend logic for L.
37              
38             It reimplements logic found in pyzor’s F module
39             (L).
40              
41             =cut
42              
43             #----------------------------------------------------------------------
44              
45 2     2   9 use Email::MIME::ContentType ();
  2         4  
  2         22  
46 2     2   6 use Encode ();
  2         4  
  2         82  
47              
48             our $VERSION = '0.06_01'; # TRIAL
49             $VERSION =~ tr/_//d;
50              
51             # each tuple is [ offset, length ]
52 2     2   10 use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
  2         14  
  2         144  
53              
54             use constant {
55 2         1626 _MIN_LINE_LENGTH => 8,
56              
57             _ATOMIC_NUM_LINES => 4,
58 2     2   11 };
  2         3  
59              
60             #----------------------------------------------------------------------
61              
62             =head1 FUNCTIONS
63              
64             =head2 $strings_ar = digest_payloads( $EMAIL_MIME )
65              
66             This imitates the corresponding object method in F.
67             It returns a reference to an array of strings. Each string can be either
68             a byte string or a character string (e.g., UTF-8 decoded).
69              
70             NB: RFC 2822 stipulates that message bodies should use CRLF
71             line breaks, not plain LF (nor plain CR). L
72             will thus convert any plain CRs in a quoted-printable message
73             body into CRLF. Python, though, doesn’t do this, so the output of
74             our implementation of C diverges from that of the Python
75             original. It doesn’t ultimately make a difference since the line-ending
76             whitespace gets trimmed regardless, but it’s necessary to factor in when
77             comparing the output of our implementation with the Python output.
78              
79             =cut
80              
81             sub digest_payloads {
82 0     0 1 0 my ($parsed) = @_;
83              
84 0         0 my @subparts = $parsed->subparts();
85              
86 0         0 my @payloads;
87              
88 0 0       0 if (@subparts) {
89 0         0 @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
  0         0  
  0         0  
90             }
91             else {
92 0         0 my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
93              
94 0         0 my $payload;
95              
96 0 0       0 if ( $main_type eq 'text' ) {
97              
98             # Decode transfer encoding, but leave us as a byte string.
99             # Note that this is where Email::MIME converts plain LF to CRLF.
100 0         0 $payload = $parsed->body();
101              
102             # This does the actual character decoding (i.e., “charset”).
103 0         0 $payload = Encode::decode( $encoding, $payload, $encode_check );
104              
105 0 0       0 if ( $subtype eq 'html' ) {
106 0         0 require Mail::Pyzor::Digest::StripHtml;
107 0         0 $payload = Mail::Pyzor::Digest::StripHtml::strip($payload);
108             }
109             }
110             else {
111              
112             # This does no decoding, even of, e.g., quoted-printable or base64.
113 0         0 $payload = $parsed->body_raw();
114             }
115              
116 0         0 push @payloads, $payload;
117             }
118              
119 0         0 return \@payloads;
120             }
121              
122             #----------------------------------------------------------------------
123              
124             =head2 normalize( $STRING )
125              
126             This imitates the corresponding object method in F.
127             It modifies C<$STRING> in-place.
128              
129             As with the original implementation, if C<$STRING> contains (decoded)
130             Unicode characters, those characters will be parsed accordingly. So:
131              
132             $str = "123\xc2\xa0"; # [ c2 a0 ] == \u00a0, non-breaking space
133              
134             normalize($str);
135              
136             The above will leave C<$str> alone, but this:
137              
138             utf8::decode($str);
139              
140             normalize($str);
141              
142             … will trim off the last two bytes from C<$str>.
143              
144             =cut
145              
146             sub normalize { ## no critic qw( Subroutines::RequireArgUnpacking )
147              
148             # NULs are bad, mm-kay?
149 0     0 1 0 $_[0] =~ tr<\0><>d;
150              
151             # NB: Python’s \s without re.UNICODE is the same as Perl’s \s
152             # with the /a modifier.
153             #
154             # https://docs.python.org/2/library/re.html
155             # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
156              
157             # Python: re.compile(r'\S{10,}')
158 0         0 $_[0] =~ s<\S{10,}><>ag;
159              
160             # Python: re.compile(r'\S+@\S+')
161 0         0 $_[0] =~ s<\S+ @ \S+><>agx;
162              
163             # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
164 0         0 $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
165              
166             # (from digest.py …)
167             # Make sure we do the whitespace last because some of the previous
168             # patterns rely on whitespace.
169 0         0 $_[0] =~ tr< \x09-\x0d><>d;
170              
171             # This is fun. digest.py’s normalize() does a non-UNICODE whitespace
172             # strip, then calls strip() on the string, which *will* strip Unicode
173             # whitespace from the ends.
174 0         0 $_[0] =~ s<\A\s+><>;
175 0         0 $_[0] =~ s<\s+\z><>;
176              
177 0         0 return;
178             }
179              
180             #----------------------------------------------------------------------
181              
182             =head2 $yn = should_handle_line( $STRING )
183              
184             This imitates the corresponding object method in F.
185             It returns a boolean.
186              
187             =cut
188              
189             sub should_handle_line {
190 6   100 6 1 20908 return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
191             }
192              
193             #----------------------------------------------------------------------
194              
195             =head2 $sr = assemble_lines( \@LINES )
196              
197             This assembles a string buffer out of @LINES. The string is the buffer
198             of octets that will be hashed to produce the message digest.
199              
200             Each member of @LINES is expected to be an B, not a
201             character string.
202              
203             =cut
204              
205             sub assemble_lines {
206 4     4 1 5884 my ($lines_ar) = @_;
207              
208 4 100       11 if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
209              
210             # cf. handle_atomic() in digest.py
211 1         8 return \join( q<>, @$lines_ar );
212             }
213              
214             #----------------------------------------------------------------------
215             # cf. handle_atomic() in digest.py
216              
217 3         5 my $str = q<>;
218              
219 3         6 for my $ofs_len ( _HASH_SPEC() ) {
220 6         10 my ( $offset, $length ) = @$ofs_len;
221              
222 6         19 for my $i ( 0 .. ( $length - 1 ) ) {
223 18         30 my $idx = int( $offset * @$lines_ar / 100 ) + $i;
224              
225 18 100       28 next if !defined $lines_ar->[$idx];
226              
227 17         29 $str .= $lines_ar->[$idx];
228             }
229             }
230              
231 3         10 return \$str;
232             }
233              
234             #----------------------------------------------------------------------
235              
236             =head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
237              
238             =cut
239              
240 2         127 use constant _QUOTED_PRINTABLE_NAMES => (
241             "quopri-codec",
242             "quopri",
243             "quoted-printable",
244             "quotedprintable",
245 2     2   14 );
  2         3  
246              
247             # Make Encode::decode() ignore anything that doesn’t fit the
248             # given encoding.
249 2     2   12 use constant _encode_check_ignore => q<>;
  2         3  
  2         484  
250              
251             sub parse_content_type {
252 0     0 1   my ($content_type) = @_;
253              
254 0           my $ct_parse = Email::MIME::ContentType::parse_content_type(
255             $content_type,
256             );
257              
258 0   0       my $main = $ct_parse->{'type'} || q<>;
259 0   0       my $sub = $ct_parse->{'subtype'} || q<>;
260              
261 0           my $encoding = $ct_parse->{'attributes'}{'charset'};
262              
263 0           my $checkval;
264              
265 0 0         if ($encoding) {
266              
267             # Lower-case everything, convert underscore to dash, and remove NUL.
268 0           $encoding =~ trd;
269              
270             # Apparently pyzor accommodates messages that put the transfer
271             # encoding in the Content-Type.
272 0 0         if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
  0            
273 0           $checkval = Encode::FB_CROAK();
274             }
275             }
276             else {
277 0           $encoding = 'ascii';
278             }
279              
280             # Match Python .decode()’s 'ignore' behavior
281 0   0       $checkval ||= \&_encode_check_ignore;
282              
283 0           return ( $main, $sub, $encoding, $checkval );
284             }
285              
286             #----------------------------------------------------------------------
287              
288             =head2 @lines = splitlines( $TEXT )
289              
290             Imitates C. (cf. C)
291              
292             Returns a plain list in list context. Returns the number of
293             items to be returned in scalar context.
294              
295             =cut
296              
297             sub splitlines {
298 0     0 1   return split m<\r\n?|\n>, $_[0];
299             }
300              
301             1;