File Coverage

blib/lib/Mail/Pyzor/Digest/Pieces.pm

Criterion	Covered	Total	%
statement	36	77	46.7
branch	4	14	28.5
condition	3	9	33.3
subroutine	10	14	71.4
pod	6	6	100.0
total	59	120	49.1

line	stmt	bran	cond	sub	pod	time	code
1							package Mail::Pyzor::Digest::Pieces;
2
3							# Copyright 2018 cPanel, LLC.
4							# All rights reserved.
5							# http://cpanel.net
6							#
7							# <@LICENSE>
8							# Licensed to the Apache Software Foundation (ASF) under one or more
9							# contributor license agreements. See the NOTICE file distributed with
10							# this work for additional information regarding copyright ownership.
11							# The ASF licenses this file to you under the Apache License, Version 2.0
12							# (the "License"); you may not use this file except in compliance with
13							# the License. You may obtain a copy of the License at:
14							#
15							# http://www.apache.org/licenses/LICENSE-2.0
16							#
17							# Unless required by applicable law or agreed to in writing, software
18							# distributed under the License is distributed on an "AS IS" BASIS,
19							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20							# See the License for the specific language governing permissions and
21							# limitations under the License.
22							#
23							#
24
25	2			2		212942	use strict;
	2					9
	2					50
26	2			2		8	use warnings;
	2					4
	2					69
27
28							=encoding utf-8
29
30							=head1 NAME
31
32							Mail::Pyzor::Digest::Pieces
33
34							=head1 DESCRIPTION
35
36							This module houses backend logic for L.
37
38							It reimplements logic found in pyzor’s F module
39							(L).
40
41							=cut
42
43							#----------------------------------------------------------------------
44
45	2			2		9	use Email::MIME::ContentType ();
	2					4
	2					22
46	2			2		6	use Encode ();
	2					4
	2					82
47
48							our $VERSION = '0.06_01'; # TRIAL
49							$VERSION =~ tr/_//d;
50
51							# each tuple is [ offset, length ]
52	2			2		10	use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
	2					14
	2					144
53
54							use constant {
55	2					1626	_MIN_LINE_LENGTH => 8,
56
57							_ATOMIC_NUM_LINES => 4,
58	2			2		11	};
	2					3
59
60							#----------------------------------------------------------------------
61
62							=head1 FUNCTIONS
63
64							=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
65
66							This imitates the corresponding object method in F.
67							It returns a reference to an array of strings. Each string can be either
68							a byte string or a character string (e.g., UTF-8 decoded).
69
70							NB: RFC 2822 stipulates that message bodies should use CRLF
71							line breaks, not plain LF (nor plain CR). L
72							will thus convert any plain CRs in a quoted-printable message
73							body into CRLF. Python, though, doesn’t do this, so the output of
74							our implementation of C diverges from that of the Python
75							original. It doesn’t ultimately make a difference since the line-ending
76							whitespace gets trimmed regardless, but it’s necessary to factor in when
77							comparing the output of our implementation with the Python output.
78
79							=cut
80
81							sub digest_payloads {
82	0			0	1	0	my ($parsed) = @_;
83
84	0					0	my @subparts = $parsed->subparts();
85
86	0					0	my @payloads;
87
88	0	0				0	if (@subparts) {
89	0					0	@payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
	0					0
	0					0
90							}
91							else {
92	0					0	my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
93
94	0					0	my $payload;
95
96	0	0				0	if ( $main_type eq 'text' ) {
97
98							# Decode transfer encoding, but leave us as a byte string.
99							# Note that this is where Email::MIME converts plain LF to CRLF.
100	0					0	$payload = $parsed->body();
101
102							# This does the actual character decoding (i.e., “charset”).
103	0					0	$payload = Encode::decode( $encoding, $payload, $encode_check );
104
105	0	0				0	if ( $subtype eq 'html' ) {
106	0					0	require Mail::Pyzor::Digest::StripHtml;
107	0					0	$payload = Mail::Pyzor::Digest::StripHtml::strip($payload);
108							}
109							}
110							else {
111
112							# This does no decoding, even of, e.g., quoted-printable or base64.
113	0					0	$payload = $parsed->body_raw();
114							}
115
116	0					0	push @payloads, $payload;
117							}
118
119	0					0	return \@payloads;
120							}
121
122							#----------------------------------------------------------------------
123
124							=head2 normalize( $STRING )
125
126							This imitates the corresponding object method in F.
127							It modifies C<$STRING> in-place.
128
129							As with the original implementation, if C<$STRING> contains (decoded)
130							Unicode characters, those characters will be parsed accordingly. So:
131
132							$str = "123\xc2\xa0"; # [ c2 a0 ] == \u00a0, non-breaking space
133
134							normalize($str);
135
136							The above will leave C<$str> alone, but this:
137
138							utf8::decode($str);
139
140							normalize($str);
141
142							… will trim off the last two bytes from C<$str>.
143
144							=cut
145
146							sub normalize { ## no critic qw( Subroutines::RequireArgUnpacking )
147
148							# NULs are bad, mm-kay?
149	0			0	1	0	$_[0] =~ tr<\0><>d;
150
151							# NB: Python’s \s without re.UNICODE is the same as Perl’s \s
152							# with the /a modifier.
153							#
154							# https://docs.python.org/2/library/re.html
155							# https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
156
157							# Python: re.compile(r'\S{10,}')
158	0					0	$_[0] =~ s<\S{10,}><>ag;
159
160							# Python: re.compile(r'\S+@\S+')
161	0					0	$_[0] =~ s<\S+ @ \S+><>agx;
162
163							# Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
164	0					0	$_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
165
166							# (from digest.py …)
167							# Make sure we do the whitespace last because some of the previous
168							# patterns rely on whitespace.
169	0					0	$_[0] =~ tr< \x09-\x0d><>d;
170
171							# This is fun. digest.py’s normalize() does a non-UNICODE whitespace
172							# strip, then calls strip() on the string, which will strip Unicode
173							# whitespace from the ends.
174	0					0	$_[0] =~ s<\A\s+><>;
175	0					0	$_[0] =~ s<\s+\z><>;
176
177	0					0	return;
178							}
179
180							#----------------------------------------------------------------------
181
182							=head2 $yn = should_handle_line( $STRING )
183
184							This imitates the corresponding object method in F.
185							It returns a boolean.
186
187							=cut
188
189							sub should_handle_line {
190	6		100	6	1	20908	return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
191							}
192
193							#----------------------------------------------------------------------
194
195							=head2 $sr = assemble_lines( \@LINES )
196
197							This assembles a string buffer out of @LINES. The string is the buffer
198							of octets that will be hashed to produce the message digest.
199
200							Each member of @LINES is expected to be an B, not a
201							character string.
202
203							=cut
204
205							sub assemble_lines {
206	4			4	1	5884	my ($lines_ar) = @_;
207
208	4	100				11	if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
209
210							# cf. handle_atomic() in digest.py
211	1					8	return \join( q<>, @$lines_ar );
212							}
213
214							#----------------------------------------------------------------------
215							# cf. handle_atomic() in digest.py
216
217	3					5	my $str = q<>;
218
219	3					6	for my $ofs_len ( _HASH_SPEC() ) {
220	6					10	my ( $offset, $length ) = @$ofs_len;
221
222	6					19	for my $i ( 0 .. ( $length - 1 ) ) {
223	18					30	my $idx = int( $offset * @$lines_ar / 100 ) + $i;
224
225	18	100				28	next if !defined $lines_ar->[$idx];
226
227	17					29	$str .= $lines_ar->[$idx];
228							}
229							}
230
231	3					10	return \$str;
232							}
233
234							#----------------------------------------------------------------------
235
236							=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
237
238							=cut
239
240	2					127	use constant _QUOTED_PRINTABLE_NAMES => (
241							"quopri-codec",
242							"quopri",
243							"quoted-printable",
244							"quotedprintable",
245	2			2		14	);
	2					3
246
247							# Make Encode::decode() ignore anything that doesn’t fit the
248							# given encoding.
249	2			2		12	use constant _encode_check_ignore => q<>;
	2					3
	2					484
250
251							sub parse_content_type {
252	0			0	1		my ($content_type) = @_;
253
254	0						my $ct_parse = Email::MIME::ContentType::parse_content_type(
255							$content_type,
256							);
257
258	0		0				my $main = $ct_parse->{'type'} \|\| q<>;
259	0		0				my $sub = $ct_parse->{'subtype'} \|\| q<>;
260
261	0						my $encoding = $ct_parse->{'attributes'}{'charset'};
262
263	0						my $checkval;
264
265	0	0					if ($encoding) {
266
267							# Lower-case everything, convert underscore to dash, and remove NUL.
268	0						$encoding =~ trd;
269
270							# Apparently pyzor accommodates messages that put the transfer
271							# encoding in the Content-Type.
272	0	0					if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
	0
273	0						$checkval = Encode::FB_CROAK();
274							}
275							}
276							else {
277	0						$encoding = 'ascii';
278							}
279
280							# Match Python .decode()’s 'ignore' behavior
281	0		0				$checkval \|\|= \&_encode_check_ignore;
282
283	0						return ( $main, $sub, $encoding, $checkval );
284							}
285
286							#----------------------------------------------------------------------
287
288							=head2 @lines = splitlines( $TEXT )
289
290							Imitates C. (cf. C)
291
292							Returns a plain list in list context. Returns the number of
293							items to be returned in scalar context.
294
295							=cut
296
297							sub splitlines {
298	0			0	1		return split m<\r\n?\|\n>, $_[0];
299							}
300
301							1;