File Coverage

blib/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm

Criterion	Covered	Total	%
statement	40	213	18.7
branch	0	162	0.0
condition	1	54	1.8
subroutine	8	24	33.3
pod	1	15	6.6
total	50	468	10.6

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin
21
22							=head1 SYNOPSIS
23
24							loadplugin Mail::SpamAssassin::Plugin::PDFInfo
25
26							=head1 DESCRIPTION
27
28							This plugin helps detect spam using attached PDF files
29
30							=over 4
31
32							=item See "Usage:" below - more documentation see 20_pdfinfo.cf
33
34							Original info kept for history. For later changes see SVN repo
35							-------------------------------------------------------
36							PDFInfo Plugin for SpamAssassin
37							Version: 0.8
38							Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $
39							Created: 2007-08-10
40							Modified: 2007-08-10
41							By: Dallas Engelken
42
43							Changes:
44							0.8 - added .fdf detection (thanks John Lundin) [axb]
45							0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb]
46							0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc.
47							- fixed issue on perl 5.6.1 where pdf_match_details() failed to call
48							_find_pdf_mime_parts(), resulting in no detection of pdf mime parts.
49							- quoted-printable support - requires MIME::QuotedPrint (which should be in everyone's
50							install as a part of the MIME-Base64 package which is a SA req)
51							- added simple pdf_is_empty_body() function which counts the body bytes minus the
52							subject line. can add optional <bytes> param if you need to allow for a few bytes.
53							0.5 - fix warns for undef $pdf_tags
54							- remove { } and \ before running eval in pdf_match_details to avoid eval error
55							0.4 - added pdf_is_encrypted() function
56							- added option to look for image HxW on same line
57							0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data
58							- renamed pdf_image_named() to pdf_named()
59							- PDF images are encapsulated and have no names. We are matching the PDF file name.
60							- renamed pdf_image_name_regex() to pdf_name_regex()
61							- PDF images are encapsulated and have no names. We are matching the PDF file name.
62							- changed pdf_image_count() a bit and added pdf_count().
63							- pdf_count() checks how many pdf attachments there are on the mail
64							- pdf_image_count() checks how many images are found within all pdfs in the mail.
65							- removed the restriction of the pdf containing an image in order to md5 it.
66							- added pdf_match_details() function to check the following 'details'
67							- author: Author of PDF if specified
68							- producer: Software used to produce PDF
69							- creator: Software used to produce PDF, usually similar to producer
70							- title: Title of PDF
71							- created: Creation Date
72							- modified: Last Modified
73							0.2 - support PDF octet-stream
74							0.1 - just ported over the imageinfo code, and renamed to pdfinfo.
75							- removed all support for png, gif, and jpg from the code.
76							- prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2.
77
78							Usage:
79
80							pdf_count()
81
82							body RULENAME eval:pdf_count(<min>,[max])
83							min: required, message contains at least x pdf mime parts
84							max: optional, if specified, must not contain more than x pdf mime parts
85
86							pdf_image_count()
87
88							body RULENAME eval:pdf_image_count(<min>,[max])
89							min: required, message contains at least x images in pdf attachments.
90							max: optional, if specified, must not contain more than x pdf images
91
92							pdf_pixel_coverage()
93
94							body RULENAME eval:pdf_pixel_coverage(<min>,[max])
95							min: required, message contains at least this much pixel area
96							max: optional, if specified, message must not contain more than this much pixel area
97
98							pdf_named()
99
100							body RULENAME eval:pdf_named(<string>)
101							string: exact file name match, if you need partial match, see pdf_name_regex()
102
103							pdf_name_regex()
104
105							body RULENAME eval:pdf_name_regex(<regex>)
106							regex: regular expression, see examples in ruleset
107
108							pdf_match_md5()
109
110							body RULENAME eval:pdf_match_md5(<string>)
111							string: 32-byte md5 hex
112
113							pdf_match_fuzzy_md5()
114
115							body RULENAME eval:pdf_match_fuzzy_md5(<string>)
116							string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5
117
118							pdf_match_details()
119
120							body RULENAME eval:pdf_match_details(<detail>,<regex>);
121							detail: author, creator, created, modified, producer, title
122							regex: regular expression, see examples in ruleset
123
124							pdf_is_encrypted()
125
126							body RULENAME eval:pdf_is_encrypted()
127
128							pdf_is_empty_body()
129
130							body RULENAME eval:pdf_is_empty_body(<bytes>)
131							bytes: maximum byte count to allow and still consider it empty
132
133							NOTE: See the ruleset for more examples that are not documented here.
134
135							=back
136
137							=cut
138
139							# -------------------------------------------------------
140
141							package Mail::SpamAssassin::Plugin::PDFInfo;
142
143	19			19		160	use Mail::SpamAssassin::Plugin;
	19					50
	19					680
144	19			19		123	use Mail::SpamAssassin::Logger;
	19					48
	19					1213
145	19			19		145	use Mail::SpamAssassin::Util qw(compile_regexp);
	19					61
	19					984
146	19			19		137	use strict;
	19					46
	19					486
147	19			19		107	use warnings;
	19					53
	19					663
148							# use bytes;
149	19			19		173	use Digest::MD5 qw(md5_hex);
	19					44
	19					1417
150	19			19		9270	use MIME::QuotedPrint;
	19					5136
	19					72583
151
152							our @ISA = qw(Mail::SpamAssassin::Plugin);
153
154							# constructor: register the eval rule
155							sub new {
156	60			60	1	216	my $class = shift;
157	60					190	my $mailsaobject = shift;
158
159							# some boilerplate...
160	60		33			445	$class = ref($class) || $class;
161	60					353	my $self = $class->SUPER::new($mailsaobject);
162	60					181	bless ($self, $class);
163
164	60					309	$self->register_eval_rule ("pdf_count");
165	60					228	$self->register_eval_rule ("pdf_image_count");
166	60					226	$self->register_eval_rule ("pdf_pixel_coverage");
167	60					232	$self->register_eval_rule ("pdf_image_size_exact");
168	60					226	$self->register_eval_rule ("pdf_image_size_range");
169	60					221	$self->register_eval_rule ("pdf_named");
170	60					210	$self->register_eval_rule ("pdf_name_regex");
171	60					220	$self->register_eval_rule ("pdf_image_to_text_ratio");
172	60					203	$self->register_eval_rule ("pdf_match_md5");
173	60					208	$self->register_eval_rule ("pdf_match_fuzzy_md5");
174	60					215	$self->register_eval_rule ("pdf_match_details");
175	60					246	$self->register_eval_rule ("pdf_is_encrypted");
176	60					235	$self->register_eval_rule ("pdf_is_empty_body");
177
178	60					563	return $self;
179							}
180
181							# -----------------------------------------
182
183							my %get_details = (
184							'pdf' => sub {
185							my ($self, $pms, $part) = @_;
186
187							my $type = $part->{'type'} || 'base64';
188							my $data = '';
189
190							if ($type eq 'quoted-printable') {
191							$data = decode_qp($data); # use QuotedPrint->decode_qp
192							}
193							else {
194							$data = $part->decode(); # just use built in base64 decoder
195							}
196
197							my $index = substr($data, 0, 8);
198
199							return unless ($index =~ /.PDF\-(\d\.\d)/);
200							my $version = $1;
201							$self->_set_tag($pms, 'PDFVERSION', $version);
202							# dbg("pdfinfo: pdf version = $version");
203
204							my ($height, $width, $fuzzy_data, $pdf_tags);
205							my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
206							my ($md5, $fuzzy_md5) = ('', '');
207							my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);
208
209							my $name = $part->{'name'} || '';
210							$self->_set_tag($pms, 'PDFNAME', $name);
211
212							my $no_more_fuzzy = 0;
213							my $got_image = 0;
214							my $encrypted = 0;
215
216							while($data =~ /([^\n]+)/g) {
217							# dbg("pdfinfo: line=$1");
218							my $line = $1;
219
220							$line_count++;
221
222							# lines containing high bytes will have no data we need, so save some cycles
223							next if ($line =~ /[\x80-\xff]/);
224
225							if (!$no_more_fuzzy && $line_count < 70) {
226							if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
227							$line =~ s/\s+$//; # strip off whitespace at end.
228							$fuzzy_data .= $line;
229							}
230							}
231
232							if ($line =~ m/^\/([A-Za-z]+)/) {
233							$pdf_tags .= $1;
234							}
235
236							$got_image=1 if ($line =~ m/\/Image/);
237							$encrypted=1 if ($line =~ m/^\/Encrypt/);
238
239							# once we hit the first stream, we stop collecting data for fuzzy md5
240							$no_more_fuzzy = 1 if ($line =~ m/stream/);
241
242							# From a v1.3 pdf
243							# [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
244							# [12234] dbg: pdfinfo: line=/Width 630
245							# [12234] dbg: pdfinfo: line=/Height 149
246							if ($got_image) {
247							if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
248							$width = $1;
249							$height = $2;
250							}
251							elsif ($line =~ /^\/Width\s(\d+)/) {
252							$width = $1;
253							}
254							elsif ($line =~ /^\/Height\s(\d+)/) {
255							$height = $1;
256							}
257							elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
258							$width = $1;
259							$height = $2;
260							}
261							}
262
263							# did pdf contain image data?
264							if ($got_image && $width && $height) {
265							$no_more_fuzzy = 1;
266							my $area = $width * $height;
267							$total_height += $height;
268							$total_width += $width;
269							$total_area += $area;
270							$pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
271							$pms->{'pdfinfo'}->{"count_pdf_images"} ++;
272							dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
273							$self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
274							$height=0; $width=0; # reset and check for next image
275							$got_image = 0;
276							}
277
278							# [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
279							# [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
280							# [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
281							# [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
282							# [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
283							# [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
284							# or all on same line inside xml - v1.6+
285							# <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>
286
287							if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
288							$producer = $1;
289							}
290							if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
291							$created = $1;
292							}
293							if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
294							$modified = $1;
295							}
296							if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
297							$title = $1;
298							# Title=\376\377\000w\000w\000n\000g
299							# Title=wwng
300							$title =~ s/\\\d{3}//g;
301							}
302							if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
303							$creator = $1;
304							}
305							if ($line =~ /\/Author\s?\(([^\)]+)/) {
306							$author = $1;
307							# Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
308							# Author=HP_Administrator
309							$author =~ s/\\\d{3}//g;
310							}
311							}
312
313							# store the file name so we can check pdf_named() or pdf_name_match() later.
314							$pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;
315
316							# store encrypted flag.
317							$pms->{pdfinfo}->{encrypted} = $encrypted;
318
319							# if we had multiple images in the pdf, we need to store the total HxW as well.
320							# If it was a single Image PDF, then this value will already be in the hash.
321							$pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);;
322
323							if ($total_area) {
324							$pms->{pdfinfo}->{pc_pdf} = $total_area;
325							$self->_set_tag($pms, 'PDFIMGAREA', $total_area);
326							dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)") if ($total_area);
327							}
328
329							dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");
330
331							$md5 = uc(md5_hex($data)) if $data;
332							$fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
333							my $tags_md5;
334							$tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;
335
336							dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));
337
338							# we dont need tags for these.
339							$pms->{pdfinfo}->{details}->{created} = $created if $created;
340							$pms->{pdfinfo}->{details}->{modified} = $modified if $modified;
341
342							if ($producer) {
343							$pms->{pdfinfo}->{details}->{producer} = $producer if $producer;
344							$self->_set_tag($pms, 'PDFPRODUCER', $producer);
345							}
346							if ($title) {
347							$pms->{pdfinfo}->{details}->{title} = $title;
348							$self->_set_tag($pms, 'PDFTITLE', $title);
349							}
350							if ($creator) {
351							$pms->{pdfinfo}->{details}->{creator} = $creator;
352							$self->_set_tag($pms, 'PDFCREATOR', $creator);
353							}
354							if ($author) {
355							$pms->{pdfinfo}->{details}->{author} = $author;
356							$self->_set_tag($pms, 'PDFAUTHOR', $author);
357							}
358							if ($md5) {
359							$pms->{pdfinfo}->{md5}->{$md5} = 1;
360							$self->_set_tag($pms, 'PDFMD5', $fuzzy_md5);
361							}
362							if ($fuzzy_md5) {
363							$pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
364							$self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
365							}
366							if ($tags_md5) {
367							$pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
368							$self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
369							}
370							},
371
372							);
373
374							# ----------------------------------------
375
376							sub _set_tag {
377
378	0			0			my ($self, $pms, $tag, $value) = @_;
379
380	0						dbg("pdfinfo: set_tag called for $tag $value");
381	0	0	0				return unless ($tag && $value);
382
383	0	0					if (exists $pms->{tag_data}->{$tag}) {
384	0						$pms->{tag_data}->{$tag} .= " $value"; # append value
385							}
386							else {
387	0						$pms->{tag_data}->{$tag} = $value;
388							}
389							}
390
391							# ----------------------------------------
392
393							sub _find_pdf_mime_parts {
394	0			0			my ($self,$pms) = @_;
395
396							# bail early if message does not have pdf parts
397	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
398
399							# initialize
400	0						$pms->{'pdfinfo'}->{"pc_pdf"} = 0;
401	0						$pms->{'pdfinfo'}->{"count_pdf"} = 0;
402	0						$pms->{'pdfinfo'}->{"count_pdf_images"} = 0;
403
404	0						my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
405	0						my $part_count = scalar @parts;
406
407	0						dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");
408
409							# cache this so we can easily bail
410	0	0					$pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;
411
412	0						foreach my $p (@parts) {
413	0						my $type = $p->{'type'} =~ m@/([\w\-]+)$@;
414	0		0				my $name = $p->{'name'} || '';
415
416	0		0				my $cte = lc( $p->get_header('content-transfer-encoding') || '' );
417
418	0	0					dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");
		0
		0
419
420							# make sure its a cte we support
421	0	0					next unless ($cte =~ /^(?:base64|quoted\-printable)$/);
422
423							# filename must end with .pdf, or application type can be pdf
424							# sometimes windows muas will wrap a pdf up inside a .dat file
425							# v0.8 - Added .fdf phoney PDF detection
426	0	0	0				next unless ($name =~ /\.[fp]df$/ || $type eq 'pdf');
427
428							# if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
429	0						$type='pdf';
430
431	0	0	0				if ($type && exists $get_details{$type}) {
432	0						$get_details{$type}->($self, $pms, $p);
433	0						$pms->{'pdfinfo'}->{"count_$type"} ++;
434							}
435							}
436
437	0						$self->_set_tag($pms, 'PDFCOUNT', $pms->{'pdfinfo'}->{"count_pdf"});
438	0						$self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});
439
440							}
441
442							# ----------------------------------------
443
444							sub pdf_named {
445	0			0	0		my ($self,$pms,$body,$name) = @_;
446	0	0					return unless (defined $name);
447
448							# make sure we have image data read in.
449	0	0					if (!exists $pms->{'pdfinfo'}) {
450	0						$self->_find_pdf_mime_parts($pms);
451							}
452
453	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
454
455	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
456	0	0					return 1 if (exists $pms->{'pdfinfo'}->{"names_pdf"}->{$name});
457	0						return 0;
458							}
459
460							# -----------------------------------------
461
462							sub pdf_name_regex {
463	0			0	0		my ($self,$pms,$body,$re) = @_;
464	0	0					return unless (defined $re);
465
466							# make sure we have image data read in.
467	0	0					if (!exists $pms->{'pdfinfo'}) {
468	0						$self->_find_pdf_mime_parts($pms);
469							}
470
471	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
472	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
473
474	0						my ($rec, $err) = compile_regexp($re, 2);
475	0	0					if (!$rec) {
476	0						info("pdfinfo: invalid regexp '$re': $err");
477	0						return 0;
478							}
479
480	0						my $hit = 0;
481	0						foreach my $name (keys %{$pms->{'pdfinfo'}->{"names_pdf"}}) {
	0
482	0	0					if ($name =~ $rec) {
483	0						dbg("pdfinfo: pdf_name_regex hit on $name");
484	0						return 1;
485							}
486							}
487	0						return 0;
488
489							}
490
491							# -----------------------------------------
492
493							sub pdf_is_encrypted {
494	0			0	0		my ($self,$pms,$body) = @_;
495
496							# make sure we have image data read in.
497	0	0					if (!exists $pms->{'pdfinfo'}) {
498	0						$self->_find_pdf_mime_parts($pms);
499							}
500
501	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
502	0						return $pms->{'pdfinfo'}->{'encrypted'};
503							}
504
505							# -----------------------------------------
506
507							sub pdf_count {
508	0			0	0		my ($self,$pms,$body,$min,$max) = @_;
509	0	0					return unless defined $min;
510
511							# make sure we have image data read in.
512	0	0					if (!exists $pms->{'pdfinfo'}) {
513	0						$self->_find_pdf_mime_parts($pms);
514							}
515
516	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
517	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf"});
518	0						return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf"});
519
520							}
521
522							# -----------------------------------------
523
524							sub pdf_image_count {
525	0			0	0		my ($self,$pms,$body,$min,$max) = @_;
526	0	0					return unless defined $min;
527
528							# make sure we have image data read in.
529	0	0					if (!exists $pms->{'pdfinfo'}) {
530	0						$self->_find_pdf_mime_parts($pms);
531							}
532
533	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
534	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf_images"});
535	0						return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf_images"});
536
537							}
538
539							# -----------------------------------------
540
541							sub pdf_pixel_coverage {
542	0			0	0		my ($self,$pms,$body,$min,$max) = @_;
543	0	0					return unless (defined $min);
544
545							# make sure we have image data read in.
546	0	0					if (!exists $pms->{'pdfinfo'}) {
547	0						$self->_find_pdf_mime_parts($pms);
548							}
549
550	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
551	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
552
553							# dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
554	0						return result_check($min, $max, $pms->{'pdfinfo'}->{"pc_pdf"});
555							}
556
557							# -----------------------------------------
558
559							sub pdf_image_to_text_ratio {
560	0			0	0		my ($self,$pms,$body,$min,$max) = @_;
561	0	0	0				return unless (defined $min && defined $max);
562
563							# make sure we have image data read in.
564	0	0					if (!exists $pms->{'pdfinfo'}) {
565	0						$self->_find_pdf_mime_parts($pms);
566							}
567
568	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
569	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
570
571							# depending on how you call this eval (body vs rawbody),
572							# the $textlen will differ.
573	0						my $textlen = length(join('',@$body));
574
575	0	0	0				return 0 unless ( $textlen > 0 && exists $pms->{'pdfinfo'}->{"pc_pdf"} && $pms->{'pdfinfo'}->{"pc_pdf"} > 0);
			0
576
577	0						my $ratio = $textlen / $pms->{'pdfinfo'}->{"pc_pdf"};
578	0						dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
579	0						return result_check($min, $max, $ratio, 1);
580							}
581
582							# -----------------------------------------
583
584							sub pdf_is_empty_body {
585	0			0	0		my ($self,$pms,$body,$min) = @_;
586
587	0		0				$min ||= 0; # default to 0 bytes
588
589							# make sure we have image data read in.
590	0	0					if (!exists $pms->{'pdfinfo'}) {
591	0						$self->_find_pdf_mime_parts($pms);
592							}
593
594	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
595	0	0					return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};
596
597							# check for cached result
598	0	0					return 1 if $pms->{'pdfinfo'}->{"no_body_text"};
599
600	0						shift @$body; # shift body array removes line #1 -> subject line.
601
602	0						my $bytes = 0;
603	0						my $textlen = length(join('',@$body));
604	0						foreach my $line (@$body) {
605	0	0					next unless ($line =~ m/\S/);
606	0	0					next if ($line =~ m/^Subject/);
607	0						$bytes += length($line);
608							}
609
610	0						dbg("pdfinfo: is_empty_body = $bytes bytes");
611
612	0	0	0				if ($bytes == 0 || ($bytes <= $min)) {
613	0						$pms->{'pdfinfo'}->{"no_body_text"} = 1;
614	0						return 1;
615							}
616
617							# cache it and return 0
618	0						$pms->{'pdfinfo'}->{"no_body_text"} = 0;
619	0						return 0;
620							}
621
622							# -----------------------------------------
623
624							sub pdf_image_size_exact {
625	0			0	0		my ($self,$pms,$body,$height,$width) = @_;
626	0	0	0				return unless (defined $height && defined $width);
627
628							# make sure we have image data read in.
629	0	0					if (!exists $pms->{'pdfinfo'}) {
630	0						$self->_find_pdf_mime_parts($pms);
631							}
632
633	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
634	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
635	0	0					return 1 if (exists $pms->{'pdfinfo'}->{"dems_pdf"}->{"${height}x${width}"});
636	0						return 0;
637							}
638
639							# -----------------------------------------
640
641							sub pdf_image_size_range {
642	0			0	0		my ($self,$pms,$body,$minh,$minw,$maxh,$maxw) = @_;
643	0	0	0				return unless (defined $minh && defined $minw);
644
645							# make sure we have image data read in.
646	0	0					if (!exists $pms->{'pdfinfo'}) {
647	0						$self->_find_pdf_mime_parts($pms);
648							}
649
650	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
651	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
652
653	0						foreach my $dem ( keys %{$pms->{'pdfinfo'}->{"dems_pdf"}}) {
	0
654	0						my ($h,$w) = split(/x/,$dem);
655	0	0					next if ($h < $minh); # height less than min height
656	0	0					next if ($w < $minw); # width less than min width
657	0	0	0				next if (defined $maxh && $h > $maxh); # height more than max height
658	0	0	0				next if (defined $maxw && $w > $maxw); # width more than max width
659
660							# if we make it here, we have a match
661	0						return 1;
662							}
663
664	0						return 0;
665							}
666
667							# -----------------------------------------
668
669							sub pdf_match_md5 {
670
671	0			0	0		my ($self,$pms,$body,$md5) = @_;
672	0	0					return unless defined $md5;
673
674	0						my $uc_md5 = uc($md5); # uppercase matches only
675
676							# make sure we have pdf data read in.
677	0	0					if (!exists $pms->{'pdfinfo'}) {
678	0						$self->_find_pdf_mime_parts($pms);
679							}
680
681	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
682	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"md5"});
683	0	0					return 1 if (exists $pms->{'pdfinfo'}->{"md5"}->{$uc_md5});
684	0						return 0;
685							}
686
687							# -----------------------------------------
688
689							sub pdf_match_fuzzy_md5 {
690
691	0			0	0		my ($self,$pms,$body,$md5) = @_;
692	0	0					return unless defined $md5;
693
694	0						my $uc_md5 = uc($md5); # uppercase matches only
695
696							# make sure we have pdf data read in.
697	0	0					if (!exists $pms->{'pdfinfo'}) {
698	0						$self->_find_pdf_mime_parts($pms);
699							}
700
701	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
702	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{"fuzzy_md5"});
703	0	0					return 1 if (exists $pms->{'pdfinfo'}->{"fuzzy_md5"}->{$uc_md5});
704	0						return 0;
705							}
706
707							# -----------------------------------------
708
709							sub pdf_match_details {
710	0			0	0		my ($self, $pms, $body, $detail, $regex) = @_;
711	0	0	0				return unless ($detail && $regex);
712
713							# make sure we have pdf data read in.
714	0	0					if (!exists $pms->{'pdfinfo'}) {
715	0						$self->_find_pdf_mime_parts($pms);
716							}
717
718	0	0					return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
719	0	0					return 0 unless (exists $pms->{'pdfinfo'}->{'details'});
720
721	0						my $check_value = $pms->{pdfinfo}->{details}->{$detail};
722	0	0					return unless $check_value;
723
724	0						my ($rec, $err) = compile_regexp($regex, 2);
725	0	0					if (!$rec) {
726	0						info("pdfinfo: invalid regexp '$regex': $err");
727	0						return 0;
728							}
729
730	0	0					if ($check_value =~ $rec) {
731	0						dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
732	0						return 1;
733							}
734	0						return 0;
735							}
736
737							# -----------------------------------------
738
739							sub result_check {
740	0			0	0		my ($min, $max, $value, $nomaxequal) = @_;
741	0	0					return 0 unless defined $value;
742	0	0					return 0 if ($value < $min);
743	0	0	0				return 0 if (defined $max && $value > $max);
744	0	0	0				return 0 if (defined $nomaxequal && $nomaxequal && $value == $max);
			0
745	0						return 1;
746							}
747
748							# -----------------------------------------
749
750							1;
751