File Coverage

blib/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
Criterion Covered Total %
statement 40 213 18.7
branch 0 162 0.0
condition 1 54 1.8
subroutine 8 24 33.3
pod 1 15 6.6
total 50 468 10.6


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::PDFInfo
25              
26             =head1 DESCRIPTION
27              
28             This plugin helps detect spam using attached PDF files
29              
30             =over 4
31              
32             =item See "Usage:" below; for more documentation, see 20_pdfinfo.cf
33              
34             Original info kept for history. For later changes see SVN repo
35             -------------------------------------------------------
36             PDFInfo Plugin for SpamAssassin
37             Version: 0.8
38             Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $
39             Created: 2007-08-10
40             Modified: 2007-08-10
41             By: Dallas Engelken
42              
43             Changes:
44             0.8 - added .fdf detection (thanks John Lundin) [axb]
45             0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb]
46             0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc.
47             - fixed issue on perl 5.6.1 where pdf_match_details() failed to call
48             _find_pdf_mime_parts(), resulting in no detection of pdf mime parts.
49             - quoted-printable support - requires MIME::QuotedPrint (which should be in everyone's
50             install as a part of the MIME-Base64 package which is a SA req)
51             - added simple pdf_is_empty_body() function which counts the body bytes minus the
52             subject line. can add optional <bytes> param if you need to allow for a few bytes.
53             0.5 - fix warns for undef $pdf_tags
54             - remove { } and \ before running eval in pdf_match_details to avoid eval error
55             0.4 - added pdf_is_encrypted() function
56             - added option to look for image HxW on same line
57             0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data
58             - renamed pdf_image_named() to pdf_named()
59             - PDF images are encapsulated and have no names. We are matching the PDF file name.
60             - renamed pdf_image_name_regex() to pdf_name_regex()
61             - PDF images are encapsulated and have no names. We are matching the PDF file name.
62             - changed pdf_image_count() a bit and added pdf_count().
63             - pdf_count() checks how many pdf attachments there are on the mail
64             - pdf_image_count() checks how many images are found within all pdfs in the mail.
65             - removed the restriction of the pdf containing an image in order to md5 it.
66             - added pdf_match_details() function to check the following 'details'
67             - author: Author of PDF if specified
68             - producer: Software used to produce PDF
69             - creator: Software used to produce PDF, usually similar to producer
70             - title: Title of PDF
71             - created: Creation Date
72             - modified: Last Modified
73             0.2 - support PDF octet-stream
74             0.1 - just ported over the imageinfo code, and renamed to pdfinfo.
75             - removed all support for png, gif, and jpg from the code.
76             - prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2.
77              
78             Usage:
79              
80             pdf_count()
81              
82             body RULENAME eval:pdf_count(<min>,[max])
83             min: required, message contains at least x pdf mime parts
84             max: optional, if specified, must not contain more than x pdf mime parts
85              
86             pdf_image_count()
87              
88             body RULENAME eval:pdf_image_count(<min>,[max])
89             min: required, message contains at least x images in pdf attachments.
90             max: optional, if specified, must not contain more than x pdf images
91              
92             pdf_pixel_coverage()
93              
94             body RULENAME eval:pdf_pixel_coverage(<min>,[max])
95             min: required, message contains at least this much pixel area
96             max: optional, if specified, message must not contain more than this much pixel area
97              
98             pdf_named()
99              
100             body RULENAME eval:pdf_named(<string>)
101             string: exact file name match, if you need partial match, see pdf_name_regex()
102              
103             pdf_name_regex()
104              
105             body RULENAME eval:pdf_name_regex(<regex>)
106             regex: regular expression, see examples in ruleset
107              
108             pdf_match_md5()
109              
110             body RULENAME eval:pdf_match_md5(<string>)
111             string: 32-byte md5 hex
112              
113             pdf_match_fuzzy_md5()
114              
115             body RULENAME eval:pdf_match_fuzzy_md5(<string>)
116             string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5
117              
118             pdf_match_details()
119              
120             body RULENAME eval:pdf_match_details(<detail>,<regex>);
121             detail: author, creator, created, modified, producer, title
122             regex: regular expression, see examples in ruleset
123              
124             pdf_is_encrypted()
125              
126             body RULENAME eval:pdf_is_encrypted()
127              
128             pdf_is_empty_body()
129              
130             body RULENAME eval:pdf_is_empty_body(<bytes>)
131             bytes: maximum byte count to allow and still consider it empty
132              
133             NOTE: See the ruleset for more examples that are not documented here.
134              
135             =back
136              
137             =cut
138              
139             # -------------------------------------------------------
140              
141             package Mail::SpamAssassin::Plugin::PDFInfo;
142              
143 19     19   160 use Mail::SpamAssassin::Plugin;
  19         50  
  19         680  
144 19     19   123 use Mail::SpamAssassin::Logger;
  19         48  
  19         1213  
145 19     19   145 use Mail::SpamAssassin::Util qw(compile_regexp);
  19         61  
  19         984  
146 19     19   137 use strict;
  19         46  
  19         486  
147 19     19   107 use warnings;
  19         53  
  19         663  
148             # use bytes;
149 19     19   173 use Digest::MD5 qw(md5_hex);
  19         44  
  19         1417  
150 19     19   9270 use MIME::QuotedPrint;
  19         5136  
  19         72583  
151              
152             our @ISA = qw(Mail::SpamAssassin::Plugin);
153              
154             # constructor: register the eval rule
155             sub new {
156 60     60 1 216 my $class = shift;
157 60         190 my $mailsaobject = shift;
158              
159             # some boilerplate...
160 60   33     445 $class = ref($class) || $class;
161 60         353 my $self = $class->SUPER::new($mailsaobject);
162 60         181 bless ($self, $class);
163              
164 60         309 $self->register_eval_rule ("pdf_count");
165 60         228 $self->register_eval_rule ("pdf_image_count");
166 60         226 $self->register_eval_rule ("pdf_pixel_coverage");
167 60         232 $self->register_eval_rule ("pdf_image_size_exact");
168 60         226 $self->register_eval_rule ("pdf_image_size_range");
169 60         221 $self->register_eval_rule ("pdf_named");
170 60         210 $self->register_eval_rule ("pdf_name_regex");
171 60         220 $self->register_eval_rule ("pdf_image_to_text_ratio");
172 60         203 $self->register_eval_rule ("pdf_match_md5");
173 60         208 $self->register_eval_rule ("pdf_match_fuzzy_md5");
174 60         215 $self->register_eval_rule ("pdf_match_details");
175 60         246 $self->register_eval_rule ("pdf_is_encrypted");
176 60         235 $self->register_eval_rule ("pdf_is_empty_body");
177              
178 60         563 return $self;
179             }
180              
181             # -----------------------------------------
182              
183             my %get_details = (
184             'pdf' => sub {
185             my ($self, $pms, $part) = @_;
186              
187             my $type = $part->{'type'} || 'base64';
188             my $data = '';
189              
190             if ($type eq 'quoted-printable') {
191             $data = decode_qp($data); # use QuotedPrint->decode_qp
192             }
193             else {
194             $data = $part->decode(); # just use built in base64 decoder
195             }
196              
197             my $index = substr($data, 0, 8);
198              
199             return unless ($index =~ /.PDF\-(\d\.\d)/);
200             my $version = $1;
201             $self->_set_tag($pms, 'PDFVERSION', $version);
202             # dbg("pdfinfo: pdf version = $version");
203              
204             my ($height, $width, $fuzzy_data, $pdf_tags);
205             my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
206             my ($md5, $fuzzy_md5) = ('', '');
207             my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);
208              
209             my $name = $part->{'name'} || '';
210             $self->_set_tag($pms, 'PDFNAME', $name);
211              
212             my $no_more_fuzzy = 0;
213             my $got_image = 0;
214             my $encrypted = 0;
215              
216             while($data =~ /([^\n]+)/g) {
217             # dbg("pdfinfo: line=$1");
218             my $line = $1;
219              
220             $line_count++;
221              
222             # lines containing high bytes will have no data we need, so save some cycles
223             next if ($line =~ /[\x80-\xff]/);
224              
225             if (!$no_more_fuzzy && $line_count < 70) {
226             if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
227             $line =~ s/\s+$//; # strip off whitespace at end.
228             $fuzzy_data .= $line;
229             }
230             }
231              
232             if ($line =~ m/^\/([A-Za-z]+)/) {
233             $pdf_tags .= $1;
234             }
235              
236             $got_image=1 if ($line =~ m/\/Image/);
237             $encrypted=1 if ($line =~ m/^\/Encrypt/);
238              
239             # once we hit the first stream, we stop collecting data for fuzzy md5
240             $no_more_fuzzy = 1 if ($line =~ m/stream/);
241              
242             # From a v1.3 pdf
243             # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
244             # [12234] dbg: pdfinfo: line=/Width 630
245             # [12234] dbg: pdfinfo: line=/Height 149
246             if ($got_image) {
247             if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
248             $width = $1;
249             $height = $2;
250             }
251             elsif ($line =~ /^\/Width\s(\d+)/) {
252             $width = $1;
253             }
254             elsif ($line =~ /^\/Height\s(\d+)/) {
255             $height = $1;
256             }
257             elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
258             $width = $1;
259             $height = $2;
260             }
261             }
262              
263             # did pdf contain image data?
264             if ($got_image && $width && $height) {
265             $no_more_fuzzy = 1;
266             my $area = $width * $height;
267             $total_height += $height;
268             $total_width += $width;
269             $total_area += $area;
270             $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
271             $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
272             dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
273             $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
274             $height=0; $width=0; # reset and check for next image
275             $got_image = 0;
276             }
277              
278             # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
279             # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
280             # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
281             # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
282             # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
283             # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
284             # or all on same line inside xml - v1.6+
285             # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>
286              
287             if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
288             $producer = $1;
289             }
290             if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
291             $created = $1;
292             }
293             if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
294             $modified = $1;
295             }
296             if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
297             $title = $1;
298             # Title=\376\377\000w\000w\000n\000g
299             # Title=wwng
300             $title =~ s/\\\d{3}//g;
301             }
302             if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
303             $creator = $1;
304             }
305             if ($line =~ /\/Author\s?\(([^\)]+)/) {
306             $author = $1;
307             # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
308             # Author=HP_Administrator
309             $author =~ s/\\\d{3}//g;
310             }
311             }
312              
313             # store the file name so we can check pdf_named() or pdf_name_match() later.
314             $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;
315              
316             # store encrypted flag.
317             $pms->{pdfinfo}->{encrypted} = $encrypted;
318              
319             # if we had multiple images in the pdf, we need to store the total HxW as well.
320             # If it was a single Image PDF, then this value will already be in the hash.
321             $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);;
322              
323             if ($total_area) {
324             $pms->{pdfinfo}->{pc_pdf} = $total_area;
325             $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
326             dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)") if ($total_area);
327             }
328              
329             dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");
330              
331             $md5 = uc(md5_hex($data)) if $data;
332             $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
333             my $tags_md5;
334             $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;
335              
336             dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));
337              
338             # we dont need tags for these.
339             $pms->{pdfinfo}->{details}->{created} = $created if $created;
340             $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;
341              
342             if ($producer) {
343             $pms->{pdfinfo}->{details}->{producer} = $producer if $producer;
344             $self->_set_tag($pms, 'PDFPRODUCER', $producer);
345             }
346             if ($title) {
347             $pms->{pdfinfo}->{details}->{title} = $title;
348             $self->_set_tag($pms, 'PDFTITLE', $title);
349             }
350             if ($creator) {
351             $pms->{pdfinfo}->{details}->{creator} = $creator;
352             $self->_set_tag($pms, 'PDFCREATOR', $creator);
353             }
354             if ($author) {
355             $pms->{pdfinfo}->{details}->{author} = $author;
356             $self->_set_tag($pms, 'PDFAUTHOR', $author);
357             }
358             if ($md5) {
359             $pms->{pdfinfo}->{md5}->{$md5} = 1;
360             $self->_set_tag($pms, 'PDFMD5', $fuzzy_md5);
361             }
362             if ($fuzzy_md5) {
363             $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
364             $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
365             }
366             if ($tags_md5) {
367             $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
368             $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
369             }
370             },
371              
372             );
373              
374             # ----------------------------------------
375              
376             sub _set_tag {
377              
378 0     0     my ($self, $pms, $tag, $value) = @_;
379              
380 0           dbg("pdfinfo: set_tag called for $tag $value");
381 0 0 0       return unless ($tag && $value);
382              
383 0 0         if (exists $pms->{tag_data}->{$tag}) {
384 0           $pms->{tag_data}->{$tag} .= " $value"; # append value
385             }
386             else {
387 0           $pms->{tag_data}->{$tag} = $value;
388             }
389             }
390              
391             # ----------------------------------------
392              
393             sub _find_pdf_mime_parts {
394 0     0     my ($self,$pms) = @_;
395              
396             # bail early if message does not have pdf parts
397 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
398              
399             # initialize
400 0           $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
401 0           $pms->{'pdfinfo'}->{"count_pdf"} = 0;
402 0           $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;
403              
404 0           my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
405 0           my $part_count = scalar @parts;
406              
407 0           dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");
408              
409             # cache this so we can easily bail
410 0 0         $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;
411              
412 0           foreach my $p (@parts) {
413 0           my $type = $p->{'type'} =~ m@/([\w\-]+)$@;
414 0   0       my $name = $p->{'name'} || '';
415              
416 0   0       my $cte = lc( $p->get_header('content-transfer-encoding') || '' );
417              
418 0 0         dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");
    0          
    0          
419              
420             # make sure its a cte we support
421 0 0         next unless ($cte =~ /^(?:base64|quoted\-printable)$/);
422              
423             # filename must end with .pdf, or application type can be pdf
424             # sometimes windows muas will wrap a pdf up inside a .dat file
425             # v0.8 - Added .fdf phoney PDF detection
426 0 0 0       next unless ($name =~ /\.[fp]df$/ || $type eq 'pdf');
427              
428             # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
429 0           $type='pdf';
430              
431 0 0 0       if ($type && exists $get_details{$type}) {
432 0           $get_details{$type}->($self, $pms, $p);
433 0           $pms->{'pdfinfo'}->{"count_$type"} ++;
434             }
435             }
436              
437 0           $self->_set_tag($pms, 'PDFCOUNT', $pms->{'pdfinfo'}->{"count_pdf"});
438 0           $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});
439              
440             }
441              
442             # ----------------------------------------
443              
444             sub pdf_named {
445 0     0 0   my ($self,$pms,$body,$name) = @_;
446 0 0         return unless (defined $name);
447              
448             # make sure we have image data read in.
449 0 0         if (!exists $pms->{'pdfinfo'}) {
450 0           $self->_find_pdf_mime_parts($pms);
451             }
452              
453 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
454              
455 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
456 0 0         return 1 if (exists $pms->{'pdfinfo'}->{"names_pdf"}->{$name});
457 0           return 0;
458             }
459              
460             # -----------------------------------------
461              
462             sub pdf_name_regex {
463 0     0 0   my ($self,$pms,$body,$re) = @_;
464 0 0         return unless (defined $re);
465              
466             # make sure we have image data read in.
467 0 0         if (!exists $pms->{'pdfinfo'}) {
468 0           $self->_find_pdf_mime_parts($pms);
469             }
470              
471 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
472 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
473              
474 0           my ($rec, $err) = compile_regexp($re, 2);
475 0 0         if (!$rec) {
476 0           info("pdfinfo: invalid regexp '$re': $err");
477 0           return 0;
478             }
479              
480 0           my $hit = 0;
481 0           foreach my $name (keys %{$pms->{'pdfinfo'}->{"names_pdf"}}) {
  0            
482 0 0         if ($name =~ $rec) {
483 0           dbg("pdfinfo: pdf_name_regex hit on $name");
484 0           return 1;
485             }
486             }
487 0           return 0;
488              
489             }
490              
491             # -----------------------------------------
492              
493             sub pdf_is_encrypted {
494 0     0 0   my ($self,$pms,$body) = @_;
495              
496             # make sure we have image data read in.
497 0 0         if (!exists $pms->{'pdfinfo'}) {
498 0           $self->_find_pdf_mime_parts($pms);
499             }
500              
501 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
502 0           return $pms->{'pdfinfo'}->{'encrypted'};
503             }
504              
505             # -----------------------------------------
506              
507             sub pdf_count {
508 0     0 0   my ($self,$pms,$body,$min,$max) = @_;
509 0 0         return unless defined $min;
510              
511             # make sure we have image data read in.
512 0 0         if (!exists $pms->{'pdfinfo'}) {
513 0           $self->_find_pdf_mime_parts($pms);
514             }
515              
516 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
517 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf"});
518 0           return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf"});
519              
520             }
521              
522             # -----------------------------------------
523              
524             sub pdf_image_count {
525 0     0 0   my ($self,$pms,$body,$min,$max) = @_;
526 0 0         return unless defined $min;
527              
528             # make sure we have image data read in.
529 0 0         if (!exists $pms->{'pdfinfo'}) {
530 0           $self->_find_pdf_mime_parts($pms);
531             }
532              
533 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
534 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf_images"});
535 0           return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf_images"});
536              
537             }
538              
539             # -----------------------------------------
540              
541             sub pdf_pixel_coverage {
542 0     0 0   my ($self,$pms,$body,$min,$max) = @_;
543 0 0         return unless (defined $min);
544              
545             # make sure we have image data read in.
546 0 0         if (!exists $pms->{'pdfinfo'}) {
547 0           $self->_find_pdf_mime_parts($pms);
548             }
549              
550 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
551 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
552              
553             # dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
554 0           return result_check($min, $max, $pms->{'pdfinfo'}->{"pc_pdf"});
555             }
556              
557             # -----------------------------------------
558              
559             sub pdf_image_to_text_ratio {
560 0     0 0   my ($self,$pms,$body,$min,$max) = @_;
561 0 0 0       return unless (defined $min && defined $max);
562              
563             # make sure we have image data read in.
564 0 0         if (!exists $pms->{'pdfinfo'}) {
565 0           $self->_find_pdf_mime_parts($pms);
566             }
567              
568 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
569 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
570              
571             # depending on how you call this eval (body vs rawbody),
572             # the $textlen will differ.
573 0           my $textlen = length(join('',@$body));
574              
575 0 0 0       return 0 unless ( $textlen > 0 && exists $pms->{'pdfinfo'}->{"pc_pdf"} && $pms->{'pdfinfo'}->{"pc_pdf"} > 0);
      0        
576              
577 0           my $ratio = $textlen / $pms->{'pdfinfo'}->{"pc_pdf"};
578 0           dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
579 0           return result_check($min, $max, $ratio, 1);
580             }
581              
582             # -----------------------------------------
583              
584             sub pdf_is_empty_body {
585 0     0 0   my ($self,$pms,$body,$min) = @_;
586              
587 0   0       $min ||= 0; # default to 0 bytes
588              
589             # make sure we have image data read in.
590 0 0         if (!exists $pms->{'pdfinfo'}) {
591 0           $self->_find_pdf_mime_parts($pms);
592             }
593              
594 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
595 0 0         return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};
596              
597             # check for cached result
598 0 0         return 1 if $pms->{'pdfinfo'}->{"no_body_text"};
599              
600 0           shift @$body; # shift body array removes line #1 -> subject line.
601              
602 0           my $bytes = 0;
603 0           my $textlen = length(join('',@$body));
604 0           foreach my $line (@$body) {
605 0 0         next unless ($line =~ m/\S/);
606 0 0         next if ($line =~ m/^Subject/);
607 0           $bytes += length($line);
608             }
609              
610 0           dbg("pdfinfo: is_empty_body = $bytes bytes");
611              
612 0 0 0       if ($bytes == 0 || ($bytes <= $min)) {
613 0           $pms->{'pdfinfo'}->{"no_body_text"} = 1;
614 0           return 1;
615             }
616              
617             # cache it and return 0
618 0           $pms->{'pdfinfo'}->{"no_body_text"} = 0;
619 0           return 0;
620             }
621              
622             # -----------------------------------------
623              
624             sub pdf_image_size_exact {
625 0     0 0   my ($self,$pms,$body,$height,$width) = @_;
626 0 0 0       return unless (defined $height && defined $width);
627              
628             # make sure we have image data read in.
629 0 0         if (!exists $pms->{'pdfinfo'}) {
630 0           $self->_find_pdf_mime_parts($pms);
631             }
632              
633 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
634 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
635 0 0         return 1 if (exists $pms->{'pdfinfo'}->{"dems_pdf"}->{"${height}x${width}"});
636 0           return 0;
637             }
638              
639             # -----------------------------------------
640              
641             sub pdf_image_size_range {
642 0     0 0   my ($self,$pms,$body,$minh,$minw,$maxh,$maxw) = @_;
643 0 0 0       return unless (defined $minh && defined $minw);
644              
645             # make sure we have image data read in.
646 0 0         if (!exists $pms->{'pdfinfo'}) {
647 0           $self->_find_pdf_mime_parts($pms);
648             }
649              
650 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
651 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
652              
653 0           foreach my $dem ( keys %{$pms->{'pdfinfo'}->{"dems_pdf"}}) {
  0            
654 0           my ($h,$w) = split(/x/,$dem);
655 0 0         next if ($h < $minh); # height less than min height
656 0 0         next if ($w < $minw); # width less than min width
657 0 0 0       next if (defined $maxh && $h > $maxh); # height more than max height
658 0 0 0       next if (defined $maxw && $w > $maxw); # width more than max width
659              
660             # if we make it here, we have a match
661 0           return 1;
662             }
663              
664 0           return 0;
665             }
666              
667             # -----------------------------------------
668              
669             sub pdf_match_md5 {
670              
671 0     0 0   my ($self,$pms,$body,$md5) = @_;
672 0 0         return unless defined $md5;
673              
674 0           my $uc_md5 = uc($md5); # uppercase matches only
675              
676             # make sure we have pdf data read in.
677 0 0         if (!exists $pms->{'pdfinfo'}) {
678 0           $self->_find_pdf_mime_parts($pms);
679             }
680              
681 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
682 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"md5"});
683 0 0         return 1 if (exists $pms->{'pdfinfo'}->{"md5"}->{$uc_md5});
684 0           return 0;
685             }
686              
687             # -----------------------------------------
688              
689             sub pdf_match_fuzzy_md5 {
690              
691 0     0 0   my ($self,$pms,$body,$md5) = @_;
692 0 0         return unless defined $md5;
693              
694 0           my $uc_md5 = uc($md5); # uppercase matches only
695              
696             # make sure we have pdf data read in.
697 0 0         if (!exists $pms->{'pdfinfo'}) {
698 0           $self->_find_pdf_mime_parts($pms);
699             }
700              
701 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
702 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"fuzzy_md5"});
703 0 0         return 1 if (exists $pms->{'pdfinfo'}->{"fuzzy_md5"}->{$uc_md5});
704 0           return 0;
705             }
706              
707             # -----------------------------------------
708              
709             sub pdf_match_details {
710 0     0 0   my ($self, $pms, $body, $detail, $regex) = @_;
711 0 0 0       return unless ($detail && $regex);
712              
713             # make sure we have pdf data read in.
714 0 0         if (!exists $pms->{'pdfinfo'}) {
715 0           $self->_find_pdf_mime_parts($pms);
716             }
717              
718 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
719 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{'details'});
720              
721 0           my $check_value = $pms->{pdfinfo}->{details}->{$detail};
722 0 0         return unless $check_value;
723              
724 0           my ($rec, $err) = compile_regexp($regex, 2);
725 0 0         if (!$rec) {
726 0           info("pdfinfo: invalid regexp '$regex': $err");
727 0           return 0;
728             }
729              
730 0 0         if ($check_value =~ $rec) {
731 0           dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
732 0           return 1;
733             }
734 0           return 0;
735             }
736              
737             # -----------------------------------------
738              
739             sub result_check {
740 0     0 0   my ($min, $max, $value, $nomaxequal) = @_;
741 0 0         return 0 unless defined $value;
742 0 0         return 0 if ($value < $min);
743 0 0 0       return 0 if (defined $max && $value > $max);
744 0 0 0       return 0 if (defined $nomaxequal && $nomaxequal && $value == $max);
      0        
745 0           return 1;
746             }
747              
748             # -----------------------------------------
749              
750             1;
751