File Coverage

blib/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
Criterion Covered Total %
statement 40 213 18.7
branch 0 162 0.0
condition 1 54 1.8
subroutine 8 24 33.3
pod 1 15 6.6
total 50 468 10.6


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::PDFInfo
25              
26             =head1 DESCRIPTION
27              
28             This plugin helps detect spam using attached PDF files
29              
30             =over 4
31              
32             =item See "Usage:" below - more documentation see 20_pdfinfo.cf
33              
34             Original info kept for history. For later changes see SVN repo
35             -------------------------------------------------------
36             PDFInfo Plugin for SpamAssassin
37             Version: 0.8
38             Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $
39             Created: 2007-08-10
40             Modified: 2007-08-10
41             By: Dallas Engelken
42              
43             Changes:
44             0.8 - added .fdf detection (thanks John Lundin) [axb]
45             0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb]
46             0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc.
47             - fixed issue on perl 5.6.1 where pdf_match_details() failed to call
48             _find_pdf_mime_parts(), resulting in no detection of pdf mime parts.
49             quoted-printable support - requires MIME::QuotedPrint (which should be in everyone's
50             install as a part of the MIME-Base64 package which is a SA req)
51             - added simple pdf_is_empty_body() function which counts the body bytes minus the
52             subject line. can add optional <bytes> param if you need to allow for a few bytes.
53             0.5 - fix warns for undef $pdf_tags
54             - remove { } and \ before running eval in pdf_match_details to avoid eval error
55             0.4 - added pdf_is_encrypted() function
56             - added option to look for image HxW on same line
57             0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data
58             - renamed pdf_image_named() to pdf_named()
59             - PDF images are encapsulated and have no names. We are matching the PDF file name.
60             - renamed pdf_image_name_regex() to pdf_name_regex()
61             - PDF images are encapsulated and have no names. We are matching the PDF file name.
62             - changed pdf_image_count() a bit and added pdf_count().
63             - pdf_count() checks how many pdf attachments there are on the mail
64             - pdf_image_count() checks how many images are found within all pdfs in the mail.
65             - removed the restriction of the pdf containing an image in order to md5 it.
66             - added pdf_match_details() function to check the following 'details'
67             - author: Author of PDF if specified
68             - producer: Software used to produce PDF
69             - creator: Software used to produce PDF, usually similar to producer
70             - title: Title of PDF
71             - created: Creation Date
72             - modified: Last Modified
73             0.2 - support PDF octet-stream
74             0.1 - just ported over the imageinfo code, and renamed to pdfinfo.
75             - removed all support for png, gif, and jpg from the code.
76             - prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2.
77              
78             Usage:
79              
80             pdf_count()
81              
82             body RULENAME eval:pdf_count(<min>,[max])
83             min: required, message contains at least x pdf mime parts
84             max: optional, if specified, must not contain more than x pdf mime parts
85              
86             pdf_image_count()
87              
88             body RULENAME eval:pdf_image_count(<min>,[max])
89             min: required, message contains at least x images in pdf attachments.
90             max: optional, if specified, must not contain more than x pdf images
91              
92             pdf_pixel_coverage()
93              
94             body RULENAME eval:pdf_pixel_coverage(<min>,[max])
95             min: required, message contains at least this much pixel area
96             max: optional, if specified, message must not contain more than this much pixel area
97              
98             pdf_named()
99              
100             body RULENAME eval:pdf_named(<string>)
101             string: exact file name match, if you need partial match, see pdf_name_regex()
102              
103             pdf_name_regex()
104              
105             body RULENAME eval:pdf_name_regex(<regex>)
106             regex: regular expression, see examples in ruleset
107              
108             pdf_match_md5()
109              
110             body RULENAME eval:pdf_match_md5(<string>)
111             string: 32-byte md5 hex
112              
113             pdf_match_fuzzy_md5()
114              
115             body RULENAME eval:pdf_match_fuzzy_md5(<string>)
116             string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5
117              
118             pdf_match_details()
119              
120             body RULENAME eval:pdf_match_details(<detail>,<regex>);
121             detail: author, creator, created, modified, producer, title
122             regex: regular expression, see examples in ruleset
123              
124             pdf_is_encrypted()
125              
126             body RULENAME eval:pdf_is_encrypted()
127              
128             pdf_is_empty_body()
129              
130             body RULENAME eval:pdf_is_empty_body(<bytes>)
131             bytes: maximum byte count to allow and still consider it empty
132              
133             NOTE: See the ruleset for more examples that are not documented here.
134              
135             =back
136              
137             =cut
138              
139             # -------------------------------------------------------
140              
141              
142             use Mail::SpamAssassin::Plugin;
143 20     20   138 use Mail::SpamAssassin::Logger;
  20         40  
  20         630  
144 20     20   114 use Mail::SpamAssassin::Util qw(compile_regexp);
  20         69  
  20         1324  
145 20     20   139 use strict;
  20         35  
  20         879  
146 20     20   124 use warnings;
  20         47  
  20         506  
147 20     20   107 # use bytes;
  20         45  
  20         669  
148             use Digest::MD5 qw(md5_hex);
149 20     20   107 use MIME::QuotedPrint;
  20         39  
  20         1162  
150 20     20   8353  
  20         4927  
  20         66099  
151             our @ISA = qw(Mail::SpamAssassin::Plugin);
152              
153             # constructor: register the eval rule
154             my $class = shift;
155             my $mailsaobject = shift;
156 61     61 1 201  
157 61         177 # some boilerplate...
158             $class = ref($class) || $class;
159             my $self = $class->SUPER::new($mailsaobject);
160 61   33     350 bless ($self, $class);
161 61         323  
162 61         140 $self->register_eval_rule ("pdf_count");
163             $self->register_eval_rule ("pdf_image_count");
164 61         250 $self->register_eval_rule ("pdf_pixel_coverage");
165 61         209 $self->register_eval_rule ("pdf_image_size_exact");
166 61         193 $self->register_eval_rule ("pdf_image_size_range");
167 61         199 $self->register_eval_rule ("pdf_named");
168 61         204 $self->register_eval_rule ("pdf_name_regex");
169 61         204 $self->register_eval_rule ("pdf_image_to_text_ratio");
170 61         209 $self->register_eval_rule ("pdf_match_md5");
171 61         185 $self->register_eval_rule ("pdf_match_fuzzy_md5");
172 61         177 $self->register_eval_rule ("pdf_match_details");
173 61         183 $self->register_eval_rule ("pdf_is_encrypted");
174 61         182 $self->register_eval_rule ("pdf_is_empty_body");
175 61         189  
176 61         171 return $self;
177             }
178 61         513  
179             # -----------------------------------------
180              
# Dispatch table of per-type content parsers.  Each handler is invoked as
# $handler->($self, $pms, $part) and returns nothing useful; its results are
# recorded in $pms->{pdfinfo} and, through $self->_set_tag(), in the
# per-message tag data.
my %get_details = (

    # Parse one PDF mime part.
    #
    #   $self - the plugin object (used only for _set_tag)
    #   $pms  - per-message status hash; results go under $pms->{pdfinfo}
    #   $part - the mime part; 'name' is read and decode() is called on it
    #
    # Returns early, recording nothing, when the decoded data does not
    # start with a PDF version header.
    #
    # NOTE(review): 'encrypted', 'pc_pdf' and the 'details' entries are
    # overwritten on every call, so with several PDF parts the last one
    # parsed wins -- confirm that this is intended.
    'pdf' => sub {
        my ($self, $pms, $part) = @_;

        # The old code compared $part->{'type'} against 'quoted-printable'
        # and then ran decode_qp() on an empty string, so that branch could
        # only ever produce no data.  decode() is expected to undo the
        # part's transfer encoding itself (base64 or quoted-printable) --
        # TODO confirm against Mail::SpamAssassin::Message::Node.
        my $data = $part->decode();
        return unless defined $data;

        # the version header must show up within the first 8 bytes
        my $index = substr($data, 0, 8);

        return unless ($index =~ /.PDF\-(\d\.\d)/);
        my $version = $1;
        $self->_set_tag($pms, 'PDFVERSION', $version);
        # dbg("pdfinfo: pdf version = $version");

        my ($height, $width, $fuzzy_data, $pdf_tags);
        my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
        my ($md5, $fuzzy_md5) = ('', '');
        my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

        my $name = $part->{'name'} || '';
        $self->_set_tag($pms, 'PDFNAME', $name);

        my $no_more_fuzzy = 0;
        my $got_image = 0;
        my $encrypted = 0;

        while ($data =~ /([^\n]+)/g) {
            # dbg("pdfinfo: line=$1");
            my $line = $1;

            $line_count++;

            # lines containing high bytes will have no data we need, so save some cycles
            next if ($line =~ /[\x80-\xff]/);

            # fuzzy md5 #1: the first lines of the file, minus comments,
            # image/page dimensions and transformation matrices
            if (!$no_more_fuzzy && $line_count < 70) {
                if (   $line !~ m/^\%/
                    && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/
                    && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
                    $line =~ s/\s+$//;  # strip off whitespace at end.
                    $fuzzy_data .= $line;
                }
            }

            # fuzzy md5 #2: the sequence of tag names that start a line
            if ($line =~ m/^\/([A-Za-z]+)/) {
                $pdf_tags .= $1;
            }

            $got_image = 1 if ($line =~ m/\/Image/);
            $encrypted = 1 if ($line =~ m/^\/Encrypt/);

            # once we hit the first stream, we stop collecting data for fuzzy md5
            $no_more_fuzzy = 1 if ($line =~ m/stream/);

            # From a v1.3 pdf
            # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
            # [12234] dbg: pdfinfo: line=/Width 630
            # [12234] dbg: pdfinfo: line=/Height 149
            if ($got_image) {
                if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
                    $width = $1;
                    $height = $2;
                }
                elsif ($line =~ /^\/Width\s(\d+)/) {
                    $width = $1;
                }
                elsif ($line =~ /^\/Height\s(\d+)/) {
                    $height = $1;
                }
                elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
                    $width = $1;
                    $height = $2;
                }
            }

            # did pdf contain image data?
            if ($got_image && $width && $height) {
                $no_more_fuzzy = 1;
                my $area = $width * $height;
                $total_height += $height;
                $total_width += $width;
                $total_area += $area;
                $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
                $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
                dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
                $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
                $height = 0; $width = 0;  # reset and check for next image
                $got_image = 0;
            }

            # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
            # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
            # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
            # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
            # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
            # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
            # or all on same line inside xml - v1.6+
            # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>

            if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
                $producer = $1;
            }
            if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
                $created = $1;
            }
            if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
                $modified = $1;
            }
            if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
                $title = $1;
                # Title=\376\377\000w\000w\000n\000g
                # Title=wwng
                $title =~ s/\\\d{3}//g;
            }
            if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
                $creator = $1;
            }
            if ($line =~ /\/Author\s?\(([^\)]+)/) {
                $author = $1;
                # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
                # Author=HP_Administrator
                $author =~ s/\\\d{3}//g;
            }
        }

        # store the file name so we can check pdf_named() or pdf_name_regex() later.
        $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

        # store encrypted flag.
        $pms->{pdfinfo}->{encrypted} = $encrypted;

        # if we had multiple images in the pdf, we need to store the total HxW as well.
        # If it was a single Image PDF, then this value will already be in the hash.
        $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);

        if ($total_area) {
            $pms->{pdfinfo}->{pc_pdf} = $total_area;
            $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
            dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)");
        }

        dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");

        $md5 = uc(md5_hex($data)) if $data;
        $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
        my $tags_md5;
        $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

        dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));

        # we dont need tags for these.
        $pms->{pdfinfo}->{details}->{created} = $created if $created;
        $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;

        if ($producer) {
            $pms->{pdfinfo}->{details}->{producer} = $producer;
            $self->_set_tag($pms, 'PDFPRODUCER', $producer);
        }
        if ($title) {
            $pms->{pdfinfo}->{details}->{title} = $title;
            $self->_set_tag($pms, 'PDFTITLE', $title);
        }
        if ($creator) {
            $pms->{pdfinfo}->{details}->{creator} = $creator;
            $self->_set_tag($pms, 'PDFCREATOR', $creator);
        }
        if ($author) {
            $pms->{pdfinfo}->{details}->{author} = $author;
            $self->_set_tag($pms, 'PDFAUTHOR', $author);
        }
        if ($md5) {
            $pms->{pdfinfo}->{md5}->{$md5} = 1;
            # the tag carries the exact md5; it used to be given the fuzzy one
            $self->_set_tag($pms, 'PDFMD5', $md5);
        }
        if ($fuzzy_md5) {
            $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
            $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
        }
        if ($tags_md5) {
            # both fuzzy digests share one lookup table
            $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
            $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
        }
    },

);
371              
372             # ----------------------------------------
373              
374              
375             my ($self, $pms, $tag, $value) = @_;
376              
377             dbg("pdfinfo: set_tag called for $tag $value");
378 0     0     return unless ($tag && $value);
379              
380 0           if (exists $pms->{tag_data}->{$tag}) {
381 0 0 0       $pms->{tag_data}->{$tag} .= " $value"; # append value
382             }
383 0 0         else {
384 0           $pms->{tag_data}->{$tag} = $value;
385             }
386             }
387 0            
388             # ----------------------------------------
389              
390             my ($self,$pms) = @_;
391              
392             # bail early if message does not have pdf parts
393             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
394 0     0      
395             # initialize
396             $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
397 0 0         $pms->{'pdfinfo'}->{"count_pdf"} = 0;
398             $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;
399              
400 0           my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
401 0           my $part_count = scalar @parts;
402 0            
403             dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");
404 0            
405 0           # cache this so we can easily bail
406             $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;
407 0            
408             foreach my $p (@parts) {
409             my $type = $p->{'type'} =~ m@/([\w\-]+)$@;
410 0 0         my $name = $p->{'name'} || '';
411              
412 0           my $cte = lc( $p->get_header('content-transfer-encoding') || '' );
413 0            
414 0   0       dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");
415              
416 0   0       # make sure its a cte we support
417             next unless ($cte =~ /^(?:base64|quoted\-printable)$/);
418 0 0          
    0          
    0          
419             # filename must end with .pdf, or application type can be pdf
420             # sometimes windows muas will wrap a pdf up inside a .dat file
421 0 0         # v0.8 - Added .fdf phoney PDF detection
422             next unless ($name =~ /\.[fp]df$/ || $type eq 'pdf');
423              
424             # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
425             $type='pdf';
426 0 0 0        
427             if ($type && exists $get_details{$type}) {
428             $get_details{$type}->($self, $pms, $p);
429 0           $pms->{'pdfinfo'}->{"count_$type"} ++;
430             }
431 0 0 0       }
432 0            
433 0           $self->_set_tag($pms, 'PDFCOUNT', $pms->{'pdfinfo'}->{"count_pdf"});
434             $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});
435              
436             }
437 0            
438 0           # ----------------------------------------
439              
440             my ($self,$pms,$body,$name) = @_;
441             return unless (defined $name);
442              
443             # make sure we have image data read in.
444             if (!exists $pms->{'pdfinfo'}) {
445 0     0 0   $self->_find_pdf_mime_parts($pms);
446 0 0         }
447              
448             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
449 0 0          
450 0           return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
451             return 1 if (exists $pms->{'pdfinfo'}->{"names_pdf"}->{$name});
452             return 0;
453 0 0         }
454              
455 0 0         # -----------------------------------------
456 0 0          
457 0           my ($self,$pms,$body,$re) = @_;
458             return unless (defined $re);
459              
460             # make sure we have image data read in.
461             if (!exists $pms->{'pdfinfo'}) {
462             $self->_find_pdf_mime_parts($pms);
463 0     0 0   }
464 0 0          
465             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
466             return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
467 0 0          
468 0           my ($rec, $err) = compile_regexp($re, 2);
469             if (!$rec) {
470             info("pdfinfo: invalid regexp '$re': $err");
471 0 0         return 0;
472 0 0         }
473              
474 0           my $hit = 0;
475 0 0         foreach my $name (keys %{$pms->{'pdfinfo'}->{"names_pdf"}}) {
476 0           if ($name =~ $rec) {
477 0           dbg("pdfinfo: pdf_name_regex hit on $name");
478             return 1;
479             }
480 0           }
481 0           return 0;
  0            
482 0 0          
483 0           }
484 0            
485             # -----------------------------------------
486              
487 0           my ($self,$pms,$body) = @_;
488              
489             # make sure we have image data read in.
490             if (!exists $pms->{'pdfinfo'}) {
491             $self->_find_pdf_mime_parts($pms);
492             }
493              
494 0     0 0   return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
495             return $pms->{'pdfinfo'}->{'encrypted'};
496             }
497 0 0          
498 0           # -----------------------------------------
499              
500             my ($self,$pms,$body,$min,$max) = @_;
501 0 0         return unless defined $min;
502 0            
503             # make sure we have image data read in.
504             if (!exists $pms->{'pdfinfo'}) {
505             $self->_find_pdf_mime_parts($pms);
506             }
507              
508 0     0 0   return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
509 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf"});
510             return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf"});
511              
512 0 0         }
513 0            
514             # -----------------------------------------
515              
516 0 0         my ($self,$pms,$body,$min,$max) = @_;
517 0 0         return unless defined $min;
518 0            
519             # make sure we have image data read in.
520             if (!exists $pms->{'pdfinfo'}) {
521             $self->_find_pdf_mime_parts($pms);
522             }
523              
524             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
525 0     0 0   return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf_images"});
526 0 0         return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf_images"});
527              
528             }
529 0 0          
530 0           # -----------------------------------------
531              
532             my ($self,$pms,$body,$min,$max) = @_;
533 0 0         return unless (defined $min);
534 0 0          
535 0           # make sure we have image data read in.
536             if (!exists $pms->{'pdfinfo'}) {
537             $self->_find_pdf_mime_parts($pms);
538             }
539              
540             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
541             return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
542 0     0 0    
543 0 0         # dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
544             return result_check($min, $max, $pms->{'pdfinfo'}->{"pc_pdf"});
545             }
546 0 0          
547 0           # -----------------------------------------
548              
549             my ($self,$pms,$body,$min,$max) = @_;
550 0 0         return unless (defined $min && defined $max);
551 0 0          
552             # make sure we have image data read in.
553             if (!exists $pms->{'pdfinfo'}) {
554 0           $self->_find_pdf_mime_parts($pms);
555             }
556              
557             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
558             return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});
559              
560 0     0 0   # depending on how you call this eval (body vs rawbody),
561 0 0 0       # the $textlen will differ.
562             my $textlen = length(join('',@$body));
563              
564 0 0         return 0 unless ( $textlen > 0 && exists $pms->{'pdfinfo'}->{"pc_pdf"} && $pms->{'pdfinfo'}->{"pc_pdf"} > 0);
565 0            
566             my $ratio = $textlen / $pms->{'pdfinfo'}->{"pc_pdf"};
567             dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
568 0 0         return result_check($min, $max, $ratio, 1);
569 0 0         }
570              
571             # -----------------------------------------
572              
573 0           my ($self,$pms,$body,$min) = @_;
574              
575 0 0 0       $min ||= 0; # default to 0 bytes
      0        
576              
577 0           # make sure we have image data read in.
578 0           if (!exists $pms->{'pdfinfo'}) {
579 0           $self->_find_pdf_mime_parts($pms);
580             }
581              
582             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
583             return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};
584              
585 0     0 0   # check for cached result
586             return 1 if $pms->{'pdfinfo'}->{"no_body_text"};
587 0   0        
588             shift @$body; # shift body array removes line #1 -> subject line.
589              
590 0 0         my $bytes = 0;
591 0           my $textlen = length(join('',@$body));
592             foreach my $line (@$body) {
593             next unless ($line =~ m/\S/);
594 0 0         next if ($line =~ m/^Subject/);
595 0 0         $bytes += length($line);
596             }
597              
598 0 0         dbg("pdfinfo: is_empty_body = $bytes bytes");
599              
600 0           if ($bytes == 0 || ($bytes <= $min)) {
601             $pms->{'pdfinfo'}->{"no_body_text"} = 1;
602 0           return 1;
603 0           }
604 0            
605 0 0         # cache it and return 0
606 0 0         $pms->{'pdfinfo'}->{"no_body_text"} = 0;
607 0           return 0;
608             }
609              
610 0           # -----------------------------------------
611              
612 0 0 0       my ($self,$pms,$body,$height,$width) = @_;
613 0           return unless (defined $height && defined $width);
614 0            
615             # make sure we have image data read in.
616             if (!exists $pms->{'pdfinfo'}) {
617             $self->_find_pdf_mime_parts($pms);
618 0           }
619 0            
620             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
621             return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
622             return 1 if (exists $pms->{'pdfinfo'}->{"dems_pdf"}->{"${height}x${width}"});
623             return 0;
624             }
625 0     0 0    
626 0 0 0       # -----------------------------------------
627              
628             my ($self,$pms,$body,$minh,$minw,$maxh,$maxw) = @_;
629 0 0         return unless (defined $minh && defined $minw);
630 0            
631             # make sure we have image data read in.
632             if (!exists $pms->{'pdfinfo'}) {
633 0 0         $self->_find_pdf_mime_parts($pms);
634 0 0         }
635 0 0          
636 0           return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
637             return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
638              
639             foreach my $dem ( keys %{$pms->{'pdfinfo'}->{"dems_pdf"}}) {
640             my ($h,$w) = split(/x/,$dem);
641             next if ($h < $minh); # height less than min height
642 0     0 0   next if ($w < $minw); # width less than min width
643 0 0 0       next if (defined $maxh && $h > $maxh); # height more than max height
644             next if (defined $maxw && $w > $maxw); # width more than max width
645              
646 0 0         # if we make it here, we have a match
647 0           return 1;
648             }
649              
650 0 0         return 0;
651 0 0         }
652              
653 0           # -----------------------------------------
  0            
654 0            
655 0 0          
656 0 0         my ($self,$pms,$body,$md5) = @_;
657 0 0 0       return unless defined $md5;
658 0 0 0        
659             my $uc_md5 = uc($md5); # uppercase matches only
660              
661 0           # make sure we have pdf data read in.
662             if (!exists $pms->{'pdfinfo'}) {
663             $self->_find_pdf_mime_parts($pms);
664 0           }
665              
666             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
667             return 0 unless (exists $pms->{'pdfinfo'}->{"md5"});
668             return 1 if (exists $pms->{'pdfinfo'}->{"md5"}->{$uc_md5});
669             return 0;
670             }
671 0     0 0    
672 0 0         # -----------------------------------------
673              
674 0            
675             my ($self,$pms,$body,$md5) = @_;
676             return unless defined $md5;
677 0 0          
678 0           my $uc_md5 = uc($md5); # uppercase matches only
679              
680             # make sure we have pdf data read in.
681 0 0         if (!exists $pms->{'pdfinfo'}) {
682 0 0         $self->_find_pdf_mime_parts($pms);
683 0 0         }
684 0            
685             return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
686             return 0 unless (exists $pms->{'pdfinfo'}->{"fuzzy_md5"});
687             return 1 if (exists $pms->{'pdfinfo'}->{"fuzzy_md5"}->{$uc_md5});
688             return 0;
689             }
690              
691 0     0 0   # -----------------------------------------
692 0 0          
693             my ($self, $pms, $body, $detail, $regex) = @_;
694 0           return unless ($detail && $regex);
695              
696             # make sure we have pdf data read in.
697 0 0         if (!exists $pms->{'pdfinfo'}) {
698 0           $self->_find_pdf_mime_parts($pms);
699             }
700              
701 0 0         return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
702 0 0         return 0 unless (exists $pms->{'pdfinfo'}->{'details'});
703 0 0          
704 0           my $check_value = $pms->{pdfinfo}->{details}->{$detail};
705             return unless $check_value;
706              
707             my ($rec, $err) = compile_regexp($regex, 2);
708             if (!$rec) {
709             info("pdfinfo: invalid regexp '$regex': $err");
710 0     0 0   return 0;
711 0 0 0       }
712              
713             if ($check_value =~ $rec) {
714 0 0         dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
715 0           return 1;
716             }
717             return 0;
718 0 0         }
719 0 0          
720             # -----------------------------------------
721 0            
722 0 0         my ($min, $max, $value, $nomaxequal) = @_;
723             return 0 unless defined $value;
724 0           return 0 if ($value < $min);
725 0 0         return 0 if (defined $max && $value > $max);
726 0           return 0 if (defined $nomaxequal && $nomaxequal && $value == $max);
727 0           return 1;
728             }
729              
730 0 0         # -----------------------------------------
731 0            
732 0           1;
733