File Coverage

blib/lib/Mail/Mbox/MessageParser/Perl.pm
Criterion Covered Total %
statement 150 156 96.1
branch 41 50 82.0
condition 10 12 83.3
subroutine 18 18 100.0
pod 4 4 100.0
total 223 240 92.9


line stmt bran cond sub pod time code
1             package Mail::Mbox::MessageParser::Perl;
2              
3 38     38   2483001 use strict;
  38         109  
  38         1103  
4 38     38   197 use Carp;
  38         100  
  38         1973  
5              
6 38     38   719 use Mail::Mbox::MessageParser;
  38         73  
  38         1296  
7 38     38   346 use Mail::Mbox::MessageParser::Config;
  38         214  
  38         952  
8              
9 38     38   359 use vars qw( $VERSION $_DEBUG @ISA );
  38         80  
  38         76808  
10              
11             @ISA = qw( Exporter Mail::Mbox::MessageParser );
12              
13             $VERSION = sprintf "%d.%02d%02d", q/1.60.5/ =~ /(\d+)/g;
14              
15             *_DEBUG = \$Mail::Mbox::MessageParser::_DEBUG;
16             *_dprint = \&Mail::Mbox::MessageParser::_dprint;
17             sub _dprint;
18              
19             #-------------------------------------------------------------------------------
20              
21             sub new
22             {
23 392     392 1 1085 my ($proto, $self) = @_;
24              
25 392 50       1028 carp "Need file_handle option" unless defined $self->{'file_handle'};
26              
27 392         1321 bless ($self, __PACKAGE__);
28              
29 392         1213 $self->_init();
30              
31 392         862 return $self;
32             }
33              
34             #-------------------------------------------------------------------------------
35              
36             sub _init
37             {
38 392     392   801 my $self = shift;
39              
40 392         1546 $self->{'CURRENT_LINE_NUMBER'} = 1;
41 392         1323 $self->{'CURRENT_OFFSET'} = 0;
42              
43 392         1543 $self->{'READ_BUFFER'} = '';
44 392         792 $self->{'START_OF_EMAIL'} = 0;
45 392         649 $self->{'END_OF_EMAIL'} = 0;
46              
47             $self->{'READ_CHUNK_SIZE'} =
48 392         1020 $Mail::Mbox::MessageParser::Config{'read_chunk_size'};
49              
50 392         1519 $self->SUPER::_init();
51             }
52              
53             #-------------------------------------------------------------------------------
54              
55             sub reset
56             {
57 40     40 1 106 my $self = shift;
58              
59 40         99 $self->{'CURRENT_LINE_NUMBER'} = ($self->{'prologue'} =~ tr/\n//) + 1;
60 40         71 $self->{'CURRENT_OFFSET'} = length($self->{'prologue'});
61              
62 40         79 $self->{'READ_BUFFER'} = '';
63 40         64 $self->{'START_OF_EMAIL'} = 0;
64 40         76 $self->{'END_OF_EMAIL'} = 0;
65              
66 40         123 $self->SUPER::reset();
67             }
68              
69             #-------------------------------------------------------------------------------
70              
71             sub end_of_file
72             {
73 6169     6169 1 21589 my $self = shift;
74              
75             # Reset eof in case the file was appended to. Hopefully this works all the
76             # time. See perldoc -f seek for details.
77 6169 100       15932 seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'};
78              
79             return eof $self->{'file_handle'} &&
80 6169   100     442050 $self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'});
81             }
82              
83             #-------------------------------------------------------------------------------
84              
85             sub _read_prologue
86             {
87 392     392   707 my $self = shift;
88              
89 392         1183 _dprint "Reading mailbox prologue using Perl";
90              
91 392         5716 $self->_read_until_match(
92             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,0);
93              
94 392         1360 my $start_of_email = pos($self->{'READ_BUFFER'});
95 392         2125 $self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email);
96              
97             # Set up for read_next_email
98 392         1031 $self->{'CURRENT_LINE_NUMBER'} += ($self->{'prologue'} =~ tr/\n//);
99 392         730 $self->{'CURRENT_OFFSET'} = $start_of_email;
100 392         1224 $self->{'END_OF_EMAIL'} = $start_of_email;
101             }
102              
103             #-------------------------------------------------------------------------------
104              
105             sub read_next_email
106             {
107 2286     2286 1 31994 my $self = shift;
108              
109 2286 100       3722 return undef if $self->end_of_file(); ## no critic (ProhibitExplicitReturnUndef)
110              
111 2264         85340 $self->{'email_line_number'} = $self->{'CURRENT_LINE_NUMBER'};
112 2264         3408 $self->{'email_offset'} = $self->{'CURRENT_OFFSET'};
113              
114 2264         3094 $self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'};
115              
116             # Slurp in an entire multipart email (but continue looking for the next
117             # header so that we can get any following newlines as well)
118 2264 50       4280 unless ($self->_read_header())
119             {
120 0         0 return $self->_extract_email_and_finalize();
121             }
122              
123 2264 100       4702 unless ($self->_read_email_parts())
124             {
125             # Could issue a warning here, but I'm not sure how to do this cleanly for
126             # a work-only module like this. Maybe something like CGI's cgi_error()?
127 60         144 _dprint "Inconsistent multi-part message. Could not find ending for " .
128             "boundary \"" . $self->_multipart_boundary() . "\"";
129              
130             # Try to read the content length and use that
131             my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
132 60         203 $self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'});
133              
134             my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD(
135 60         163 \$email_header, 'Content-Length:', $self->{'endline'});
136              
137 60 100       142 if (defined $content_length)
138             {
139 30         125 $content_length =~ s/Content-Length: *(\d+).*/$1/i;
140 30         118 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length;
141             }
142             # Otherwise use the start of the body
143             else
144             {
145 30         74 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'};
146             }
147              
148             # Reset the search and look for the start of the
149             # next email.
150 60         167 $self->_read_rest_of_email();
151              
152 60         120 return $self->_extract_email_and_finalize();
153             }
154              
155 2204         5271 $self->_read_rest_of_email();
156              
157 2204         4900 return $self->_extract_email_and_finalize();
158             }
159              
160             #-------------------------------------------------------------------------------
161              
162             sub _read_rest_of_email
163             {
164 2264     2264   3193 my $self = shift;
165              
166 2264         2857 my $previous_backup;
167              
168             # Look for the start of the next email
169 2264         3024 while (1)
170             {
171 2416         18843 while ($self->{'READ_BUFFER'} =~
172             m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg)
173             {
174 3710         9412 $self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1);
175              
176 3710         5273 my $endline = $self->{'endline'};
177              
178             # Keep looking if the header we found is part of a "Begin Included
179             # Message".
180 3710         5033 my $end_of_string = '';
181 3710         4349 my $backup_amount = 100;
182             do
183             {
184 5390         6572 $backup_amount *= 2;
185             $backup_amount = $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'}
186             if $backup_amount >
187 5390 50       9432 $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'};
188              
189             $end_of_string = substr($self->{'READ_BUFFER'},
190 5390         18904 $self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount);
191             } while (index($end_of_string, "$endline$endline") == -1 &&
192 3710   66     4399 $backup_amount < $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'});
193              
194 3710 100       16241 next if $end_of_string =~
195             /$endline-----(?: Begin Included Message |Original Message)-----$endline[^\r\n]*(?:$endline)*$/i;
196              
197 3597 100       17874 next unless $end_of_string =~ /$endline$endline$/;
198              
199             # Found the next email!
200 1850         4030 return;
201             }
202              
203             # Didn't find next email in current buffer. Most likely we need to read some
204             # more of the mailbox. Shift the current email to the front of the buffer
205             # unless we've already done so.
206 566         1066 my $shift_amount = $self->{'START_OF_EMAIL'};
207             $self->{'READ_BUFFER'} =
208 566         2652 substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'});
209 566         1012 $self->{'START_OF_EMAIL'} -= $shift_amount;
210 566         847 $self->{'START_OF_BODY'} -= $shift_amount;
211 566         1358 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
212              
213             # Start looking at the end of the buffer, but back up some in case the
214             # edge of the newly read buffer contains the start of a new header. I
215             # believe the RFC says header lines can be at most 90 characters long.
216 566         1021 my $backup_amount = 90;
217             $backup_amount = length($self->{'READ_BUFFER'}) - 1
218 566 50       1321 if length($self->{'READ_BUFFER'}) < $backup_amount;
219              
220             # Avoid an infinite loop that can occur if the pattern is within the
221             # 90-character lookback, but doesn't indicate the start of the next email.
222             # We detect this case as one where we previously shifted the email to
223             # the start of the buffer.
224 566 100       1295 if ($shift_amount == 0) {
225 234         312 $previous_backup--;
226 234         308 $backup_amount = $previous_backup;
227             } else {
228 332         594 $previous_backup = $backup_amount;
229             }
230              
231 566 100       3368 unless ($self->_read_until_match(
232             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,$backup_amount))
233             {
234 414         805 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
235 414         703 return;
236             }
237              
238 152         331 redo;
239             }
240             }
241              
242             #-------------------------------------------------------------------------------
243              
244             sub _multipart_boundary
245             {
246 2324     2324   2933 my $self = shift;
247              
248 2324         3360 my $endline = $self->{'endline'};
249              
250 2324 100       22354 if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'},
251             $self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~
252             /^(content-type: *multipart[^\n\r]*$endline( [^\n\r]*$endline)*)/im)
253             {
254 675         1855 my $content_type_header = $1;
255 675         2599 $content_type_header =~ s/$endline//g;
256              
257 675 50 66     3899 if ($content_type_header =~ /boundary *= *"([^"]*)"/i ||
258             $content_type_header =~ /boundary *= *([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i)
259             {
260 675         2244 return $1
261             }
262             }
263              
264 1649         3891 return undef; ## no critic (ProhibitExplicitReturnUndef)
265             }
266              
267             #-------------------------------------------------------------------------------
268              
269             sub _read_email_parts
270             {
271 2264     2264   3206 my $self = shift;
272              
273 2264         3888 my $boundary = $self->_multipart_boundary();
274              
275 2264 100       6034 return 1 unless defined $boundary;
276              
277             # RFC 1521 says the boundary can be no longer than 70 characters. Back up a
278             # little more than that. Unlike _read_rest_of_email() we don't have to
279             # worry about infinite loops here since the pattern is designed to not
280             # match falsely.
281 615         1050 my $endline = $self->{'endline'};
282 615 100       7128 $self->_read_until_match(qr/^--\Q$boundary\E--$endline/m,76)
283             or return 0;
284              
285 555         1557 return 1;
286             }
287              
288             #-------------------------------------------------------------------------------
289              
290             sub _extract_email_and_finalize
291             {
292 2264     2264   3231 my $self = shift;
293              
294 2264         3837 $self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'};
295              
296             my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
297 2264         9894 $self->{'email_length'});
298              
299 2264         13243 $self->{'CURRENT_LINE_NUMBER'} += ($email =~ tr/\n//);
300 2264         3539 $self->{'CURRENT_OFFSET'} += $self->{'email_length'};
301              
302 2264         3039 $self->{'email_number'}++;
303              
304 2264         7827 $self->SUPER::read_next_email();
305              
306 2264         49449 return \$email;
307             }
308              
309             #-------------------------------------------------------------------------------
310              
311             sub _read_header
312             {
313 2264     2264   3089 my $self = shift;
314              
315 2264 50       10291 $self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/m,0)
316             or return 0;
317              
318             $self->{'START_OF_BODY'} =
319 2264         6825 pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}");
320              
321 2264         5132 return 1;
322             }
323              
324             #-------------------------------------------------------------------------------
325              
326             # The search position is at the start of the pattern when this function
327             # returns 1.
328             sub _read_until_match
329             {
330 3837     3837   5989 my $self = shift;
331 3837         5128 my $pattern = shift;
332 3837         4643 my $backup = shift;
333              
334             # Start looking at the end of the buffer, but back up some in case the edge
335             # of the newly read buffer contains part of the pattern.
336 3837 100 100     14473 if (!defined pos($self->{'READ_BUFFER'}) ||
337             pos($self->{'READ_BUFFER'}) - $backup <= 0) {
338 790         3013 pos($self->{'READ_BUFFER'}) = 0;
339             } else {
340 3047         6971 pos($self->{'READ_BUFFER'}) -= $backup;
341             }
342              
343 3837         6252 while (1)
344             {
345 4681 100       60733 if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg)
346             {
347 3363         11207 pos($self->{'READ_BUFFER'}) -= length($1);
348 3363         9913 return 1;
349             }
350              
351 1318         4204 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
352              
353 1318 100       3282 unless ($self->_read_chunk()) {
354 474         894 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
355 474         1428 return 0;
356             }
357              
358 844 100       2284 if (pos($self->{'READ_BUFFER'}) - $backup <= 0) {
359 432         1307 pos($self->{'READ_BUFFER'}) = 0;
360             } else {
361 412         1121 pos($self->{'READ_BUFFER'}) -= $backup;
362             }
363             }
364             }
365              
366             #-------------------------------------------------------------------------------
367              
368             # Maintains pos($self->{'READ_BUFFER'})
369             sub _read_chunk
370             {
371 1318     1318   1973 my $self = shift;
372              
373 1318         2169 my $search_position = pos($self->{'READ_BUFFER'});
374              
375             # Can't use sysread because it doesn't work with ungetc
376 1318 50       2715 if ($self->{'READ_CHUNK_SIZE'} == 0)
377             {
378 0         0 local $/ = undef;
379              
380 0 0       0 return 0 if eof $self->{'file_handle'};
381              
382             # < $self->{'file_handle'} > doesn't work, so we use readline
383 0         0 $self->{'READ_BUFFER'} = readline($self->{'file_handle'});
384 0         0 pos($self->{'READ_BUFFER'}) = $search_position;
385 0         0 return 1;
386             }
387             else
388             {
389 1318         1787 my $total_amount_read = 0;
390 1318         1940 my $amount_read = 0;
391              
392 1318         3541 while ($total_amount_read < $self->{'READ_CHUNK_SIZE'})
393             {
394             $amount_read = read($self->{'file_handle'}, $self->{'READ_BUFFER'},
395             $self->{'READ_CHUNK_SIZE'} - $total_amount_read,
396 1736         6652 length($self->{'READ_BUFFER'}));
397              
398 1736         60316534 pos($self->{'READ_BUFFER'}) = $search_position;
399              
400 1736 100       4919 if ($amount_read == 0)
401             {
402 892 100       2732 return 1 unless $total_amount_read == 0;
403              
404 474         1290 return 0;
405             }
406              
407 844         2101 $total_amount_read += $amount_read;
408             }
409              
410 426         1132 return 1;
411             }
412             }
413              
414             #-------------------------------------------------------------------------------
415              
416             1;
417              
418             __END__