File Coverage

blib/lib/Mail/Mbox/MessageParser/Perl.pm
Criterion Covered Total %
statement 150 156 96.1
branch 41 50 82.0
condition 10 12 83.3
subroutine 18 18 100.0
pod 4 4 100.0
total 223 240 92.9


line stmt bran cond sub pod time code
1             package Mail::Mbox::MessageParser::Perl;
2              
3 38     38   2711473 use strict;
  38         68  
  38         978  
4 38     38   159 use Carp;
  38         71  
  38         1742  
5              
6 38     38   566 use Mail::Mbox::MessageParser;
  38         65  
  38         1129  
7 38     38   293 use Mail::Mbox::MessageParser::Config;
  38         56  
  38         756  
8              
9 38     38   280 use vars qw( $VERSION $_DEBUG @ISA );
  38         74  
  38         60039  
10              
11             @ISA = qw( Exporter Mail::Mbox::MessageParser );
12              
13             $VERSION = sprintf "%d.%02d%02d", q/1.60.5/ =~ /(\d+)/g;
14              
15             *_DEBUG = \$Mail::Mbox::MessageParser::_DEBUG;
16             *_dprint = \&Mail::Mbox::MessageParser::_dprint;
17             sub _dprint;
18              
19             #-------------------------------------------------------------------------------
20              
21             sub new
22             {
23 392     392 1 1183 my ($proto, $self) = @_;
24              
25 392 50       1087 carp "Need file_handle option" unless defined $self->{'file_handle'};
26              
27 392         1360 bless ($self, __PACKAGE__);
28              
29 392         1829 $self->_init();
30              
31 392         870 return $self;
32             }
33              
34             #-------------------------------------------------------------------------------
35              
36             sub _init
37             {
38 392     392   765 my $self = shift;
39              
40 392         1619 $self->{'CURRENT_LINE_NUMBER'} = 1;
41 392         1359 $self->{'CURRENT_OFFSET'} = 0;
42              
43 392         1695 $self->{'READ_BUFFER'} = '';
44 392         837 $self->{'START_OF_EMAIL'} = 0;
45 392         742 $self->{'END_OF_EMAIL'} = 0;
46              
47             $self->{'READ_CHUNK_SIZE'} =
48 392         1358 $Mail::Mbox::MessageParser::Config{'read_chunk_size'};
49              
50 392         1661 $self->SUPER::_init();
51             }
52              
53             #-------------------------------------------------------------------------------
54              
55             sub reset
56             {
57 40     40 1 128 my $self = shift;
58              
59 40         97 $self->{'CURRENT_LINE_NUMBER'} = ($self->{'prologue'} =~ tr/\n//) + 1;
60 40         81 $self->{'CURRENT_OFFSET'} = length($self->{'prologue'});
61              
62 40         90 $self->{'READ_BUFFER'} = '';
63 40         73 $self->{'START_OF_EMAIL'} = 0;
64 40         66 $self->{'END_OF_EMAIL'} = 0;
65              
66 40         128 $self->SUPER::reset();
67             }
68              
69             #-------------------------------------------------------------------------------
70              
71             sub end_of_file
72             {
73 6169     6169 1 26230 my $self = shift;
74              
75             # Reset eof in case the file was appended to. Hopefully this works all the
76             # time. See perldoc -f seek for details.
77 6169 100       18149 seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'};
78              
79             return eof $self->{'file_handle'} &&
80 6169   100     461992 $self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'});
81             }
82              
83             #-------------------------------------------------------------------------------
84              
85             sub _read_prologue
86             {
87 392     392   753 my $self = shift;
88              
89 392         1179 _dprint "Reading mailbox prologue using Perl";
90              
91 392         5951 $self->_read_until_match(
92             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,0);
93              
94 392         1464 my $start_of_email = pos($self->{'READ_BUFFER'});
95 392         1948 $self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email);
96              
97             # Set up for read_next_email
98 392         1119 $self->{'CURRENT_LINE_NUMBER'} += ($self->{'prologue'} =~ tr/\n//);
99 392         910 $self->{'CURRENT_OFFSET'} = $start_of_email;
100 392         1237 $self->{'END_OF_EMAIL'} = $start_of_email;
101             }
102              
103             #-------------------------------------------------------------------------------
104              
105             sub read_next_email
106             {
107 2286     2286 1 36589 my $self = shift;
108              
109 2286 100       4068 return undef if $self->end_of_file(); ## no critic (ProhibitExplicitReturnUndef)
110              
111 2264         94126 $self->{'email_line_number'} = $self->{'CURRENT_LINE_NUMBER'};
112 2264         3835 $self->{'email_offset'} = $self->{'CURRENT_OFFSET'};
113              
114 2264         3365 $self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'};
115              
116             # Slurp in an entire multipart email (but continue looking for the next
117             # header so that we can get any following newlines as well)
118 2264 50       4694 unless ($self->_read_header())
119             {
120 0         0 return $self->_extract_email_and_finalize();
121             }
122              
123 2264 100       5444 unless ($self->_read_email_parts())
124             {
125             # Could issue a warning here, but I'm not sure how to do this cleanly for
126             # a work-only module like this. Maybe something like CGI's cgi_error()?
127 60         168 _dprint "Inconsistent multi-part message. Could not find ending for " .
128             "boundary \"" . $self->_multipart_boundary() . "\"";
129              
130             # Try to read the content length and use that
131             my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
132 60         279 $self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'});
133              
134             my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD(
135 60         280 \$email_header, 'Content-Length:', $self->{'endline'});
136              
137 60 100       180 if (defined $content_length)
138             {
139 30         178 $content_length =~ s/Content-Length: *(\d+).*/$1/i;
140 30         155 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length;
141             }
142             # Otherwise use the start of the body
143             else
144             {
145 30         103 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'};
146             }
147              
148             # Reset the search and look for the start of the
149             # next email.
150 60         238 $self->_read_rest_of_email();
151              
152 60         166 return $self->_extract_email_and_finalize();
153             }
154              
155 2204         5850 $self->_read_rest_of_email();
156              
157 2204         5475 return $self->_extract_email_and_finalize();
158             }
159              
160             #-------------------------------------------------------------------------------
161              
162             sub _read_rest_of_email
163             {
164 2264     2264   3320 my $self = shift;
165              
166 2264         3105 my $previous_backup;
167              
168             # Look for the start of the next email
169 2264         3304 while (1)
170             {
171 2416         20394 while ($self->{'READ_BUFFER'} =~
172             m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg)
173             {
174 3710         10329 $self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1);
175              
176 3710         6215 my $endline = $self->{'endline'};
177              
178             # Keep looking if the header we found is part of a "Begin Included
179             # Message".
180 3710         5465 my $end_of_string = '';
181 3710         5169 my $backup_amount = 100;
182             do
183             {
184 5390         7381 $backup_amount *= 2;
185             $backup_amount = $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'}
186             if $backup_amount >
187 5390 50       10355 $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'};
188              
189             $end_of_string = substr($self->{'READ_BUFFER'},
190 5390         20857 $self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount);
191             } while (index($end_of_string, "$endline$endline") == -1 &&
192 3710   66     4813 $backup_amount < $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'});
193              
194 3710 100       17621 next if $end_of_string =~
195             /$endline-----(?: Begin Included Message |Original Message)-----$endline[^\r\n]*(?:$endline)*$/i;
196              
197 3597 100       19574 next unless $end_of_string =~ /$endline$endline$/;
198              
199             # Found the next email!
200 1850         4514 return;
201             }
202              
203             # Didn't find next email in current buffer. Most likely we need to read some
204             # more of the mailbox. Shift the current email to the front of the buffer
205             # unless we've already done so.
206 566         1107 my $shift_amount = $self->{'START_OF_EMAIL'};
207             $self->{'READ_BUFFER'} =
208 566         2830 substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'});
209 566         1039 $self->{'START_OF_EMAIL'} -= $shift_amount;
210 566         942 $self->{'START_OF_BODY'} -= $shift_amount;
211 566         1485 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
212              
213             # Start looking at the end of the buffer, but back up some in case the
214             # edge of the newly read buffer contains the start of a new header. I
215             # believe the RFC says header lines can be at most 90 characters long.
216 566         1202 my $backup_amount = 90;
217             $backup_amount = length($self->{'READ_BUFFER'}) - 1
218 566 50       1489 if length($self->{'READ_BUFFER'}) < $backup_amount;
219              
220             # Avoid an infinite loop that can occur if the pattern is within the
221             # 90-character lookback, but doesn't indicate the start of the next email.
222             # We detect this case as one where we previously shifted the email to
223             # the start of the buffer.
224 566 100       1360 if ($shift_amount == 0) {
225 234         336 $previous_backup--;
226 234         323 $backup_amount = $previous_backup;
227             } else {
228 332         631 $previous_backup = $backup_amount;
229             }
230              
231 566 100       3450 unless ($self->_read_until_match(
232             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,$backup_amount))
233             {
234 414         979 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
235 414         769 return;
236             }
237              
238 152         363 redo;
239             }
240             }
241              
242             #-------------------------------------------------------------------------------
243              
244             sub _multipart_boundary
245             {
246 2324     2324   3270 my $self = shift;
247              
248 2324         3911 my $endline = $self->{'endline'};
249              
250 2324 100       24908 if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'},
251             $self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~
252             /^(content-type: *multipart[^\n\r]*$endline( [^\n\r]*$endline)*)/im)
253             {
254 675         2035 my $content_type_header = $1;
255 675         2945 $content_type_header =~ s/$endline//g;
256              
257 675 50 66     4445 if ($content_type_header =~ /boundary *= *"([^"]*)"/i ||
258             $content_type_header =~ /boundary *= *([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i)
259             {
260 675         2637 return $1
261             }
262             }
263              
264 1649         4155 return undef; ## no critic (ProhibitExplicitReturnUndef)
265             }
266              
267             #-------------------------------------------------------------------------------
268              
269             sub _read_email_parts
270             {
271 2264     2264   3471 my $self = shift;
272              
273 2264         4417 my $boundary = $self->_multipart_boundary();
274              
275 2264 100       6293 return 1 unless defined $boundary;
276              
277             # RFC 1521 says the boundary can be no longer than 70 characters. Back up a
278             # little more than that. Unlike _read_rest_of_email() we don't have to
279             # worry about infinite loops here since the pattern is designed to not
280             # match falsely.
281 615         1110 my $endline = $self->{'endline'};
282 615 100       8057 $self->_read_until_match(qr/^--\Q$boundary\E--$endline/m,76)
283             or return 0;
284              
285 555         1755 return 1;
286             }
287              
288             #-------------------------------------------------------------------------------
289              
290             sub _extract_email_and_finalize
291             {
292 2264     2264   3666 my $self = shift;
293              
294 2264         4234 $self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'};
295              
296             my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
297 2264         9882 $self->{'email_length'});
298              
299 2264         14563 $self->{'CURRENT_LINE_NUMBER'} += ($email =~ tr/\n//);
300 2264         4060 $self->{'CURRENT_OFFSET'} += $self->{'email_length'};
301              
302 2264         3237 $self->{'email_number'}++;
303              
304 2264         8176 $self->SUPER::read_next_email();
305              
306 2264         53263 return \$email;
307             }
308              
309             #-------------------------------------------------------------------------------
310              
311             sub _read_header
312             {
313 2264     2264   3468 my $self = shift;
314              
315 2264 50       11535 $self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/m,0)
316             or return 0;
317              
318             $self->{'START_OF_BODY'} =
319 2264         7392 pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}");
320              
321 2264         5560 return 1;
322             }
323              
324             #-------------------------------------------------------------------------------
325              
326             # The search position is at the start of the pattern when this function
327             # returns 1.
328             sub _read_until_match
329             {
330 3837     3837   6772 my $self = shift;
331 3837         5085 my $pattern = shift;
332 3837         5091 my $backup = shift;
333              
334             # Start looking at the end of the buffer, but back up some in case the edge
335             # of the newly read buffer contains part of the pattern.
336 3837 100 100     15609 if (!defined pos($self->{'READ_BUFFER'}) ||
337             pos($self->{'READ_BUFFER'}) - $backup <= 0) {
338 790         3091 pos($self->{'READ_BUFFER'}) = 0;
339             } else {
340 3047         7911 pos($self->{'READ_BUFFER'}) -= $backup;
341             }
342              
343 3837         6950 while (1)
344             {
345 4681 100       63735 if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg)
346             {
347 3363         11981 pos($self->{'READ_BUFFER'}) -= length($1);
348 3363         11198 return 1;
349             }
350              
351 1318         4464 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
352              
353 1318 100       3731 unless ($self->_read_chunk()) {
354 474         964 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
355 474         1749 return 0;
356             }
357              
358 844 100       2619 if (pos($self->{'READ_BUFFER'}) - $backup <= 0) {
359 432         1552 pos($self->{'READ_BUFFER'}) = 0;
360             } else {
361 412         1191 pos($self->{'READ_BUFFER'}) -= $backup;
362             }
363             }
364             }
365              
366             #-------------------------------------------------------------------------------
367              
368             # Maintains pos($self->{'READ_BUFFER'})
369             sub _read_chunk
370             {
371 1318     1318   2148 my $self = shift;
372              
373 1318         2335 my $search_position = pos($self->{'READ_BUFFER'});
374              
375             # Can't use sysread because it doesn't work with ungetc
376 1318 50       3282 if ($self->{'READ_CHUNK_SIZE'} == 0)
377             {
378 0         0 local $/ = undef;
379              
380 0 0       0 return 0 if eof $self->{'file_handle'};
381              
382             # < $self->{'file_handle'} > doesn't work, so we use readline
383 0         0 $self->{'READ_BUFFER'} = readline($self->{'file_handle'});
384 0         0 pos($self->{'READ_BUFFER'}) = $search_position;
385 0         0 return 1;
386             }
387             else
388             {
389 1318         2080 my $total_amount_read = 0;
390 1318         1867 my $amount_read = 0;
391              
392 1318         3013 while ($total_amount_read < $self->{'READ_CHUNK_SIZE'})
393             {
394             $amount_read = read($self->{'file_handle'}, $self->{'READ_BUFFER'},
395             $self->{'READ_CHUNK_SIZE'} - $total_amount_read,
396 1736         7388 length($self->{'READ_BUFFER'}));
397              
398 1736         58742748 pos($self->{'READ_BUFFER'}) = $search_position;
399              
400 1736 100       4998 if ($amount_read == 0)
401             {
402 892 100       2939 return 1 unless $total_amount_read == 0;
403              
404 474         1387 return 0;
405             }
406              
407 844         2277 $total_amount_read += $amount_read;
408             }
409              
410 426         1290 return 1;
411             }
412             }
413              
414             #-------------------------------------------------------------------------------
415              
416             1;
417              
418             __END__