File Coverage

blib/lib/Mail/Mbox/MessageParser/Perl.pm
Criterion Covered Total %
statement 150 156 96.1
branch 41 50 82.0
condition 10 12 83.3
subroutine 18 18 100.0
pod 4 4 100.0
total 223 240 92.9


line stmt bran cond sub pod time code
1             package Mail::Mbox::MessageParser::Perl;
2              
3 42     42   2552279 use strict;
  42         101  
  42         1109  
4 42     42   168 use Carp;
  42         206  
  42         1971  
5              
6 42     42   631 use Mail::Mbox::MessageParser;
  42         58  
  42         1247  
7 42     42   300 use Mail::Mbox::MessageParser::Config;
  42         60  
  42         1157  
8              
9 42     42   298 use vars qw( $VERSION $_DEBUG @ISA );
  42         78  
  42         68132  
10              
11             @ISA = qw( Exporter Mail::Mbox::MessageParser );
12              
13             $VERSION = sprintf "%d.%02d%02d", q/1.60.5/ =~ /(\d+)/g;
14              
15             *_DEBUG = \$Mail::Mbox::MessageParser::_DEBUG;
16             *_dprint = \&Mail::Mbox::MessageParser::_dprint;
17             sub _dprint;
18              
19             #-------------------------------------------------------------------------------
20              
21             sub new
22             {
23 479     479 1 1356 my ($proto, $self) = @_;
24              
25 479 50       1111 carp "Need file_handle option" unless defined $self->{'file_handle'};
26              
27 479         1702 bless ($self, __PACKAGE__);
28              
29 479         2242 $self->_init();
30              
31 479         974 return $self;
32             }
33              
34             #-------------------------------------------------------------------------------
35              
36             sub _init
37             {
38 479     479   829 my $self = shift;
39              
40 479         2473 $self->{'CURRENT_LINE_NUMBER'} = 1;
41 479         1871 $self->{'CURRENT_OFFSET'} = 0;
42              
43 479         1973 $self->{'READ_BUFFER'} = '';
44 479         949 $self->{'START_OF_EMAIL'} = 0;
45 479         683 $self->{'END_OF_EMAIL'} = 0;
46              
47             $self->{'READ_CHUNK_SIZE'} =
48 479         845 $Mail::Mbox::MessageParser::Config{'read_chunk_size'};
49              
50 479         1681 $self->SUPER::_init();
51             }
52              
53             #-------------------------------------------------------------------------------
54              
55             sub reset
56             {
57 44     44 1 114 my $self = shift;
58              
59 44         110 $self->{'CURRENT_LINE_NUMBER'} = ($self->{'prologue'} =~ tr/\n//) + 1;
60 44         76 $self->{'CURRENT_OFFSET'} = length($self->{'prologue'});
61              
62 44         82 $self->{'READ_BUFFER'} = '';
63 44         69 $self->{'START_OF_EMAIL'} = 0;
64 44         81 $self->{'END_OF_EMAIL'} = 0;
65              
66 44         114 $self->SUPER::reset();
67             }
68              
69             #-------------------------------------------------------------------------------
70              
71             sub end_of_file
72             {
73 7101     7101 1 30130 my $self = shift;
74              
75             # Reset eof in case the file was appended to. Hopefully this works all the
76             # time. See perldoc -f seek for details.
77 7101 100       18020 seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'};
78              
79             return eof $self->{'file_handle'} &&
80 7101   100     547550 $self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'});
81             }
82              
83             #-------------------------------------------------------------------------------
84              
85             sub _read_prologue
86             {
87 479     479   851 my $self = shift;
88              
89 479         1192 _dprint "Reading mailbox prologue using Perl";
90              
91 479         6260 $self->_read_until_match(
92             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,0);
93              
94 479         1865 my $start_of_email = pos($self->{'READ_BUFFER'});
95 479         2872 $self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email);
96              
97             # Set up for read_next_email
98 479         1302 $self->{'CURRENT_LINE_NUMBER'} += ($self->{'prologue'} =~ tr/\n//);
99 479         878 $self->{'CURRENT_OFFSET'} = $start_of_email;
100 479         1459 $self->{'END_OF_EMAIL'} = $start_of_email;
101             }
102              
103             #-------------------------------------------------------------------------------
104              
105             sub read_next_email
106             {
107 2605     2605 1 41304 my $self = shift;
108              
109 2605 100       4394 return undef if $self->end_of_file(); ## no critic (ProhibitExplicitReturnUndef)
110              
111 2581         93293 $self->{'email_line_number'} = $self->{'CURRENT_LINE_NUMBER'};
112 2581         3964 $self->{'email_offset'} = $self->{'CURRENT_OFFSET'};
113              
114 2581         3523 $self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'};
115              
116             # Slurp in an entire multipart email (but continue looking for the next
117             # header so that we can get any following newlines as well)
118 2581 50       5687 unless ($self->_read_header())
119             {
120 0         0 return $self->_extract_email_and_finalize();
121             }
122              
123 2581 100       6063 unless ($self->_read_email_parts())
124             {
125             # Could issue a warning here, but I'm not sure how to do this cleanly for
126             # a work-only module like this. Maybe something like CGI's cgi_error()?
127 60         139 _dprint "Inconsistent multi-part message. Could not find ending for " .
128             "boundary \"" . $self->_multipart_boundary() . "\"";
129              
130             # Try to read the content length and use that
131             my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
132 60         221 $self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'});
133              
134             my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD(
135 60         176 \$email_header, 'Content-Length:', $self->{'endline'});
136              
137 60 100       146 if (defined $content_length)
138             {
139 30         139 $content_length =~ s/Content-Length: *(\d+).*/$1/i;
140 30         119 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length;
141             }
142             # Otherwise use the start of the body
143             else
144             {
145 30         73 pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'};
146             }
147              
148             # Reset the search and look for the start of the
149             # next email.
150 60         187 $self->_read_rest_of_email();
151              
152 60         137 return $self->_extract_email_and_finalize();
153             }
154              
155 2521         6325 $self->_read_rest_of_email();
156              
157 2521         5381 return $self->_extract_email_and_finalize();
158             }
159              
160             #-------------------------------------------------------------------------------
161              
162             sub _read_rest_of_email
163             {
164 2581     2581   3522 my $self = shift;
165              
166 2581         3273 my $previous_backup;
167              
168             # Look for the start of the next email
169 2581         3489 while (1)
170             {
171 2733         21400 while ($self->{'READ_BUFFER'} =~
172             m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg)
173             {
174 3986         10059 $self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1);
175              
176 3986         5791 my $endline = $self->{'endline'};
177              
178             # Keep looking if the header we found is part of a "Begin Included
179             # Message".
180 3986         5729 my $end_of_string = '';
181 3986         5283 my $backup_amount = 100;
182             do
183             {
184 5666         7364 $backup_amount *= 2;
185             $backup_amount = $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'}
186             if $backup_amount >
187 5666 50       10099 $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'};
188              
189             $end_of_string = substr($self->{'READ_BUFFER'},
190 5666         20453 $self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount);
191             } while (index($end_of_string, "$endline$endline") == -1 &&
192 3986   66     4921 $backup_amount < $self->{'END_OF_EMAIL'} - $self->{'START_OF_EMAIL'});
193              
194 3986 100       18615 next if $end_of_string =~
195             /$endline-----(?: Begin Included Message |Original Message)-----$endline[^\r\n]*(?:$endline)*$/i;
196              
197 3825 100       18382 next unless $end_of_string =~ /$endline$endline$/;
198              
199             # Found the next email!
200 2078         4397 return;
201             }
202              
203             # Didn't find next email in current buffer. Most likely we need to read some
204             # more of the mailbox. Shift the current email to the front of the buffer
205             # unless we've already done so.
206 655         1411 my $shift_amount = $self->{'START_OF_EMAIL'};
207             $self->{'READ_BUFFER'} =
208 655         3124 substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'});
209 655         1197 $self->{'START_OF_EMAIL'} -= $shift_amount;
210 655         1070 $self->{'START_OF_BODY'} -= $shift_amount;
211 655         1632 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
212              
213             # Start looking at the end of the buffer, but back up some in case the
214             # edge of the newly read buffer contains the start of a new header. I
215             # believe the RFC says header lines can be at most 90 characters long.
216 655         1325 my $backup_amount = 90;
217             $backup_amount = length($self->{'READ_BUFFER'}) - 1
218 655 50       1507 if length($self->{'READ_BUFFER'}) < $backup_amount;
219              
220             # Avoid an infinite loop that can occur if the pattern is within the
221             # 90-character lookback, but doesn't indicate the start of the next email.
222             # We detect this case as one where we previously shifted the email to
223             # the start of the buffer.
224 655 100       1420 if ($shift_amount == 0) {
225 234         322 $previous_backup--;
226 234         308 $backup_amount = $previous_backup;
227             } else {
228 421         686 $previous_backup = $backup_amount;
229             }
230              
231 655 100       3437 unless ($self->_read_until_match(
232             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,$backup_amount))
233             {
234 503         1059 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
235 503         841 return;
236             }
237              
238 152         330 redo;
239             }
240             }
241              
242             #-------------------------------------------------------------------------------
243              
244             sub _multipart_boundary
245             {
246 2641     2641   3471 my $self = shift;
247              
248 2641         3996 my $endline = $self->{'endline'};
249              
250 2641 100       26119 if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'},
251             $self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~
252             /^(content-type: *multipart[^\n\r]*$endline( [^\n\r]*$endline)*)/im)
253             {
254 675         1797 my $content_type_header = $1;
255 675         2490 $content_type_header =~ s/$endline//g;
256              
257 675 50 66     3781 if ($content_type_header =~ /boundary *= *"([^"]*)"/i ||
258             $content_type_header =~ /boundary *= *([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i)
259             {
260 675         2215 return $1
261             }
262             }
263              
264 1966         4584 return undef; ## no critic (ProhibitExplicitReturnUndef)
265             }
266              
267             #-------------------------------------------------------------------------------
268              
269             sub _read_email_parts
270             {
271 2581     2581   3755 my $self = shift;
272              
273 2581         4734 my $boundary = $self->_multipart_boundary();
274              
275 2581 100       6723 return 1 unless defined $boundary;
276              
277             # RFC 1521 says the boundary can be no longer than 70 characters. Back up a
278             # little more than that. Unlike _read_rest_of_email() we don't have to
279             # worry about infinite loops here since the pattern is designed to not
280             # match falsely.
281 615         937 my $endline = $self->{'endline'};
282 615 100       6865 $self->_read_until_match(qr/^--\Q$boundary\E--$endline/m,76)
283             or return 0;
284              
285 555         1561 return 1;
286             }
287              
288             #-------------------------------------------------------------------------------
289              
290             sub _extract_email_and_finalize
291             {
292 2581     2581   3821 my $self = shift;
293              
294 2581         4359 $self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'};
295              
296             my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
297 2581         10778 $self->{'email_length'});
298              
299 2581         14592 $self->{'CURRENT_LINE_NUMBER'} += ($email =~ tr/\n//);
300 2581         3980 $self->{'CURRENT_OFFSET'} += $self->{'email_length'};
301              
302 2581         3595 $self->{'email_number'}++;
303              
304 2581         9095 $self->SUPER::read_next_email();
305              
306 2581         53900 return \$email;
307             }
308              
309             #-------------------------------------------------------------------------------
310              
311             sub _read_header
312             {
313 2581     2581   3628 my $self = shift;
314              
315 2581 50       12394 $self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/m,0)
316             or return 0;
317              
318             $self->{'START_OF_BODY'} =
319 2581         7830 pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}");
320              
321 2581         5563 return 1;
322             }
323              
324             #-------------------------------------------------------------------------------
325              
326             # The search position is at the start of the pattern when this function
327             # returns 1.
328             sub _read_until_match
329             {
330 4330     4330   6872 my $self = shift;
331 4330         5391 my $pattern = shift;
332 4330         5323 my $backup = shift;
333              
334             # Start looking at the end of the buffer, but back up some in case the edge
335             # of the newly read buffer contains part of the pattern.
336 4330 100 100     16325 if (!defined pos($self->{'READ_BUFFER'}) ||
337             pos($self->{'READ_BUFFER'}) - $backup <= 0) {
338 968         3832 pos($self->{'READ_BUFFER'}) = 0;
339             } else {
340 3362         7407 pos($self->{'READ_BUFFER'}) -= $backup;
341             }
342              
343 4330         7347 while (1)
344             {
345 5265 100       73377 if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg)
346             {
347 3767         12938 pos($self->{'READ_BUFFER'}) -= length($1);
348 3767         10911 return 1;
349             }
350              
351 1498         4806 pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
352              
353 1498 100       5014 unless ($self->_read_chunk()) {
354 563         1171 $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
355 563         1811 return 0;
356             }
357              
358 935 100       2581 if (pos($self->{'READ_BUFFER'}) - $backup <= 0) {
359 523         2040 pos($self->{'READ_BUFFER'}) = 0;
360             } else {
361 412         1038 pos($self->{'READ_BUFFER'}) -= $backup;
362             }
363             }
364             }
365              
366             #-------------------------------------------------------------------------------
367              
368             # Maintains pos($self->{'READ_BUFFER'})
369             sub _read_chunk
370             {
371 1498     1498   2416 my $self = shift;
372              
373 1498         2443 my $search_position = pos($self->{'READ_BUFFER'});
374              
375             # Can't use sysread because it doesn't work with ungetc
376 1498 50       3184 if ($self->{'READ_CHUNK_SIZE'} == 0)
377             {
378 0         0 local $/ = undef;
379              
380 0 0       0 return 0 if eof $self->{'file_handle'};
381              
382             # < $self->{'file_handle'} > doesn't work, so we use readline
383 0         0 $self->{'READ_BUFFER'} = readline($self->{'file_handle'});
384 0         0 pos($self->{'READ_BUFFER'}) = $search_position;
385 0         0 return 1;
386             }
387             else
388             {
389 1498         2072 my $total_amount_read = 0;
390 1498         2008 my $amount_read = 0;
391              
392 1498         3198 while ($total_amount_read < $self->{'READ_CHUNK_SIZE'})
393             {
394             $amount_read = read($self->{'file_handle'}, $self->{'READ_BUFFER'},
395             $self->{'READ_CHUNK_SIZE'} - $total_amount_read,
396 2007         8264 length($self->{'READ_BUFFER'}));
397              
398 2007         87235838 pos($self->{'READ_BUFFER'}) = $search_position;
399              
400 2007 100       5734 if ($amount_read == 0)
401             {
402 1072 100       3557 return 1 unless $total_amount_read == 0;
403              
404 563         1530 return 0;
405             }
406              
407 935         2498 $total_amount_read += $amount_read;
408             }
409              
410 426         1128 return 1;
411             }
412             }
413              
414             #-------------------------------------------------------------------------------
415              
416             1;
417              
418             __END__