File Coverage

blib/lib/Email/Folder/Mbox.pm
Criterion Covered Total %
statement 129 141 91.4
branch 42 64 65.6
condition 15 21 71.4
subroutine 16 17 94.1
pod 3 5 60.0
total 205 248 82.6


line stmt bran cond sub pod time code
1 1     1   5 use strict;
  1         2  
  1         42  
2 1     1   6 use warnings;
  1         2  
  1         59  
3             package Email::Folder::Mbox;
4             # ABSTRACT: reads raw RFC822 mails from an mbox file
5             $Email::Folder::Mbox::VERSION = '0.859';
6 1     1   6 use Carp;
  1         1  
  1         484  
7 1     1   2041 use IO::File;
  1         13629  
  1         160  
8 1     1   644 use Email::Folder::Reader;
  1         2  
  1         24  
9 1     1   691 use parent 'Email::Folder::Reader';
  1         243  
  1         4  
10              
11             #pod =head1 SYNOPSIS
12             #pod
13             #pod This isa Email::Folder::Reader - read about its API there.
14             #pod
15             #pod =head1 DESCRIPTION
16             #pod
17             #pod Does exactly what it says on the tin - fetches raw RFC822 mails from an
18             #pod mbox.
19             #pod
20             #pod The mbox format is described at http://www.qmail.org/man/man5/mbox.html
21             #pod
22             #pod We attempt to read an mbox as through it's the mboxcl2 variant,
23             #pod falling back to regular mbox mode if there is no C
24             #pod header to be found.
25             #pod
26             #pod =head2 OPTIONS
27             #pod
28             #pod The new constructor takes extra options.
29             #pod
30             #pod =over
31             #pod
32             #pod =item C
33             #pod
34             #pod When filename is set to C<"FH"> than Email::Folder::Mbox will read mbox
35             #pod archive from filehandle C instead from disk file C.
36             #pod
37             #pod =item C
38             #pod
39             #pod This indicates what the line-ending style is to be. The default is
40             #pod C<"\n">, but for handling files with mac line-endings you would want
41             #pod to specify C "\x0d">
42             #pod
43             #pod =item C
44             #pod
45             #pod The value is taken as a boolean that governs what is used match as a
46             #pod message separator.
47             #pod
48             #pod If false we use the mutt style
49             #pod
50             #pod /^From \S+\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/
51             #pod /^From (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/;
52             #pod
53             #pod If true we use
54             #pod
55             #pod /^From /
56             #pod
57             #pod In deference to this extract from L
58             #pod
59             #pod Essentially the only safe way to parse that file format is to
60             #pod consider all lines which begin with the characters ``From ''
61             #pod (From-space), which are preceded by a blank line or
62             #pod beginning-of-file, to be the division between messages. That is, the
63             #pod delimiter is "\n\nFrom .*\n" except for the very first message in the
64             #pod file, where it is "^From .*\n".
65             #pod
66             #pod Some people will tell you that you should do stricter parsing on
67             #pod those lines: check for user names and dates and so on. They are
68             #pod wrong. The random crap that has traditionally been dumped into that
69             #pod line is without bound; comparing the first five characters is the
70             #pod only safe and portable thing to do. Usually, but not always, the next
71             #pod token on the line after ``From '' will be a user-id, or email
72             #pod address, or UUCP path, and usually the next thing on the line will be
73             #pod a date specification, in some format, and usually there's nothing
74             #pod after that. But you can't rely on any of this.
75             #pod
76             #pod Defaults to false.
77             #pod
78             #pod =item C
79             #pod
80             #pod This boolean value indicates whenever lines which starts with
81             #pod
82             #pod /^>+From /
83             #pod
84             #pod should be unescaped (= removed leading '>' char). This is needed for
85             #pod mboxrd and mboxcl variants. But there is no way to detect for used mbox
86             #pod variant, so default value is false.
87             #pod
88             #pod =item C
89             #pod
90             #pod Seek to an offset when opening the mbox. When used in combination with
91             #pod ->tell you may be able to resume reading, with a trailing wind.
92             #pod
93             #pod =item
94             #pod
95             #pod This returns next message as string
96             #pod
97             #pod =item
98             #pod
99             #pod This returns next message as ref to string
100             #pod
101             #pod =item C
102             #pod
103             #pod This returns the current filehandle position in the mbox.
104             #pod
105             #pod =item C
106             #pod
107             #pod This returns the From_ line for next message. Call it before ->next_message.
108             #pod
109             #pod =back
110             #pod
111             #pod =cut
112              
113             sub defaults {
114 9     9 0 83 ( eol => "\n")
115             }
116              
117             sub _open_it {
118 9     9   16 my $self = shift;
119 9         20 my $file = $self->{_file};
120 9         19 my $fh = $self->{fh};
121              
122 9 50 33     31 unless ($file eq "FH" and $fh) {
123             # sanity checking
124 9 50       198 croak "$file does not exist" unless (-e $file);
125 9 50       130 croak "$file is not a file" unless (-f $file);
126              
127 9         50 local $/ = $self->{eol};
128 9         28 $fh = $self->_get_fh($file);
129             }
130              
131 9 50       71 if (seek $fh, tell($fh), 0) {
132             # Enable using seek only if $fh is seekable
133 9         24 $self->{seekable} = 1;
134             } else {
135             # Otherwise use cache for simulating backward seeks
136 0         0 $self->{cache} = [];
137             }
138              
139 9 100       22 if ($self->{seek_to}) {
140 1 50       6 unless ($self->{seekable}) {
141 0         0 croak "$file is not seekable but seek_to was set";
142             }
143             # we were told to seek. hope it all goes well
144 1         6 seek $fh, $self->{seek_to}, 0;
145             }
146             else {
147 8         29 local $/ = $self->{eol};
148 8         169 my $firstline = <$fh>;
149 8 100       20 if ($firstline) {
150 7 50       34 croak "$file is not an mbox file" unless $firstline =~ /^From /;
151             }
152 8         36 $self->{from} = $firstline;
153             }
154              
155 9         49 $self->{_fh} = $fh;
156             }
157              
158             sub _get_fh {
159 9     9   17 my $self = shift;
160 9         20 my $file = shift;
161 9 50       74 my $fh = IO::File->new($file) or croak "Cannot open $file";
162 9         797 binmode($fh);
163 9         41 return $fh;
164             }
165              
166             sub _read_nextline {
167 2884     2884   3020 my $self = shift;
168 2884 50 33     5618 if (not $self->{seekable} and @{$self->{cache}}) {
  0         0  
169 0         0 return shift @{$self->{cache}};
  0         0  
170             }
171 2884         2962 my $fh = $self->{_fh};
172 2884         12609 return <$fh>;
173             }
174              
175 1     1   484 use constant debug => 0;
  1         2  
  1         974  
176             my $count;
177              
178             sub next_from {
179 0     0 1 0 my $self = shift;
180 0 0       0 $self->_open_it unless $self->{_fh};
181 0         0 return $self->{from};
182             }
183              
184             sub next_messageref {
185 47     47 0 65 my $self = shift;
186              
187 47   66     142 my $fh = $self->{_fh} || $self->_open_it;
188 47         193 local $/ = $self->{eol};
189              
190 47         59 my $mail = '';
191 47         52 my $prev = '';
192 47         51 my $last;
193 47         55 my $inheaders = 1;
194 47         48 ++$count;
195 47         38 print "$count starting scanning at line $.\n" if debug;
196              
197 47         92 while (my $line = _read_nextline($self)) {
198 2310 100 100     5581 if ($line eq $/ && $inheaders) { # end of headers
199 40         39 print "$count end of headers at line $.\n" if debug;
200 40         49 $inheaders = 0; # stop looking for the end of headers
201 40         50 my $pos; # where to go back to if it goes wrong
202 40 50       113 $pos = tell $fh if $self->{seekable};
203              
204             # look for a content length header, and try to use that
205 40 100       397 if ($mail =~ m/^Content-Length:\s*(\d+)$/mi) {
206 18         20 my @cache;
207 18         65 $mail .= $prev;
208 18         21 $prev = '';
209 18         33 my $length = $1;
210 18         26 print " Content-Length: $length\n" if debug;
211 18         25 my $read = '';
212 18         31 while (my $bodyline = _read_nextline($self)) {
213 506 50       898 push @cache, $bodyline unless $self->{seekable};
214 506 100       812 last if length $read >= $length;
215             # unescape From_
216 488 50       878 $bodyline =~ s/^>(>*From )/$1/ if $self->{unescape};
217 488         879 $read .= $bodyline;
218             }
219             # grab the next line (should be /^From / or undef)
220 18         29 my $next = _read_nextline($self);
221 18 100 100     100 if (!defined $next || $next =~ /^From /) {
222 17         26 $self->{from} = $next;
223 17         78 $mail .= "$/$read";
224 17         77 return \$mail;
225             }
226 1 50       6 push @cache, $next unless $self->{seekable};
227             # seek back and scan line-by-line like the header
228             # wasn't here
229 1         2 print " Content-Length assertion failed '$next'\n" if debug;
230 1 50       5 if ($self->{seekable}) {
231 1         13 seek $fh, $pos, 0;
232             }
233             else {
234 0         0 unshift @{$self->{cache}}, @cache;
  0         0  
235             }
236             }
237              
238             # much the same, but with Lines:
239 23 100       166 if ($mail =~ m/^Lines:\s*(\d+)$/mi) {
240 2         5 my @cache;
241 2         6 $mail .= $prev;
242 2         5 $prev = '';
243 2         5 my $lines = $1;
244 2         3 print " Lines: $lines\n" if debug;
245 2         3 my $read = '';
246 2         12 for (1 .. $lines) {
247 37         55 my $bodyline = _read_nextline($self);
248 37 50       74 last unless defined $bodyline;
249 37 50       61 push @cache, $bodyline unless $self->{seekable};
250             # unescape From_
251 37 50       64 $bodyline =~ s/^>(>*From )/$1/ if $self->{unescape};
252 37         56 $read .= $bodyline;
253             }
254 2         7 my $ign = _read_nextline($self); # trailing newline
255 2         5 my $next = _read_nextline($self);
256 2 100 66     15 if (!defined $next || $next =~ /^From /) {
257 1         4 $self->{from} = $next;
258 1         6 $mail .= "$/$read";
259 1         5 return \$mail;
260             }
261 1 50       39 push @cache, $ign, $next unless $self->{seekable};
262             # seek back and scan line-by-line like the header
263             # wasn't here
264 1         2 print " Lines assertion failed '$next'\n" if debug;
265 1 50       4 if ($self->{seekable}) {
266 1         11 seek $fh, $pos, 0;
267             }
268             else {
269 0         0 unshift @{$self->{cache}}, @cache;
  0         0  
270             }
271             }
272             }
273              
274 2292 100 100     4580 if ($prev eq $/ && ($line =~ $self->_from_line_re)) {
275 20         21 $mail .= $prev;
276 20         21 $last = $line;
277 20         28 last;
278             }
279              
280 2272         3102 $mail .= $prev;
281 2272         2341 $prev = $line;
282              
283             # unescape From_
284 2272 50       5786 $prev =~ s/^>(>*From )/$1/ if $self->{unescape};
285             }
286 29         35 print "$count end of message line $.\n" if debug;
287 29         53 $self->{from} = $last;
288 29 100       63 return unless $mail;
289 22         126 return \$mail;
290             }
291              
292             sub next_message {
293 47     47 1 64 my $self = shift;
294 47         92 my $ref = $self->next_messageref;
295 47 100       108 return unless $ref;
296 40         34 return ${$ref};
  40         230  
297             }
298              
299             my @FROM_RE;
300             BEGIN {
301 1     1   74 @FROM_RE = (
302             # according to mutt:
303             # A valid message separator looks like:
304             # From [ ]
305             qr/^From (?:\S+\s+)?(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/,
306              
307             # though, as jwz rants, only this is reliable and portable
308             qr/^From /,
309             );
310             }
311              
312             sub _from_line_re {
313 236 50   236   1437 return $FROM_RE[ $_[0]->{jwz_From_} ? 1 : 0 ];
314             }
315              
316             sub tell {
317 1     1 1 3 my $self = shift;
318 1         4 return tell $self->{_fh};
319             }
320              
321             1;
322              
323             __END__