File Coverage

blib/lib/Email/Outlook/Message.pm
Criterion Covered Total %
statement 202 205 98.5
branch 55 64 85.9
condition 9 13 69.2
subroutine 33 33 100.0
pod 2 2 100.0
total 301 317 94.9


line stmt bran cond sub pod time code
1             package Email::Outlook::Message;
2             =head1 NAME
3              
4             Email::Outlook::Message.pm - Read Outlook .msg files
5              
6             =head1 SYNOPSIS
7              
8             use Email::Outlook::Message;
9              
10             my $msg = new Email::Outlook::Message $filename, $verbose;
11             my $mime = $msg->to_email_mime;
12             $mime->as_string;
13              
14             =head1 DESCRIPTION
15              
16             Parses .msg message files as produced by Microsoft Outlook.
17              
18             =head1 METHODS
19              
20             =over 8
21              
22             =item B
23              
24             Parse the file pointed at by $msg. Set $verbose to a true value to
25             print information about skipped parts of the .msg file on STDERR.
26              
27             =item B
28              
29             Output result as an Email::MIME object.
30              
31             =back
32              
33             =head1 BUGS
34              
35             Not all data that is in the .msg file is converted. There are some
36             parts whose meaning escapes me, although more documentation on MIME
37             properties is available these days. Other parts do not make sense outside
38             of Outlook and Exchange.
39              
40             GPG signed mail is not processed correctly. Neither are attachments of
41             type 'appledoublefile'.
42              
43             It would be nice if we could write .MSG files too, but that will require
44             quite a big rewrite.
45              
46             =head1 AUTHOR
47              
48             Matijs van Zuijlen, C
49              
50             =head1 COPYRIGHT AND LICENSE
51              
52             Copyright 2002--2020 by Matijs van Zuijlen
53              
54             This module is free software; you can redistribute it and/or modify
55             it under the same terms as Perl itself.
56              
57             =cut
58 8     8   566592 use strict;
  8         80  
  8         242  
59 8     8   45 use warnings;
  8         14  
  8         195  
60 8     8   204 use 5.006;
  8         29  
61 8     8   53 use vars qw($VERSION);
  8         16  
  8         640  
62             $VERSION = "0.920";
63              
64 8     8   4231 use Email::Simple;
  8         37992  
  8         258  
65 8     8   3522 use Email::MIME::Creator;
  8         475810  
  8         280  
66 8     8   3768 use Email::Outlook::Message::AddressInfo;
  8         33  
  8         306  
67 8     8   4044 use Email::Outlook::Message::Attachment;
  8         25  
  8         258  
68 8     8   56 use Carp;
  8         15  
  8         442  
69 8     8   46 use base 'Email::Outlook::Message::Base';
  8         16  
  8         20640  
70              
71             our $skipheaders = {
72             map { uc($_) => 1 }
73             "MIME-Version",
74             "Content-Type",
75             "Content-Transfer-Encoding",
76             "X-Mailer",
77             "X-Msgconvert",
78             "X-MS-Tnef-Correlator",
79             "X-MS-Has-Attach"
80             };
81              
82             our $MAP_SUBITEM_FILE = {
83             '1000' => "BODY_PLAIN", # Body
84             '1009' => "BODY_RTF", # Compressed-RTF version of body
85             '1013' => "BODY_HTML", # HTML Version of body
86             '0037' => "SUBJECT", # Subject
87             '0047' => "SUBMISSION_ID", # Seems to contain the date
88             '007D' => "HEAD", # Full headers
89             '0C1A' => "FROM", # From: Name
90             '0C1E' => "FROM_ADDR_TYPE", # From: Address type
91             '0C1F' => "FROM_ADDR", # From: Address
92             '0E04' => "TO", # To: Names
93             '0E03' => "CC", # Cc: Names
94             '1035' => "MESSAGEID", # Message-Id
95             '1039' => "REFERENCES", # References: Header
96             '1042' => "INREPLYTO", # In reply to Message-Id
97             '3007' => 'DATE2ND', # Creation Time
98             '0039' => 'DATE1ST', # Outlook sent date
99             '3FDE' => 'CODEPAGE', # Code page for text or html body
100             };
101              
102             # Map codepage numbers to charset names. Codepages not listed here just get
103             # 'CP' prepended, so 1252 -> 'CP1252'.
104             our $MAP_CODEPAGE = {
105             20127 => 'US-ASCII',
106             20866 => 'KOI8-R',
107             28591 => 'ISO-8859-1',
108             65001 => 'UTF-8',
109             };
110              
111             #
112             # Main body of module
113             #
114              
115             sub new {
116 7     7 1 686 my $class = shift;
117 7 100       252 my $file = shift or croak "File name is required parameter";
118 6         17 my $verbose = shift;
119              
120 6         25 my $self = $class->_empty_new;
121              
122 6         56 $self->{EMBEDDED} = 0;
123              
124 6         59 my $msg = OLE::Storage_Lite->new($file);
125 6         95 my $pps = $msg->getPpsTree(1);
126 6 50       174520 $pps or croak "Parsing $file as OLE file failed";
127 6         101 $self->_set_verbosity($verbose);
128             # TODO: Use separate object as parser?
129 6         49 $self->_process_pps($pps);
130              
131 6         330 return $self;
132             }
133              
134             sub _empty_new {
135 7     7   112 my $class = shift;
136              
137 7         64 return bless {
138             ADDRESSES => [], ATTACHMENTS => [], FROM_ADDR_TYPE => "",
139             VERBOSE => 0, EMBEDDED => 1
140             }, $class;
141             }
142              
143             sub to_email_mime {
144 10     10 1 7342 my $self = shift;
145              
146 10         37 my $bodymime;
147             my $mime;
148              
149 10         0 my @parts;
150              
151 10 100       39 if ($self->{BODY_PLAIN}) { push(@parts, $self->_create_mime_plain_body()); }
  8         34  
152 10 100       27935 if ($self->{BODY_HTML}) { push(@parts, $self->_create_mime_html_body()); }
  3         17  
153 10 100       4807 if ($self->{BODY_RTF}) { push(@parts, $self->_create_mime_rtf_body()); }
  5         28  
154              
155 10 100       6830 if ((scalar @parts) > 1) {
    100          
156 7         25 for (@parts) { $self->_clean_part_header($_) };
  14         49  
157              
158 7         61 $bodymime = Email::MIME->create(
159             attributes => {
160             content_type => "multipart/alternative",
161             encoding => "8bit",
162             },
163             parts => \@parts
164             );
165             } elsif ((@parts) == 1) {
166 2         4 $bodymime = $parts[0];
167             } else {
168 1         5 $bodymime = $self->_create_mime_plain_body();
169             }
170              
171 10 100       36902 if (@{$self->{ATTACHMENTS}}>0) {
  10         51  
172 2         8 $self->_clean_part_header($bodymime);
173 2         16 my $mult = Email::MIME->create(
174             attributes => {
175             content_type => "multipart/mixed",
176             encoding => "8bit",
177             },
178             parts => [$bodymime],
179             );
180 2         6437 foreach my $att (@{$self->{ATTACHMENTS}}) {
  2         8  
181 2         10 $self->_SaveAttachment($mult, $att);
182             }
183 2         8 $mime = $mult;
184             } else {
185 8         21 $mime = $bodymime;
186             }
187              
188             #$mime->header_set('Date', undef);
189 10         48 $self->_SetHeaderFields($mime);
190 10         93 $self->_copy_header_data($mime);
191              
192 10         142 return $mime;
193             }
194              
195             #
196             # Below are functions that walk the PPS tree. This is simply a tree walk.
197             # It's not really recursive (except when an attachment contains a .msg
198             # file), since the tree is shallow (max. 1 subdirectory deep).
199             #
200             # The structure is as follows:
201             #
202             # Root
203             # Items with properties of the e-mail
204             # Dirs containing addresses
205             # Items with properties of the address
206             # Dirs containing Attachments
207             # Items with properties of the attachment (including its data)
208             # Dir that is itself a .msg file (if the attachment is an email).
209             #
210              
211             sub _property_map {
212 6     6   43 return $MAP_SUBITEM_FILE;
213             }
214              
215             #
216             # Process a subdirectory. This is either an address or an attachment.
217             #
218             sub _process_subdirectory {
219 14     14   36 my ($self, $pps) = @_;
220              
221 14         52 $self->_extract_ole_date($pps);
222              
223 14         113 my $name = $self->_get_pps_name($pps);
224              
225 14 100       80 if ($name =~ '__recip_version1 0_ ') { # Address of one recipient
    100          
226 6         51 $self->_process_address($pps);
227             } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
228 2         10 $self->_process_attachment($pps);
229             } else {
230 6         45 $self->_warn_about_unknown_directory($pps);
231             }
232 14         45 return;
233             }
234              
235             #
236             # Process a subdirectory that contains an email address.
237             #
238             sub _process_address {
239 6     6   23 my ($self, $pps) = @_;
240              
241             my $addr_info = Email::Outlook::Message::AddressInfo->new($pps,
242 6         82 $self->{VERBOSE});
243              
244 6         18 push @{$self->{ADDRESSES}}, $addr_info;
  6         29  
245 6         14 return;
246             }
247              
248             #
249             # Process a subdirectory that contains an attachment.
250             #
251             sub _process_attachment {
252 2     2   7 my ($self, $pps) = @_;
253              
254             my $attachment = Email::Outlook::Message::Attachment->new($pps,
255 2         21 $self->{VERBOSE});
256 2         6 push @{$self->{ATTACHMENTS}}, $attachment;
  2         8  
257 2         6 return;
258             }
259              
260             #
261             # Header length of the property stream depends on whether the Message
262             # object is embedded or not.
263             #
264             sub _property_stream_header_length {
265 6     6   15 my $self = shift;
266 6 50       43 return ($self->{EMBEDDED} ? 24 : 32)
267             }
268              
269             #
270             # Helper functions
271             #
272              
273             #
274             # Extract time stamp of this OLE item (this is in GMT)
275             #
276             sub _extract_ole_date {
277 14     14   31 my ($self, $pps) = @_;
278 14 100       43 unless (defined ($self->{OLEDATE})) {
279             # Make Date
280 7         15 my $datearr;
281 7         17 $datearr = $pps->{Time2nd};
282 7 100 66     53 $datearr = $pps->{Time1st} unless $datearr and $datearr->[0];
283 7 100 66     74 $self->{OLEDATE} = $self->_format_date($datearr) if $datearr and $datearr->[0];
284             }
285 14         37 return;
286             }
287              
288             # If we didn't get the date from the original header data, we may be able
289             # to get it from the SUBMISSION_ID:
290             # It seems to have the format of a semicolon-separated list of key=value
291             # pairs. The key l has a value with the format:
292             # -Z-, where DATETIME is the date and time (gmt)
293             # in the format YYMMDDHHMMSS.
294             sub _submission_id_date {
295 11     11   1429 my $self = shift;
296              
297 11 100       76 my $submission_id = $self->{SUBMISSION_ID} or return;
298 5 50       81 $submission_id =~ m/ l=.*- (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) Z-.* /x
299             or return;
300 5         19 my $year = $1;
301 5 50       17 $year += 100 if $year < 20;
302 5         40 return $self->_format_date([$6,$5,$4,$3,$2-1,$year]);
303             }
304              
305             sub _SaveAttachment {
306 2     2   6 my ($self, $mime, $att) = @_;
307              
308 2         11 my $m = $att->to_email_mime;
309 2         9 $self->_clean_part_header($m);
310 2         13 $mime->parts_add([$m]);
311 2         6323 return;
312             }
313              
314             # Set header fields
315             sub _AddHeaderField {
316 110     110   296 my ($self, $mime, $fieldname, $value) = @_;
317              
318             #my $oldvalue = $mime->header($fieldname);
319             #return if $oldvalue;
320 110 100       313 $mime->header_set($fieldname, $value) if $value;
321 110         1878 return;
322             }
323              
324             sub _Address {
325 10     10   38 my ($self, $tag) = @_;
326              
327 10   100     53 my $result = $self->{$tag} || "";
328              
329 10   100     52 my $address = $self->{$tag . "_ADDR"} || "";
330 10 100       33 if ($address) {
331 3 50       12 $result .= " " if $result;
332 3         12 $result .= "<$address>";
333             }
334              
335 10         39 return $result;
336             }
337              
338             # Find SMTP addresses for the given list of names
339             sub _expand_address_list {
340 20     20   55 my ($self, $names) = @_;
341              
342 20 100       69 return "" unless defined $names;
343              
344 11         48 my @namelist = split / ; [ ]* /x, $names;
345 11         20 my @result;
346 11         29 name: foreach my $name (@namelist) {
347 6         25 my $addresstext = $self->_find_name_in_addresspool($name);
348 6 100       23 if ($addresstext) {
349 5         18 push @result, $addresstext;
350             } else {
351 1         4 push @result, $name;
352             }
353             }
354 11         62 return join ", ", @result;
355             }
356              
357             sub _find_name_in_addresspool {
358 6     6   17 my ($self, $name) = @_;
359              
360 6         19 my $addresspool = $self->{ADDRESSES};
361              
362 6         10 foreach my $address (@{$addresspool}) {
  6         18  
363 6 100       35 if ($name eq $address->name) {
364 5         34 return $address->display_address;
365             }
366             }
367 1         8 return;
368             }
369              
370             # TODO: Don't really want to need this!
371             sub _clean_part_header {
372 18     18   45 my ($self, $part) = @_;
373 18         79 $part->header_set('Date');
374 18 100       970 unless ($part->content_type =~ m{ ^ multipart / }x) {
375 15         716 $part->header_set('MIME-Version')
376             };
377 18         804 return;
378             }
379              
380             sub _body_character_set {
381 12     12   22 my $self = shift;
382 12         43 return _codepage_to_charset($self->{CODEPAGE});
383             }
384              
385             sub _codepage_to_charset {
386 12     12   29 my $codepage = shift;
387 12 100       33 if (defined $codepage) {
388 5   33     30 return $MAP_CODEPAGE->{$codepage} || "CP$codepage";
389             }
390 7         30 return 'CP1252';
391             }
392              
393             sub _create_mime_plain_body {
394 9     9   20 my $self = shift;
395 9         31 my $charset = $self->_body_character_set;
396 9         25 my $body_str = $self->{BODY_PLAIN};
397 9 100       40 if ($charset ne "UTF-8") {
398             # In this case, the body is a string of octets and needs to be decoded.
399 6         37 $body_str = Encode::decode($charset, $body_str);
400             }
401 9         2163 return Email::MIME->create(
402             attributes => {
403             content_type => "text/plain",
404             charset => $charset,
405             disposition => "inline",
406             encoding => "8bit",
407             },
408             body_str => $body_str
409             );
410             }
411              
412             sub _create_mime_html_body {
413 3     3   6 my $self = shift;
414             return Email::MIME->create(
415             attributes => {
416             content_type => "text/html",
417             charset => $self->_body_character_set,
418             disposition => "inline",
419             encoding => "8bit",
420             },
421             body => $self->{BODY_HTML}
422 3         12 );
423             }
424              
425             # Implementation based on the information in
426             # http://www.freeutils.net/source/jtnef/rtfcompressed.jsp,
427             # and the implementation in tnef version 1.4.5.
428             my $MAGIC_COMPRESSED_RTF = 0x75465a4c;
429             my $MAGIC_UNCOMPRESSED_RTF = 0x414c454d;
430             my $BASE_BUFFER =
431             "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman "
432             . "\\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArial"
433             . "Times New RomanCourier{\\colortbl\\red0\\green0\\blue0\n\r\\par "
434             . "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
435              
436              
437             sub _create_mime_rtf_body {
438 5     5   15 my $self = shift;
439 5         19 my $data = $self->{BODY_RTF};
440              
441 5         36 my ($size, $rawsize, $magic, $crc) = unpack "V4", substr $data, 0, 16;
442              
443 5         16 my $buffer;
444              
445 5 50       23 if ($magic == $MAGIC_COMPRESSED_RTF) {
    0          
446 5         14 $buffer = $BASE_BUFFER;
447 5         16 my $output_length = length($buffer) + $rawsize;
448 5         13 my @flags;
449 5         15 my $in = 16;
450 5         24 while (length($buffer) < $output_length) {
451 456 100       934 if (@flags == 0) {
452 60         251 @flags = split "", unpack "b8", substr $data, $in++, 1;
453             }
454 456         810 my $flag = shift @flags;
455 456 100       885 if ($flag eq "0") {
456 195         434 $buffer .= substr $data, $in++, 1;
457             } else {
458 261         590 my ($a, $b) = unpack "C2", substr $data, $in, 2;
459 261         481 my $offset = ($a << 4) | ($b >> 4);
460 261         441 my $length = ($b & 0xf) + 2;
461 261         394 my $buflen = length $buffer;
462 261         432 my $longoffset = $buflen - ($buflen % 4096) + $offset;
463 261 50       511 if ($longoffset >= $buflen) { $longoffset -= 4096; }
  0         0  
464 261         552 while ($length > 0) {
465 1364         2206 $buffer .= substr $buffer, $longoffset, 1;
466 1364         1966 $length--;
467 1364         2603 $longoffset++;
468             }
469 261         597 $in += 2;
470             }
471             }
472 5         28 $buffer = substr $buffer, length $BASE_BUFFER;
473             } elsif ($magic == $MAGIC_UNCOMPRESSED_RTF) {
474 0         0 $buffer = substr $data, 16;
475             } else {
476 0         0 carp "Incorrect magic number in RTF body.\n";
477             }
478 5         55 return Email::MIME->create(
479             attributes => {
480             content_type => "application/rtf",
481             disposition => "inline",
482             encoding => "base64",
483             },
484             body => $buffer
485             );
486             }
487             # Copy original header data.
488             # Note: This should contain the Date: header.
489             sub _copy_header_data {
490 11     11   2751 my ($self, $mime) = @_;
491              
492 11 100       45 defined $self->{HEAD} or return;
493              
494             # The extra \n is needed for Email::Simple to pick up all headers.
495             # This is a change in Email::Simple.
496 8         72 my $parsed = Email::Simple->new($self->{HEAD} . "\n");
497              
498 8         2835 foreach my $tag (grep { !$skipheaders->{uc $_}} $parsed->header_names) {
  115         942  
499 74         10036 $mime->header_set($tag, $parsed->header($tag));
500             }
501 8         981 return;
502             }
503              
504             # Set header fields
505             sub _SetHeaderFields {
506 10     10   30 my ($self, $mime) = @_;
507              
508 10         59 $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
509 10         76 $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
510             #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
511 10         56 $self->_AddHeaderField($mime, 'To', $self->_expand_address_list($self->{TO}));
512 10         63 $self->_AddHeaderField($mime, 'Cc', $self->_expand_address_list($self->{CC}));
513 10         59 $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
514 10         53 $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
515 10         45 $self->_AddHeaderField($mime, 'References', $self->{REFERENCES});
516              
517             # Least preferred option to set the Date: header; this uses the date the
518             # msg file was saved.
519 10         44 $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
520              
521             # Second preferred option: get it from the SUBMISSION_ID:
522 10         73 $self->_AddHeaderField($mime, 'Date', $self->_submission_id_date());
523              
524             # Most preferred option from the property list
525 10         85 $self->_AddHeaderField($mime, 'Date', $self->{DATE2ND});
526 10         43 $self->_AddHeaderField($mime, 'Date', $self->{DATE1ST});
527              
528             # After this, we'll try getting the date from the original headers.
529 10         20 return;
530             }
531              
532             1;