File Coverage

blib/lib/Email/Outlook/Message.pm
Criterion Covered Total %
statement 209 212 98.5
branch 57 66 86.3
condition 12 16 75.0
subroutine 34 34 100.0
pod 2 2 100.0
total 314 330 95.1


line stmt bran cond sub pod time code
1             package Email::Outlook::Message;
2             =head1 NAME
3              
4             Email::Outlook::Message.pm - Read Outlook .msg files
5              
6             =head1 SYNOPSIS
7              
8             use Email::Outlook::Message;
9              
10             my $msg = new Email::Outlook::Message $filename, $verbose;
11             my $mime = $msg->to_email_mime;
12             $mime->as_string;
13              
14             =head1 DESCRIPTION
15              
16             Parses .msg message files as produced by Microsoft Outlook.
17              
18             =head1 METHODS
19              
20             =over 8
21              
22             =item B
23              
24             Parse the file pointed at by $msg. Set $verbose to a true value to
25             print information about skipped parts of the .msg file on STDERR.
26              
27             =item B
28              
29             Output result as an Email::MIME object.
30              
31             =back
32              
33             =head1 BUGS
34              
35             Not all data that is in the .msg file is converted. There are some
36             parts whose meaning escapes me, although more documentation on MIME
37             properties is available these days. Other parts do not make sense outside
38             of Outlook and Exchange.
39              
40             GPG signed mail is not processed correctly. Neither are attachments of
41             type 'appledoublefile'.
42              
43             It would be nice if we could write .MSG files too, but that will require
44             quite a big rewrite.
45              
46             =head1 AUTHOR
47              
48             Matijs van Zuijlen, C
49              
50             =head1 COPYRIGHT AND LICENSE
51              
52             Copyright 2002--2020 by Matijs van Zuijlen
53              
54             This module is free software; you can redistribute it and/or modify
55             it under the same terms as Perl itself.
56              
57             =cut
58 9     9   586559 use strict;
  9         91  
  9         264  
59 9     9   45 use warnings;
  9         16  
  9         230  
60 9     9   226 use 5.006;
  9         30  
61 9     9   49 use vars qw($VERSION);
  9         15  
  9         618  
62             $VERSION = "0.921";
63              
64 9     9   4084 use Email::Simple;
  9         39448  
  9         305  
65 9     9   4113 use Email::MIME::Creator;
  9         494590  
  9         292  
66 9     9   4185 use Email::Outlook::Message::AddressInfo;
  9         34  
  9         334  
67 9     9   4531 use Email::Outlook::Message::Attachment;
  9         27  
  9         244  
68 9     9   54 use Carp;
  9         18  
  9         443  
69 9     9   48 use base 'Email::Outlook::Message::Base';
  9         16  
  9         21559  
70              
71             our $skipheaders = {
72             map { uc($_) => 1 }
73             "MIME-Version",
74             "Content-Type",
75             "Content-Transfer-Encoding",
76             "X-Mailer",
77             "X-Msgconvert",
78             "X-MS-Tnef-Correlator",
79             "X-MS-Has-Attach"
80             };
81              
82             our $MAP_SUBITEM_FILE = {
83             '1000' => "BODY_PLAIN", # Body
84             '1009' => "BODY_RTF", # Compressed-RTF version of body
85             '1013' => "BODY_HTML", # HTML Version of body
86             '0037' => "SUBJECT", # Subject
87             '0047' => "SUBMISSION_ID", # Seems to contain the date
88             '007D' => "HEAD", # Full headers
89             '0C1A' => "FROM", # From: Name
90             '0C1E' => "FROM_ADDR_TYPE", # From: Address type
91             '0C1F' => "FROM_ADDR", # From: Address
92             '0E04' => "TO", # To: Names
93             '0E03' => "CC", # Cc: Names
94             '1035' => "MESSAGEID", # Message-Id
95             '1039' => "REFERENCES", # References: Header
96             '1042' => "INREPLYTO", # In reply to Message-Id
97             '3007' => 'DATE2ND', # Creation Time
98             '0039' => 'DATE1ST', # Outlook sent date
99             '3FDE' => 'CODEPAGE', # Code page for text or html body
100             };
101              
102             # Map codepage numbers to charset names. Codepages not listed here just get
103             # 'CP' prepended, so 1252 -> 'CP1252'.
104             our $MAP_CODEPAGE = {
105             20127 => 'US-ASCII',
106             20866 => 'KOI8-R',
107             28591 => 'ISO-8859-1',
108             65001 => 'UTF-8',
109             };
110              
111             #
112             # Main body of module
113             #
114              
115             sub new {
116 13     13 1 106066 my $class = shift;
117 13 100       219 my $file = shift or croak "File name is required parameter";
118 12         24 my $verbose = shift;
119              
120 12         45 my $self = $class->_empty_new;
121              
122 12         74 $self->{EMBEDDED} = 0;
123              
124 12         111 my $msg = OLE::Storage_Lite->new($file);
125 12         165 my $pps = $msg->getPpsTree(1);
126 12 50       322153 $pps or croak "Parsing $file as OLE file failed";
127 12         186 $self->_set_verbosity($verbose);
128             # TODO: Use separate object as parser?
129 12         74 $self->_process_pps($pps);
130              
131 12         836 return $self;
132             }
133              
134             sub _empty_new {
135 13     13   100 my $class = shift;
136              
137 13         95 return bless {
138             ADDRESSES => [], ATTACHMENTS => [], FROM_ADDR_TYPE => "",
139             VERBOSE => 0, EMBEDDED => 1
140             }, $class;
141             }
142              
143             sub to_email_mime {
144 16     16 1 7063 my $self = shift;
145              
146 16         54 my $bodymime;
147             my $mime;
148              
149 16         0 my @parts;
150              
151 16 100       51 if ($self->{BODY_PLAIN}) { push(@parts, $self->_create_mime_plain_body()); }
  14         54  
152 16 100       35125 if ($self->{BODY_HTML}) { push(@parts, $self->_create_mime_html_body()); }
  4         15  
153 16 100       5585 if ($self->{BODY_RTF}) { push(@parts, $self->_create_mime_rtf_body()); }
  10         44  
154              
155 16 100       12865 if ((scalar @parts) > 1) {
    100          
156 13         39 for (@parts) { $self->_clean_part_header($_) };
  26         78  
157              
158 13         88 $bodymime = Email::MIME->create(
159             attributes => {
160             content_type => "multipart/alternative",
161             encoding => "8bit",
162             },
163             parts => \@parts
164             );
165             } elsif ((@parts) == 1) {
166 2         3 $bodymime = $parts[0];
167             } else {
168 1         3 $bodymime = $self->_create_mime_plain_body();
169             }
170              
171 16 100       53372 if (@{$self->{ATTACHMENTS}}>0) {
  16         75  
172 4         17 $self->_clean_part_header($bodymime);
173 4         31 my $mult = Email::MIME->create(
174             attributes => {
175             content_type => "multipart/mixed",
176             encoding => "8bit",
177             },
178             parts => [$bodymime],
179             );
180 4         11928 foreach my $att (@{$self->{ATTACHMENTS}}) {
  4         18  
181 4         19 $self->_SaveAttachment($mult, $att);
182             }
183 4         14 $mime = $mult;
184             } else {
185 12         22 $mime = $bodymime;
186             }
187              
188             #$mime->header_set('Date', undef);
189 16         105 $self->_SetHeaderFields($mime);
190 16         61 $self->_copy_header_data($mime);
191              
192 16         274 return $mime;
193             }
194              
195             #
196             # Below are functions that walk the PPS tree. This is simply a tree walk.
197             # It's not really recursive (except when an attachment contains a .msg
198             # file), since the tree is shallow (max. 1 subdirectory deep).
199             #
200             # The structure is as follows:
201             #
202             # Root
203             # Items with properties of the e-mail
204             # Dirs containing addresses
205             # Items with properties of the address
206             # Dirs containing Attachments
207             # Items with properties of the attachment (including its data)
208             # Dir that is itself a .msg file (if the attachment is an email).
209             #
210              
211             sub _property_map {
212 12     12   68 return $MAP_SUBITEM_FILE;
213             }
214              
215             #
216             # Process a subdirectory. This is either an address or an attachment.
217             #
218             sub _process_subdirectory {
219 28     28   57 my ($self, $pps) = @_;
220              
221 28         98 $self->_extract_ole_date($pps);
222              
223 28         155 my $name = $self->_get_pps_name($pps);
224              
225 28 100       136 if ($name =~ '__recip_version1 0_ ') { # Address of one recipient
    100          
226 12         52 $self->_process_address($pps);
227             } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
228 4         27 $self->_process_attachment($pps);
229             } else {
230 12         67 $self->_warn_about_unknown_directory($pps);
231             }
232 28         75 return;
233             }
234              
235             #
236             # Process a subdirectory that contains an email address.
237             #
238             sub _process_address {
239 12     12   43 my ($self, $pps) = @_;
240              
241             my $addr_info = Email::Outlook::Message::AddressInfo->new($pps,
242 12         166 $self->{VERBOSE});
243              
244 12         23 push @{$self->{ADDRESSES}}, $addr_info;
  12         50  
245 12         56 return;
246             }
247              
248             #
249             # Process a subdirectory that contains an attachment.
250             #
251             sub _process_attachment {
252 4     4   31 my ($self, $pps) = @_;
253              
254             my $attachment = Email::Outlook::Message::Attachment->new($pps,
255 4         40 $self->{VERBOSE});
256 4         10 push @{$self->{ATTACHMENTS}}, $attachment;
  4         13  
257 4         10 return;
258             }
259              
260             #
261             # Header length of the property stream depends on whether the Message
262             # object is embedded or not.
263             #
264             sub _property_stream_header_length {
265 12     12   31 my $self = shift;
266 12 50       56 return ($self->{EMBEDDED} ? 24 : 32)
267             }
268              
269             #
270             # Helper functions
271             #
272              
273             #
274             # Extract time stamp of this OLE item (this is in GMT)
275             #
276             sub _extract_ole_date {
277 28     28   57 my ($self, $pps) = @_;
278 28 100       80 unless (defined ($self->{OLEDATE})) {
279             # Make Date
280 14         22 my $datearr;
281 14         30 $datearr = $pps->{Time2nd};
282 14 100 66     90 $datearr = $pps->{Time1st} unless $datearr and $datearr->[0];
283 14 100 66     128 $self->{OLEDATE} = $self->_format_date($datearr) if $datearr and $datearr->[0];
284             }
285 28         57 return;
286             }
287              
288             # If we didn't get the date from the original header data, we may be able
289             # to get it from the SUBMISSION_ID:
290             # It seems to have the format of a semicolon-separated list of key=value
291             # pairs. The key l has a value with the format:
292             # -Z-, where DATETIME is the date and time (gmt)
293             # in the format YYMMDDHHMMSS.
294             sub _submission_id_date {
295 17     17   1174 my $self = shift;
296              
297 17 100       106 my $submission_id = $self->{SUBMISSION_ID} or return;
298 5 50       30 $submission_id =~ m/ l=.*- (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) Z-.* /x
299             or return;
300 5         12 my $year = $1;
301 5 50       16 $year += 100 if $year < 20;
302 5         32 return $self->_format_date([$6,$5,$4,$3,$2-1,$year]);
303             }
304              
305             sub _SaveAttachment {
306 4     4   13 my ($self, $mime, $att) = @_;
307              
308 4         19 my $m = $att->to_email_mime;
309 4         39 $self->_clean_part_header($m);
310 4         28 $mime->parts_add([$m]);
311 4         11165 return;
312             }
313              
314             # Set header fields
315             sub _AddHeaderField {
316 176     176   399 my ($self, $mime, $fieldname, $value) = @_;
317              
318             #my $oldvalue = $mime->header($fieldname);
319             #return if $oldvalue;
320 176 100       504 $mime->header_set($fieldname, $value) if $value;
321 176         2922 return;
322             }
323              
324             sub _Address {
325 16     16   42 my ($self, $tag) = @_;
326              
327 16   100     90 my $result = $self->{$tag} || "";
328              
329 16   100     83 my $address = $self->{$tag . "_ADDR"} || "";
330 16 100       45 if ($address) {
331 6 50       23 $result .= " " if $result;
332 6         27 $result .= "<$address>";
333             }
334              
335 16         77 return $result;
336             }
337              
338             # Find SMTP addresses for the given list of names
339             sub _expand_address_list {
340 32     32   79 my ($self, $names) = @_;
341              
342 32 100       94 return "" unless defined $names;
343              
344 22         83 my @namelist = split / ; [ ]* /x, $names;
345 22         35 my @result;
346 22         55 name: foreach my $name (@namelist) {
347 12         46 my $addresstext = $self->_find_name_in_addresspool($name);
348 12 100       39 if ($addresstext) {
349 10         35 push @result, $addresstext;
350             } else {
351 2         6 push @result, $name;
352             }
353             }
354 22         101 return join ", ", @result;
355             }
356              
357             sub _find_name_in_addresspool {
358 12     12   43 my ($self, $name) = @_;
359              
360 12         29 my $addresspool = $self->{ADDRESSES};
361              
362 12         19 foreach my $address (@{$addresspool}) {
  12         31  
363 12 100       75 if ($name eq $address->name) {
364 10         45 return $address->display_address;
365             }
366             }
367 2         6 return;
368             }
369              
370             # TODO: Don't really want to need this!
371             sub _clean_part_header {
372 34     34   78 my ($self, $part) = @_;
373 34         125 $part->header_set('Date');
374 34 100       1513 unless ($part->content_type =~ m{ ^ multipart / }x) {
375 28         1184 $part->header_set('MIME-Version')
376             };
377 34         1289 return;
378             }
379              
380             sub _body_plain_character_set {
381 15     15   37 my $self = shift;
382 15         46 my $body_encoding = $self->{BODY_PLAIN_ENCODING};
383 15         54 $self->_body_character_set($body_encoding)
384             }
385              
386             sub _body_html_character_set {
387 4     4   7 my $self = shift;
388 4         8 my $body_encoding = $self->{BODY_HTML_ENCODING};
389 4         12 $self->_body_character_set($body_encoding)
390             }
391              
392             sub _body_character_set {
393 19     19   38 my $self = shift;
394 19         35 my $body_encoding = shift;
395 19         40 my $codepage = $self->{CODEPAGE};
396 19 100 100     118 if (defined $body_encoding && $body_encoding eq "001F") {
    100          
397 4         14 return "UTF-8";
398             } elsif (defined $codepage) {
399 6   33     31 return $MAP_CODEPAGE->{$codepage} || "CP$codepage";
400             } else {
401 9         39 return 'CP1252';
402             }
403             }
404              
405             sub _create_mime_plain_body {
406 15     15   38 my $self = shift;
407 15         51 my $charset = $self->_body_plain_character_set;
408 15         36 my $body_str = $self->{BODY_PLAIN};
409 15 100       56 if ($charset ne "UTF-8") {
410             # In this case, the body is a string of octets and needs to be decoded.
411 9         55 $body_str = Encode::decode($charset, $body_str);
412             }
413 15         2173 return Email::MIME->create(
414             attributes => {
415             content_type => "text/plain",
416             charset => $charset,
417             disposition => "inline",
418             encoding => "8bit",
419             },
420             body_str => $body_str
421             );
422             }
423              
424             sub _create_mime_html_body {
425 4     4   7 my $self = shift;
426             return Email::MIME->create(
427             attributes => {
428             content_type => "text/html",
429             charset => $self->_body_html_character_set,
430             disposition => "inline",
431             encoding => "8bit",
432             },
433             body => $self->{BODY_HTML}
434 4         16 );
435             }
436              
437             # Implementation based on the information in
438             # http://www.freeutils.net/source/jtnef/rtfcompressed.jsp,
439             # and the implementation in tnef version 1.4.5.
440             my $MAGIC_COMPRESSED_RTF = 0x75465a4c;
441             my $MAGIC_UNCOMPRESSED_RTF = 0x414c454d;
442             my $BASE_BUFFER =
443             "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman "
444             . "\\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArial"
445             . "Times New RomanCourier{\\colortbl\\red0\\green0\\blue0\n\r\\par "
446             . "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
447              
448              
449             sub _create_mime_rtf_body {
450 10     10   23 my $self = shift;
451 10         28 my $data = $self->{BODY_RTF};
452              
453 10         55 my ($size, $rawsize, $magic, $crc) = unpack "V4", substr $data, 0, 16;
454              
455 10         21 my $buffer;
456              
457 10 50       36 if ($magic == $MAGIC_COMPRESSED_RTF) {
    0          
458 10         22 $buffer = $BASE_BUFFER;
459 10         25 my $output_length = length($buffer) + $rawsize;
460 10         18 my @flags;
461 10         20 my $in = 16;
462 10         37 while (length($buffer) < $output_length) {
463 912 100       1415 if (@flags == 0) {
464 120         378 @flags = split "", unpack "b8", substr $data, $in++, 1;
465             }
466 912         1229 my $flag = shift @flags;
467 912 100       1289 if ($flag eq "0") {
468 390         718 $buffer .= substr $data, $in++, 1;
469             } else {
470 522         897 my ($a, $b) = unpack "C2", substr $data, $in, 2;
471 522         756 my $offset = ($a << 4) | ($b >> 4);
472 522         642 my $length = ($b & 0xf) + 2;
473 522         604 my $buflen = length $buffer;
474 522         672 my $longoffset = $buflen - ($buflen % 4096) + $offset;
475 522 50       809 if ($longoffset >= $buflen) { $longoffset -= 4096; }
  0         0  
476 522         788 while ($length > 0) {
477 2728         3428 $buffer .= substr $buffer, $longoffset, 1;
478 2728         2932 $length--;
479 2728         3868 $longoffset++;
480             }
481 522         926 $in += 2;
482             }
483             }
484 10         43 $buffer = substr $buffer, length $BASE_BUFFER;
485             } elsif ($magic == $MAGIC_UNCOMPRESSED_RTF) {
486 0         0 $buffer = substr $data, 16;
487             } else {
488 0         0 carp "Incorrect magic number in RTF body.\n";
489             }
490 10         86 return Email::MIME->create(
491             attributes => {
492             content_type => "application/rtf",
493             disposition => "inline",
494             encoding => "base64",
495             },
496             body => $buffer
497             );
498             }
499             # Copy original header data.
500             # Note: This should contain the Date: header.
501             sub _copy_header_data {
502 17     17   2177 my ($self, $mime) = @_;
503              
504 17 100       52 defined $self->{HEAD} or return;
505              
506             # The extra \n is needed for Email::Simple to pick up all headers.
507             # This is a change in Email::Simple.
508 11         117 my $parsed = Email::Simple->new($self->{HEAD} . "\n");
509              
510 11         4362 foreach my $tag (grep { !$skipheaders->{uc $_}} $parsed->header_names) {
  190         1280  
511 143         18081 $mime->header_set($tag, $parsed->header($tag));
512             }
513 11         1393 return;
514             }
515              
516             # Set header fields
517             sub _SetHeaderFields {
518 16     16   47 my ($self, $mime) = @_;
519              
520 16         85 $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
521 16         60 $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
522             #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
523 16         60 $self->_AddHeaderField($mime, 'To', $self->_expand_address_list($self->{TO}));
524 16         64 $self->_AddHeaderField($mime, 'Cc', $self->_expand_address_list($self->{CC}));
525 16         63 $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
526 16         69 $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
527 16         59 $self->_AddHeaderField($mime, 'References', $self->{REFERENCES});
528              
529             # Least preferred option to set the Date: header; this uses the date the
530             # msg file was saved.
531 16         60 $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
532              
533             # Second preferred option: get it from the SUBMISSION_ID:
534 16         74 $self->_AddHeaderField($mime, 'Date', $self->_submission_id_date());
535              
536             # Most preferred option from the property list
537 16         165 $self->_AddHeaderField($mime, 'Date', $self->{DATE2ND});
538 16         61 $self->_AddHeaderField($mime, 'Date', $self->{DATE1ST});
539              
540             # After this, we'll try getting the date from the original headers.
541 16         35 return;
542             }
543              
544             1;