File Coverage

blib/lib/IMAP/BodyStructure.pm
Criterion Covered Total %
statement 173 182 95.0
branch 62 72 86.1
condition 15 24 62.5
subroutine 26 28 92.8
pod 8 8 100.0
total 284 314 90.4


line stmt bran cond sub pod time code
1             package IMAP::BodyStructure;
2 1     1   45147 use strict;
  1         3  
  1         44  
3              
4             # $Id: BodyStructure.pm,v 1.17 2006/05/02 16:56:36 kappa Exp $
5              
6             =head1 NAME
7              
8             IMAP::BodyStructure - IMAP4-compatible BODYSTRUCTURE and ENVELOPE parser
9              
10             =head1 SYNOPSIS
11            
12             use IMAP::BodyStructure;
13              
14             # $imap is a low-level IMAP-client with an ability to fetch items
15             # by message uids
16              
17             my $bs = new IMAP::BodyStructure
18             $imap->imap_fetch($msg_uid,
19             'BODYSTRUCTURE', 1)->[0]->{BODYSTRUCTURE};
20              
21             print "[UID:$msg_uid] message is in Russian. Sure.\n"
22             if $bs->charset =~ /(?:koi8-r|windows-1251)/i;
23              
24             my $part = $bs->part_at('1.3');
25             $part->type =~ m#^image/#
26             and print "The 3rd part is an image named \""
27             . $part->filename . "\"\n";
28              
29             =head1 DESCRIPTION
30              
31             An IMAP4-compatible IMAP server MUST include a full MIME-parser which
32             parses the messages inside IMAP mailboxes and is accessible via
33             BODYSTRUCTURE fetch item. This module provides a Perl interface to
34             parse the output of IMAP4 MIME-parser. Hope no one will have problems
35             with parsing this doc.
36              
37             It is a rather straightforward C-style parser and is
38             therefore much, much faster then the venerable L
39             which is based on a L grammar. I believe it is also
40             more correct when parsing nested multipart C parts. See
41             testsuite if interested.
42              
43             I'd also like to emphasize that I
44             client!> You will need to employ one from CPAN, there are many. A
45             section with examples of getting to a BODYSTRUCTURE fetch item with
46             various Perl IMAP clients available on CPAN would greatly
47             enhance this document.
48              
49             =head1 INTERFACE
50              
51             =cut
52              
53 1     1   22 use 5.005;
  1         3  
  1         49  
54              
55 1     1   6 use vars qw/$VERSION/;
  1         2  
  1         119  
56              
57             $VERSION = '1.01';
58              
59             sub _get_envelope($\$);
60             sub _get_bodystructure(\$;$$);
61             sub _get_npairs(\$);
62             sub _get_ndisp(\$);
63             sub _get_nstring(\$);
64              
65             =head2 METHODS
66              
67             =over 4
68              
69             =item new($)
70              
71             The constructor does most of the work here. It initializes the
72             hierarchial data structure representing all the message parts and their
73             properties. It takes one argument which should be a string returned
74             by IMAP server in reply to a FETCH command with BODYSTRUCTURE item.
75              
76             All the parts on all the levels are represented by IMAP::BodyStructure
77             objects and that enables the uniform access to them. It is a direct
78             implementation of the Composite Design Pattern.
79              
80             =cut
81              
82 1         6 use fields qw/type encoding size disp params parts desc bodystructure
83 1     1   958 part_id cid textlines md5 lang loc envelope/;
  1         1639  
84              
85             sub new {
86 13     13 1 1285 my $class = shift;
87 13   33     86 $class = ref $class || $class;
88 13         27 my $imap_str= shift;
89              
90 13         46 return _get_bodystructure($imap_str, $class);
91             }
92              
93             =item type()
94              
95             Returns the MIME type of the part. Expect something like C
96             or C.
97              
98             =item encoding()
99              
100             Returns the MIME encoding of the part. This is usually one of '7bit',
101             '8bit', 'base64' or 'quoted-printable'.
102              
103             =item size()
104              
105             Returns the size of the part in octets. It is I the size of the
106             data in the part, which may be encoded as quoted-printable leaving us
107             without an obvious method of calculating the exact size of original
108             data.
109              
110             =cut
111              
112             for my $field (qw/type encoding size/) {
113 1     1   200 no strict 'refs';
  1         2  
  1         2166  
114 123     123   525 *$field = sub { return $_[0]->{$field} };
115             }
116              
117             =item disp()
118              
119             Returns the content-disposition of the part. One of 'inline' or
120             'attachment', usually. Defaults to inline, but you should remember
121             that if there IS a disposition but you cannot recognize it than act as
122             if it's 'attachment'. And use case-insensitive comparisons.
123              
124             =cut
125              
126             sub disp {
127 2     2 1 4 my $self = shift;
128              
129 2 50 50     17 return $self->{disp} ? $self->{disp}->[0] || 'inline' : 'inline';
130             }
131              
132             =item charset()
133              
134             Returns the charset of the part OR the charset of the first nested
135             part. This looks like a good heuristic really. Charset is something
136             resembling 'UTF-8', 'US-ASCII', 'ISO-8859-13' or 'KOI8-R'. The standard
137             does not say it should be uppercase, by the way.
138              
139             Can be undefined.
140              
141             =cut
142              
143             sub charset {
144 2     2 1 764 my $self = shift;
145              
146             # get charset from params OR dive into the first part
147             return $self->{params}->{charset}
148 2   0     18 || ($self->{parts} && @{$self->{parts}} && $self->{parts}->[0]->charset)
149             || undef; # please oh please, no '' or '0' charsets
150             }
151              
152             =item filename()
153              
154             Returns the filename specified as a part of Content-Disposition
155             header.
156              
157             Can be undefined.
158              
159             =cut
160              
161             sub filename {
162 2     2 1 4 my $self = shift;
163              
164 2         15 return $self->{disp}->[1]->{filename};
165             }
166              
167             =item description()
168              
169             Returns the description of the part.
170              
171             =cut
172              
173             sub description {
174 0     0 1 0 my $self = shift;
175              
176 0         0 return $self->{desc};
177             }
178              
179             =item parts(;$)
180              
181             This sub acts differently depending on whether you pass it an
182             argument or not.
183              
184             Without any arguments it returns a list of parts in list context and
185             the number in scalar context.
186              
187             Specifying a scalar argument allows you to get an individual part with
188             that index.
189              
190             I
191             etc. but IMAP::BodyStructure objects containing information about the
192             message parts which was extracted from parsing BODYSTRUCTURE IMAP
193             response!>
194              
195             =cut
196              
197             sub parts {
198 14     14 1 26 my $self = shift;
199 14         18 my $arg = shift;
200              
201 14 100       32 if (defined $arg) {
202 13         58 return $self->{parts}->[$arg];
203             } else {
204 1 50       4 return wantarray ? @{$self->{parts}} : scalar @{$self->{parts}};
  0         0  
  1         5  
205             }
206             }
207              
208             =item part_at($)
209              
210             This method returns a message part by its path. A path to a part in
211             the hierarchy is a dot-separated string of part indices. See L for
212             an example. A nested C does not add a hierarchy level
213             UNLESS it is a single part of another C part (with no
214             C levels in between). Instead, it has an additional
215             C<.TEXT> part which refers to the internal IMAP::BodyStructure object.
216             Look, here is an outline of an example message structure with part
217             paths alongside each part.
218              
219             multipart/mixed 1
220             text/plain 1.1
221             application/msword 1.2
222             message/rfc822 1.3
223             multipart/alternative 1.3.TEXT
224             text/plain 1.3.1
225             multipart/related 1.3.2
226             text/html 1.3.2.1
227             image/png 1.3.2.2
228             image/png 1.3.2.3
229              
230             This is a text email with two attachments, one being an MS Word document,
231             and the other is itself a message (probably a forward) which is composed in a
232             graphical MUA and contains two alternative representations, one
233             plain text fallback and one HTML with images (bundled as a
234             C).
235              
236             Another one with several levels of C. This one is hard
237             to compose in a modern MUA, however.
238              
239             multipart/mixed 1
240             text/plain 1.1
241             message/rfc822 1.2
242             message/rfc822 1.2.TEXT
243             text/plain 1.2.1
244              
245             =cut
246              
247             sub part_at {
248 33     33 1 2274 my $self = shift;
249 33         48 my $path = shift;
250              
251 33         137 return $self->_part_at(split /\./, $path);
252             }
253              
254             sub _part_at {
255 77     77   108 my $self = shift;
256 77         154 my @parts = @_;
257              
258 77 100       262 return $self unless @parts; # (cond ((null? l) s)
259              
260 60         119 my $part_num = shift @parts; # (car l)
261              
262 60 100       152 if ($self->type =~ /^multipart\//) {
    100          
263 33 100       105 if (exists $self->{parts}->[$part_num - 1]) {
264 27         89 return $self->{parts}->[$part_num - 1]->_part_at(@parts);
265             } else {
266 6         38 return;
267             }
268             } elsif ($self->type eq 'message/rfc822') {
269 22 100       69 return $self->{bodystructure} if $part_num eq 'TEXT';
270              
271 18 100       35 if ($self->{bodystructure}->type =~ m{^ multipart/ | ^ message/rfc822 \z}xms) {
272 12         34 return $self->{bodystructure}->_part_at($part_num, @parts);
273             } else {
274 6 100       29 return $part_num == 1 ? $self->{bodystructure}->_part_at(@parts) : undef;
275             }
276             } else {
277             # there's no included parts in single non-rfc822 parts
278             # so if you still want one you get undef
279 5 100 66     39 if ($part_num && $part_num ne '1' || @parts) {
      100        
280 3         23 return;
281             } else {
282 2         15 return $self;
283             }
284             }
285             }
286              
287             =item part_path()
288              
289             Returns the part path to the current part.
290              
291             =back
292              
293             =head2 DATA MEMBERS
294              
295             These are additional pieces of information returned by IMAP server and
296             parsed. They are rarely used, though (and rarely defined too, btw), so
297             I chose not to provide access methods for them.
298              
299             =over 4
300              
301             =item params
302              
303             This is a hashref of MIME parameters. The only interesting param is
304             charset and there's a shortcut method for it.
305              
306             =item lang
307              
308             Content language.
309              
310             =item loc
311              
312             Content location.
313              
314             =item cid
315              
316             Content ID.
317              
318             =item md5
319              
320             Content MD5. No one seems to bother with calculating and it is usually
321             undefined.
322              
323             =back
324              
325             B and B members exist only in singlepart parts.
326              
327             =cut
328              
329             sub part_path {
330 0     0 1 0 my $self = shift;
331              
332 0         0 return $self->{part_id};
333             }
334              
335             sub _get_envelope($\$) {
336 7     7   654 eval "$_[0]::Envelope->new(\$_[1])";
337             }
338              
339             sub _get_bodystructure(\$;$$) {
340 66     66   85 my $str = shift;
341 66   50     147 my $class = shift || __PACKAGE__;
342 66         84 my $id = shift;
343              
344 66         177 my __PACKAGE__ $bs = fields::new($class);
345 66   100     10889 $bs->{part_id} = $id || 1; # !defined $id --> top-level message
346             # and single-part has one part with part_id 1
347              
348 66 100       223 my $id_prefix = $id ? "$id." : '';
349              
350 66 100       415 $$str =~ m/\G\s*(?:\(BODYSTRUCTURE\s*)?\(/gc
351             or return 0;
352              
353 53         110 $bs->{parts} = [];
354 53 100       230 if ($$str =~ /\G(?=\()/gc) {
355             # multipart
356 13         27 $bs->{type} = 'multipart/';
357 13         22 my $part_id = 1;
358 13         28 $id_prefix =~ s/\.?TEXT//;
359 13         69 while (my $part_bs = _get_bodystructure($$str, $class, $id_prefix . $part_id++)) {
360 33         33 push @{$bs->{parts}}, $part_bs;
  33         168  
361             }
362              
363 13         43 $bs->{type} .= lc(_get_nstring($$str));
364 13         36 $bs->{params} = _get_npairs($$str);
365 13         33 $bs->{disp} = _get_ndisp($$str);
366 13         27 $bs->{lang} = _get_nstring($$str);
367 13         29 $bs->{loc} = _get_nstring($$str);
368             } else {
369 40         104 $bs->{type} = lc (_get_nstring($$str) . '/' . _get_nstring($$str));
370 40         116 $bs->{params} = _get_npairs($$str);
371 40         87 $bs->{cid} = _get_nstring($$str);
372 40         73 $bs->{desc} = _get_nstring($$str);
373 40         84 $bs->{encoding} = _get_nstring($$str);
374 40         89 $bs->{size} = _get_nstring($$str);
375              
376 40 100       182 if ($bs->{type} eq 'message/rfc822') {
    100          
377 7         29 $bs->{envelope} = _get_envelope($class, $$str);
378 7 100       35 if ($id_prefix =~ s/\.?TEXT//) {
379 1         5 $bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . '1');
380             } else {
381 6         26 $bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . 'TEXT');
382             }
383 7         19 $bs->{textlines} = _get_nstring($$str);
384             } elsif ($bs->{type} =~ /^text\//) {
385 23         45 $bs->{textlines} = _get_nstring($$str);
386             }
387              
388 40         85 $bs->{md5} = _get_nstring($$str);
389 40         92 $bs->{disp} = _get_ndisp($$str);
390 40         80 $bs->{lang} = _get_nstring($$str);
391 40         78 $bs->{loc} = _get_nstring($$str);
392             }
393              
394 53         126 $$str =~ m/\G\s*\)/gc;
395              
396 53         209 return $bs;
397             }
398              
399             sub _get_ndisp(\$) {
400 53     53   65 my $str = shift;
401              
402 53         94 $$str =~ /\G\s+/gc;
403              
404 53 100       178 if ($$str =~ /\GNIL/gc) {
    50          
405 23         49 return undef;
406             } elsif ($$str =~ m/\G\s*\(/gc) {
407 30         36 my @disp;
408              
409 30         55 $disp[0] = _get_nstring($$str);
410 30         67 $disp[1] = _get_npairs($$str);
411              
412 30         76 $$str =~ m/\G\s*\)/gc;
413 30         87 return \@disp;
414             }
415            
416 0         0 return 0;
417             }
418              
419             sub _get_npairs(\$) {
420 83     83   125 my $str = shift;
421              
422 83         166 $$str =~ /\G\s+/gc;
423              
424 83 100       311 if ($$str =~ /\GNIL/gc) {
    50          
425 27         48 return undef;
426             } elsif ($$str =~ m/\G\s*\(/gc) {
427 56         68 my %r;
428 56         60 while ('fareva') {
429 114         210 my ($key, $data) = (_get_nstring($$str), _get_nstring($$str));
430 114 100       324 $key or last;
431              
432 58         181 $r{$key} = $data;
433             }
434              
435 56         131 $$str =~ m/\G\s*\)/gc;
436 56         160 return \%r;
437             }
438            
439 0         0 return 0;
440             }
441              
442             sub _get_nstring(\$) {
443 789     789   14075 my $str = $_[0];
444              
445             # nstring = string / nil
446             # nil = "NIL"
447             # string = quoted / literal
448             # quoted = DQUOTE *QUOTED-CHAR DQUOTE
449             # QUOTED-CHAR = /
450             # "\" quoted-specials
451             # quoted-specials = DQUOTE / "\"
452             # literal = "{" number "}" CRLF *CHAR8
453             # ; Number represents the number of CHAR8s
454              
455             # astring = 1*(any CHAR except "(" / ")" / "{" / SP / CTL / list-wildcards / quoted-specials)
456              
457 789         1327 $$str =~ /\G\s+/gc;
458              
459 789 100       5765 if ($$str =~ /\GNIL/gc) {
    100          
    100          
    100          
460 208         449 return undef;
461             } elsif ($$str =~ m/\G(\"(?>[^\\\"]*(?:\\.[^\\\"]*)*)\")/gc) { # delimited re ala Regexp::Common::delimited + (?>...)
462 340         604 return _unescape($1);
463             } elsif ($$str =~ /\G\{(\d+)\}\r\n/gc) {
464 5         12 my $pos = pos($$str);
465 5         63 my $data = substr $$str, $pos, $1;
466 5         15 pos($$str) = $pos + $1;
467 5         19 return $data;
468             } elsif ($$str =~ /\G([^"\(\)\{ \%\*\"\\\x00-\x1F]+)/gc) {
469 71         227 return $1;
470             }
471              
472 165         362 return 0;
473             }
474              
475             sub _unescape {
476 340     340   625 my $str = shift;
477              
478 340         951 $str =~ s/^"//;
479 340         946 $str =~ s/"$//;
480 340         534 $str =~ s/\\\"/\"/g;
481 340         436 $str =~ s/\\\\/\\/g;
482              
483 340         8056 return $str;
484             }
485              
486             =over 4
487              
488             =item get_enveleope($)
489              
490             Parses a string into IMAP::BodyStructure::Envelope object. See below.
491              
492             =back
493              
494             =head2 IMAP::BodyStructure::Envelope CLASS
495              
496             Every message on an IMAP server has an envelope. You can get it
497             using ENVELOPE fetch item or, and this is relevant, from BODYSTRUCTURE
498             response in case there are some nested messages (parts with type of
499             C). So, if we have a part with such a type then the
500             corresponding IMAP::BodyStructure object always has
501             B data member which is, in turn, an object of
502             IMAP::BodyStructure::Envelope.
503              
504             You can of course use this satellite class on its own, this is very
505             useful when generating meaningful message lists in IMAP folders.
506              
507             =cut
508              
509             package IMAP::BodyStructure::Envelope;
510              
511             sub _get_nstring(\$); # proto
512              
513             *_get_nstring = \&IMAP::BodyStructure::_get_nstring;
514              
515             sub _get_naddrlist(\$);
516             sub _get_naddress(\$);
517              
518 1     1   7 use vars qw/@envelope_addrs/;
  1         2  
  1         68  
519             @envelope_addrs = qw/from sender reply_to to cc bcc/;
520              
521             =head2 METHODS
522              
523             =over 4
524              
525             =item new($)
526              
527             The constructor create Envelope object from string which should be an
528             IMAP server respone to a fetch with ENVELOPE item or a substring of
529             BODYSTRUCTURE response for a message with message/rfc822 parts inside.
530              
531             =back
532              
533             =head2 DATA MEMBERS
534              
535             =over 4
536              
537             =item date
538              
539             Date of the message as specified in the envelope. Not the IMAP
540             INTERNALDATE, be careful!
541              
542             =item subject
543              
544             Subject of the message, may be RFC2047 encoded, of course.
545              
546             =item message_id
547              
548             =item in_reply_to
549              
550             Message-IDs of the current message and the message in reply to which
551             this one was composed.
552              
553             =item to, from, cc, bcc, sender, reply_to
554              
555             These are the so called address-lists or just arrays of addresses.
556             Remember, a message may be addressed to lots of people.
557              
558             Each address is a hash of four elements:
559              
560             =over 4
561              
562             =item name
563              
564             The informal part, "A.U.Thor" from "A.U.Thor, Ea.u.thor@somewhere.comE"
565              
566             =item sroute
567              
568             Source-routing information, not used. (By the way, IMAP4r1 spec was
569             born after the last email address with sroute passed away.)
570              
571             =item account
572              
573             The part before @.
574              
575             =item host
576              
577             The part after @.
578              
579             =item full
580              
581             The full address for display purposes.
582              
583             =back
584              
585             =back
586              
587             =cut
588              
589 1     1   9 use fields qw/from sender reply_to to cc bcc date subject in_reply_to message_id/;
  1         1  
  1         5  
590              
591             sub new(\$) {
592 7     7   17 my $class = shift;
593 7         10 my $str = shift;
594            
595 7 50       41 $$str =~ m/\G\s*(?:\(ENVELOPE)?\s*\(/gc
596             or return 0;
597              
598 7         23 my __PACKAGE__ $self = fields::new($class);
599              
600 7         783 $self->{'date'} = _get_nstring($$str);
601 7         19 $self->{'subject'} = _get_nstring($$str);
602              
603 7         21 foreach my $header (@envelope_addrs) {
604 42         87 $self->{$header} = _get_naddrlist($$str);
605             }
606              
607 7         70 $self->{'in_reply_to'} = _get_nstring($$str);
608 7         18 $self->{'message_id'} = _get_nstring($$str);
609              
610 7         21 $$str =~ m/\G\s*\)/gc;
611              
612 7         68 return $self;
613             }
614              
615             sub _get_naddress(\$) {
616 32     32   39 my $str = shift;
617              
618 32 50       121 if ($$str =~ /\GNIL/gc) {
    100          
619 0         0 return undef;
620             } elsif ($$str =~ m/\G\s*\(/gc) {
621 16         30 my %addr = (
622             name => _get_nstring($$str),
623             sroute => _get_nstring($$str),
624             account => _get_nstring($$str),
625             host => _get_nstring($$str),
626             );
627 16 50 100     102 $addr{address} = ($addr{account}
628             ? "$addr{account}@" . ($addr{host} || '')
629             : '');
630              
631 16         39 $addr{full} = _format_address($addr{name}, $addr{address});
632              
633 16         44 $$str =~ m/\G\s*\)/gc;
634 16         52 return \%addr;
635             }
636 16         39 return 0;
637             }
638              
639             sub _get_naddrlist(\$) {
640 42     42   46 my $str = shift;
641            
642 42         83 $$str =~ /\G\s+/gc;
643              
644 42 100       128 if ($$str =~ /\GNIL/gc) {
    50          
645 26         77 return undef;
646             } elsif ($$str =~ m/\G\s*\(/gc) {
647 16         27 my @addrs = ();
648 16         36 while (my $addr = _get_naddress($$str)) {
649 16         42 push @addrs, $addr;
650             }
651              
652 16         31 $$str =~ m/\G\s*\)/gc;
653 16         51 return \@addrs;
654             }
655 0         0 return 0;
656             }
657              
658             my $rfc2822_atext = q(a-zA-Z0-9!#$%&'*+/=?^_`{|}~-); # simple non-interpolating string (think apostrophs)
659             my $rfc2822_atom = qr/[$rfc2822_atext]+/; # straight from rfc2822
660              
661 1     1   647 use constant EMPTY_STR => q{};
  1         2  
  1         220  
662             sub _format_address {
663 16     16   25 my ($phrase, $email) = @_;
664              
665 16 100 66     70 if (defined $phrase && $phrase ne EMPTY_STR) {
666 13 50       36 if ($phrase !~ /^ \s* " [^"]+ " \s* \z/xms) {
667             # $phrase is not already quoted
668              
669 13         23 $phrase =~ s/ (["\\]) /\\$1/xmsg;
670              
671 13 100       186 if ($phrase !~ m/^ \s* $rfc2822_atom (?: \s+ $rfc2822_atom)* \s* \z/xms) {
672 3         8 $phrase = qq{"$phrase"};
673             }
674             }
675              
676 13 50       57 return $email ? "$phrase <$email>" : $phrase;
677             } else {
678 3   50     11 return $email || '';
679             }
680             }
681              
682             1;
683              
684             __END__