File Coverage

blib/lib/MIME/Structure.pm
Criterion Covered Total %
statement 170 191 89.0
branch 48 88 54.5
condition 10 16 62.5
subroutine 24 28 85.7
pod 10 15 66.6
total 262 338 77.5


line stmt bran cond sub pod time code
1             package MIME::Structure;
2              
3 5     5   6392 use strict;
  5         12  
  5         233  
4              
5 5     5   32 use vars qw($VERSION);
  5         10  
  5         300  
6              
7             $VERSION = '0.07';
8              
9 5     5   6589 use Text::Balanced qw(extract_delimited);
  5         125272  
  5         519  
10              
11 5     5   53 use constant IN_HEADER => 1;
  5         10  
  5         353  
12 5     5   25 use constant IN_BODY => 2;
  5         9  
  5         192  
13 5     5   25 use constant IN_PREAMBLE => 3;
  5         9  
  5         834  
14 5     5   28 use constant IN_EPILOGUE => 4;
  5         10  
  5         201  
15              
16 5     5   27 use constant PRINT_NONE => 0;
  5         8  
  5         194  
17 5     5   24 use constant PRINT_HEADER => 1;
  5         10  
  5         195  
18 5     5   24 use constant PRINT_PREAMBLE => 2;
  5         8  
  5         195  
19 5     5   23 use constant PRINT_BODY => 4;
  5         7  
  5         287  
20 5     5   23 use constant PRINT_EPILOGUE => 8;
  5         9  
  5         12463  
21              
22             # --- Constructor, accessors, initializer
23              
24             sub new {
25 5     5 1 2833 my $cls = shift;
26 5         39 my $self = bless {
27             'keep_header' => 0,
28             'keep_fields' => 1,
29             'print_header' => 0,
30             'print_preamble' => 0,
31             'print_body' => 0,
32             'print_epilogue' => 0,
33             @_,
34             }, $cls;
35 5         19 $self->init;
36             }
37              
38 15 50   15 1 149 sub keep_header { @_ > 1 ? $_[0]->{'keep_header'} = $_[1] : $_[0]->{'keep_header'} }
39 15 50   15 1 54 sub keep_fields { @_ > 1 ? $_[0]->{'keep_fields'} = $_[1] : $_[0]->{'keep_fields'} }
40 15 50   15 1 48 sub print { @_ > 1 ? $_[0]->{'print'} = $_[1] : $_[0]->{'print'} }
41 0 0   0 1 0 sub print_header { @_ > 1 ? $_[0]->{'print_header'} = $_[1] : $_[0]->{'print_header'} }
42 0 0   0 1 0 sub print_body { @_ > 1 ? $_[0]->{'print_body'} = $_[1] : $_[0]->{'print_body'} }
43 0 0   0 1 0 sub print_preamble { @_ > 1 ? $_[0]->{'print_preamble'} = $_[1] : $_[0]->{'print_preamble'} }
44 0 0   0 1 0 sub print_epilogue { @_ > 1 ? $_[0]->{'print_epilogue'} = $_[1] : $_[0]->{'print_epilogue'} }
45              
46             sub init {
47 5     5 0 9 my ($self) = @_;
48 5         29 my $print_spec = $self->{'print'};
49 5         10 my $print;
50 5 50       21 if (!defined $print_spec) {
    0          
51 5         10 $print = PRINT_NONE;
52             }
53             elsif ($print_spec =~ /^\d+$/) {
54 0         0 $print = $print_spec;
55             }
56             else {
57 0 0       0 if ($print_spec =~ /header/i) {
58 0         0 $print |= PRINT_HEADER;
59             }
60 0 0       0 if ($print_spec =~ /body/i) {
61 0         0 $print |= PRINT_BODY;
62             }
63 0 0       0 if ($print_spec =~ /preamble/i) {
64 0         0 $print |= PRINT_PREAMBLE;
65             }
66 0 0       0 if ($print_spec =~ /epilogue/i) {
67 0         0 $print |= PRINT_EPILOGUE;
68             }
69             }
70 5 50       28 if ($self->{'print_header'}) {
71 0         0 $print |= PRINT_HEADER;
72             }
73 5 50       16 if ($self->{'print_body'}) {
74 0         0 $print |= PRINT_BODY;
75             }
76 5 50       31 if ($self->{'print_preamble'}) {
77 0         0 $print |= PRINT_PREAMBLE;
78             }
79 5 50       20 if ($self->{'print_epilogue'}) {
80 0         0 $print |= PRINT_EPILOGUE;
81             }
82 5         10 $self->{'print'} = $print;
83 5         24 $self;
84             }
85              
86             # --- Parsing
87              
88             sub parse {
89 15     15 1 79108 my ($self, $fh) = @_;
90 15         36 my ($ofs, $line) = (0, 1);
91 15         81 my $message = {
92             'kind' => 'message',
93             'offset' => $ofs,
94             'line' => $line,
95             'number' => '1',
96             };
97 15         38 my @context = ($message);
98 15         22 my @entities;
99             my @boundaries;
100              
101             # --- Parsing options
102 15         51 my $keep_header = $self->keep_header;
103 15         48 my $keep_fields = $self->keep_fields;
104 15         45 my $print = $self->print;
105            
106 15         25 my $state = IN_HEADER;
107 15         24 my $header = '';
108 15         239 while (<$fh>) {
109 350         407 my $len = length $_;
110 350         344 $ofs += $len;
111 350         337 $line++;
112 350 100 66     1393 if ($state == IN_HEADER) {
    100 66        
    100          
    50          
113 245         293 $header .= $_;
114 245 100       836 if (/^$/) {
115             # --- Parse the header that has just ended
116 50 50       113 print $header if $print & PRINT_HEADER;
117 50         122 my $fields = $self->parse_header($header);
118             # @context is (..., $parent, $entity)
119             # or ($parent, $entity) if in header of a part of message
120             # or ($entity) if in message header itself
121 50         177 my $entity = $context[-1];
122 50         54 my $parent;
123 50         106 my $level = $entity->{'level'} = @context - 1;
124 50 100       178 if (@context > 1) {
125             # Current entity is $context[-1]
126 35         62 $parent = $entity->{'parent'} = $context[-2];
127             }
128 50 100       53 my ($content_type) = @{ $fields->{'content-type'} || [] };
  50         157  
129 50 100       101 if (!defined $content_type) {
130 5 50 33     23 if ($parent && "$parent->{'type'}/$parent->{'subtype'}" eq 'multipart/digest') {
131 0         0 $content_type = 'message/rfc822'
132             }
133             else {
134 5         10 $content_type = 'text/plain; charset=us-ascii';
135             }
136             }
137 50         101 my ($type, $subtype, $type_params) = parse_content_type($content_type);
138 50         126 $entity->{'type'} = $type;
139 50         112 $entity->{'subtype'} = $subtype;
140 50         90 $entity->{'type_params'} = $type_params;
141 50 50       110 $entity->{'header'} = $header if $keep_header;
142 50 50       167 $entity->{'fields'} = $fields if $keep_fields;
143 50         79 $entity->{'body_offset'} = $ofs;
144 50         63 $header = '';
145 50   100     54 ($entity->{'encoding'}) = map lc, @{ $fields->{'content-transfer-encoding'} ||= ['7bit'] };
  50         381  
146 50 100       122 if ($type eq 'multipart') {
147             # --- Header is for a multipart entity
148 15         20 $state = IN_PREAMBLE;
149 15         28 my $boundary = $type_params->{'boundary'};
150 15 50       42 die "No boundary specified for multipart entity with header at $ofs"
151             unless defined $boundary;
152 15         24 push @boundaries, $boundary;
153 15         28 $entity->{'parts'} = [];
154 15         40 $entity->{'parts_boundary'} = $boundary;
155             }
156             else {
157             # --- Header is for a leaf entity
158 35         39 $state = IN_BODY;
159 35         44 pop @context; # The entity whose header we just finished reading
160 35 100 66     141 if ($level == 0 && !($print & PRINT_BODY)) {
161             # Minor optimization: message is not multipart, so we
162             # can stop if we're not going to be printing the body
163 5         8 push @entities, $entity;
164 5         20 while (<$fh>) { $ofs += length };
  5         38  
165 5         15 last;
166             }
167             }
168 45         207 push @entities, $entity;
169             }
170             }
171             elsif (@boundaries && /^--(.+?)(--)?$/ && $1 eq $boundaries[-1]) {
172 50 50       100 print if $print != PRINT_NONE;
173 50 100       93 if (defined $2) {
174             # End of parent's parts
175 15         18 pop @boundaries;
176 15         21 pop @context;
177 15         122 $state = IN_EPILOGUE;
178             }
179             else {
180             # Another part begins
181 35         40 $state = IN_HEADER;
182 35         111 my $part = {
183             'kind' => 'part',
184             'offset' => $ofs,
185             'line' => $line,
186             };
187 35         44 my $parent = $context[-1];
188 35         53 my $parent_parts = $parent->{'parts'};
189 35         52 push @$parent_parts, $part;
190 35         49 $part->{'parent'} = $parent;
191 35         102 $part->{'number'} = $parent->{'number'} . '.' . scalar @$parent_parts;
192 35         141 push @context, $part;
193             }
194             }
195             elsif ($state == IN_PREAMBLE) {
196             # A line within the preamble: ignore per RFC 2049
197 15 50       68 print if $print & PRINT_PREAMBLE;
198             }
199             elsif ($state == IN_EPILOGUE) {
200             # A line within the epilogue: ignore per RFC 2049
201 0 0       0 print if $print & PRINT_EPILOGUE;
202             }
203             else {
204             # Normal body line
205 40 50       187 print if $print & PRINT_BODY;
206             }
207             }
208             # We're all done reading
209 15 50       55 if (@context) {
210 0         0 die "Unfinished parts!";
211             }
212 15         50 $message->{'content_length'} = $ofs - $message->{'body_offset'};
213 15         30 $message->{'length'} = $ofs;
214            
215 15 100       111 return wantarray ? @entities : $message;
216             }
217              
218             # --- Reporting
219              
220             sub concise_structure {
221 3     3 1 6 my ($self, $message) = @_;
222             # (text/plain:0)
223             # (multipart/mixed:0 (text/plain:681) (image/gif:774))
224 3         4 my $visitor;
225             $visitor = sub {
226 10     10   14 my ($entity) = @_;
227 10         16 my $type = $entity->{'type'};
228 10         16 my $subtype = $entity->{'subtype'};
229 10         12 my $number = $entity->{'number'};
230 10         12 my $ofs = $entity->{'offset'};
231 10 100       26 if ($type eq 'multipart') {
232 3         11 my $str = "($number $type/$subtype:$ofs";
233 3         5 $str .= ' ' . $visitor->($_) for @{ $entity->{'parts'} };
  3         25  
234 3         24 return $str . ')';
235             }
236             else {
237 7         38 return "($number $type/$subtype:$ofs)";
238             }
239 3         15 };
240 3         7 $visitor->($message);
241             }
242              
243             # --- Utility functions
244              
245             sub parse_header {
246 50     50 0 87 my ($self, $str) = @_;
247             #my $str = $$hdrref;
248 50         170 $str =~ s/\n(?=[ \t])//g;
249 50         58 my @fields;
250 50         194 while ($str =~ /(.+)/g) {
251 195         1294 push @fields, [split /:\s+/, $1, 2];
252             }
253 50         117 return fields2hash(\@fields);
254             }
255              
256             sub fields2hash {
257 50     50 0 61 my ($F) = @_;
258 50         56 my %F;
259 50         100 foreach (@$F) {
260 195         291 my ($name, $value) = @$_;
261 195   50     176 push @{ $F{lc $name} ||= [] }, $value;
  195         2742  
262             }
263 50         412 return \%F;
264             }
265              
266             sub parse_content_type {
267 50     50 0 60 my ($str) = @_;
268 50         308 my ($type, $subtype, $params_str) = split m{/|;\s*}, $str, 3;
269 50         129 return (lc $type, lc $subtype, parse_params($params_str));
270             }
271              
272             sub parse_params {
273 50     50 0 61 my ($str) = @_;
274 50 100       106 $str = '' unless defined $str;
275 50         46 my %param;
276 50         208 while ($str =~ s/^([^\s=]+)=//) {
277 20         48 my $name = lc $1;
278 20 100       93 if ($str =~ /^"/) {
    50          
279 10         50 my $value = extract_delimited($str, q{"}, '');
280 10         840 $value =~ s/^"|"$//g;
281 10         48 $value =~ s/\\(.)|([^\\"]+)|(.)/$+/g;
282 10         27 $param{$name} = $value;
283             #
284             }
285             elsif ($str =~ s/^([^\s()<>@,;:\\"\/\[\]?=]+)//) {
286 10         32 $param{$name} = $1;
287             }
288             else {
289 0         0 die "Bad params: $str";
290             }
291 20 50       125 die "Bad params: $str" unless $str =~ s/^(\s*;\s*|\s*$)//;
292             }
293 50         189 return \%param;
294             }
295              
296              
297             1;
298              
299             =pod
300              
301             =head1 NAME
302              
303             MIME::Structure - determine structure of MIME messages
304              
305             =head1 SYNOPSIS
306              
307             use MIME::Structure;
308             $parser = MIME::Structure->new;
309             $message = $parser->parse($filehandle);
310             print $message->{'header'};
311             $parts = $message->{'parts'};
312             foreach ($parts) {
313             $offset = $_->{'offset'};
314             $type = $_->{'type'};
315             $subtype = $_->{'subtype'};
316             $line = $_->{'line'};
317             $header = $_->{'header'};
318             }
319             print $parser->concise_structure($message), "\n";
320              
321             =cut
322              
323             =head1 METHODS
324              
325             =over 4
326              
327             =item B
328              
329             $parser = MIME::Structure->new;
330              
331             =item B
332              
333             $message = $parser->parse($filehandle);
334             ($message, @other_entities) = $parser->parse($filehandle);
335              
336             Parses the message found in the given filehandle.
337              
338             A MIME message takes the form of a non-empty tree, each of whose nodes is
339             termed an I (see RFCs 2045-2049). The root entity is the message
340             itself; the children of a multipart message are the parts it contains. (A
341             non-multipart message has no children.)
342              
343             When called in list context, the B method returns a list of references
344             to hashes; each hash contains information about a single entity in the message.
345              
346             The first hash represents the message itself; if it is a multipart message,
347             subsequent entities are its parts and subparts B
348             occur in the message> -- in other words, in pre-order. If called in scalar
349             context, only a reference to the hash containing information about the message
350             itself is returned.
351              
352             The following elements may appear in these hashes:
353              
354             =over 4
355              
356             =item B
357              
358             The offset, in bytes, of the entity's body.
359              
360             =item B
361              
362             The length, in bytes, of the entity's body. Currently only set for the message
363             itself.
364              
365             =item B
366              
367             The value of the entity's Content-Transfer-Encoding field.
368              
369             =item B
370              
371             If the B option is set, this will be a reference to a hash
372             whose keys are the names (converted to lower case) are the names of all fields
373             present in the entity;s header and whose values xxx.
374              
375             =item B
376              
377             The entity's full header as it appeared in the message, not including the final
378             blank line. This will be presently only if the B option is set.
379              
380             =item B
381              
382             C if the entity is the message, or C if it is a part within a message
383             (or within another part).
384              
385             =item B
386              
387             The length, in bytes, of the entire entity, including its header and body.
388             Currently only set for the message itself.
389              
390             =item B
391              
392             The level at which the entity is found. The message itself is at level 0, its
393             parts (if any) are at level 1, their parts are at level 2, and so on.
394              
395             =item B
396              
397             The line number (1-based) of the first line of the message's header. The message itself always, by definition,
398             is at line 1.
399              
400             =item B
401              
402             A dotted-decimal notation that indicates the entity's place within the message.
403             The root entity (the message itself) has number 1; its parts (if it has any any)
404             are numbered 1.1, 1.2, 1.3, etc., and the numbers of their parts in turn (if
405             they have any) are constructed in like manner.
406              
407             =item B
408              
409             The offset B of the first line of the entity's header, measured from
410             the first line of the message's header. The message itself always, by definition,
411             is at offset 0.
412              
413             =item B
414              
415             A reference to the hash representing the entity's parent. If the entity is
416             the message itself, this is undefined.
417              
418             =item B
419              
420             A reference to an array of the entity's parts. This will be present only if
421             the entity is of type B.
422              
423             =item B
424              
425             The string used as a boundary to delimit the entity's parts. Present only in
426             multipart entities.
427              
428             =item B
429              
430             The MIME media subtype of the entity's content, e.g., C or C.
431              
432             =item B
433              
434             The MIME media type of the entity's content, e.g., C or C.
435              
436             =item B
437              
438             A reference to a hash containing the attributes (if any) found in the
439             Content-Type: header field. For example, given the following Content-Type header:
440              
441             Content-Type: text/html; charset=UTF-8
442              
443             The entity's B element will be this:
444              
445             $entity{'type_params'} = {
446             'charset' => 'UTF-8',
447             }
448              
449             =back
450              
451             Besides parsing the message, this method may also be used to print the message,
452             or portions thereof, as it parses; the B method (q.v.) may be used to
453             specify what to print.
454              
455             =item B
456              
457             $keep_header = $parser->keep_header;
458             $parser->keep_header(1);
459              
460             Set (or get) whether headers should be remembered during parsing.
461              
462             =item B
463              
464             Set (or get) whether fields (normalized headers) should be remembered.
465              
466             =item B
467              
468             $print = $parser->print;
469             $parser->print($MIME::Structure::PRINT_HEADER | $MIME::Structure::PRINT_BODY);
470             $parser->print('header,body');
471              
472             Set (or get) what should be printed. This may be specified either as any of the
473             following symbolic constants, ORed together:
474              
475             =over 4
476              
477             =item B
478              
479             =item B
480              
481             =item B
482              
483             =item B
484              
485             =item B
486              
487             =back
488              
489             Or using the following string constants concatenated using any delimiter:
490              
491             =over 4
492              
493             =item B
494              
495             =item B
496              
497             =item B
498              
499             =item B
500              
501             =item B
502              
503             =back
504              
505             =item B
506              
507             $print_header = $parser->print_header;
508             $parser->print_header(1);
509              
510             Set (or get) whether headers should be printed.
511              
512             =item B
513              
514             $print_body = $parser->print_body;
515             $parser->print_body(1);
516              
517             Set (or get) whether bodies should be printed.
518              
519             =item B
520              
521             $print_preamble = $parser->print_preamble;
522             $parser->print_preamble(1);
523              
524             Set (or get) whether preambles should be printed.
525              
526             =item B
527              
528             $print_epilogue = $parser->print_epilogue;
529             $parser->print_epilogue(1);
530              
531             Set (or get) whether epilogues should be printed.
532              
533             =item B
534              
535             $parser->parse;
536             print "$_->{type}/$_->{subtype} $_->{offset}\n"
537             for @{ $parser->entities };
538              
539             Returns a reference to an array of all the entities in a message, in the order
540             in which they occur in the message. Thus the first entity is always the root
541             entity, i.e., the message itself).
542              
543             =item B
544              
545             $parser->parse;
546             print $parser->concise_structure;
547             # e.g., '(multipart/alternative:0 (text/html:291) (text/plain:9044))'
548              
549             Returns a string showing the structure of a message, including the content
550             type and offset of each entity (i.e., the message and [if it's multipart] all
551             of its parts, recursively). Each entity is printed in the form:
552              
553             "(" content-type ":" byte-offset [ " " parts... ")"
554              
555             Offsets are B offsets of the entity's header from the beginning of the
556             message. (If B was called with an I parameter, this is added
557             to the offset of the entity's header.)
558              
559             N.B.: The first offset is always 0.
560              
561             =back
562              
563             =head1 BUGS
564              
565             Documentation is sketchy.
566              
567             =head1 AUTHOR
568              
569             Paul Hoffman Enkuitse (at) cpan (dot) orgE
570              
571             =head1 COPYRIGHT
572              
573             Copyright 2008 Paul M. Hoffman. All rights reserved.
574              
575             This program is free software; you can redistribute it
576             and modify it under the same terms as Perl itself.
577              
578             =cut
579              
580