File Coverage

blib/lib/Mail/Field/Received.pm
Criterion Covered Total %
statement 168 280 60.0
branch 82 192 42.7
condition 57 87 65.5
subroutine 12 12 100.0
pod 7 7 100.0
total 326 578 56.4


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2             #
3             # Mail::Field::Received --
4             # mostly RFC822-compliant parser of Received headers
5             #
6             # Copyright (c) 2000 Adam Spiers . All rights
7             # reserved. This program is free software; you can redistribute it and/or
8             # modify it under the same terms as Perl itself.
9             #
10             # $Id: Received.pm,v 1.28 2003/03/17 23:45:17 adams Exp $
11             #
12              
13             require 5.005;
14              
15             package Mail::Field::Received;
16              
17 1     1   40316 use strict;
  1         1  
  1         35  
18              
19 1     1   3021 use Mail::Field ();
  1         5044  
  1         23  
20 1     1   9 use Carp;
  1         8  
  1         69  
21              
22 1     1   5 use vars qw($VERSION @ISA @EXPORT_OK);
  1         2  
  1         226  
23             @ISA = qw(Exporter Mail::Field Mail::Field::Generic);
24             @EXPORT_OK = qw(%RC &diagnose);
25              
26             $VERSION = '0.26';
27              
28             =head1 NAME
29              
30             Mail::Field::Received -- mostly RFC822-compliant parser of Received headers
31              
32             =head1 SYNOPSIS
33              
34             use Mail::Field;
35              
36             my $received = Mail::Field->new('Received', $header);
37             my $results = $received->parse_tree();
38             my $parsed_ok = $received->parsed_ok();
39             my $diagnostics = $received->diagnostics();
40              
41             =head1 DESCRIPTION
42              
43             I Instead ask Mail::Field for new
44             instances based on the field name!
45              
46             Mail::Field::Received provides subroutines for parsing Received
47             headers from e-mails. It mostly complies with RFC822, but deviates to
48             accomodate a number of broken MTAs which are in common use. It also
49             attempts to extract useful information which MTAs often embed within
50             the C<(comments)>.
51              
52             It is a subclass derived from the Mail::Field and Mail::Field::Generic
53             classes.
54              
55             =head1 ROUTINES
56              
57             =over 4
58              
59             =cut
60              
61             INIT: {
62             bless([])->register('Received');
63             }
64              
65             ##
66              
67             =item * B
68              
69             Returns current debugging level obtained via the C method.
70             If a parameter is given, the debugging level is changed. The default
71             level is 3.
72              
73             =cut
74              
75             my $debug = 3;
76              
77             sub debug {
78 2     2 1 1780 my $self = shift;
79 2 100       37 if (@_) {
80 1         3 $debug = shift;
81             }
82 2         8 return $debug;
83             }
84              
85             ##
86              
87             =item * B
88              
89             $received->diagnose("foo", "\n");
90              
91             Appends stuff to the parser's diagnostics buffer.
92              
93             =cut
94              
95             sub diagnose {
96 70     70 1 107 my $self = shift;
97 70         136 my (@msgs) = @_;
98 70         231 $self->{Diags} .= join '', @msgs;
99             }
100              
101             =item * B
102              
103             my $diagnostics = $received->diagnostics();
104              
105             Returns the contents of the parser's diagnostics buffer.
106              
107             =cut
108              
109             sub diagnostics {
110 1     1 1 2 my $self = shift;
111 1   50     17 return $self->{Diags} || '';
112             }
113              
114             ##
115              
116             # Here be all the roughly (!) RFC822-compliant regexps. They
117             # sometimes deviate from RFC822 to allow for many common MTAs which
118             # don't comply either.
119             #
120             # N.B. we need lots of butt-ugly extra ()s to avoid a nasty bug with
121             # (?-x:) in many recent Perls (fixed by 5.005_63 it seems, maybe earlier).
122              
123 1     1   4 use vars qw(%RC);
  1         2  
  1         5663  
124             %RC = ();
125              
126             # Atoms consist of all CHARs except SPACE, CTLs, and SPECIALs.
127             $RC{atom} = qr/(?:[\041\043-\047\052\053\055-\071\075\077\101-\132\136-\176]+)/;
128              
129             $RC{ctext} = qr/[\000-\014\016-\047\052-\133\135-\177]/;
130             $RC{dtext} = qr/[\000-\014\016-\132\136-\177]/;
131             $RC{quoted_pair} = qr/(?:\\[\000-\177])/;
132             $RC{qtext} = qr/[\000-\014\016-\041\043-\133\135-\177]/;
133             $RC{quoted_str} = qr/(?:"(?:$RC{qtext}|$RC{quoted_pair})*")/;
134              
135             # Comments can be arbitrarily nested but I can't be bothered to
136             # support that here; it's too much effort and no-one will nest more than
137             # once ... I hope!
138             $RC{comment_base}= qr/(\((?:$RC{ctext}|$RC{quoted_pair})*\))/;
139             $RC{comment} = qr/(\((?:$RC{ctext}|$RC{quoted_pair}|$RC{comment_base})*\))/;
140              
141             $RC{word} = qr/(?:$RC{atom}|$RC{quoted_str})/;
142             $RC{words} = qr/($RC{atom}(\s+$RC{atom})*|$RC{quoted_str})/;
143              
144             # ' 1' isn't 2DIGIT according to RFC822 but some MTAs use it anyway
145             $RC{TWO_DIGIT} = qr/((?:\d|(?<= )| )\d)/;
146            
147             # This could be improved upon. I left the common triples in, even
148             # though [A-Z]{3} makes them redundant.
149             $RC{zone_name} = qr/(UT|GMT|[CEMPW][DES]T|[A-Z]|[A-Z]{3})/;
150              
151             $RC{zone} = qr/(
152             ([+-]?[01]\d(?:00|15|30|45))(?:
153             )(?:\s(?:$RC{zone_name}|\($RC{zone_name}\)))?
154             |
155             (?:$RC{zone_name})(?:
156             ))/x;
157             $RC{hms} = qr/($RC{TWO_DIGIT}:(\d\d)(?::(\d\d))?)/;
158             # Note: case-insensitivity is not RFC-compliant here, but some MTAs
159             # write days/months in all lower case.
160             $RC{month} = qr/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/i;
161             $RC{week_day} = qr/(Mon|Tue|Wed|Thu|Fri|Sat|Sun)/i;
162             $RC{year} = qr/((?:19|20)?\d{2}|100)/; # god-DAMN the incompetence!
163             $RC{year_day1} = qr/(?:$RC{TWO_DIGIT}\s$RC{month})/;
164             $RC{year_day2} = qr/(?:$RC{month}\s$RC{TWO_DIGIT})/;
165             $RC{day_of_year} = qr/(?:$RC{year_day1}|$RC{year_day2})/;
166             $RC{date_time1} = qr/(?:$RC{hms}\s+$RC{year}\s+(?:$RC{zone})?)/;
167             $RC{date_time2} = qr/(?:$RC{hms}\s+$RC{zone}\s+$RC{year})/;
168             $RC{date_time3} = qr/(?:$RC{year}\s+$RC{hms}\s+(?:$RC{zone})?)/;
169             $RC{date_time} = qr/(
170             (?: $RC{week_day} ,? \s* )?
171             ($RC{day_of_year}) \s+
172             ($RC{date_time1}|$RC{date_time2}|$RC{date_time3})
173             )/x;
174              
175             $RC{ipv4_addr} = qr/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/;
176             # check valid with inet_aton()
177              
178             $RC{domain_lit} = qr/(?:\[(?:$RC{dtext}|$RC{quoted_pair})*\])/;
179             $RC{sub_domain} = qr/(?:$RC{atom}|$RC{domain_lit})/;
180             $RC{domain} = qr/(?:$RC{sub_domain}(?:\.$RC{sub_domain})*)/;
181             $RC{local_part} = qr/(?:$RC{word}(?:\.$RC{word})*)/;
182              
183             # This is the RFC822 addr-spec ...
184             $RC{addr_spec} = qr/($RC{local_part})\@($RC{domain})/;
185              
186             # ... but many MTAs are non-compliant:
187             $RC{addr_spec2} = qr/($RC{local_part})(?:\@($RC{domain}))?/;
188             $RC{addr_spec3} = qr/$RC{addr_spec2}|($RC{domain})/;
189             $RC{addr_spec4} = qr/((?:$RC{words}\s+)?<$RC{addr_spec3}>|$RC{addr_spec3})
190             (?:,\s?\.\.\.)?/x;
191             $RC{addr_spec5} = qr/(?:(?:($RC{local_part})\@)?($RC{domain}))/;
192              
193             # RFC822 dictates that msg-id is "<" addr-spec ">" but in practice
194             # many MTAs do not adhere to this for the "id" part of Received headers.
195             $RC{msg_id} = qr/(<$RC{addr_spec2}>|\#?[\w\.-]+)/;
196              
197             $RC{from1} = qr/((?i:from) \s+ (<$RC{addr_spec}>))/x;
198             $RC{from2} = qr/((?i:from) \s+ ($RC{addr_spec5})?)/x;
199             $RC{by} = qr/((?i:by) \s+ ($RC{domain}))/x;
200             $RC{via} = qr/((?i:via) \s+ ($RC{atom}))/x;
201             $RC{with} = qr/((?i:with) \s ($RC{atom})?)/x; # sometimes empty atom
202             $RC{id} = qr/((?i:id) \s+ $RC{msg_id}(?::(\d+))?)/x;
203             $RC{for} = qr/((?i:for) \s+ $RC{addr_spec4})/x;
204             $RC{sent_by} = qr/((?i:sent \s by) \s+ $RC{addr_spec4})/x;
205             $RC{convert} = qr/((?i:convert) \s+ ($RC{atom}))/x;
206              
207             ##
208              
209             sub set {
210 1     1 1 93 my $self = shift;
211 1         7 return $self;
212             }
213              
214             ##
215              
216             =item * B
217              
218             The actual parser. Returns the object (Mail::Field barfs otherwise).
219              
220             =cut
221              
222             sub parse {
223 6     6 1 1374 my ($self, $recv) = @_;
224              
225 6         12 $self->{Text} = $recv;
226 6         12 $self->{Diags} = '';
227              
228 6         17 my %parsed = (whole => $recv);
229              
230             # \234 sometimes crops up for some unknown reason. Huh?!
231 6         20 $recv =~ tr/\234//d;
232              
233             # From RFC822:
234             # received = "Received" ":" ; one per relay
235             # ["from" domain] ; sending host
236             # ["by" domain] ; receiving host
237             # ["via" atom] ; physical path
238             # *("with" atom) ; link/mail protocol
239             # ["id" msg-id] ; receiver msg id
240             # ["for" addr-spec] ; initial form
241             # ";" date-time ; time received
242             #
243             # Sadly many many MTAs are broken, however, so we have to deal with
244             # a lot of special cases. Improvements to this section are very welcome.
245              
246 6         15 my %expecting = map { $_ => 1 }
  54         97  
247             (qw/from by via with id convert for sent_by date_time/);
248              
249 6         17 for ($recv) {
250 6         10 my $last_section = '';
251              
252             TOKEN:
253 6         7 while (1) {
254 43 50       377 $self->diagnose("---- Expecting: ", (join ' ', sort keys %expecting),
255             "\n") if $debug >= 5;
256 43 50       126 $self->diagnose("---- Last section: $last_section\n")
257             if $debug >= 6;
258              
259 43 100       420 if (/\G$RC{comment}/cg) {
260 7         20 my $comment = $1;
261 7 50       37 $self->diagnose("Got comment $comment\n") if $debug >= 4;
262              
263 7 100       18 push @{$parsed{$last_section}{comments}}, $comment
  4         15  
264             if $last_section;
265 7         8 push @{$parsed{comments}}, $comment;
  7         19  
266              
267 7 100       20 if ($last_section eq 'from') {
268             FROMCOMMENT:
269             {
270 2 50       3 if ($comment =~ /\(
  2         177  
271             (?:(?:($RC{local_part})\@)?($RC{domain})\s+)?
272             (?:\[ $RC{ipv4_addr} \])(?:
273             )\)/x)
274             {
275 2 50       9 if ($1) {
276 0 0       0 $self->diagnose("Got `from' ident in comments: $1\n")
277             if $debug >= 3;
278 0         0 $parsed{from}{ident} = $1;
279             }
280              
281 2 50       9 if ($2) {
282 2 50       16 $self->diagnose("Got `from' domain in comments: $2\n")
283             if $debug >= 3;
284 2         9 $parsed{from}{domain} = $2;
285             }
286              
287 2 50       8 if ($3) {
288 2 50       13 $self->diagnose("Got `from' IP address in comments: $3\n")
289             if $debug >= 3;
290 2         6 $parsed{from}{address} = $3;
291             }
292              
293 2         5 last FROMCOMMENT;
294             }
295              
296 0 0       0 if ($comment =~ /(HELO|EHLO)(?:\s+|=)($RC{domain})/i) {
297             # HELO domain is in comments, not outside, so swap
298 0 0       0 $self->diagnose("Got `from' $1 domain in comments: $2\n")
299             if $debug >= 3;
300 0         0 @{$parsed{from}}{qw/domain HELO/}
  0         0  
301             = ($parsed{from}{HELO}, $2);
302             }
303              
304 0 0       0 if ($comment =~ /$RC{ipv4_addr}\]?(?::(\d{1,5}))?/) {
305 0 0       0 $self->diagnose("Got `from' IP address in comments: $1\n")
306             if $debug >= 3;
307              
308 0         0 $parsed{from}{address} = $1;
309              
310 0 0       0 if ($2) {
311 0         0 $parsed{from}{port} = $2;
312 0 0       0 $self->diagnose("Got `from' port in comments: $1\n")
313             if $debug >= 3;
314             }
315             }
316             }
317 2         10 $parsed{from}{whole} .= " $comment\n";
318             }
319              
320 7         18 next TOKEN;
321             }
322            
323 36 100       117 if (/\G(\s+)/cg) {
324 11 50       27 $self->diagnose("Got whitespace: <$1>\n") if $debug >= 7;
325 11         21 next TOKEN;
326             }
327            
328 25 50 66     328 if ($expecting{from} and /\G$RC{from1}/cg) {
329 0   0     0 print map { ($_ || '__undef__') . "\n---\n" } $1, $2, $3, $4, $5, $6;
  0         0  
330 0 0       0 $self->diagnose("Got from type1: $1\n") if $debug >= 2;
331 0         0 $last_section = 'from';
332            
333 0         0 $parsed{from}{whole} = $1;
334 0         0 $parsed{from}{from} = $2;
335 0 0       0 $parsed{from}{ident} = $3 if $3;
336 0         0 $parsed{from}{HELO} = $4;
337              
338 0         0 delete $expecting{from};
339 0         0 delete @expecting{grep /^after_/, keys %expecting};
340 0         0 $expecting{after_from}++;
341 0         0 next TOKEN;
342             }
343            
344 25 100 100     369 if ($expecting{from} and /\G$RC{from2}/cg) {
345 2 50       27 $self->diagnose("Got from type2: $1\n") if $debug >= 2;
346 2         4 $last_section = 'from';
347              
348 2         8 $parsed{from}{whole} = $1;
349 2         6 $parsed{from}{from} = $2;
350 2 50       9 $parsed{from}{ident} = $3 if $3;
351 2         6 $parsed{from}{HELO} = $4;
352              
353 2         6 delete $expecting{from};
354 2         11 delete @expecting{grep /^after_/, keys %expecting};
355 2         17 $expecting{after_from}++;
356 2         9 next TOKEN;
357             }
358            
359 23 50 66     111 if ($expecting{after_from} and /\G($RC{domain_lit})/cg) {
360 0 0       0 $self->diagnose("Got address from bad `from': $1\n") if $debug >= 3;
361 0         0 $parsed{from}{address} = $1;
362 0         0 delete $expecting{after_from};
363 0         0 next TOKEN;
364             }
365              
366 23 50 66     78 if ($expecting{after_from} and $parsed{from}{whole} eq 'from mail' and
      33        
367             /\G(pickup service)/cg) {
368 0 0       0 $self->diagnose("Got bad `from': appending: $1\n")
369             if $debug >= 3;
370 0         0 $parsed{from}{whole} .= $1;
371 0         0 delete $expecting{after_from};
372 0         0 next TOKEN;
373             }
374              
375             # Deal with incompetence from the fucking /imbeciles/ at M$.
376 23 50 66     69 if ($expecting{after_from} and $parsed{whole} =~ /Microsoft SMTPSVC/ and
      33        
377             /\G-\s+$RC{ipv4_addr}/cg) {
378 0 0       0 $self->diagnose("Got IP from bad M\$ from: $1\n") if $debug >= 3;
379 0         0 $parsed{from}{address} = $1;
380 0         0 delete $expecting{after_from};
381 0         0 next TOKEN;
382             }
383              
384 23 50 66     111 if ($expecting{after_from} and /\G, claiming to be ($RC{word})/cg) {
385 0 0       0 $self->diagnose("Got HELO: $1 from brain-dead MTA\n") if $debug >= 3;
386 0         0 $parsed{allow_parse_fail}++; # More brain-dead MTAs
387 0         0 $parsed{from}{HELO} = $1;
388 0         0 delete $expecting{after_from};
389 0         0 next TOKEN;
390             }
391              
392 23 100 100     183 if ($expecting{by} and /\G$RC{by},?/cg) {
393 2 50       15 $self->diagnose("Got by: $1\n") if $debug >= 2;
394 2         3 $last_section = 'by';
395              
396 2         9 $parsed{by}{whole} = $1;
397 2         8 $parsed{by}{domain} = $2;
398              
399 2         5 delete @expecting{qw/by/};
400 2         15 delete @expecting{grep /^after_/, keys %expecting};
401 2         5 $expecting{after_by}++;
402 2         9 next TOKEN;
403             }
404              
405 21 50 66     113 if ($expecting{after_by} and /\G($RC{domain_lit})/cg) {
406 0 0       0 $self->diagnose("Got address from bad `by': $1\n") if $debug >= 3;
407 0         0 $parsed{by}{address} = $1;
408 0         0 delete $expecting{after_by};
409 0         0 next TOKEN;
410             }
411              
412 21 50 66     61 if ($expecting{after_by} and /\G(Sendmail)/cg) {
413 0 0       0 $self->diagnose("Got MTA from bad `by': $1\n") if $debug >= 3;
414 0         0 $parsed{by}{MTA} = $1;
415              
416 0 0       0 if ($expecting{via}) {
417 0         0 $parsed{via}{via} = $1;
418             }
419            
420 0         0 delete $expecting{after_by};
421 0         0 next TOKEN;
422             }
423              
424 21 50 66     129 if ($expecting{via} and /\G$RC{via}/cg) {
425 0 0       0 $self->diagnose("Got via: $1\n") if $debug >= 2;
426 0         0 $last_section = 'via';
427              
428 0         0 $parsed{via}{whole} = $1;
429 0         0 $parsed{via}{via} = $2;
430              
431 0         0 delete $expecting{via};
432 0         0 delete @expecting{grep /^after_/, keys %expecting};
433 0         0 $expecting{after_via}++;
434 0         0 next TOKEN;
435             }
436              
437 21 50 33     57 if ($expecting{after_via} and /\G\[$RC{ipv4_addr}\]/cg) {
438 0 0       0 $self->diagnose("Got address from bad `via': $1\n") if $debug >= 3;
439 0         0 $parsed{via}{address} = $1;
440 0         0 delete $expecting{after_via};
441 0         0 next TOKEN;
442             }
443              
444 21 50 66     82 if (! $expecting{from} and /\Gfrom\s+stdin/cg) {
445 0 0       0 $self->diagnose("Got `from stdin'\n") if $debug >= 3;
446 0         0 $parsed{from}{stdin} = 'yep';
447 0         0 next TOKEN;
448             }
449              
450 21 50 66     77 if ($expecting{with} and
451             m!
452             \G((?i:with) \s
453             (P:(stdio|smtp)/R:(inet|bind)_hosts/T:(smtp|inet_zone_bind_smtp)))
454             !cgx) {
455 0 0       0 $self->diagnose("Got weird with: $1\n") if $debug >= 2;
456 0         0 $last_section = 'with';
457              
458 0         0 $parsed{with}{whole} = $1;
459 0         0 $parsed{with}{with} = $2;
460              
461 0         0 delete @expecting{grep /^after_/, keys %expecting};
462 0         0 $expecting{after_with}++;
463             # I've seen the `from' bit come after the `with' bit sometimes.
464             # Why oh why ...
465 0         0 $expecting{from}++;
466 0         0 next TOKEN;
467             }
468              
469 21 100 100     173 if ($expecting{with} and /\G$RC{with}/cg) {
470 2 50       79 $self->diagnose("Got with: $1\n") if $debug >= 2;
471 2         5 $last_section = 'with';
472              
473 2         27 $parsed{with}{whole} = $1;
474 2         6 $parsed{with}{with} = $2;
475 2 50       16 $parsed{with}{with} .= $3 if $3;
476              
477 2         16 delete @expecting{grep /^after_/, keys %expecting};
478 2         5 $expecting{after_with}++;
479             # I've seen the `from' bit come after the `with' bit sometimes.
480             # Why oh why ...
481 2         5 $expecting{from}++;
482 2         7 next TOKEN;
483             }
484              
485 19 100 66     63 if ($expecting{after_with} && $parsed{with}{with}) {
486              
487             # Microsoft SMTPSVC uses two atoms -- yet /another/ example of
488             # Microsoft not following standards ... *gasp*
489              
490 2 50       10 if ($parsed{with}{with} eq 'Microsoft') {
491 0 0       0 if (/\GSMTPSVC(?:\(([\d\.]+)\))?/cg) {
    0          
492 0 0       0 $self->diagnose("Got M\$ SMTPSVC version from bad `with'",
    0          
493             $1 ? ": $1" : '',
494             "\n")
495             if $debug >= 3;
496 0         0 delete $expecting{after_with};
497 0         0 next TOKEN;
498             }
499             elsif (/\GMAPI/cg) {
500 0 0       0 $self->diagnose("Got Microsoft MAPI from bad `with'\n")
501             if $debug >= 3;
502 0         0 delete $expecting{after_with};
503 0         0 next TOKEN;
504             }
505             }
506              
507             # More brain damage ...
508              
509 2 50 33     10 if ($parsed{with}{with} eq 'Internet' and
510             /\GMail Service\s*\(([\d\.]+)\)/cg) {
511 0 0       0 $self->diagnose("Got Internet Mail Service version from bad `with': $1\n")
512             if $debug >= 3;
513 0         0 delete $expecting{after_with};
514 0         0 next TOKEN;
515             }
516              
517 2 50 33     20 if ($parsed{with}{with} eq 'WorldClient' and
518             /\G($RC{domain_lit})/cg) {
519 0 0       0 $self->diagnose("Got WorldClient address from bad `with': $1\n")
520             if $debug >= 3;
521 0         0 delete $expecting{after_with};
522 0         0 next TOKEN;
523             }
524              
525 2 50 33     9 if ($parsed{with}{with} eq 'Local' and
526             /\GSMTP/cg) {
527 0 0       0 $self->diagnose("Got Local SMTP from bad `with'\n")
528             if $debug >= 3;
529 0         0 delete $expecting{after_with};
530 0         0 next TOKEN;
531             }
532              
533             }
534              
535 19 100 100     350 if ($expecting{id} and /\G$RC{id}/cg) {
536 2 50       17 $self->diagnose("Got id: $1\n") if $debug >= 2;
537 2         5 $last_section = 'id';
538            
539 2         8 $parsed{id}{whole} = $1;
540 2         6 $parsed{id}{id} = $2;
541 2 50       9 $parsed{id}{port} = $3 if $3;
542              
543 2         7 delete @expecting{qw/by via with/};
544 2         17 delete @expecting{grep /^after_/, keys %expecting};
545 2         9 next TOKEN;
546             }
547              
548 17 50 66     129 if ($expecting{convert} and /\G$RC{convert}/cg) {
549 0 0       0 $self->diagnose("Got convert: $1\n") if $debug >= 2;
550 0         0 $last_section = 'convert';
551            
552 0         0 $parsed{convert}{whole} = $1;
553            
554 0         0 delete @expecting{qw/from by via with convert/};
555 0         0 delete @expecting{grep /^after_/, keys %expecting};
556 0         0 next TOKEN;
557             }
558              
559 17 100 100     527 if ($expecting{for} and
560             /\G$RC{for}(\s+bugtraq\@securityfocus\.com)?/cgi) {
561 1 50       11 $self->diagnose("Got for: $1\n") if $debug >= 2;
562 1         2 $last_section = 'for';
563              
564 1         5 $parsed{for}{whole} = $1;
565 1         4 $parsed{for}{for} = $2;
566 1 50       6 $parsed{for}{bugtraq} = $3 if $3;
567              
568 1         5 delete @expecting{qw/from by convert for/};
569 1         5 delete @expecting{grep /^after_/, keys %expecting};
570 1         8 next TOKEN;
571             }
572              
573 16 50 66     472 if ($expecting{sent_by} and /\G$RC{sent_by}/cg) {
574 0 0       0 $self->diagnose("Got sent by: $1\n") if $debug >= 2;
575 0         0 $last_section = 'sent_by';
576              
577 0         0 $parsed{sent_by}{whole} = $1;
578 0         0 $parsed{sent_by}{sent_by} = $2;
579              
580 0         0 delete @expecting{qw/from by via with convert for sent_by/};
581 0         0 delete @expecting{grep /^after_/, keys %expecting};
582 0         0 next TOKEN;
583             }
584              
585 16 100 100     902 if ($expecting{date_time} and /\G((?:on\s+)?$RC{date_time})/cg) {
586 5 50       32 $self->diagnose("Got date_time: $1\n") if $debug >= 2;
587 5         8 $last_section = 'date_time';
588            
589             # Eugh. This is horrible. Maybe I should have used
590             # Parse::RecDescent after all ...
591              
592 5         11 @{$parsed{date_time}}{qw/whole date_time week_day day_of_year rest/}
  5         44  
593             = ($1, $2, $3, $4, $9);
594              
595 5 50       56 if (" $parsed{date_time}{day_of_year}" =~ $RC{year_day1}) {
    0          
596 5         8 @{$parsed{date_time}}{qw/month_day month/} = ($1, $2);
  5         23  
597             }
598             elsif (" $parsed{date_time}{day_of_year}" =~ $RC{year_day2}) {
599 0         0 @{$parsed{date_time}}{qw/month month_day/} = ($1, $2);
  0         0  
600             }
601             else {
602 0         0 $self->diagnose("Couldn't parse day_of_year: <$parsed{date_time}{day_of_year}>");
603 0         0 $parsed{parse_failed}++;
604             }
605            
606 5 100       179 if ($parsed{date_time}{rest} =~ $RC{date_time1}) {
    100          
    50          
607 1         4 @{$parsed{date_time}}{qw/hms hour minute second year/}
  1         9  
608             = ($1, $2, $3, $4, $5);
609 1 50       7 $parsed{date_time}{zone} = $6 if defined $6;
610             }
611             elsif ($parsed{date_time}{rest} =~ $RC{date_time2}) {
612 1         3 @{$parsed{date_time}}{qw/hms hour minute second zone year/}
  1         29  
613             = ($1, $2, $3, $4, $5, $10);
614             }
615             elsif ($parsed{date_time}{rest} =~ $RC{date_time3}) {
616 3         13 @{$parsed{date_time}}{qw/year hms hour minute second/}
  3         29  
617             = ($1, $2, $3, $4, $5);
618 3 50       20 $parsed{date_time}{zone} = $6 if defined $6;
619             }
620             else {
621 0         0 $self->diagnose("Couldn't parse rest of date_time: <$parsed{date_time}{rest}>");
622 0         0 $parsed{parse_failed}++;
623             }
624            
625 5         21 %expecting = (after_date_time => 1);
626 5         19 next TOKEN;
627             }
628              
629 11 50 66     59 if ($expecting{after_date_time} and /\G((mail.from|env.from).+)/cg) {
630 0 0       0 $self->diagnose("Got random crap after date: $1\n") if $debug >= 3;
631 0         0 $parsed{after_date_time} = $1;
632 0         0 next TOKEN;
633             }
634              
635             # Reluctantly allow semi-colons in random places
636 11 100       41 if (/\G(;\s+)/cg) {
637 5 50       11 $self->diagnose("Got semi-colon: <$1>\n") if $debug >= 7;
638 5         17 next TOKEN;
639             }
640              
641 6   100     88 my $old_pos = pos() || 0;
642 6         19 my @start = ($old_pos - 35, $old_pos);
643 6 100       17 $start[0] = 0 if $start[0] < 0;
644 6         10 my $length = $old_pos - $start[0];
645 6 100       19 if (/\G(.{1,35})/cg) {
646 1 50       12 $self->diagnose("** Ran out of things to match at position $old_pos:\n",
647             substr($_, $start[0], $length), "<<<\n",
648             ' ' x ($length - 3), ">>>$1\n\n")
649             if $debug >= 1;
650 1         3 $parsed{parse_failed}++;
651             }
652 6         26 last TOKEN;
653             }
654             }
655              
656 6         19 $self->{parse_tree} = \%parsed;
657              
658 6   66     45 my $failed = $parsed{parse_failed} && ! $parsed{allow_parse_fail};
659 6 100       18 $self->{parsed_ok} = $failed ? 0 : 1;
660 6         31 return $self;
661             }
662              
663             ##
664              
665             =item * B
666              
667             if ($received->parsed_ok()) {
668             ...
669             }
670              
671             Returns true if the parse succeed, or if it failed, but was permitted
672             to fail for some reason, such as encountering evidence of a known
673             broken (non-RFC822-compliant) format mid-parse.
674              
675             =cut
676              
677             sub parsed_ok {
678 6     6 1 17 my $self = shift;
679 6 50       19 croak "Header not parsed yet" unless $self->{parse_tree};
680 6         33 return $self->{parsed_ok};
681             }
682              
683             ##
684              
685             =item * B
686              
687             my $parse_tree = $received->parse_tree();
688              
689             Returns the actual parse tree, which is where you get all the useful
690             information. It is returned as a hashref whose keys are strings like
691             `from', `by', `with', `id', `via' etc., corresponding to the
692             components of Received headers as defined by RFC822:
693              
694             received = "Received" ":" ; one per relay
695             ["from" domain] ; sending host
696             ["by" domain] ; receiving host
697             ["via" atom] ; physical path
698             *("with" atom) ; link/mail protocol
699             ["id" msg-id] ; receiver msg id
700             ["for" addr-spec] ; initial form
701             ";" date-time ; time received
702              
703             The corresponding values are more hashrefs which are mini-parse-trees
704             for these individual components. A typical parse tree looks something
705             like:
706              
707             {
708             'by' => {
709             'domain' => 'host5.hostingcheck.com',
710             'whole' => 'by host5.hostingcheck.com',
711             'comments' => [
712             '(8.9.3/8.9.3)'
713             ],
714             },
715             'date_time' => {
716             'year' => 2000,
717             'week_day' => 'Tue',
718             'minute' => 57,
719             'day_of_year' => '1 Feb',
720             'month_day' => ' 1',
721             'zone' => '-0500',
722             'second' => 18,
723             'hms' => '21:57:18',
724             'date_time' => 'Tue, 1 Feb 2000 21:57:18 -0500',
725             'hour' => 21,
726             'month' => 'Feb',
727             'rest' => '2000 21:57:18 -0500',
728             'whole' => 'Tue, 1 Feb 2000 21:57:18 -0500'
729             },
730             'with' => {
731             'with' => 'ESMTP',
732             'whole' => 'with ESMTP'
733             },
734             'from' => {
735             'domain' => 'mediacons.tecc.co.uk',
736             'HELO' => 'tr909.mediaconsult.com',
737             'from' => 'tr909.mediaconsult.com',
738             'address' => '193.128.6.132',
739             'comments' => [
740             '(mediacons.tecc.co.uk [193.128.6.132])',
741             ],
742             'whole' => 'from tr909.mediaconsult.com (mediacons.tecc.co.uk [193.128.6.132])
743             '
744             },
745             'id' => {
746             'id' => 'VAA24164',
747             'whole' => 'id VAA24164'
748             },
749             'comments' => [
750             '(mediacons.tecc.co.uk [193.128.6.132])',
751             '(8.9.3/8.9.3)'
752             ],
753             'for' => {
754             'for' => '',
755             'whole' => 'for '
756             },
757             'whole' => 'from tr909.mediaconsult.com (mediacons.tecc.co.uk [193.128.6.132]) by host5.hostingcheck.com (8.9.3/8.9.3) with ESMTP id VAA24164 for ; Tue, 1 Feb 2000 21:57:18 -0500'
758             }
759              
760             =cut
761              
762             sub parse_tree {
763 5     5 1 12 my $self = shift;
764 5 50       23 croak "Header not parsed yet" unless $self->{parse_tree};
765 5         26 return $self->{parse_tree};
766             }
767              
768             =back
769              
770             =head1 BUGS
771              
772             Doesn't use Parse::RecDescent, which it maybe should.
773              
774             Doesn't offer a `strict RFC822' parsing mode. To implement that would
775             be a royal pain in the arse, unless we move to Parse::RecDescent.
776              
777             =head1 SEE ALSO
778              
779             L, L
780              
781             =head1 AUTHOR
782              
783             Adam Spiers
784              
785             =head1 LICENSE
786              
787             All rights reserved. This program is free software; you can redistribute
788             it and/or modify it under the same terms as Perl itself.
789              
790             =cut