File Coverage

blib/lib/Email/Address/List.pm
Criterion Covered Total %
statement 63 72 87.5
branch 25 44 56.8
condition 12 20 60.0
subroutine 6 6 100.0
pod 1 1 100.0
total 107 143 74.8


line stmt bran cond sub pod time code
1 5     5   72324 use strict;
  5         23  
  5         142  
2 5     5   25 use warnings;
  5         13  
  5         119  
3 5     5   130 use 5.010;
  5         24  
4              
5             package Email::Address::List;
6              
7             our $VERSION = '0.06';
8 5     5   2749 use Email::Address;
  5         139107  
  5         5251  
9              
10             =head1 NAME
11              
12             Email::Address::List - RFC close address list parsing
13              
14             =head1 SYNOPSIS
15              
16             use Email::Address::List;
17              
18             my $header = <<'END';
19             Foo Bar , (an obsolete comment),,,
20             a group:
21             a . weird . address @
22             for-real .biz
23             ; invalid thingy, <
24             more@example.com
25             >
26             END
27              
28             my @list = Email::Address::List->parse($header);
29             foreach my $e ( @list ) {
30             if ($e->{'type'} eq 'mailbox') {
31             print "an address: ", $e->{'value'}->format ,"\n";
32             }
33             else {
34             print $e->{'type'}, "\n"
35             }
36             }
37              
38             # prints:
39             # an address: "Foo Bar"
40             # comment
41             # group start
42             # an address: a.weird.address@forreal.biz
43             # group end
44             # unknown
45             # an address: more@example.com
46              
47             =head1 DESCRIPTION
48              
49             Parser for From, To, Cc, Bcc, Reply-To, Sender and
50             previous prefixed with Resent- (eg Resent-From) headers.
51              
52             =head1 REASONING
53              
54             L is good at parsing addresses out of any text
55             even mentioned headers and this module is derived work
56             from Email::Address.
57              
58             However, mentioned headers are structured and contain lists
59             of addresses. Most of the time you want to parse such field
60             from start to end keeping everything even if it's an invalid
61             input.
62              
63             =head1 METHODS
64              
65             =head2 parse
66              
67             A class method that takes a header value (w/o name and :) and
68             a set of named options, for example:
69              
70             my @list = Email::Address::List->parse( $line, option => 1 );
71              
72             Returns list of hashes. Each hash at least has 'type' key that
73             describes the entry. Types:
74              
75             =over 4
76              
77             =item mailbox
78              
79             A mailbox entry with L object under value key.
80              
81             If mailbox has obsolete parts then 'obsolete' is true.
82              
83             If address (not display-name/phrase or comments, but
84             local-part@domain) contains not ASCII chars then 'not_ascii' is
85             set to true. According to RFC 5322 not ASCII chars are not
86             allowed within mailbox. However, there are no big problems if
87             those are used and actually RFC 6532 extends a few rules
88             from 5322 with UTF8-non-ascii. Either use the feature or just
89             skip such addresses with skip_not_ascii option.
90              
91             =item group start
92              
93             Some headers with mailboxes may contain groupped addresses. This
94             element is returned for position where group starts. Under value
95             key you find name of the group. B that value is not post
96             processed at the moment, so it may contain spaces, comments,
97             quoted strings and other noise. Author willing to take patches
98             and warns that this will be changed at some point without additional
99             notifications, so if you need groups info then you better send a
100             patch :)
101              
102             Groups can not be nested, but one field may have multiple groups or
103             mix of addresses that are in a group and not in any.
104              
105             See skip_groups option.
106              
107             =item group end
108              
109             Returned when a group ends.
110              
111             =item comment
112              
113             Obsolete syntax allows one to use standalone comments between mailboxes
114             that can not be addressed to any mailbox. In such situations a comment
115             returned as an entry of this type. Comment itself is under value.
116              
117             =item unknown
118              
119             Returned if parser met something that shouldn't be there. Parser
120             tries to recover by jumping over to next comma (or semicolon if inside
121             group) that is out quoted string or comment, so "foo, bar, baz" string
122             results in three unknown entries. Jumping over comments and quoted strings
123             means that parser is very sensitive to unbalanced quotes and parens,
124             but it's on purpose.
125              
126             =back
127              
128             It can be controlled which elements are skipped, for example:
129              
130             Email::Address::List->parse($line, skip_unknown => 1, ...);
131              
132             =over 4
133              
134             =item skip_comments
135              
136             Skips comments between mailboxes. Comments inside and next to a mailbox
137             are not skipped, but returned as part of mailbox entry.
138              
139             =item skip_not_ascii
140              
141             Skips mailboxes where address part has not ASCII characters.
142              
143             =item skip_groups
144              
145             Skips group starts and end elements, however emails within groups are
146             still returned.
147              
148             =item skip_unknown
149              
150             Skip anything that is not recognizable. It still tries to recover as
151             described earlier.
152              
153             =back
154              
155             =cut
156              
157             # mailbox = name-addr / addr-spec
158             # display-name = phrase
159             #
160             # from = "From:" mailbox-list CRLF
161             # sender = "Sender:" mailbox CRLF
162             # reply-to = "Reply-To:" address-list CRLF
163             #
164             # to = "To:" address-list CRLF
165             # cc = "Cc:" address-list CRLF
166             # bcc = "Bcc:" [address-list / CFWS] CRLF
167             #
168             # resent-from = "Resent-From:" mailbox-list CRLF
169             # resent-sender = "Resent-Sender:" mailbox CRLF
170             # resent-to = "Resent-To:" address-list CRLF
171             # resent-cc = "Resent-Cc:" address-list CRLF
172             # resent-bcc = "Resent-Bcc:" [address-list / CFWS] CRLF
173             #
174             # obs-from = "From" *WSP ":" mailbox-list CRLF
175             # obs-sender = "Sender" *WSP ":" mailbox CRLF
176             # obs-reply-to = "Reply-To" *WSP ":" address-list CRLF
177             #
178             # obs-to = "To" *WSP ":" address-list CRLF
179             # obs-cc = "Cc" *WSP ":" address-list CRLF
180             # obs-bcc = "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
181             #
182             # obs-resent-from = "Resent-From" *WSP ":" mailbox-list CRLF
183             # obs-resent-send = "Resent-Sender" *WSP ":" mailbox CRLF
184             # obs-resent-date = "Resent-Date" *WSP ":" date-time CRLF
185             # obs-resent-to = "Resent-To" *WSP ":" address-list CRLF
186             # obs-resent-cc = "Resent-Cc" *WSP ":" address-list CRLF
187             # obs-resent-bcc = "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
188             # obs-resent-mid = "Resent-Message-ID" *WSP ":" msg-id CRLF
189             # obs-resent-rply = "Resent-Reply-To" *WSP ":" address-list CRLF
190              
191             our $COMMENT_NEST_LEVEL ||= 2;
192              
193             our %RE;
194             our %CRE;
195              
196             $RE{'CTL'} = q{\x00-\x1F\x7F};
197             $RE{'special'} = q{()<>\\[\\]:;@\\\\,."};
198              
199             $RE{'text'} = qr/[^\x0A\x0D]/;
200              
201             $RE{'quoted_pair'} = qr/\\$RE{'text'}/;
202              
203             $RE{'atext'} = qr/[^$RE{'CTL'}$RE{'special'}\s]/;
204             $RE{'ctext'} = qr/[^()\\]++/;
205             $RE{'qtext'} = qr/[^\\"]/;
206             $RE{'dtext'} = qr/[^\[\]\\]/;
207              
208             ($RE{'ccontent'}, $RE{'comment'}) = (q{})x2;
209             for (1 .. $COMMENT_NEST_LEVEL) {
210             $RE{'ccontent'} = qr/$RE{'ctext'}|$RE{'quoted_pair'}|$RE{'comment'}/;
211             $RE{'comment'} = qr/(?>\s*+\((?:\s*+$RE{'ccontent'})*+\s*+\)\s*+)/;
212             }
213             $RE{'cfws'} = qr/$RE{'comment'}++|\s*+/;
214              
215             $RE{'qcontent'} = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
216             $RE{'quoted-string'} = qr/$RE{'cfws'}"$RE{'qcontent'}*+"$RE{'cfws'}/;
217              
218             $RE{'atom'} = qr/$RE{'cfws'}$RE{'atext'}++$RE{'cfws'}/;
219              
220             $RE{'word'} = qr/$RE{'atom'} | $RE{'quoted-string'}/x;
221             $RE{'phrase'} = qr/$RE{'word'}+/x;
222             $RE{'display-name'} = $RE{'phrase'};
223              
224             $RE{'dot_atom_text'} = qr/$RE{'atext'}++(?:\.$RE{'atext'}++)*/;
225             $RE{'dot_atom'} = qr/$RE{'cfws'}$RE{'dot_atom_text'}$RE{'cfws'}/;
226             $RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/;
227              
228             $RE{'dcontent'} = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
229             $RE{'domain_literal'} = qr/$RE{'cfws'}\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}/;
230             $RE{'domain'} = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
231              
232             $RE{'addr-spec'} = qr/$RE{'local-part'}\@$RE{'domain'}/;
233             $RE{'angle-addr'} = qr/$RE{'cfws'} < $RE{'addr-spec'} > $RE{'cfws'}/x;
234              
235             $RE{'name-addr'} = qr/$RE{'display-name'}?$RE{'angle-addr'}/;
236             $RE{'mailbox'} = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/;
237              
238             $CRE{'addr-spec'} = qr/($RE{'local-part'})\@($RE{'domain'})/;
239             $CRE{'mailbox'} = qr/
240             (?:
241             ($RE{'display-name'})?($RE{'cfws'})<$CRE{'addr-spec'}>($RE{'cfws'})
242             |$CRE{'addr-spec'}
243             )($RE{'comment'}*+)
244             /x;
245              
246             $RE{'dword'} = qr/$RE{'cfws'} (?: $RE{'atom'} | \. | "$RE{'qcontent'}++" ) $RE{'cfws'}/x;
247             $RE{'obs-phrase'} = qr/$RE{'word'} $RE{'dword'}*+/x;
248             $RE{'obs-display-name'} = $RE{'obs-phrase'};
249             $RE{'obs-route'} = qr/
250             (?:$RE{'cfws'}|,)*
251             \@$RE{'domain'}
252             (?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)*
253             :
254             /x;
255             $RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
256             $RE{'obs-local-part'} = qr/$RE{'word'}(?:\.$RE{'word'})*/;
257             $RE{'obs-addr-spec'} = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/;
258             $CRE{'obs-addr-spec'} = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/;
259             $CRE{'obs-mailbox'} = qr/
260             (?:
261             ($RE{'obs-display-name'})?
262             ($RE{'cfws'})< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'})
263             |$CRE{'obs-addr-spec'}
264             )($RE{'comment'}*+)
265             /x;
266              
267             sub parse {
268 226     226 1 1157668 my $self = shift;
269 226 50       988 my %args = @_%2? (line => @_) : @_;
270 226         526 my $line = delete $args{'line'};
271              
272 226         359 my $in_group = 0;
273              
274 226         354 my @res;
275 226         1208 while ($line =~ /\S/) {
276             # in obs- case we have number of optional comments/spaces/
277             # address-list = (address *("," address)) / obs-addr-list
278             # obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]))
279 1632 100       7861 if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) {
280             push @res, {type => 'comment', value => $1 }
281 1003 50 66     5378 if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/;
      66        
282 1003         2843 next;
283             }
284 629         1446 $line =~ s/^\s+//o;
285              
286             # now it's only comma separated address where address is:
287             # address = mailbox / group
288              
289             # deal with groups
290             # group = display-name ":" [group-list] ";" [CFWS]
291             # group-list = mailbox-list / CFWS / obs-group-list
292             # obs-group-list = 1*([CFWS] ",") [CFWS])
293 629 50 33     4449 if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) {
294             push @res, {type => 'group start', value => $1 }
295 0 0       0 unless $args{'skip_groups'};
296 0         0 $in_group = 1; next;
  0         0  
297             }
298 629 50 33     1491 if ( $in_group && $line =~ s/^;// ) {
299 0 0       0 push @res, {type => 'group end'} unless $args{'skip_groups'};
300 0         0 $in_group = 0; next;
  0         0  
301             }
302              
303             # now we got rid of groups and cfws, 'address = mailbox'
304             # mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
305             # obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]))
306              
307             # so address-list is now comma separated list of mailboxes:
308             # address-list = (mailbox *("," mailbox))
309 629         906 my $obsolete = 0;
310 629 100 100     29551 if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o
      100        
311             || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1)
312             ) {
313 624         1964 my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
314             $1,$2,$3,$4,$5,$6,$7,$8,$9
315             );
316 624 50       2323 my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0;
317 624 0 33     1253 next if $not_ascii && $args{skip_not_ascii};
318              
319 624         3075 push @res, {
320             type => 'mailbox',
321             value => Email::Address->new(
322             $phrase, "$user\@$host", join(' ', @comments),
323             $original,
324             ),
325             obsolete => $obsolete,
326             not_ascii => $not_ascii,
327             };
328 624         8428 next;
329             }
330              
331             # if we got here then something unknown on our way
332             # try to recorver
333 5 50       68 if ($in_group) {
334 0 0       0 if ( $line =~ s/^([^;,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*+)*+)(?=;|,)//o ) {
335 0 0       0 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
336 0         0 next;
337             }
338             } else {
339 5 100       350 if ( $line =~ s/^([^,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*+)*+)(?=,)//o ) {
340 2 50       15 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
341 2         8 next;
342             }
343             }
344 3 50       30 push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'};
345 3         9 last;
346             }
347 226         817 return @res;
348             }
349              
350             my $dequote = sub {
351             local $_ = shift;
352             s/^"//; s/"$//; s/\\(.)/$1/g;
353             return "$_";
354             };
355             my $quote = sub {
356             local $_ = shift;
357             s/([\\"])/\\$1/g;
358             return qq{"$_"};
359             };
360              
361             sub _process_mailbox {
362 624     624   1025 my $self = shift;
363 624         1368 my $original = shift;
364 624         2890 my @rest = (@_);
365              
366 624         983 my @comments;
367 624         2266 foreach ( grep defined, splice @rest ) {
368 3428 100       14854 s{ ($RE{'quoted-string'}) | ($RE{comment}) }
  861 100       2305  
  722         1359  
  722         5010  
369 3428         5428 { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe;
  3428         5800  
370 3428 100       6592 s/^\s+//; s/\s+$//;
371             next unless length;
372 1748         3194  
373             push @rest, $_;
374 624         1905 }
375             my ($host, $user, $phrase) = reverse @rest;
376              
377 448 100       3136 # deal with spaces out of quoted strings
378 624         3872 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe
379 285 100       1909 foreach grep defined, $phrase;
380 624         6572 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe
381             foreach $user, $host;
382              
383 139         330 # dequote
384 624         2823 s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe
385 624 50       5029 foreach grep defined, $phrase, $user;
386             $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
387 624         1336  
  722         1442  
  722         1671  
  722         1722  
388 624         3063 @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments;
389             return $original, $phrase, $user, $host, @comments;
390             }
391              
392              
393             =head1 AUTHOR
394              
395             Ruslan Zakirov Eruz@bestpractical.comE
396              
397             =head1 LICENSE
398              
399             Under the same terms as Perl itself.
400              
401             =cut
402              
403             1;