File Coverage

blib/lib/Email/Address/List.pm
Criterion Covered Total %
statement 64 73 87.6
branch 26 44 59.0
condition 11 20 55.0
subroutine 6 6 100.0
pod 1 1 100.0
total 108 144 75.0


line stmt bran cond sub pod time code
1 5     5   61348 use strict;
  5         10  
  5         186  
2 5     5   23 use warnings;
  5         12  
  5         140  
3 5     5   110 use 5.010;
  5         30  
  5         375  
4              
5             package Email::Address::List;
6              
7             our $VERSION = '0.05';
8 5     5   5223 use Email::Address;
  5         174408  
  5         12755  
9              
10             =head1 NAME
11              
12             Email::Address::List - RFC close address list parsing
13              
14             =head1 SYNOPSIS
15              
16             use Email::Address::List;
17              
18             my $header = <<'END';
19             Foo Bar , (an obsolete comment),,,
20             a group:
21             a . weird . address @
22             for-real .biz
23             ; invalid thingy, <
24             more@example.com
25             >
26             END
27              
28             my @list = Email::Address::List->parse($header);
29             foreach my $e ( @list ) {
30             if ($e->{'type'} eq 'mailbox') {
31             print "an address: ", $e->{'value'}->format ,"\n";
32             }
33             else {
34             print $e->{'type'}, "\n"
35             }
36             }
37              
38             # prints:
39             # an address: "Foo Bar"
40             # comment
41             # group start
42             # an address: a.weird.address@forreal.biz
43             # group end
44             # unknown
45             # an address: more@example.com
46              
47             =head1 DESCRIPTION
48              
49             Parser for From, To, Cc, Bcc, Reply-To, Sender and
50             previous prefixed with Resent- (eg Resent-From) headers.
51              
52             =head1 REASONING
53              
54             L is good at parsing addresses out of any text
55             even mentioned headers and this module is derived work
56             from Email::Address.
57              
58             However, mentioned headers are structured and contain lists
59             of addresses. Most of the time you want to parse such field
60             from start to end keeping everything even if it's an invalid
61             input.
62              
63             =head1 METHODS
64              
65             =head2 parse
66              
67             A class method that takes a header value (w/o name and :) and
68             a set of named options, for example:
69              
70             my @list = Email::Address::List->parse( $line, option => 1 );
71              
72             Returns list of hashes. Each hash at least has 'type' key that
73             describes the entry. Types:
74              
75             =over 4
76              
77             =item mailbox
78              
79             A mailbox entry with L object under value key.
80              
81             If mailbox has obsolete parts then 'obsolete' is true.
82              
83             If address (not display-name/phrase or comments, but
84             local-part@domain) contains not ASCII chars then 'not_ascii' is
85             set to true. According to RFC 5322 not ASCII chars are not
86             allowed within mailbox. However, there are no big problems if
87             those are used and actually RFC 6532 extends a few rules
88             from 5322 with UTF8-non-ascii. Either use the feature or just
89             skip such addresses with skip_not_ascii option.
90              
91             =item group start
92              
93             Some headers with mailboxes may contain groupped addresses. This
94             element is returned for position where group starts. Under value
95             key you find name of the group. B that value is not post
96             processed at the moment, so it may contain spaces, comments,
97             quoted strings and other noise. Author willing to take patches
98             and warns that this will be changed at some point without additional
99             notifications, so if you need groups info then you better send a
100             patch :)
101              
102             Groups can not be nested, but one field may have multiple groups or
103             mix of addresses that are in a group and not in any.
104              
105             See skip_groups option.
106              
107             =item group end
108              
109             Returned when a group ends.
110              
111             =item comment
112              
113             Obsolete syntax allows one to use standalone comments between mailboxes
114             that can not be addressed to any mailbox. In such situations a comment
115             returned as an entry of this type. Comment itself is under value.
116              
117             =item unknown
118              
119             Returned if parser met something that shouldn't be there. Parser
120             tries to recover by jumping over to next comma (or semicolon if inside
121             group) that is out quoted string or comment, so "foo, bar, baz" string
122             results in three unknown entries. Jumping over comments and quoted strings
123             means that parser is very sensitive to unbalanced quotes and parens,
124             but it's on purpose.
125              
126             =back
127              
128             It can be controlled which elements are skipped, for example:
129              
130             Email::Address::List->parse($line, skip_unknown => 1, ...);
131              
132             =over 4
133              
134             =item skip_comments
135              
136             Skips comments between mailboxes. Comments inside and next to a mailbox
137             are not skipped, but returned as part of mailbox entry.
138              
139             =item skip_not_ascii
140              
141             Skips mailboxes where address part has not ASCII characters.
142              
143             =item skip_groups
144              
145             Skips group starts and end elements, however emails within groups are
146             still returned.
147              
148             =item skip_unknown
149              
150             Skip anything that is not recognizable. It still tries to recover as
151             described earlier.
152              
153             =back
154              
155             =cut
156              
157             # mailbox = name-addr / addr-spec
158             # display-name = phrase
159             #
160             # from = "From:" mailbox-list CRLF
161             # sender = "Sender:" mailbox CRLF
162             # reply-to = "Reply-To:" address-list CRLF
163             #
164             # to = "To:" address-list CRLF
165             # cc = "Cc:" address-list CRLF
166             # bcc = "Bcc:" [address-list / CFWS] CRLF
167             #
168             # resent-from = "Resent-From:" mailbox-list CRLF
169             # resent-sender = "Resent-Sender:" mailbox CRLF
170             # resent-to = "Resent-To:" address-list CRLF
171             # resent-cc = "Resent-Cc:" address-list CRLF
172             # resent-bcc = "Resent-Bcc:" [address-list / CFWS] CRLF
173             #
174             # obs-from = "From" *WSP ":" mailbox-list CRLF
175             # obs-sender = "Sender" *WSP ":" mailbox CRLF
176             # obs-reply-to = "Reply-To" *WSP ":" address-list CRLF
177             #
178             # obs-to = "To" *WSP ":" address-list CRLF
179             # obs-cc = "Cc" *WSP ":" address-list CRLF
180             # obs-bcc = "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
181             #
182             # obs-resent-from = "Resent-From" *WSP ":" mailbox-list CRLF
183             # obs-resent-send = "Resent-Sender" *WSP ":" mailbox CRLF
184             # obs-resent-date = "Resent-Date" *WSP ":" date-time CRLF
185             # obs-resent-to = "Resent-To" *WSP ":" address-list CRLF
186             # obs-resent-cc = "Resent-Cc" *WSP ":" address-list CRLF
187             # obs-resent-bcc = "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
188             # obs-resent-mid = "Resent-Message-ID" *WSP ":" msg-id CRLF
189             # obs-resent-rply = "Resent-Reply-To" *WSP ":" address-list CRLF
190              
191             our $COMMENT_NEST_LEVEL ||= 2;
192              
193             our %RE;
194             our %CRE;
195              
196             $RE{'CTL'} = q{\x00-\x1F\x7F};
197             $RE{'special'} = q{()<>\\[\\]:;@\\\\,."};
198              
199             $RE{'text'} = qr/[^\x0A\x0D]/;
200              
201             $RE{'quoted_pair'} = qr/\\$RE{'text'}/;
202              
203             $RE{'atext'} = qr/[^$RE{'CTL'}$RE{'special'}\s]/;
204             $RE{'ctext'} = qr/(?>[^()\\]+)/;
205             $RE{'qtext'} = qr/[^\\"]/;
206             $RE{'dtext'} = qr/[^\[\]\\]/;
207              
208             ($RE{'ccontent'}, $RE{'comment'}) = (q{})x2;
209             for (1 .. $COMMENT_NEST_LEVEL) {
210             $RE{'ccontent'} = qr/$RE{'ctext'}|$RE{'quoted_pair'}|$RE{'comment'}/;
211             $RE{'comment'} = qr/\s*\((?:\s*$RE{'ccontent'})*\s*\)\s*/;
212             }
213             $RE{'cfws'} = qr/$RE{'comment'}|\s+/;
214              
215             $RE{'qcontent'} = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
216             $RE{'quoted-string'} = qr/$RE{'cfws'}*"$RE{'qcontent'}+"$RE{'cfws'}*/;
217              
218             $RE{'atom'} = qr/$RE{'cfws'}*$RE{'atext'}++$RE{'cfws'}*/;
219              
220             $RE{'word'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
221             $RE{'phrase'} = qr/$RE{'word'}+/x;
222             $RE{'display-name'} = $RE{'phrase'};
223              
224             $RE{'dot_atom_text'} = qr/$RE{'atext'}+(?:\.$RE{'atext'}+)*/;
225             $RE{'dot_atom'} = qr/$RE{'cfws'}*$RE{'dot_atom_text'}$RE{'cfws'}*/;
226             $RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/;
227              
228             $RE{'dcontent'} = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
229             $RE{'domain_literal'} = qr/$RE{'cfws'}*\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}*/;
230             $RE{'domain'} = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
231              
232             $RE{'addr-spec'} = qr/$RE{'local-part'}\@$RE{'domain'}/;
233             $RE{'angle-addr'} = qr/$RE{'cfws'}* < $RE{'addr-spec'} > $RE{'cfws'}*/x;
234              
235             $RE{'name-addr'} = qr/$RE{'display-name'}?$RE{'angle-addr'}/;
236             $RE{'mailbox'} = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/;
237              
238             $CRE{'addr-spec'} = qr/($RE{'local-part'})\@($RE{'domain'})/;
239             $CRE{'mailbox'} = qr/
240             (?:
241             ($RE{'display-name'})?($RE{'cfws'}*)<$CRE{'addr-spec'}>($RE{'cfws'}*)
242             |$CRE{'addr-spec'}
243             )($RE{'comment'}*)
244             /x;
245              
246             $RE{'dword'} = qr/$RE{'cfws'}* (?: $RE{'atom'} | \. | "$RE{'qcontent'}+" ) $RE{'cfws'}*/x;
247             $RE{'obs-phrase'} = qr/$RE{'word'} $RE{'dword'}*/x;
248             $RE{'obs-display-name'} = $RE{'obs-phrase'};
249             $RE{'obs-route'} = qr/
250             (?:$RE{'cfws'}|,)*
251             \@$RE{'domain'}
252             (?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)*
253             :
254             /x;
255             $RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
256             $RE{'obs-local-part'} = qr/$RE{'word'}(?:\.$RE{'word'})*/;
257             $RE{'obs-addr-spec'} = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/;
258             $CRE{'obs-addr-spec'} = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/;
259             $CRE{'obs-mailbox'} = qr/
260             (?:
261             ($RE{'obs-display-name'})?
262             ($RE{'cfws'}*)< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'}*)
263             |$CRE{'obs-addr-spec'}
264             )($RE{'comment'}*)
265             /x;
266              
267             sub parse {
268 226     226 1 1095183 my $self = shift;
269 226 50       1747 my %args = @_%2? (line => @_) : @_;
270 226         610 my $line = delete $args{'line'};
271              
272 226         453 my $in_group = 0;
273              
274 226         312 my @res;
275 226         1200 while ($line =~ /\S/) {
276             # in obs- case we have number of optional comments/spaces/
277             # address-list = (address *("," address)) / obs-addr-list
278             # obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]))
279 1632 100       10018 if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) {
280 1003 50 66     7350 push @res, {type => 'comment', value => $1 }
      66        
281             if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/;
282 1003         3757 next;
283             }
284 629         1666 $line =~ s/^\s+//o;
285              
286             # now it's only comma separated address where address is:
287             # address = mailbox / group
288              
289             # deal with groups
290             # group = display-name ":" [group-list] ";" [CFWS]
291             # group-list = mailbox-list / CFWS / obs-group-list
292             # obs-group-list = 1*([CFWS] ",") [CFWS])
293 629 50 33     24821 if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) {
294 0 0       0 push @res, {type => 'group start', value => $1 }
295             unless $args{'skip_groups'};
296 0         0 $in_group = 1; next;
  0         0  
297             }
298 629 50 33     3563 if ( $in_group && $line =~ s/^;// ) {
299 0 0       0 push @res, {type => 'group end'} unless $args{'skip_groups'};
300 0         0 $in_group = 0; next;
  0         0  
301             }
302              
303             # now we got rid of groups and cfws, 'address = mailbox'
304             # mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
305             # obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]))
306              
307             # so address-list is now comma separated list of mailboxes:
308             # address-list = (mailbox *("," mailbox))
309 629         841 my $obsolete = 0;
310 629 100 100     491359 if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o
      66        
311             || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1)
312             ) {
313 624         3505 my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
314             $1,$2,$3,$4,$5,$6,$7,$8,$9
315             );
316 624 50       2731 my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0;
317 624 50 33     1892 next if $not_ascii && $args{skip_not_ascii};
318              
319 624         3806 push @res, {
320             type => 'mailbox',
321             value => Email::Address->new(
322             $phrase, "$user\@$host", join(' ', @comments),
323             $original,
324             ),
325             obsolete => $obsolete,
326             not_ascii => $not_ascii,
327             };
328 624         14078 next;
329             }
330              
331             # if we got here then something unknown on our way
332             # try to recorver
333 5 50       167 if ($in_group) {
334 0 0       0 if ( $line =~ s/^([^;,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*)*)(?=;|,)//o ) {
335 0 0       0 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
336 0         0 next;
337             }
338             } else {
339 5 100       482 if ( $line =~ s/^([^,"\)]*(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*)*)(?=,)//o ) {
340 2 50       19 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
341 2         11 next;
342             }
343             }
344 3 50       33 push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'};
345 3         11 last;
346             }
347 226         1019 return @res;
348             }
349              
350             my $dequote = sub {
351             local $_ = shift;
352             s/^"//; s/"$//; s/\\(.)/$1/g;
353             return "$_";
354             };
355             my $quote = sub {
356             local $_ = shift;
357             s/([\\"])/\\$1/g;
358             return qq{"$_"};
359             };
360              
361             sub _process_mailbox {
362 624     624   1055 my $self = shift;
363 624         1401 my $original = shift;
364 624         3746 my @rest = (@_);
365              
366 624         821 my @comments;
367 624         2591 foreach ( grep defined, splice @rest ) {
368 3410         23013 s{ ($RE{'quoted-string'}) | ($RE{comment}) }
369 859 100       2399 { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe;
  711 100       1814  
  711         8229  
370 3410         5085 s/^\s+//; s/\s+$//;
  3410         6293  
371 3410 100       8341 next unless length;
372              
373 1734         3927 push @rest, $_;
374             }
375 624         2481 my ($host, $user, $phrase) = reverse @rest;
376              
377             # deal with spaces out of quoted strings
378 425 100       3526 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe
379 624         5023 foreach grep defined, $phrase;
380 253 100       2288 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe
381 624         8922 foreach $user, $host;
382              
383             # dequote
384 148         459 s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe
385 624         4045 foreach grep defined, $phrase, $user;
386 624 50       7131 $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
387              
388 624         1465 @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments;
  711         1908  
  711         1765  
  711         1982  
389 624         4124 return $original, $phrase, $user, $host, @comments;
390             }
391              
392              
393             =head1 AUTHOR
394              
395             Ruslan Zakirov Eruz@bestpractical.comE
396              
397             =head1 LICENSE
398              
399             Under the same terms as Perl itself.
400              
401             =cut
402              
403             1;