File Coverage

lib/Email/Address/Classify.pm
Criterion Covered Total %
statement 43 44 97.7
branch 11 16 68.7
condition 2 6 33.3
subroutine 9 9 100.0
pod 3 3 100.0
total 68 78 87.1


line stmt bran cond sub pod time code
1             package Email::Address::Classify;
2 1     1   109730 use strict;
  1         2  
  1         33  
3 1     1   5 use warnings FATAL => 'all';
  1         2  
  1         31  
4 1     1   4 use File::Basename;
  1         2  
  1         760  
5              
6             =head1 NAME
7              
8             Email::Address::Classify - Classify email addresses
9              
10             =head1 SYNOPSIS
11              
12             use Email::Address::Classify;
13              
14             $email = Email::Address::Classify->new('a.johnson@example.com');
15              
16             print "Is valid: " . $email->is_valid() ? "Y\n" : "N\n"; # Y
17             print "Is random: " . $email->is_random() ? "Y\n" : "N\n"; # N
18              
19             =head1 DESCRIPTION
20              
21             This module provides a simple way to classify email addresses. At the moment, it only
22             provides two classifications is_valid() and is_random(). More classifications may be
23             added in the future.
24              
25             =head1 METHODS
26              
27             =over 4
28              
29             =item new($address)
30              
31             Creates a new Email::Address::Classify object. The only argument is the email address.
32              
33             =item is_valid()
34              
35             Performs a simple check to determine if the address is formatted properly.
36             Note that this method does not check if the domain exists or if the mailbox is valid.
37             Nor is it a complete RFC 2822 validator. For that, you should use a module such as
38             L.
39              
40             If this method returns false, all other methods will return false as well.
41              
42             =item is_random()
43              
44             Returns true if the localpart is likely to be randomly generated, false otherwise.
45             Note that randomness is subjective and depends on the user's locale and other factors.
46             This method uses a list of common trigrams to determine if the localpart is random. The trigrams
47             were generated from a corpus of 30,000 email messages, mostly in English. The accuracy of this
48             method is about 95% for English email addresses.
49              
50             If you would like to generate your own list of trigrams, you can use the included
51             C script in the C directory of the source repository.
52              
53             =back
54              
55             =head1 TODO
56              
57             Ideas for future classification methods:
58              
59             is_freemail()
60             is_disposable()
61             is_role_based()
62             is_bounce()
63             is_verp()
64             is_srs()
65             is_batv()
66              
67             =head1 AUTHOR
68              
69             Kent Oyer
70              
71             =head1 LICENSE AND COPYRIGHT
72              
73             Copyright (C) 2023 MXGuardian LLC
74              
75             This program is free software: you can redistribute it and/or modify
76             it under the terms of the GNU General Public License as published by
77             the Free Software Foundation, either version 3 of the License, or
78             (at your option) any later version.
79              
80             This program is distributed in the hope that it will be useful,
81             but WITHOUT ANY WARRANTY; without even the implied warranty of
82             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE
83             file included with this distribution for more information.
84              
85             You should have received a copy of the GNU General Public License
86             along with this program. If not, see https://www.gnu.org/licenses/.
87              
88             =cut
89              
90             my %ngrams;
91             our $min_length = 4;
92             our $VERSION = '0.01';
93              
94             sub _init {
95              
96             # read ngrams from Classify/ngrams.txt
97 1     1   90 my $filename = dirname($INC{'Email/Address/Classify.pm'}).'/Classify/ngrams.txt';
98 1 50       59 open(my $fh, '<', $filename) or die "Can't open $filename: $!";
99 1         62 while (my $line = <$fh>) {
100 2251         2361 chomp $line;
101 2251         5161 $ngrams{$line} = 1;
102             }
103 1         15 close($fh);
104              
105             }
106              
107             sub new {
108 40     40 1 25151 my ($class,$address) = @_;
109 40         114 my $self = bless {
110             address => $address,
111             }, $class;
112 40         81 my $email = _parse_email($address);
113 40 50       86 if ( $email ) {
114 40         72 $self->{localpart} = $email->{localpart};
115 40         54 $self->{domain} = $email->{domain};
116 40         56 $self->{valid} = 1;
117             } else {
118 0         0 $self->{valid} = 0;
119             }
120              
121 40         103 return $self;
122             }
123              
124             sub is_valid {
125 40     40 1 112 my $self = shift;
126 40         139 return $self->{valid};
127             }
128              
129             sub _find_ngrams {
130 40     40   72 my $str = lc($_[0]);
131 40         42 my @ngrams;
132 40         90 for (my $i = 0; $i < length($str) - 2; $i++) {
133 254         494 push @ngrams, substr($str, $i, 3);
134             }
135 40         136 return @ngrams;
136             }
137              
138             sub is_random {
139 40     40 1 79 my $self = shift;
140              
141 40 50       91 return $self->{random} if exists $self->{random};
142              
143 40 50 33     164 return $self->{random} = 0 unless $self->{valid} && length($self->{localpart}) >= $min_length;
144              
145 40 100       69 _init() unless %ngrams;
146              
147 40         76 my ($common,$uncommon) = (0,0);
148 40         78 foreach (_find_ngrams($self->{localpart})) {
149 254 100       363 if (exists $ngrams{$_} ) {
150 130         148 $common++;
151             } else {
152 124         149 $uncommon++;
153             }
154             }
155 40 100       149 return $self->{random} = ($uncommon > $common ? 1 : 0);
156             }
157              
158             sub _parse_email {
159 40     40   60 my $email = shift;
160 40 50 33     371 return undef unless defined($email) &&
161             $email =~ /^((?:[a-zA-Z0-9\+\_\=\.\-])+)@((?:[a-zA-Z0-9\-])+(?:\.[a-zA-Z0-9\-]+)+)$/;
162              
163             return {
164 40         181 address => $email,
165             localpart => $1,
166             domain => $2,
167             };
168              
169             }
170              
171             1;