File Coverage

lib/Email/Address/Classify.pm
Criterion Covered Total %
statement 45 46 97.8
branch 14 20 70.0
condition 2 6 33.3
subroutine 9 9 100.0
pod 3 3 100.0
total 73 84 86.9


line stmt bran cond sub pod time code
1             package Email::Address::Classify;
2 1     1   129554 use strict;
  1         3  
  1         37  
3 1     1   9 use warnings FATAL => 'all';
  1         2  
  1         38  
4 1     1   6 use File::Basename;
  1         1  
  1         861  
5              
6             =head1 NAME
7              
8             Email::Address::Classify - Classify email addresses
9              
10             =head1 SYNOPSIS
11              
12             use Email::Address::Classify;
13              
14             $email = Email::Address::Classify->new('a.johnson@example.com');
15              
16             print "Is valid: " . $email->is_valid() ? "Y\n" : "N\n"; # Y
17             print "Is random: " . $email->is_random() ? "Y\n" : "N\n"; # N
18              
19             =head1 DESCRIPTION
20              
21             This module provides a simple way to classify email addresses. At the moment, it only
22             provides two classifications is_valid() and is_random(). More classifications may be
23             added in the future.
24              
25             =head1 METHODS
26              
27             =over 4
28              
29             =item new($address)
30              
31             Creates a new Email::Address::Classify object. The only argument is the email address.
32              
33             =item is_valid()
34              
35             Performs a simple check to determine if the address is formatted properly.
36             Note that this method does not check if the domain exists or if the mailbox is valid.
37             Nor is it a complete RFC 2822 validator. For that, you should use a module such as
38             L.
39              
40             If this method returns false, all other methods will return false as well.
41              
42             =item is_random()
43              
44             Returns true if the localpart is likely to be randomly generated, false otherwise.
45             Note that randomness is subjective and depends on the user's locale and other factors.
46             This method uses a list of common trigrams to determine if the localpart is random. The trigrams
47             were generated from a corpus of 30,000 email messages, mostly in English. The accuracy of this
48             method is about 95% for English email addresses.
49              
50             If you would like to generate your own list of trigrams, you can use the included
51             C script in the C directory of the source repository.
52              
53             =back
54              
55             =head1 TODO
56              
57             Ideas for future classification methods:
58              
59             is_freemail()
60             is_disposable()
61             is_role_based()
62             is_bounce()
63             is_verp()
64             is_srs()
65             is_batv()
66             is_sms_gateway()
67              
68             =head1 AUTHOR
69              
70             Kent Oyer
71              
72             =head1 LICENSE AND COPYRIGHT
73              
74             Copyright (C) 2023 MXGuardian LLC
75              
76             This program is free software: you can redistribute it and/or modify
77             it under the terms of the GNU General Public License as published by
78             the Free Software Foundation, either version 3 of the License, or
79             (at your option) any later version.
80              
81             This program is distributed in the hope that it will be useful,
82             but WITHOUT ANY WARRANTY; without even the implied warranty of
83             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE
84             file included with this distribution for more information.
85              
86             You should have received a copy of the GNU General Public License
87             along with this program. If not, see https://www.gnu.org/licenses/.
88              
89             =cut
90              
91             my %ngrams;
92             our $min_length = 4;
93             our $VERSION = '0.02';
94              
95             sub _init {
96              
97             # read ngrams from Classify/ngrams.txt
98 1     1   77 my $filename = dirname($INC{'Email/Address/Classify.pm'}).'/Classify/ngrams.txt';
99 1 50       86 open(my $fh, '<', $filename) or die "Can't open $filename: $!";
100 1         58 while (my $line = <$fh>) {
101 3767         3824 chomp $line;
102 3767         8844 $ngrams{$line} = 1;
103             }
104 1         15 close($fh);
105              
106             }
107              
108             sub new {
109 46     46 1 23978 my ($class,$address) = @_;
110 46         122 my $self = bless {
111             address => $address,
112             }, $class;
113 46         87 my $email = _parse_email($address);
114 46 50       95 if ( $email ) {
115 46         78 $self->{localpart} = $email->{localpart};
116 46         65 $self->{domain} = $email->{domain};
117 46         59 $self->{valid} = 1;
118             } else {
119 0         0 $self->{valid} = 0;
120             }
121              
122 46         117 return $self;
123             }
124              
125             sub is_valid {
126 46     46 1 150 my $self = shift;
127 46         160 return $self->{valid};
128             }
129              
130             sub _find_ngrams {
131 46     46   85 my $str = lc($_[0]);
132 46         50 my @ngrams;
133 46         102 for (my $i = 0; $i < length($str) - 2; $i++) {
134 287         528 push @ngrams, substr($str, $i, 3);
135             }
136 46         159 return @ngrams;
137             }
138              
139             sub is_random {
140 46     46 1 71 my $self = shift;
141              
142 46 50       92 return $self->{random} if exists $self->{random};
143              
144 46 50 33     189 return $self->{random} = 0 unless $self->{valid} && length($self->{localpart}) >= $min_length;
145              
146 46 100       88 _init() unless %ngrams;
147              
148 46         77 my ($common,$uncommon) = (0,0);
149 46         77 foreach (_find_ngrams($self->{localpart})) {
150 287 100       425 if (exists $ngrams{$_} ) {
151 155         166 $common++;
152             } else {
153 132         145 $uncommon++;
154             }
155             }
156 46 100       95 if ( $common == $uncommon ) {
157             # tie breaker
158 1 50       8 $uncommon++ if $self->{localpart} =~ /[bcdfgjklmnpqrtvwxz]{5}|[aeiouy]{5}|([a-z]{1,2})(?:\1){3}/;
159             }
160 46 100       149 return $self->{random} = ($uncommon > $common ? 1 : 0);
161             }
162              
163             sub _parse_email {
164 46     46   59 my $email = shift;
165 46 50 33     419 return undef unless defined($email) &&
166             $email =~ /^((?:[a-zA-Z0-9\+\_\=\.\-])+)@((?:[a-zA-Z0-9\-])+(?:\.[a-zA-Z0-9\-]+)+)$/;
167              
168             return {
169 46         192 address => $email,
170             localpart => $1,
171             domain => $2,
172             };
173              
174             }
175              
176             1;