File Coverage

blib/lib/Data/Password/zxcvbn/Match/Dictionary.pm
Criterion Covered Total %
statement 132 137 96.3
branch 50 52 96.1
condition 18 20 90.0
subroutine 19 22 86.3
pod 10 10 100.0
total 229 241 95.0


line stmt bran cond sub pod time code
1             package Data::Password::zxcvbn::Match::Dictionary;
2 4     4   9878 use Moo;
  4         12  
  4         32  
3             with 'Data::Password::zxcvbn::Match';
4 4     4   4775 use Data::Password::zxcvbn::Combinatorics qw(nCk enumerate_substitution_maps);
  4         13  
  4         280  
5 4     4   32 use List::AllUtils qw(min);
  4         9  
  4         4577  
6             our $VERSION = '1.1.1'; # VERSION
7             # ABSTRACT: match class for words in passwords
8              
9              
10             has reversed => (is => 'ro', default => 0); # bool
11             has substitutions => ( is => 'ro', default => sub { +{} } );
12             has rank => ( is => 'ro', default => 1 ); # int
13             # this should be constrained to the keys of %ranked_dictionaries, but
14             # we can't do that because users can pass their own dictionaries to
15             # ->make
16             has dictionary_name => ( is => 'ro', default => 'passwords' );
17              
18              
19             sub l33t {
20 451     451 1 59050 return scalar(keys %{shift->substitutions})!=0;
  451         2750  
21             }
22              
23              
24             our %l33t_table = ( ## no critic (ProhibitPackageVars)
25             a => ['4', '@'],
26             b => ['8'],
27             c => ['(', '{', '[', '<'],
28             e => ['3'],
29             g => ['6', '9'],
30             i => ['1', '!', '|'],
31             l => ['1', '|', '7'],
32             o => ['0'],
33             s => ['$', '5'],
34             t => ['+', '7'],
35             x => ['%'],
36             z => ['2'],
37             );
38              
39             sub make {
40 1533     1533 1 107751 my ($class, $password, $opts) = @_;
41             ## no critic (ProhibitPackageVars)
42             my $dictionaries = $opts->{ranked_dictionaries}
43 1533   66     8483 || do {
44             require Data::Password::zxcvbn::RankedDictionaries;
45             \%Data::Password::zxcvbn::RankedDictionaries::ranked_dictionaries;
46             };
47 1533   100     8658 my $l33t_table = $opts->{l33t_table} || \%l33t_table;
48              
49 1533         3600 my @matches;
50 1533         6625 $class->_make_simple(\@matches,$password,$dictionaries);
51 1533         8860 $class->_make_reversed(\@matches,$password,$dictionaries);
52 1533         11059 $class->_make_l33t(\@matches,$password,$dictionaries, $l33t_table);
53              
54 1533         14487 @matches = sort @matches;
55 1533         8306 return \@matches;
56             }
57              
58             sub _make_simple {
59 4648     4648   12801 my ($class, $matches, $password, $dictionaries) = @_;
60 4648         9879 my $password_lc = lc($password);
61             # lc may change the length of the password...
62 4648         9747 my $length = length($password_lc);
63              
64 4648         8748 for my $dictionary_name (keys %{$dictionaries}) {
  4648         16634  
65 27453         49371 my $ranked_dict = $dictionaries->{$dictionary_name};
66 27453         50572 for my $i (0..$length-1) {
67 156745         264822 for my $j ($i..$length-1) {
68 674738         1121645 my $word = substr($password_lc,$i,$j-$i+1);
69 674738 100       1753953 if (my $rank = $ranked_dict->{$word}) {
70 8139         15932 push @{$matches}, $class->new({
  8139         185872  
71             token => substr($password,$i,$j-$i+1),
72             i => $i, j=> $j,
73             rank => $rank,
74             dictionary_name => $dictionary_name,
75             });
76             }
77             }
78             }
79             }
80             }
81              
82             sub _make_reversed {
83 1533     1533   5228 my ($class, $matches, $password, $dictionaries) = @_;
84              
85 1533         3955 my $rev_password = reverse($password);
86 1533         3079 my @rev_matches;
87 1533         6382 $class->_make_simple(\@rev_matches,$rev_password,$dictionaries);
88              
89 1533         6700 my $rev_length = length($password)-1;
90 1533         5413 for my $rev_match (@rev_matches) {
91 1553         7676 my $word = $rev_match->token;
92             # no need to add this, the normal matching will have produced
93             # it already
94 1553 100       7111 next if $word eq reverse($word);
95 661         1365 push @{$matches}, $class->new({
  661         16532  
96             token => reverse($word),
97             i => $rev_length - $rev_match->j,
98             j=> $rev_length - $rev_match->i,
99             rank => $rev_match->rank,
100             dictionary_name => $rev_match->dictionary_name,
101             reversed => 1,
102             });
103             }
104             }
105              
106             # makes a pruned copy of l33t_table that only includes password's
107             # possible substitutions
108             sub _relevant_l33t_subtable {
109 1539     1539   23472 my ($class, $password, $l33t_table) = @_;
110             # set of characters
111 1539         3537 my %password_chars; @password_chars{split //,$password} = ();
  1539         10649  
112              
113 1539         4093 my %subtable;
114 1539         3607 for my $letter (keys %{$l33t_table}) {
  1539         9683  
115 34681         73446 my @relevant_subs = grep { exists $password_chars{$_} }
116 18084         27197 @{$l33t_table->{$letter}};
  18084         39557  
117 18084 100       45890 $subtable{$letter} = \@relevant_subs
118             if @relevant_subs;
119             }
120              
121 1539         12410 return \%subtable;
122             }
123              
124             sub _translate {
125 1582     1582   4323 my ($class, $string, $table) = @_;
126 1582         3336 my $keys = join '', keys %{$table};
  1582         5533  
127 1582         37215 $string =~ s{([\Q$keys\E])}
128 1582         7220 {$table->{$1}}g;
129             return $string;
130             }
131              
132 1533     1533   6109 sub _make_l33t {
133             my ($class, $matches, $password, $dictionaries, $l33t_table) = @_;
134 1533         5727  
135             my $subs = enumerate_substitution_maps(
136             $class->_relevant_l33t_subtable($password,$l33t_table)
137 1533         6057 );
  1533         4178  
138 2246 100       7083 for my $sub (@{$subs}) {
  2246         6583  
139 1582         5598 next unless %{$sub};
140 1582         3364 my $subbed_password = $class->_translate($password,$sub);
141 1582         5858 my @subbed_matches;
142             $class->_make_simple(\@subbed_matches,$subbed_password,$dictionaries);
143 1582         6111  
144 3651         23300 for my $subbed_match (@subbed_matches) {
145             my $token = substr($password,
146             $subbed_match->i,
147             $subbed_match->j - $subbed_match->i + 1);
148 3651 100       10755 # too short, ignore
149             next if length($token) <= 1;
150 1930 100       8298 # only return the matches that contain an actual substitution
151             next if lc($token) eq lc($subbed_match->token);
152             # subset of mappings in $sub that are in use for this match
153             my %min_subs = map {
154 2164 100       20337 $token =~ m{\Q$_}
155             ? ( $_ => $sub->{$_} )
156 589         1444 : ()
  589         2016  
157 589         1716 } keys %{$sub};
  589         15034  
158             push @{$matches}, $class->new({
159             token => $token,
160             substitutions => \%min_subs,
161             i => $subbed_match->i,
162             j=> $subbed_match->j,
163             rank => $subbed_match->rank,
164             dictionary_name => $subbed_match->dictionary_name,
165             });
166             }
167             }
168             }
169              
170              
171 4179     4179 1 67598 sub estimate_guesses {
172             my ($self,$min_guesses) = @_;
173 4179         14523  
174             return $self->rank *
175             $self->_uppercase_variations *
176             $self->_l33t_variations *
177             $self->_reversed_variations;
178             }
179              
180              
181             # an uppercase letter, followed by stuff that is *not* uppercase
182 3     3   1809 # letters
  3         44  
  3         42  
183             my $START_UPPER_RE = qr{\A \p{Lu} \P{Lu}+ \z}x;
184             # stuff that is *not* uppercase letters, followed by an uppercase
185             # letter
186             my $END_UPPER_RE = qr{\A \P{Lu}+ \p{Lu} \z}x;
187             # no characters that are *not* uppercase letters
188             my $ALL_NOT_UPPER_RE = qr{\A \P{Lu}+ \z}x;
189             # no characters that are *not* lowercase letters
190             my $ALL_NOT_LOWER_RE = qr{\A \P{Ll}+ \z}x;
191              
192 386     386 1 3970  
193 0     0 1 0 sub does_word_start_upper { return $_[1] =~ $START_UPPER_RE }
194 0     0 1 0 sub does_word_end_upper { return $_[1] =~ $END_UPPER_RE }
195 0     0 1 0 sub is_word_all_not_upper { return $_[1] =~ $ALL_NOT_UPPER_RE }
196 356   100 356 1 3557 sub is_word_all_not_lower { return $_[1] =~ $ALL_NOT_LOWER_RE }
197             sub is_word_all_upper { return $_[1] =~ $ALL_NOT_LOWER_RE && $_[1] ne lc($_[1]) }
198              
199 4179     4179   8488 sub _uppercase_variations {
200             my ($self) = @_;
201 4179         9600  
202             my $word = $self->token;
203              
204 4179 100       29166 # if the word has no uppercase letters, count it as 1 variation
205 270 100       1065 return 1 if $word =~ $ALL_NOT_UPPER_RE;
206             return 1 if lc($word) eq $word;
207              
208             # a capitalized word is the most common capitalization scheme, so
209             # it only doubles the search space (uncapitalized + capitalized).
210             # allcaps and end-capitalized are common enough too, underestimate
211 268 100       1638 # as 2x factor to be safe.
212 148 100       810 return 2 if $word =~ $START_UPPER_RE;
213 127 100       829 return 2 if $word =~ $END_UPPER_RE;
214             return 2 if $word =~ $ALL_NOT_LOWER_RE;
215              
216             # otherwise calculate the number of ways to capitalize U+L
217             # uppercase+lowercase letters with U uppercase letters or
218             # less. or, if there's more uppercase than lower (for
219             # eg. PASSwORD), the number of ways to lowercase U+L letters with
220 37         191 # L lowercase letters or less.
221 37         177 my $U = () = $word =~ m/\p{Lu}/g;
222             my $L = () = $word =~ m/\p{Ll}/g;
223 37         100  
224 37         289 my $variations = 0;
225 37         127 $variations += nCk($U+$L,$_) for 1..min($U,$L);
226             return $variations;
227             }
228              
229 4179     4179   9910 sub _l33t_variations {
230             my ($self) = @_;
231 4179         9155  
232             my $word = $self->token;
233 4179         7202  
234 4179         7172 my $variations = 1;
  4179         14773  
235 1098         2910 for my $subbed (keys %{$self->substitutions}) {
236             my $unsubbed = $self->substitutions->{$subbed};
237              
238 1098         9990 # number of Substituted characters
239             my $S = () = $word =~ m{\Q$subbed}gi;
240 1098         7274 # number of Unsubstituted characters
241             my $U = () = $word =~ m{\Q$unsubbed}gi;
242 1098 100 66     4826  
243             if ($S==0 || $U==0) {
244             # for this substitution, password is either fully subbed
245             # (444) or fully unsubbed (aaa); treat that as doubling
246             # the space (attacker needs to try fully subbed chars in
247 1054         2465 # addition to unsubbed.)
248             $variations *= 2;
249             }
250             else {
251             # this case is similar to capitalization: with aa44a, U =
252             # 3, S = 2, attacker needs to try unsubbed + one sub + two
253 44         98 # subs
254 44         309 my $possibilities = 0;
255 44         113 $possibilities += nCk($U+$S,$_) for 1..min($U,$S);
256             $variations *= $possibilities;
257             }
258             }
259 4179         12522  
260             return $variations;
261             }
262              
263 4179 100   4179   23224 sub _reversed_variations {
264             return shift->reversed ? 2 : 1;
265             }
266              
267              
268 386     386 1 1360 sub feedback_warning {
269             my ($self, $is_sole_match) = @_;
270 386 100       3592  
    100          
    100          
271 107 100 100     913 if ($self->dictionary_name eq 'passwords') {
    100 100        
272 26 50       196 if ($is_sole_match && !$self->l33t && !$self->reversed) {
    50          
273 0         0 if ($self->rank <= 10) {
274             return 'This is a top-10 common password';
275             }
276 0         0 elsif ($self->rank <= 100) {
277             return 'This is a top-100 common password';
278             }
279 26         174 else {
280             return 'This is a very common password';
281             }
282             }
283 64         1463 elsif ($self->guesses_log10 <= 4) {
284             return 'This is similar to a commonly used password';
285             }
286             }
287 131 100       443 elsif ($self->dictionary_name =~ /names$/) {
288 22         148 if ($is_sole_match) {
289             return 'Names and surnames by themselves are easy to guess'
290             }
291 109         687 else {
292             return 'Common names and surnames are easy to guess';
293             }
294             }
295 27         166 elsif ($is_sole_match) {
296             return 'A word by itself is easy to guess';
297             }
298 138         1134  
299             return undef;
300             }
301              
302 386     386 1 1280 sub feedback_suggestions {
303             my ($self) = @_;
304 386         1288  
305 386         1037 my $word = $self->token;
306             my @suggestions;
307 386 100       1998  
    100          
308 30         140 if ($self->does_word_start_upper($word)) {
309             push @suggestions, q{Capitalization doesn't help very much};
310             }
311 7         24 elsif ($self->is_word_all_upper($word)) {
312             push @suggestions, 'All-uppercase is almost as easy to guess as all-lowercase';
313             }
314 386 100 100     2473  
315 13         55 if ($self->reversed && length($word) >= 4) {
316             push @suggestions, q{Reversed words aren't much harder to guess};
317             }
318 386 100       1847  
319 10         35 if ($self->l33t) {
320             push @suggestions, q{Predictable substitutions like '@' instead of 'a' don't help very much};
321             }
322 386         2036  
323             return \@suggestions;
324             }
325              
326              
327             around fields_for_json => sub {
328             my ($orig,$self) = @_;
329             ( $self->$orig(), qw(dictionary_name reversed rank substitutions) )
330             };
331              
332             1;
333              
334             __END__
335              
336             =pod
337              
338             =encoding UTF-8
339              
340             =for :stopwords Wiktionary xato
341              
342             =head1 NAME
343              
344             Data::Password::zxcvbn::Match::Dictionary - match class for words in passwords
345              
346             =head1 VERSION
347              
348             version 1.1.1
349              
350             =head1 DESCRIPTION
351              
352             This class represents the guess that a certain substring of a password
353             can be guessed by going through a dictionary.
354              
355             =head1 ATTRIBUTES
356              
357             =head2 C<reversed>
358              
359             Boolean, true if the token appears to be a dictionary word that's been
360             reversed (i.e. last letter first)
361              
362             =head2 C<substitutions>
363              
364             Hashref representing the characters that need to be substituted to
365             make the token match a dictionary work (e.g. if the token is
366             C<s!mpl3>, this hash would be C<< { '!' => 'i', '3' => 'e' } >>).
367              
368             =head2 C<rank>
369              
370             Number, indicating how common the dictionary word is. 1 means "most
371             common".
372              
373             =head2 C<dictionary_name>
374              
375             String, the name of the dictionary that the word was found in. Usually one of:
376              
377             =over 4
378              
379             =item *
380              
381             C<english_wikipedia>
382              
383             words extracted from a dump of the English edition of Wikipedia
384              
385             =item *
386              
387             C<male_names>, C<female_names>, C<surnames>
388              
389             common names from the 1990 US census
390              
391             =item *
392              
393             C<passwords>
394              
395             most common passwords, extracted from the "xato" password dump
396              
397             =item *
398              
399             C<us_tv_and_film>
400              
401             words from a 2006 Wiktionary word frequency study over American
402             television and movies
403              
404             =back
405              
406             =head1 METHODS
407              
408             =head2 C<l33t>
409              
410             Returns true if the token had any L</substitutions> (i.e. it was
411             written in "l33t-speak")
412              
413             =head2 C<make>
414              
415             my @matches = @{ Data::Password::zxcvbn::Match::Dictionary->make(
416             $password,
417             { # these are the defaults
418             ranked_dictionaries => \%Data::Password::zxcvbn::RankedDictionaries::ranked_dictionaries,
419             l33t_table => \%Data::Password::zxcvbn::Match::Dictionary::l33t_table,
420             },
421             ) };
422              
423             Scans the C<$password> for substrings that match words in the
424             C<ranked_dictionaries>, possibly reversed, possibly with substitutions
425             from the C<l33t_table>.
426              
427             The C<ranked_dictionaries> should look like:
428              
429             { some_dictionary_name => { 'word' => 156, 'another' => 13, ... },
430             ... }
431              
432             (i.e. a hash of dictionaries, each mapping words to their frequency
433             rank) and the C<l33t_table> should look like:
434              
435             { a => [ '4', '@' ], ... }
436              
437             (i.e. a hash mapping characters to arrays of other characters)
438              
439             =head2 C<estimate_guesses>
440              
441             The number of guesses is the product of the rank of the word, how many
442             case combinations match it, how many substitutions were used, doubled
443             if the token is reversed.
444              
445             =head2 C<does_word_start_upper>
446              
447             =head2 C<does_word_end_upper>
448              
449             =head2 C<is_word_all_not_upper>
450              
451             =head2 C<is_word_all_not_lower>
452              
453             =head2 C<is_word_all_upper>
454              
455             if ($self->does_word_start_upper($word)) { ... }
456              
457             These are mainly for sub-classes, to use in L<< /C<feedback_warning>
458             >> and L<< /C<feedback_suggestions> >>.
459              
460             =head2 C<feedback_warning>
461              
462             =head2 C<feedback_suggestions>
463              
464             This class suggests not using common words or passwords, especially on
465             their own. It also suggests that capitalisation, "special characters"
466             substitutions, and writing things backwards are not very useful.
467              
468             =head2 C<fields_for_json>
469              
470             The JSON serialisation for matches of this class will contain C<token
471             i j guesses guesses_log10 dictionary_name reversed rank
472             substitutions>.
473              
474             =head1 AUTHOR
475              
476             Gianni Ceccarelli <gianni.ceccarelli@broadbean.com>
477              
478             =head1 COPYRIGHT AND LICENSE
479              
480             This software is copyright (c) 2022 by BroadBean UK, a CareerBuilder Company.
481              
482             This is free software; you can redistribute it and/or modify it under
483             the same terms as the Perl 5 programming language system itself.
484              
485             =cut