File Coverage

blib/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm
Criterion Covered Total %
statement 24 39 61.5
branch 0 16 0.0
condition n/a
subroutine 11 12 91.6
pod 4 5 80.0
total 39 72 54.1


line stmt bran cond sub pod time code
1             package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes;
2              
3 40     40   27969 use 5.010001;
  40         206  
4 40     40   267 use strict;
  40         128  
  40         898  
5 40     40   257 use warnings;
  40         128  
  40         1217  
6              
7 40     40   332 use List::Util qw{ min };
  40         143  
  40         2792  
8 40     40   320 use Readonly;
  40         138  
  40         1805  
9              
10 40     40   341 use Perl::Critic::Utils qw( :severities );
  40         171  
  40         2025  
11              
12 40     40   5264 use parent 'Perl::Critic::Policy';
  40         137  
  40         267  
13              
14             our $VERSION = '1.150';
15              
16             #-----------------------------------------------------------------------------
17              
18             Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks};
19             Readonly::Scalar my $EXPL => [261];
20              
21             Readonly::Scalar my $MAX_LITERAL_LENGTH => 7;
22             Readonly::Scalar my $MAX_VARIABLE_LENGTH => 4;
23              
24             #-----------------------------------------------------------------------------
25              
26             sub supported_parameters {
27             return (
28             {
29 90     90 0 2000 name => 'max_characters',
30             description =>
31             'The maximum number of characters to allow in a regular expression.',
32             default_string => '60',
33             behavior => 'integer',
34             integer_minimum => 1,
35             },
36             );
37             }
38              
39 74     74 1 327 sub default_severity { return $SEVERITY_MEDIUM }
40 86     86 1 367 sub default_themes { return qw( core pbp maintenance ) }
41 30     30 1 109 sub applies_to { return qw(PPI::Token::Regexp::Match
42             PPI::Token::Regexp::Substitute
43             PPI::Token::QuoteLike::Regexp) }
44              
45             #-----------------------------------------------------------------------------
46              
47             sub violates {
48 0     0 1   my ( $self, $elem, $document ) = @_;
49              
50             # Optimization: if its short enough now, parsing won't make it longer
51 0 0         return if $self->{_max_characters} >= length $elem->get_match_string();
52              
53 0 0         my $re = $document->ppix_regexp_from_element( $elem )
54             or return; # Abort on syntax error.
55 0 0         $re->failures()
56             and return; # Abort if parse errors found.
57 0 0         my $qr = $re->regular_expression()
58             or return; # Abort if no regular expression.
59              
60 0           my $length = 0;
61             # We use map { $_->tokens() } qr->children() rather than just
62             # $qr->tokens() because we are not interested in the delimiters.
63 0           foreach my $token ( map { $_->tokens() } $qr->children() ) {
  0            
64              
65             # Do not count whitespace or comments
66 0 0         $token->significant() or next;
67              
68 0 0         if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' ) ) {
    0          
69              
70             # Do not penalize long variable names
71 0           $length += min( $MAX_VARIABLE_LENGTH, length $token->content() );
72              
73             } elsif ( $token->isa( 'PPIx::Regexp::Token::Literal' ) ) {
74              
75             # Do not penalize long literals like \p{...}
76 0           $length += min( $MAX_LITERAL_LENGTH, length $token->content() );
77              
78             } else {
79              
80             # Take everything else at face value
81 0           $length += length $token->content();
82              
83             }
84              
85             }
86              
87 0 0         return if $self->{_max_characters} >= $length;
88              
89 0           return $self->violation( $DESC, $EXPL, $elem );
90             }
91              
92             1;
93              
94             __END__
95              
96             #-----------------------------------------------------------------------------
97              
98             =pod
99              
100             =for stopwords BNF Tatsuhiko Miyagawa
101              
102             =head1 NAME
103              
104             Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks.
105              
106              
107             =head1 AFFILIATION
108              
109             This Policy is part of the core L<Perl::Critic|Perl::Critic>
110             distribution.
111              
112              
113             =head1 DESCRIPTION
114              
115             Big regexps are hard to read, perhaps even the hardest part of Perl.
116             A good practice to write digestible chunks of regexp and put them
117             together. This policy flags any regexp that is longer than C<N>
118             characters, where C<N> is a configurable value that defaults to 60.
119             If the regexp uses the C<x> flag, then the length is computed after
120             parsing out any comments or whitespace.
121              
122             Unfortunately the use of descriptive (and therefore longish) variable
123             names can cause regexps to be in violation of this policy, so
124             interpolated variables are counted as 4 characters no matter how long
125             their names actually are.
126              
127              
128             =head1 CASE STUDY
129              
130             As an example, look at the regexp used to match email addresses in
131             L<Email::Valid::Loose|Email::Valid::Loose> (tweaked lightly to wrap
132             for POD)
133              
134             (?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]
135             \000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015
136             "]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[
137             \]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n
138             \015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,
139             ;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
140             )(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
141             -\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)
142              
143             which is constructed from the following code:
144              
145             my $esc = '\\\\';
146             my $period = '\.';
147             my $space = '\040';
148             my $open_br = '\[';
149             my $close_br = '\]';
150             my $nonASCII = '\x80-\xff';
151             my $ctrl = '\000-\037';
152             my $cr_list = '\n\015';
153             my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # "
154             my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
155             my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
156             my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# "
157             my $atom = qq<$atom_char+(?!$atom_char)>;
158             my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # "
159             my $word = qq<(?:$atom|$quoted_str)>;
160             my $domain_ref = $atom;
161             my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
162             my $sub_domain = qq<(?:$domain_ref|$domain_lit)>;
163             my $domain = qq<$sub_domain(?:$period$sub_domain)*>;
164             my $local_part = qq<$word(?:$word|$period)*>; # This part is modified
165             $Addr_spec_re = qr<$local_part\@$domain>;
166              
167             If you read the code from bottom to top, it is quite readable. And,
168             you can even see the one violation of RFC822 that Tatsuhiko Miyagawa
169             deliberately put into Email::Valid::Loose to allow periods. Look for
170             the C<|\.> in the upper regexp to see that same deviation.
171              
172             One could certainly argue that the top regexp could be re-written more
173             legibly with C<m//x> and comments. But the bottom version is
174             self-documenting and, for example, doesn't repeat C<\x80-\xff> 18
175             times. Furthermore, it's much easier to compare the second version
176             against the source BNF grammar in RFC 822 to judge whether the
177             implementation is sound even before running tests.
178              
179              
180             =head1 CONFIGURATION
181              
182             This policy allows regexps up to C<N> characters long, where C<N>
183             defaults to 60. You can override this to set it to a different number
184             with the C<max_characters> setting. To do this, put entries in a
185             F<.perlcriticrc> file like this:
186              
187             [RegularExpressions::ProhibitComplexRegexes]
188             max_characters = 40
189              
190              
191             =head1 CREDITS
192              
193             Initial development of this policy was supported by a grant from the
194             Perl Foundation.
195              
196              
197             =head1 AUTHOR
198              
199             Chris Dolan <cdolan@cpan.org>
200              
201              
202             =head1 COPYRIGHT
203              
204             Copyright (c) 2007-2023 Chris Dolan
205              
206             This program is free software; you can redistribute it and/or modify
207             it under the same terms as Perl itself. The full text of this license
208             can be found in the LICENSE file included with this module
209              
210             =cut
211              
212             # Local Variables:
213             # mode: cperl
214             # cperl-indent-level: 4
215             # fill-column: 78
216             # indent-tabs-mode: nil
217             # c-indentation-style: bsd
218             # End:
219             # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :