File Coverage

blib/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm
Criterion Covered Total %
statement 45 45 100.0
branch 14 16 87.5
condition n/a
subroutine 14 14 100.0
pod 4 5 80.0
total 77 80 96.2


line stmt bran cond sub pod time code
1             package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes;
2              
3 40     40   27950 use 5.010001;
  40         196  
4 40     40   278 use strict;
  40         137  
  40         1008  
5 40     40   245 use warnings;
  40         128  
  40         1044  
6              
7 40     40   258 use Carp;
  40         111  
  40         2513  
8 40     40   301 use English qw(-no_match_vars);
  40         135  
  40         282  
9 40     40   14728 use List::Util qw{ min };
  40         123  
  40         2739  
10 40     40   304 use Readonly;
  40         144  
  40         2066  
11              
12 40     40   400 use Perl::Critic::Utils qw{ :booleans :severities };
  40         127  
  40         2005  
13              
14 40     40   6582 use parent 'Perl::Critic::Policy';
  40         148  
  40         326  
15              
16             our $VERSION = '1.148';
17              
18             #-----------------------------------------------------------------------------
19              
20             Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks};
21             Readonly::Scalar my $EXPL => [261];
22              
23             Readonly::Scalar my $MAX_LITERAL_LENGTH => 7;
24             Readonly::Scalar my $MAX_VARIABLE_LENGTH => 4;
25              
26             #-----------------------------------------------------------------------------
27              
28             sub supported_parameters {
29             return (
30             {
31 97     97 0 2151 name => 'max_characters',
32             description =>
33             'The maximum number of characters to allow in a regular expression.',
34             default_string => '60',
35             behavior => 'integer',
36             integer_minimum => 1,
37             },
38             );
39             }
40              
41 80     80 1 408 sub default_severity { return $SEVERITY_MEDIUM }
42 86     86 1 386 sub default_themes { return qw( core pbp maintenance ) }
43 37     37 1 163 sub applies_to { return qw(PPI::Token::Regexp::Match
44             PPI::Token::Regexp::Substitute
45             PPI::Token::QuoteLike::Regexp) }
46              
47             #-----------------------------------------------------------------------------
48              
49             sub violates {
50 24     24 1 69 my ( $self, $elem, $document ) = @_;
51              
52             # Optimization: if its short enough now, parsing won't make it longer
53 24 100       100 return if $self->{_max_characters} >= length $elem->get_match_string();
54              
55 20 50       450 my $re = $document->ppix_regexp_from_element( $elem )
56             or return; # Abort on syntax error.
57 20 100       217143 $re->failures()
58             and return; # Abort if parse errors found.
59 19 50       157 my $qr = $re->regular_expression()
60             or return; # Abort if no regular expression.
61              
62 19         428 my $length = 0;
63             # We use map { $_->tokens() } qr->children() rather than just
64             # $qr->tokens() because we are not interested in the delimiters.
65 19         62 foreach my $token ( map { $_->tokens() } $qr->children() ) {
  392         1839  
66              
67             # Do not count whitespace or comments
68 392 100       2229 $token->significant() or next;
69              
70 363 100       1892 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' ) ) {
    100          
71              
72             # Do not penalize long variable names
73 12         47 $length += min( $MAX_VARIABLE_LENGTH, length $token->content() );
74              
75             } elsif ( $token->isa( 'PPIx::Regexp::Token::Literal' ) ) {
76              
77             # Do not penalize long literals like \p{...}
78 350         699 $length += min( $MAX_LITERAL_LENGTH, length $token->content() );
79              
80             } else {
81              
82             # Take everything else at face value
83 1         10 $length += length $token->content();
84              
85             }
86              
87             }
88              
89 19 100       198 return if $self->{_max_characters} >= $length;
90              
91 6         44 return $self->violation( $DESC, $EXPL, $elem );
92             }
93              
94             1;
95              
96             __END__
97              
98             #-----------------------------------------------------------------------------
99              
100             =pod
101              
102             =for stopwords BNF Tatsuhiko Miyagawa
103              
104             =head1 NAME
105              
106             Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks.
107              
108              
109             =head1 AFFILIATION
110              
111             This Policy is part of the core L<Perl::Critic|Perl::Critic>
112             distribution.
113              
114              
115             =head1 DESCRIPTION
116              
117             Big regexps are hard to read, perhaps even the hardest part of Perl.
118             A good practice to write digestible chunks of regexp and put them
119             together. This policy flags any regexp that is longer than C<N>
120             characters, where C<N> is a configurable value that defaults to 60.
121             If the regexp uses the C<x> flag, then the length is computed after
122             parsing out any comments or whitespace.
123              
124             Unfortunately the use of descriptive (and therefore longish) variable
125             names can cause regexps to be in violation of this policy, so
126             interpolated variables are counted as 4 characters no matter how long
127             their names actually are.
128              
129              
130             =head1 CASE STUDY
131              
132             As an example, look at the regexp used to match email addresses in
133             L<Email::Valid::Loose|Email::Valid::Loose> (tweaked lightly to wrap
134             for POD)
135              
136             (?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]
137             \000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015
138             "]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[
139             \]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n
140             \015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,
141             ;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
142             )(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
143             -\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)
144              
145             which is constructed from the following code:
146              
147             my $esc = '\\\\';
148             my $period = '\.';
149             my $space = '\040';
150             my $open_br = '\[';
151             my $close_br = '\]';
152             my $nonASCII = '\x80-\xff';
153             my $ctrl = '\000-\037';
154             my $cr_list = '\n\015';
155             my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # "
156             my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
157             my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
158             my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# "
159             my $atom = qq<$atom_char+(?!$atom_char)>;
160             my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # "
161             my $word = qq<(?:$atom|$quoted_str)>;
162             my $domain_ref = $atom;
163             my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
164             my $sub_domain = qq<(?:$domain_ref|$domain_lit)>;
165             my $domain = qq<$sub_domain(?:$period$sub_domain)*>;
166             my $local_part = qq<$word(?:$word|$period)*>; # This part is modified
167             $Addr_spec_re = qr<$local_part\@$domain>;
168              
169             If you read the code from bottom to top, it is quite readable. And,
170             you can even see the one violation of RFC822 that Tatsuhiko Miyagawa
171             deliberately put into Email::Valid::Loose to allow periods. Look for
172             the C<|\.> in the upper regexp to see that same deviation.
173              
174             One could certainly argue that the top regexp could be re-written more
175             legibly with C<m//x> and comments. But the bottom version is
176             self-documenting and, for example, doesn't repeat C<\x80-\xff> 18
177             times. Furthermore, it's much easier to compare the second version
178             against the source BNF grammar in RFC 822 to judge whether the
179             implementation is sound even before running tests.
180              
181              
182             =head1 CONFIGURATION
183              
184             This policy allows regexps up to C<N> characters long, where C<N>
185             defaults to 60. You can override this to set it to a different number
186             with the C<max_characters> setting. To do this, put entries in a
187             F<.perlcriticrc> file like this:
188              
189             [RegularExpressions::ProhibitComplexRegexes]
190             max_characters = 40
191              
192              
193             =head1 CREDITS
194              
195             Initial development of this policy was supported by a grant from the
196             Perl Foundation.
197              
198              
199             =head1 AUTHOR
200              
201             Chris Dolan <cdolan@cpan.org>
202              
203              
204             =head1 COPYRIGHT
205              
206             Copyright (c) 2007-2011 Chris Dolan. Many rights reserved.
207              
208             This program is free software; you can redistribute it and/or modify
209             it under the same terms as Perl itself. The full text of this license
210             can be found in the LICENSE file included with this module
211              
212             =cut
213              
214             # Local Variables:
215             # mode: cperl
216             # cperl-indent-level: 4
217             # fill-column: 78
218             # indent-tabs-mode: nil
219             # c-indentation-style: bsd
220             # End:
221             # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :