File Coverage

blib/lib/Perl/Critic/Policy/RegularExpressions/ProhibitEmptyAlternatives.pm
Criterion Covered Total %
statement 77 79 97.4
branch 39 46 84.7
condition 30 39 76.9
subroutine 18 19 94.7
pod 4 5 80.0
total 168 188 89.3


line stmt bran cond sub pod time code
1             package Perl::Critic::Policy::RegularExpressions::ProhibitEmptyAlternatives;
2              
3 2     2   493149 use 5.006001;
  2         16  
4 2     2   11 use strict;
  2         4  
  2         33  
5 2     2   8 use warnings;
  2         4  
  2         63  
6              
7 2     2   12 use English qw{ -no_match_vars };
  2         10  
  2         15  
8 2     2   1799 use PPIx::Regexp 0.070; # For is_quantifier()
  2         223859  
  2         58  
9 2     2   16 use Readonly;
  2         5  
  2         103  
10              
11             use Perl::Critic::Exception::Configuration::Option::Policy::ParameterValue
12 2     2   13 qw{ throw_policy_value };
  2         5  
  2         55  
13 2     2   102 use Perl::Critic::Utils qw< :booleans :characters hashify :severities >;
  2         4  
  2         127  
14              
15 2     2   618 use base 'Perl::Critic::Policy';
  2         5  
  2         1055  
16              
17             our $VERSION = '0.005';
18             # The problem we are solving with the following is that older Perls do
19             # not like the underscore in a development version number. I do not
20             # believe this violates the spirit of the disabled policy.
21             $VERSION =~ s/ _ //smxg; ## no critic (RequireConstantVersion)
22              
23             #-----------------------------------------------------------------------------
24              
25             Readonly::Scalar my $DESC => q<Empty alternative>;
26             Readonly::Scalar my $EXPL => q<Empty alternatives always match>;
27              
28             Readonly::Scalar my $LAST_ELEMENT => -1;
29             Readonly::Scalar my $MAIN_CLASS => 'PPIx::Regexp::Structure::Main';
30             Readonly::Scalar my $NODE_CLASS => 'PPIx::Regexp::Node';
31             Readonly::Scalar my $OPERATOR_CLASS => 'PPIx::Regexp::Token::Operator';
32              
33             #-----------------------------------------------------------------------------
34              
35             sub supported_parameters { return (
36             {
37 10     10 0 53946 name => 'allow_empty_final_alternative',
38             description => 'Allow final alternative to be empty',
39             behavior => 'boolean',
40             default_string => '0',
41             },
42             {
43             name => 'allow_if_group_anchored',
44             description => 'Allow empty alternatives if the group is anchored on the right',
45             behavior => 'boolean',
46             default_string => '0',
47             },
48             {
49             name => 'ignore_files',
50             description => 'Ignore the specified files',
51             behavior => 'string',
52             parser => \&_make_ignore_regexp,
53             },
54             ) }
55              
56 11     11 1 126 sub default_severity { return $SEVERITY_MEDIUM }
57 0     0 1 0 sub default_themes { return qw< trw maintenance > }
58 9     9 1 50299 sub applies_to { return qw<
59             PPI::Token::Regexp::Match
60             PPI::Token::Regexp::Substitute
61             PPI::Token::QuoteLike::Regexp
62             > }
63              
64             #-----------------------------------------------------------------------------
65              
66             sub violates {
67 25     25 1 1394 my ( $self, $elem, $document ) = @_;
68              
69             # Ignore if told to do so.
70 25 100 66     81 if ( $self->{_ignore_files__re} &&
71             defined( my $logical_filename = $document->logical_filename() )
72             ) {
73             $logical_filename =~ $self->{_ignore_files__re}
74 1 50       80 and return;
75             }
76              
77             # Make a PPIx::Regexp from the PPI element for further analysis.
78 24 50       67 my $ppix = $document->ppix_regexp_from_element( $elem )
79             or return;
80              
81             # We are only interested in the regexp portion.
82 24 50       143659 my $re = $ppix->regular_expression()
83             or return;
84              
85 24 100       437 $self->_is_node_in_violation( $re )
86             or return;
87              
88 11         63 return $self->violation( $DESC, $EXPL, $elem );
89             }
90              
91             #-----------------------------------------------------------------------------
92              
93             # Analyze the given node. Return a true value if it represents a
94             # violation, and a false value otherwise.
95             sub _is_node_in_violation {
96 42     42   90 my ( $self, $node ) = @_;
97              
98 42 100       113 my @schildren = $node->schildren()
99             or return $FALSE; # No children, no empty alternatives.
100              
101 41         915 my $prev_is_alternation = $TRUE; # Assume just saw an alternation.
102 41         77 my $found_empty_alternative = $FALSE; # Have not found an empty one yet
103              
104 41         127 foreach my $kid ( @schildren ) {
105              
106 220 100 66     675 if ( $kid->isa( $OPERATOR_CLASS ) &&
107             $PIPE eq $kid->content() ) {
108             # $kid is an alternation operator
109 48   100     366 $found_empty_alternative ||= $prev_is_alternation;
110 48         100 $prev_is_alternation = $TRUE;
111             } else {
112 172 100 100     443 $kid->isa( $NODE_CLASS )
113             and $self->_is_node_in_violation( $kid )
114             and return $TRUE; # Found violation.
115             # $kid is something else
116 164         277 $prev_is_alternation = $FALSE;
117             }
118             }
119              
120             # At this point:
121             # $found_empty_alternative is true if at least one alternative
122             # before the last is empty;
123             # $prev_is_alternation is true if the last alternative is empty.
124              
125             # IF we found no empty alternatives THEN we are not in violation.
126             $found_empty_alternative
127 33 100 100     130 or $prev_is_alternation
128             or return $FALSE;
129              
130             # IF we are in an extended bracketed character class an empty
131             # alternative is a syntax error. So we call it a violation.
132 17 100       75 $node->in_regex_set()
133             and return $TRUE;
134              
135             # IF the last alternative is empty AND no other alternative is empty
136             # AND allow_empty_final_alternative is true THEN we are not in
137             # violation.
138             $prev_is_alternation
139             and not $found_empty_alternative
140             and $self->{_allow_empty_final_alternative}
141 14 100 100     536 and return $FALSE;
      100        
142              
143             # IF allow_if_group_anchored is true AND the group is in fact
144             # anchored THEN we are not in violation.
145             $self->{_allow_if_group_anchored}
146 13 100 100     47 and $self->_is_node_anchored( $node )
147             and return $FALSE;
148              
149             # We have exhausted all appeals
150 8         33 return $TRUE;
151             }
152              
153             #-----------------------------------------------------------------------------
154              
155             Readonly::Hash my %ZERO_LENGTH_LOOKBEHIND => hashify( qw{
156             ?<! *nlb: *negative_lookbehind: ?<= *plb: *positive_lookbehind:
157             } );
158              
159             sub _is_node_anchored {
160 8     8   18 my ( $self, $node ) = @_;
161 8         15 my $elem = $node;
162              
163 8   66     28 while ( $elem = $elem->snext_sibling() || $elem->parent() ) {
164              
165             # If $elem is a main structure we must terminate in failure,
166             # since anything beyond can not be an anchor.
167 13 100       753 $elem->isa( $MAIN_CLASS )
168             and return $FALSE;
169              
170             # If $elem is an alternation operator we need to skip to the end
171             # of the group.
172 10 100 66     35 if ( $elem->isa( $OPERATOR_CLASS ) &&
173             $PIPE eq $elem->content() ) {
174 1         10 $elem = _last_ssibling( $elem );
175 1         23 next;
176             }
177              
178             # If $is_matcher is undef it means we can not determine whether
179             # $elem is a matcher or not. It is (or at least used to be) the
180             # policy to prefer false negatives over false positives, so if
181             # we get undef we assume the empty alternation is anchored.
182 9         13 my $is_matcher;
183 9 50       34 defined( $is_matcher = $elem->is_matcher() )
184             or return $TRUE; # Assume anchored.
185              
186             # If $is_matcher is defined but false it means we are something
187             # that does not actually do matching -- say, an operator,
188             # something that does control like \Q, or some such. In this
189             # case we keep looking for matchers.
190 9 100       61 not $is_matcher
191             and next;
192              
193             # If the element can be quantified to zero it is not a
194             # suitable anchor, but maybe something beyond it is.
195 7 100       18 _maybe_quantified_to_zero( $elem )
196             and next;
197              
198             # A zero-length lookbehind does not provide a suitable
199             # anchor. Look some more.
200             $elem->isa( 'PPIx::Regexp::Structure::Assertion' )
201 5 50 33     217 and $ZERO_LENGTH_LOOKBEHIND{ $elem->content() }
202             and next;
203              
204             # At this point some hand-waving occurs.
205              
206             # What I believe we have here is one of the following:
207             # * An assertion;
208             # * A character class;
209             # * A literal;
210             # * A reference; or
211             # * A group.
212             #
213             # All but the last two should be OK.
214             #
215             # The reference is problematic because since Perl 5.10 it is not
216             # possible to unambiguously identify what a reference refers to.
217             # There can be more than one capture of a given name, and
218             # without actually running the regexp against the actual string
219             # we can't realy know which one(s) actually captured something.
220             # Numbered captures would be better, except that numbers are
221             # duplicated inside a branch reset.
222             #
223             # Groups can in principal be analyzed, but whether they can all
224             # be analyzed adequately is another question.
225             #
226             # In practice what we do is punt using the aforementioned
227             # "prefer false negatives" convention.
228              
229 5         28 return $TRUE; # Anchored.
230             }
231              
232             # We hit the end of the regex without finding a suitable anchor.
233 0         0 return $FALSE; # Not anchored.
234             }
235              
236             #-----------------------------------------------------------------------------
237              
238             # Return the last significant sibling of the given element. This may be
239             # the element passed in.
240             sub _last_ssibling {
241 1     1   3 my ( $elem ) = @_;
242 1 50       7 my $parent = $elem->parent()
243             or return $elem;
244 1   33     20 return $parent->schild( $LAST_ELEMENT ) || $elem;
245             }
246              
247             #-----------------------------------------------------------------------------
248              
249             # Custom parser for the ignore_files configuration item. The regexp
250             # ends up in {_ignore_files__re}.
251             sub _make_ignore_regexp {
252 10     10   11695 my ( $self, $parameter, $config_string ) = @_;
253 10 100 66     44 if ( defined $config_string && $EMPTY ne $config_string ) {
254 1 50       4 $self->{_ignore_files__re} = eval {
255 1         20 qr<$config_string>; ## no critic (RequireDotMatchAnything,RequireExtendedFormatting,RequireLineBoundaryMatching)
256             } or throw_policy_value
257             policy => $self->get_short_name(),
258             option_name => $parameter->get_name(),
259             option_value => $config_string,
260             message_suffix => "failed to parse: $EVAL_ERROR",
261             ;
262             }
263 10         23 return;
264             }
265              
266             #-----------------------------------------------------------------------------
267              
268             # Return true if the given element is quantified AND 0 is an allowed
269             # quantity. In practice this means quantifiers *, ?, {0}, {0,...}
270             sub _maybe_quantified_to_zero {
271 7     7   16 my ( $elem ) = @_;
272 7 100       67 my $quant = $elem->snext_sibling()
273             or return $FALSE;
274 3 100       144 $quant->is_quantifier()
275             or return $FALSE;
276 2         12 local $_ = $quant->content();
277 2         19 return m/ \A (?: [*?] \z | [{] 0+ [,}] ) /smx;
278             }
279              
280             #-----------------------------------------------------------------------------
281              
282             1;
283              
284             __END__
285              
286             #-----------------------------------------------------------------------------
287              
288             =pod
289              
290             =head1 NAME
291              
292             Perl::Critic::Policy::RegularExpressions::ProhibitEmptyAlternatives - Beware empty alternatives, because they always match.
293              
294             =head1 AFFILIATION
295              
296             This Policy is stand-alone, and is not part of the core
297             L<Perl::Critic|Perl::Critic>.
298              
299             =head1 DESCRIPTION
300              
301             This L<Perl::Critic|Perl::Critic> policy checks for empty alternatives;
302             that is, things like C</a||b/>. The problem with these is that they
303             always match, which is very probably not what you want.
304              
305             The possible exception is the final alternative, where you may indeed
306             want something like C</glass(?es|y|)/> to match C<'glass'>, C<'glassy'>,
307             or C<'glasses'>, though this is not the usual idiom. This policy does
308             not allow empty final alternatives by default, but it can be configured
309             to do so.
310              
311             B<Note> that empty alternatives are syntax errors in extended bracketed
312             character classes, so this policy treats them as violations no matter
313             how it is configured.
314              
315             This policy was inspired by y's
316             L<https://github.com/Perl-Critic/Perl-Critic/issues/727>.
317              
318             =head1 CONFIGURATION
319              
320             This policy supports the following configuration items.
321              
322             =head2 allow_empty_final_alternative
323              
324             By default, this policy prohibits all empty alternatives, since they
325             match anything. It may make sense, though, to leave the final
326             alternative in a regexp or group empty. For example,
327             C</(?:Larry|Moe|Curly|)/> is equivalent to the perhaps-more-usual idiom
328             C</(?:Larry|Moe|Curly)?/>.
329              
330             If you wish to allow this, you can add a block like this to your
331             F<.perlcriticrc> file:
332              
333             [RegularExpressions::ProhibitEmptyAlternatives]
334             allow_empty_final_alternative = 1
335              
336             =head2 allow_if_group_anchored
337              
338             It may make sense to allow empty alternatives if they occur in a group
339             that is anchored on the right. For example,
340              
341             "What ho, Porthos!" =~ /(|Athos|Porthos|Aramis)!/
342              
343             captures C<'Porthos'> because the regular expression engine sees
344             C<'Porthos!'> before it sees C<'!'>.
345              
346             If you wish to allow this, you can add a block like this to your
347             F<.perlcriticrc> file:
348              
349             [RegularExpressions::ProhibitEmptyAlternatives]
350             allow_if_group_anchored = 1
351              
352             B<Caveat:> I believe that a full static analysis of this case is not
353             possible when back references or recursions must be considered as
354             anchors. Correct analysis of groups (captures or otherwise) is not
355             currently attempted. In these cases the code assumes that the
356             entity represents an anchor.
357              
358             =head2 ignore_files
359              
360             It may make sense to ignore some files. For example,
361             L<Module::Install|Module::Install> component
362             F<inc/Module/Install/Metadata.pm> is known to violate this policy, at
363             least in its default configuration -- though it passes if
364             C<allow_empty_final_alternative> is enabled.
365              
366             If you wish to ignore certain files, you can add a block like this to
367             your F<.perlcriticrc> file:
368              
369             [RegularExpressions::ProhibitEmptyAlternatives]
370             allow_if_group_anchored = inc/Module/Install/Metadata\.pm\z
371              
372             The value is a regular expression.
373              
374             =head1 SUPPORT
375              
376             Support is by the author. Please file bug reports at
377             L<https://rt.cpan.org/Public/Dist/Display.html?Name=Perl-Critic-Policy-RegularExpressions-ProhibitEmptyAlternatives>,
378             L<https://github.com/trwyant/perl-Perl-Critic-Policy-RegularExpressions-ProhibitEmptyAlternatives/issues>, or in
379             electronic mail to the author.
380              
381             =head1 AUTHOR
382              
383             Thomas R. Wyant, III F<wyant at cpan dot org>
384              
385             =head1 COPYRIGHT
386              
387             Copyright (C) 2020-2021 Thomas R. Wyant, III
388              
389             =head1 LICENSE
390              
391             This program is free software; you can redistribute it and/or modify it
392             under the same terms as Perl 5.10.0. For more details, see the full text
393             of the licenses in the directory LICENSES.
394              
395             This program is distributed in the hope that it will be useful, but
396             without any warranty; without even the implied warranty of
397             merchantability or fitness for a particular purpose.
398              
399             =cut
400              
401             # Local Variables:
402             # mode: cperl
403             # cperl-indent-level: 4
404             # fill-column: 72
405             # indent-tabs-mode: nil
406             # c-indentation-style: bsd
407             # End:
408             # ex: set ts=8 sts=4 sw=4 tw=72 ft=perl expandtab shiftround :