File Coverage

blib/lib/Perl/Tidy/Tokenizer.pm
Criterion Covered Total %
statement 2330 3195 72.9
branch 1118 1692 66.0
condition 516 874 59.0
subroutine 160 191 83.7
pod 0 146 0.0
total 4124 6098 67.6


line stmt bran cond sub pod time code
1             #####################################################################
2             #
3             # The Perl::Tidy::Tokenizer package is essentially a filter which
4             # reads lines of perl source code from a source object and provides
5             # corresponding tokenized lines through its get_line() method. Lines
6             # flow from the source_object to the caller like this:
7             #
8             # source_object --> Tokenizer --> calling routine
9             # get_line() get_line() line_of_tokens
10             #
11             # The source object can be a STRING ref, an ARRAY ref, or an object with a
12             # get_line() method which supplies one line (a character string) perl call.
13             # The Tokenizer returns a reference to a data structure 'line_of_tokens'
14             # containing one tokenized line for each call to its get_line() method.
15             #
16             # NOTE: This is not a real class. Only one tokenizer my be used.
17             #
18             ########################################################################
19              
20             package Perl::Tidy::Tokenizer;
21 39     39   307 use strict;
  39         147  
  39         1330  
22 39     39   254 use warnings;
  39         101  
  39         1153  
23 39     39   206 use English qw( -no_match_vars );
  39         105  
  39         218  
24              
25             our $VERSION = '20230912';
26              
27 39     39   14536 use Carp;
  39         130  
  39         2290  
28              
29 39     39   289 use constant DEVEL_MODE => 0;
  39         107  
  39         2572  
30 39     39   279 use constant EMPTY_STRING => q{};
  39         121  
  39         2593  
31 39     39   263 use constant SPACE => q{ };
  39         98  
  39         2483  
32              
33             # Decimal values of some ascii characters for quick checks
34 39     39   315 use constant ORD_TAB => 9;
  39         83  
  39         2105  
35 39     39   284 use constant ORD_SPACE => 32;
  39         124  
  39         2213  
36 39     39   280 use constant ORD_PRINTABLE_MIN => 33;
  39         95  
  39         2140  
37 39     39   259 use constant ORD_PRINTABLE_MAX => 126;
  39         108  
  39         8817  
38              
39             # GLOBAL VARIABLES which change during tokenization:
40             # These could also be stored in $self but it is more convenient and
41             # efficient to make them global lexical variables.
42             # INITIALIZER: sub prepare_for_a_new_file
43             my (
44              
45             $brace_depth,
46             $context,
47             $current_package,
48             $last_nonblank_block_type,
49             $last_nonblank_token,
50             $last_nonblank_type,
51             $next_sequence_number,
52             $paren_depth,
53             $rbrace_context,
54             $rbrace_package,
55             $rbrace_structural_type,
56             $rbrace_type,
57             $rcurrent_depth,
58             $rcurrent_sequence_number,
59             $rdepth_array,
60             $ris_block_function,
61             $ris_block_list_function,
62             $ris_constant,
63             $ris_user_function,
64             $rnested_statement_type,
65             $rnested_ternary_flag,
66             $rparen_semicolon_count,
67             $rparen_vars,
68             $rparen_type,
69             $rsaw_function_definition,
70             $rsaw_use_module,
71             $rsquare_bracket_structural_type,
72             $rsquare_bracket_type,
73             $rstarting_line_of_current_depth,
74             $rtotal_depth,
75             $ruser_function_prototype,
76             $square_bracket_depth,
77             $statement_type,
78             $total_depth,
79             );
80              
81             my (
82              
83             # GLOBAL CONSTANTS for routines in this package,
84             # INITIALIZER: BEGIN block.
85             %can_start_digraph,
86             %expecting_operator_token,
87             %expecting_operator_types,
88             %expecting_term_token,
89             %expecting_term_types,
90             %is_block_operator,
91             %is_digraph,
92             %is_file_test_operator,
93             %is_if_elsif_unless,
94             %is_if_elsif_unless_case_when,
95             %is_indirect_object_taker,
96             %is_keyword_rejecting_question_as_pattern_delimiter,
97             %is_keyword_rejecting_slash_as_pattern_delimiter,
98             %is_keyword_taking_list,
99             %is_keyword_taking_optional_arg,
100             %is_q_qq_qw_qx_qr_s_y_tr_m,
101             %is_q_qq_qx_qr_s_y_tr_m,
102             %is_semicolon_or_t,
103             %is_sort_map_grep,
104             %is_sort_map_grep_eval_do,
105             %is_tetragraph,
106             %is_trigraph,
107             %is_valid_token_type,
108             %other_line_endings,
109             %really_want_term,
110             @closing_brace_names,
111             @opening_brace_names,
112              
113             # GLOBAL CONSTANT hash lookup table of operator expected values
114             # INITIALIZER: BEGIN block
115             %op_expected_table,
116              
117             # GLOBAL VARIABLES which are constant after being configured.
118             # INITIALIZER: BEGIN block and modified by sub check_options
119             %is_code_block_token,
120             %is_keyword,
121             %is_my_our_state,
122             %is_package,
123              
124             # INITIALIZER: sub check_options
125             $code_skipping_pattern_begin,
126             $code_skipping_pattern_end,
127              
128             $rOpts_code_skipping,
129             $rOpts_code_skipping_begin,
130             $rOpts_starting_indentation_level,
131             $rOpts_indent_columns,
132             $rOpts_look_for_hash_bang,
133             $rOpts_look_for_autoloader,
134             $rOpts_look_for_selfloader,
135             $rOpts_trim_qw,
136             $rOpts_extended_syntax,
137             $rOpts_continuation_indentation,
138             $rOpts_outdent_labels,
139             $rOpts_maximum_level_errors,
140             $rOpts_maximum_unexpected_errors,
141              
142             $tabsize,
143             %is_END_DATA_format_sub,
144             %is_grep_alias,
145             %is_sub,
146             $guess_if_method,
147             );
148              
149             # possible values of operator_expected()
150 39     39   1581 use constant TERM => -1;
  39         99  
  39         2270  
151 39     39   269 use constant UNKNOWN => 0;
  39         86  
  39         2222  
152 39     39   285 use constant OPERATOR => 1;
  39         149  
  39         2227  
153              
154             # possible values of context
155 39     39   341 use constant SCALAR_CONTEXT => -1;
  39         151  
  39         2168  
156 39     39   278 use constant UNKNOWN_CONTEXT => 0;
  39         139  
  39         2172  
157 39     39   308 use constant LIST_CONTEXT => 1;
  39         155  
  39         2427  
158              
159             # Maximum number of little messages; probably need not be changed.
160 39     39   307 use constant MAX_NAG_MESSAGES => 6;
  39         101  
  39         7947  
161              
162 0         0 BEGIN {
163              
164             # Array index names for $self.
165             # Do not combine with other BEGIN blocks (c101).
166 39     39   284507 my $i = 0;
167             use constant {
168 39         18312 _rhere_target_list_ => $i++,
169             _in_here_doc_ => $i++,
170             _here_doc_target_ => $i++,
171             _here_quote_character_ => $i++,
172             _in_data_ => $i++,
173             _in_end_ => $i++,
174             _in_format_ => $i++,
175             _in_error_ => $i++,
176             _in_trouble_ => $i++,
177             _warning_count_ => $i++,
178             _html_tag_count_ => $i++,
179             _in_pod_ => $i++,
180             _in_skipped_ => $i++,
181             _in_attribute_list_ => $i++,
182             _in_quote_ => $i++,
183             _quote_target_ => $i++,
184             _line_start_quote_ => $i++,
185             _starting_level_ => $i++,
186             _know_starting_level_ => $i++,
187             _last_line_number_ => $i++,
188             _saw_perl_dash_P_ => $i++,
189             _saw_perl_dash_w_ => $i++,
190             _saw_use_strict_ => $i++,
191             _saw_v_string_ => $i++,
192             _saw_brace_error_ => $i++,
193             _hit_bug_ => $i++,
194             _look_for_autoloader_ => $i++,
195             _look_for_selfloader_ => $i++,
196             _saw_autoloader_ => $i++,
197             _saw_selfloader_ => $i++,
198             _saw_hash_bang_ => $i++,
199             _saw_end_ => $i++,
200             _saw_data_ => $i++,
201             _saw_negative_indentation_ => $i++,
202             _started_tokenizing_ => $i++,
203             _debugger_object_ => $i++,
204             _diagnostics_object_ => $i++,
205             _logger_object_ => $i++,
206             _unexpected_error_count_ => $i++,
207             _started_looking_for_here_target_at_ => $i++,
208             _nearly_matched_here_target_at_ => $i++,
209             _line_of_text_ => $i++,
210             _rlower_case_labels_at_ => $i++,
211             _maximum_level_ => $i++,
212             _true_brace_error_count_ => $i++,
213             _rOpts_ => $i++,
214             _rinput_lines_ => $i++,
215             _input_line_index_next_ => $i++,
216 39     39   290 };
  39         116  
217             } ## end BEGIN
218              
219             { ## closure for subs to count instances
220              
221             # methods to count instances
222             my $_count = 0;
223 0     0 0 0 sub get_count { return $_count; }
224 562     562   2495 sub _increment_count { return ++$_count }
225 562     562   1343 sub _decrement_count { return --$_count }
226             }
227              
228             sub DESTROY {
229 562     562   1415 my $self = shift;
230 562         3016 $self->_decrement_count();
231 562         8022 return;
232             }
233              
234             sub AUTOLOAD {
235              
236             # Catch any undefined sub calls so that we are sure to get
237             # some diagnostic information. This sub should never be called
238             # except for a programming error.
239 0     0   0 our $AUTOLOAD;
240 0 0       0 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
241 0         0 my ( $pkg, $fname, $lno ) = caller();
242 0         0 my $my_package = __PACKAGE__;
243 0         0 print {*STDERR} <<EOM;
  0         0  
244             ======================================================================
245             Error detected in package '$my_package', version $VERSION
246             Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
247             Called from package: '$pkg'
248             Called from File '$fname' at line '$lno'
249             This error is probably due to a recent programming change
250             ======================================================================
251             EOM
252 0         0 exit 1;
253             } ## end sub AUTOLOAD
254              
255             sub Die {
256 0     0 0 0 my ($msg) = @_;
257 0         0 Perl::Tidy::Die($msg);
258 0         0 croak "unexpected return from Perl::Tidy::Die";
259             }
260              
261             sub Fault {
262 0     0 0 0 my ( $self, $msg ) = @_;
263              
264             # This routine is called for errors that really should not occur
265             # except if there has been a bug introduced by a recent program change.
266             # Please add comments at calls to Fault to explain why the call
267             # should not occur, and where to look to fix it.
268 0         0 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
269 0         0 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
270 0         0 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
271 0         0 my $pkg = __PACKAGE__;
272              
273             # Catch potential error of Fault not called as a method
274 0         0 my $input_stream_name;
275 0 0       0 if ( !ref($self) ) {
276 0         0 $msg = "Fault not called as a method - please fix\n";
277 0 0 0     0 if ( $self && length($self) < 200 ) { $msg .= $self }
  0         0  
278 0         0 $self = undef;
279 0         0 $input_stream_name = "(UNKNOWN)";
280             }
281             else {
282 0         0 $input_stream_name = $self->get_input_stream_name();
283             }
284              
285 0         0 Die(<<EOM);
286             ==============================================================================
287             While operating on input stream with name: '$input_stream_name'
288             A fault was detected at line $line0 of sub '$subroutine1'
289             in file '$filename1'
290             which was called from line $line1 of sub '$subroutine2'
291             Message: '$msg'
292             This is probably an error introduced by a recent programming change.
293             $pkg reports VERSION='$VERSION'.
294             ==============================================================================
295             EOM
296              
297             # We shouldn't get here, but this return is to keep Perl-Critic from
298             # complaining.
299 0         0 return;
300             } ## end sub Fault
301              
302             sub make_code_skipping_pattern {
303 1120     1120 0 3122 my ( $rOpts, $opt_name, $default ) = @_;
304 1120         2549 my $param = $rOpts->{$opt_name};
305 1120 100       2899 if ( !$param ) { $param = $default }
  1118         2042  
306 1120         4634 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
307 1120 50       5015 if ( $param !~ /^#/ ) {
308 0         0 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
309             }
310 1120         3561 my $pattern = '^\s*' . $param . '\b';
311 1120 50       3362 if ( Perl::Tidy::Formatter::bad_pattern($pattern) ) {
312 0         0 Die(
313             "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
314             );
315             }
316 1120         3885 return $pattern;
317             } ## end sub make_code_skipping_pattern
318              
319             sub check_options {
320              
321             # Check Tokenizer parameters
322 560     560 0 1641 my $rOpts = shift;
323              
324 560         2221 %is_sub = ();
325 560         1876 $is_sub{'sub'} = 1;
326              
327 560         4258 %is_END_DATA_format_sub = (
328             '__END__' => 1,
329             '__DATA__' => 1,
330             'format' => 1,
331             'sub' => 1,
332             );
333              
334             # Install any aliases to 'sub'
335 560 100       2292 if ( $rOpts->{'sub-alias-list'} ) {
336              
337             # Note that any 'sub-alias-list' has been preprocessed to
338             # be a trimmed, space-separated list which includes 'sub'
339             # for example, it might be 'sub method fun'
340 3         31 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
341 3         17 foreach my $word (@sub_alias_list) {
342 11         30 $is_sub{$word} = 1;
343 11         24 $is_END_DATA_format_sub{$word} = 1;
344             }
345             }
346              
347             # Set global flag to say if we have to guess if bareword 'method' is
348             # a sub when 'method' is in %is_sub. This will be true unless:
349             # (1) the user entered 'method' as sub alias, or
350             # (2) the user set --use-feature=class
351             # In these two cases we can assume that 'method' is a sub alias.
352 560         1406 $guess_if_method = 1;
353 560 100       2168 if ( $is_sub{'method'} ) { $guess_if_method = 0 }
  2         7  
354              
355             #------------------------------------------------
356             # Update hash values for any -use-feature options
357             #------------------------------------------------
358              
359 560         1315 my $use_feature_class = 1;
360 560 50       2133 if ( $rOpts->{'use-feature'} ) {
361 0 0       0 if ( $rOpts->{'use-feature'} =~ /\bnoclass\b/ ) {
    0          
362 0         0 $use_feature_class = 0;
363             }
364             elsif ( $rOpts->{'use-feature'} =~ /\bclass\b/ ) {
365 0         0 $guess_if_method = 0;
366             }
367             else {
368             ## neither 'class' nor 'noclass' seen so use default
369             }
370             }
371              
372             # These are the main updates for this option. There are additional
373             # changes elsewhere, usually indicated with a comment 'rt145706'
374              
375             # Update hash values for use_feature=class, added for rt145706
376             # see 'perlclass.pod'
377              
378             # IMPORTANT: We are changing global hash values initially set in a BEGIN
379             # block. Values must be defined (true or false) for each of these new
380             # words whether true or false. Otherwise, programs using the module which
381             # change options between runs (such as test code) will have
382             # incorrect settings and fail.
383              
384             # There are 4 new keywords:
385              
386             # 'class' - treated specially as generalization of 'package'
387             # Note: we must not set 'class' to be a keyword to avoid problems
388             # with older uses.
389 560         2014 $is_package{'class'} = $use_feature_class;
390              
391             # 'method' - treated like sub using the sub-alias-list option
392             # Note: we must not set 'method' to be a keyword to avoid problems
393             # with older uses.
394 560 50       1886 if ($use_feature_class) {
395 560         1565 $is_sub{'method'} = 1;
396 560         1433 $is_END_DATA_format_sub{'method'} = 1;
397             }
398              
399             # 'field' - added as a keyword, and works like 'my'
400 560         1680 $is_keyword{'field'} = $use_feature_class;
401 560         1603 $is_my_our_state{'field'} = $use_feature_class;
402              
403             # 'ADJUST' - added as a keyword and works like 'BEGIN'
404             # TODO: if ADJUST gets a paren list, this will need to be updated
405 560         1363 $is_keyword{'ADJUST'} = $use_feature_class;
406 560         1431 $is_code_block_token{'ADJUST'} = $use_feature_class;
407              
408 560         1984 %is_grep_alias = ();
409 560 50       1939 if ( $rOpts->{'grep-alias-list'} ) {
410              
411             # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
412             # space-separated list
413 560         5512 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
414 560         5176 @{is_grep_alias}{@q} = (1) x scalar(@q);
415             }
416              
417 560         2022 $rOpts_starting_indentation_level = $rOpts->{'starting-indentation-level'};
418 560         1614 $rOpts_indent_columns = $rOpts->{'indent-columns'};
419 560         1348 $rOpts_look_for_hash_bang = $rOpts->{'look-for-hash-bang'};
420 560         1296 $rOpts_look_for_autoloader = $rOpts->{'look-for-autoloader'};
421 560         1259 $rOpts_look_for_selfloader = $rOpts->{'look-for-selfloader'};
422 560         1269 $rOpts_trim_qw = $rOpts->{'trim-qw'};
423 560         1254 $rOpts_extended_syntax = $rOpts->{'extended-syntax'};
424 560         1252 $rOpts_continuation_indentation = $rOpts->{'continuation-indentation'};
425 560         1207 $rOpts_outdent_labels = $rOpts->{'outdent-labels'};
426 560         1221 $rOpts_maximum_level_errors = $rOpts->{'maximum-level-errors'};
427 560         1264 $rOpts_maximum_unexpected_errors = $rOpts->{'maximum-unexpected-errors'};
428 560         1214 $rOpts_code_skipping = $rOpts->{'code-skipping'};
429 560         1241 $rOpts_code_skipping_begin = $rOpts->{'code-skipping-begin'};
430              
431             # In the Tokenizer, --indent-columns is just used for guessing old
432             # indentation, and must be positive. If -i=0 is used for this run (which
433             # is possible) we'll just guess that the old run used 4 spaces per level.
434 560 100       2028 if ( !$rOpts_indent_columns ) { $rOpts_indent_columns = 4 }
  12         28  
435              
436             # Define $tabsize, the number of spaces per tab for use in
437             # guessing the indentation of source lines with leading tabs.
438             # Assume same as for this run if tabs are used, otherwise assume
439             # a default value, typically 8
440             $tabsize =
441             $rOpts->{'entab-leading-whitespace'}
442             ? $rOpts->{'entab-leading-whitespace'}
443             : $rOpts->{'tabs'} ? $rOpts->{'indent-columns'}
444 560 50       2769 : $rOpts->{'default-tabsize'};
    100          
445 560 50       2048 if ( !$tabsize ) { $tabsize = 8 }
  0         0  
446              
447             $code_skipping_pattern_begin =
448 560         2699 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
449 560         2045 $code_skipping_pattern_end =
450             make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
451              
452 560         2001 return;
453             } ## end sub check_options
454              
455             sub new {
456              
457 562     562 0 3192 my ( $class, @args ) = @_;
458              
459 562         5657 my %defaults = (
460             source_object => undef,
461             debugger_object => undef,
462             diagnostics_object => undef,
463             logger_object => undef,
464             starting_level => undef,
465             starting_line_number => 1,
466             rOpts => {},
467             );
468 562         4155 my %args = ( %defaults, @args );
469              
470             # we are given an object with a get_line() method to supply source lines
471 562         2177 my $source_object = $args{source_object};
472 562         1491 my $rOpts = $args{rOpts};
473              
474             # Check call args
475 562 50       2166 if ( !defined($source_object) ) {
476 0         0 Die(
477             "Perl::Tidy::Tokenizer::new called without a 'source_object' parameter\n"
478             );
479             }
480 562 50       2365 if ( !ref($source_object) ) {
481 0         0 Die(<<EOM);
482             sub Perl::Tidy::Tokenizer::new received a 'source_object' parameter which is not a reference;
483             'source_object' must be a reference to a STRING, ARRAY, or object with a 'getline' method
484             EOM
485             }
486              
487             # Tokenizer state data is as follows:
488             # _rhere_target_list_ reference to list of here-doc targets
489             # _here_doc_target_ the target string for a here document
490             # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
491             # to determine if interpolation is done
492             # _quote_target_ character we seek if chasing a quote
493             # _line_start_quote_ line where we started looking for a long quote
494             # _in_here_doc_ flag indicating if we are in a here-doc
495             # _in_pod_ flag set if we are in pod documentation
496             # _in_skipped_ flag set if we are in a skipped section
497             # _in_error_ flag set if we saw severe error (binary in script)
498             # _in_trouble_ set if we saw a troublesome lexical like 'my sub s'
499             # _warning_count_ number of calls to logger sub warning
500             # _html_tag_count_ number of apparent html tags seen (indicates html)
501             # _in_data_ flag set if we are in __DATA__ section
502             # _in_end_ flag set if we are in __END__ section
503             # _in_format_ flag set if we are in a format description
504             # _in_attribute_list_ flag telling if we are looking for attributes
505             # _in_quote_ flag telling if we are chasing a quote
506             # _starting_level_ indentation level of first line
507             # _diagnostics_object_ place to write debugging information
508             # _unexpected_error_count_ error count used to limit output
509             # _lower_case_labels_at_ line numbers where lower case labels seen
510             # _hit_bug_ program bug detected
511              
512 562         1339 my $self = [];
513 562         1543 $self->[_rhere_target_list_] = [];
514 562         1524 $self->[_in_here_doc_] = 0;
515 562         1865 $self->[_here_doc_target_] = EMPTY_STRING;
516 562         1478 $self->[_here_quote_character_] = EMPTY_STRING;
517 562         1499 $self->[_in_data_] = 0;
518 562         1528 $self->[_in_end_] = 0;
519 562         1566 $self->[_in_format_] = 0;
520 562         1335 $self->[_in_error_] = 0;
521 562         1467 $self->[_in_trouble_] = 0;
522 562         1290 $self->[_warning_count_] = 0;
523 562         1387 $self->[_html_tag_count_] = 0;
524 562         1287 $self->[_in_pod_] = 0;
525 562         1392 $self->[_in_skipped_] = 0;
526 562         1364 $self->[_in_attribute_list_] = 0;
527 562         1319 $self->[_in_quote_] = 0;
528 562         1441 $self->[_quote_target_] = EMPTY_STRING;
529 562         1266 $self->[_line_start_quote_] = -1;
530 562         1330 $self->[_starting_level_] = $args{starting_level};
531 562         1888 $self->[_know_starting_level_] = defined( $args{starting_level} );
532 562         1877 $self->[_last_line_number_] = $args{starting_line_number} - 1;
533 562         1405 $self->[_saw_perl_dash_P_] = 0;
534 562         1219 $self->[_saw_perl_dash_w_] = 0;
535 562         1411 $self->[_saw_use_strict_] = 0;
536 562         1267 $self->[_saw_v_string_] = 0;
537 562         1268 $self->[_saw_brace_error_] = 0;
538 562         1212 $self->[_hit_bug_] = 0;
539 562         1315 $self->[_look_for_autoloader_] = $rOpts_look_for_autoloader;
540 562         1500 $self->[_look_for_selfloader_] = $rOpts_look_for_selfloader;
541 562         1318 $self->[_saw_autoloader_] = 0;
542 562         1626 $self->[_saw_selfloader_] = 0;
543 562         1227 $self->[_saw_hash_bang_] = 0;
544 562         1247 $self->[_saw_end_] = 0;
545 562         1186 $self->[_saw_data_] = 0;
546 562         1399 $self->[_saw_negative_indentation_] = 0;
547 562         1273 $self->[_started_tokenizing_] = 0;
548 562         1325 $self->[_debugger_object_] = $args{debugger_object};
549 562         1393 $self->[_diagnostics_object_] = $args{diagnostics_object};
550 562         1371 $self->[_logger_object_] = $args{logger_object};
551 562         1233 $self->[_unexpected_error_count_] = 0;
552 562         1212 $self->[_started_looking_for_here_target_at_] = 0;
553 562         1400 $self->[_nearly_matched_here_target_at_] = undef;
554 562         1396 $self->[_line_of_text_] = EMPTY_STRING;
555 562         1167 $self->[_rlower_case_labels_at_] = undef;
556 562         1292 $self->[_maximum_level_] = 0;
557 562         1304 $self->[_true_brace_error_count_] = 0;
558 562         1283 $self->[_rOpts_] = $rOpts;
559              
560 562         1254 bless $self, $class;
561              
562 562         3422 $self->prepare_for_a_new_file($source_object);
563 562         3268 $self->find_starting_indentation_level();
564              
565             # This is not a full class yet, so die if an attempt is made to
566             # create more than one object.
567              
568 562 50       2684 if ( _increment_count() > 1 ) {
569 0         0 confess
570             "Attempt to create more than 1 object in $class, which is not a true class yet\n";
571             }
572              
573 562         4259 return $self;
574              
575             } ## end sub new
576              
577             # Called externally
578             sub get_unexpected_error_count {
579 4     4 0 14 my ($self) = @_;
580 4         19 return $self->[_unexpected_error_count_];
581             }
582              
583             # Called externally
584             sub is_keyword {
585 2796     2796 0 5090 my ($str) = @_;
586 2796         10028 return $is_keyword{$str};
587             }
588              
589             #----------------------------------------------------------------
590             # Line input routines, previously handled by the LineBuffer class
591             #----------------------------------------------------------------
592             sub make_source_array {
593              
594 562     562 0 1745 my ( $self, $line_source_object ) = @_;
595              
596             # Convert the source into an array of lines
597 562         1464 my $rinput_lines = [];
598              
599 562         1589 my $rsource = ref($line_source_object);
600              
601 562 50       3677 if ( !$rsource ) {
    50          
    50          
602              
603             # shouldn't happen: this should have been checked in sub new
604 0         0 $self->Fault(<<EOM);
605             sub Perl::Tidy::Tokenizer::new received a 'source_object' parameter which is not a reference;
606             'source_object' must be a reference to a STRING, ARRAY, or object with a 'getline' method
607             EOM
608             }
609              
610             # handle an ARRAY ref
611             elsif ( $rsource eq 'ARRAY' ) {
612 0         0 $rinput_lines = $line_source_object;
613             }
614              
615             # handle a SCALAR ref
616             elsif ( $rsource eq 'SCALAR' ) {
617 562         1074 my @lines = split /^/, ${$line_source_object};
  562         9247  
618 562         2316 $rinput_lines = \@lines;
619             }
620              
621             # handle an object - must have a get_line method
622             else {
623 0         0 while ( my $line = $line_source_object->get_line() ) {
624 0         0 push( @{$rinput_lines}, $line );
  0         0  
625             }
626             }
627              
628 562         2139 $self->[_rinput_lines_] = $rinput_lines;
629 562         1566 $self->[_input_line_index_next_] = 0;
630 562         1390 return;
631             } ## end sub make_source_array
632              
633             sub peek_ahead {
634 1235     1235 0 3077 my ( $self, $buffer_index ) = @_;
635              
636             # look $buffer_index lines ahead of the current location without disturbing
637             # the input
638 1235         2259 my $line;
639 1235         2626 my $rinput_lines = $self->[_rinput_lines_];
640 1235         2687 my $line_index = $buffer_index + $self->[_input_line_index_next_];
641 1235 100       2156 if ( $line_index < @{$rinput_lines} ) {
  1235         3502  
642 1223         2701 $line = $rinput_lines->[$line_index];
643             }
644 1235         4880 return $line;
645             } ## end sub peek_ahead
646              
647             #-----------------------------------------
648             # interface to Perl::Tidy::Logger routines
649             #-----------------------------------------
650             sub warning {
651              
652 0     0 0 0 my ( $self, $msg ) = @_;
653              
654 0         0 my $logger_object = $self->[_logger_object_];
655 0         0 $self->[_warning_count_]++;
656 0 0       0 if ($logger_object) {
657 0         0 my $msg_line_number = $self->[_last_line_number_];
658 0         0 $logger_object->warning( $msg, $msg_line_number );
659             }
660 0         0 return;
661             } ## end sub warning
662              
663             sub get_input_stream_name {
664              
665 0     0 0 0 my $self = shift;
666              
667 0         0 my $input_stream_name = EMPTY_STRING;
668 0         0 my $logger_object = $self->[_logger_object_];
669 0 0       0 if ($logger_object) {
670 0         0 $input_stream_name = $logger_object->get_input_stream_name();
671             }
672 0         0 return $input_stream_name;
673             } ## end sub get_input_stream_name
674              
675             sub complain {
676              
677 32     32 0 105 my ( $self, $msg ) = @_;
678              
679 32         84 my $logger_object = $self->[_logger_object_];
680 32 50       112 if ($logger_object) {
681 32         66 my $input_line_number = $self->[_last_line_number_];
682 32         179 $logger_object->complain( $msg, $input_line_number );
683             }
684 32         95 return;
685             } ## end sub complain
686              
687             sub write_logfile_entry {
688              
689 1860     1860 0 5072 my ( $self, $msg ) = @_;
690              
691 1860         3638 my $logger_object = $self->[_logger_object_];
692 1860 100       4637 if ($logger_object) {
693 1854         5576 $logger_object->write_logfile_entry($msg);
694             }
695 1860         5933 return;
696             } ## end sub write_logfile_entry
697              
698             sub interrupt_logfile {
699              
700 0     0 0 0 my $self = shift;
701              
702 0         0 my $logger_object = $self->[_logger_object_];
703 0 0       0 if ($logger_object) {
704 0         0 $logger_object->interrupt_logfile();
705             }
706 0         0 return;
707             } ## end sub interrupt_logfile
708              
709             sub resume_logfile {
710              
711 0     0 0 0 my $self = shift;
712              
713 0         0 my $logger_object = $self->[_logger_object_];
714 0 0       0 if ($logger_object) {
715 0         0 $logger_object->resume_logfile();
716             }
717 0         0 return;
718             } ## end sub resume_logfile
719              
720             sub brace_warning {
721 0     0 0 0 my ( $self, $msg ) = @_;
722 0         0 $self->[_saw_brace_error_]++;
723              
724 0         0 my $logger_object = $self->[_logger_object_];
725 0 0       0 if ($logger_object) {
726 0         0 my $msg_line_number = $self->[_last_line_number_];
727 0         0 $logger_object->brace_warning( $msg, $msg_line_number );
728             }
729 0         0 return;
730             } ## end sub brace_warning
731              
732             sub increment_brace_error {
733              
734             # This is same as sub brace_warning but without a message
735 0     0 0 0 my $self = shift;
736 0         0 $self->[_saw_brace_error_]++;
737              
738 0         0 my $logger_object = $self->[_logger_object_];
739 0 0       0 if ($logger_object) {
740 0         0 $logger_object->increment_brace_error();
741             }
742 0         0 return;
743             } ## end sub increment_brace_error
744              
745             sub get_saw_brace_error {
746 0     0 0 0 my $self = shift;
747 0         0 return $self->[_saw_brace_error_];
748             } ## end sub get_saw_brace_error
749              
750             sub report_definite_bug {
751 0     0 0 0 my $self = shift;
752 0         0 $self->[_hit_bug_] = 1;
753 0         0 my $logger_object = $self->[_logger_object_];
754 0 0       0 if ($logger_object) {
755 0         0 $logger_object->report_definite_bug();
756             }
757 0         0 return;
758             } ## end sub report_definite_bug
759              
760             #-------------------------------------
761             # Interface to Perl::Tidy::Diagnostics
762             #-------------------------------------
763             sub write_diagnostics {
764 0     0 0 0 my ( $self, $msg ) = @_;
765 0         0 my $input_line_number = $self->[_last_line_number_];
766 0         0 my $diagnostics_object = $self->[_diagnostics_object_];
767 0 0       0 if ($diagnostics_object) {
768 0         0 $diagnostics_object->write_diagnostics( $msg, $input_line_number );
769             }
770 0         0 return;
771             } ## end sub write_diagnostics
772              
773             sub report_tokenization_errors {
774              
775 562     562 0 1910 my ($self) = @_;
776              
777             # Report any tokenization errors and return a flag '$severe_error'.
778             # Set $severe_error = 1 if the tokenization errors are so severe that
779             # the formatter should not attempt to format the file. Instead, it will
780             # just output the file verbatim.
781              
782             # set severe error flag if tokenizer has encountered file reading problems
783             # (i.e. unexpected binary characters)
784             # or code which may not be formatted correctly (such as 'my sub q')
785             # The difference between _in_error_ and _in_trouble_ is that
786             # _in_error_ stops the tokenizer immediately whereas
787             # _in_trouble_ lets the tokenizer finish so that all errors are seen
788             # Both block formatting and cause the input stream to be output verbatim.
789 562   33     3616 my $severe_error = $self->[_in_error_] || $self->[_in_trouble_];
790              
791             # And do not format if it looks like an html file (c209)
792 562   33     4390 $severe_error ||= $self->[_html_tag_count_] && $self->[_warning_count_];
      33        
793              
794             # Inform the logger object on length of input stream
795 562         1884 my $logger_object = $self->[_logger_object_];
796 562 100       2076 if ($logger_object) {
797 560         1403 my $last_line_number = $self->[_last_line_number_];
798 560         2906 $logger_object->set_last_input_line_number($last_line_number);
799             }
800              
801 562         1485 my $maxle = $rOpts_maximum_level_errors;
802 562         1244 my $maxue = $rOpts_maximum_unexpected_errors;
803 562 50       1843 $maxle = 1 unless defined($maxle);
804 562 50       1658 $maxue = 0 unless defined($maxue);
805              
806 562         2432 my $level = get_indentation_level();
807 562 50       2393 if ( $level != $self->[_starting_level_] ) {
808 0         0 $self->warning("final indentation level: $level\n");
809 0         0 my $level_diff = $self->[_starting_level_] - $level;
810 0 0       0 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
  0         0  
811              
812             # Set severe error flag if the level error is greater than 1.
813             # The formatter can function for any level error but it is probably
814             # best not to attempt formatting for a high level error.
815 0 0 0     0 if ( $maxle >= 0 && $level_diff > $maxle ) {
816 0         0 $severe_error = 1;
817 0         0 $self->warning(<<EOM);
818             Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
819             EOM
820             }
821             }
822              
823 562         3213 $self->check_final_nesting_depths();
824              
825             # Likewise, large numbers of brace errors usually indicate non-perl
826             # scripts, so set the severe error flag at a low number. This is similar
827             # to the level check, but different because braces may balance but be
828             # incorrectly interlaced.
829 562 50       2855 if ( $self->[_true_brace_error_count_] > 2 ) {
830 0         0 $severe_error = 1;
831             }
832              
833 562 50 66     2662 if ( $rOpts_look_for_hash_bang
834             && !$self->[_saw_hash_bang_] )
835             {
836 0         0 $self->warning(
837             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
838             }
839              
840 562 50       1888 if ( $self->[_in_format_] ) {
841 0         0 $self->warning("hit EOF while in format description\n");
842             }
843              
844 562 50       2012 if ( $self->[_in_skipped_] ) {
845 0         0 $self->write_logfile_entry(
846             "hit EOF while in lines skipped with --code-skipping\n");
847             }
848              
849 562 50       2051 if ( $self->[_in_pod_] ) {
850              
851             # Just write log entry if this is after __END__ or __DATA__
852             # because this happens to often, and it is not likely to be
853             # a parsing error.
854 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
855 0         0 $self->write_logfile_entry(
856             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
857             );
858             }
859              
860             else {
861 0         0 $self->complain(
862             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
863             );
864             }
865              
866             }
867              
868 562 50       2043 if ( $self->[_in_here_doc_] ) {
869 0         0 $severe_error = 1;
870 0         0 my $here_doc_target = $self->[_here_doc_target_];
871 0         0 my $started_looking_for_here_target_at =
872             $self->[_started_looking_for_here_target_at_];
873 0 0       0 if ($here_doc_target) {
874 0         0 $self->warning(
875             "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
876             );
877             }
878             else {
879 0         0 $self->warning(<<EOM);
880             Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
881             (Perl will match to the end of file but this may not be intended).
882             EOM
883             }
884 0         0 my $nearly_matched_here_target_at =
885             $self->[_nearly_matched_here_target_at_];
886 0 0       0 if ($nearly_matched_here_target_at) {
887 0         0 $self->warning(
888             "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
889             );
890             }
891             }
892              
893             # Something is seriously wrong if we ended inside a quote
894 562 50       1966 if ( $self->[_in_quote_] ) {
895 0         0 $severe_error = 1;
896 0         0 my $line_start_quote = $self->[_line_start_quote_];
897 0         0 my $quote_target = $self->[_quote_target_];
898 0 0       0 my $what =
899             ( $self->[_in_attribute_list_] )
900             ? "attribute list"
901             : "quote/pattern";
902 0         0 $self->warning(
903             "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
904             );
905             }
906              
907 562 50       1891 if ( $self->[_hit_bug_] ) {
908 0         0 $severe_error = 1;
909             }
910              
911             # Multiple "unexpected" type tokenization errors usually indicate parsing
912             # non-perl scripts, or that something is seriously wrong, so we should
913             # avoid formatting them. This can happen for example if we run perltidy on
914             # a shell script or an html file. But unfortunately this check can
915             # interfere with some extended syntaxes, such as RPerl, so it has to be off
916             # by default.
917 562         1447 my $ue_count = $self->[_unexpected_error_count_];
918 562 50 33     2324 if ( $maxue > 0 && $ue_count > $maxue ) {
919 0         0 $self->warning(<<EOM);
920             Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
921             EOM
922 0         0 $severe_error = 1;
923             }
924              
925 562 100       1952 if ( !$self->[_saw_perl_dash_w_] ) {
926 546 50       2310 if ( $] < 5.006 ) {
927 0         0 $self->write_logfile_entry("Suggest including '-w parameter'\n");
928             }
929             else {
930 546         2006 $self->write_logfile_entry("Suggest including 'use warnings;'\n");
931             }
932             }
933              
934 562 50       3460 if ( $self->[_saw_perl_dash_P_] ) {
935 0         0 $self->write_logfile_entry(
936             "Use of -P parameter for defines is discouraged\n");
937             }
938              
939 562 100       2579 if ( !$self->[_saw_use_strict_] ) {
940 548         1707 $self->write_logfile_entry("Suggest including 'use strict;'\n");
941             }
942              
943             # it is suggested that labels have at least one upper case character
944             # for legibility and to avoid code breakage as new keywords are introduced
945 562 100       3575 if ( $self->[_rlower_case_labels_at_] ) {
946 12         38 my @lower_case_labels_at = @{ $self->[_rlower_case_labels_at_] };
  12         55  
947 12         47 $self->write_logfile_entry(
948             "Suggest using upper case characters in label(s)\n");
949 12         72 local $LIST_SEPARATOR = ')(';
950 12         99 $self->write_logfile_entry(
951             " defined at line(s): (@lower_case_labels_at)\n");
952             }
953 562         2156 return $severe_error;
954             } ## end sub report_tokenization_errors
955              
956             sub report_v_string {
957              
958             # warn if this version can't handle v-strings
959 2     2 0 8 my ( $self, $tok ) = @_;
960 2 50       13 if ( !$self->[_saw_v_string_] ) {
961 2         5 $self->[_saw_v_string_] = $self->[_last_line_number_];
962             }
963 2 50       10 if ( $] < 5.006 ) {
964 0         0 $self->warning(
965             "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
966             );
967             }
968 2         6 return;
969             } ## end sub report_v_string
970              
971             sub is_valid_token_type {
972 3     3 0 6 my ($type) = @_;
973 3         15 return $is_valid_token_type{$type};
974             }
975              
976             sub log_numbered_msg {
977 167     167 0 458 my ( $self, $msg ) = @_;
978              
979             # write input line number + message to logfile
980 167         350 my $input_line_number = $self->[_last_line_number_];
981 167         793 $self->write_logfile_entry("Line $input_line_number: $msg");
982 167         383 return;
983             } ## end sub log_numbered_msg
984              
985             sub get_line {
986              
987 8230     8230 0 15243 my $self = shift;
988              
989             # Read the next input line and tokenize it
990             # Returns:
991             # $line_of_tokens = ref to hash of info for the tokenized line
992              
993             # USES GLOBAL VARIABLES:
994             # $brace_depth, $square_bracket_depth, $paren_depth
995              
996             # get the next line from the input array
997 8230         11566 my $input_line;
998 8230         13964 my $rinput_lines = $self->[_rinput_lines_];
999 8230         12579 my $line_index = $self->[_input_line_index_next_];
1000 8230 100       12222 if ( $line_index < @{$rinput_lines} ) {
  8230         18590  
1001 7668         16174 $input_line = $rinput_lines->[ $line_index++ ];
1002 7668         12299 $self->[_input_line_index_next_] = $line_index;
1003             }
1004              
1005 8230         14061 $self->[_line_of_text_] = $input_line;
1006              
1007 8230 100       19902 return if ( !defined($input_line) );
1008              
1009 7668         12308 my $input_line_number = ++$self->[_last_line_number_];
1010              
1011             # Find and remove what characters terminate this line, including any
1012             # control r
1013 7668         11981 my $input_line_separator = EMPTY_STRING;
1014 7668 100       19789 if ( chomp($input_line) ) {
1015 7667         16826 $input_line_separator = $INPUT_RECORD_SEPARATOR;
1016             }
1017              
1018             # The first test here very significantly speeds things up, but be sure to
1019             # keep the regex and hash %other_line_endings the same.
1020 7668 100       23205 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
1021 24 50       238 if ( $input_line =~ s/([\r\035\032])+$// ) {
1022 24         81 $input_line_separator = $1 . $input_line_separator;
1023             }
1024             }
1025              
1026             # for backwards compatibility we keep the line text terminated with
1027             # a newline character
1028 7668         15772 $input_line .= "\n";
1029 7668         13311 $self->[_line_of_text_] = $input_line;
1030              
1031             # create a data structure describing this line which will be
1032             # returned to the caller.
1033              
1034             # _line_type codes are:
1035             # SYSTEM - system-specific code before hash-bang line
1036             # CODE - line of perl code (including comments)
1037             # POD_START - line starting pod, such as '=head'
1038             # POD - pod documentation text
1039             # POD_END - last line of pod section, '=cut'
1040             # HERE - text of here-document
1041             # HERE_END - last line of here-doc (target word)
1042             # FORMAT - format section
1043             # FORMAT_END - last line of format section, '.'
1044             # SKIP - code skipping section
1045             # SKIP_END - last line of code skipping section, '#>>V'
1046             # DATA_START - __DATA__ line
1047             # DATA - unidentified text following __DATA__
1048             # END_START - __END__ line
1049             # END - unidentified text following __END__
1050             # ERROR - we are in big trouble, probably not a perl script
1051              
1052             # Other variables:
1053             # _curly_brace_depth - depth of curly braces at start of line
1054             # _square_bracket_depth - depth of square brackets at start of line
1055             # _paren_depth - depth of parens at start of line
1056             # _starting_in_quote - this line continues a multi-line quote
1057             # (so don't trim leading blanks!)
1058             # _ending_in_quote - this line ends in a multi-line quote
1059             # (so don't trim trailing blanks!)
1060 7668         42755 my $line_of_tokens = {
1061             _line_type => 'EOF',
1062             _line_text => $input_line,
1063             _line_number => $input_line_number,
1064             _guessed_indentation_level => 0,
1065             _curly_brace_depth => $brace_depth,
1066             _square_bracket_depth => $square_bracket_depth,
1067             _paren_depth => $paren_depth,
1068             _quote_character => EMPTY_STRING,
1069             ## Skip these needless initializations for efficiency:
1070             ## _rtoken_type => undef,
1071             ## _rtokens => undef,
1072             ## _rlevels => undef,
1073             ## _rblock_type => undef,
1074             ## _rtype_sequence => undef,
1075             ## _rci_levels => undef,
1076             ## _starting_in_quote => 0,
1077             ## _ending_in_quote => 0,
1078             };
1079              
1080             # must print line unchanged if we are in a here document
1081 7668 100       40835 if ( $self->[_in_here_doc_] ) {
    100          
    100          
    100          
    50          
    100          
    100          
1082              
1083 24         91 $line_of_tokens->{_line_type} = 'HERE';
1084 24         73 my $here_doc_target = $self->[_here_doc_target_];
1085 24         56 my $here_quote_character = $self->[_here_quote_character_];
1086 24         49 my $candidate_target = $input_line;
1087 24         49 chomp $candidate_target;
1088              
1089             # Handle <<~ targets, which are indicated here by a leading space on
1090             # the here quote character
1091 24 100       97 if ( $here_quote_character =~ /^\s/ ) {
1092 4         15 $candidate_target =~ s/^\s*//;
1093             }
1094 24 100       80 if ( $candidate_target eq $here_doc_target ) {
1095 9         49 $self->[_nearly_matched_here_target_at_] = undef;
1096 9         26 $line_of_tokens->{_line_type} = 'HERE_END';
1097 9         56 $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
1098              
1099 9         35 my $rhere_target_list = $self->[_rhere_target_list_];
1100 9 50       41 if ( @{$rhere_target_list} ) { # there can be multiple here targets
  9         40  
1101             ( $here_doc_target, $here_quote_character ) =
1102 0         0 @{ shift @{$rhere_target_list} };
  0         0  
  0         0  
1103 0         0 $self->[_here_doc_target_] = $here_doc_target;
1104 0         0 $self->[_here_quote_character_] = $here_quote_character;
1105 0         0 $self->log_numbered_msg(
1106             "Entering HERE document $here_doc_target\n");
1107 0         0 $self->[_nearly_matched_here_target_at_] = undef;
1108 0         0 $self->[_started_looking_for_here_target_at_] =
1109             $input_line_number;
1110             }
1111             else {
1112 9         33 $self->[_in_here_doc_] = 0;
1113 9         24 $self->[_here_doc_target_] = EMPTY_STRING;
1114 9         28 $self->[_here_quote_character_] = EMPTY_STRING;
1115             }
1116             }
1117              
1118             # check for error of extra whitespace
1119             # note for PERL6: leading whitespace is allowed
1120             else {
1121 15         146 $candidate_target =~ s/\s*$//;
1122 15         78 $candidate_target =~ s/^\s*//;
1123 15 50       64 if ( $candidate_target eq $here_doc_target ) {
1124 0         0 $self->[_nearly_matched_here_target_at_] = $input_line_number;
1125             }
1126             }
1127 24         99 return $line_of_tokens;
1128             }
1129              
1130             # Print line unchanged if we are in a format section
1131             elsif ( $self->[_in_format_] ) {
1132              
1133 3 100       28 if ( $input_line =~ /^\.[\s#]*$/ ) {
1134              
1135             # Decrement format depth count at a '.' after a 'format'
1136 1         14 $self->[_in_format_]--;
1137              
1138             # This is the end when count reaches 0
1139 1 50       7 if ( !$self->[_in_format_] ) {
1140 1         7 $self->log_numbered_msg("Exiting format section\n");
1141 1         3 $line_of_tokens->{_line_type} = 'FORMAT_END';
1142              
1143             # Make the tokenizer mark an opening brace which follows
1144             # as a code block. Fixes issue c202/t032.
1145 1         4 $last_nonblank_token = ';';
1146 1         7 $last_nonblank_type = ';';
1147             }
1148             }
1149             else {
1150 2         6 $line_of_tokens->{_line_type} = 'FORMAT';
1151 2 50       10 if ( $input_line =~ /^\s*format\s+\w+/ ) {
1152              
1153             # Increment format depth count at a 'format' within a 'format'
1154             # This is a simple way to handle nested formats (issue c019).
1155 0         0 $self->[_in_format_]++;
1156             }
1157             }
1158 3         11 return $line_of_tokens;
1159             }
1160              
1161             # must print line unchanged if we are in pod documentation
1162             elsif ( $self->[_in_pod_] ) {
1163              
1164 47         114 $line_of_tokens->{_line_type} = 'POD';
1165 47 100       231 if ( $input_line =~ /^=cut/ ) {
1166 20         74 $line_of_tokens->{_line_type} = 'POD_END';
1167 20         80 $self->log_numbered_msg("Exiting POD section\n");
1168 20         54 $self->[_in_pod_] = 0;
1169             }
1170 47 50 33     149 if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
1171 0         0 $self->warning(
1172             "Hash-bang in pod can cause older versions of perl to fail! \n"
1173             );
1174             }
1175              
1176 47         181 return $line_of_tokens;
1177             }
1178              
1179             # print line unchanged if in skipped section
1180             elsif ( $self->[_in_skipped_] ) {
1181              
1182 8         22 $line_of_tokens->{_line_type} = 'SKIP';
1183 8 100       93 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
    50          
1184 2         11 $line_of_tokens->{_line_type} = 'SKIP_END';
1185 2         10 $self->log_numbered_msg("Exiting code-skipping section\n");
1186 2         8 $self->[_in_skipped_] = 0;
1187             }
1188             elsif ( $input_line =~ /$code_skipping_pattern_begin/ ) {
1189              
1190             # warn of duplicate starting comment lines, git #118
1191 0         0 my $lno = $self->[_in_skipped_];
1192 0         0 $self->warning(
1193             "Already in code-skipping section which started at line $lno\n"
1194             );
1195             }
1196             else {
1197             ## ok - not a code-skipping control line
1198             }
1199 8         32 return $line_of_tokens;
1200             }
1201              
1202             # must print line unchanged if we have seen a severe error (i.e., we
1203             # are seeing illegal tokens and cannot continue. Syntax errors do
1204             # not pass this route). Calling routine can decide what to do, but
1205             # the default can be to just pass all lines as if they were after __END__
1206             elsif ( $self->[_in_error_] ) {
1207 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1208 0         0 return $line_of_tokens;
1209             }
1210              
1211             # print line unchanged if we are __DATA__ section
1212             elsif ( $self->[_in_data_] ) {
1213              
1214             # ...but look for POD
1215             # Note that the _in_data and _in_end flags remain set
1216             # so that we return to that state after seeing the
1217             # end of a pod section
1218 1 50 33     9 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1219 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1220 0         0 $self->log_numbered_msg("Entering POD section\n");
1221 0         0 $self->[_in_pod_] = 1;
1222 0         0 return $line_of_tokens;
1223             }
1224             else {
1225 1         3 $line_of_tokens->{_line_type} = 'DATA';
1226 1         5 return $line_of_tokens;
1227             }
1228             }
1229              
1230             # print line unchanged if we are in __END__ section
1231             elsif ( $self->[_in_end_] ) {
1232              
1233             # ...but look for POD
1234             # Note that the _in_data and _in_end flags remain set
1235             # so that we return to that state after seeing the
1236             # end of a pod section
1237 48 100 66     273 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1238 6         34 $line_of_tokens->{_line_type} = 'POD_START';
1239 6         39 $self->log_numbered_msg("Entering POD section\n");
1240 6         34 $self->[_in_pod_] = 1;
1241 6         27 return $line_of_tokens;
1242             }
1243             else {
1244 42         75 $line_of_tokens->{_line_type} = 'END';
1245 42         119 return $line_of_tokens;
1246             }
1247             }
1248             else {
1249             ## ok
1250             }
1251              
1252             # check for a hash-bang line if we haven't seen one
1253 7537 100 100     34371 if ( !$self->[_saw_hash_bang_]
      66        
1254             && substr( $input_line, 0, 2 ) eq '#!'
1255             && $input_line =~ /^\#\!.*perl\b/ )
1256             {
1257 15         62 $self->[_saw_hash_bang_] = $input_line_number;
1258              
1259             # check for -w and -P flags
1260 15 50       100 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1261 0         0 $self->[_saw_perl_dash_P_] = 1;
1262             }
1263              
1264 15 100       95 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1265 8         24 $self->[_saw_perl_dash_w_] = 1;
1266             }
1267              
1268 15 100 33     138 if (
      66        
      100        
      66        
1269             $input_line_number > 1
1270              
1271             # leave any hash bang in a BEGIN block alone
1272             # i.e. see 'debugger-duck_type.t'
1273             && !(
1274             $last_nonblank_block_type
1275             && $last_nonblank_block_type eq 'BEGIN'
1276             )
1277             && !$rOpts_look_for_hash_bang
1278              
1279             # Try to avoid giving a false alarm at a simple comment.
1280             # These look like valid hash-bang lines:
1281              
1282             #!/usr/bin/perl -w
1283             #! /usr/bin/perl -w
1284             #!c:\perl\bin\perl.exe
1285              
1286             # These are comments:
1287             #! I love perl
1288             #! sunos does not yet provide a /usr/bin/perl
1289              
1290             # Comments typically have multiple spaces, which suggests
1291             # the filter
1292             && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1293             )
1294             {
1295              
1296             # this is helpful for VMS systems; we may have accidentally
1297             # tokenized some DCL commands
1298 1 50       5 if ( $self->[_started_tokenizing_] ) {
1299 0         0 $self->warning(
1300             "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1301             );
1302             }
1303             else {
1304 1         5 $self->complain("Useless hash-bang after line 1\n");
1305             }
1306             }
1307              
1308             # Report the leading hash-bang as a system line
1309             # This will prevent -dac from deleting it
1310             else {
1311 14         60 $line_of_tokens->{_line_type} = 'SYSTEM';
1312 14         97 return $line_of_tokens;
1313             }
1314             }
1315              
1316             # wait for a hash-bang before parsing if the user invoked us with -x
1317 7523 100 100     18648 if ( $rOpts_look_for_hash_bang
1318             && !$self->[_saw_hash_bang_] )
1319             {
1320 5         8 $line_of_tokens->{_line_type} = 'SYSTEM';
1321 5         18 return $line_of_tokens;
1322             }
1323              
1324             # a first line of the form ': #' will be marked as SYSTEM
1325             # since lines of this form may be used by tcsh
1326 7518 50 66     18877 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1327 0         0 $line_of_tokens->{_line_type} = 'SYSTEM';
1328 0         0 return $line_of_tokens;
1329             }
1330              
1331             # now we know that it is ok to tokenize the line...
1332             # the line tokenizer will modify any of these private variables:
1333             # _rhere_target_list_
1334             # _in_data_
1335             # _in_end_
1336             # _in_format_
1337             # _in_error_
1338             # _in_skipped_
1339             # _in_pod_
1340             # _in_quote_
1341 7518         23512 $self->tokenize_this_line($line_of_tokens);
1342              
1343             # Now finish defining the return structure and return it
1344 7518         15959 $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1345              
1346             # handle severe error (binary data in script)
1347 7518 50       17578 if ( $self->[_in_error_] ) {
1348 0         0 $self->[_in_quote_] = 0; # to avoid any more messages
1349 0         0 $self->warning("Giving up after error\n");
1350 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1351 0         0 reset_indentation_level(0); # avoid error messages
1352 0         0 return $line_of_tokens;
1353             }
1354              
1355             # handle start of pod documentation
1356 7518 100       15941 if ( $self->[_in_pod_] ) {
1357              
1358             # This gets tricky..above a __DATA__ or __END__ section, perl
1359             # accepts '=cut' as the start of pod section. But afterwards,
1360             # only pod utilities see it and they may ignore an =cut without
1361             # leading =head. In any case, this isn't good.
1362 14 50       83 if ( $input_line =~ /^=cut\b/ ) {
1363 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1364 0         0 $self->complain("=cut while not in pod ignored\n");
1365 0         0 $self->[_in_pod_] = 0;
1366 0         0 $line_of_tokens->{_line_type} = 'POD_END';
1367             }
1368             else {
1369 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1370 0         0 if ( !DEVEL_MODE ) {
1371 0         0 $self->warning(
1372             "=cut starts a pod section .. this can fool pod utilities.\n"
1373             );
1374             }
1375 0         0 $self->log_numbered_msg("Entering POD section\n");
1376             }
1377             }
1378              
1379             else {
1380 14         45 $line_of_tokens->{_line_type} = 'POD_START';
1381 14         54 $self->log_numbered_msg("Entering POD section\n");
1382             }
1383              
1384 14         71 return $line_of_tokens;
1385             }
1386              
1387             # handle start of skipped section
1388 7504 100       15048 if ( $self->[_in_skipped_] ) {
1389              
1390 2         12 $line_of_tokens->{_line_type} = 'SKIP';
1391 2         12 $self->log_numbered_msg("Entering code-skipping section\n");
1392 2         9 return $line_of_tokens;
1393             }
1394              
1395             # see if this line contains here doc targets
1396 7502         11986 my $rhere_target_list = $self->[_rhere_target_list_];
1397 7502 100       10457 if ( @{$rhere_target_list} ) {
  7502         16825  
1398              
1399             my ( $here_doc_target, $here_quote_character ) =
1400 9         61 @{ shift @{$rhere_target_list} };
  9         33  
  9         51  
1401 9         37 $self->[_in_here_doc_] = 1;
1402 9         27 $self->[_here_doc_target_] = $here_doc_target;
1403 9         28 $self->[_here_quote_character_] = $here_quote_character;
1404 9         74 $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1405 9         38 $self->[_started_looking_for_here_target_at_] = $input_line_number;
1406             }
1407              
1408             # NOTE: __END__ and __DATA__ statements are written unformatted
1409             # because they can theoretically contain additional characters
1410             # which are not tokenized (and cannot be read with <DATA> either!).
1411 7502 100       20544 if ( $self->[_in_data_] ) {
    100          
1412 1         4 $line_of_tokens->{_line_type} = 'DATA_START';
1413 1         6 $self->log_numbered_msg("Starting __DATA__ section\n");
1414 1         3 $self->[_saw_data_] = 1;
1415              
1416             # keep parsing after __DATA__ if use SelfLoader was seen
1417 1 50       3 if ( $self->[_saw_selfloader_] ) {
1418 0         0 $self->[_in_data_] = 0;
1419 0         0 $self->log_numbered_msg(
1420             "SelfLoader seen, continuing; -nlsl deactivates\n");
1421             }
1422              
1423 1         6 return $line_of_tokens;
1424             }
1425              
1426             elsif ( $self->[_in_end_] ) {
1427 6         31 $line_of_tokens->{_line_type} = 'END_START';
1428 6         33 $self->log_numbered_msg("Starting __END__ section\n");
1429 6         16 $self->[_saw_end_] = 1;
1430              
1431             # keep parsing after __END__ if use AutoLoader was seen
1432 6 50       30 if ( $self->[_saw_autoloader_] ) {
1433 0         0 $self->[_in_end_] = 0;
1434 0         0 $self->log_numbered_msg(
1435             "AutoLoader seen, continuing; -nlal deactivates\n");
1436             }
1437 6         33 return $line_of_tokens;
1438             }
1439             else {
1440             ## ok: not in __END__ or __DATA__
1441             }
1442              
1443             # now, finally, we know that this line is type 'CODE'
1444 7495         14161 $line_of_tokens->{_line_type} = 'CODE';
1445              
1446             # remember if we have seen any real code
1447 7495 100 100     24100 if ( !$self->[_started_tokenizing_]
      100        
1448             && $input_line !~ /^\s*$/
1449             && $input_line !~ /^\s*#/ )
1450             {
1451 558         2411 $self->[_started_tokenizing_] = 1;
1452             }
1453              
1454 7495 100       15908 if ( $self->[_debugger_object_] ) {
1455 7         38 $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1456             }
1457              
1458             # Note: if keyword 'format' occurs in this line code, it is still CODE
1459             # (keyword 'format' need not start a line)
1460 7495 100       15580 if ( $self->[_in_format_] ) {
1461 1         22 $self->log_numbered_msg("Entering format section\n");
1462             }
1463              
1464 7495 100 100     28246 if ( $self->[_in_quote_]
    100 100        
1465             and ( $self->[_line_start_quote_] < 0 ) )
1466             {
1467              
1468             #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1469 49 100       391 if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1470 48         130 $self->[_line_start_quote_] = $input_line_number;
1471 48         341 $self->log_numbered_msg(
1472             "Start multi-line quote or pattern ending in $quote_target\n");
1473             }
1474             }
1475             elsif ( ( $self->[_line_start_quote_] >= 0 )
1476             && !$self->[_in_quote_] )
1477             {
1478 48         166 $self->[_line_start_quote_] = -1;
1479 48         196 $self->log_numbered_msg("End of multi-line quote or pattern\n");
1480             }
1481             else {
1482             ## ok
1483             }
1484              
1485             # we are returning a line of CODE
1486 7495         29868 return $line_of_tokens;
1487             } ## end sub get_line
1488              
1489             sub find_starting_indentation_level {
1490              
1491             # We need to find the indentation level of the first line of the
1492             # script being formatted. Often it will be zero for an entire file,
1493             # but if we are formatting a local block of code (within an editor for
1494             # example) it may not be zero. The user may specify this with the
1495             # -sil=n parameter but normally doesn't so we have to guess.
1496             #
1497 562     562 0 1814 my ($self) = @_;
1498 562         1433 my $starting_level = 0;
1499              
1500             # use value if given as parameter
1501 562 100       2813 if ( $self->[_know_starting_level_] ) {
    100          
1502 1         4 $starting_level = $self->[_starting_level_];
1503             }
1504              
1505             # if we know there is a hash_bang line, the level must be zero
1506             elsif ($rOpts_look_for_hash_bang) {
1507 1         3 $self->[_know_starting_level_] = 1;
1508             }
1509              
1510             # otherwise figure it out from the input file
1511             else {
1512 560         1530 my $line;
1513 560         1119 my $i = 0;
1514              
1515             # keep looking at lines until we find a hash bang or piece of code
1516             # ( or, for now, an =pod line)
1517 560         1311 my $msg = EMPTY_STRING;
1518 560         1178 my $in_code_skipping;
1519 560         2899 while ( $line = $self->peek_ahead( $i++ ) ) {
1520              
1521             # if first line is #! then assume starting level is zero
1522 871 100 100     5640 if ( $i == 1 && $line =~ /^\#\!/ ) {
1523 13         40 $starting_level = 0;
1524 13         43 last;
1525             }
1526              
1527             # ignore lines fenced off with code-skipping comments
1528 858 100       4602 if ( $line =~ /^\s*#/ ) {
1529 296 50       1129 if ( !$in_code_skipping ) {
1530 296 50 33     2988 if ( $rOpts_code_skipping
1531             && $line =~ /$code_skipping_pattern_begin/ )
1532             {
1533 0         0 $in_code_skipping = 1;
1534             }
1535             }
1536             else {
1537 0 0       0 if ( $line =~ /$code_skipping_pattern_end/ ) {
1538 0         0 $in_code_skipping = 0;
1539             }
1540             }
1541 296         1030 next;
1542             }
1543 562 50       2356 next if ($in_code_skipping);
1544              
1545 562 100       3712 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1546              
1547 544         2778 $starting_level = $self->guess_old_indentation_level($line);
1548 544         1374 last;
1549             }
1550 560         2662 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1551 560         3021 $self->write_logfile_entry("$msg");
1552             }
1553 562         2315 $self->[_starting_level_] = $starting_level;
1554 562         3659 reset_indentation_level($starting_level);
1555 562         1103 return;
1556             } ## end sub find_starting_indentation_level
1557              
1558             sub guess_old_indentation_level {
1559 544     544 0 1803 my ( $self, $line ) = @_;
1560              
1561             # Guess the indentation level of an input line.
1562             #
1563             # For the first line of code this result will define the starting
1564             # indentation level. It will mainly be non-zero when perltidy is applied
1565             # within an editor to a local block of code.
1566             #
1567             # This is an impossible task in general because we can't know what tabs
1568             # meant for the old script and how many spaces were used for one
1569             # indentation level in the given input script. For example it may have
1570             # been previously formatted with -i=7 -et=3. But we can at least try to
1571             # make sure that perltidy guesses correctly if it is applied repeatedly to
1572             # a block of code within an editor, so that the block stays at the same
1573             # level when perltidy is applied repeatedly.
1574             #
1575             # USES GLOBAL VARIABLES: (none)
1576 544         1190 my $level = 0;
1577              
1578             # find leading tabs, spaces, and any statement label
1579 544         1150 my $spaces = 0;
1580 544 50       4607 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1581              
1582             # If there are leading tabs, we use the tab scheme for this run, if
1583             # any, so that the code will remain stable when editing.
1584 544 100       2655 if ($1) { $spaces += length($1) * $tabsize }
  2         10  
1585              
1586 544 100       2379 if ($2) { $spaces += length($2) }
  79         301  
1587              
1588             # correct for outdented labels
1589 544 100 66     2744 if ( $3 && $rOpts_outdent_labels ) {
1590 1         6 $spaces += $rOpts_continuation_indentation;
1591             }
1592             }
1593              
1594 544         2118 $level = int( $spaces / $rOpts_indent_columns );
1595 544         1444 return ($level);
1596             } ## end sub guess_old_indentation_level
1597              
1598             # This is a currently unused debug routine
1599             sub dump_functions {
1600              
1601 0     0 0 0 my $fh = *STDOUT;
1602 0         0 foreach my $pkg ( keys %{$ris_user_function} ) {
  0         0  
1603 0         0 $fh->print("\nnon-constant subs in package $pkg\n");
1604              
1605 0         0 foreach my $sub ( keys %{ $ris_user_function->{$pkg} } ) {
  0         0  
1606 0         0 my $msg = EMPTY_STRING;
1607 0 0       0 if ( $ris_block_list_function->{$pkg}{$sub} ) {
1608 0         0 $msg = 'block_list';
1609             }
1610              
1611 0 0       0 if ( $ris_block_function->{$pkg}{$sub} ) {
1612 0         0 $msg = 'block';
1613             }
1614 0         0 $fh->print("$sub $msg\n");
1615             }
1616             }
1617              
1618 0         0 foreach my $pkg ( keys %{$ris_constant} ) {
  0         0  
1619 0         0 $fh->print("\nconstants and constant subs in package $pkg\n");
1620              
1621 0         0 foreach my $sub ( keys %{ $ris_constant->{$pkg} } ) {
  0         0  
1622 0         0 $fh->print("$sub\n");
1623             }
1624             }
1625 0         0 return;
1626             } ## end sub dump_functions
1627              
1628             sub prepare_for_a_new_file {
1629              
1630 562     562 0 1903 my ( $self, $source_object ) = @_;
1631              
1632             # copy the source object lines to an array of lines
1633 562         3019 $self->make_source_array($source_object);
1634              
1635             # previous tokens needed to determine what to expect next
1636 562         1767 $last_nonblank_token = ';'; # the only possible starting state which
1637 562         1573 $last_nonblank_type = ';'; # will make a leading brace a code block
1638 562         1398 $last_nonblank_block_type = EMPTY_STRING;
1639              
1640             # scalars for remembering statement types across multiple lines
1641 562         1371 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1642              
1643             # scalars for remembering where we are in the file
1644 562         1401 $current_package = "main";
1645 562         1140 $context = UNKNOWN_CONTEXT;
1646              
1647             # hashes used to remember function information
1648 562         2028 $ris_constant = {}; # user-defined constants
1649 562         1983 $ris_user_function = {}; # user-defined functions
1650 562         1632 $ruser_function_prototype = {}; # their prototypes
1651 562         1651 $ris_block_function = {};
1652 562         1645 $ris_block_list_function = {};
1653 562         1465 $rsaw_function_definition = {};
1654 562         1341 $rsaw_use_module = {};
1655              
1656             # variables used to track depths of various containers
1657             # and report nesting errors
1658 562         1176 $paren_depth = 0;
1659 562         1064 $brace_depth = 0;
1660 562         1086 $square_bracket_depth = 0;
1661 562         2535 $rcurrent_depth = [ (0) x scalar @closing_brace_names ];
1662 562         1277 $total_depth = 0;
1663 562         2224 $rtotal_depth = [];
1664 562         2071 $rcurrent_sequence_number = [];
1665 562         1252 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1666              
1667 562         1851 $rparen_type = [];
1668 562         1637 $rparen_semicolon_count = [];
1669 562         2421 $rparen_vars = [];
1670 562         1893 $rbrace_type = [];
1671 562         1726 $rbrace_structural_type = [];
1672 562         1489 $rbrace_context = [];
1673 562         1633 $rbrace_package = [];
1674 562         1598 $rsquare_bracket_type = [];
1675 562         1529 $rsquare_bracket_structural_type = [];
1676 562         3534 $rdepth_array = [];
1677 562         1306 $rnested_ternary_flag = [];
1678 562         3776 $rnested_statement_type = [];
1679 562         3253 $rstarting_line_of_current_depth = [];
1680              
1681 562         1820 $rparen_type->[$paren_depth] = EMPTY_STRING;
1682 562         1651 $rparen_semicolon_count->[$paren_depth] = 0;
1683 562         1561 $rparen_vars->[$paren_depth] = [];
1684 562         1657 $rbrace_type->[$brace_depth] = ';'; # identify opening brace as code block
1685 562         1542 $rbrace_structural_type->[$brace_depth] = EMPTY_STRING;
1686 562         1433 $rbrace_context->[$brace_depth] = UNKNOWN_CONTEXT;
1687 562         1331 $rbrace_package->[$paren_depth] = $current_package;
1688 562         1270 $rsquare_bracket_type->[$square_bracket_depth] = EMPTY_STRING;
1689 562         1475 $rsquare_bracket_structural_type->[$square_bracket_depth] = EMPTY_STRING;
1690              
1691 562         2804 initialize_tokenizer_state();
1692 562         1085 return;
1693             } ## end sub prepare_for_a_new_file
1694              
1695             { ## closure for sub tokenize_this_line
1696              
1697 39     39   396 use constant BRACE => 0;
  39         130  
  39         3022  
1698 39     39   323 use constant SQUARE_BRACKET => 1;
  39         130  
  39         2317  
1699 39     39   326 use constant PAREN => 2;
  39         138  
  39         3787  
1700 39     39   308 use constant QUESTION_COLON => 3;
  39         118  
  39         84547  
1701              
1702             # TV1: scalars for processing one LINE.
1703             # Re-initialized on each entry to sub tokenize_this_line.
1704             my (
1705             $block_type, $container_type, $expecting,
1706             $i, $i_tok, $input_line,
1707             $input_line_number, $last_nonblank_i, $max_token_index,
1708             $next_tok, $next_type, $peeked_ahead,
1709             $prototype, $rhere_target_list, $rtoken_map,
1710             $rtoken_type, $rtokens, $tok,
1711             $type, $type_sequence, $indent_flag,
1712             );
1713              
1714             # TV2: refs to ARRAYS for processing one LINE
1715             # Re-initialized on each call.
1716             my $routput_token_list = []; # stack of output token indexes
1717             my $routput_token_type = []; # token types
1718             my $routput_block_type = []; # types of code block
1719             my $routput_container_type = []; # paren types, such as if, elsif, ..
1720             my $routput_type_sequence = []; # nesting sequential number
1721             my $routput_indent_flag = []; #
1722              
1723             # TV3: SCALARS for quote variables. These are initialized with a
1724             # subroutine call and continually updated as lines are processed.
1725             my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1726             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1727              
1728             # TV4: SCALARS for multi-line identifiers and
1729             # statements. These are initialized with a subroutine call
1730             # and continually updated as lines are processed.
1731             my ( $id_scan_state, $identifier, $want_paren );
1732              
1733             # TV5: SCALARS for tracking indentation level.
1734             # Initialized once and continually updated as lines are
1735             # processed.
1736             my (
1737             $nesting_token_string, $nesting_block_string,
1738             $nesting_block_flag, $level_in_tokenizer,
1739             );
1740              
1741             # TV6: SCALARS for remembering several previous
1742             # tokens. Initialized once and continually updated as
1743             # lines are processed.
1744             my (
1745             $last_nonblank_container_type, $last_nonblank_type_sequence,
1746             $last_last_nonblank_token, $last_last_nonblank_type,
1747             $last_nonblank_prototype,
1748             );
1749              
1750             # ----------------------------------------------------------------
1751             # beginning of tokenizer variable access and manipulation routines
1752             # ----------------------------------------------------------------
1753              
1754             sub initialize_tokenizer_state {
1755              
1756             # GV1: initialized once
1757             # TV1: initialized on each call
1758             # TV2: initialized on each call
1759             # TV3:
1760 562     562 0 1230 $in_quote = 0;
1761 562         1388 $quote_type = 'Q';
1762 562         1277 $quote_character = EMPTY_STRING;
1763 562         1107 $quote_pos = 0;
1764 562         1135 $quote_depth = 0;
1765 562         1100 $quoted_string_1 = EMPTY_STRING;
1766 562         1251 $quoted_string_2 = EMPTY_STRING;
1767 562         1109 $allowed_quote_modifiers = EMPTY_STRING;
1768              
1769             # TV4:
1770 562         1101 $id_scan_state = EMPTY_STRING;
1771 562         1205 $identifier = EMPTY_STRING;
1772 562         1201 $want_paren = EMPTY_STRING;
1773              
1774             # TV5:
1775 562         1300 $nesting_token_string = EMPTY_STRING;
1776 562         1249 $nesting_block_string = '1'; # initially in a block
1777 562         1056 $nesting_block_flag = 1;
1778 562         1089 $level_in_tokenizer = 0;
1779              
1780             # TV6:
1781 562         1170 $last_nonblank_container_type = EMPTY_STRING;
1782 562         1188 $last_nonblank_type_sequence = EMPTY_STRING;
1783 562         1178 $last_last_nonblank_token = ';';
1784 562         1192 $last_last_nonblank_type = ';';
1785 562         1178 $last_nonblank_prototype = EMPTY_STRING;
1786 562         1075 return;
1787             } ## end sub initialize_tokenizer_state
1788              
1789             sub save_tokenizer_state {
1790              
1791             # Global variables:
1792 0     0 0 0 my $rGV1 = [
1793             $brace_depth,
1794             $context,
1795             $current_package,
1796             $last_nonblank_block_type,
1797             $last_nonblank_token,
1798             $last_nonblank_type,
1799             $next_sequence_number,
1800             $paren_depth,
1801             $rbrace_context,
1802             $rbrace_package,
1803             $rbrace_structural_type,
1804             $rbrace_type,
1805             $rcurrent_depth,
1806             $rcurrent_sequence_number,
1807             $rdepth_array,
1808             $ris_block_function,
1809             $ris_block_list_function,
1810             $ris_constant,
1811             $ris_user_function,
1812             $rnested_statement_type,
1813             $rnested_ternary_flag,
1814             $rparen_semicolon_count,
1815             $rparen_vars,
1816             $rparen_type,
1817             $rsaw_function_definition,
1818             $rsaw_use_module,
1819             $rsquare_bracket_structural_type,
1820             $rsquare_bracket_type,
1821             $rstarting_line_of_current_depth,
1822             $rtotal_depth,
1823             $ruser_function_prototype,
1824             $square_bracket_depth,
1825             $statement_type,
1826             $total_depth,
1827              
1828             ];
1829              
1830             # Tokenizer closure variables:
1831 0         0 my $rTV1 = [
1832             $block_type, $container_type, $expecting,
1833             $i, $i_tok, $input_line,
1834             $input_line_number, $last_nonblank_i, $max_token_index,
1835             $next_tok, $next_type, $peeked_ahead,
1836             $prototype, $rhere_target_list, $rtoken_map,
1837             $rtoken_type, $rtokens, $tok,
1838             $type, $type_sequence, $indent_flag,
1839             ];
1840              
1841 0         0 my $rTV2 = [
1842             $routput_token_list, $routput_token_type,
1843             $routput_block_type, $routput_container_type,
1844             $routput_type_sequence, $routput_indent_flag,
1845             ];
1846              
1847 0         0 my $rTV3 = [
1848             $in_quote, $quote_type,
1849             $quote_character, $quote_pos,
1850             $quote_depth, $quoted_string_1,
1851             $quoted_string_2, $allowed_quote_modifiers,
1852             ];
1853              
1854 0         0 my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1855              
1856 0         0 my $rTV5 = [
1857             $nesting_token_string, $nesting_block_string,
1858             $nesting_block_flag, $level_in_tokenizer,
1859             ];
1860              
1861 0         0 my $rTV6 = [
1862             $last_nonblank_container_type, $last_nonblank_type_sequence,
1863             $last_last_nonblank_token, $last_last_nonblank_type,
1864             $last_nonblank_prototype,
1865             ];
1866 0         0 return [ $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1867             } ## end sub save_tokenizer_state
1868              
1869             sub restore_tokenizer_state {
1870 0     0 0 0 my ($rstate) = @_;
1871 0         0 my ( $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
  0         0  
1872              
1873             (
1874             $brace_depth,
1875             $context,
1876             $current_package,
1877             $last_nonblank_block_type,
1878             $last_nonblank_token,
1879             $last_nonblank_type,
1880             $next_sequence_number,
1881             $paren_depth,
1882             $rbrace_context,
1883             $rbrace_package,
1884             $rbrace_structural_type,
1885             $rbrace_type,
1886             $rcurrent_depth,
1887             $rcurrent_sequence_number,
1888             $rdepth_array,
1889             $ris_block_function,
1890             $ris_block_list_function,
1891             $ris_constant,
1892             $ris_user_function,
1893             $rnested_statement_type,
1894             $rnested_ternary_flag,
1895             $rparen_semicolon_count,
1896             $rparen_vars,
1897             $rparen_type,
1898             $rsaw_function_definition,
1899             $rsaw_use_module,
1900             $rsquare_bracket_structural_type,
1901             $rsquare_bracket_type,
1902             $rstarting_line_of_current_depth,
1903             $rtotal_depth,
1904             $ruser_function_prototype,
1905             $square_bracket_depth,
1906             $statement_type,
1907             $total_depth,
1908              
1909 0         0 ) = @{$rGV1};
  0         0  
1910              
1911             (
1912             $block_type, $container_type, $expecting,
1913             $i, $i_tok, $input_line,
1914             $input_line_number, $last_nonblank_i, $max_token_index,
1915             $next_tok, $next_type, $peeked_ahead,
1916             $prototype, $rhere_target_list, $rtoken_map,
1917             $rtoken_type, $rtokens, $tok,
1918             $type, $type_sequence, $indent_flag,
1919 0         0 ) = @{$rTV1};
  0         0  
1920              
1921             (
1922             $routput_token_list, $routput_token_type,
1923             $routput_block_type, $routput_container_type,
1924             $routput_type_sequence, $routput_indent_flag,
1925 0         0 ) = @{$rTV2};
  0         0  
1926              
1927             (
1928             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1929             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1930 0         0 ) = @{$rTV3};
  0         0  
1931              
1932 0         0 ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
  0         0  
1933              
1934             (
1935             $nesting_token_string, $nesting_block_string,
1936             $nesting_block_flag, $level_in_tokenizer,
1937 0         0 ) = @{$rTV5};
  0         0  
1938              
1939             (
1940             $last_nonblank_container_type, $last_nonblank_type_sequence,
1941             $last_last_nonblank_token, $last_last_nonblank_type,
1942             $last_nonblank_prototype,
1943 0         0 ) = @{$rTV6};
  0         0  
1944 0         0 return;
1945             } ## end sub restore_tokenizer_state
1946              
1947             sub split_pretoken {
1948              
1949 8     8 0 18 my ( $self, $numc ) = @_;
1950              
1951             # Split the leading $numc characters from the current token (at index=$i)
1952             # which is pre-type 'w' and insert the remainder back into the pretoken
1953             # stream with appropriate settings. Since we are splitting a pre-type 'w',
1954             # there are three cases, depending on if the remainder starts with a digit:
1955             # Case 1: remainder is type 'd', all digits
1956             # Case 2: remainder is type 'd' and type 'w': digits and other characters
1957             # Case 3: remainder is type 'w'
1958              
1959             # Examples, for $numc=1:
1960             # $tok => $tok_0 $tok_1 $tok_2
1961             # 'x10' => 'x' '10' # case 1
1962             # 'x10if' => 'x' '10' 'if' # case 2
1963             # '0ne => 'O' 'ne' # case 3
1964              
1965             # where:
1966             # $tok_1 is a possible string of digits (pre-type 'd')
1967             # $tok_2 is a possible word (pre-type 'w')
1968              
1969             # return 1 if successful
1970             # return undef if error (shouldn't happen)
1971              
1972             # Calling routine should update '$type' and '$tok' if successful.
1973              
1974 8         21 my $pretoken = $rtokens->[$i];
1975 8 50 33     86 if ( $pretoken
      33        
1976             && length($pretoken) > $numc
1977             && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1978             {
1979              
1980             # Split $tok into up to 3 tokens:
1981 8         20 my $tok_0 = substr( $pretoken, 0, $numc );
1982 8 50       32 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1983 8 50       27 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1984              
1985 8         17 my $len_0 = length($tok_0);
1986 8         19 my $len_1 = length($tok_1);
1987 8         14 my $len_2 = length($tok_2);
1988              
1989 8         16 my $pre_type_0 = 'w';
1990 8         16 my $pre_type_1 = 'd';
1991 8         13 my $pre_type_2 = 'w';
1992              
1993 8         15 my $pos_0 = $rtoken_map->[$i];
1994 8         15 my $pos_1 = $pos_0 + $len_0;
1995 8         17 my $pos_2 = $pos_1 + $len_1;
1996              
1997 8         15 my $isplice = $i + 1;
1998              
1999             # Splice in any digits
2000 8 100       28 if ($len_1) {
2001 5         8 splice @{$rtoken_map}, $isplice, 0, $pos_1;
  5         19  
2002 5         11 splice @{$rtokens}, $isplice, 0, $tok_1;
  5         15  
2003 5         10 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
  5         16  
2004 5         7 $max_token_index++;
2005 5         9 $isplice++;
2006             }
2007              
2008             # Splice in any trailing word
2009 8 100       20 if ($len_2) {
2010 4         6 splice @{$rtoken_map}, $isplice, 0, $pos_2;
  4         14  
2011 4         6 splice @{$rtokens}, $isplice, 0, $tok_2;
  4         9  
2012 4         8 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
  4         8  
2013 4         7 $max_token_index++;
2014             }
2015              
2016 8         19 $rtokens->[$i] = $tok_0;
2017 8         31 return 1;
2018             }
2019             else {
2020              
2021             # Shouldn't get here
2022 0         0 if (DEVEL_MODE) {
2023             $self->Fault(<<EOM);
2024             While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
2025             EOM
2026             }
2027             }
2028 0         0 return;
2029             } ## end sub split_pretoken
2030              
2031             sub get_indentation_level {
2032 562     562 0 1603 return $level_in_tokenizer;
2033             }
2034              
2035             sub reset_indentation_level {
2036 562     562 0 1573 $level_in_tokenizer = shift;
2037 562         1208 return;
2038             }
2039              
2040             sub peeked_ahead {
2041 252     252 0 465 my $flag = shift;
2042 252 100       631 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
2043 252         927 return $peeked_ahead;
2044             }
2045              
2046             # ------------------------------------------------------------
2047             # end of tokenizer variable access and manipulation routines
2048             # ------------------------------------------------------------
2049              
2050             #------------------------------
2051             # beginning of tokenizer hashes
2052             #------------------------------
2053              
2054             my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
2055              
2056             # These block types terminate statements and do not need a trailing
2057             # semicolon
2058             # patched for SWITCH/CASE/
2059             # NOTE: not currently used but may be used in the future
2060             my %is_zero_continuation_block_type;
2061             my @q;
2062             @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
2063             if elsif else unless while until for foreach switch case given when);
2064             @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
2065              
2066             my %is_logical_container;
2067             @q = qw(if elsif unless while and or err not && ! || for foreach);
2068             @is_logical_container{@q} = (1) x scalar(@q);
2069              
2070             my %is_binary_type;
2071             @q = qw(|| &&);
2072             @is_binary_type{@q} = (1) x scalar(@q);
2073              
2074             my %is_binary_keyword;
2075             @q = qw(and or err eq ne cmp);
2076             @is_binary_keyword{@q} = (1) x scalar(@q);
2077              
2078             # 'L' is token for opening { at hash key
2079             my %is_opening_type;
2080             @q = qw< L { ( [ >;
2081             @is_opening_type{@q} = (1) x scalar(@q);
2082              
2083             my %is_opening_or_ternary_type;
2084             push @q, '?';
2085             @is_opening_or_ternary_type{@q} = (1) x scalar(@q);
2086              
2087             # 'R' is token for closing } at hash key
2088             my %is_closing_type;
2089             @q = qw< R } ) ] >;
2090             @is_closing_type{@q} = (1) x scalar(@q);
2091              
2092             my %is_closing_or_ternary_type;
2093             push @q, ':';
2094             @is_closing_or_ternary_type{@q} = (1) x scalar(@q);
2095              
2096             my %is_redo_last_next_goto;
2097             @q = qw(redo last next goto);
2098             @is_redo_last_next_goto{@q} = (1) x scalar(@q);
2099              
2100             my %is_use_require;
2101             @q = qw(use require);
2102             @is_use_require{@q} = (1) x scalar(@q);
2103              
2104             # This hash holds the array index in $self for these keywords:
2105             # Fix for issue c035: removed 'format' from this hash
2106             my %is_END_DATA = (
2107             '__END__' => _in_end_,
2108             '__DATA__' => _in_data_,
2109             );
2110              
2111             my %is_list_end_type;
2112             @q = qw( ; { } );
2113             push @q, ',';
2114             @is_list_end_type{@q} = (1) x scalar(@q);
2115              
2116             # original ref: camel 3 p 147,
2117             # but perl may accept undocumented flags
2118             # perl 5.10 adds 'p' (preserve)
2119             # Perl version 5.22 added 'n'
2120             # From http://perldoc.perl.org/perlop.html we have
2121             # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2122             # s/PATTERN/REPLACEMENT/msixpodualngcer
2123             # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2124             # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2125             # qr/STRING/msixpodualn
2126             my %quote_modifiers = (
2127             's' => '[msixpodualngcer]',
2128             'y' => '[cdsr]',
2129             'tr' => '[cdsr]',
2130             'm' => '[msixpodualngc]',
2131             'qr' => '[msixpodualn]',
2132             'q' => EMPTY_STRING,
2133             'qq' => EMPTY_STRING,
2134             'qw' => EMPTY_STRING,
2135             'qx' => EMPTY_STRING,
2136             );
2137              
2138             # table showing how many quoted things to look for after quote operator..
2139             # s, y, tr have 2 (pattern and replacement)
2140             # others have 1 (pattern only)
2141             my %quote_items = (
2142             's' => 2,
2143             'y' => 2,
2144             'tr' => 2,
2145             'm' => 1,
2146             'qr' => 1,
2147             'q' => 1,
2148             'qq' => 1,
2149             'qw' => 1,
2150             'qx' => 1,
2151             );
2152              
2153             my %is_for_foreach;
2154             @q = qw(for foreach);
2155             @is_for_foreach{@q} = (1) x scalar(@q);
2156              
2157             # These keywords may introduce blocks after parenthesized expressions,
2158             # in the form:
2159             # keyword ( .... ) { BLOCK }
2160             # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
2161             # NOTE for --use-feature=class: if ADJUST blocks eventually take a
2162             # parameter list, then ADJUST might need to be added to this list (see
2163             # perlclass.pod)
2164             my %is_blocktype_with_paren;
2165             @q =
2166             qw(if elsif unless while until for foreach switch case given when catch);
2167             @is_blocktype_with_paren{@q} = (1) x scalar(@q);
2168              
2169             my %is_case_default;
2170             @q = qw(case default);
2171             @is_case_default{@q} = (1) x scalar(@q);
2172              
2173             #------------------------
2174             # end of tokenizer hashes
2175             #------------------------
2176              
2177             # ------------------------------------------------------------
2178             # beginning of various scanner interface routines
2179             # ------------------------------------------------------------
2180             sub scan_replacement_text {
2181              
2182             # check for here-docs in replacement text invoked by
2183             # a substitution operator with executable modifier 'e'.
2184             #
2185             # given:
2186             # $replacement_text
2187             # return:
2188             # $rht = reference to any here-doc targets
2189 0     0 0 0 my ( $self, $replacement_text ) = @_;
2190              
2191             # quick check
2192 0 0       0 return if ( $replacement_text !~ /<</ );
2193              
2194 0         0 $self->write_logfile_entry(
2195             "scanning replacement text for here-doc targets\n");
2196              
2197             # save the logger object for error messages
2198 0         0 my $logger_object = $self->[_logger_object_];
2199              
2200             # save all lexical variables
2201 0         0 my $rstate = save_tokenizer_state();
2202 0         0 _decrement_count(); # avoid error check for multiple tokenizers
2203              
2204             # make a new tokenizer
2205 0         0 my $tokenizer = Perl::Tidy::Tokenizer->new(
2206             source_object => \$replacement_text,
2207             logger_object => $logger_object,
2208             starting_line_number => $input_line_number,
2209             );
2210              
2211             # scan the replacement text
2212 0         0 while ( $tokenizer->get_line() ) { }
2213              
2214             # remove any here doc targets
2215 0         0 my $rht = undef;
2216 0 0       0 if ( $tokenizer->[_in_here_doc_] ) {
2217 0         0 $rht = [];
2218 0         0 push @{$rht},
  0         0  
2219             [
2220             $tokenizer->[_here_doc_target_],
2221             $tokenizer->[_here_quote_character_]
2222             ];
2223 0 0       0 if ( $tokenizer->[_rhere_target_list_] ) {
2224 0         0 push @{$rht}, @{ $tokenizer->[_rhere_target_list_] };
  0         0  
  0         0  
2225 0         0 $tokenizer->[_rhere_target_list_] = undef;
2226             }
2227 0         0 $tokenizer->[_in_here_doc_] = undef;
2228             }
2229              
2230             # now its safe to report errors
2231 0         0 my $severe_error = $tokenizer->report_tokenization_errors();
2232              
2233             # TODO: Could propagate a severe error up
2234              
2235             # restore all tokenizer lexical variables
2236 0         0 restore_tokenizer_state($rstate);
2237              
2238             # return the here doc targets
2239 0         0 return $rht;
2240             } ## end sub scan_replacement_text
2241              
2242             sub scan_bare_identifier {
2243 1674     1674 0 3031 my $self = shift;
2244 1674         5209 ( $i, $tok, $type, $prototype ) =
2245             $self->scan_bare_identifier_do( $input_line, $i, $tok, $type,
2246             $prototype, $rtoken_map, $max_token_index );
2247 1674         3444 return;
2248             } ## end sub scan_bare_identifier
2249              
2250             sub scan_identifier {
2251              
2252 486     486 0 913 my $self = shift;
2253              
2254             (
2255 486         1975 $i, $tok, $type, $id_scan_state, $identifier,
2256             my $split_pretoken_flag
2257             )
2258             = $self->scan_complex_identifier( $i, $id_scan_state, $identifier,
2259             $rtokens, $max_token_index, $expecting,
2260             $rparen_type->[$paren_depth] );
2261              
2262             # Check for signal to fix a special variable adjacent to a keyword,
2263             # such as '$^One$0'.
2264 486 100       1492 if ($split_pretoken_flag) {
2265              
2266             # Try to fix it by splitting the pretoken
2267 3 50 33     41 if ( $i > 0
      33        
2268             && $rtokens->[ $i - 1 ] eq '^'
2269             && $self->split_pretoken(1) )
2270             {
2271 3         9 $identifier = substr( $identifier, 0, 3 );
2272 3         5 $tok = $identifier;
2273             }
2274             else {
2275              
2276             # This shouldn't happen ...
2277 0         0 my $var = substr( $tok, 0, 3 );
2278 0         0 my $excess = substr( $tok, 3 );
2279 0         0 $self->interrupt_logfile();
2280 0         0 $self->warning(<<EOM);
2281             $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2282             A space may be needed after '$var'.
2283             EOM
2284 0         0 $self->resume_logfile();
2285             }
2286             }
2287 486         958 return;
2288             } ## end sub scan_identifier
2289              
2290 39     39   403 use constant VERIFY_FASTSCAN => 0;
  39         141  
  39         5185  
2291             my %fast_scan_context;
2292              
2293             BEGIN {
2294 39     39   49293 %fast_scan_context = (
2295             '$' => SCALAR_CONTEXT,
2296             '*' => SCALAR_CONTEXT,
2297             '@' => LIST_CONTEXT,
2298             '%' => LIST_CONTEXT,
2299             '&' => UNKNOWN_CONTEXT,
2300             );
2301             } ## end BEGIN
2302              
2303             sub scan_simple_identifier {
2304              
2305             # This is a wrapper for sub scan_identifier. It does a fast preliminary
2306             # scan for certain common identifiers:
2307             # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2308             # If it does not find one of these, or this is a restart, it calls the
2309             # original scanner directly.
2310              
2311             # This gives the same results as the full scanner in about 1/4 the
2312             # total runtime for a typical input stream.
2313              
2314             # Notation:
2315             # $var * 2
2316             # ^^ ^
2317             # || |
2318             # || ---- $i_next [= next nonblank pretoken ]
2319             # |----$i_plus_1 [= a bareword ]
2320             # ---$i_begin [= a sigil]
2321              
2322 4793     4793 0 7352 my $self = shift;
2323              
2324 4793         7288 my $i_begin = $i;
2325 4793         7126 my $tok_begin = $tok;
2326 4793         7882 my $i_plus_1 = $i + 1;
2327 4793         7137 my $fast_scan_type;
2328              
2329             #-------------------------------------------------------
2330             # Do full scan for anything following a pointer, such as
2331             # $cref->&*; # a postderef
2332             #-------------------------------------------------------
2333 4793 100 66     27980 if ( $last_nonblank_token eq '->' ) {
    100 66        
    50 33        
      0        
      33        
2334              
2335             }
2336              
2337             #------------------------------
2338             # quick scan with leading sigil
2339             #------------------------------
2340             elsif ( !$id_scan_state
2341             && $i_plus_1 <= $max_token_index
2342             && $fast_scan_context{$tok} )
2343             {
2344 4680         8227 $context = $fast_scan_context{$tok};
2345              
2346             # look for $var, @var, ...
2347 4680 100 100     11395 if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
    100 66        
2348 4392         7101 my $pretype_next = EMPTY_STRING;
2349 4392 100       9430 if ( $i_plus_1 < $max_token_index ) {
2350 4276         6741 my $i_next = $i_plus_1 + 1;
2351 4276 100 100     13227 if ( $rtoken_type->[$i_next] eq 'b'
2352             && $i_next < $max_token_index )
2353             {
2354 1708         2986 $i_next += 1;
2355             }
2356 4276         7393 $pretype_next = $rtoken_type->[$i_next];
2357             }
2358 4392 100 100     15824 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2359              
2360             # Found type 'i' like '$var', '@var', or '%var'
2361 4284         8325 $identifier = $tok . $rtokens->[$i_plus_1];
2362 4284         6609 $tok = $identifier;
2363 4284         6995 $type = 'i';
2364 4284         6198 $i = $i_plus_1;
2365 4284         7347 $fast_scan_type = $type;
2366             }
2367             }
2368              
2369             # Look for @{ or %{ .
2370             # But we must let the full scanner handle things ${ because it may
2371             # keep going to get a complete identifier like '${#}' .
2372             elsif (
2373             $rtoken_type->[$i_plus_1] eq '{'
2374             && ( $tok_begin eq '@'
2375             || $tok_begin eq '%' )
2376             )
2377             {
2378              
2379 30         76 $identifier = $tok;
2380 30         62 $type = 't';
2381 30         53 $fast_scan_type = $type;
2382             }
2383             else {
2384             ## out of tricks
2385             }
2386             }
2387              
2388             #---------------------------
2389             # Quick scan with leading ->
2390             # Look for ->[ and ->{
2391             #---------------------------
2392             elsif (
2393             $tok eq '->'
2394             && $i < $max_token_index
2395             && ( $rtokens->[$i_plus_1] eq '{'
2396             || $rtokens->[$i_plus_1] eq '[' )
2397             )
2398             {
2399 0         0 $type = $tok;
2400 0         0 $fast_scan_type = $type;
2401 0         0 $identifier = $tok;
2402 0         0 $context = UNKNOWN_CONTEXT;
2403             }
2404             else {
2405             ## out of tricks
2406             }
2407              
2408             #--------------------------------------
2409             # Verify correctness during development
2410             #--------------------------------------
2411 4793         6692 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2412              
2413             # We will call the full method
2414             my $identifier_simple = $identifier;
2415             my $tok_simple = $tok;
2416             my $i_simple = $i;
2417             my $context_simple = $context;
2418              
2419             $tok = $tok_begin;
2420             $i = $i_begin;
2421             $self->scan_identifier();
2422              
2423             if ( $tok ne $tok_simple
2424             || $type ne $fast_scan_type
2425             || $i != $i_simple
2426             || $identifier ne $identifier_simple
2427             || $id_scan_state
2428             || $context ne $context_simple )
2429             {
2430             print {*STDERR} <<EOM;
2431             scan_simple_identifier differs from scan_identifier:
2432             simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2433             full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2434             EOM
2435             }
2436             }
2437              
2438             #-------------------------------------------------
2439             # call full scanner if fast method did not succeed
2440             #-------------------------------------------------
2441 4793 100       10040 if ( !$fast_scan_type ) {
2442 479         1675 $self->scan_identifier();
2443             }
2444 4793         8359 return;
2445             } ## end sub scan_simple_identifier
2446              
2447             sub method_ok_here {
2448              
2449             # Return:
2450             # false if this is definitely an invalid method declaration
2451             # true otherwise (even if not sure)
2452              
2453             # We are trying to avoid problems with old uses of 'method'
2454             # when --use-feature=class is set (rt145706).
2455             # For example, this should cause a return of 'false':
2456              
2457             # method paint => sub {
2458             # return;
2459             # };
2460              
2461 6     6 0 18 my $self = shift;
2462              
2463             # from do_scan_sub:
2464 6         14 my $i_beg = $i + 1;
2465 6         13 my $pos_beg = $rtoken_map->[$i_beg];
2466 6         19 pos($input_line) = $pos_beg;
2467              
2468             # TEST 1: look a valid sub NAME
2469 6 50       43 if (
2470             $input_line =~ m{\G\s*
2471             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2472             (\w+) # NAME - required
2473             }gcx
2474             )
2475             {
2476             # For possible future use..
2477 6         15 my $subname = $2;
2478 6 50       27 my $package = $1 ? $1 : EMPTY_STRING;
2479             }
2480             else {
2481 0         0 return;
2482             }
2483              
2484             # TEST 2: look for invalid characters after name, such as here:
2485             # method paint => sub {
2486             # ...
2487             # }
2488 6         14 my $next_char = EMPTY_STRING;
2489 6 100       31 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  5         14  
2490 6 100 66     38 if ( !$next_char || $next_char eq '#' ) {
2491 1         5 ( $next_char, my $i_next ) =
2492             $self->find_next_nonblank_token( $max_token_index,
2493             $rtokens, $max_token_index );
2494             }
2495              
2496 6 50       21 if ( !$next_char ) {
2497              
2498             # out of characters - give up
2499 0         0 return;
2500             }
2501              
2502             # Possibly valid next token types:
2503             # '(' could start prototype or signature
2504             # ':' could start ATTRIBUTE
2505             # '{' cold start BLOCK
2506             # ';' or '}' could end a statement
2507 6 100       30 if ( $next_char !~ /^[\(\:\{\;\}]/ ) {
2508              
2509             # This does not match use feature 'class' syntax
2510 3         13 return;
2511             }
2512              
2513             # We will stop here and assume that this is valid syntax for
2514             # use feature 'class'.
2515 3         18 return 1;
2516             } ## end sub method_ok_here
2517              
2518             sub class_ok_here {
2519              
2520             # Return:
2521             # false if this is definitely an invalid class declaration
2522             # true otherwise (even if not sure)
2523              
2524             # We are trying to avoid problems with old uses of 'class'
2525             # when --use-feature=class is set (rt145706). We look ahead
2526             # see if this use of 'class' is obviously inconsistent with
2527             # the syntax of use feature 'class'. This allows the default
2528             # setting --use-feature=class to work for old syntax too.
2529              
2530             # Valid class declarations look like
2531             # class NAME ?ATTRS ?VERSION ?BLOCK
2532             # where ATTRS VERSION and BLOCK are optional
2533              
2534             # For example, this should produce a return of 'false':
2535             #
2536             # class ExtendsBasicAttributes is BasicAttributes{
2537              
2538 6     6 0 13 my $self = shift;
2539              
2540             # TEST 1: class stmt can only go where a new statment can start
2541 6 50       15 if ( !new_statement_ok() ) { return }
  0         0  
2542              
2543 6         14 my $i_beg = $i + 1;
2544 6         12 my $pos_beg = $rtoken_map->[$i_beg];
2545 6         19 pos($input_line) = $pos_beg;
2546              
2547             # TEST 2: look for a valid NAME
2548 6 50       36 if (
2549             $input_line =~ m{\G\s*
2550             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2551             (\w+) # NAME - required
2552             }gcx
2553             )
2554             {
2555             # For possible future use..
2556 6         17 my $subname = $2;
2557 6 100       20 my $package = $1 ? $1 : EMPTY_STRING;
2558             }
2559             else {
2560 0         0 return;
2561             }
2562              
2563             # TEST 3: look for valid characters after NAME
2564 6         13 my $next_char = EMPTY_STRING;
2565 6 100       23 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  5         11  
2566 6 100 66     29 if ( !$next_char || $next_char eq '#' ) {
2567 1         12 ( $next_char, my $i_next ) =
2568             $self->find_next_nonblank_token( $max_token_index,
2569             $rtokens, $max_token_index );
2570             }
2571 6 50       16 if ( !$next_char ) {
2572              
2573             # out of characters - give up
2574 0         0 return;
2575             }
2576              
2577             # Must see one of: ATTRIBUTE, VERSION, BLOCK, or end stmt
2578              
2579             # Possibly valid next token types:
2580             # ':' could start ATTRIBUTE
2581             # '\d' could start VERSION
2582             # '{' cold start BLOCK
2583             # ';' could end a statement
2584             # '}' could end statement but would be strange
2585              
2586 6 100       23 if ( $next_char !~ /^[\:\d\{\;\}]/ ) {
2587              
2588             # This does not match use feature 'class' syntax
2589 2         10 return;
2590             }
2591              
2592             # We will stop here and assume that this is valid syntax for
2593             # use feature 'class'.
2594 4         15 return 1;
2595             } ## end sub class_ok_here
2596              
2597             sub scan_id {
2598 332     332 0 690 my $self = shift;
2599 332         1382 ( $i, $tok, $type, $id_scan_state ) =
2600             $self->scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2601             $id_scan_state, $max_token_index );
2602 332         802 return;
2603             } ## end sub scan_id
2604              
2605             sub scan_number {
2606 629     629 0 1069 my $self = shift;
2607 629         949 my $number;
2608 629         1826 ( $i, $type, $number ) =
2609             $self->scan_number_do( $input_line, $i, $rtoken_map, $type,
2610             $max_token_index );
2611 629         1512 return $number;
2612             } ## end sub scan_number
2613              
2614 39     39   373 use constant VERIFY_FASTNUM => 0;
  39         100  
  39         30580  
2615              
2616             sub scan_number_fast {
2617              
2618             # This is a wrapper for sub scan_number. It does a fast preliminary
2619             # scan for a simple integer. It calls the original scan_number if it
2620             # does not find one.
2621              
2622 2277     2277 0 3615 my $self = shift;
2623 2277         3611 my $i_begin = $i;
2624 2277         3574 my $tok_begin = $tok;
2625 2277         3251 my $number;
2626              
2627             #---------------------------------
2628             # Quick check for (signed) integer
2629             #---------------------------------
2630              
2631             # This will be the string of digits:
2632 2277         3723 my $i_d = $i;
2633 2277         3626 my $tok_d = $tok;
2634 2277         4030 my $typ_d = $rtoken_type->[$i_d];
2635              
2636             # check for signed integer
2637 2277         3739 my $sign = EMPTY_STRING;
2638 2277 50 66     7009 if ( $typ_d ne 'd'
      66        
      33        
2639             && ( $typ_d eq '+' || $typ_d eq '-' )
2640             && $i_d < $max_token_index )
2641             {
2642 343         581 $sign = $tok_d;
2643 343         625 $i_d++;
2644 343         646 $tok_d = $rtokens->[$i_d];
2645 343         647 $typ_d = $rtoken_type->[$i_d];
2646             }
2647              
2648             # Handle integers
2649 2277 100 100     17023 if (
      100        
2650             $typ_d eq 'd'
2651             && (
2652             $i_d == $max_token_index
2653             || ( $i_d < $max_token_index
2654             && $rtoken_type->[ $i_d + 1 ] ne '.'
2655             && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2656             )
2657             )
2658             {
2659             # Let let full scanner handle multi-digit integers beginning with
2660             # '0' because there could be error messages. For example, '009' is
2661             # not a valid number.
2662              
2663 1715 100 100     7373 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2664 1658         3124 $number = $sign . $tok_d;
2665 1658         2696 $type = 'n';
2666 1658         4018 $i = $i_d;
2667             }
2668             }
2669              
2670             #--------------------------------------
2671             # Verify correctness during development
2672             #--------------------------------------
2673 2277         3305 if ( VERIFY_FASTNUM && defined($number) ) {
2674              
2675             # We will call the full method
2676             my $type_simple = $type;
2677             my $i_simple = $i;
2678             my $number_simple = $number;
2679              
2680             $tok = $tok_begin;
2681             $i = $i_begin;
2682             $number = $self->scan_number();
2683              
2684             if ( $type ne $type_simple
2685             || ( $i != $i_simple && $i <= $max_token_index )
2686             || $number ne $number_simple )
2687             {
2688             print {*STDERR} <<EOM;
2689             scan_number_fast differs from scan_number:
2690             simple: i=$i_simple, type=$type_simple, number=$number_simple
2691             full: i=$i, type=$type, number=$number
2692             EOM
2693             }
2694             }
2695              
2696             #----------------------------------------
2697             # call full scanner if may not be integer
2698             #----------------------------------------
2699 2277 100       5476 if ( !defined($number) ) {
2700 619         1682 $number = $self->scan_number();
2701             }
2702 2277         5480 return $number;
2703             } ## end sub scan_number_fast
2704              
2705             # a sub to warn if token found where term expected
2706             sub error_if_expecting_TERM {
2707 9     9 0 21 my $self = shift;
2708 9 50       43 if ( $expecting == TERM ) {
2709 9 50       37 if ( $really_want_term{$last_nonblank_type} ) {
2710 0         0 $self->report_unexpected( $tok, "term", $i_tok,
2711             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2712 0         0 return 1;
2713             }
2714             }
2715 9         21 return;
2716             } ## end sub error_if_expecting_TERM
2717              
2718             # a sub to warn if token found where operator expected
2719             sub error_if_expecting_OPERATOR {
2720 769     769 0 1631 my ( $self, $thing ) = @_;
2721 769 50       1829 if ( $expecting == OPERATOR ) {
2722 0 0       0 if ( !defined($thing) ) { $thing = $tok }
  0         0  
2723 0         0 $self->report_unexpected( $thing, "operator", $i_tok,
2724             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2725 0 0       0 if ( $i_tok == 0 ) {
2726 0         0 $self->interrupt_logfile();
2727 0         0 $self->warning("Missing ';' or ',' above?\n");
2728 0         0 $self->resume_logfile();
2729             }
2730 0         0 return 1;
2731             }
2732 769         1491 return;
2733             } ## end sub error_if_expecting_OPERATOR
2734              
2735             # ------------------------------------------------------------
2736             # end scanner interfaces
2737             # ------------------------------------------------------------
2738              
2739             #------------------
2740             # Tokenization subs
2741             #------------------
2742             sub do_GREATER_THAN_SIGN {
2743              
2744 31     31 0 92 my $self = shift;
2745              
2746             # '>'
2747 31 50       118 $self->error_if_expecting_TERM()
2748             if ( $expecting == TERM );
2749 31         76 return;
2750             } ## end sub do_GREATER_THAN_SIGN
2751              
2752             sub do_VERTICAL_LINE {
2753              
2754 4     4 0 9 my $self = shift;
2755              
2756             # '|'
2757 4 50       19 $self->error_if_expecting_TERM()
2758             if ( $expecting == TERM );
2759 4         11 return;
2760             } ## end sub do_VERTICAL_LINE
2761              
2762             # An identifier in possible indirect object location followed by any of
2763             # these tokens: -> , ; } (plus others) is not an indirect object. Fix c257.
2764             my %Z_test_hash;
2765              
2766             BEGIN {
2767 39     39   468 my @qZ = qw#
2768             -> ; } ) ]
2769             => =~ = == !~ || >= != *= .. && |= .= -= += <= %=
2770             ^= &&= ||= //= <=>
2771             #;
2772 39         169 push @qZ, ',';
2773 39         354211 @{Z_test_hash}{@qZ} = (1) x scalar(@qZ);
2774             }
2775              
2776             sub do_DOLLAR_SIGN {
2777              
2778 4038     4038 0 7027 my $self = shift;
2779              
2780             # '$'
2781             # start looking for a scalar
2782 4038 50       9222 $self->error_if_expecting_OPERATOR("Scalar")
2783             if ( $expecting == OPERATOR );
2784 4038         12372 $self->scan_simple_identifier();
2785              
2786 4038 100       9436 if ( $identifier eq '$^W' ) {
2787 1         4 $self->[_saw_perl_dash_w_] = 1;
2788             }
2789              
2790             # Check for identifier in indirect object slot
2791             # (vorboard.pl, sort.t). Something like:
2792             # /^(print|printf|sort|exec|system)$/
2793 4038 100 66     32279 if (
      100        
      100        
      66        
      66        
2794             $is_indirect_object_taker{$last_nonblank_token}
2795             && $last_nonblank_type eq 'k'
2796             || ( ( $last_nonblank_token eq '(' )
2797             && $is_indirect_object_taker{ $rparen_type->[$paren_depth] } )
2798             || ( $last_nonblank_type eq 'w'
2799             || $last_nonblank_type eq 'U' ) # possible object
2800             )
2801             {
2802              
2803             # An identifier followed by '->' is not indirect object;
2804             # fixes b1175, b1176. Fix c257: Likewise for other tokens like
2805             # comma, semicolon, closing brace, and single space.
2806 98         683 my ( $next_nonblank_token, $i_next ) =
2807             $self->find_next_noncomment_token( $i, $rtokens,
2808             $max_token_index );
2809 98 100       435 $type = 'Z' if ( !$Z_test_hash{$next_nonblank_token} );
2810             }
2811 4038         6741 return;
2812             } ## end sub do_DOLLAR_SIGN
2813              
2814             sub do_LEFT_PARENTHESIS {
2815              
2816 2126     2126 0 4074 my $self = shift;
2817              
2818             # '('
2819 2126         3498 ++$paren_depth;
2820              
2821             # variable to enable check for brace after closing paren (c230)
2822 2126         3846 my $want_brace = EMPTY_STRING;
2823              
2824 2126 100       6613 if ($want_paren) {
    100          
2825 240         591 $container_type = $want_paren;
2826 240         585 $want_brace = $want_paren;
2827 240         492 $want_paren = EMPTY_STRING;
2828             }
2829             elsif ( $statement_type =~ /^sub\b/ ) {
2830 14         35 $container_type = $statement_type;
2831             }
2832             else {
2833 1872         3370 $container_type = $last_nonblank_token;
2834              
2835             # We can check for a syntax error here of unexpected '(',
2836             # but this is going to get messy...
2837 1872 100 100     7495 if (
2838             $expecting == OPERATOR
2839              
2840             # Be sure this is not a method call of the form
2841             # &method(...), $method->(..), &{method}(...),
2842             # $ref[2](list) is ok & short for $ref[2]->(list)
2843             # NOTE: at present, braces in something like &{ xxx }
2844             # are not marked as a block, we might have a method call.
2845             # Added ')' to fix case c017, something like ()()()
2846             && $last_nonblank_token !~ /^(?:[\]\}\)\&]|\-\>)/
2847             )
2848             {
2849              
2850             # ref: camel 3 p 703.
2851 3 50       12 if ( $last_last_nonblank_token eq 'do' ) {
2852 0         0 $self->complain(
2853             "do SUBROUTINE is deprecated; consider & or -> notation\n"
2854             );
2855             }
2856             else {
2857              
2858             # if this is an empty list, (), then it is not an
2859             # error; for example, we might have a constant pi and
2860             # invoke it with pi() or just pi;
2861 3         9 my ( $next_nonblank_token, $i_next ) =
2862             $self->find_next_nonblank_token( $i, $rtokens,
2863             $max_token_index );
2864              
2865             # Patch for c029: give up error check if
2866             # a side comment follows
2867 3 50 33     19 if ( $next_nonblank_token ne ')'
2868             && $next_nonblank_token ne '#' )
2869             {
2870 0         0 my $hint;
2871              
2872 0         0 $self->error_if_expecting_OPERATOR('(');
2873              
2874 0 0       0 if ( $last_nonblank_type eq 'C' ) {
    0          
2875 0         0 $hint =
2876             "$last_nonblank_token has a void prototype\n";
2877             }
2878             elsif ( $last_nonblank_type eq 'i' ) {
2879 0 0 0     0 if ( $i_tok > 0
2880             && $last_nonblank_token =~ /^\$/ )
2881             {
2882 0         0 $hint =
2883             "Do you mean '$last_nonblank_token->(' ?\n";
2884             }
2885             }
2886             else {
2887             ## no hint
2888             }
2889 0 0       0 if ($hint) {
2890 0         0 $self->interrupt_logfile();
2891 0         0 $self->warning($hint);
2892 0         0 $self->resume_logfile();
2893             }
2894             } ## end if ( $next_nonblank_token...
2895             } ## end else [ if ( $last_last_nonblank_token...
2896             } ## end if ( $expecting == OPERATOR...
2897             }
2898              
2899 2126         7126 ( $type_sequence, $indent_flag ) =
2900             $self->increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2901              
2902             # propagate types down through nested parens
2903             # for example: the second paren in 'if ((' would be structural
2904             # since the first is.
2905              
2906 2126 100       5623 if ( $last_nonblank_token eq '(' ) {
2907 61         240 $type = $last_nonblank_type;
2908             }
2909              
2910             # We exclude parens as structural after a ',' because it
2911             # causes subtle problems with continuation indentation for
2912             # something like this, where the first 'or' will not get
2913             # indented.
2914             #
2915             # assert(
2916             # __LINE__,
2917             # ( not defined $check )
2918             # or ref $check
2919             # or $check eq "new"
2920             # or $check eq "old",
2921             # );
2922             #
2923             # Likewise, we exclude parens where a statement can start
2924             # because of problems with continuation indentation, like
2925             # these:
2926             #
2927             # ($firstline =~ /^#\!.*perl/)
2928             # and (print $File::Find::name, "\n")
2929             # and (return 1);
2930             #
2931             # (ref($usage_fref) =~ /CODE/)
2932             # ? &$usage_fref
2933             # : (&blast_usage, &blast_params, &blast_general_params);
2934              
2935             else {
2936 2065         3606 $type = '{';
2937             }
2938              
2939 2126 50       5182 if ( $last_nonblank_type eq ')' ) {
2940 0         0 $self->warning(
2941             "Syntax error? found token '$last_nonblank_type' then '('\n");
2942             }
2943              
2944             # git #105: Copy container type and want-brace flag at ') (';
2945             # propagate the container type onward so that any subsequent brace gets
2946             # correctly marked. I have implemented this as a general rule, which
2947             # should be safe, but if necessary it could be restricted to certain
2948             # container statement types such as 'for'.
2949 2126 100       4882 if ( $last_nonblank_token eq ')' ) {
2950 1         3 my $rvars = $rparen_vars->[$paren_depth];
2951 1 50       5 if ( defined($rvars) ) {
2952 1         5 $container_type = $rparen_type->[$paren_depth];
2953 1         2 ( my $type_lp, $want_brace ) = @{$rvars};
  1         8  
2954             }
2955             }
2956              
2957 2126         4378 $rparen_type->[$paren_depth] = $container_type;
2958 2126         5772 $rparen_vars->[$paren_depth] = [ $type, $want_brace ];
2959 2126         4062 $rparen_semicolon_count->[$paren_depth] = 0;
2960              
2961 2126         3769 return;
2962              
2963             } ## end sub do_LEFT_PARENTHESIS
2964              
2965             sub do_RIGHT_PARENTHESIS {
2966              
2967 2126     2126 0 4292 my $self = shift;
2968              
2969             # ')'
2970 2126         7302 ( $type_sequence, $indent_flag ) =
2971             $self->decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2972              
2973 2126         4775 my $rvars = $rparen_vars->[$paren_depth];
2974 2126 50       5588 if ( defined($rvars) ) {
2975 2126         3456 my ( $type_lp, $want_brace ) = @{$rvars};
  2126         4868  
2976 2126 50 33     8624 if ( $type_lp && $type_lp eq '{' ) {
2977 2126         4039 $type = '}';
2978             }
2979             }
2980              
2981 2126         3895 $container_type = $rparen_type->[$paren_depth];
2982              
2983             # restore statement type as 'sub' at closing paren of a signature
2984             # so that a subsequent ':' is identified as an attribute
2985 2126 100       6463 if ( $container_type =~ /^sub\b/ ) {
2986 24         58 $statement_type = $container_type;
2987             }
2988              
2989             # /^(for|foreach)$/
2990 2126 100       5915 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } ) {
2991 69         235 my $num_sc = $rparen_semicolon_count->[$paren_depth];
2992 69 50 66     439 if ( $num_sc > 0 && $num_sc != 2 ) {
2993 0         0 $self->warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2994             }
2995             }
2996              
2997 2126 50       5220 if ( $paren_depth > 0 ) { $paren_depth-- }
  2126         3343  
2998 2126         4166 return;
2999             } ## end sub do_RIGHT_PARENTHESIS
3000              
3001             sub do_COMMA {
3002              
3003 3075     3075 0 5316 my $self = shift;
3004              
3005             # ','
3006 3075 100 33     10751 if ( $last_nonblank_type eq ',' ) {
    50          
3007 10         39 $self->complain("Repeated ','s \n");
3008             }
3009              
3010             # Note that we have to check both token and type here because a
3011             # comma following a qw list can have last token='(' but type = 'q'
3012             elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
3013 0         0 $self->warning("Unexpected leading ',' after a '('\n");
3014             }
3015             else {
3016             ## ok: no complaints needed
3017             }
3018              
3019             # patch for operator_expected: note if we are in the list (use.t)
3020 3075 100       6436 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  6         15  
3021 3075         4920 return;
3022              
3023             } ## end sub do_COMMA
3024              
3025             sub do_SEMICOLON {
3026              
3027 2450     2450 0 4939 my $self = shift;
3028              
3029             # ';'
3030 2450         4403 $context = UNKNOWN_CONTEXT;
3031 2450         3952 $statement_type = EMPTY_STRING;
3032 2450         4230 $want_paren = EMPTY_STRING;
3033              
3034             # /^(for|foreach)$/
3035 2450 100       7397 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } )
3036             { # mark ; in for loop
3037              
3038             # Be careful: we do not want a semicolon such as the
3039             # following to be included:
3040             #
3041             # for (sort {strcoll($a,$b);} keys %investments) {
3042              
3043 35 100 66     274 if ( $brace_depth == $rdepth_array->[PAREN][BRACE][$paren_depth]
3044             && $square_bracket_depth ==
3045             $rdepth_array->[PAREN][SQUARE_BRACKET][$paren_depth] )
3046             {
3047              
3048 34         78 $type = 'f';
3049 34         66 $rparen_semicolon_count->[$paren_depth]++;
3050             }
3051             }
3052 2450         4092 return;
3053             } ## end sub do_SEMICOLON
3054              
3055             sub do_QUOTATION_MARK {
3056              
3057 1125     1125 0 2302 my $self = shift;
3058              
3059             # '"'
3060 1125 50       4079 $self->error_if_expecting_OPERATOR("String")
3061             if ( $expecting == OPERATOR );
3062 1125         1960 $in_quote = 1;
3063 1125         1914 $type = 'Q';
3064 1125         1846 $allowed_quote_modifiers = EMPTY_STRING;
3065 1125         1862 return;
3066             } ## end sub do_QUOTATION_MARK
3067              
3068             sub do_APOSTROPHE {
3069              
3070 1164     1164 0 2318 my $self = shift;
3071              
3072             # "'"
3073 1164 50       2905 $self->error_if_expecting_OPERATOR("String")
3074             if ( $expecting == OPERATOR );
3075 1164         1931 $in_quote = 1;
3076 1164         1915 $type = 'Q';
3077 1164         1829 $allowed_quote_modifiers = EMPTY_STRING;
3078 1164         1832 return;
3079             } ## end sub do_APOSTROPHE
3080              
3081             sub do_BACKTICK {
3082              
3083 0     0 0 0 my $self = shift;
3084              
3085             # '`'
3086 0 0       0 $self->error_if_expecting_OPERATOR("String")
3087             if ( $expecting == OPERATOR );
3088 0         0 $in_quote = 1;
3089 0         0 $type = 'Q';
3090 0         0 $allowed_quote_modifiers = EMPTY_STRING;
3091 0         0 return;
3092             } ## end sub do_BACKTICK
3093              
3094             sub do_SLASH {
3095              
3096 207     207 0 505 my $self = shift;
3097              
3098             # '/'
3099 207         364 my $is_pattern;
3100              
3101             # a pattern cannot follow certain keywords which take optional
3102             # arguments, like 'shift' and 'pop'. See also '?'.
3103 207 50 66     995 if (
    50          
3104             $last_nonblank_type eq 'k'
3105             && $is_keyword_rejecting_slash_as_pattern_delimiter{
3106             $last_nonblank_token}
3107             )
3108             {
3109 0         0 $is_pattern = 0;
3110             }
3111             elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
3112 0         0 my $msg;
3113 0         0 ( $is_pattern, $msg ) =
3114             $self->guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
3115             $max_token_index );
3116              
3117 0 0       0 if ($msg) {
3118 0         0 $self->write_diagnostics("DIVIDE:$msg\n");
3119 0         0 $self->write_logfile_entry($msg);
3120             }
3121             }
3122 207         435 else { $is_pattern = ( $expecting == TERM ) }
3123              
3124 207 100       512 if ($is_pattern) {
3125 78         148 $in_quote = 1;
3126 78         170 $type = 'Q';
3127 78         149 $allowed_quote_modifiers = '[msixpodualngc]';
3128             }
3129             else { # not a pattern; check for a /= token
3130              
3131 129 50       371 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
3132 0         0 $i++;
3133 0         0 $tok = '/=';
3134 0         0 $type = $tok;
3135             }
3136              
3137             #DEBUG - collecting info on what tokens follow a divide
3138             # for development of guessing algorithm
3139             ## if (
3140             ## $self->is_possible_numerator( $i, $rtokens,
3141             ## $max_token_index ) < 0
3142             ## )
3143             ## {
3144             ## $self->write_diagnostics("DIVIDE? $input_line\n");
3145             ## }
3146             }
3147 207         412 return;
3148             } ## end sub do_SLASH
3149              
3150             sub do_LEFT_CURLY_BRACKET {
3151              
3152 1670     1670 0 3516 my $self = shift;
3153              
3154             # '{'
3155             # if we just saw a ')', we will label this block with
3156             # its type. We need to do this to allow sub
3157             # code_block_type to determine if this brace starts a
3158             # code block or anonymous hash. (The type of a paren
3159             # pair is the preceding token, such as 'if', 'else',
3160             # etc).
3161 1670         3068 $container_type = EMPTY_STRING;
3162              
3163             # ATTRS: for a '{' following an attribute list, reset
3164             # things to look like we just saw a sub name
3165             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3166 1670 100 100     16337 if ( substr( $statement_type, 0, 3 ) eq 'sub' ) {
    100 66        
    50 33        
    100          
    50          
3167 34         73 $last_nonblank_token = $statement_type;
3168 34         74 $last_nonblank_type = 'S'; # c250 change
3169 34         73 $statement_type = EMPTY_STRING;
3170             }
3171             elsif ( substr( $statement_type, 0, 7 ) eq 'package' ) {
3172 4         6 $last_nonblank_token = $statement_type;
3173 4         8 $last_nonblank_type = 'P'; # c250 change
3174 4         6 $statement_type = EMPTY_STRING;
3175             }
3176              
3177             # patch for SWITCH/CASE: hide these keywords from an immediately
3178             # following opening brace
3179             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
3180             && $statement_type eq $last_nonblank_token )
3181             {
3182 0         0 $last_nonblank_token = ";";
3183             }
3184              
3185             elsif ( $last_nonblank_token eq ')' ) {
3186 242         745 $last_nonblank_token = $rparen_type->[ $paren_depth + 1 ];
3187              
3188             # defensive move in case of a nesting error (pbug.t)
3189             # in which this ')' had no previous '('
3190             # this nesting error will have been caught
3191 242 50       817 if ( !defined($last_nonblank_token) ) {
3192 0         0 $last_nonblank_token = 'if';
3193             }
3194              
3195             # Syntax check at '){'
3196 242 100       818 if ( $is_blocktype_with_paren{$last_nonblank_token} ) {
3197              
3198 228         596 my $rvars = $rparen_vars->[ $paren_depth + 1 ];
3199 228 50       756 if ( defined($rvars) ) {
3200 228         469 my ( $type_lp, $want_brace ) = @{$rvars};
  228         879  
3201              
3202             # OLD: Now verify that this is not a trailing form
3203             # FIX for git #124: we have to skip this check because
3204             # the 'gather' keyword of List::Gather can operate on
3205             # a full statement, so it isn't possible to be sure
3206             # this is a trailing form.
3207 228         466 if ( 0 && !$want_brace ) {
3208             $self->warning(
3209             "syntax error at ') {', unexpected '{' after closing ')' of a trailing '$last_nonblank_token'\n"
3210             );
3211             }
3212             }
3213             }
3214             else {
3215 14 50       63 if ($rOpts_extended_syntax) {
3216              
3217             # we append a trailing () to mark this as an unknown
3218             # block type. This allows perltidy to format some
3219             # common extensions of perl syntax.
3220             # This is used by sub code_block_type
3221 14         60 $last_nonblank_token .= '()';
3222             }
3223             else {
3224 0         0 my $list =
3225             join( SPACE, sort keys %is_blocktype_with_paren );
3226 0         0 $self->warning(
3227             "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
3228             );
3229             }
3230             }
3231             }
3232              
3233             # patch for paren-less for/foreach glitch, part 2.
3234             # see note below under 'qw'
3235             elsif ($last_nonblank_token eq 'qw'
3236             && $is_for_foreach{$want_paren} )
3237             {
3238 0         0 $last_nonblank_token = $want_paren;
3239 0 0       0 if ( $last_last_nonblank_token eq $want_paren ) {
3240 0         0 $self->warning(
3241             "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
3242             );
3243              
3244             }
3245 0         0 $want_paren = EMPTY_STRING;
3246             }
3247             else {
3248             ## ok: not special
3249             }
3250              
3251             # now identify which of the three possible types of
3252             # curly braces we have: hash index container, anonymous
3253             # hash reference, or code block.
3254              
3255             # non-structural (hash index) curly brace pair
3256             # get marked 'L' and 'R'
3257 1670 100       4904 if ( is_non_structural_brace() ) {
3258 367         1403 $type = 'L';
3259              
3260             # patch for SWITCH/CASE:
3261             # allow paren-less identifier after 'when'
3262             # if the brace is preceded by a space
3263 367 0 33     1459 if ( $statement_type eq 'when'
      33        
      0        
      0        
3264             && $last_nonblank_type eq 'i'
3265             && $last_last_nonblank_type eq 'k'
3266             && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
3267             {
3268 0         0 $type = '{';
3269 0         0 $block_type = $statement_type;
3270             }
3271             }
3272              
3273             # code and anonymous hash have the same type, '{', but are
3274             # distinguished by 'block_type',
3275             # which will be blank for an anonymous hash
3276             else {
3277              
3278 1303         4884 $block_type =
3279             $self->code_block_type( $i_tok, $rtokens, $rtoken_type,
3280             $max_token_index );
3281              
3282             # patch to promote bareword type to function taking block
3283 1303 100 100     6136 if ( $block_type
      66        
3284             && $last_nonblank_type eq 'w'
3285             && $last_nonblank_i >= 0 )
3286             {
3287 34 50       164 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
3288             $routput_token_type->[$last_nonblank_i] =
3289 34 100       148 $is_grep_alias{$block_type} ? 'k' : 'G';
3290             }
3291             }
3292              
3293             # patch for SWITCH/CASE: if we find a stray opening block brace
3294             # where we might accept a 'case' or 'when' block, then take it
3295 1303 100 100     5462 if ( $statement_type eq 'case'
3296             || $statement_type eq 'when' )
3297             {
3298 38 100 66     221 if ( !$block_type || $block_type eq '}' ) {
3299 4         7 $block_type = $statement_type;
3300             }
3301             }
3302             }
3303              
3304 1670         3983 $rbrace_type->[ ++$brace_depth ] = $block_type;
3305              
3306             # Patch for CLASS BLOCK definitions: do not update the package for the
3307             # current depth if this is a BLOCK type definition.
3308             # TODO: should make 'class' separate from 'package' and only do
3309             # this for 'class'
3310 1670 100       5596 $rbrace_package->[$brace_depth] = $current_package
3311             if ( substr( $block_type, 0, 8 ) ne 'package ' );
3312              
3313 1670         3507 $rbrace_structural_type->[$brace_depth] = $type;
3314 1670         3221 $rbrace_context->[$brace_depth] = $context;
3315 1670         4958 ( $type_sequence, $indent_flag ) =
3316             $self->increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3317 1670         3277 return;
3318             } ## end sub do_LEFT_CURLY_BRACKET
3319              
3320             sub do_RIGHT_CURLY_BRACKET {
3321              
3322 1670     1670 0 3481 my $self = shift;
3323              
3324             # '}'
3325 1670         3769 $block_type = $rbrace_type->[$brace_depth];
3326 1670 100       4162 if ($block_type) { $statement_type = EMPTY_STRING }
  974         1918  
3327 1670 100       4031 if ( defined( $rbrace_package->[$brace_depth] ) ) {
3328 1666         3232 $current_package = $rbrace_package->[$brace_depth];
3329             }
3330              
3331             # can happen on brace error (caught elsewhere)
3332             else {
3333             }
3334 1670         5174 ( $type_sequence, $indent_flag ) =
3335             $self->decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3336              
3337 1670 100       5262 if ( $rbrace_structural_type->[$brace_depth] eq 'L' ) {
3338 367         848 $type = 'R';
3339             }
3340              
3341             # propagate type information for 'do' and 'eval' blocks, and also
3342             # for smartmatch operator. This is necessary to enable us to know
3343             # if an operator or term is expected next.
3344 1670 100       4851 if ( $is_block_operator{$block_type} ) {
3345 83         203 $tok = $block_type;
3346             }
3347              
3348 1670         2919 $context = $rbrace_context->[$brace_depth];
3349 1670 50       4220 if ( $brace_depth > 0 ) { $brace_depth--; }
  1670         2588  
3350 1670         2846 return;
3351             } ## end sub do_RIGHT_CURLY_BRACKET
3352              
3353             sub do_AMPERSAND {
3354              
3355 126     126 0 334 my $self = shift;
3356              
3357             # '&' = maybe sub call? start looking
3358             # We have to check for sub call unless we are sure we
3359             # are expecting an operator. This example from s2p
3360             # got mistaken as a q operator in an early version:
3361             # print BODY &q(<<'EOT');
3362 126 100       433 if ( $expecting != OPERATOR ) {
3363              
3364             # But only look for a sub call if we are expecting a term or
3365             # if there is no existing space after the &.
3366             # For example we probably don't want & as sub call here:
3367             # Fcntl::S_IRUSR & $mode;
3368 107 100 66     444 if ( $expecting == TERM || $next_type ne 'b' ) {
3369 104         320 $self->scan_simple_identifier();
3370             }
3371             }
3372             else {
3373             }
3374 126         291 return;
3375             } ## end sub do_AMPERSAND
3376              
3377             sub do_LESS_THAN_SIGN {
3378              
3379 29     29 0 86 my $self = shift;
3380              
3381             # '<' - angle operator or less than?
3382 29 100       147 if ( $expecting != OPERATOR ) {
3383 8         51 ( $i, $type ) =
3384             $self->find_angle_operator_termination( $input_line, $i,
3385             $rtoken_map, $expecting, $max_token_index );
3386              
3387             ## This message is not very helpful and quite confusing if the above
3388             ## routine decided not to write a message with the line number.
3389             ## if ( $type eq '<' && $expecting == TERM ) {
3390             ## $self->error_if_expecting_TERM();
3391             ## $self->interrupt_logfile();
3392             ## $self->warning("Unterminated <> operator?\n");
3393             ## $self->resume_logfile();
3394             ## }
3395              
3396             }
3397             else {
3398             }
3399 29         65 return;
3400             } ## end sub do_LESS_THAN_SIGN
3401              
3402             sub do_QUESTION_MARK {
3403              
3404 187     187 0 594 my $self = shift;
3405              
3406             # '?' = conditional or starting pattern?
3407 187         455 my $is_pattern;
3408              
3409             # Patch for rt #126965
3410             # a pattern cannot follow certain keywords which take optional
3411             # arguments, like 'shift' and 'pop'. See also '/'.
3412 187 100 66     1737 if (
    100          
    100          
3413             $last_nonblank_type eq 'k'
3414             && $is_keyword_rejecting_question_as_pattern_delimiter{
3415             $last_nonblank_token}
3416             )
3417             {
3418 1         2 $is_pattern = 0;
3419             }
3420              
3421             # patch for RT#131288, user constant function without prototype
3422             # last type is 'U' followed by ?.
3423             elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
3424 1         5 $is_pattern = 0;
3425             }
3426             elsif ( $expecting == UNKNOWN ) {
3427              
3428             # In older versions of Perl, a bare ? can be a pattern
3429             # delimiter. In perl version 5.22 this was
3430             # dropped, but we have to support it in order to format
3431             # older programs. See:
3432             ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
3433             # For example, the following line worked
3434             # at one time:
3435             # ?(.*)? && (print $1,"\n");
3436             # In current versions it would have to be written with slashes:
3437             # /(.*)/ && (print $1,"\n");
3438 11         32 my $msg;
3439 11         72 ( $is_pattern, $msg ) =
3440             $self->guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
3441             $max_token_index );
3442              
3443 11 50       49 if ($msg) { $self->write_logfile_entry($msg) }
  11         48  
3444             }
3445 174         514 else { $is_pattern = ( $expecting == TERM ) }
3446              
3447 187 50       573 if ($is_pattern) {
3448 0         0 $in_quote = 1;
3449 0         0 $type = 'Q';
3450 0         0 $allowed_quote_modifiers = '[msixpodualngc]';
3451             }
3452             else {
3453 187         735 ( $type_sequence, $indent_flag ) =
3454             $self->increase_nesting_depth( QUESTION_COLON,
3455             $rtoken_map->[$i_tok] );
3456             }
3457 187         447 return;
3458             } ## end sub do_QUESTION_MARK
3459              
3460             sub do_STAR {
3461              
3462 238     238 0 538 my $self = shift;
3463              
3464             # '*' = typeglob, or multiply?
3465 238 50 66     893 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
3466 0 0 0     0 if ( $next_type ne 'b'
      0        
3467             && $next_type ne '('
3468             && $next_type ne '#' ) # Fix c036
3469             {
3470 0         0 $expecting = TERM;
3471             }
3472             }
3473 238 100       636 if ( $expecting == TERM ) {
3474 21         88 $self->scan_simple_identifier();
3475             }
3476             else {
3477              
3478 217 50       880 if ( $rtokens->[ $i + 1 ] eq '=' ) {
    100          
3479 0         0 $tok = '*=';
3480 0         0 $type = $tok;
3481 0         0 $i++;
3482             }
3483             elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
3484 36         95 $tok = '**';
3485 36         86 $type = $tok;
3486 36         62 $i++;
3487 36 50       125 if ( $rtokens->[ $i + 1 ] eq '=' ) {
3488 0         0 $tok = '**=';
3489 0         0 $type = $tok;
3490 0         0 $i++;
3491             }
3492             }
3493             else {
3494             ## not multiple characters
3495             }
3496             }
3497 238         439 return;
3498             } ## end sub do_STAR
3499              
3500             sub do_DOT {
3501              
3502 150     150 0 339 my $self = shift;
3503              
3504             # '.' = what kind of . ?
3505 150 100       482 if ( $expecting != OPERATOR ) {
3506 10         40 $self->scan_number();
3507 10 100       45 if ( $type eq '.' ) {
3508 2 50       7 $self->error_if_expecting_TERM()
3509             if ( $expecting == TERM );
3510             }
3511             }
3512             else {
3513             }
3514 150         302 return;
3515             } ## end sub do_DOT
3516              
3517             sub do_COLON {
3518              
3519 271     271 0 714 my $self = shift;
3520              
3521             # ':' = label, ternary, attribute, ?
3522              
3523             # if this is the first nonblank character, call it a label
3524             # since perl seems to just swallow it
3525 271 50 66     3725 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
    100 66        
    100 66        
    100 66        
    100          
3526 0         0 $type = 'J';
3527             }
3528              
3529             # ATTRS: check for a ':' which introduces an attribute list
3530             # either after a 'sub' keyword or within a paren list
3531             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3532             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
3533 22         57 $type = 'A';
3534 22         50 $self->[_in_attribute_list_] = 1;
3535             }
3536              
3537             # Within a signature, unless we are in a ternary. For example,
3538             # from 't/filter_example.t':
3539             # method foo4 ( $class: $bar ) { $class->bar($bar) }
3540             elsif ( $rparen_type->[$paren_depth] =~ /^sub\b/
3541             && !is_balanced_closing_container(QUESTION_COLON) )
3542             {
3543 1         6 $type = 'A';
3544 1         3 $self->[_in_attribute_list_] = 1;
3545             }
3546              
3547             # check for scalar attribute, such as
3548             # my $foo : shared = 1;
3549             elsif ($is_my_our_state{$statement_type}
3550             && $rcurrent_depth->[QUESTION_COLON] == 0 )
3551             {
3552 15         38 $type = 'A';
3553 15         38 $self->[_in_attribute_list_] = 1;
3554             }
3555              
3556             # Look for Switch::Plain syntax if an error would otherwise occur
3557             # here. Note that we do not need to check if the extended syntax
3558             # flag is set because otherwise an error would occur, and we would
3559             # then have to output a message telling the user to set the
3560             # extended syntax flag to avoid the error.
3561             # case 1: {
3562             # default: {
3563             # default:
3564             # Note that the line 'default:' will be parsed as a label elsewhere.
3565             elsif ( $is_case_default{$statement_type}
3566             && !is_balanced_closing_container(QUESTION_COLON) )
3567             {
3568             # mark it as a perltidy label type
3569 46         100 $type = 'J';
3570             }
3571              
3572             # otherwise, it should be part of a ?/: operator
3573             else {
3574 187         792 ( $type_sequence, $indent_flag ) =
3575             $self->decrease_nesting_depth( QUESTION_COLON,
3576             $rtoken_map->[$i_tok] );
3577 187 50       1099 if ( $last_nonblank_token eq '?' ) {
3578 0         0 $self->warning("Syntax error near ? :\n");
3579             }
3580             }
3581 271         526 return;
3582             } ## end sub do_COLON
3583              
3584             sub do_PLUS_SIGN {
3585              
3586 227     227 0 533 my $self = shift;
3587              
3588             # '+' = what kind of plus?
3589 227 100       896 if ( $expecting == TERM ) {
    100          
3590 13         61 my $number = $self->scan_number_fast();
3591              
3592             # unary plus is safest assumption if not a number
3593 13 50       52 if ( !defined($number) ) { $type = 'p'; }
  13         31  
3594             }
3595             elsif ( $expecting == OPERATOR ) {
3596             }
3597             else {
3598 3 100       11 if ( $next_type eq 'w' ) { $type = 'p' }
  2         4  
3599             }
3600 227         399 return;
3601             } ## end sub do_PLUS_SIGN
3602              
3603             sub do_AT_SIGN {
3604              
3605 438     438 0 1083 my $self = shift;
3606              
3607             # '@' = sigil for array?
3608 438 50       1398 $self->error_if_expecting_OPERATOR("Array")
3609             if ( $expecting == OPERATOR );
3610 438         1729 $self->scan_simple_identifier();
3611 438         846 return;
3612             } ## end sub do_AT_SIGN
3613              
3614             sub do_PERCENT_SIGN {
3615              
3616 202     202 0 545 my $self = shift;
3617              
3618             # '%' = hash or modulo?
3619             # first guess is hash if no following blank or paren
3620 202 50       763 if ( $expecting == UNKNOWN ) {
3621 0 0 0     0 if ( $next_type ne 'b' && $next_type ne '(' ) {
3622 0         0 $expecting = TERM;
3623             }
3624             }
3625 202 100       716 if ( $expecting == TERM ) {
3626 192         700 $self->scan_simple_identifier();
3627             }
3628 202         1509 return;
3629             } ## end sub do_PERCENT_SIGN
3630              
3631             sub do_LEFT_SQUARE_BRACKET {
3632              
3633 595     595 0 1309 my $self = shift;
3634              
3635             # '['
3636 595         1421 $rsquare_bracket_type->[ ++$square_bracket_depth ] =
3637             $last_nonblank_token;
3638 595         2152 ( $type_sequence, $indent_flag ) =
3639             $self->increase_nesting_depth( SQUARE_BRACKET,
3640             $rtoken_map->[$i_tok] );
3641              
3642             # It may seem odd, but structural square brackets have
3643             # type '{' and '}'. This simplifies the indentation logic.
3644 595 100       2126 if ( !is_non_structural_brace() ) {
3645 288         762 $type = '{';
3646             }
3647 595         1449 $rsquare_bracket_structural_type->[$square_bracket_depth] = $type;
3648 595         1150 return;
3649             } ## end sub do_LEFT_SQUARE_BRACKET
3650              
3651             sub do_RIGHT_SQUARE_BRACKET {
3652              
3653 595     595 0 2223 my $self = shift;
3654              
3655             # ']'
3656 595         2244 ( $type_sequence, $indent_flag ) =
3657             $self->decrease_nesting_depth( SQUARE_BRACKET,
3658             $rtoken_map->[$i_tok] );
3659              
3660 595 100       2297 if ( $rsquare_bracket_structural_type->[$square_bracket_depth] eq '{' )
3661             {
3662 288         578 $type = '}';
3663             }
3664              
3665             # propagate type information for smartmatch operator. This is
3666             # necessary to enable us to know if an operator or term is expected
3667             # next.
3668 595 100       1777 if ( $rsquare_bracket_type->[$square_bracket_depth] eq '~~' ) {
3669 20         43 $tok = $rsquare_bracket_type->[$square_bracket_depth];
3670             }
3671              
3672 595 50       1594 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
  595         2032  
3673 595         1040 return;
3674             } ## end sub do_RIGHT_SQUARE_BRACKET
3675              
3676             sub do_MINUS_SIGN {
3677              
3678 441     441 0 995 my $self = shift;
3679              
3680             # '-' = what kind of minus?
3681 441 100 100     3020 if ( ( $expecting != OPERATOR )
    100          
    100          
3682             && $is_file_test_operator{$next_tok} )
3683             {
3684 10         77 my ( $next_nonblank_token, $i_next ) =
3685             $self->find_next_nonblank_token( $i + 1, $rtokens,
3686             $max_token_index );
3687              
3688             # check for a quoted word like "-w=>xx";
3689             # it is sufficient to just check for a following '='
3690 10 50       80 if ( $next_nonblank_token eq '=' ) {
3691 0         0 $type = 'm';
3692             }
3693             else {
3694 10         24 $i++;
3695 10         32 $tok .= $next_tok;
3696 10         57 $type = 'F';
3697             }
3698             }
3699             elsif ( $expecting == TERM ) {
3700 330         1070 my $number = $self->scan_number_fast();
3701              
3702             # maybe part of bareword token? unary is safest
3703 330 100       928 if ( !defined($number) ) { $type = 'm'; }
  288         557  
3704              
3705             }
3706             elsif ( $expecting == OPERATOR ) {
3707             }
3708             else {
3709              
3710 4 50       18 if ( $next_type eq 'w' ) {
3711 4         14 $type = 'm';
3712             }
3713             }
3714 441         760 return;
3715             } ## end sub do_MINUS_SIGN
3716              
3717             sub do_CARAT_SIGN {
3718              
3719 12     12 0 23 my $self = shift;
3720              
3721             # '^'
3722             # check for special variables like ${^WARNING_BITS}
3723 12 100       40 if ( $expecting == TERM ) {
3724              
3725 5 50 33     74 if ( $last_nonblank_token eq '{'
      33        
3726             && ( $next_tok !~ /^\d/ )
3727             && ( $next_tok =~ /^\w/ ) )
3728             {
3729              
3730 5 100       21 if ( $next_tok eq 'W' ) {
3731 1         2 $self->[_saw_perl_dash_w_] = 1;
3732             }
3733 5         15 $tok = $tok . $next_tok;
3734 5         12 $i = $i + 1;
3735 5         9 $type = 'w';
3736              
3737             # Optional coding to try to catch syntax errors. This can
3738             # be removed if it ever causes incorrect warning messages.
3739             # The '{^' should be preceded by either by a type or '$#'
3740             # Examples:
3741             # $#{^CAPTURE} ok
3742             # *${^LAST_FH}{NAME} ok
3743             # @{^HOWDY} ok
3744             # $hash{^HOWDY} error
3745              
3746             # Note that a type sigil '$' may be tokenized as 'Z'
3747             # after something like 'print', so allow type 'Z'
3748 5 0 33     19 if ( $last_last_nonblank_type ne 't'
      33        
3749             && $last_last_nonblank_type ne 'Z'
3750             && $last_last_nonblank_token ne '$#' )
3751             {
3752 0         0 $self->warning("Possible syntax error near '{^'\n");
3753             }
3754             }
3755              
3756             else {
3757 0 0       0 if ( !$self->error_if_expecting_TERM() ) {
3758              
3759             # Something like this is valid but strange:
3760             # undef ^I;
3761 0         0 $self->complain("The '^' seems unusual here\n");
3762             }
3763             }
3764             }
3765 12         23 return;
3766             } ## end sub do_CARAT_SIGN
3767              
3768             sub do_DOUBLE_COLON {
3769              
3770 9     9 0 21 my $self = shift;
3771              
3772             # '::' = probably a sub call
3773 9         25 $self->scan_bare_identifier();
3774 9         18 return;
3775             } ## end sub do_DOUBLE_COLON
3776              
3777             sub do_LEFT_SHIFT {
3778              
3779 7     7 0 28 my $self = shift;
3780              
3781             # '<<' = maybe a here-doc?
3782 7 50       36 if ( $expecting != OPERATOR ) {
3783 7         28 my ( $found_target, $here_doc_target, $here_quote_character,
3784             $saw_error );
3785             (
3786 7         43 $found_target, $here_doc_target, $here_quote_character, $i,
3787             $saw_error
3788             )
3789             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3790             $max_token_index );
3791              
3792 7 50       35 if ($found_target) {
    0          
3793 7         19 push @{$rhere_target_list},
  7         27  
3794             [ $here_doc_target, $here_quote_character ];
3795 7         26 $type = 'h';
3796 7 50       75 if ( length($here_doc_target) > 80 ) {
    50          
    100          
3797 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3798 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3799             }
3800             elsif ( !$here_doc_target ) {
3801 0 0       0 $self->warning(
3802             'Use of bare << to mean <<"" is deprecated' . "\n" )
3803             if ( !$here_quote_character );
3804             }
3805             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3806 2         17 $self->complain(
3807             "Unconventional here-target: '$here_doc_target'\n");
3808             }
3809             else {
3810             ## ok: nothing to complain about
3811             }
3812             }
3813             elsif ( $expecting == TERM ) {
3814 0 0       0 if ( !$saw_error ) {
3815              
3816             # shouldn't happen..arriving here implies an error in
3817             # the logic in sub 'find_here_doc'
3818 0         0 if (DEVEL_MODE) {
3819             $self->Fault(<<EOM);
3820             Program bug; didn't find here doc target
3821             EOM
3822             }
3823             $self->warning(
3824 0         0 "Possible program error: didn't find here doc target\n"
3825             );
3826 0         0 $self->report_definite_bug();
3827             }
3828             }
3829              
3830             # target not found, expecting == UNKNOWN
3831             else {
3832             # assume it is a shift
3833             }
3834             }
3835             else {
3836             }
3837 7         21 return;
3838             } ## end sub do_LEFT_SHIFT
3839              
3840             sub do_NEW_HERE_DOC {
3841              
3842             # '<<~' = a here-doc, new type added in v26
3843              
3844 2     2 0 5 my $self = shift;
3845              
3846             return
3847 2 50       9 if ( $i >= $max_token_index ); # here-doc not possible if end of line
3848 2 50       10 if ( $expecting != OPERATOR ) {
3849 2         5 my ( $found_target, $here_doc_target, $here_quote_character,
3850             $saw_error );
3851             (
3852 2         14 $found_target, $here_doc_target, $here_quote_character, $i,
3853             $saw_error
3854             )
3855             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3856             $max_token_index );
3857              
3858 2 50       9 if ($found_target) {
    0          
3859              
3860 2 50       21 if ( length($here_doc_target) > 80 ) {
    50          
3861 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3862 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3863             }
3864             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3865 0         0 $self->complain(
3866             "Unconventional here-target: '$here_doc_target'\n");
3867             }
3868             else {
3869             ## ok: nothing to complain about
3870             }
3871              
3872             # Note that we put a leading space on the here quote
3873             # character indicate that it may be preceded by spaces
3874 2         7 $here_quote_character = SPACE . $here_quote_character;
3875 2         6 push @{$rhere_target_list},
  2         9  
3876             [ $here_doc_target, $here_quote_character ];
3877 2         6 $type = 'h';
3878             }
3879              
3880             # target not found ..
3881             elsif ( $expecting == TERM ) {
3882 0 0       0 if ( !$saw_error ) {
3883              
3884             # shouldn't happen..arriving here implies an error in
3885             # the logic in sub 'find_here_doc'
3886 0         0 if (DEVEL_MODE) {
3887             $self->Fault(<<EOM);
3888             Program bug; didn't find here doc target
3889             EOM
3890             }
3891             $self->warning(
3892 0         0 "Possible program error: didn't find here doc target\n"
3893             );
3894 0         0 $self->report_definite_bug();
3895             }
3896             }
3897              
3898             # Target not found, expecting==UNKNOWN
3899             else {
3900 0         0 $self->warning("didn't find here doc target after '<<~'\n");
3901             }
3902             }
3903             else {
3904 0         0 $self->error_if_expecting_OPERATOR();
3905             }
3906 2         9 return;
3907             } ## end sub do_NEW_HERE_DOC
3908              
3909             sub do_POINTER {
3910              
3911             # '->'
3912 887     887 0 1680 return;
3913             }
3914              
3915             sub do_PLUS_PLUS {
3916              
3917 46     46 0 178 my $self = shift;
3918              
3919             # '++'
3920             # type = 'pp' for pre-increment, '++' for post-increment
3921 46 100       242 if ( $expecting == OPERATOR ) { $type = '++' }
  37 100       122  
3922 7         17 elsif ( $expecting == TERM ) { $type = 'pp' }
3923              
3924             # handle ( $expecting == UNKNOWN )
3925             else {
3926              
3927             # look ahead ..
3928 2         18 my ( $next_nonblank_token, $i_next ) =
3929             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3930              
3931             # Fix for c042: look past a side comment
3932 2 50       23 if ( $next_nonblank_token eq '#' ) {
3933 0         0 ( $next_nonblank_token, $i_next ) =
3934             $self->find_next_nonblank_token( $max_token_index,
3935             $rtokens, $max_token_index );
3936             }
3937              
3938 2 50       16 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
  0         0  
3939             }
3940 46         98 return;
3941             } ## end sub do_PLUS_PLUS
3942              
3943             sub do_FAT_COMMA {
3944              
3945 1025     1025 0 1918 my $self = shift;
3946              
3947             # '=>'
3948 1025 50       2439 if ( $last_nonblank_type eq $tok ) {
3949 0         0 $self->complain("Repeated '=>'s \n");
3950             }
3951              
3952             # patch for operator_expected: note if we are in the list (use.t)
3953             # TODO: make version numbers a new token type
3954 1025 100       2393 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  18         44  
3955 1025         1733 return;
3956             } ## end sub do_FAT_COMMA
3957              
3958             sub do_MINUS_MINUS {
3959              
3960 2     2 0 5 my $self = shift;
3961              
3962             # '--'
3963             # type = 'mm' for pre-decrement, '--' for post-decrement
3964              
3965 2 50       13 if ( $expecting == OPERATOR ) { $type = '--' }
  0 50       0  
3966 2         6 elsif ( $expecting == TERM ) { $type = 'mm' }
3967              
3968             # handle ( $expecting == UNKNOWN )
3969             else {
3970              
3971             # look ahead ..
3972 0         0 my ( $next_nonblank_token, $i_next ) =
3973             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3974              
3975             # Fix for c042: look past a side comment
3976 0 0       0 if ( $next_nonblank_token eq '#' ) {
3977 0         0 ( $next_nonblank_token, $i_next ) =
3978             $self->find_next_nonblank_token( $max_token_index,
3979             $rtokens, $max_token_index );
3980             }
3981              
3982 0 0       0 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
  0         0  
3983             }
3984              
3985 2         5 return;
3986             } ## end sub do_MINUS_MINUS
3987              
3988             sub do_LOGICAL_AND {
3989              
3990 58     58 0 123 my $self = shift;
3991              
3992             # '&&'
3993 58 50 33     274 $self->error_if_expecting_TERM()
3994             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3995 58         113 return;
3996             } ## end sub do_LOGICAL_AND
3997              
3998             sub do_LOGICAL_OR {
3999              
4000 74     74 0 181 my $self = shift;
4001              
4002             # '||'
4003 74 100 66     398 $self->error_if_expecting_TERM()
4004             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
4005 74         165 return;
4006             } ## end sub do_LOGICAL_OR
4007              
4008             sub do_SLASH_SLASH {
4009              
4010 10     10 0 20 my $self = shift;
4011              
4012             # '//'
4013 10 100       35 $self->error_if_expecting_TERM()
4014             if ( $expecting == TERM );
4015 10         20 return;
4016             } ## end sub do_SLASH_SLASH
4017              
4018             sub do_DIGITS {
4019              
4020 1934     1934 0 3337 my $self = shift;
4021              
4022             # 'd' = string of digits
4023 1934 50       4278 $self->error_if_expecting_OPERATOR("Number")
4024             if ( $expecting == OPERATOR );
4025              
4026 1934         4857 my $number = $self->scan_number_fast();
4027 1934 50       4583 if ( !defined($number) ) {
4028              
4029             # shouldn't happen - we should always get a number
4030 0         0 if (DEVEL_MODE) {
4031             $self->Fault(<<EOM);
4032             non-number beginning with digit--program bug
4033             EOM
4034             }
4035             $self->warning(
4036 0         0 "Unexpected error condition: non-number beginning with digit\n"
4037             );
4038 0         0 $self->report_definite_bug();
4039             }
4040 1934         3206 return;
4041             } ## end sub do_DIGITS
4042              
4043             sub do_ATTRIBUTE_LIST {
4044              
4045 39     39 0 98 my ( $self, $next_nonblank_token ) = @_;
4046              
4047             # Called at a bareword encountered while in an attribute list
4048             # returns 'is_attribute':
4049             # true if attribute found
4050             # false if an attribute (continue parsing bareword)
4051              
4052             # treat bare word followed by open paren like qw(
4053 39 100       113 if ( $next_nonblank_token eq '(' ) {
4054              
4055             # For something like:
4056             # : prototype($$)
4057             # we should let do_scan_sub see it so that it can see
4058             # the prototype. All other attributes get parsed as a
4059             # quoted string.
4060 18 100       88 if ( $tok eq 'prototype' ) {
4061 2         7 $id_scan_state = 'prototype';
4062              
4063             # start just after the word 'prototype'
4064 2         9 my $i_beg = $i + 1;
4065 2         22 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
4066             {
4067             input_line => $input_line,
4068             i => $i,
4069             i_beg => $i_beg,
4070             tok => $tok,
4071             type => $type,
4072             rtokens => $rtokens,
4073             rtoken_map => $rtoken_map,
4074             id_scan_state => $id_scan_state,
4075             max_token_index => $max_token_index,
4076             }
4077             );
4078              
4079             # If successful, mark as type 'q' to be consistent
4080             # with other attributes. Type 'w' would also work.
4081 2 50       14 if ( $i > $i_beg ) {
4082 2         5 $type = 'q';
4083 2         6 return 1;
4084             }
4085              
4086             # If not successful, continue and parse as a quote.
4087             }
4088              
4089             # All other attribute lists must be parsed as quotes
4090             # (see 'signatures.t' for good examples)
4091 16         76 $in_quote = $quote_items{'q'};
4092 16         48 $allowed_quote_modifiers = $quote_modifiers{'q'};
4093 16         34 $type = 'q';
4094 16         31 $quote_type = 'q';
4095 16         39 return 1;
4096             }
4097              
4098             # handle bareword not followed by open paren
4099             else {
4100 21         57 $type = 'w';
4101 21         53 return 1;
4102             }
4103              
4104             # attribute not found
4105 0         0 return;
4106             } ## end sub do_ATTRIBUTE_LIST
4107              
4108             sub do_QUOTED_BAREWORD {
4109              
4110 786     786 0 1507 my $self = shift;
4111              
4112             # find type of a bareword followed by a '=>'
4113 786 100       4420 if ( $ris_constant->{$current_package}{$tok} ) {
    50          
    50          
4114 14         36 $type = 'C';
4115             }
4116             elsif ( $ris_user_function->{$current_package}{$tok} ) {
4117 0         0 $type = 'U';
4118 0         0 $prototype = $ruser_function_prototype->{$current_package}{$tok};
4119             }
4120             elsif ( $tok =~ /^v\d+$/ ) {
4121 0         0 $type = 'v';
4122 0         0 $self->report_v_string($tok);
4123             }
4124             else {
4125              
4126             # Bareword followed by a fat comma - see 'git18.in'
4127             # If tok is something like 'x17' then it could
4128             # actually be operator x followed by number 17.
4129             # For example, here:
4130             # 123x17 => [ 792, 1224 ],
4131             # (a key of 123 repeated 17 times, perhaps not
4132             # what was intended). We will mark x17 as type
4133             # 'n' and it will be split. If the previous token
4134             # was also a bareword then it is not very clear is
4135             # going on. In this case we will not be sure that
4136             # an operator is expected, so we just mark it as a
4137             # bareword. Perl is a little murky in what it does
4138             # with stuff like this, and its behavior can change
4139             # over time. Something like
4140             # a x18 => [792, 1224], will compile as
4141             # a key with 18 a's. But something like
4142             # push @array, a x18;
4143             # is a syntax error.
4144 772 100 66     2730 if (
      33        
      66        
4145             $expecting == OPERATOR
4146             && substr( $tok, 0, 1 ) eq 'x'
4147             && ( length($tok) == 1
4148             || substr( $tok, 1, 1 ) =~ /^\d/ )
4149             )
4150             {
4151 3         8 $type = 'n';
4152 3 50       12 if ( $self->split_pretoken(1) ) {
4153 3         7 $type = 'x';
4154 3         6 $tok = 'x';
4155             }
4156             }
4157             else {
4158              
4159             # git #18
4160 769         1354 $type = 'w';
4161 769         1995 $self->error_if_expecting_OPERATOR();
4162             }
4163             }
4164 786         1284 return;
4165             } ## end sub do_QUOTED_BAREWORD
4166              
4167             sub do_X_OPERATOR {
4168              
4169 17     17 0 51 my $self = shift;
4170              
4171 17 100       73 if ( $tok eq 'x' ) {
4172 15 50       74 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
4173 0         0 $tok = 'x=';
4174 0         0 $type = $tok;
4175 0         0 $i++;
4176             }
4177             else {
4178 15         44 $type = 'x';
4179             }
4180             }
4181             else {
4182              
4183             # Split a pretoken like 'x10' into 'x' and '10'.
4184             # Note: In previous versions of perltidy it was marked
4185             # as a number, $type = 'n', and fixed downstream by the
4186             # Formatter.
4187 2         5 $type = 'n';
4188 2 50       9 if ( $self->split_pretoken(1) ) {
4189 2         10 $type = 'x';
4190 2         16 $tok = 'x';
4191             }
4192             }
4193 17         33 return;
4194             } ## end sub do_X_OPERATOR
4195              
4196             sub do_USE_CONSTANT {
4197              
4198 16     16 0 42 my $self = shift;
4199              
4200 16         62 $self->scan_bare_identifier();
4201 16         111 my ( $next_nonblank_tok2, $i_next2 ) =
4202             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4203              
4204 16 50       111 if ($next_nonblank_tok2) {
4205              
4206 16 100       91 if ( $is_keyword{$next_nonblank_tok2} ) {
4207              
4208             # Assume qw is used as a quote and okay, as in:
4209             # use constant qw{ DEBUG 0 };
4210             # Not worth trying to parse for just a warning
4211              
4212             # NOTE: This warning is deactivated because recent
4213             # versions of perl do not complain here, but
4214             # the coding is retained for reference.
4215 1         2 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
4216             $self->warning(
4217             "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
4218             );
4219             }
4220             }
4221              
4222             else {
4223 15         61 $ris_constant->{$current_package}{$next_nonblank_tok2} = 1;
4224             }
4225             }
4226 16         41 return;
4227             } ## end sub do_USE_CONSTANT
4228              
4229             sub do_KEYWORD {
4230              
4231 2644     2644 0 4465 my $self = shift;
4232              
4233             # found a keyword - set any associated flags
4234 2644         4563 $type = 'k';
4235              
4236             # Since for and foreach may not be followed immediately
4237             # by an opening paren, we have to remember which keyword
4238             # is associated with the next '('
4239             # Previously, before update c230 : if ( $is_for_foreach{$tok} ) {
4240             ##(if elsif unless while until for foreach switch case given when catch)
4241 2644 100       7584 if ( $is_blocktype_with_paren{$tok} ) {
4242 396 100       1488 if ( new_statement_ok() ) {
4243 309         721 $want_paren = $tok;
4244             }
4245             }
4246              
4247             # recognize 'use' statements, which are special
4248 2644 100 100     19373 if ( $is_use_require{$tok} ) {
    100 66        
    100          
    100          
    100          
    100          
4249 175         359 $statement_type = $tok;
4250 175 50       511 $self->error_if_expecting_OPERATOR()
4251             if ( $expecting == OPERATOR );
4252             }
4253              
4254             # remember my and our to check for trailing ": shared"
4255             elsif ( $is_my_our_state{$tok} ) {
4256 629         1358 $statement_type = $tok;
4257             }
4258              
4259             # Check for misplaced 'elsif' and 'else', but allow isolated
4260             # else or elsif blocks to be formatted. This is indicated
4261             # by a last noblank token of ';'
4262             elsif ( $tok eq 'elsif' ) {
4263 29 50 66     221 if (
4264             $last_nonblank_token ne ';'
4265              
4266             ## !~ /^(if|elsif|unless)$/
4267             && !$is_if_elsif_unless{$last_nonblank_block_type}
4268             )
4269             {
4270 0         0 $self->warning(
4271             "expecting '$tok' to follow one of 'if|elsif|unless'\n");
4272             }
4273             }
4274             elsif ( $tok eq 'else' ) {
4275              
4276             # patched for SWITCH/CASE
4277 44 50 66     598 if (
      66        
4278             $last_nonblank_token ne ';'
4279              
4280             ## !~ /^(if|elsif|unless|case|when)$/
4281             && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
4282              
4283             # patch to avoid an unwanted error message for
4284             # the case of a parenless 'case' (RT 105484):
4285             # switch ( 1 ) { case x { 2 } else { } }
4286             ## !~ /^(if|elsif|unless|case|when)$/
4287             && !$is_if_elsif_unless_case_when{$statement_type}
4288             )
4289             {
4290 0         0 $self->warning(
4291             "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
4292             );
4293             }
4294             }
4295              
4296             # patch for SWITCH/CASE if 'case' and 'when are
4297             # treated as keywords. Also 'default' for Switch::Plain
4298             elsif ($tok eq 'when'
4299             || $tok eq 'case'
4300             || $tok eq 'default' )
4301             {
4302 56         111 $statement_type = $tok; # next '{' is block
4303             }
4304              
4305             # feature 'err' was removed in Perl 5.10. So mark this as
4306             # a bareword unless an operator is expected (see c158).
4307             elsif ( $tok eq 'err' ) {
4308 1 50       5 if ( $expecting != OPERATOR ) { $type = 'w' }
  1         3  
4309             }
4310             else {
4311             ## no special treatment needed
4312             }
4313              
4314 2644         4709 return;
4315             } ## end sub do_KEYWORD
4316              
4317             sub do_QUOTE_OPERATOR {
4318              
4319 202     202 0 434 my $self = shift;
4320              
4321 202 50       674 if ( $expecting == OPERATOR ) {
4322              
4323             # Be careful not to call an error for a qw quote
4324             # where a parenthesized list is allowed. For example,
4325             # it could also be a for/foreach construct such as
4326             #
4327             # foreach my $key qw\Uno Due Tres Quadro\ {
4328             # print "Set $key\n";
4329             # }
4330             #
4331              
4332             # Or it could be a function call.
4333             # NOTE: Braces in something like &{ xxx } are not
4334             # marked as a block, we might have a method call.
4335             # &method(...), $method->(..), &{method}(...),
4336             # $ref[2](list) is ok & short for $ref[2]->(list)
4337             #
4338             # See notes in 'sub code_block_type' and
4339             # 'sub is_non_structural_brace'
4340              
4341             my $paren_list_possible = $tok eq 'qw'
4342             && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
4343 0   0     0 || $is_for_foreach{$want_paren} );
4344              
4345 0 0       0 if ( !$paren_list_possible ) {
4346 0         0 $self->error_if_expecting_OPERATOR();
4347             }
4348             }
4349 202         528 $in_quote = $quote_items{$tok};
4350 202         537 $allowed_quote_modifiers = $quote_modifiers{$tok};
4351              
4352             # All quote types are 'Q' except possibly qw quotes.
4353             # qw quotes are special in that they may generally be trimmed
4354             # of leading and trailing whitespace. So they are given a
4355             # separate type, 'q', unless requested otherwise.
4356 202 100 66     982 $type =
4357             ( $tok eq 'qw' && $rOpts_trim_qw )
4358             ? 'q'
4359             : 'Q';
4360 202         404 $quote_type = $type;
4361 202         394 return;
4362             } ## end sub do_QUOTE_OPERATOR
4363              
4364             sub do_UNKNOWN_BAREWORD {
4365              
4366 958     958 0 2322 my ( $self, $next_nonblank_token ) = @_;
4367              
4368 958         3298 $self->scan_bare_identifier();
4369              
4370 958 100 100     3608 if ( $statement_type eq 'use'
4371             && $last_nonblank_token eq 'use' )
4372             {
4373 108         418 $rsaw_use_module->{$current_package}->{$tok} = 1;
4374             }
4375              
4376 958 100       2590 if ( $type eq 'w' ) {
4377              
4378 933 50       2499 if ( $expecting == OPERATOR ) {
4379              
4380             # Patch to avoid error message for RPerl overloaded
4381             # operator functions: use overload
4382             # '+' => \&sse_add,
4383             # '-' => \&sse_sub,
4384             # '*' => \&sse_mul,
4385             # '/' => \&sse_div;
4386             # TODO: this could eventually be generalized
4387 0 0 0     0 if ( $rsaw_use_module->{$current_package}->{'RPerl'}
    0          
    0          
4388             && $tok =~ /^sse_(mul|div|add|sub)$/ )
4389             {
4390              
4391             }
4392              
4393             # Fix part 1 for git #63 in which a comment falls
4394             # between an -> and the following word. An
4395             # alternate fix would be to change operator_expected
4396             # to return an UNKNOWN for this type.
4397             elsif ( $last_nonblank_type eq '->' ) {
4398              
4399             }
4400              
4401             # don't complain about possible indirect object
4402             # notation.
4403             # For example:
4404             # package main;
4405             # sub new($) { ... }
4406             # $b = new A::; # calls A::new
4407             # $c = new A; # same thing but suspicious
4408             # This will call A::new but we have a 'new' in
4409             # main:: which looks like a constant.
4410             #
4411             elsif ( $last_nonblank_type eq 'C' ) {
4412 0 0       0 if ( $tok !~ /::$/ ) {
4413 0         0 $self->complain(<<EOM);
4414             Expecting operator after '$last_nonblank_token' but found bare word '$tok'
4415             Maybe indirectet object notation?
4416             EOM
4417             }
4418             }
4419             else {
4420 0         0 $self->error_if_expecting_OPERATOR("bareword");
4421             }
4422             }
4423              
4424             # mark bare words immediately followed by a paren as
4425             # functions
4426 933         2502 $next_tok = $rtokens->[ $i + 1 ];
4427 933 100       2648 if ( $next_tok eq '(' ) {
4428              
4429             # Patch for issue c151, where we are processing a snippet and
4430             # have not seen that SPACE is a constant. In this case 'x' is
4431             # probably an operator. The only disadvantage with an incorrect
4432             # guess is that the space after it may be incorrect. For example
4433             # $str .= SPACE x ( 16 - length($str) ); See also b1410.
4434 276 50 33     1510 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
  0 50       0  
4435              
4436             # Fix part 2 for git #63. Leave type as 'w' to keep
4437             # the type the same as if the -> were not separated
4438 276         578 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
4439              
4440             # not a special case
4441             else { }
4442              
4443             }
4444              
4445             # underscore after file test operator is file handle
4446 933 50 66     3254 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4447 0         0 $type = 'Z';
4448             }
4449              
4450             # patch for SWITCH/CASE if 'case' and 'when are
4451             # not treated as keywords:
4452 933 50 33     4873 if (
      33        
      33        
4453             ( $tok eq 'case' && $rbrace_type->[$brace_depth] eq 'switch' )
4454             || ( $tok eq 'when'
4455             && $rbrace_type->[$brace_depth] eq 'given' )
4456             )
4457             {
4458 0         0 $statement_type = $tok; # next '{' is block
4459 0         0 $type = 'k'; # for keyword syntax coloring
4460             }
4461 933 100       2772 if ( $next_nonblank_token eq '(' ) {
4462              
4463             # patch for SWITCH/CASE if switch and given not keywords
4464             # Switch is not a perl 5 keyword, but we will gamble
4465             # and mark switch followed by paren as a keyword. This
4466             # is only necessary to get html syntax coloring nice,
4467             # and does not commit this as being a switch/case.
4468 241 50 33     1981 if ( $tok eq 'switch' || $tok eq 'given' ) {
    50 33        
4469 0         0 $type = 'k'; # for keyword syntax coloring
4470             }
4471              
4472             # mark 'x' as operator for something like this (see b1410)
4473             # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
4474             elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
4475 0         0 $type = 'x';
4476             }
4477             else {
4478             ## not a special case
4479             }
4480             }
4481             }
4482 958         1823 return;
4483             } ## end sub do_UNKNOWN_BAREWORD
4484              
4485             sub sub_attribute_ok_here {
4486              
4487 35     35 0 156 my ( $self, $tok_kw, $next_nonblank_token, $i_next ) = @_;
4488              
4489             # Decide if 'sub :' can be the start of a sub attribute list.
4490             # We will decide based on if the colon is followed by a
4491             # bareword which is not a keyword.
4492             # Changed inext+1 to inext to fixed case b1190.
4493 35         69 my $sub_attribute_ok_here;
4494 35 50 66     171 if ( $is_sub{$tok_kw}
      66        
4495             && $expecting != OPERATOR
4496             && $next_nonblank_token eq ':' )
4497             {
4498 3         11 my ( $nn_nonblank_token, $i_nn ) =
4499             $self->find_next_nonblank_token( $i_next, $rtokens,
4500             $max_token_index );
4501             $sub_attribute_ok_here =
4502             $nn_nonblank_token =~ /^\w/
4503             && $nn_nonblank_token !~ /^\d/
4504 3   66     77 && !$is_keyword{$nn_nonblank_token};
4505             }
4506 35         218 return $sub_attribute_ok_here;
4507             } ## end sub sub_attribute_ok_here
4508              
4509             sub do_BAREWORD {
4510              
4511 5838     5838 0 10876 my ($self) = @_;
4512              
4513             # handle a bareword token:
4514             # returns
4515             # true if this token ends the current line
4516             # false otherwise
4517              
4518 5838         16564 my ( $next_nonblank_token, $i_next ) =
4519             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4520              
4521             # a bare word immediately followed by :: is not a keyword;
4522             # use $tok_kw when testing for keywords to avoid a mistake
4523 5838         10977 my $tok_kw = $tok;
4524 5838 100 100     17244 if ( $rtokens->[ $i + 1 ] eq ':'
4525             && $rtokens->[ $i + 2 ] eq ':' )
4526             {
4527 266         591 $tok_kw .= '::';
4528             }
4529              
4530 5838 100       12963 if ( $self->[_in_attribute_list_] ) {
4531 39         221 my $is_attribute = $self->do_ATTRIBUTE_LIST($next_nonblank_token);
4532 39 50       131 return if ($is_attribute);
4533             }
4534              
4535             #----------------------------------------
4536             # Starting final if-elsif- chain of tests
4537             #----------------------------------------
4538              
4539             # This is the return flag:
4540             # true => this is the last token on the line
4541             # false => keep tokenizing the line
4542 5799         8544 my $is_last;
4543              
4544             # The following blocks of code must update these vars:
4545             # $type - the final token type, must always be set
4546              
4547             # In addition, if additional pretokens are added:
4548             # $tok - the final token
4549             # $i - the index of the last pretoken
4550              
4551             # They may also need to check and set various flags
4552              
4553             # Scan a bare word following a -> as an identifier; it could
4554             # have a long package name. Fixes c037, c041.
4555 5799 100 100     92091 if ( $last_nonblank_token eq '->' ) {
    100 66        
    100 100        
    100 66        
    100 66        
    100 100        
    100 66        
    50 66        
    50 100        
    100 100        
    100 33        
    100 0        
    100 0        
    100 33        
    100 0        
    100 0        
    100 66        
    100 100        
      100        
      100        
      66        
      66        
      66        
4556 670         2432 $self->scan_bare_identifier();
4557              
4558             # a bareward after '->' gets type 'i'
4559 670         1245 $type = 'i';
4560             }
4561              
4562             # Quote a word followed by => operator
4563             elsif (
4564             ( $next_nonblank_token eq '=' && $rtokens->[ $i_next + 1 ] eq '>' )
4565              
4566             # unless the word is __END__ or __DATA__ and is the only word on
4567             # the line.
4568             && ( !defined( $is_END_DATA{$tok_kw} )
4569             || $input_line !~ /^\s*__(?:END|DATA)__\s*$/ )
4570             )
4571             {
4572 786         2431 $self->do_QUOTED_BAREWORD();
4573             }
4574              
4575             # quote a bare word within braces..like xxx->{s}; note that we
4576             # must be sure this is not a structural brace, to avoid
4577             # mistaking {s} in the following for a quoted bare word:
4578             # for(@[){s}bla}BLA}
4579             # Also treat q in something like var{-q} as a bare word, not
4580             # a quote operator
4581             elsif (
4582             $next_nonblank_token eq '}'
4583             && (
4584             $last_nonblank_type eq 'L'
4585             || ( $last_nonblank_type eq 'm'
4586             && $last_last_nonblank_type eq 'L' )
4587             )
4588             )
4589             {
4590 100         259 $type = 'w';
4591             }
4592              
4593             # handle operator x (now we know it isn't $x=)
4594             elsif (
4595             $expecting == OPERATOR
4596             && substr( $tok, 0, 1 ) eq 'x'
4597             && ( length($tok) == 1
4598             || substr( $tok, 1, 1 ) =~ /^\d/ )
4599             )
4600             {
4601 17         95 $self->do_X_OPERATOR();
4602             }
4603             elsif ( $tok_kw eq 'CORE::' ) {
4604 3         6 $type = $tok = $tok_kw;
4605 3         6 $i += 2;
4606             }
4607             elsif ( ( $tok eq 'strict' )
4608             and ( $last_nonblank_token eq 'use' ) )
4609             {
4610 14         51 $self->[_saw_use_strict_] = 1;
4611 14         96 $self->scan_bare_identifier();
4612             }
4613              
4614             elsif ( ( $tok eq 'warnings' )
4615             and ( $last_nonblank_token eq 'use' ) )
4616             {
4617 7         28 $self->[_saw_perl_dash_w_] = 1;
4618              
4619             # scan as identifier, so that we pick up something like:
4620             # use warnings::register
4621 7         48 $self->scan_bare_identifier();
4622             }
4623              
4624             elsif (
4625             $tok eq 'AutoLoader'
4626             && $self->[_look_for_autoloader_]
4627             && (
4628             $last_nonblank_token eq 'use'
4629              
4630             # these regexes are from AutoSplit.pm, which we want
4631             # to mimic
4632             || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4633             || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4634             )
4635             )
4636             {
4637 0         0 $self->write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4638 0         0 $self->[_saw_autoloader_] = 1;
4639 0         0 $self->[_look_for_autoloader_] = 0;
4640 0         0 $self->scan_bare_identifier();
4641             }
4642              
4643             elsif (
4644             $tok eq 'SelfLoader'
4645             && $self->[_look_for_selfloader_]
4646             && ( $last_nonblank_token eq 'use'
4647             || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4648             || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4649             )
4650             {
4651 0         0 $self->write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4652 0         0 $self->[_saw_selfloader_] = 1;
4653 0         0 $self->[_look_for_selfloader_] = 0;
4654 0         0 $self->scan_bare_identifier();
4655             }
4656              
4657             elsif ( ( $tok eq 'constant' )
4658             and ( $last_nonblank_token eq 'use' ) )
4659             {
4660 16         82 $self->do_USE_CONSTANT();
4661             }
4662              
4663             # various quote operators
4664             elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4665 202         840 $self->do_QUOTE_OPERATOR();
4666             }
4667              
4668             # check for a statement label
4669             elsif (
4670             ( $next_nonblank_token eq ':' )
4671             && ( $rtokens->[ $i_next + 1 ] ne ':' )
4672             && ( $i_next <= $max_token_index ) # colon on same line
4673              
4674             # like 'sub : lvalue' ?
4675             && !$self->sub_attribute_ok_here( $tok_kw, $next_nonblank_token,
4676             $i_next )
4677             && new_statement_ok()
4678             )
4679             {
4680 33 100       177 if ( $tok !~ /[A-Z]/ ) {
4681 15         36 push @{ $self->[_rlower_case_labels_at_] }, $input_line_number;
  15         56  
4682             }
4683 33         96 $type = 'J';
4684 33         92 $tok .= ':';
4685 33         87 $i = $i_next;
4686             }
4687              
4688             # 'sub' or other sub alias
4689             elsif ( $is_sub{$tok_kw} ) {
4690              
4691             # Update for --use-feature=class (rt145706):
4692             # We have to be extra careful to avoid misparsing other uses of
4693             # 'method' in older scripts.
4694 303 100 100     1544 if ( $tok_kw eq 'method' && $guess_if_method ) {
4695 10 100 66     113 if ( $expecting == OPERATOR
      100        
4696             || $next_nonblank_token !~ /^[\w\:]/
4697             || !$self->method_ok_here() )
4698             {
4699 7         67 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4700             }
4701             else {
4702 3         168 initialize_subname();
4703 3         18 $self->scan_id();
4704             }
4705             }
4706             else {
4707 293 50       875 $self->error_if_expecting_OPERATOR()
4708             if ( $expecting == OPERATOR );
4709 293         1116 initialize_subname();
4710 293         1110 $self->scan_id();
4711             }
4712             }
4713              
4714             # 'package'
4715             elsif ( $is_package{$tok_kw} ) {
4716              
4717             # Update for --use-feature=class (rt145706):
4718             # We have to be extra careful because 'class' may be used for other
4719             # purposes on older code; i.e.
4720             # class($x) - valid sub call
4721             # package($x) - error
4722 30 100       98 if ( $tok_kw eq 'class' ) {
4723 8 100 66     72 if ( $expecting == OPERATOR
      100        
4724             || $next_nonblank_token !~ /^[\w\:]/
4725             || !$self->class_ok_here() )
4726             {
4727 4         15 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4728             }
4729 4         10 else { $self->scan_id() }
4730             }
4731             else {
4732 22 50       373 $self->error_if_expecting_OPERATOR()
4733             if ( $expecting == OPERATOR );
4734 22         94 $self->scan_id();
4735             }
4736             }
4737              
4738             # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4739             # more restrictive. Require a new statement to be ok here.
4740             elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4741 1         3 $type = ';'; # make tokenizer look for TERM next
4742 1         3 $self->[_in_format_] = 1;
4743 1         3 $is_last = 1; ## is last token on this line
4744             }
4745              
4746             # Note on token types for format, __DATA__, __END__:
4747             # It simplifies things to give these type ';', so that when we
4748             # start rescanning we will be expecting a token of type TERM.
4749             # We will switch to type 'k' before outputting the tokens.
4750             elsif ( defined( $is_END_DATA{$tok_kw} ) ) {
4751 7         26 $type = ';'; # make tokenizer look for TERM next
4752              
4753             # Remember that we are in one of these three sections
4754 7         25 $self->[ $is_END_DATA{$tok_kw} ] = 1;
4755 7         31 $is_last = 1; ## is last token on this line
4756             }
4757              
4758             elsif ( $is_keyword{$tok_kw} ) {
4759 2644         8114 $self->do_KEYWORD();
4760             }
4761              
4762             # check for inline label following
4763             # /^(redo|last|next|goto)$/
4764             elsif (( $last_nonblank_type eq 'k' )
4765             && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4766             {
4767 19         47 $type = 'j';
4768             }
4769              
4770             # something else --
4771             else {
4772 947         3462 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4773             }
4774              
4775 5799         13062 return $is_last;
4776              
4777             } ## end sub do_BAREWORD
4778              
4779             sub do_FOLLOW_QUOTE {
4780              
4781 2768     2768 0 4495 my $self = shift;
4782              
4783             # Continue following a quote on a new line
4784 2768         4583 $type = $quote_type;
4785              
4786 2768 100       3909 if ( !@{$routput_token_list} ) { # initialize if continuation line
  2768         6721  
4787 184         376 push( @{$routput_token_list}, $i );
  184         406  
4788 184         435 $routput_token_type->[$i] = $type;
4789              
4790             }
4791              
4792             # scan for the end of the quote or pattern
4793             (
4794 2768         8584 $i,
4795             $in_quote,
4796             $quote_character,
4797             $quote_pos,
4798             $quote_depth,
4799             $quoted_string_1,
4800             $quoted_string_2,
4801              
4802             ) = $self->do_quote(
4803              
4804             $i,
4805             $in_quote,
4806             $quote_character,
4807             $quote_pos,
4808             $quote_depth,
4809             $quoted_string_1,
4810             $quoted_string_2,
4811             $rtokens,
4812             $rtoken_map,
4813             $max_token_index,
4814              
4815             );
4816              
4817             # all done if we didn't find it
4818 2768 100       6871 if ($in_quote) { return }
  183         358  
4819              
4820             # save pattern and replacement text for rescanning
4821 2585         4196 my $qs1 = $quoted_string_1;
4822              
4823             # re-initialize for next search
4824 2585         3999 $quote_character = EMPTY_STRING;
4825 2585         3743 $quote_pos = 0;
4826 2585         3947 $quote_type = 'Q';
4827 2585         3781 $quoted_string_1 = EMPTY_STRING;
4828 2585         3895 $quoted_string_2 = EMPTY_STRING;
4829 2585 100       5710 if ( ++$i > $max_token_index ) { return }
  116         345  
4830              
4831             # look for any modifiers
4832 2469 100       5397 if ($allowed_quote_modifiers) {
4833              
4834             # check for exact quote modifiers
4835 144 100       754 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4836 30         75 my $str = $rtokens->[$i];
4837 30         71 my $saw_modifier_e;
4838 30         502 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4839 47         116 my $pos = pos($str);
4840 47         124 my $char = substr( $str, $pos - 1, 1 );
4841 47   66     309 $saw_modifier_e ||= ( $char eq 'e' );
4842             }
4843              
4844             # For an 'e' quote modifier we must scan the replacement
4845             # text for here-doc targets...
4846             # but if the modifier starts a new line we can skip
4847             # this because either the here doc will be fully
4848             # contained in the replacement text (so we can
4849             # ignore it) or Perl will not find it.
4850             # See test 'here2.in'.
4851 30 50 66     155 if ( $saw_modifier_e && $i_tok >= 0 ) {
4852              
4853 0         0 my $rht = $self->scan_replacement_text($qs1);
4854              
4855             # Change type from 'Q' to 'h' for quotes with
4856             # here-doc targets so that the formatter (see sub
4857             # process_line_of_CODE) will not make any line
4858             # breaks after this point.
4859 0 0       0 if ($rht) {
4860 0         0 push @{$rhere_target_list}, @{$rht};
  0         0  
  0         0  
4861 0         0 $type = 'h';
4862 0 0       0 if ( $i_tok < 0 ) {
4863 0         0 my $ilast = $routput_token_list->[-1];
4864 0         0 $routput_token_type->[$ilast] = $type;
4865             }
4866             }
4867             }
4868              
4869 30 50       143 if ( defined( pos($str) ) ) {
4870              
4871             # matched
4872 30 50       123 if ( pos($str) == length($str) ) {
4873 30 50       143 if ( ++$i > $max_token_index ) { return }
  0         0  
4874             }
4875              
4876             # Looks like a joined quote modifier
4877             # and keyword, maybe something like
4878             # s/xxx/yyy/gefor @k=...
4879             # Example is "galgen.pl". Would have to split
4880             # the word and insert a new token in the
4881             # pre-token list. This is so rare that I haven't
4882             # done it. Will just issue a warning citation.
4883              
4884             # This error might also be triggered if my quote
4885             # modifier characters are incomplete
4886             else {
4887 0         0 $self->warning(<<EOM);
4888              
4889             Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4890             Please put a space between quote modifiers and trailing keywords.
4891             EOM
4892              
4893             # print "token $rtokens->[$i]\n";
4894             # my $num = length($str) - pos($str);
4895             # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4896             # print "continuing with new token $rtokens->[$i]\n";
4897              
4898             # skipping past this token does least damage
4899 0 0       0 if ( ++$i > $max_token_index ) { return }
  0         0  
4900             }
4901             }
4902             else {
4903              
4904             # example file: rokicki4.pl
4905             # This error might also be triggered if my quote
4906             # modifier characters are incomplete
4907 0         0 $self->write_logfile_entry(
4908             "Note: found word $str at quote modifier location\n");
4909             }
4910             }
4911              
4912             # re-initialize
4913 144         292 $allowed_quote_modifiers = EMPTY_STRING;
4914             }
4915 2469         4340 return;
4916             } ## end sub do_FOLLOW_QUOTE
4917              
4918             # ------------------------------------------------------------
4919             # begin hash of code for handling most token types
4920             # ------------------------------------------------------------
4921             my $tokenization_code = {
4922              
4923             '>' => \&do_GREATER_THAN_SIGN,
4924             '|' => \&do_VERTICAL_LINE,
4925             '$' => \&do_DOLLAR_SIGN,
4926             '(' => \&do_LEFT_PARENTHESIS,
4927             ')' => \&do_RIGHT_PARENTHESIS,
4928             ',' => \&do_COMMA,
4929             ';' => \&do_SEMICOLON,
4930             '"' => \&do_QUOTATION_MARK,
4931             "'" => \&do_APOSTROPHE,
4932             '`' => \&do_BACKTICK,
4933             '/' => \&do_SLASH,
4934             '{' => \&do_LEFT_CURLY_BRACKET,
4935             '}' => \&do_RIGHT_CURLY_BRACKET,
4936             '&' => \&do_AMPERSAND,
4937             '<' => \&do_LESS_THAN_SIGN,
4938             '?' => \&do_QUESTION_MARK,
4939             '*' => \&do_STAR,
4940             '.' => \&do_DOT,
4941             ':' => \&do_COLON,
4942             '+' => \&do_PLUS_SIGN,
4943             '@' => \&do_AT_SIGN,
4944             '%' => \&do_PERCENT_SIGN,
4945             '[' => \&do_LEFT_SQUARE_BRACKET,
4946             ']' => \&do_RIGHT_SQUARE_BRACKET,
4947             '-' => \&do_MINUS_SIGN,
4948             '^' => \&do_CARAT_SIGN,
4949             '::' => \&do_DOUBLE_COLON,
4950             '<<' => \&do_LEFT_SHIFT,
4951             '<<~' => \&do_NEW_HERE_DOC,
4952             '->' => \&do_POINTER,
4953             '++' => \&do_PLUS_PLUS,
4954             '=>' => \&do_FAT_COMMA,
4955             '--' => \&do_MINUS_MINUS,
4956             '&&' => \&do_LOGICAL_AND,
4957             '||' => \&do_LOGICAL_OR,
4958             '//' => \&do_SLASH_SLASH,
4959              
4960             # No special code for these types yet, but syntax checks
4961             # could be added.
4962             ## '!' => undef,
4963             ## '!=' => undef,
4964             ## '!~' => undef,
4965             ## '%=' => undef,
4966             ## '&&=' => undef,
4967             ## '&=' => undef,
4968             ## '+=' => undef,
4969             ## '-=' => undef,
4970             ## '..' => undef,
4971             ## '..' => undef,
4972             ## '...' => undef,
4973             ## '.=' => undef,
4974             ## '<<=' => undef,
4975             ## '<=' => undef,
4976             ## '<=>' => undef,
4977             ## '<>' => undef,
4978             ## '=' => undef,
4979             ## '==' => undef,
4980             ## '=~' => undef,
4981             ## '>=' => undef,
4982             ## '>>' => undef,
4983             ## '>>=' => undef,
4984             ## '\\' => undef,
4985             ## '^=' => undef,
4986             ## '|=' => undef,
4987             ## '||=' => undef,
4988             ## '//=' => undef,
4989             ## '~' => undef,
4990             ## '~~' => undef,
4991             ## '!~~' => undef,
4992              
4993             };
4994              
4995             # ------------------------------------------------------------
4996             # end hash of code for handling individual token types
4997             # ------------------------------------------------------------
4998              
4999 39     39   473 use constant DEBUG_TOKENIZE => 0;
  39         167  
  39         128162  
5000              
5001             sub tokenize_this_line {
5002              
5003             # This routine breaks a line of perl code into tokens which are of use in
5004             # indentation and reformatting. One of my goals has been to define tokens
5005             # such that a newline may be inserted between any pair of tokens without
5006             # changing or invalidating the program. This version comes close to this,
5007             # although there are necessarily a few exceptions which must be caught by
5008             # the formatter. Many of these involve the treatment of bare words.
5009             #
5010             # The tokens and their types are returned in arrays. See previous
5011             # routine for their names.
5012             #
5013             # See also the array "valid_token_types" in the BEGIN section for an
5014             # up-to-date list.
5015             #
5016             # To simplify things, token types are either a single character, or they
5017             # are identical to the tokens themselves.
5018             #
5019             # As a debugging aid, the -D flag creates a file containing a side-by-side
5020             # comparison of the input string and its tokenization for each line of a file.
5021             # This is an invaluable debugging aid.
5022             #
5023             # In addition to tokens, and some associated quantities, the tokenizer
5024             # also returns flags indication any special line types. These include
5025             # quotes, here_docs, formats.
5026             #
5027             # -----------------------------------------------------------------------
5028             #
5029             # How to add NEW_TOKENS:
5030             #
5031             # New token types will undoubtedly be needed in the future both to keep up
5032             # with changes in perl and to help adapt the tokenizer to other applications.
5033             #
5034             # Here are some notes on the minimal steps. I wrote these notes while
5035             # adding the 'v' token type for v-strings, which are things like version
5036             # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
5037             # can use your editor to search for the string "NEW_TOKENS" to find the
5038             # appropriate sections to change):
5039             #
5040             # *. Try to talk somebody else into doing it! If not, ..
5041             #
5042             # *. Make a backup of your current version in case things don't work out!
5043             #
5044             # *. Think of a new, unused character for the token type, and add to
5045             # the array @valid_token_types in the BEGIN section of this package.
5046             # For example, I used 'v' for v-strings.
5047             #
5048             # *. Implement coding to recognize the $type of the token in this routine.
5049             # This is the hardest part, and is best done by imitating or modifying
5050             # some of the existing coding. For example, to recognize v-strings, I
5051             # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
5052             # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
5053             #
5054             # *. Update sub operator_expected. This update is critically important but
5055             # the coding is trivial. Look at the comments in that routine for help.
5056             # For v-strings, which should behave like numbers, I just added 'v' to the
5057             # regex used to handle numbers and strings (types 'n' and 'Q').
5058             #
5059             # *. Implement a 'bond strength' rule in sub set_bond_strengths in
5060             # Perl::Tidy::Formatter for breaking lines around this token type. You can
5061             # skip this step and take the default at first, then adjust later to get
5062             # desired results. For adding type 'v', I looked at sub bond_strength and
5063             # saw that number type 'n' was using default strengths, so I didn't do
5064             # anything. I may tune it up someday if I don't like the way line
5065             # breaks with v-strings look.
5066             #
5067             # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
5068             # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
5069             # and saw that type 'n' used spaces on both sides, so I just added 'v'
5070             # to the array @spaces_both_sides.
5071             #
5072             # *. Update HtmlWriter package so that users can colorize the token as
5073             # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
5074             # that package. For v-strings, I initially chose to use a default color
5075             # equal to the default for numbers, but it might be nice to change that
5076             # eventually.
5077             #
5078             # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
5079             #
5080             # *. Run lots and lots of debug tests. Start with special files designed
5081             # to test the new token type. Run with the -D flag to create a .DEBUG
5082             # file which shows the tokenization. When these work ok, test as many old
5083             # scripts as possible. Start with all of the '.t' files in the 'test'
5084             # directory of the distribution file. Compare .tdy output with previous
5085             # version and updated version to see the differences. Then include as
5086             # many more files as possible. My own technique has been to collect a huge
5087             # number of perl scripts (thousands!) into one directory and run perltidy
5088             # *, then run diff between the output of the previous version and the
5089             # current version.
5090             #
5091             # *. For another example, search for the smartmatch operator '~~'
5092             # with your editor to see where updates were made for it.
5093             #
5094             # -----------------------------------------------------------------------
5095              
5096 7518     7518 0 15216 my ( $self, $line_of_tokens ) = @_;
5097 7518         14853 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
5098              
5099             # Extract line number for use in error messages
5100 7518         12242 $input_line_number = $line_of_tokens->{_line_number};
5101              
5102             # Check for pod documentation
5103 7518 100 66     20227 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
5104             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
5105             {
5106              
5107             # Must not be in multi-line quote
5108             # and must not be in an equation
5109 14         37 my $blank_after_Z = 1;
5110 14 50 33     121 if (
5111             !$in_quote
5112             && ( $self->operator_expected( '=', 'b', $blank_after_Z ) ==
5113             TERM )
5114             )
5115             {
5116 14         38 $self->[_in_pod_] = 1;
5117 14         31 return;
5118             }
5119             }
5120              
5121 7504         14287 $input_line = $untrimmed_input_line;
5122              
5123 7504         14525 chomp $input_line;
5124              
5125             # Reinitialize the multi-line quote flag
5126 7504 100 100     18811 if ( $in_quote && $quote_type eq 'Q' ) {
5127 47         140 $line_of_tokens->{_starting_in_quote} = 1;
5128             }
5129             else {
5130 7457         14089 $line_of_tokens->{_starting_in_quote} = 0;
5131              
5132             # Trim start of this line unless we are continuing a quoted line.
5133             # Do not trim end because we might end in a quote (test: deken4.pl)
5134             # Perl::Tidy::Formatter will delete needless trailing blanks
5135 7457 100       34428 if ( !length($input_line) ) {
    100          
5136              
5137             # line is empty
5138             }
5139             elsif ( $input_line =~ m/\S/g ) {
5140              
5141             # There are $spaces blank characters before a nonblank character
5142 6651         13684 my $spaces = pos($input_line) - 1;
5143 6651 100       15567 if ( $spaces > 0 ) {
5144              
5145             # Trim the leading spaces
5146 3547         9101 $input_line = substr( $input_line, $spaces );
5147              
5148             # Find actual space count if there are leading tabs
5149 3547 100 66     12836 if (
5150             ord( substr( $untrimmed_input_line, 0, 1 ) ) == ORD_TAB
5151             && $untrimmed_input_line =~ /^(\t+)/ )
5152             {
5153 213         704 $spaces += length($1) * ( $tabsize - 1 );
5154             }
5155              
5156             # Calculate a guessed level for nonblank lines to avoid
5157             $line_of_tokens->{_guessed_indentation_level} =
5158 3547         10529 int( $spaces / $rOpts_indent_columns );
5159             }
5160             }
5161             else {
5162              
5163             # line has all blank characters
5164 9         42 $input_line = EMPTY_STRING;
5165             }
5166             }
5167              
5168 7504 100       16164 if ( !$in_quote ) {
5169              
5170             # Optimize handling of a blank line
5171 7320 100       15891 if ( !length($input_line) ) {
5172 806         2276 $line_of_tokens->{_line_type} = 'CODE';
5173 806         1887 $line_of_tokens->{_rtokens} = [];
5174 806         1823 $line_of_tokens->{_rtoken_type} = [];
5175 806         1825 $line_of_tokens->{_rlevels} = [];
5176 806         2036 $line_of_tokens->{_rci_levels} = [];
5177 806         1673 $line_of_tokens->{_rblock_type} = [];
5178 806         1956 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5179 806         2351 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5180 806         1699 return;
5181             }
5182              
5183             # Check comments
5184 6514 100       15987 if ( substr( $input_line, 0, 1 ) eq '#' ) {
5185              
5186             # and check for skipped section
5187 788 50 66     4686 if (
      66        
      66        
5188             (
5189             substr( $input_line, 0, 4 ) eq '#<<V'
5190             || $rOpts_code_skipping_begin
5191             )
5192             && $rOpts_code_skipping
5193             && $input_line =~ /$code_skipping_pattern_begin/
5194             )
5195             {
5196 2         10 $self->[_in_skipped_] = $self->[_last_line_number_];
5197 2         11 return;
5198             }
5199              
5200             # Optional fast processing of a block comment
5201 786         1858 $line_of_tokens->{_line_type} = 'CODE';
5202 786         2278 $line_of_tokens->{_rtokens} = [$input_line];
5203 786         2643 $line_of_tokens->{_rtoken_type} = ['#'];
5204 786         2073 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
5205 786         1898 $line_of_tokens->{_rci_levels} = [0];
5206 786         2416 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
5207 786         2858 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5208 786         2769 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5209 786         1711 return;
5210             }
5211             }
5212              
5213             # update the copy of the line for use in error messages
5214             # This must be exactly what we give the pre_tokenizer
5215 5910         11908 $self->[_line_of_text_] = $input_line;
5216              
5217             # re-initialize for the main loop
5218 5910         14679 $routput_token_list = []; # stack of output token indexes
5219 5910         18208 $routput_token_type = []; # token types
5220 5910         16039 $routput_block_type = []; # types of code block
5221 5910         15527 $routput_container_type = []; # paren types, such as if, elsif, ..
5222 5910         14322 $routput_type_sequence = []; # nesting sequential number
5223              
5224 5910         9621 $rhere_target_list = [];
5225              
5226 5910         9778 $tok = $last_nonblank_token;
5227 5910         9287 $type = $last_nonblank_type;
5228 5910         9264 $prototype = $last_nonblank_prototype;
5229 5910         8867 $last_nonblank_i = -1;
5230 5910         9371 $block_type = $last_nonblank_block_type;
5231 5910         9190 $container_type = $last_nonblank_container_type;
5232 5910         9125 $type_sequence = $last_nonblank_type_sequence;
5233 5910         9120 $indent_flag = 0;
5234 5910         8220 $peeked_ahead = 0;
5235              
5236 5910         17826 $self->tokenizer_main_loop();
5237              
5238             #-----------------------------------------------
5239             # all done tokenizing this line ...
5240             # now prepare the final list of tokens and types
5241             #-----------------------------------------------
5242 5910         18117 $self->tokenizer_wrapup_line($line_of_tokens);
5243              
5244 5910         10345 return;
5245             } ## end sub tokenize_this_line
5246              
5247             sub tokenizer_main_loop {
5248              
5249 5910     5910 0 11302 my ($self) = @_;
5250              
5251             #---------------------------------
5252             # Break one input line into tokens
5253             #---------------------------------
5254              
5255             # start by breaking the line into pre-tokens
5256 5910         14415 ( $rtokens, $rtoken_map, $rtoken_type ) = pre_tokenize($input_line);
5257              
5258 5910         23584 $max_token_index = scalar( @{$rtokens} ) - 1;
  5910         10835  
5259 5910         8889 push( @{$rtokens}, SPACE, SPACE, SPACE )
  5910         14956  
5260             ; # extra whitespace simplifies logic
5261 5910         9020 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
  5910         12567  
5262 5910         8617 push( @{$rtoken_type}, 'b', 'b', 'b' );
  5910         12279  
5263              
5264             # initialize for main loop
5265 5910         8527 if (0) { #<<< this is not necessary
5266             foreach my $ii ( 0 .. $max_token_index + 3 ) {
5267             $routput_token_type->[$ii] = EMPTY_STRING;
5268             $routput_block_type->[$ii] = EMPTY_STRING;
5269             $routput_container_type->[$ii] = EMPTY_STRING;
5270             $routput_type_sequence->[$ii] = EMPTY_STRING;
5271             $routput_indent_flag->[$ii] = 0;
5272             }
5273             }
5274              
5275 5910         8753 $i = -1;
5276 5910         8882 $i_tok = -1;
5277              
5278             #-----------------------
5279             # main tokenization loop
5280             #-----------------------
5281              
5282             # we are looking at each pre-token of one line and combining them
5283             # into tokens
5284 5910         13744 while ( ++$i <= $max_token_index ) {
5285              
5286             # continue looking for the end of a quote
5287 50826 100       87559 if ($in_quote) {
5288 2768         9010 $self->do_FOLLOW_QUOTE();
5289 2768 100 100     10477 last if ( $in_quote || $i > $max_token_index );
5290             }
5291              
5292 50527 100 100     138837 if ( $type ne 'b' && $type ne 'CORE::' ) {
5293              
5294             # try to catch some common errors
5295 35304 100 100     75603 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
5296              
5297 1592 100       5085 if ( $last_nonblank_token eq 'eq' ) {
    50          
5298 9         97 $self->complain("Should 'eq' be '==' here ?\n");
5299             }
5300             elsif ( $last_nonblank_token eq 'ne' ) {
5301 0         0 $self->complain("Should 'ne' be '!=' here ?\n");
5302             }
5303             else {
5304             # that's all
5305             }
5306             }
5307              
5308             # fix c090, only rotate vars if a new token will be stored
5309 35304 100       64839 if ( $i_tok >= 0 ) {
5310              
5311 29532         44087 $last_last_nonblank_token = $last_nonblank_token;
5312 29532         39662 $last_last_nonblank_type = $last_nonblank_type;
5313              
5314 29532         41643 $last_nonblank_prototype = $prototype;
5315 29532         40541 $last_nonblank_block_type = $block_type;
5316 29532         39995 $last_nonblank_container_type = $container_type;
5317 29532         40983 $last_nonblank_type_sequence = $type_sequence;
5318 29532         38478 $last_nonblank_i = $i_tok;
5319              
5320             # Fix part #3 for git82: propagate type 'Z' though L-R pair
5321 29532 100 100     61493 if ( !( $type eq 'R' && $last_nonblank_type eq 'Z' ) ) {
5322 29531         41843 $last_nonblank_token = $tok;
5323 29531         41875 $last_nonblank_type = $type;
5324             }
5325             }
5326              
5327             # Patch for c030: Fix things in case a '->' got separated from
5328             # the subsequent identifier by a side comment. We need the
5329             # last_nonblank_token to have a leading -> to avoid triggering
5330             # an operator expected error message at the next '('. See also
5331             # fix for git #63.
5332 35304 100       65372 if ( $last_last_nonblank_token eq '->' ) {
5333 886 100 66     5498 if ( $last_nonblank_type eq 'w'
5334             || $last_nonblank_type eq 'i' )
5335             {
5336 675         1748 $last_nonblank_token = '->' . $last_nonblank_token;
5337 675         1397 $last_nonblank_type = 'i';
5338             }
5339             }
5340             }
5341              
5342             # store previous token type
5343 50527 100       87064 if ( $i_tok >= 0 ) {
5344 44755         86851 $routput_token_type->[$i_tok] = $type;
5345 44755         73564 $routput_block_type->[$i_tok] = $block_type;
5346 44755         74748 $routput_container_type->[$i_tok] = $container_type;
5347 44755         70372 $routput_type_sequence->[$i_tok] = $type_sequence;
5348 44755         68197 $routput_indent_flag->[$i_tok] = $indent_flag;
5349             }
5350              
5351             # get the next pre-token and type
5352             # $tok and $type will be modified to make the output token
5353 50527         80462 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
5354 50527         76595 my $pre_type = $type = $rtoken_type->[$i]; # and type
5355              
5356             # remember the starting index of this token; we will be updating $i
5357 50527         66731 $i_tok = $i;
5358              
5359             # re-initialize various flags for the next output token
5360             (
5361              
5362 50527         89619 $block_type,
5363             $container_type,
5364             $type_sequence,
5365             $indent_flag,
5366             $prototype,
5367             )
5368             = (
5369              
5370             EMPTY_STRING,
5371             EMPTY_STRING,
5372             EMPTY_STRING,
5373             0,
5374             EMPTY_STRING,
5375             );
5376              
5377             # this pre-token will start an output token
5378 50527         64408 push( @{$routput_token_list}, $i_tok );
  50527         84339  
5379              
5380             # The search for the full token ends in one of 5 main END NODES:
5381              
5382             #-----------------------
5383             # END NODE 1: whitespace
5384             #-----------------------
5385 50527 100       107292 next if ( $pre_type eq 'b' );
5386              
5387             #----------------------
5388             # END NODE 2: a comment
5389             #----------------------
5390 35162 100       62299 last if ( $pre_type eq '#' );
5391              
5392             # continue gathering identifier if necessary
5393 34834 100       60567 if ($id_scan_state) {
5394              
5395 17 100 66     134 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
5396 10         39 $self->scan_id();
5397             }
5398             else {
5399 7         33 $self->scan_identifier();
5400             }
5401              
5402 17 100       60 if ($id_scan_state) {
5403              
5404             # Still scanning ...
5405             # Check for side comment between sub and prototype (c061)
5406              
5407             # done if nothing left to scan on this line
5408 1 50       1521 last if ( $i > $max_token_index );
5409              
5410 1         13 my ( $next_nonblank_token, $i_next ) =
5411             find_next_nonblank_token_on_this_line( $i, $rtokens,
5412             $max_token_index );
5413              
5414             # done if it was just some trailing space
5415 1 50       11 last if ( $i_next > $max_token_index );
5416              
5417             # something remains on the line ... must be a side comment
5418 1         4 next;
5419             }
5420              
5421 16 100 100     113 next if ( ( $i > 0 ) || $type );
5422              
5423             # didn't find any token; start over
5424 7         15 $type = $pre_type;
5425 7         16 $tok = $pre_tok;
5426             }
5427              
5428             #-----------------------------------------------------------
5429             # Combine pre-tokens into digraphs and trigraphs if possible
5430             #-----------------------------------------------------------
5431              
5432             # See if we can make a digraph...
5433             # The following tokens are excluded and handled specially:
5434             # '/=' is excluded because the / might start a pattern.
5435             # 'x=' is excluded since it might be $x=, with $ on previous line
5436             # '**' and *= might be typeglobs of punctuation variables
5437             # I have allowed tokens starting with <, such as <=,
5438             # because I don't think these could be valid angle operators.
5439             # test file: storrs4.pl
5440 34824 100 100     107308 if ( $can_start_digraph{$tok}
      100        
5441             && $i < $max_token_index
5442             && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
5443             {
5444              
5445 2561         4906 my $combine_ok = 1;
5446 2561         5543 my $test_tok = $tok . $rtokens->[ $i + 1 ];
5447              
5448             # check for special cases which cannot be combined
5449              
5450             # '//' must be defined_or operator if an operator is expected.
5451             # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
5452             # could be migrated here for clarity
5453              
5454             # Patch for RT#102371, misparsing a // in the following snippet:
5455             # state $b //= ccc();
5456             # The solution is to always accept the digraph (or trigraph)
5457             # after type 'Z' (possible file handle). The reason is that
5458             # sub operator_expected gives TERM expected here, which is
5459             # wrong in this case.
5460 2561 100 66     7172 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
5461              
5462             # note that here $tok = '/' and the next tok and type is '/'
5463 16         42 my $blank_after_Z;
5464 16         69 $expecting =
5465             $self->operator_expected( $tok, '/', $blank_after_Z );
5466              
5467             # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
5468 16 100       51 $combine_ok = 0 if ( $expecting == TERM );
5469             }
5470              
5471             # Patch for RT #114359: mis-parsing of "print $x ** 0.5;
5472             # Accept the digraphs '**' only after type 'Z'
5473             # Otherwise postpone the decision.
5474 2561 100       5910 if ( $test_tok eq '**' ) {
5475 39 100       180 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
  37         79  
5476             }
5477              
5478 2561 50 66     16559 if (
      66        
      33        
5479              
5480             # still ok to combine?
5481             $combine_ok
5482              
5483             && ( $test_tok ne '/=' ) # might be pattern
5484             && ( $test_tok ne 'x=' ) # might be $x
5485             && ( $test_tok ne '*=' ) # typeglob?
5486              
5487             # Moved above as part of fix for
5488             # RT #114359: Missparsing of "print $x ** 0.5;
5489             # && ( $test_tok ne '**' ) # typeglob?
5490             )
5491             {
5492 2520         4084 $tok = $test_tok;
5493 2520         3764 $i++;
5494              
5495             # Now try to assemble trigraphs. Note that all possible
5496             # perl trigraphs can be constructed by appending a character
5497             # to a digraph.
5498 2520         4690 $test_tok = $tok . $rtokens->[ $i + 1 ];
5499              
5500 2520 100       6400 if ( $is_trigraph{$test_tok} ) {
5501 77         182 $tok = $test_tok;
5502 77         169 $i++;
5503             }
5504              
5505             # The only current tetragraph is the double diamond operator
5506             # and its first three characters are NOT a trigraph, so
5507             # we do can do a special test for it
5508             else {
5509 2443 100       5991 if ( $test_tok eq '<<>' ) {
5510 1         4 $test_tok .= $rtokens->[ $i + 2 ];
5511 1 50       5 if ( $is_tetragraph{$test_tok} ) {
5512 1         2 $tok = $test_tok;
5513 1         3 $i += 2;
5514             }
5515             }
5516             }
5517             }
5518             }
5519              
5520 34824         50236 $type = $tok;
5521 34824         57618 $next_tok = $rtokens->[ $i + 1 ];
5522 34824         52907 $next_type = $rtoken_type->[ $i + 1 ];
5523              
5524             # expecting an operator here? first try table lookup, then function
5525 34824         65018 $expecting = $op_expected_table{$last_nonblank_type};
5526 34824 100       66481 if ( !defined($expecting) ) {
5527 8397   100     20226 my $blank_after_Z = $last_nonblank_type eq 'Z'
5528             && ( $i == 0 || $rtoken_type->[ $i - 1 ] eq 'b' );
5529 8397         21868 $expecting =
5530             $self->operator_expected( $tok, $next_type, $blank_after_Z );
5531             }
5532              
5533 34824         44690 DEBUG_TOKENIZE && do {
5534             local $LIST_SEPARATOR = ')(';
5535             my @debug_list = (
5536             $last_nonblank_token, $tok,
5537             $next_tok, $brace_depth,
5538             $rbrace_type->[$brace_depth], $paren_depth,
5539             $rparen_type->[$paren_depth],
5540             );
5541             print {*STDOUT} "TOKENIZE:(@debug_list)\n";
5542             };
5543              
5544             # We have the next token, $tok.
5545             # Now we have to examine this token and decide what it is
5546             # and define its $type
5547              
5548             #------------------------
5549             # END NODE 3: a bare word
5550             #------------------------
5551 34824 100       64751 if ( $pre_type eq 'w' ) {
5552 5838         16367 my $is_last = $self->do_BAREWORD();
5553 5838 100       12259 last if ($is_last);
5554 5830         15005 next;
5555             }
5556              
5557             # Turn off attribute list on first non-blank, non-bareword.
5558             # Added '#' to fix c038 (later moved above).
5559 28986   100     55449 $self->[_in_attribute_list_] &&= 0;
5560              
5561             #-------------------------------
5562             # END NODE 4: a string of digits
5563             #-------------------------------
5564 28986 100       52760 if ( $pre_type eq 'd' ) {
5565 1934         7226 $self->do_DIGITS();
5566 1934         4448 next;
5567             }
5568              
5569             #------------------------------------------
5570             # END NODE 5: everything else (punctuation)
5571             #------------------------------------------
5572 27052         54249 my $code = $tokenization_code->{$tok};
5573 27052 100       51029 if ($code) {
5574 25317         78446 $code->($self);
5575 25317 100       65930 redo if $in_quote;
5576             }
5577             } ## End main tokenizer loop
5578              
5579             # Store the final token
5580 5910 100       13386 if ( $i_tok >= 0 ) {
5581 5772         12489 $routput_token_type->[$i_tok] = $type;
5582 5772         10635 $routput_block_type->[$i_tok] = $block_type;
5583 5772         10237 $routput_container_type->[$i_tok] = $container_type;
5584 5772         10134 $routput_type_sequence->[$i_tok] = $type_sequence;
5585 5772         10129 $routput_indent_flag->[$i_tok] = $indent_flag;
5586             }
5587              
5588             # Remember last nonblank values
5589 5910 100 100     21752 if ( $type ne 'b' && $type ne '#' ) {
5590              
5591 5437         8758 $last_last_nonblank_token = $last_nonblank_token;
5592 5437         7944 $last_last_nonblank_type = $last_nonblank_type;
5593              
5594 5437         8489 $last_nonblank_prototype = $prototype;
5595 5437         7821 $last_nonblank_block_type = $block_type;
5596 5437         7624 $last_nonblank_container_type = $container_type;
5597 5437         7611 $last_nonblank_type_sequence = $type_sequence;
5598              
5599 5437         7874 $last_nonblank_token = $tok;
5600 5437         7947 $last_nonblank_type = $type;
5601             }
5602              
5603             # reset indentation level if necessary at a sub or package
5604             # in an attempt to recover from a nesting error
5605 5910 50       12251 if ( $level_in_tokenizer < 0 ) {
5606 0 0       0 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
5607 0         0 reset_indentation_level(0);
5608 0         0 $self->brace_warning("resetting level to 0 at $1 $2\n");
5609             }
5610             }
5611              
5612 5910         9705 $self->[_in_quote_] = $in_quote;
5613 5910 100       12980 $self->[_quote_target_] =
5614             $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
5615 5910         10817 $self->[_rhere_target_list_] = $rhere_target_list;
5616              
5617 5910         9940 return;
5618             } ## end sub tokenizer_main_loop
5619              
5620             sub tokenizer_wrapup_line {
5621 5910     5910 0 12603 my ( $self, $line_of_tokens ) = @_;
5622              
5623             #---------------------------------------------------------
5624             # Package a line of tokens for shipping back to the caller
5625             #---------------------------------------------------------
5626              
5627             # Arrays to hold token values for this line:
5628 5910         10181 my ( @levels, @block_type, @type_sequence, @token_type, @tokens );
5629              
5630 5910         14648 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5631              
5632             # Remember starting nesting block string
5633 5910         9803 my $nesting_block_string_0 = $nesting_block_string;
5634              
5635             #-----------------
5636             # Loop over tokens
5637             #-----------------
5638             # $i is the index of the pretoken which starts this full token
5639 5910         11508 foreach my $i ( @{$routput_token_list} ) {
  5910         12642  
5640              
5641 50711         73613 my $type_i = $routput_token_type->[$i];
5642              
5643             #----------------------------------------
5644             # Section 1. Handle a non-sequenced token
5645             #----------------------------------------
5646 50711 100       79551 if ( !$routput_type_sequence->[$i] ) {
5647              
5648             #-------------------------------
5649             # Section 1.1. types ';' and 't'
5650             #-------------------------------
5651             # - output anonymous 'sub' as keyword (type 'k')
5652             # - output __END__, __DATA__, and format as type 'k' instead
5653             # of ';' to make html colors correct, etc.
5654 41555 100       91878 if ( $is_semicolon_or_t{$type_i} ) {
    50          
5655 2676         6227 my $tok_i = $rtokens->[$i];
5656 2676 100       7839 if ( $is_END_DATA_format_sub{$tok_i} ) {
5657 172         509 $type_i = 'k';
5658             }
5659             }
5660              
5661             #----------------------------------------------
5662             # Section 1.2. Check for an invalid token type.
5663             #----------------------------------------------
5664             # This can happen by running perltidy on non-scripts although
5665             # it could also be bug introduced by programming change. Perl
5666             # silently accepts a 032 (^Z) and takes it as the end
5667             elsif ( !$is_valid_token_type{$type_i} ) {
5668 0         0 my $val = ord($type_i);
5669 0         0 $self->warning(
5670             "unexpected character decimal $val ($type_i) in script\n"
5671             );
5672 0         0 $self->[_in_error_] = 1;
5673             }
5674             else {
5675             ## ok - valid token type other than ; and t
5676             }
5677              
5678             #----------------------------------------------------
5679             # Section 1.3. Store values for a non-sequenced token
5680             #----------------------------------------------------
5681 41555         67407 push( @levels, $level_in_tokenizer );
5682 41555         62504 push( @block_type, EMPTY_STRING );
5683 41555         60042 push( @type_sequence, EMPTY_STRING );
5684 41555         77747 push( @token_type, $type_i );
5685              
5686             }
5687              
5688             #------------------------------------
5689             # Section 2. Handle a sequenced token
5690             # One of { [ ( ? : ) ] }
5691             #------------------------------------
5692             else {
5693              
5694             # $level_i is the level we will store. Levels of braces are
5695             # set so that the leading braces have a HIGHER level than their
5696             # CONTENTS, which is convenient for indentation.
5697 9156         13762 my $level_i = $level_in_tokenizer;
5698              
5699             # $tok_i is the PRE-token. It only equals the token for symbols
5700 9156         14859 my $tok_i = $rtokens->[$i];
5701              
5702             # $routput_indent_flag->[$i] indicates that we need a change
5703             # in level at a nested ternary, as follows
5704             # 1 => at a nested ternary ?
5705             # -1 => at a nested ternary :
5706             # 0 => otherwise
5707              
5708             #--------------------------------------------
5709             # Section 2.1 Handle a level-increasing token
5710             #--------------------------------------------
5711 9156 100       25078 if ( $is_opening_or_ternary_type{$type_i} ) {
    50          
5712              
5713 4578 100       9045 if ( $type_i eq '?' ) {
5714              
5715 187 100       997 if ( $routput_indent_flag->[$i] > 0 ) {
5716 8         39 $level_in_tokenizer++;
5717              
5718             # break BEFORE '?' in a nested ternary
5719 8         19 $level_i = $level_in_tokenizer;
5720 8         22 $nesting_block_string .= "$nesting_block_flag";
5721              
5722             }
5723             }
5724             else {
5725              
5726 4391         7246 $nesting_token_string .= $tok_i;
5727              
5728 4391 100 100     12454 if ( $type_i eq '{' || $type_i eq 'L' ) {
5729              
5730 4084         6356 $level_in_tokenizer++;
5731              
5732 4084 100       7932 if ( $routput_block_type->[$i] ) {
5733 974         1993 $nesting_block_flag = 1;
5734 974         1985 $nesting_block_string .= '1';
5735             }
5736             else {
5737 3110         5036 $nesting_block_flag = 0;
5738 3110         5440 $nesting_block_string .= '0';
5739             }
5740             }
5741             }
5742             }
5743              
5744             #---------------------------------------------
5745             # Section 2.2. Handle a level-decreasing token
5746             #---------------------------------------------
5747             elsif ( $is_closing_or_ternary_type{$type_i} ) {
5748              
5749 4578 100       11554 if ( $type_i ne ':' ) {
5750 4391         8104 my $char = chop $nesting_token_string;
5751 4391 50       11553 if ( $char ne $matching_start_token{$tok_i} ) {
5752 0         0 $nesting_token_string .= $char . $tok_i;
5753             }
5754             }
5755              
5756 4578 100 100     15163 if (
      100        
      100        
5757             $type_i eq '}'
5758             || $type_i eq 'R'
5759              
5760             # only the second and higher ? : have levels
5761             || $type_i eq ':' && $routput_indent_flag->[$i] < 0
5762             )
5763             {
5764              
5765 4092         6355 $level_i = --$level_in_tokenizer;
5766              
5767 4092 50       8211 if ( $level_in_tokenizer < 0 ) {
5768 0 0       0 if ( !$self->[_saw_negative_indentation_] ) {
5769 0         0 $self->[_saw_negative_indentation_] = 1;
5770 0         0 $self->warning(
5771             "Starting negative indentation\n");
5772             }
5773             }
5774              
5775             # restore previous level values
5776 4092 50       8478 if ( length($nesting_block_string) > 1 )
5777             { # true for valid script
5778 4092         6387 chop $nesting_block_string;
5779 4092         7985 $nesting_block_flag =
5780             substr( $nesting_block_string, -1 ) eq '1';
5781             }
5782              
5783             }
5784             }
5785              
5786             #-----------------------------------------------------
5787             # Section 2.3. Unexpected sequenced token type - error
5788             #-----------------------------------------------------
5789             else {
5790              
5791             # The tokenizer should only be assigning sequence numbers
5792             # to types { [ ( ? ) ] } :
5793 0         0 DEVEL_MODE && $self->Fault(<<EOM);
5794             unexpected sequence number on token type $type_i with pre-tok=$tok_i
5795             EOM
5796             }
5797              
5798             #------------------------------------------------
5799             # Section 2.4. Store values for a sequenced token
5800             #------------------------------------------------
5801              
5802             # The starting nesting block string, which is used in any .LOG
5803             # output, should include the first token of the line
5804 9156 100       18992 if ( !@levels ) {
5805 1578         3060 $nesting_block_string_0 = $nesting_block_string;
5806             }
5807              
5808             # Store values for a sequenced token
5809 9156         17555 push( @levels, $level_i );
5810 9156         17491 push( @block_type, $routput_block_type->[$i] );
5811 9156         15516 push( @type_sequence, $routput_type_sequence->[$i] );
5812 9156         19588 push( @token_type, $type_i );
5813              
5814             }
5815             } ## End loop to over tokens
5816              
5817             #---------------------
5818             # Post-loop operations
5819             #---------------------
5820              
5821 5910         14403 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string_0;
5822              
5823             # Form and store the tokens
5824 5910 50       13685 if (@levels) {
5825              
5826 5910         8662 my $im = shift @{$routput_token_list};
  5910         10681  
5827 5910         10636 my $offset = $rtoken_map->[$im];
5828 5910         8412 foreach my $i ( @{$routput_token_list} ) {
  5910         10585  
5829 44801         60206 my $numc = $rtoken_map->[$i] - $offset;
5830 44801         80510 push( @tokens, substr( $input_line, $offset, $numc ) );
5831 44801         55891 $offset += $numc;
5832              
5833 44801         61762 if ( DEVEL_MODE && $numc <= 0 ) {
5834              
5835             # Should not happen unless @{$rtoken_map} is corrupted
5836             $self->Fault(
5837             "number of characters is '$numc' but should be >0\n");
5838             }
5839             }
5840              
5841             # Form and store the final token of this line
5842 5910         11868 my $numc = length($input_line) - $offset;
5843 5910         12757 push( @tokens, substr( $input_line, $offset, $numc ) );
5844              
5845 5910         9109 if ( DEVEL_MODE && $numc <= 0 ) {
5846             $self->Fault(
5847             "Number of Characters is '$numc' but should be >0\n");
5848             }
5849             }
5850              
5851             # NOTE: This routine returns ci=0. Eventually '_rci_levels' can be
5852             # removed. The ci values are computed later by sub Formatter::set_ci.
5853 5910         19910 my @ci_levels = (0) x scalar(@levels);
5854              
5855             # Wrap up this line of tokens for shipping to the Formatter
5856 5910         14143 $line_of_tokens->{_rtoken_type} = \@token_type;
5857 5910         11059 $line_of_tokens->{_rtokens} = \@tokens;
5858 5910         11689 $line_of_tokens->{_rblock_type} = \@block_type;
5859 5910         11826 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5860 5910         19369 $line_of_tokens->{_rlevels} = \@levels;
5861 5910         12266 $line_of_tokens->{_rci_levels} = \@ci_levels;
5862              
5863 5910         15905 return;
5864             } ## end sub tokenizer_wrapup_line
5865              
5866             } ## end tokenize_this_line
5867              
5868             #######################################################################
5869             # Tokenizer routines which assist in identifying token types
5870             #######################################################################
5871              
5872             # Define Global '%op_expected_table'
5873             # = hash table of operator expected values based on last nonblank token
5874              
5875             # exceptions to perl's weird parsing rules after type 'Z'
5876             my %is_weird_parsing_rule_exception;
5877              
5878             my %is_paren_dollar;
5879              
5880             my %is_n_v;
5881              
5882             BEGIN {
5883              
5884             # Always expecting TERM following these types:
5885             # note: this is identical to '@value_requestor_type' defined later.
5886             # Fix for c250: add new type 'P' for package (expecting VERSION or {}
5887             # after package NAMESPACE, so expecting TERM)
5888             # Fix for c250: add new type 'S' for sub (not expecting operator)
5889 39     39   655 my @q = qw(
5890             ; ! + x & ? F J - p / Y : % f U ~ A G j L P S * . | ^ < = [ m { \ > t
5891             || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5892             &= // >> ~. &. |. ^.
5893             ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5894             );
5895 39         4141 push @q, ',';
5896 39         134 push @q, '('; # for completeness, not currently a token type
5897 39         69 push @q, '->'; # was previously in UNKNOWN
5898 39         1294 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5899              
5900             # Always UNKNOWN following these types;
5901             # previously had '->' in this list for c030
5902 39         207 @q = qw( w );
5903 39         108 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5904              
5905             # Always expecting OPERATOR ...
5906             # 'n' and 'v' are currently excluded because they might be VERSION numbers
5907             # 'i' is currently excluded because it might be a package
5908             # 'q' is currently excluded because it might be a prototype
5909             # Fix for c030: removed '->' from this list:
5910             # Fix for c250: added 'i' because new type 'P' was added
5911 39         116 @q = qw( -- C h R ++ ] Q <> i ); ## n v q );
5912 39         77 push @q, ')';
5913 39         263 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5914              
5915             # Fix for git #62: added '*' and '%'
5916 39         110 @q = qw( < ? * % );
5917 39         123 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5918              
5919 39         88 @q = qw<) $>;
5920 39         99 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5921              
5922 39         85 @q = qw( n v );
5923 39         1423 @{is_n_v}{@q} = (1) x scalar(@q);
5924              
5925             } ## end BEGIN
5926              
5927 39     39   318 use constant DEBUG_OPERATOR_EXPECTED => 0;
  39         85  
  39         86888  
5928              
5929             sub operator_expected {
5930              
5931             # Returns a parameter indicating what types of tokens can occur next
5932              
5933             # Call format:
5934             # $op_expected =
5935             # $self->operator_expected( $tok, $next_type, $blank_after_Z );
5936             # where
5937             # $tok is the current token
5938             # $next_type is the type of the next token (blank or not)
5939             # $blank_after_Z = flag for guessing after a type 'Z':
5940             # true if $tok follows type 'Z' with intermediate blank
5941             # false if $tok follows type 'Z' with no intermediate blank
5942             # ignored if $tok does not follow type 'Z'
5943              
5944             # Many perl symbols have two or more meanings. For example, '<<'
5945             # can be a shift operator or a here-doc operator. The
5946             # interpretation of these symbols depends on the current state of
5947             # the tokenizer, which may either be expecting a term or an
5948             # operator. For this example, a << would be a shift if an OPERATOR
5949             # is expected, and a here-doc if a TERM is expected. This routine
5950             # is called to make this decision for any current token. It returns
5951             # one of three possible values:
5952             #
5953             # OPERATOR - operator expected (or at least, not a term)
5954             # UNKNOWN - can't tell
5955             # TERM - a term is expected (or at least, not an operator)
5956             #
5957             # The decision is based on what has been seen so far. This
5958             # information is stored in the "$last_nonblank_type" and
5959             # "$last_nonblank_token" variables. For example, if the
5960             # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5961             # if $last_nonblank_type is 'n' (numeric), we are expecting an
5962             # OPERATOR.
5963             #
5964             # If a UNKNOWN is returned, the calling routine must guess. A major
5965             # goal of this tokenizer is to minimize the possibility of returning
5966             # UNKNOWN, because a wrong guess can spoil the formatting of a
5967             # script.
5968             #
5969             # Adding NEW_TOKENS: it is critically important that this routine be
5970             # updated to allow it to determine if an operator or term is to be
5971             # expected after the new token. Doing this simply involves adding
5972             # the new token character to one of the regexes in this routine or
5973             # to one of the hash lists
5974             # that it uses, which are initialized in the BEGIN section.
5975             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5976             # $statement_type
5977              
5978             # When possible, token types should be selected such that we can determine
5979             # the 'operator_expected' value by a simple hash lookup. If there are
5980             # exceptions, that is an indication that a new type is needed.
5981              
5982 8427     8427 0 19669 my ( $self, $tok, $next_type, $blank_after_Z ) = @_;
5983              
5984             #--------------------------------------------
5985             # Section 1: Table lookup will get most cases
5986             #--------------------------------------------
5987              
5988             # Many types are can be obtained by a table lookup. This typically handles
5989             # more than half of the calls. For speed, the caller may try table lookup
5990             # first before calling this sub.
5991 8427         13362 my $op_expected = $op_expected_table{$last_nonblank_type};
5992 8427 100       17089 if ( defined($op_expected) ) {
5993             DEBUG_OPERATOR_EXPECTED
5994 24         58 && print {*STDOUT}
5995             "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5996 24         85 return $op_expected;
5997             }
5998              
5999             #---------------------------------------------
6000             # Section 2: Handle special cases if necessary
6001             #---------------------------------------------
6002              
6003             # Types 'k', '}' and 'Z' depend on context
6004             # Types 'n', 'v', 'q' also depend on context.
6005              
6006             # identifier...
6007             # Fix for c250: removed coding for type 'i' because 'i' and new type 'P'
6008             # are now done by hash table lookup
6009              
6010             # keyword...
6011 8403 100       24838 if ( $last_nonblank_type eq 'k' ) {
    100          
    100          
    100          
    100          
6012              
6013             # keywords expecting OPERATOR:
6014 2647 100       12295 if ( $expecting_operator_token{$last_nonblank_token} ) {
    100          
6015 7         19 $op_expected = OPERATOR;
6016             }
6017              
6018             # keywords expecting TERM:
6019             elsif ( $expecting_term_token{$last_nonblank_token} ) {
6020              
6021             # Exceptions from TERM:
6022              
6023             # // may follow perl functions which may be unary operators
6024             # see test file dor.t (defined or);
6025 2541 100 100     10557 if (
    50 100        
      66        
6026             $tok eq '/'
6027             && $next_type eq '/'
6028             && $is_keyword_rejecting_slash_as_pattern_delimiter{
6029             $last_nonblank_token}
6030             )
6031             {
6032 1         4 $op_expected = OPERATOR;
6033             }
6034              
6035             # Patch to allow a ? following 'split' to be a deprecated pattern
6036             # delimiter. This patch is coordinated with the omission of split
6037             # from the list
6038             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
6039             # will force perltidy to guess.
6040             elsif ($tok eq '?'
6041             && $last_nonblank_token eq 'split' )
6042             {
6043 0         0 $op_expected = UNKNOWN;
6044             }
6045             else {
6046 2540         4426 $op_expected = TERM;
6047             }
6048             }
6049             else {
6050 99         292 $op_expected = TERM;
6051             }
6052             } ## end type 'k'
6053              
6054             # closing container token...
6055              
6056             # Note that the actual token for type '}' may also be a ')'.
6057              
6058             # Also note that $last_nonblank_token is not the token corresponding to
6059             # $last_nonblank_type when the type is a closing container. In that
6060             # case it is the token before the corresponding opening container token.
6061             # So for example, for this snippet
6062             # $a = do { BLOCK } / 2;
6063             # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
6064              
6065             elsif ( $last_nonblank_type eq '}' ) {
6066 3593         6443 $op_expected = UNKNOWN;
6067              
6068             # handle something after 'do' and 'eval'
6069 3593 100 66     19118 if ( $is_block_operator{$last_nonblank_token} ) {
    100          
    100          
6070              
6071             # something like $a = do { BLOCK } / 2;
6072 82         222 $op_expected = OPERATOR; # block mode following }
6073             }
6074              
6075             # $last_nonblank_token =~ /^(\)|\$|\-\>)/
6076             elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
6077             || substr( $last_nonblank_token, 0, 2 ) eq '->' )
6078             {
6079 2123         3604 $op_expected = OPERATOR;
6080 2123 50       5479 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
  0         0  
6081             }
6082              
6083             # Check for smartmatch operator before preceding brace or square
6084             # bracket. For example, at the ? after the ] in the following
6085             # expressions we are expecting an operator:
6086             #
6087             # qr/3/ ~~ ['1234'] ? 1 : 0;
6088             # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
6089             elsif ( $last_nonblank_token eq '~~' ) {
6090 20         39 $op_expected = OPERATOR;
6091             }
6092              
6093             # A right brace here indicates the end of a simple block. All
6094             # non-structural right braces have type 'R' all braces associated with
6095             # block operator keywords have been given those keywords as
6096             # "last_nonblank_token" and caught above. (This statement is order
6097             # dependent, and must come after checking $last_nonblank_token).
6098             else {
6099              
6100             # patch for dor.t (defined or).
6101 1368 50 33     6174 if ( $tok eq '/'
    100 33        
6102             && $next_type eq '/'
6103             && $last_nonblank_token eq ']' )
6104             {
6105 0         0 $op_expected = OPERATOR;
6106             }
6107              
6108             # Patch for RT #116344: misparse a ternary operator after an
6109             # anonymous hash, like this:
6110             # return ref {} ? 1 : 0;
6111             # The right brace should really be marked type 'R' in this case,
6112             # and it is safest to return an UNKNOWN here. Expecting a TERM will
6113             # cause the '?' to always be interpreted as a pattern delimiter
6114             # rather than introducing a ternary operator.
6115             elsif ( $tok eq '?' ) {
6116 1         3 $op_expected = UNKNOWN;
6117             }
6118             else {
6119 1367         2408 $op_expected = TERM;
6120             }
6121             }
6122             } ## end type '}'
6123              
6124             # number or v-string...
6125             # An exception is for VERSION numbers a 'use' statement. It has the format
6126             # use Module VERSION LIST
6127             # We could avoid this exception by writing a special sub to parse 'use'
6128             # statements and perhaps mark these numbers with a new type V (for VERSION)
6129             ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
6130             elsif ( $is_n_v{$last_nonblank_type} ) {
6131 1985         3123 $op_expected = OPERATOR;
6132 1985 100       4503 if ( $statement_type eq 'use' ) {
6133 11         40 $op_expected = UNKNOWN;
6134             }
6135             }
6136              
6137             # quote...
6138             # TODO: labeled prototype words would better be given type 'A' or maybe
6139             # 'J'; not 'q'; or maybe mark as type 'Y'?
6140             elsif ( $last_nonblank_type eq 'q' ) {
6141 137 50       609 if ( $last_nonblank_token eq 'prototype' ) {
    100          
6142 0         0 $op_expected = TERM;
6143             }
6144              
6145             # update for --use-feature=class (rt145706):
6146             # Look for class VERSION after possible attribute, as in
6147             # class Example::Subclass : isa(Example::Base) 1.345 { ... }
6148             elsif ( $statement_type =~ /^package\b/ ) {
6149 3         7 $op_expected = TERM;
6150             }
6151              
6152             # everything else
6153             else {
6154 134         291 $op_expected = OPERATOR;
6155             }
6156             }
6157              
6158             # file handle or similar
6159             elsif ( $last_nonblank_type eq 'Z' ) {
6160              
6161             # angle.t
6162 40 100 33     567 if ( $last_nonblank_token =~ /^\w/ ) {
    50 66        
    100          
    50          
    100          
6163 2         5 $op_expected = UNKNOWN;
6164             }
6165              
6166             # Exception to weird parsing rules for 'x(' ... see case b1205:
6167             # In something like 'print $vv x(...' the x is an operator;
6168             # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
6169             # otherwise x follows the weird parsing rules.
6170             elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
6171 0         0 $op_expected = OPERATOR;
6172             }
6173              
6174             # The 'weird parsing rules' of next section do not work for '<' and '?'
6175             # It is best to mark them as unknown. Test case:
6176             # print $fh <DATA>;
6177             elsif ( $is_weird_parsing_rule_exception{$tok} ) {
6178 4         15 $op_expected = UNKNOWN;
6179             }
6180              
6181             # For possible file handle like "$a", Perl uses weird parsing rules.
6182             # For example:
6183             # print $a/2,"/hi"; - division
6184             # print $a / 2,"/hi"; - division
6185             # print $a/ 2,"/hi"; - division
6186             # print $a /2,"/hi"; - pattern (and error)!
6187             # Some examples where this logic works okay, for '&','*','+':
6188             # print $fh &xsi_protos(@mods);
6189             # my $x = new $CompressClass *FH;
6190             # print $OUT +( $count % 15 ? ", " : "\n\t" );
6191             elsif ($blank_after_Z
6192             && $next_type ne 'b' )
6193             {
6194 0         0 $op_expected = TERM;
6195             }
6196              
6197             # Note that '?' and '<' have been moved above
6198             # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
6199             elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
6200              
6201             # Do not complain in 'use' statements, which have special syntax.
6202             # For example, from RT#130344:
6203             # use lib $FindBin::Bin . '/lib';
6204 9 50       38 if ( $statement_type ne 'use' ) {
6205 9         37 $self->complain(
6206             "operator in possible indirect object location not recommended\n"
6207             );
6208             }
6209 9         24 $op_expected = OPERATOR;
6210             }
6211              
6212             # all other cases
6213             else {
6214 25         72 $op_expected = UNKNOWN;
6215             }
6216             }
6217              
6218             # anything else...
6219             else {
6220 1         5 $op_expected = UNKNOWN;
6221             }
6222              
6223             DEBUG_OPERATOR_EXPECTED
6224 8403         11569 && print {*STDOUT}
6225             "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6226              
6227 8403         15774 return $op_expected;
6228              
6229             } ## end sub operator_expected
6230              
6231             sub new_statement_ok {
6232              
6233             # Returns:
6234             # true if a new statement can begin here
6235             # false otherwise
6236              
6237             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6238             # $brace_depth, $rbrace_type
6239              
6240             # Uses:
6241             # - See if a 'class' statement can occur here
6242             # - See if a keyword begins at a new statement; i.e. is an 'if' a
6243             # block if or a trailing if? Also see if 'format' starts a statement.
6244             # - Decide if a ':' is part of a statement label (not a ternary)
6245              
6246             # Curly braces are tricky because some small blocks do not get marked as
6247             # blocks..
6248              
6249             # if it follows an opening curly brace..
6250 436 100 66 436 0 2366 if ( $last_nonblank_token eq '{' ) {
    100          
6251              
6252             # The safe thing is to return true in all cases because:
6253             # - a ternary ':' cannot occur here
6254             # - an 'if' here, for example, cannot be a trailing if
6255             # See test case c231 for an example.
6256             # This works but could be improved, if necessary, by returning
6257             # 'false' at obvious non-blocks.
6258 59         245 return 1;
6259             }
6260              
6261             # if it follows a closing code block curly brace..
6262             elsif ($last_nonblank_token eq '}'
6263             && $last_nonblank_type eq $last_nonblank_token )
6264             {
6265              
6266             # a new statement can follow certain closing block braces ...
6267             # FIXME: The following has worked well but returns true in some cases
6268             # where it really should not. We could fix this by either excluding
6269             # certain blocks, like sort/map/grep/eval/asub or by just including
6270             # certain blocks.
6271 99         429 return $rbrace_type->[$brace_depth];
6272             }
6273              
6274             # otherwise, it is a label if and only if it follows a ';' (real or fake)
6275             # or another label
6276             else {
6277 278   100     1892 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
6278             }
6279             } ## end sub new_statement_ok
6280              
6281             sub code_block_type {
6282              
6283             # Decide if this is a block of code, and its type.
6284             # Must be called only when $type = $token = '{'
6285             # The problem is to distinguish between the start of a block of code
6286             # and the start of an anonymous hash reference
6287             # Returns "" if not code block, otherwise returns 'last_nonblank_token'
6288             # to indicate the type of code block. (For example, 'last_nonblank_token'
6289             # might be 'if' for an if block, 'else' for an else block, etc).
6290             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6291             # $last_nonblank_block_type, $brace_depth, $rbrace_type
6292              
6293             # handle case of multiple '{'s
6294              
6295             # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
6296              
6297 1303     1303 0 3321 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6298 1303 100 66     16515 if ( $last_nonblank_token eq '{'
    100 66        
    100 66        
    100 100        
    100 66        
    100 66        
    50          
    50          
    100          
    100          
    100          
6299             && $last_nonblank_type eq $last_nonblank_token )
6300             {
6301              
6302             # opening brace where a statement may appear is probably
6303             # a code block but might be and anonymous hash reference
6304 90 50       318 if ( $rbrace_type->[$brace_depth] ) {
6305 90         355 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6306             $max_token_index );
6307             }
6308              
6309             # cannot start a code block within an anonymous hash
6310             else {
6311 0         0 return EMPTY_STRING;
6312             }
6313             }
6314              
6315             elsif ( $last_nonblank_token eq ';' ) {
6316              
6317             # an opening brace where a statement may appear is probably
6318             # a code block but might be and anonymous hash reference
6319 48         274 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6320             $max_token_index );
6321             }
6322              
6323             # handle case of '}{'
6324             elsif ($last_nonblank_token eq '}'
6325             && $last_nonblank_type eq $last_nonblank_token )
6326             {
6327              
6328             # a } { situation ...
6329             # could be hash reference after code block..(blktype1.t)
6330 9 50       29 if ($last_nonblank_block_type) {
6331 9         37 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6332             $max_token_index );
6333             }
6334              
6335             # must be a block if it follows a closing hash reference
6336             else {
6337 0         0 return $last_nonblank_token;
6338             }
6339             }
6340              
6341             #--------------------------------------------------------------
6342             # NOTE: braces after type characters start code blocks, but for
6343             # simplicity these are not identified as such. See also
6344             # sub is_non_structural_brace.
6345             #--------------------------------------------------------------
6346              
6347             ## elsif ( $last_nonblank_type eq 't' ) {
6348             ## return $last_nonblank_token;
6349             ## }
6350              
6351             # brace after label:
6352             elsif ( $last_nonblank_type eq 'J' ) {
6353 34         149 return $last_nonblank_token;
6354             }
6355              
6356             # otherwise, look at previous token. This must be a code block if
6357             # it follows any of these:
6358             # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6359             elsif ($is_code_block_token{$last_nonblank_token}
6360             || $is_grep_alias{$last_nonblank_token} )
6361             {
6362              
6363             # Bug Patch: Note that the opening brace after the 'if' in the following
6364             # snippet is an anonymous hash ref and not a code block!
6365             # print 'hi' if { x => 1, }->{x};
6366             # We can identify this situation because the last nonblank type
6367             # will be a keyword (instead of a closing paren)
6368 481 50 33     2566 if (
      66        
6369             $last_nonblank_type eq 'k'
6370             && ( $last_nonblank_token eq 'if'
6371             || $last_nonblank_token eq 'unless' )
6372             )
6373             {
6374 0         0 return EMPTY_STRING;
6375             }
6376             else {
6377 481         1513 return $last_nonblank_token;
6378             }
6379             }
6380              
6381             # or a sub or package BLOCK
6382             # Fixed for c250 to include new package type 'P', and change 'i' to 'S'
6383             elsif (
6384             $last_nonblank_type eq 'P'
6385             || $last_nonblank_type eq 'S'
6386             || ( $last_nonblank_type eq 't'
6387             && substr( $last_nonblank_token, 0, 3 ) eq 'sub' )
6388             )
6389             {
6390 295         984 return $last_nonblank_token;
6391             }
6392              
6393             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6394 0         0 return $statement_type;
6395             }
6396              
6397             # user-defined subs with block parameters (like grep/map/eval)
6398             elsif ( $last_nonblank_type eq 'G' ) {
6399 0         0 return $last_nonblank_token;
6400             }
6401              
6402             # check bareword
6403             elsif ( $last_nonblank_type eq 'w' ) {
6404              
6405             # check for syntax 'use MODULE LIST'
6406             # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6407 22 100       100 return EMPTY_STRING if ( $statement_type eq 'use' );
6408              
6409 21         113 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6410             $max_token_index );
6411             }
6412              
6413             # Patch for bug # RT #94338 reported by Daniel Trizen
6414             # for-loop in a parenthesized block-map triggering an error message:
6415             # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6416             # Check for a code block within a parenthesized function call
6417             elsif ( $last_nonblank_token eq '(' ) {
6418 81         203 my $paren_type = $rparen_type->[$paren_depth];
6419              
6420             # /^(map|grep|sort)$/
6421 81 100 66     425 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6422              
6423             # We will mark this as a code block but use type 't' instead
6424             # of the name of the containing function. This will allow for
6425             # correct parsing but will usually produce better formatting.
6426             # Braces with block type 't' are not broken open automatically
6427             # in the formatter as are other code block types, and this usually
6428             # works best.
6429 1         4 return 't'; # (Not $paren_type)
6430             }
6431             else {
6432 80         244 return EMPTY_STRING;
6433             }
6434             }
6435              
6436             # handle unknown syntax ') {'
6437             # we previously appended a '()' to mark this case
6438             elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6439 14         50 return $last_nonblank_token;
6440             }
6441              
6442             # anything else must be anonymous hash reference
6443             else {
6444 229         676 return EMPTY_STRING;
6445             }
6446             } ## end sub code_block_type
6447              
6448             sub decide_if_code_block {
6449              
6450             # USES GLOBAL VARIABLES: $last_nonblank_token
6451 168     168 0 462 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6452              
6453 168         591 my ( $next_nonblank_token, $i_next ) =
6454             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
6455              
6456             # we are at a '{' where a statement may appear.
6457             # We must decide if this brace starts an anonymous hash or a code
6458             # block.
6459             # return "" if anonymous hash, and $last_nonblank_token otherwise
6460              
6461             # initialize to be code BLOCK
6462 168         492 my $code_block_type = $last_nonblank_token;
6463              
6464             # Check for the common case of an empty anonymous hash reference:
6465             # Maybe something like sub { { } }
6466 168 100       519 if ( $next_nonblank_token eq '}' ) {
6467 5         14 $code_block_type = EMPTY_STRING;
6468             }
6469              
6470             else {
6471              
6472             # To guess if this '{' is an anonymous hash reference, look ahead
6473             # and test as follows:
6474             #
6475             # it is a hash reference if next come:
6476             # - a string or digit followed by a comma or =>
6477             # - bareword followed by =>
6478             # otherwise it is a code block
6479             #
6480             # Examples of anonymous hash ref:
6481             # {'aa',};
6482             # {1,2}
6483             #
6484             # Examples of code blocks:
6485             # {1; print "hello\n", 1;}
6486             # {$a,1};
6487              
6488             # We are only going to look ahead one more (nonblank/comment) line.
6489             # Strange formatting could cause a bad guess, but that's unlikely.
6490 163         410 my @pre_types;
6491             my @pre_tokens;
6492              
6493             # Ignore the rest of this line if it is a side comment
6494 163 100       541 if ( $next_nonblank_token ne '#' ) {
6495 139         525 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
  139         815  
6496 139         421 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
  139         710  
6497             }
6498              
6499             # Here 20 is arbitrary but generous, and prevents wasting lots of time
6500             # in mangled files
6501 163         664 my ( $rpre_tokens, $rpre_types ) =
6502             $self->peek_ahead_for_n_nonblank_pre_tokens(20);
6503 163 100 66     621 if ( defined($rpre_types) && @{$rpre_types} ) {
  155         592  
6504 155         315 push @pre_types, @{$rpre_types};
  155         613  
6505 155         296 push @pre_tokens, @{$rpre_tokens};
  155         732  
6506             }
6507              
6508             # put a sentinel token to simplify stopping the search
6509 163         1326 push @pre_types, '}';
6510 163         322 push @pre_types, '}';
6511              
6512 163         312 my $jbeg = 0;
6513 163 100       477 $jbeg = 1 if $pre_types[0] eq 'b';
6514              
6515             # first look for one of these
6516             # - bareword
6517             # - bareword with leading -
6518             # - digit
6519             # - quoted string
6520 163         292 my $j = $jbeg;
6521 163 100 33     1288 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
    100          
    100          
    50          
6522              
6523             # find the closing quote; don't worry about escapes
6524 1         3 my $quote_mark = $pre_types[$j];
6525 1         17 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6526 1 50       6 if ( $pre_types[$k] eq $quote_mark ) {
6527 1         2 $j = $k + 1;
6528             ##my $next = $pre_types[$j];
6529 1         3 last;
6530             }
6531             }
6532             }
6533             elsif ( $pre_types[$j] eq 'd' ) {
6534 8         14 $j++;
6535             }
6536             elsif ( $pre_types[$j] eq 'w' ) {
6537 71         177 $j++;
6538             }
6539             elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6540 0         0 $j++;
6541             }
6542             else {
6543             # none of the above
6544             }
6545 163 100       526 if ( $j > $jbeg ) {
6546              
6547 80 100       332 $j++ if $pre_types[$j] eq 'b';
6548              
6549             # Patched for RT #95708
6550 80 100 33     692 if (
      66        
      66        
6551              
6552             # it is a comma which is not a pattern delimiter except for qw
6553             (
6554             $pre_types[$j] eq ','
6555             ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6556             && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6557             )
6558              
6559             # or a =>
6560             || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6561             )
6562             {
6563 18         43 $code_block_type = EMPTY_STRING;
6564             }
6565             }
6566              
6567 163 100       556 if ($code_block_type) {
6568              
6569             # Patch for cases b1085 b1128: It is uncertain if this is a block.
6570             # If this brace follows a bareword, then append a space as a signal
6571             # to the formatter that this may not be a block brace. To find the
6572             # corresponding code in Formatter.pm search for 'b1085'.
6573 145 100       1222 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6574             }
6575             }
6576              
6577 168         583 return $code_block_type;
6578             } ## end sub decide_if_code_block
6579              
6580             sub report_unexpected {
6581              
6582             # report unexpected token type and show where it is
6583             # USES GLOBAL VARIABLES: (none)
6584 0     0 0 0 my ( $self, $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6585             $rpretoken_type, $input_line )
6586             = @_;
6587              
6588 0 0       0 if ( ++$self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6589 0         0 my $msg = "found $found where $expecting expected";
6590 0         0 my $pos = $rpretoken_map->[$i_tok];
6591 0         0 $self->interrupt_logfile();
6592 0         0 my $input_line_number = $self->[_last_line_number_];
6593 0         0 my ( $offset, $numbered_line, $underline ) =
6594             make_numbered_line( $input_line_number, $input_line, $pos );
6595 0         0 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6596              
6597 0         0 my $trailer = EMPTY_STRING;
6598 0 0 0     0 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6599 0         0 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6600 0         0 my $num;
6601 0 0       0 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6602 0         0 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6603             }
6604             else {
6605 0         0 $num = $pos - $pos_prev;
6606             }
6607 0 0       0 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
  0         0  
  0         0  
6608              
6609             $underline =
6610 0         0 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6611 0         0 $trailer = " (previous token underlined)";
6612             }
6613 0         0 $underline =~ s/\s+$//;
6614 0         0 $self->warning( $numbered_line . "\n" );
6615 0         0 $self->warning( $underline . "\n" );
6616 0         0 $self->warning( $msg . $trailer . "\n" );
6617 0         0 $self->resume_logfile();
6618             }
6619 0         0 return;
6620             } ## end sub report_unexpected
6621              
6622             my %is_sigil_or_paren;
6623             my %is_R_closing_sb;
6624              
6625             BEGIN {
6626              
6627 39     39   336 my @q = qw< $ & % * @ ) >;
6628 39         410 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6629              
6630 39         236 @q = qw(R ]);
6631 39         86532 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6632             } ## end BEGIN
6633              
6634             sub is_non_structural_brace {
6635              
6636             # Decide if a brace or bracket is structural or non-structural
6637             # by looking at the previous token and type
6638             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6639              
6640             # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6641             # Tentatively deactivated because it caused the wrong operator expectation
6642             # for this code:
6643             # $user = @vars[1] / 100;
6644             # Must update sub operator_expected before re-implementing.
6645             # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6646             # return 0;
6647             # }
6648              
6649             #--------------------------------------------------------------
6650             # NOTE: braces after type characters start code blocks, but for
6651             # simplicity these are not identified as such. See also
6652             # sub code_block_type
6653             #--------------------------------------------------------------
6654              
6655             ##if ($last_nonblank_type eq 't') {return 0}
6656              
6657             # otherwise, it is non-structural if it is decorated
6658             # by type information.
6659             # For example, the '{' here is non-structural: ${xxx}
6660             # Removed '::' to fix c074
6661             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6662             return (
6663             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6664             $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6665             || substr( $last_nonblank_token, 0, 2 ) eq '->'
6666              
6667             # or if we follow a hash or array closing curly brace or bracket
6668             # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6669             # because the first '}' would have been given type 'R'
6670             ##|| $last_nonblank_type =~ /^([R\]])$/
6671 2265   66 2265 0 14950 || $is_R_closing_sb{$last_nonblank_type}
6672             );
6673             } ## end sub is_non_structural_brace
6674              
6675             #######################################################################
6676             # Tokenizer routines for tracking container nesting depths
6677             #######################################################################
6678              
6679             # The following routines keep track of nesting depths of the nesting
6680             # types, ( [ { and ?. This is necessary for determining the indentation
6681             # level, and also for debugging programs. Not only do they keep track of
6682             # nesting depths of the individual brace types, but they check that each
6683             # of the other brace types is balanced within matching pairs. For
6684             # example, if the program sees this sequence:
6685             #
6686             # { ( ( ) }
6687             #
6688             # then it can determine that there is an extra left paren somewhere
6689             # between the { and the }. And so on with every other possible
6690             # combination of outer and inner brace types. For another
6691             # example:
6692             #
6693             # ( [ ..... ] ] )
6694             #
6695             # which has an extra ] within the parens.
6696             #
6697             # The brace types have indexes 0 .. 3 which are indexes into
6698             # the matrices.
6699             #
6700             # The pair ? : are treated as just another nesting type, with ? acting
6701             # as the opening brace and : acting as the closing brace.
6702             #
6703             # The matrix
6704             #
6705             # $rdepth_array->[$a][$b][ $rcurrent_depth->[$a] ] = $rcurrent_depth->[$b];
6706             #
6707             # saves the nesting depth of brace type $b (where $b is either of the other
6708             # nesting types) when brace type $a enters a new depth. When this depth
6709             # decreases, a check is made that the current depth of brace types $b is
6710             # unchanged, or otherwise there must have been an error. This can
6711             # be very useful for localizing errors, particularly when perl runs to
6712             # the end of a large file (such as this one) and announces that there
6713             # is a problem somewhere.
6714             #
6715             # A numerical sequence number is maintained for every nesting type,
6716             # so that each matching pair can be uniquely identified in a simple
6717             # way.
6718              
6719             sub increase_nesting_depth {
6720 4578     4578 0 9285 my ( $self, $aa, $pos ) = @_;
6721              
6722             # USES GLOBAL VARIABLES: $rcurrent_depth,
6723             # $rcurrent_sequence_number, $rdepth_array,
6724             # $rstarting_line_of_current_depth, $statement_type
6725 4578         8287 my $cd_aa = ++$rcurrent_depth->[$aa];
6726 4578         6786 $total_depth++;
6727 4578         9092 $rtotal_depth->[$aa][$cd_aa] = $total_depth;
6728 4578         7531 my $input_line_number = $self->[_last_line_number_];
6729 4578         7817 my $input_line = $self->[_line_of_text_];
6730              
6731             # Sequence numbers increment by number of items. This keeps
6732             # a unique set of numbers but still allows the relative location
6733             # of any type to be determined.
6734              
6735             # make a new unique sequence number
6736 4578         7800 my $seqno = $next_sequence_number++;
6737              
6738 4578         8231 $rcurrent_sequence_number->[$aa][$cd_aa] = $seqno;
6739              
6740 4578         14795 $rstarting_line_of_current_depth->[$aa][$cd_aa] =
6741             [ $input_line_number, $input_line, $pos ];
6742              
6743 4578         14152 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6744 18312 100       33496 next if ( $bb == $aa );
6745 13734         26884 $rdepth_array->[$aa][$bb][$cd_aa] = $rcurrent_depth->[$bb];
6746             }
6747              
6748             # set a flag for indenting a nested ternary statement
6749 4578         8568 my $indent = 0;
6750 4578 100       10504 if ( $aa == QUESTION_COLON ) {
6751 187         665 $rnested_ternary_flag->[$cd_aa] = 0;
6752 187 100       713 if ( $cd_aa > 1 ) {
6753 17 100       114 if ( $rnested_ternary_flag->[ $cd_aa - 1 ] == 0 ) {
6754 16         47 my $pdepth = $rtotal_depth->[$aa][ $cd_aa - 1 ];
6755 16 100       57 if ( $pdepth == $total_depth - 1 ) {
6756 8         14 $indent = 1;
6757 8         23 $rnested_ternary_flag->[ $cd_aa - 1 ] = -1;
6758             }
6759             }
6760             }
6761             }
6762              
6763             # Fix part #1 for git82: save last token type for propagation of type 'Z'
6764 4578         15682 $rnested_statement_type->[$aa][$cd_aa] =
6765             [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6766 4578         7775 $statement_type = EMPTY_STRING;
6767 4578         12762 return ( $seqno, $indent );
6768             } ## end sub increase_nesting_depth
6769              
6770             sub is_balanced_closing_container {
6771              
6772             # Return true if a closing container can go here without error
6773             # Return false if not
6774 47     47 0 125 my ($aa) = @_;
6775              
6776             # cannot close if there was no opening
6777 47         97 my $cd_aa = $rcurrent_depth->[$aa];
6778 47 100       189 return if ( $cd_aa <= 0 );
6779              
6780             # check that any other brace types $bb contained within would be balanced
6781 8         36 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6782 8 50       20 next if ( $bb == $aa );
6783             return
6784 8 50       43 if ( $rdepth_array->[$aa][$bb][$cd_aa] != $rcurrent_depth->[$bb] );
6785             }
6786              
6787             # OK, everything will be balanced
6788 0         0 return 1;
6789             } ## end sub is_balanced_closing_container
6790              
6791             sub decrease_nesting_depth {
6792              
6793 4578     4578 0 9194 my ( $self, $aa, $pos ) = @_;
6794              
6795             # USES GLOBAL VARIABLES: $rcurrent_depth,
6796             # $rcurrent_sequence_number, $rdepth_array, $rstarting_line_of_current_depth
6797             # $statement_type
6798 4578         7121 my $seqno = 0;
6799 4578         7454 my $input_line_number = $self->[_last_line_number_];
6800 4578         7779 my $input_line = $self->[_line_of_text_];
6801              
6802 4578         6830 my $outdent = 0;
6803 4578         6669 $total_depth--;
6804 4578         7889 my $cd_aa = $rcurrent_depth->[$aa];
6805 4578 50       9576 if ( $cd_aa > 0 ) {
6806              
6807             # set a flag for un-indenting after seeing a nested ternary statement
6808 4578         8321 $seqno = $rcurrent_sequence_number->[$aa][$cd_aa];
6809 4578 100       9983 if ( $aa == QUESTION_COLON ) {
6810 187         567 $outdent = $rnested_ternary_flag->[$cd_aa];
6811             }
6812              
6813             # Fix part #2 for git82: use saved type for propagation of type 'Z'
6814             # through type L-R braces. Perl seems to allow ${bareword}
6815             # as an indirect object, but nothing much more complex than that.
6816             ( $statement_type, my $saved_type, my $saved_token ) =
6817 4578         6871 @{ $rnested_statement_type->[$aa][ $rcurrent_depth->[$aa] ] };
  4578         13015  
6818 4578 50 100     16056 if ( $aa == BRACE
      66        
      66        
6819             && $saved_type eq 'Z'
6820             && $last_nonblank_type eq 'w'
6821             && $rbrace_structural_type->[$brace_depth] eq 'L' )
6822             {
6823 1         3 $last_nonblank_type = $saved_type;
6824             }
6825              
6826             # check that any brace types $bb contained within are balanced
6827 4578         13217 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6828 18312 100       32593 next if ( $bb == $aa );
6829              
6830 13734 50       32116 if ( $rdepth_array->[$aa][$bb][$cd_aa] != $rcurrent_depth->[$bb] ) {
6831 0         0 my $diff =
6832             $rcurrent_depth->[$bb] - $rdepth_array->[$aa][$bb][$cd_aa];
6833              
6834             # don't whine too many times
6835 0         0 my $saw_brace_error = $self->get_saw_brace_error();
6836 0 0 0     0 if (
      0        
6837             $saw_brace_error <= MAX_NAG_MESSAGES
6838              
6839             # if too many closing types have occurred, we probably
6840             # already caught this error
6841             && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6842             )
6843             {
6844 0         0 $self->interrupt_logfile();
6845 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
6846 0         0 my $sl = $rsl->[0];
6847 0         0 my $rel = [ $input_line_number, $input_line, $pos ];
6848 0         0 my $el = $rel->[0];
6849 0         0 my ($ess);
6850              
6851 0 0 0     0 if ( $diff == 1 || $diff == -1 ) {
6852 0         0 $ess = EMPTY_STRING;
6853             }
6854             else {
6855 0         0 $ess = 's';
6856             }
6857 0 0       0 my $bname =
6858             ( $diff > 0 )
6859             ? $opening_brace_names[$bb]
6860             : $closing_brace_names[$bb];
6861 0         0 $self->write_error_indicator_pair( @{$rsl}, '^' );
  0         0  
6862 0         0 my $msg = <<"EOM";
6863             Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6864             EOM
6865              
6866 0 0       0 if ( $diff > 0 ) {
6867 0         0 my $rml =
6868             $rstarting_line_of_current_depth->[$bb]
6869             [ $rcurrent_depth->[$bb] ];
6870 0         0 my $ml = $rml->[0];
6871 0         0 $msg .=
6872             " The most recent un-matched $bname is on line $ml\n";
6873 0         0 $self->write_error_indicator_pair( @{$rml}, '^' );
  0         0  
6874             }
6875 0         0 $self->write_error_indicator_pair( @{$rel}, '^' );
  0         0  
6876 0         0 $self->warning($msg);
6877 0         0 $self->resume_logfile();
6878             }
6879 0         0 $self->increment_brace_error();
6880             }
6881             }
6882 4578         8825 $rcurrent_depth->[$aa]--;
6883             }
6884             else {
6885              
6886 0         0 my $saw_brace_error = $self->get_saw_brace_error();
6887 0 0       0 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6888 0         0 my $msg = <<"EOM";
6889             There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6890             EOM
6891 0         0 $self->indicate_error( $msg, $input_line_number, $input_line, $pos,
6892             '^' );
6893             }
6894 0         0 $self->increment_brace_error();
6895              
6896             # keep track of errors in braces alone (ignoring ternary nesting errors)
6897 0 0       0 $self->[_true_brace_error_count_]++
6898             if ( $closing_brace_names[$aa] ne "':'" );
6899             }
6900 4578         12813 return ( $seqno, $outdent );
6901             } ## end sub decrease_nesting_depth
6902              
6903             sub check_final_nesting_depths {
6904              
6905             # USES GLOBAL VARIABLES: $rcurrent_depth, $rstarting_line_of_current_depth
6906 562     562 0 1401 my $self = shift;
6907              
6908 562         2321 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6909              
6910 2248         3728 my $cd_aa = $rcurrent_depth->[$aa];
6911 2248 50       5218 if ($cd_aa) {
6912 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
6913 0         0 my $sl = $rsl->[0];
6914 0         0 my $msg = <<"EOM";
6915             Final nesting depth of $opening_brace_names[$aa]s is $cd_aa
6916             The most recent un-matched $opening_brace_names[$aa] is on line $sl
6917             EOM
6918 0         0 $self->indicate_error( $msg, @{$rsl}, '^' );
  0         0  
6919 0         0 $self->increment_brace_error();
6920             }
6921             }
6922 562         1613 return;
6923             } ## end sub check_final_nesting_depths
6924              
6925             #######################################################################
6926             # Tokenizer routines for looking ahead in input stream
6927             #######################################################################
6928              
6929             sub peek_ahead_for_n_nonblank_pre_tokens {
6930              
6931             # returns next n pretokens if they exist
6932             # returns undef's if hits eof without seeing any pretokens
6933             # USES GLOBAL VARIABLES: (none)
6934 170     170 0 424 my ( $self, $max_pretokens ) = @_;
6935 170         287 my $line;
6936 170         291 my $i = 0;
6937 170         361 my ( $rpre_tokens, $rmap, $rpre_types );
6938              
6939 170         553 while ( $line = $self->peek_ahead( $i++ ) ) {
6940 182         940 $line =~ s/^\s*//; # trim leading blanks
6941 182 100       606 next if ( length($line) <= 0 ); # skip blank
6942 176 100       601 next if ( $line =~ /^#/ ); # skip comment
6943 162         472 ( $rpre_tokens, $rmap, $rpre_types ) =
6944             pre_tokenize( $line, $max_pretokens );
6945 162         463 last;
6946             }
6947 170         629 return ( $rpre_tokens, $rpre_types );
6948             } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6949              
6950             # look ahead for next non-blank, non-comment line of code
6951             sub peek_ahead_for_nonblank_token {
6952              
6953             # USES GLOBAL VARIABLES: (none)
6954 125     125 0 377 my ( $self, $rtokens, $max_token_index ) = @_;
6955 125         234 my $line;
6956 125         259 my $i = 0;
6957              
6958 125         534 while ( $line = $self->peek_ahead( $i++ ) ) {
6959 169         979 $line =~ s/^\s*//; # trim leading blanks
6960 169 100       682 next if ( length($line) <= 0 ); # skip blank
6961 144 100       619 next if ( $line =~ /^#/ ); # skip comment
6962              
6963             # Updated from 2 to 3 to get trigraphs, added for case b1175
6964 123         474 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6965 123         407 my $j = $max_token_index + 1;
6966              
6967 123         278 foreach my $tok ( @{$rtok} ) {
  123         342  
6968 355 100       950 last if ( $tok =~ "\n" );
6969 320         801 $rtokens->[ ++$j ] = $tok;
6970             }
6971 123         482 last;
6972             }
6973 125         378 return;
6974             } ## end sub peek_ahead_for_nonblank_token
6975              
6976             #######################################################################
6977             # Tokenizer guessing routines for ambiguous situations
6978             #######################################################################
6979              
6980             sub guess_if_pattern_or_conditional {
6981              
6982             # this routine is called when we have encountered a ? following an
6983             # unknown bareword, and we must decide if it starts a pattern or not
6984             # input parameters:
6985             # $i - token index of the ? starting possible pattern
6986             # output parameters:
6987             # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6988             # msg = a warning or diagnostic message
6989             # USES GLOBAL VARIABLES: $last_nonblank_token
6990              
6991 11     11 0 45 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6992 11         22 my $is_pattern = 0;
6993 11         55 my $msg = "guessing that ? after $last_nonblank_token starts a ";
6994              
6995 11 50       51 if ( $i >= $max_token_index ) {
6996 0         0 $msg .= "conditional (no end to pattern found on the line)\n";
6997             }
6998             else {
6999 11         33 my $ibeg = $i;
7000 11         29 $i = $ibeg + 1;
7001 11         35 my $next_token = $rtokens->[$i]; # first token after ?
7002              
7003             # look for a possible ending ? on this line..
7004 11         28 my $in_quote = 1;
7005 11         25 my $quote_depth = 0;
7006 11         22 my $quote_character = EMPTY_STRING;
7007 11         26 my $quote_pos = 0;
7008 11         21 my $quoted_string;
7009             (
7010              
7011 11         60 $i,
7012             $in_quote,
7013             $quote_character,
7014             $quote_pos,
7015             $quote_depth,
7016             $quoted_string,
7017              
7018             ) = $self->follow_quoted_string(
7019              
7020             $ibeg,
7021             $in_quote,
7022             $rtokens,
7023             $quote_character,
7024             $quote_pos,
7025             $quote_depth,
7026             $max_token_index,
7027              
7028             );
7029              
7030 11 50       97 if ($in_quote) {
7031              
7032             # we didn't find an ending ? on this line,
7033             # so we bias towards conditional
7034 11         34 $is_pattern = 0;
7035 11         48 $msg .= "conditional (no ending ? on this line)\n";
7036              
7037             # we found an ending ?, so we bias towards a pattern
7038             }
7039             else {
7040              
7041             # Watch out for an ending ? in quotes, like this
7042             # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
7043 0         0 my $s_quote = 0;
7044 0         0 my $d_quote = 0;
7045 0         0 my $colons = 0;
7046 0         0 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
7047 0         0 my $tok = $rtokens->[$ii];
7048 0 0       0 if ( $tok eq ":" ) { $colons++ }
  0         0  
7049 0 0       0 if ( $tok eq "'" ) { $s_quote++ }
  0         0  
7050 0 0       0 if ( $tok eq '"' ) { $d_quote++ }
  0         0  
7051             }
7052 0 0 0     0 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
    0 0        
7053 0         0 $is_pattern = 0;
7054 0         0 $msg .= "found ending ? but unbalanced quote chars\n";
7055             }
7056             elsif (
7057             $self->pattern_expected( $i, $rtokens, $max_token_index ) >= 0 )
7058             {
7059 0         0 $is_pattern = 1;
7060 0         0 $msg .= "pattern (found ending ? and pattern expected)\n";
7061             }
7062             else {
7063 0         0 $msg .= "pattern (uncertain, but found ending ?)\n";
7064             }
7065             }
7066             }
7067 11         45 return ( $is_pattern, $msg );
7068             } ## end sub guess_if_pattern_or_conditional
7069              
7070             my %is_known_constant;
7071             my %is_known_function;
7072              
7073             BEGIN {
7074              
7075             # Constants like 'pi' in Trig.pm are common
7076 39     39   280 my @q = qw(pi pi2 pi4 pip2 pip4);
7077 39         321 @{is_known_constant}{@q} = (1) x scalar(@q);
7078              
7079             # parenless calls of 'ok' are common
7080 39         127 @q = qw( ok );
7081 39         70613 @{is_known_function}{@q} = (1) x scalar(@q);
7082             } ## end BEGIN
7083              
7084             sub guess_if_pattern_or_division {
7085              
7086             # this routine is called when we have encountered a / following an
7087             # unknown bareword, and we must decide if it starts a pattern or is a
7088             # division
7089             # input parameters:
7090             # $i - token index of the / starting possible pattern
7091             # output parameters:
7092             # $is_pattern = 0 if probably division, =1 if probably a pattern
7093             # msg = a warning or diagnostic message
7094             # USES GLOBAL VARIABLES: $last_nonblank_token
7095 0     0 0 0 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
7096 0         0 my $is_pattern = 0;
7097 0         0 my $msg = "guessing that / after $last_nonblank_token starts a ";
7098              
7099 0 0       0 if ( $i >= $max_token_index ) {
7100 0         0 $msg .= "division (no end to pattern found on the line)\n";
7101             }
7102             else {
7103 0         0 my $ibeg = $i;
7104 0         0 my $divide_possible =
7105             $self->is_possible_numerator( $i, $rtokens, $max_token_index );
7106              
7107 0 0       0 if ( $divide_possible < 0 ) {
7108 0         0 $msg = "pattern (division not possible here)\n";
7109 0         0 $is_pattern = 1;
7110 0         0 return ( $is_pattern, $msg );
7111             }
7112              
7113 0         0 $i = $ibeg + 1;
7114 0         0 my $next_token = $rtokens->[$i]; # first token after slash
7115              
7116             # One of the things we can look at is the spacing around the slash.
7117             # There # are four possible spacings around the first slash:
7118             #
7119             # return pi/two;#/; -/-
7120             # return pi/ two;#/; -/+
7121             # return pi / two;#/; +/+
7122             # return pi /two;#/; +/- <-- possible pattern
7123             #
7124             # Spacing rule: a space before the slash but not after the slash
7125             # usually indicates a pattern. We can use this to break ties.
7126              
7127 0   0     0 my $is_pattern_by_spacing =
7128             ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
7129              
7130             # look for a possible ending / on this line..
7131 0         0 my $in_quote = 1;
7132 0         0 my $quote_depth = 0;
7133 0         0 my $quote_character = EMPTY_STRING;
7134 0         0 my $quote_pos = 0;
7135 0         0 my $quoted_string;
7136             (
7137 0         0 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
7138             $quoted_string
7139             )
7140             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
7141             $quote_character, $quote_pos, $quote_depth, $max_token_index );
7142              
7143 0 0       0 if ($in_quote) {
7144              
7145             # we didn't find an ending / on this line, so we bias towards
7146             # division
7147 0 0       0 if ( $divide_possible >= 0 ) {
7148 0         0 $is_pattern = 0;
7149 0         0 $msg .= "division (no ending / on this line)\n";
7150             }
7151             else {
7152              
7153             # assuming a multi-line pattern ... this is risky, but division
7154             # does not seem possible. If this fails, it would either be due
7155             # to a syntax error in the code, or the division_expected logic
7156             # needs to be fixed.
7157 0         0 $msg = "multi-line pattern (division not possible)\n";
7158 0         0 $is_pattern = 1;
7159             }
7160             }
7161              
7162             # we found an ending /, so we bias slightly towards a pattern
7163             else {
7164              
7165 0         0 my $pattern_expected =
7166             $self->pattern_expected( $i, $rtokens, $max_token_index );
7167              
7168 0 0       0 if ( $pattern_expected >= 0 ) {
7169              
7170             # pattern looks possible...
7171 0 0       0 if ( $divide_possible >= 0 ) {
7172              
7173             # Both pattern and divide can work here...
7174              
7175             # Increase weight of divide if a pure number follows
7176 0         0 $divide_possible += $next_token =~ /^\d+$/;
7177              
7178             # Check for known constants in the numerator, like 'pi'
7179 0 0       0 if ( $is_known_constant{$last_nonblank_token} ) {
    0          
    0          
    0          
7180 0         0 $msg .=
7181             "division (pattern works too but saw known constant '$last_nonblank_token')\n";
7182 0         0 $is_pattern = 0;
7183             }
7184              
7185             # A very common bare word in pattern expressions is 'ok'
7186             elsif ( $is_known_function{$last_nonblank_token} ) {
7187 0         0 $msg .=
7188             "pattern (division works too but saw '$last_nonblank_token')\n";
7189 0         0 $is_pattern = 1;
7190             }
7191              
7192             # If one rule is more definite, use it
7193             elsif ( $divide_possible > $pattern_expected ) {
7194 0         0 $msg .=
7195             "division (more likely based on following tokens)\n";
7196 0         0 $is_pattern = 0;
7197             }
7198              
7199             # otherwise, use the spacing rule
7200             elsif ($is_pattern_by_spacing) {
7201 0         0 $msg .=
7202             "pattern (guess on spacing, but division possible too)\n";
7203 0         0 $is_pattern = 1;
7204             }
7205             else {
7206 0         0 $msg .=
7207             "division (guess on spacing, but pattern is possible too)\n";
7208 0         0 $is_pattern = 0;
7209             }
7210             }
7211              
7212             # divide_possible < 0 means divide can not work here
7213             else {
7214 0         0 $is_pattern = 1;
7215 0         0 $msg .= "pattern (division not possible)\n";
7216             }
7217             }
7218              
7219             # pattern does not look possible...
7220             else {
7221              
7222 0 0       0 if ( $divide_possible >= 0 ) {
7223 0         0 $is_pattern = 0;
7224 0         0 $msg .= "division (pattern not possible)\n";
7225             }
7226              
7227             # Neither pattern nor divide look possible...go by spacing
7228             else {
7229 0 0       0 if ($is_pattern_by_spacing) {
7230 0         0 $msg .= "pattern (guess on spacing)\n";
7231 0         0 $is_pattern = 1;
7232             }
7233             else {
7234 0         0 $msg .= "division (guess on spacing)\n";
7235 0         0 $is_pattern = 0;
7236             }
7237             }
7238             }
7239             }
7240             }
7241 0         0 return ( $is_pattern, $msg );
7242             } ## end sub guess_if_pattern_or_division
7243              
7244             # try to resolve here-doc vs. shift by looking ahead for
7245             # non-code or the end token (currently only looks for end token)
7246             # returns 1 if it is probably a here doc, 0 if not
7247             sub guess_if_here_doc {
7248              
7249 0     0 0 0 my ( $self, $next_token ) = @_;
7250              
7251             # This is how many lines we will search for a target as part of the
7252             # guessing strategy. It is a constant because there is probably
7253             # little reason to change it.
7254             # USES GLOBAL VARIABLES: $current_package $ris_constant,
7255 0         0 my $HERE_DOC_WINDOW = 40;
7256              
7257 0         0 my $here_doc_expected = 0;
7258 0         0 my $line;
7259 0         0 my $k = 0;
7260 0         0 my $msg = "checking <<";
7261              
7262 0         0 while ( $line = $self->peek_ahead( $k++ ) ) {
7263 0         0 chomp $line;
7264              
7265 0 0       0 if ( $line =~ /^$next_token$/ ) {
7266 0         0 $msg .= " -- found target $next_token ahead $k lines\n";
7267 0         0 $here_doc_expected = 1; # got it
7268 0         0 last;
7269             }
7270 0 0       0 last if ( $k >= $HERE_DOC_WINDOW );
7271             }
7272              
7273 0 0       0 if ( !$here_doc_expected ) {
7274              
7275 0 0       0 if ( !defined($line) ) {
7276 0         0 $here_doc_expected = -1; # hit eof without seeing target
7277 0         0 $msg .= " -- must be shift; target $next_token not in file\n";
7278              
7279             }
7280             else { # still unsure..taking a wild guess
7281              
7282 0 0       0 if ( !$ris_constant->{$current_package}{$next_token} ) {
7283 0         0 $here_doc_expected = 1;
7284 0         0 $msg .=
7285             " -- guessing it's a here-doc ($next_token not a constant)\n";
7286             }
7287             else {
7288 0         0 $msg .=
7289             " -- guessing it's a shift ($next_token is a constant)\n";
7290             }
7291             }
7292             }
7293 0         0 $self->write_logfile_entry($msg);
7294 0         0 return $here_doc_expected;
7295             } ## end sub guess_if_here_doc
7296              
7297             #######################################################################
7298             # Tokenizer Routines for scanning identifiers and related items
7299             #######################################################################
7300              
7301             sub scan_bare_identifier_do {
7302              
7303             # this routine is called to scan a token starting with an alphanumeric
7304             # variable or package separator, :: or '.
7305             # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
7306             # $last_nonblank_type, $rparen_type, $paren_depth
7307              
7308 1674     1674 0 5086 my ( $self, $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7309             $max_token_index )
7310             = @_;
7311 1674         2893 my $i_begin = $i;
7312 1674         2896 my $package = undef;
7313              
7314 1674         2557 my $i_beg = $i;
7315              
7316             # we have to back up one pretoken at a :: since each : is one pretoken
7317 1674 100       4192 if ( $tok eq '::' ) { $i_beg-- }
  9         25  
7318 1674 50       3903 if ( $tok eq '->' ) { $i_beg-- }
  0         0  
7319 1674         3099 my $pos_beg = $rtoken_map->[$i_beg];
7320 1674         5097 pos($input_line) = $pos_beg;
7321              
7322             # Examples:
7323             # A::B::C
7324             # A::
7325             # ::A
7326             # A'B
7327 1674 50       12350 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7328              
7329 1674         3373 my $pos = pos($input_line);
7330 1674         2798 my $numc = $pos - $pos_beg;
7331 1674         3805 $tok = substr( $input_line, $pos_beg, $numc );
7332              
7333             # type 'w' includes anything without leading type info
7334             # ($,%,@,*) including something like abc::def::ghi
7335 1674         2787 $type = 'w';
7336              
7337 1674         2912 my $sub_name = EMPTY_STRING;
7338 1674 100       4762 if ( defined($2) ) { $sub_name = $2; }
  1669         3167  
7339 1674 100       4242 if ( defined($1) ) {
7340 274         669 $package = $1;
7341              
7342             # patch: don't allow isolated package name which just ends
7343             # in the old style package separator (single quote). Example:
7344             # use CGI':all';
7345 274 50 66     1162 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7346 0         0 $pos--;
7347             }
7348              
7349 274         763 $package =~ s/\'/::/g;
7350 274 100       889 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  9         39  
7351 274         1202 $package =~ s/::$//;
7352             }
7353             else {
7354 1400         2698 $package = $current_package;
7355              
7356             # patched for c043, part 1: keyword does not follow '->'
7357 1400 50 66     5737 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7358 0         0 $type = 'k';
7359             }
7360             }
7361              
7362             # if it is a bareword.. patched for c043, part 2: not following '->'
7363 1674 100 66     7737 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7364              
7365             # check for v-string with leading 'v' type character
7366             # (This seems to have precedence over filehandle, type 'Y')
7367 1004 100 66     14658 if ( $tok =~ /^v\d[_\d]*$/ ) {
    100 66        
    100 66        
    50          
    50          
    100          
    100          
7368              
7369             # we only have the first part - something like 'v101' -
7370             # look for more
7371 2 50       14 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7372 2         7 $pos = pos($input_line);
7373 2         4 $numc = $pos - $pos_beg;
7374 2         9 $tok = substr( $input_line, $pos_beg, $numc );
7375             }
7376 2         4 $type = 'v';
7377              
7378             # warn if this version can't handle v-strings
7379 2         11 $self->report_v_string($tok);
7380             }
7381              
7382             elsif ( $ris_constant->{$package}{$sub_name} ) {
7383 12         26 $type = 'C';
7384             }
7385              
7386             # bareword after sort has implied empty prototype; for example:
7387             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7388             # This has priority over whatever the user has specified.
7389             elsif ($last_nonblank_token eq 'sort'
7390             && $last_nonblank_type eq 'k' )
7391             {
7392 1         2 $type = 'Z';
7393             }
7394              
7395             # Note: strangely, perl does not seem to really let you create
7396             # functions which act like eval and do, in the sense that eval
7397             # and do may have operators following the final }, but any operators
7398             # that you create with prototype (&) apparently do not allow
7399             # trailing operators, only terms. This seems strange.
7400             # If this ever changes, here is the update
7401             # to make perltidy behave accordingly:
7402              
7403             # elsif ( $ris_block_function->{$package}{$tok} ) {
7404             # $tok='eval'; # patch to do braces like eval - doesn't work
7405             # $type = 'k';
7406             #}
7407             # TODO: This could become a separate type to allow for different
7408             # future behavior:
7409             elsif ( $ris_block_function->{$package}{$sub_name} ) {
7410 0         0 $type = 'G';
7411             }
7412             elsif ( $ris_block_list_function->{$package}{$sub_name} ) {
7413 0         0 $type = 'G';
7414             }
7415             elsif ( $ris_user_function->{$package}{$sub_name} ) {
7416 6         19 $type = 'U';
7417 6         22 $prototype = $ruser_function_prototype->{$package}{$sub_name};
7418             }
7419              
7420             # check for indirect object
7421             elsif (
7422              
7423             # added 2001-03-27: must not be followed immediately by '('
7424             # see fhandle.t
7425             ( $input_line !~ m/\G\(/gc )
7426              
7427             # and
7428             && (
7429              
7430             # preceded by keyword like 'print', 'printf' and friends
7431             $is_indirect_object_taker{$last_nonblank_token}
7432              
7433             # or preceded by something like 'print(' or 'printf('
7434             || (
7435             ( $last_nonblank_token eq '(' )
7436             && $is_indirect_object_taker{
7437             $rparen_type->[$paren_depth]
7438             }
7439              
7440             )
7441             )
7442             )
7443             {
7444              
7445             # may not be indirect object unless followed by a space;
7446             # updated 2021-01-16 to consider newline to be a space.
7447             # updated for case b990 to look for either ';' or space
7448 4 50 33     99 if ( pos($input_line) == length($input_line)
7449             || $input_line =~ m/\G[;\s]/gc )
7450             {
7451 4         13 $type = 'Y';
7452              
7453             # Abandon Hope ...
7454             # Perl's indirect object notation is a very bad
7455             # thing and can cause subtle bugs, especially for
7456             # beginning programmers. And I haven't even been
7457             # able to figure out a sane warning scheme which
7458             # doesn't get in the way of good scripts.
7459              
7460             # Complain if a filehandle has any lower case
7461             # letters. This is suggested good practice.
7462             # Use 'sub_name' because something like
7463             # main::MYHANDLE is ok for filehandle
7464 4 100       23 if ( $sub_name =~ /[a-z]/ ) {
7465              
7466             # could be bug caused by older perltidy if
7467             # followed by '('
7468 1 50       6 if ( $input_line =~ m/\G\s*\(/gc ) {
7469 1         8 $self->complain(
7470             "Caution: unknown word '$tok' in indirect object slot\n"
7471             );
7472             }
7473             }
7474             }
7475              
7476             # bareword not followed by a space -- may not be filehandle
7477             # (may be function call defined in a 'use' statement)
7478             else {
7479 0         0 $type = 'Z';
7480             }
7481             }
7482              
7483             # none of the above special types
7484             else {
7485             }
7486             }
7487              
7488             # Now we must convert back from character position
7489             # to pre_token index.
7490             # I don't think an error flag can occur here ..but who knows
7491 1674         2854 my $error;
7492 1674         4658 ( $i, $error ) =
7493             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7494 1674 50       4601 if ($error) {
7495 0         0 $self->warning(
7496             "scan_bare_identifier: Possibly invalid tokenization\n");
7497             }
7498             }
7499              
7500             # no match but line not blank - could be syntax error
7501             # perl will take '::' alone without complaint
7502             else {
7503 0         0 $type = 'w';
7504              
7505             # change this warning to log message if it becomes annoying
7506 0         0 $self->warning("didn't find identifier after leading ::\n");
7507             }
7508 1674         6953 return ( $i, $tok, $type, $prototype );
7509             } ## end sub scan_bare_identifier_do
7510              
7511             sub scan_id_do {
7512              
7513             # This is the new scanner and will eventually replace scan_identifier.
7514             # Only type 'sub' and 'package' are implemented.
7515             # Token types $ * % @ & -> are not yet implemented.
7516             #
7517             # Scan identifier following a type token.
7518             # The type of call depends on $id_scan_state: $id_scan_state = ''
7519             # for starting call, in which case $tok must be the token defining
7520             # the type.
7521             #
7522             # If the type token is the last nonblank token on the line, a value
7523             # of $id_scan_state = $tok is returned, indicating that further
7524             # calls must be made to get the identifier. If the type token is
7525             # not the last nonblank token on the line, the identifier is
7526             # scanned and handled and a value of '' is returned.
7527              
7528 332     332 0 1256 my ( $self, $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7529             $max_token_index )
7530             = @_;
7531 39     39   474 use constant DEBUG_NSCAN => 0;
  39         104  
  39         49441  
7532 332         705 my $type = EMPTY_STRING;
7533 332         680 my ( $i_beg, $pos_beg );
7534              
7535             #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7536             #my ($a,$b,$c) = caller;
7537             #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7538              
7539             # on re-entry, start scanning at first token on the line
7540 332 100       951 if ($id_scan_state) {
7541 10         20 $i_beg = $i;
7542 10         27 $type = EMPTY_STRING;
7543             }
7544              
7545             # on initial entry, start scanning just after type token
7546             else {
7547 322         621 $i_beg = $i + 1;
7548 322         572 $id_scan_state = $tok;
7549 322         735 $type = 't';
7550             }
7551              
7552             # find $i_beg = index of next nonblank token,
7553             # and handle empty lines
7554 332         602 my $blank_line = 0;
7555 332         767 my $next_nonblank_token = $rtokens->[$i_beg];
7556 332 100       937 if ( $i_beg > $max_token_index ) {
7557 2         5 $blank_line = 1;
7558             }
7559             else {
7560              
7561             # only a '#' immediately after a '$' is not a comment
7562 330 50       1084 if ( $next_nonblank_token eq '#' ) {
7563 0 0       0 if ( $tok ne '$' ) {
7564 0         0 $blank_line = 1;
7565             }
7566             }
7567              
7568 330 100       1597 if ( $next_nonblank_token =~ /^\s/ ) {
7569 310         1288 ( $next_nonblank_token, $i_beg ) =
7570             find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7571             $max_token_index );
7572 310 100       1740 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7573 4         12 $blank_line = 1;
7574             }
7575             }
7576             }
7577              
7578             # handle non-blank line; identifier, if any, must follow
7579 332 100       971 if ( !$blank_line ) {
7580              
7581 326 100       1004 if ( $is_sub{$id_scan_state} ) {
    50          
7582 300         3371 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
7583             {
7584             input_line => $input_line,
7585             i => $i,
7586             i_beg => $i_beg,
7587             tok => $tok,
7588             type => $type,
7589             rtokens => $rtokens,
7590             rtoken_map => $rtoken_map,
7591             id_scan_state => $id_scan_state,
7592             max_token_index => $max_token_index,
7593             }
7594             );
7595             }
7596              
7597             elsif ( $is_package{$id_scan_state} ) {
7598 26         97 ( $i, $tok, $type ) =
7599             $self->do_scan_package( $input_line, $i, $i_beg, $tok, $type,
7600             $rtokens, $rtoken_map, $max_token_index );
7601 26         68 $id_scan_state = EMPTY_STRING;
7602             }
7603              
7604             else {
7605 0         0 $self->warning("invalid token in scan_id: $tok\n");
7606 0         0 $id_scan_state = EMPTY_STRING;
7607             }
7608             }
7609              
7610 332 50 33     1899 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
      66        
7611              
7612             # shouldn't happen:
7613 0         0 if (DEVEL_MODE) {
7614             $self->Fault(<<EOM);
7615             Program bug in scan_id: undefined type but scan_state=$id_scan_state
7616             EOM
7617             }
7618             $self->warning(
7619 0         0 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7620             );
7621 0         0 $self->report_definite_bug();
7622             }
7623              
7624 332         557 DEBUG_NSCAN && do {
7625             print {*STDOUT}
7626             "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7627             };
7628 332         1444 return ( $i, $tok, $type, $id_scan_state );
7629             } ## end sub scan_id_do
7630              
7631             sub check_prototype {
7632 138     138 0 482 my ( $proto, $package, $subname ) = @_;
7633 138 50       446 return if ( !defined($package) );
7634 138 50       450 return if ( !defined($subname) );
7635 138 100       396 if ( defined($proto) ) {
7636 34         174 $proto =~ s/^\s*\(\s*//;
7637 34         130 $proto =~ s/\s*\)$//;
7638 34 100       103 if ($proto) {
7639 5         26 $ris_user_function->{$package}{$subname} = 1;
7640 5         26 $ruser_function_prototype->{$package}{$subname} = "($proto)";
7641              
7642             # prototypes containing '&' must be treated specially..
7643 5 100       29 if ( $proto =~ /\&/ ) {
7644              
7645             # right curly braces of prototypes ending in
7646             # '&' may be followed by an operator
7647 1 50       9 if ( $proto =~ /\&$/ ) {
7648 0         0 $ris_block_function->{$package}{$subname} = 1;
7649             }
7650              
7651             # right curly braces of prototypes NOT ending in
7652             # '&' may NOT be followed by an operator
7653             else {
7654 1         5 $ris_block_list_function->{$package}{$subname} = 1;
7655             }
7656             }
7657             }
7658             else {
7659 29         89 $ris_constant->{$package}{$subname} = 1;
7660             }
7661             }
7662             else {
7663 104         400 $ris_user_function->{$package}{$subname} = 1;
7664             }
7665 138         358 return;
7666             } ## end sub check_prototype
7667              
7668             sub do_scan_package {
7669              
7670             # do_scan_package parses a package name
7671             # it is called with $i_beg equal to the index of the first nonblank
7672             # token following a 'package' token.
7673             # USES GLOBAL VARIABLES: $current_package,
7674              
7675             # package NAMESPACE
7676             # package NAMESPACE VERSION
7677             # package NAMESPACE BLOCK
7678             # package NAMESPACE VERSION BLOCK
7679             #
7680             # If VERSION is provided, package sets the $VERSION variable in the given
7681             # namespace to a version object with the VERSION provided. VERSION must be
7682             # a "strict" style version number as defined by the version module: a
7683             # positive decimal number (integer or decimal-fraction) without
7684             # exponentiation or else a dotted-decimal v-string with a leading 'v'
7685             # character and at least three components.
7686             # reference http://perldoc.perl.org/functions/package.html
7687              
7688             my (
7689 26     26 0 87 $self, $input_line, $i,
7690             $i_beg, $tok, $type,
7691             $rtokens, $rtoken_map, $max_token_index
7692             ) = @_;
7693 26         48 my $package = undef;
7694 26         52 my $pos_beg = $rtoken_map->[$i_beg];
7695 26         79 pos($input_line) = $pos_beg;
7696              
7697             # handle non-blank line; package name, if any, must follow
7698 26 50       154 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7699 26         103 $package = $1;
7700 26 50 33     165 $package = ( defined($1) && $1 ) ? $1 : 'main';
7701 26         77 $package =~ s/\'/::/g;
7702 26 50       83 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
7703 26         56 $package =~ s/::$//;
7704 26         47 my $pos = pos($input_line);
7705 26         47 my $numc = $pos - $pos_beg;
7706 26         77 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7707 26         49 $type = 'P'; # Fix for c250, previously 'i'
7708              
7709             # Now we must convert back from character position
7710             # to pre_token index.
7711             # I don't think an error flag can occur here ..but ?
7712 26         43 my $error;
7713 26         111 ( $i, $error ) =
7714             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7715 26 50       79 if ($error) { $self->warning("Possibly invalid package\n") }
  0         0  
7716 26         55 $current_package = $package;
7717              
7718             # we should now have package NAMESPACE
7719             # now expecting VERSION, BLOCK, or ; to follow ...
7720             # package NAMESPACE VERSION
7721             # package NAMESPACE BLOCK
7722             # package NAMESPACE VERSION BLOCK
7723 26         80 my ( $next_nonblank_token, $i_next ) =
7724             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
7725              
7726             # check that something recognizable follows, but do not parse.
7727             # A VERSION number will be parsed later as a number or v-string in the
7728             # normal way. What is important is to set the statement type if
7729             # everything looks okay so that the operator_expected() routine
7730             # knows that the number is in a package statement.
7731             # Examples of valid primitive tokens that might follow are:
7732             # 1235 . ; { } v3 v
7733             # FIX: added a '#' since a side comment may also follow
7734             # Added ':' for class attributes (for --use-feature=class, rt145706)
7735 26 50       143 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#\:])|v\d|\d+$/ ) {
7736 26         75 $statement_type = $tok;
7737             }
7738             else {
7739 0         0 $self->warning(
7740             "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7741             );
7742             }
7743             }
7744              
7745             # no match but line not blank --
7746             # could be a label with name package, like package: , for example.
7747             else {
7748 0         0 $type = 'k';
7749             }
7750              
7751 26         98 return ( $i, $tok, $type );
7752             } ## end sub do_scan_package
7753              
7754             { ## begin closure for sub scan_complex_identifier
7755              
7756 39     39   405 use constant DEBUG_SCAN_ID => 0;
  39         130  
  39         5324  
7757              
7758             # Constant hash:
7759             my %is_special_variable_char;
7760              
7761             BEGIN {
7762              
7763             # These are the only characters which can (currently) form special
7764             # variables, like $^W: (issue c066).
7765 39     39   297 my @q =
7766             qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7767 39         146061 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7768             } ## end BEGIN
7769              
7770             # These are the possible states for this scanner:
7771             my $scan_state_SIGIL = '$';
7772             my $scan_state_ALPHA = 'A';
7773             my $scan_state_COLON = ':';
7774             my $scan_state_LPAREN = '(';
7775             my $scan_state_RPAREN = ')';
7776             my $scan_state_AMPERSAND = '&';
7777             my $scan_state_SPLIT = '^';
7778              
7779             # Only these non-blank states may be returned to caller:
7780             my %is_returnable_scan_state = (
7781             $scan_state_SIGIL => 1,
7782             $scan_state_AMPERSAND => 1,
7783             );
7784              
7785             # USES GLOBAL VARIABLES:
7786             # $context, $last_nonblank_token, $last_nonblank_type
7787              
7788             #-----------
7789             # call args:
7790             #-----------
7791             my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7792             $expecting, $container_type );
7793              
7794             #-------------------------------------------
7795             # my variables, re-initialized on each call:
7796             #-------------------------------------------
7797             my $i_begin; # starting index $i
7798             my $type; # returned identifier type
7799             my $tok_begin; # starting token
7800             my $tok; # returned token
7801             my $id_scan_state_begin; # starting scan state
7802             my $identifier_begin; # starting identifier
7803             my $i_save; # a last good index, in case of error
7804             my $message; # hold error message for log file
7805             my $tok_is_blank;
7806             my $last_tok_is_blank;
7807             my $in_prototype_or_signature;
7808             my $saw_alpha;
7809             my $saw_type;
7810             my $allow_tick;
7811              
7812             sub initialize_my_scan_id_vars {
7813              
7814             # Initialize all 'my' vars on entry
7815 486     486 0 844 $i_begin = $i;
7816 486         882 $type = EMPTY_STRING;
7817 486         970 $tok_begin = $rtokens->[$i_begin];
7818 486         917 $tok = $tok_begin;
7819 486 50       1374 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
  0         0  
7820 486         833 $id_scan_state_begin = $id_scan_state;
7821 486         1839 $identifier_begin = $identifier;
7822 486         842 $i_save = undef;
7823              
7824 486         1146 $message = EMPTY_STRING;
7825 486         839 $tok_is_blank = undef; # a flag to speed things up
7826 486         773 $last_tok_is_blank = undef;
7827              
7828 486   100     1916 $in_prototype_or_signature =
7829             $container_type && $container_type =~ /^sub\b/;
7830              
7831             # these flags will be used to help figure out the type:
7832 486         819 $saw_alpha = undef;
7833 486         783 $saw_type = undef;
7834              
7835             # allow old package separator (') except in 'use' statement
7836 486         932 $allow_tick = ( $last_nonblank_token ne 'use' );
7837 486         904 return;
7838             } ## end sub initialize_my_scan_id_vars
7839              
7840             #----------------------------------
7841             # Routines for handling scan states
7842             #----------------------------------
7843             sub do_id_scan_state_dollar {
7844              
7845 514     514 0 956 my $self = shift;
7846              
7847             # We saw a sigil, now looking to start a variable name
7848 514 100 66     4617 if ( $tok eq '$' ) {
    100 33        
    100          
    50          
    50          
    100          
    100          
    100          
    100          
7849              
7850 31         159 $identifier .= $tok;
7851              
7852             # we've got a punctuation variable if end of line (punct.t)
7853 31 50       194 if ( $i == $max_token_index ) {
7854 0         0 $type = 'i';
7855 0         0 $id_scan_state = EMPTY_STRING;
7856             }
7857             }
7858             elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7859 253         541 $saw_alpha = 1;
7860 253         530 $identifier .= $tok;
7861              
7862             # now need :: except for special digit vars like '$1' (c208)
7863 253 100       900 $id_scan_state = $tok =~ /^\d/ ? EMPTY_STRING : $scan_state_COLON;
7864             }
7865             elsif ( $tok eq '::' ) {
7866 16         55 $id_scan_state = $scan_state_ALPHA;
7867 16         46 $identifier .= $tok;
7868             }
7869              
7870             # POSTDEFREF ->@ ->% ->& ->*
7871             elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7872 0         0 $identifier .= $tok;
7873             }
7874             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7875 0         0 $saw_alpha = 1;
7876 0         0 $id_scan_state = $scan_state_COLON; # now need ::
7877 0         0 $identifier .= $tok;
7878              
7879             # Perl will accept leading digits in identifiers,
7880             # although they may not always produce useful results.
7881             # Something like $main::0 is ok. But this also works:
7882             #
7883             # sub howdy::123::bubba{ print "bubba $54321!\n" }
7884             # howdy::123::bubba();
7885             #
7886             }
7887             elsif ( $tok eq '#' ) {
7888              
7889 99         221 my $is_punct_var = $identifier eq '$$';
7890              
7891             # side comment or identifier?
7892 99 100 66     1099 if (
      66        
      66        
      33        
7893              
7894             # A '#' starts a comment if it follows a space. For example,
7895             # the following is equivalent to $ans=40.
7896             # my $ #
7897             # ans = 40;
7898             !$last_tok_is_blank
7899              
7900             # a # inside a prototype or signature can only start a
7901             # comment
7902             && !$in_prototype_or_signature
7903              
7904             # these are valid punctuation vars: *# %# @# $#
7905             # May also be '$#array' or POSTDEFREF ->$#
7906             && ( $identifier =~ /^[\%\@\$\*]$/
7907             || $identifier =~ /\$$/ )
7908              
7909             # but a '#' after '$$' is a side comment; see c147
7910             && !$is_punct_var
7911              
7912             )
7913             {
7914 95         260 $identifier .= $tok; # keep same state, a $ could follow
7915             }
7916             else {
7917              
7918             # otherwise it is a side comment
7919 4 50       18 if ( $identifier eq '->' ) { }
    50          
    50          
7920 0         0 elsif ($is_punct_var) { $type = 'i' }
7921 4         8 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7922 0         0 else { $type = 'i' }
7923 4         9 $i = $i_save;
7924 4         9 $id_scan_state = EMPTY_STRING;
7925             }
7926             }
7927              
7928             elsif ( $tok eq '{' ) {
7929              
7930             # check for something like ${#} or ${?}, where ? is a special char
7931 38 100 100     529 if (
      66        
      100        
      100        
7932             (
7933             $identifier eq '$'
7934             || $identifier eq '@'
7935             || $identifier eq '$#'
7936             )
7937             && $i + 2 <= $max_token_index
7938             && $rtokens->[ $i + 2 ] eq '}'
7939             && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7940             )
7941             {
7942 1         7 my $next2 = $rtokens->[ $i + 2 ];
7943 1         6 my $next1 = $rtokens->[ $i + 1 ];
7944 1         3 $identifier .= $tok . $next1 . $next2;
7945 1         4 $i += 2;
7946 1         3 $id_scan_state = EMPTY_STRING;
7947             }
7948             else {
7949              
7950             # skip something like ${xxx} or ->{
7951 37         93 $id_scan_state = EMPTY_STRING;
7952              
7953             # if this is the first token of a line, any tokens for this
7954             # identifier have already been accumulated
7955 37 100 66     170 if ( $identifier eq '$' || $i == 0 ) {
7956 26         56 $identifier = EMPTY_STRING;
7957             }
7958 37         73 $i = $i_save;
7959             }
7960             }
7961              
7962             # space ok after leading $ % * & @
7963             elsif ( $tok =~ /^\s*$/ ) {
7964              
7965 20         70 $tok_is_blank = 1;
7966              
7967             # note: an id with a leading '&' does not actually come this way
7968 20 50       113 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
    0          
7969              
7970 20 100       97 if ( length($identifier) > 1 ) {
7971 8         20 $id_scan_state = EMPTY_STRING;
7972 8         23 $i = $i_save;
7973 8         18 $type = 'i'; # probably punctuation variable
7974             }
7975             else {
7976              
7977             # fix c139: trim line-ending type 't'
7978 12 100       64 if ( $i == $max_token_index ) {
    100          
7979 1         2 $i = $i_save;
7980 1         3 $type = 't';
7981             }
7982              
7983             # spaces after $'s are common, and space after @
7984             # is harmless, so only complain about space
7985             # after other type characters. Space after $ and
7986             # @ will be removed in formatting. Report space
7987             # after % and * because they might indicate a
7988             # parsing error. In other words '% ' might be a
7989             # modulo operator. Delete this warning if it
7990             # gets annoying.
7991             elsif ( $identifier !~ /^[\@\$]$/ ) {
7992 1         5 $message =
7993             "Space in identifier, following $identifier\n";
7994             }
7995             else {
7996             ## ok: silently accept space after '$' and '@' sigils
7997             }
7998             }
7999             }
8000              
8001             elsif ( $identifier eq '->' ) {
8002              
8003             # space after '->' is ok except at line end ..
8004             # so trim line-ending in type '->' (fixes c139)
8005 0 0       0 if ( $i == $max_token_index ) {
8006 0         0 $i = $i_save;
8007 0         0 $type = '->';
8008             }
8009             }
8010              
8011             # stop at space after something other than -> or sigil
8012             # Example of what can arrive here:
8013             # eval { $MyClass->$$ };
8014             else {
8015 0         0 $id_scan_state = EMPTY_STRING;
8016 0         0 $i = $i_save;
8017 0         0 $type = 'i';
8018             }
8019             }
8020             elsif ( $tok eq '^' ) {
8021              
8022             # check for some special variables like $^ $^W
8023 11 50       46 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
8024 11         31 $identifier .= $tok;
8025 11         28 $type = 'i';
8026              
8027             # There may be one more character, not a space, after the ^
8028 11         26 my $next1 = $rtokens->[ $i + 1 ];
8029 11         32 my $chr = substr( $next1, 0, 1 );
8030 11 100       81 if ( $is_special_variable_char{$chr} ) {
8031              
8032             # It is something like $^W
8033             # Test case (c066) : $^Oeq'linux'
8034 9         19 $i++;
8035 9         16 $identifier .= $next1;
8036              
8037             # If pretoken $next1 is more than one character long,
8038             # set a flag indicating that it needs to be split.
8039 9 100       37 $id_scan_state =
8040             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8041             }
8042             else {
8043              
8044             # it is just $^
8045             # Simple test case (c065): '$aa=$^if($bb)';
8046 2         9 $id_scan_state = EMPTY_STRING;
8047             }
8048             }
8049             else {
8050 0         0 $id_scan_state = EMPTY_STRING;
8051 0         0 $i = $i_save;
8052             }
8053             }
8054             else { # something else
8055              
8056 46 100 66     438 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
    100 66        
    100          
    50          
    0          
    0          
8057              
8058             # We might be in an extrusion of
8059             # sub foo2 ( $first, $, $third ) {
8060             # looking at a line starting with a comma, like
8061             # $
8062             # ,
8063             # in this case the comma ends the signature variable
8064             # '$' which will have been previously marked type 't'
8065             # rather than 'i'.
8066 3 100       10 if ( $i == $i_begin ) {
8067 1         3 $identifier = EMPTY_STRING;
8068 1         4 $type = EMPTY_STRING;
8069             }
8070              
8071             # at a # we have to mark as type 't' because more may
8072             # follow, otherwise, in a signature we can let '$' be an
8073             # identifier here for better formatting.
8074             # See 'mangle4.in' for a test case.
8075             else {
8076 2         5 $type = 'i';
8077 2 50 33     14 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
8078 0         0 $type = 't';
8079             }
8080 2         3 $i = $i_save;
8081             }
8082 3         6 $id_scan_state = EMPTY_STRING;
8083             }
8084              
8085             # check for various punctuation variables
8086             elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
8087 35         114 $identifier .= $tok;
8088             }
8089              
8090             # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
8091             elsif ($tok eq '*'
8092             && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
8093             {
8094 6         14 $identifier .= $tok;
8095             }
8096              
8097             elsif ( $identifier eq '$#' ) {
8098              
8099 2 50       12 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
  0 50       0  
  0         0  
8100              
8101             # perl seems to allow just these: $#: $#- $#+
8102             elsif ( $tok =~ /^[\:\-\+]$/ ) {
8103 0         0 $type = 'i';
8104 0         0 $identifier .= $tok;
8105             }
8106             else {
8107 2         5 $i = $i_save;
8108 2         55 $self->write_logfile_entry(
8109             'Use of $# is deprecated' . "\n" );
8110             }
8111             }
8112             elsif ( $identifier eq '$$' ) {
8113              
8114             # perl does not allow references to punctuation
8115             # variables without braces. For example, this
8116             # won't work:
8117             # $:=\4;
8118             # $a = $$:;
8119             # You would have to use
8120             # $a = ${$:};
8121              
8122             # '$$' alone is punctuation variable for PID
8123 0         0 $i = $i_save;
8124 0 0       0 if ( $tok eq '{' ) { $type = 't' }
  0         0  
8125 0         0 else { $type = 'i' }
8126             }
8127             elsif ( $identifier eq '->' ) {
8128 0         0 $i = $i_save;
8129             }
8130             else {
8131 0         0 $i = $i_save;
8132 0 0       0 if ( length($identifier) == 1 ) {
8133 0         0 $identifier = EMPTY_STRING;
8134             }
8135             }
8136 46         118 $id_scan_state = EMPTY_STRING;
8137             }
8138 514         948 return;
8139             } ## end sub do_id_scan_state_dollar
8140              
8141             sub do_id_scan_state_alpha {
8142              
8143 113     113 0 241 my $self = shift;
8144              
8145             # looking for alphanumeric after ::
8146 113         438 $tok_is_blank = $tok =~ /^\s*$/;
8147              
8148 113 100 33     478 if ( $tok =~ /^\w/ ) { # found it
    50 66        
    50 33        
    50          
8149 100         198 $identifier .= $tok;
8150 100         189 $id_scan_state = $scan_state_COLON; # now need ::
8151 100         162 $saw_alpha = 1;
8152             }
8153             elsif ( $tok eq "'" && $allow_tick ) {
8154 0         0 $identifier .= $tok;
8155 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8156 0         0 $saw_alpha = 1;
8157             }
8158             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8159 0         0 $id_scan_state = $scan_state_LPAREN;
8160 0         0 $identifier .= $tok;
8161             }
8162             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8163 0         0 $id_scan_state = $scan_state_RPAREN;
8164 0         0 $identifier .= $tok;
8165             }
8166             else {
8167 13         26 $id_scan_state = EMPTY_STRING;
8168 13         28 $i = $i_save;
8169             }
8170 113         197 return;
8171             } ## end sub do_id_scan_state_alpha
8172              
8173             sub do_id_scan_state_colon {
8174              
8175 434     434 0 874 my $self = shift;
8176              
8177             # looking for possible :: after alphanumeric
8178              
8179 434         1592 $tok_is_blank = $tok =~ /^\s*$/;
8180              
8181 434 100 66     3552 if ( $tok eq '::' ) { # got it
    100 66        
    100 66        
    50          
    50          
8182 97         185 $identifier .= $tok;
8183 97         182 $id_scan_state = $scan_state_ALPHA; # now require alpha
8184             }
8185             elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
8186 20         47 $identifier .= $tok;
8187 20         46 $id_scan_state = $scan_state_COLON; # now need ::
8188 20         41 $saw_alpha = 1;
8189             }
8190             elsif ( $tok eq "'" && $allow_tick ) { # tick
8191              
8192 12 50       29 if ( $is_keyword{$identifier} ) {
8193 0         0 $id_scan_state = EMPTY_STRING; # that's all
8194 0         0 $i = $i_save;
8195             }
8196             else {
8197 12         19 $identifier .= $tok;
8198             }
8199             }
8200             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8201 0         0 $id_scan_state = $scan_state_LPAREN;
8202 0         0 $identifier .= $tok;
8203             }
8204             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8205 0         0 $id_scan_state = $scan_state_RPAREN;
8206 0         0 $identifier .= $tok;
8207             }
8208             else {
8209 305         557 $id_scan_state = EMPTY_STRING; # that's all
8210 305         533 $i = $i_save;
8211             }
8212 434         709 return;
8213             } ## end sub do_id_scan_state_colon
8214              
8215             sub do_id_scan_state_left_paren {
8216              
8217 0     0 0 0 my $self = shift;
8218              
8219             # looking for possible '(' of a prototype
8220              
8221 0 0       0 if ( $tok eq '(' ) { # got it
    0          
8222 0         0 $identifier .= $tok;
8223 0         0 $id_scan_state = $scan_state_RPAREN; # now find the end of it
8224             }
8225             elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
8226 0         0 $identifier .= $tok;
8227 0         0 $tok_is_blank = 1;
8228             }
8229             else {
8230 0         0 $id_scan_state = EMPTY_STRING; # that's all - no prototype
8231 0         0 $i = $i_save;
8232             }
8233 0         0 return;
8234             } ## end sub do_id_scan_state_left_paren
8235              
8236             sub do_id_scan_state_right_paren {
8237              
8238 0     0 0 0 my $self = shift;
8239              
8240             # looking for a ')' of prototype to close a '('
8241              
8242 0         0 $tok_is_blank = $tok =~ /^\s*$/;
8243              
8244 0 0       0 if ( $tok eq ')' ) { # got it
    0          
8245 0         0 $identifier .= $tok;
8246 0         0 $id_scan_state = EMPTY_STRING; # all done
8247             }
8248             elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
8249 0         0 $identifier .= $tok;
8250             }
8251             else { # probable error in script, but keep going
8252 0         0 warning("Unexpected '$tok' while seeking end of prototype\n");
8253 0         0 $identifier .= $tok;
8254             }
8255 0         0 return;
8256             } ## end sub do_id_scan_state_right_paren
8257              
8258             sub do_id_scan_state_ampersand {
8259              
8260 105     105 0 297 my $self = shift;
8261              
8262             # Starting sub call after seeing an '&'
8263              
8264 105 100 33     686 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
    50          
    100          
    50          
    50          
    0          
8265 88         229 $id_scan_state = $scan_state_COLON; # now need ::
8266 88         172 $saw_alpha = 1;
8267 88         245 $identifier .= $tok;
8268             }
8269             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
8270 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8271 0         0 $saw_alpha = 1;
8272 0         0 $identifier .= $tok;
8273             }
8274             elsif ( $tok =~ /^\s*$/ ) { # allow space
8275 2         4 $tok_is_blank = 1;
8276              
8277             # fix c139: trim line-ending type 't'
8278 2 50 33     11 if ( length($identifier) == 1 && $i == $max_token_index ) {
8279 2         4 $i = $i_save;
8280 2         4 $type = 't';
8281             }
8282             }
8283             elsif ( $tok eq '::' ) { # leading ::
8284 0         0 $id_scan_state = $scan_state_ALPHA; # accept alpha next
8285 0         0 $identifier .= $tok;
8286             }
8287             elsif ( $tok eq '{' ) {
8288 15 50 33     75 if ( $identifier eq '&' || $i == 0 ) {
8289 15         35 $identifier = EMPTY_STRING;
8290             }
8291 15         33 $i = $i_save;
8292 15         34 $id_scan_state = EMPTY_STRING;
8293             }
8294             elsif ( $tok eq '^' ) {
8295 0 0       0 if ( $identifier eq '&' ) {
8296              
8297             # Special variable (c066)
8298 0         0 $identifier .= $tok;
8299 0         0 $type = '&';
8300              
8301             # There may be one more character, not a space, after the ^
8302 0         0 my $next1 = $rtokens->[ $i + 1 ];
8303 0         0 my $chr = substr( $next1, 0, 1 );
8304 0 0       0 if ( $is_special_variable_char{$chr} ) {
8305              
8306             # It is something like &^O
8307 0         0 $i++;
8308 0         0 $identifier .= $next1;
8309              
8310             # If pretoken $next1 is more than one character long,
8311             # set a flag indicating that it needs to be split.
8312 0 0       0 $id_scan_state =
8313             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8314             }
8315             else {
8316              
8317             # it is &^
8318 0         0 $id_scan_state = EMPTY_STRING;
8319             }
8320             }
8321             else {
8322 0         0 $identifier = EMPTY_STRING;
8323 0         0 $i = $i_save;
8324             }
8325             }
8326             else {
8327              
8328             # punctuation variable?
8329             # testfile: cunningham4.pl
8330             #
8331             # We have to be careful here. If we are in an unknown state,
8332             # we will reject the punctuation variable. In the following
8333             # example the '&' is a binary operator but we are in an unknown
8334             # state because there is no sigil on 'Prima', so we don't
8335             # know what it is. But it is a bad guess that
8336             # '&~' is a function variable.
8337             # $self->{text}->{colorMap}->[
8338             # Prima::PodView::COLOR_CODE_FOREGROUND
8339             # & ~tb::COLOR_INDEX ] =
8340             # $sec->{ColorCode}
8341              
8342             # Fix for case c033: a '#' here starts a side comment
8343 0 0 0     0 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
      0        
8344 0         0 $identifier .= $tok;
8345             }
8346             else {
8347 0         0 $identifier = EMPTY_STRING;
8348 0         0 $i = $i_save;
8349 0         0 $type = '&';
8350             }
8351 0         0 $id_scan_state = EMPTY_STRING;
8352             }
8353 105         221 return;
8354             } ## end sub do_id_scan_state_ampersand
8355              
8356             #-------------------
8357             # hash of scanner subs
8358             #-------------------
8359             my $scan_identifier_code = {
8360             $scan_state_SIGIL => \&do_id_scan_state_dollar,
8361             $scan_state_ALPHA => \&do_id_scan_state_alpha,
8362             $scan_state_COLON => \&do_id_scan_state_colon,
8363             $scan_state_LPAREN => \&do_id_scan_state_left_paren,
8364             $scan_state_RPAREN => \&do_id_scan_state_right_paren,
8365             $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8366             };
8367              
8368             sub scan_complex_identifier {
8369              
8370             # This routine assembles tokens into identifiers. It maintains a
8371             # scan state, id_scan_state. It updates id_scan_state based upon
8372             # current id_scan_state and token, and returns an updated
8373             # id_scan_state and the next index after the identifier.
8374              
8375             # This routine now serves a a backup for sub scan_simple_identifier
8376             # which handles most identifiers.
8377              
8378             # Note that $self must be a 'my' variable and not be a closure
8379             # variables like the other args. Otherwise it will not get
8380             # deleted by a DESTROY call at the end of a file. Then an
8381             # attempt to create multiple tokenizers can occur when multiple
8382             # files are processed, causing an error.
8383              
8384             (
8385 486     486 0 1755 my $self, $i, $id_scan_state, $identifier, $rtokens,
8386             $max_token_index, $expecting, $container_type
8387             ) = @_;
8388              
8389             # return flag telling caller to split the pretoken
8390 486         2543 my $split_pretoken_flag;
8391              
8392             #-------------------
8393             # Initialize my vars
8394             #-------------------
8395              
8396 486         1676 initialize_my_scan_id_vars();
8397              
8398             #--------------------------------------------------------
8399             # get started by defining a type and a state if necessary
8400             #--------------------------------------------------------
8401              
8402 486 100       1560 if ( !$id_scan_state ) {
8403 479         861 $context = UNKNOWN_CONTEXT;
8404              
8405             # fixup for digraph
8406 479 50       1369 if ( $tok eq '>' ) {
8407 0         0 $tok = '->';
8408 0         0 $tok_begin = $tok;
8409             }
8410 479         861 $identifier = $tok;
8411              
8412 479 100 100     3211 if ( $last_nonblank_token eq '->' ) {
    100 100        
    100 0        
    50          
    0          
    0          
    0          
    0          
8413 6         14 $identifier = '->' . $identifier;
8414 6         13 $id_scan_state = $scan_state_SIGIL;
8415             }
8416             elsif ( $tok eq '$' || $tok eq '*' ) {
8417 293         560 $id_scan_state = $scan_state_SIGIL;
8418 293         507 $context = SCALAR_CONTEXT;
8419             }
8420             elsif ( $tok eq '%' || $tok eq '@' ) {
8421 77         185 $id_scan_state = $scan_state_SIGIL;
8422 77         149 $context = LIST_CONTEXT;
8423             }
8424             elsif ( $tok eq '&' ) {
8425 103         231 $id_scan_state = $scan_state_AMPERSAND;
8426             }
8427             elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8428 0         0 $saw_alpha = 0; # 'sub' is considered type info here
8429 0         0 $id_scan_state = $scan_state_SIGIL;
8430 0         0 $identifier .=
8431             SPACE; # need a space to separate sub from sub name
8432             }
8433             elsif ( $tok eq '::' ) {
8434 0         0 $id_scan_state = $scan_state_ALPHA;
8435             }
8436             elsif ( $tok =~ /^\w/ ) {
8437 0         0 $id_scan_state = $scan_state_COLON;
8438 0         0 $saw_alpha = 1;
8439             }
8440             elsif ( $tok eq '->' ) {
8441 0         0 $id_scan_state = $scan_state_SIGIL;
8442             }
8443             else {
8444              
8445             # shouldn't happen: bad call parameter
8446 0         0 my $msg =
8447             "Program bug detected: scan_complex_identifier received bad starting token = '$tok'\n";
8448 0         0 if (DEVEL_MODE) { $self->Fault($msg) }
8449 0 0       0 if ( !$self->[_in_error_] ) {
8450 0         0 warning($msg);
8451 0         0 $self->[_in_error_] = 1;
8452             }
8453 0         0 $id_scan_state = EMPTY_STRING;
8454              
8455             # emergency return
8456 0         0 goto RETURN;
8457             }
8458 479         960 $saw_type = !$saw_alpha;
8459             }
8460             else {
8461 7         15 $i--;
8462 7         33 $saw_alpha = ( $tok =~ /^\w/ );
8463 7         21 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8464              
8465             # check for a valid starting state
8466 7         12 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8467             $self->Fault(<<EOM);
8468             Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8469             EOM
8470             }
8471             }
8472              
8473             #------------------------------
8474             # loop to gather the identifier
8475             #------------------------------
8476              
8477 486         868 $i_save = $i;
8478              
8479 486   100     2159 while ( $i < $max_token_index && $id_scan_state ) {
8480              
8481             # Be sure we have code to handle this state before we proceed
8482 1169         2427 my $code = $scan_identifier_code->{$id_scan_state};
8483 1169 100       2474 if ( !$code ) {
8484              
8485 3 50       16 if ( $id_scan_state eq $scan_state_SPLIT ) {
8486             ## OK: this is the signal to exit and split the pretoken
8487             }
8488              
8489             # unknown state - should not happen
8490             else {
8491 0         0 if (DEVEL_MODE) {
8492             $self->Fault(<<EOM);
8493             Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8494             Scan state at sub entry was '$id_scan_state_begin'
8495             EOM
8496             }
8497 0         0 $id_scan_state = EMPTY_STRING;
8498 0         0 $i = $i_save;
8499             }
8500 3         7 last;
8501             }
8502              
8503             # Remember the starting index for progress check below
8504 1166         2684 my $i_start_loop = $i;
8505              
8506 1166         1749 $last_tok_is_blank = $tok_is_blank;
8507 1166 100       2049 if ($tok_is_blank) { $tok_is_blank = undef }
  11         25  
8508 1155         1594 else { $i_save = $i }
8509              
8510 1166         2131 $tok = $rtokens->[ ++$i ];
8511              
8512             # patch to make digraph :: if necessary
8513 1166 100 100     3165 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8514 113         237 $tok = '::';
8515 113         193 $i++;
8516             }
8517              
8518 1166         3484 $code->($self);
8519              
8520             # check for forward progress: a decrease in the index $i
8521             # implies that scanning has finished
8522 1166 100       4026 last if ( $i <= $i_start_loop );
8523              
8524             } ## end of main loop
8525              
8526             #-------------
8527             # Check result
8528             #-------------
8529              
8530             # Be sure a valid state is returned
8531 486 100       1351 if ($id_scan_state) {
8532              
8533 20 100       73 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8534              
8535 13 100       60 if ( $id_scan_state eq $scan_state_SPLIT ) {
8536 3         7 $split_pretoken_flag = 1;
8537             }
8538              
8539 13 50       50 if ( $id_scan_state eq $scan_state_RPAREN ) {
8540 0         0 warning(
8541             "Hit end of line while seeking ) to end prototype\n");
8542             }
8543              
8544 13         29 $id_scan_state = EMPTY_STRING;
8545             }
8546              
8547             # Patch: the deprecated variable $# does not combine with anything
8548             # on the next line.
8549 20 50       64 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
  0         0  
8550             }
8551              
8552             # Be sure the token index is valid
8553 486 50       1338 if ( $i < 0 ) { $i = 0 }
  0         0  
8554              
8555             # Be sure a token type is defined
8556 486 100       1244 if ( !$type ) {
8557              
8558 458 100       1154 if ($saw_type) {
    100          
8559              
8560 452 100 33     2015 if ($saw_alpha) {
    50 66        
    100 66        
      33        
8561              
8562             # The type without the -> should be the same as with the -> so
8563             # that if they get separated we get the same bond strengths,
8564             # etc. See b1234
8565 348 50 33     1399 if ( $identifier =~ /^->/
      33        
8566             && $last_nonblank_type eq 'w'
8567             && substr( $identifier, 2, 1 ) =~ /^\w/ )
8568             {
8569 0         0 $type = 'w';
8570             }
8571 348         701 else { $type = 'i' }
8572             }
8573             elsif ( $identifier eq '->' ) {
8574 0         0 $type = '->';
8575             }
8576             elsif (
8577             ( length($identifier) > 1 )
8578              
8579             # In something like '@$=' we have an identifier '@$'
8580             # In something like '$${' we have type '$$' (and only
8581             # part of an identifier)
8582             && !( $identifier =~ /\$$/ && $tok eq '{' )
8583              
8584             ## && ( $identifier !~ /^(sub |package )$/ )
8585             && $identifier ne 'sub '
8586             && $identifier ne 'package '
8587             )
8588             {
8589 53         153 $type = 'i';
8590             }
8591 51         113 else { $type = 't' }
8592             }
8593             elsif ($saw_alpha) {
8594              
8595             # type 'w' includes anything without leading type info
8596             # ($,%,@,*) including something like abc::def::ghi
8597 5         13 $type = 'w';
8598              
8599             # Fix for b1337, if restarting scan after line break between
8600             # '->' or sigil and identifier name, use type 'i'
8601 5 50 33     31 if ( $id_scan_state_begin
8602             && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8603             {
8604 5         12 $type = 'i';
8605             }
8606             }
8607             else {
8608 1         3 $type = EMPTY_STRING;
8609             } # this can happen on a restart
8610             }
8611              
8612             # See if we formed an identifier...
8613 486 100       1192 if ($identifier) {
8614 444         832 $tok = $identifier;
8615 444 100       1117 if ($message) { $self->write_logfile_entry($message) }
  1         5  
8616             }
8617              
8618             # did not find an identifier, back up
8619             else {
8620 42         83 $tok = $tok_begin;
8621 42         69 $i = $i_begin;
8622             }
8623              
8624             RETURN:
8625              
8626 486         736 DEBUG_SCAN_ID && do {
8627             my ( $a, $b, $c ) = caller;
8628             print {*STDOUT}
8629             "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8630             print {*STDOUT}
8631             "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8632             };
8633 486         2313 return ( $i, $tok, $type, $id_scan_state, $identifier,
8634             $split_pretoken_flag );
8635             } ## end sub scan_complex_identifier
8636             } ## end closure for sub scan_complex_identifier
8637              
8638             { ## closure for sub do_scan_sub
8639              
8640             my %warn_if_lexical;
8641              
8642             BEGIN {
8643              
8644             # lexical subs with these names can cause parsing errors in this version
8645 39     39   297 my @q = qw( m q qq qr qw qx s tr y );
8646 39         3456 @{warn_if_lexical}{@q} = (1) x scalar(@q);
8647             } ## end BEGIN
8648              
8649             # saved package and subnames in case prototype is on separate line
8650             my ( $package_saved, $subname_saved );
8651              
8652             # initialize subname each time a new 'sub' keyword is encountered
8653             sub initialize_subname {
8654 296     296 0 650 $package_saved = EMPTY_STRING;
8655 296         614 $subname_saved = EMPTY_STRING;
8656 296         551 return;
8657             }
8658              
8659             use constant {
8660 39         91828 SUB_CALL => 1,
8661             PAREN_CALL => 2,
8662             PROTOTYPE_CALL => 3,
8663 39     39   352 };
  39         150  
8664              
8665             sub do_scan_sub {
8666              
8667             # do_scan_sub parses a sub name and prototype.
8668              
8669             # At present there are three basic CALL TYPES which are
8670             # distinguished by the starting value of '$tok':
8671             # 1. $tok='sub', id_scan_state='sub'
8672             # it is called with $i_beg equal to the index of the first nonblank
8673             # token following a 'sub' token.
8674             # 2. $tok='(', id_scan_state='sub',
8675             # it is called with $i_beg equal to the index of a '(' which may
8676             # start a prototype.
8677             # 3. $tok='prototype', id_scan_state='prototype'
8678             # it is called with $i_beg equal to the index of a '(' which is
8679             # preceded by ': prototype' and has $id_scan_state eq 'prototype'
8680              
8681             # Examples:
8682              
8683             # A single type 1 call will get both the sub and prototype
8684             # sub foo1 ( $$ ) { }
8685             # ^
8686              
8687             # The subname will be obtained with a 'sub' call
8688             # The prototype on line 2 will be obtained with a '(' call
8689             # sub foo1
8690             # ^ <---call type 1
8691             # ( $$ ) { }
8692             # ^ <---call type 2
8693              
8694             # The subname will be obtained with a 'sub' call
8695             # The prototype will be obtained with a 'prototype' call
8696             # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8697             # ^ <---type 1 ^ <---type 3
8698              
8699             # TODO: add future error checks to be sure we have a valid
8700             # sub name. For example, 'sub &doit' is wrong. Also, be sure
8701             # a name is given if and only if a non-anonymous sub is
8702             # appropriate.
8703             # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8704             # $rsaw_function_definition,
8705             # $statement_type
8706              
8707 302     302 0 813 my ( $self, $rinput_hash ) = @_;
8708              
8709 302         754 my $input_line = $rinput_hash->{input_line};
8710 302         698 my $i = $rinput_hash->{i};
8711 302         617 my $i_beg = $rinput_hash->{i_beg};
8712 302         647 my $tok = $rinput_hash->{tok};
8713 302         670 my $type = $rinput_hash->{type};
8714 302         591 my $rtokens = $rinput_hash->{rtokens};
8715 302         569 my $rtoken_map = $rinput_hash->{rtoken_map};
8716 302         581 my $id_scan_state = $rinput_hash->{id_scan_state};
8717 302         581 my $max_token_index = $rinput_hash->{max_token_index};
8718              
8719 302         526 my $i_entry = $i;
8720              
8721             # Determine the CALL TYPE
8722             # 1=sub
8723             # 2=(
8724             # 3=prototype
8725 302 100       1095 my $call_type =
    100          
8726             $tok eq 'prototype' ? PROTOTYPE_CALL
8727             : $tok eq '(' ? PAREN_CALL
8728             : SUB_CALL;
8729              
8730 302         1713 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
8731 302         600 my $subname = $subname_saved;
8732 302         551 my $package = $package_saved;
8733 302         642 my $proto = undef;
8734 302         585 my $attrs = undef;
8735 302         529 my $match;
8736              
8737 302         611 my $pos_beg = $rtoken_map->[$i_beg];
8738 302         1020 pos($input_line) = $pos_beg;
8739              
8740             # Look for the sub NAME if this is a SUB call
8741 302 100 100     2812 if (
8742             $call_type == SUB_CALL
8743             && $input_line =~ m{\G\s*
8744             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
8745             (\w+) # NAME - required
8746             }gcx
8747             )
8748             {
8749 123         305 $match = 1;
8750 123         339 $subname = $2;
8751              
8752 123   33     546 my $is_lexical_sub =
8753             $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8754 123 0 33     436 if ( $is_lexical_sub && $1 ) {
8755 0         0 $self->warning("'my' sub $subname cannot be in package '$1'\n");
8756 0         0 $is_lexical_sub = 0;
8757             }
8758              
8759 123 50       398 if ($is_lexical_sub) {
8760              
8761             # lexical subs use the block sequence number as a package name
8762 0         0 my $seqno =
8763             $rcurrent_sequence_number->[BRACE]
8764             [ $rcurrent_depth->[BRACE] ];
8765 0 0       0 $seqno = 1 if ( !defined($seqno) );
8766 0         0 $package = $seqno;
8767 0 0       0 if ( $warn_if_lexical{$subname} ) {
8768 0         0 $self->warning(
8769             "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8770             );
8771              
8772             # This may end badly, it is safest to block formatting
8773             # For an example, see perl527/lexsub.t (issue c203)
8774 0         0 $self->[_in_trouble_] = 1;
8775             }
8776             }
8777             else {
8778 123 100 66     841 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8779 123         437 $package =~ s/\'/::/g;
8780 123 50       466 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
8781 123         356 $package =~ s/::$//;
8782             }
8783              
8784 123         308 my $pos = pos($input_line);
8785 123         284 my $numc = $pos - $pos_beg;
8786 123         426 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
8787 123         270 $type = 'S'; ## Fix for c250, was 'i';
8788              
8789             # remember the sub name in case another call is needed to
8790             # get the prototype
8791 123         272 $package_saved = $package;
8792 123         303 $subname_saved = $subname;
8793             }
8794              
8795             # Now look for PROTO ATTRS for all call types
8796             # Look for prototype/attributes which are usually on the same
8797             # line as the sub name but which might be on a separate line.
8798             # For example, we might have an anonymous sub with attributes,
8799             # or a prototype on a separate line from its sub name
8800              
8801             # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8802             # does not look like a prototype, we assume it is a SIGNATURE and we
8803             # will stop and let the the standard tokenizer handle it. In
8804             # particular, we stop if we see any nested parens, braces, or commas.
8805             # Also note, a valid prototype cannot contain any alphabetic character
8806             # -- see https://perldoc.perl.org/perlsub
8807             # But it appears that an underscore is valid in a prototype, so the
8808             # regex below uses [A-Za-z] rather than \w
8809             # This is the old regex which has been replaced:
8810             # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
8811 302         1132 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8812 302 100 100     3075 if (
      66        
8813             $input_line =~ m{\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
8814             (\s*:)? # ATTRS leading ':'
8815             }gcx
8816             && ( $1 || $2 )
8817             )
8818             {
8819 45         148 $proto = $1;
8820 45         111 $attrs = $2;
8821              
8822             # Append the prototype to the starting token if it is 'sub' or
8823             # 'prototype'. This is not necessary but for compatibility with
8824             # previous versions when the -csc flag is used:
8825 45 100 100     299 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
    100 100        
8826 24         63 $tok .= $proto;
8827             }
8828              
8829             # If we just entered the sub at an opening paren on this call, not
8830             # a following :prototype, label it with the previous token. This is
8831             # necessary to propagate the sub name to its opening block.
8832             elsif ( $call_type == PAREN_CALL ) {
8833 2         6 $tok = $last_nonblank_token;
8834             }
8835             else {
8836             }
8837              
8838 45   100     164 $match ||= 1;
8839              
8840             # Patch part #1 to fixes cases b994 and b1053:
8841             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8842             # 'sub : lvalue { ...'
8843 45         89 $type = 'S'; ## C250, was 'i';
8844 45 100 100     205 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
  2         4  
8845             }
8846              
8847 302 100       937 if ($match) {
8848              
8849             # ATTRS: if there are attributes, back up and let the ':' be
8850             # found later by the scanner.
8851 138         319 my $pos = pos($input_line);
8852 138 100       447 if ($attrs) {
8853 15         31 $pos -= length($attrs);
8854             }
8855              
8856 138         326 my $next_nonblank_token = $tok;
8857              
8858             # catch case of line with leading ATTR ':' after anonymous sub
8859 138 100 100     605 if ( $pos == $pos_beg && $tok eq ':' ) {
8860 1         2 $type = 'A';
8861 1         4 $self->[_in_attribute_list_] = 1;
8862             }
8863              
8864             # Otherwise, if we found a match we must convert back from
8865             # string position to the pre_token index for continued parsing.
8866             else {
8867              
8868             # I don't think an error flag can occur here ..but ?
8869 137         275 my $error;
8870 137         636 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8871             $max_token_index );
8872 137 50       502 if ($error) { $self->warning("Possibly invalid sub\n") }
  0         0  
8873              
8874             # Patch part #2 to fixes cases b994 and b1053:
8875             # Do not let spaces be part of the token of an anonymous sub
8876             # keyword which we marked as type 'k' above...i.e. for
8877             # something like:
8878             # 'sub : lvalue { ...'
8879             # Back up and let it be parsed as a blank
8880 137 50 66     635 if ( $type eq 'k'
      66        
      33        
8881             && $attrs
8882             && $i > $i_entry
8883             && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8884             {
8885 2         4 $i--;
8886             }
8887              
8888             # check for multiple definitions of a sub
8889 137         404 ( $next_nonblank_token, my $i_next ) =
8890             find_next_nonblank_token_on_this_line( $i, $rtokens,
8891             $max_token_index );
8892             }
8893              
8894 138 100       812 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8895             { # skip blank or side comment
8896 7         81 my ( $rpre_tokens, $rpre_types ) =
8897             $self->peek_ahead_for_n_nonblank_pre_tokens(1);
8898 7 50 33     42 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
  7         30  
8899 7         29 $next_nonblank_token = $rpre_tokens->[0];
8900             }
8901             else {
8902 0         0 $next_nonblank_token = '}';
8903             }
8904             }
8905              
8906             # See what's next...
8907 138 100       661 if ( $next_nonblank_token eq '{' ) {
    100          
    50          
    100          
    50          
    0          
8908 106 100       426 if ($subname) {
8909              
8910             # Check for multiple definitions of a sub, but
8911             # it is ok to have multiple sub BEGIN, etc,
8912             # so we do not complain if name is all caps
8913 96 50 33     603 if ( $rsaw_function_definition->{$subname}{$package}
8914             && $subname !~ /^[A-Z]+$/ )
8915             {
8916             my $lno =
8917 0         0 $rsaw_function_definition->{$subname}{$package};
8918 0 0       0 if ( $package =~ /^\d/ ) {
8919 0         0 $self->warning(
8920             "already saw definition of lexical 'sub $subname' at line $lno\n"
8921             );
8922              
8923             }
8924             else {
8925 0         0 if ( !DEVEL_MODE ) {
8926 0         0 $self->warning(
8927             "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8928             );
8929             }
8930             }
8931             }
8932 96         381 $rsaw_function_definition->{$subname}{$package} =
8933             $self->[_last_line_number_];
8934             }
8935             }
8936             elsif ( $next_nonblank_token eq ';' ) {
8937             }
8938             elsif ( $next_nonblank_token eq '}' ) {
8939             }
8940              
8941             # ATTRS - if an attribute list follows, remember the name
8942             # of the sub so the next opening brace can be labeled.
8943             # Setting 'statement_type' causes any ':'s to introduce
8944             # attributes.
8945             elsif ( $next_nonblank_token eq ':' ) {
8946 16 100       52 if ( $call_type == SUB_CALL ) {
8947 14 100       74 $statement_type =
8948             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8949             }
8950             }
8951              
8952             # if we stopped before an open paren ...
8953             elsif ( $next_nonblank_token eq '(' ) {
8954              
8955             # If we DID NOT see this paren above then it must be on the
8956             # next line so we will set a flag to come back here and see if
8957             # it is a PROTOTYPE
8958              
8959             # Otherwise, we assume it is a SIGNATURE rather than a
8960             # PROTOTYPE and let the normal tokenizer handle it as a list
8961 15 100       47 if ( !$saw_opening_paren ) {
8962 4         15 $id_scan_state = 'sub'; # we must come back to get proto
8963             }
8964 15 50       64 if ( $call_type == SUB_CALL ) {
8965 15 50       61 $statement_type =
8966             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8967             }
8968             }
8969              
8970             # something else..
8971             elsif ($next_nonblank_token) {
8972              
8973 0 0 0     0 if ( $rinput_hash->{tok} eq 'method' && $call_type == SUB_CALL )
8974             {
8975             # For a method call, silently ignore this error (rt145706)
8976             # to avoid needless warnings. Example which can produce it:
8977             # test(method Pack (), "method");
8978              
8979             # TODO: scan for use feature 'class' and:
8980             # - if we saw 'use feature 'class' then issue the warning.
8981             # - if we did not see use feature 'class' then issue the
8982             # warning and suggest turning off --use-feature=class
8983             }
8984             else {
8985 0 0       0 $subname = EMPTY_STRING unless defined($subname);
8986 0         0 $self->warning(
8987             "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8988             );
8989             }
8990             }
8991              
8992             # EOF technically ok
8993             else {
8994             }
8995              
8996 138         536 check_prototype( $proto, $package, $subname );
8997             }
8998              
8999             # no match to either sub name or prototype, but line not blank
9000             else {
9001              
9002             }
9003 302         1658 return ( $i, $tok, $type, $id_scan_state );
9004             } ## end sub do_scan_sub
9005             }
9006              
9007             #########################################################################
9008             # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
9009             #########################################################################
9010              
9011             sub find_next_nonblank_token {
9012 6166     6166 0 12204 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9013              
9014             # Returns the next nonblank token after the token at index $i
9015             # To skip past a side comment, and any subsequent block comments
9016             # and blank lines, call with i=$max_token_index
9017              
9018             # Skip any ending blank (fix c258). It would be cleaner if caller passed
9019             # $rtoken_map, so we could check for type 'b', and avoid a regex test, but
9020             # benchmarking shows that this test does not take significant time. So
9021             # that would be a nice update but not essential. Also note that ending
9022             # blanks will not occur for text previously processed by perltidy.
9023 6166 100 100     18973 if ( $i == $max_token_index - 1
9024             && $rtokens->[$max_token_index] =~ /^\s+$/ )
9025             {
9026 9         31 $i++;
9027             }
9028              
9029 6166 100       12845 if ( $i >= $max_token_index ) {
9030 127 100       762 if ( !peeked_ahead() ) {
9031 125         458 peeked_ahead(1);
9032 125         727 $self->peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
9033             }
9034             }
9035              
9036 6166         11341 my $next_nonblank_token = $rtokens->[ ++$i ];
9037 6166 50 33     22160 return ( SPACE, $i )
9038             if ( !defined($next_nonblank_token) || !length($next_nonblank_token) );
9039              
9040             # Quick test for nonblank ascii char. Note that we just have to
9041             # examine the first character here.
9042 6166         12547 my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
9043 6166 100 66     24940 if ( $ord >= ORD_PRINTABLE_MIN
    50 33        
    0          
9044             && $ord <= ORD_PRINTABLE_MAX )
9045             {
9046 2353         7780 return ( $next_nonblank_token, $i );
9047             }
9048              
9049             # Quick test to skip over an ascii space or tab
9050             elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
9051 3813         6795 $next_nonblank_token = $rtokens->[ ++$i ];
9052 3813 50       8084 return ( SPACE, $i ) unless defined($next_nonblank_token);
9053             }
9054              
9055             # Slow test to skip over something else identified as whitespace
9056             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9057 0         0 $next_nonblank_token = $rtokens->[ ++$i ];
9058 0 0       0 return ( SPACE, $i ) unless defined($next_nonblank_token);
9059             }
9060             else {
9061             ## at nonblank
9062             }
9063              
9064             # We should be at a nonblank now
9065 3813         11494 return ( $next_nonblank_token, $i );
9066             } ## end sub find_next_nonblank_token
9067              
9068             sub find_next_noncomment_token {
9069 98     98 0 400 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9070              
9071             # Given the current character position, look ahead past any comments
9072             # and blank lines and return the next token, including digraphs and
9073             # trigraphs.
9074              
9075 98         350 my ( $next_nonblank_token, $i_next ) =
9076             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9077              
9078             # skip past any side comment
9079 98 50       564 if ( $next_nonblank_token eq '#' ) {
9080 0         0 ( $next_nonblank_token, $i_next ) =
9081             $self->find_next_nonblank_token( $i_next, $rtokens,
9082             $max_token_index );
9083             }
9084              
9085             # check for a digraph
9086 98 50 33     915 if ( $next_nonblank_token
      33        
9087             && $next_nonblank_token ne SPACE
9088             && defined( $rtokens->[ $i_next + 1 ] ) )
9089             {
9090 98         303 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9091 98 100       419 if ( $is_digraph{$test2} ) {
9092 15         41 $next_nonblank_token = $test2;
9093 15         37 $i_next = $i_next + 1;
9094              
9095             # check for a trigraph
9096 15 50       67 if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
9097 15         52 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9098 15 50       85 if ( $is_trigraph{$test3} ) {
9099 0         0 $next_nonblank_token = $test3;
9100 0         0 $i_next = $i_next + 1;
9101             }
9102             }
9103             }
9104             }
9105              
9106 98         334 return ( $next_nonblank_token, $i_next );
9107             } ## end sub find_next_noncomment_token
9108              
9109             sub is_possible_numerator {
9110              
9111             # Look at the next non-comment character and decide if it could be a
9112             # numerator. Return
9113             # 1 - yes
9114             # 0 - can't tell
9115             # -1 - no
9116              
9117 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9118 0         0 my $is_possible_numerator = 0;
9119              
9120 0         0 my $next_token = $rtokens->[ $i + 1 ];
9121 0 0       0 if ( $next_token eq '=' ) { $i++; } # handle /=
  0         0  
9122 0         0 my ( $next_nonblank_token, $i_next ) =
9123             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9124              
9125 0 0       0 if ( $next_nonblank_token eq '#' ) {
9126 0         0 ( $next_nonblank_token, $i_next ) =
9127             $self->find_next_nonblank_token( $max_token_index, $rtokens,
9128             $max_token_index );
9129             }
9130              
9131 0 0       0 if ( $next_nonblank_token =~ / [ \( \$ \w \. \@ ] /x ) {
    0          
9132 0         0 $is_possible_numerator = 1;
9133             }
9134             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9135 0         0 $is_possible_numerator = 0;
9136             }
9137             else {
9138 0         0 $is_possible_numerator = -1;
9139             }
9140              
9141 0         0 return $is_possible_numerator;
9142             } ## end sub is_possible_numerator
9143              
9144             { ## closure for sub pattern_expected
9145             my %pattern_test;
9146              
9147             BEGIN {
9148              
9149             # List of tokens which may follow a pattern. Note that we will not
9150             # have formed digraphs at this point, so we will see '&' instead of
9151             # '&&' and '|' instead of '||'
9152              
9153             # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
9154 39     39   295 my @q = qw( & && | || ? : + - * and or while if unless);
9155 39         159 push @q, ')', '}', ']', '>', ',', ';';
9156 39         184768 @{pattern_test}{@q} = (1) x scalar(@q);
9157             } ## end BEGIN
9158              
9159             sub pattern_expected {
9160              
9161             # This a filter for a possible pattern.
9162             # It looks at the token after a possible pattern and tries to
9163             # determine if that token could end a pattern.
9164             # returns -
9165             # 1 - yes
9166             # 0 - can't tell
9167             # -1 - no
9168 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9169 0         0 my $is_pattern = 0;
9170              
9171 0         0 my $next_token = $rtokens->[ $i + 1 ];
9172 0 0       0 if ( $next_token =~ /^[msixpodualgc]/ ) {
9173 0         0 $i++;
9174             } # skip possible modifier
9175 0         0 my ( $next_nonblank_token, $i_next ) =
9176             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9177              
9178 0 0       0 if ( $pattern_test{$next_nonblank_token} ) {
9179 0         0 $is_pattern = 1;
9180             }
9181             else {
9182              
9183             # Added '#' to fix issue c044
9184 0 0 0     0 if ( $next_nonblank_token =~ /^\s*$/
9185             || $next_nonblank_token eq '#' )
9186             {
9187 0         0 $is_pattern = 0;
9188             }
9189             else {
9190 0         0 $is_pattern = -1;
9191             }
9192             }
9193 0         0 return $is_pattern;
9194             } ## end sub pattern_expected
9195             }
9196              
9197             sub find_next_nonblank_token_on_this_line {
9198 457     457 0 1097 my ( $i, $rtokens, $max_token_index ) = @_;
9199 457         758 my $next_nonblank_token;
9200              
9201 457 100       1202 if ( $i < $max_token_index ) {
9202 449         1087 $next_nonblank_token = $rtokens->[ ++$i ];
9203              
9204 449 100       2035 if ( $next_nonblank_token =~ /^\s*$/ ) {
9205              
9206 121 100       569 if ( $i < $max_token_index ) {
9207 119         352 $next_nonblank_token = $rtokens->[ ++$i ];
9208             }
9209             }
9210             }
9211             else {
9212 8         26 $next_nonblank_token = EMPTY_STRING;
9213             }
9214 457         1524 return ( $next_nonblank_token, $i );
9215             } ## end sub find_next_nonblank_token_on_this_line
9216              
9217             sub find_angle_operator_termination {
9218              
9219             # We are looking at a '<' and want to know if it is an angle operator.
9220             # We are to return:
9221             # $i = pretoken index of ending '>' if found, current $i otherwise
9222             # $type = 'Q' if found, '>' otherwise
9223 8     8 0 31 my ( $self, $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index )
9224             = @_;
9225 8         30 my $i = $i_beg;
9226 8         22 my $type = '<';
9227 8         35 pos($input_line) = 1 + $rtoken_map->[$i];
9228              
9229 8         22 my $filter;
9230              
9231             # we just have to find the next '>' if a term is expected
9232 8 100       47 if ( $expecting == TERM ) { $filter = '[\>]' }
  6 50       17  
9233              
9234             # we have to guess if we don't know what is expected
9235 2         7 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
9236              
9237             # shouldn't happen - we shouldn't be here if operator is expected
9238             else {
9239 0         0 if (DEVEL_MODE) {
9240             $self->Fault(<<EOM);
9241             Bad call to find_angle_operator_termination
9242             EOM
9243             }
9244 0         0 return ( $i, $type );
9245             }
9246              
9247             # To illustrate what we might be looking at, in case we are
9248             # guessing, here are some examples of valid angle operators
9249             # (or file globs):
9250             # <tmp_imp/*>
9251             # <FH>
9252             # <$fh>
9253             # <*.c *.h>
9254             # <_>
9255             # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
9256             # <${PREFIX}*img*.$IMAGE_TYPE>
9257             # <img*.$IMAGE_TYPE>
9258             # <Timg*.$IMAGE_TYPE>
9259             # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
9260             #
9261             # Here are some examples of lines which do not have angle operators:
9262             # return unless $self->[2]++ < $#{$self->[1]};
9263             # < 2 || @$t >
9264             #
9265             # the following line from dlister.pl caused trouble:
9266             # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
9267             #
9268             # If the '<' starts an angle operator, it must end on this line and
9269             # it must not have certain characters like ';' and '=' in it. I use
9270             # this to limit the testing. This filter should be improved if
9271             # possible.
9272              
9273 8 50       204 if ( $input_line =~ /($filter)/g ) {
9274              
9275 8 50       40 if ( $1 eq '>' ) {
9276              
9277             # We MAY have found an angle operator termination if we get
9278             # here, but we need to do more to be sure we haven't been
9279             # fooled.
9280 8         18 my $pos = pos($input_line);
9281              
9282 8         23 my $pos_beg = $rtoken_map->[$i];
9283 8         27 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
9284              
9285             # Test for '<' after possible filehandle, issue c103
9286             # print $fh <>; # syntax error
9287             # print $fh <DATA>; # ok
9288             # print $fh < DATA>; # syntax error at '>'
9289             # print STDERR < DATA>; # ok, prints word 'DATA'
9290             # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
9291 8 100       44 if ( $last_nonblank_type eq 'Z' ) {
9292              
9293             # $str includes brackets; something like '<DATA>'
9294 1 0 33     10 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
9295             && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
9296             {
9297 0         0 return ( $i, $type );
9298             }
9299             }
9300              
9301             # Reject if the closing '>' follows a '-' as in:
9302             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
9303 8 100       35 if ( $expecting eq UNKNOWN ) {
9304 2         5 my $check = substr( $input_line, $pos - 2, 1 );
9305 2 100       7 if ( $check eq '-' ) {
9306 1         7 return ( $i, $type );
9307             }
9308             }
9309              
9310             ######################################debug#####
9311             #$self->write_diagnostics( "ANGLE? :$str\n");
9312             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
9313             ######################################debug#####
9314 7         18 $type = 'Q';
9315 7         21 my $error;
9316 7         37 ( $i, $error ) =
9317             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9318              
9319             # It may be possible that a quote ends midway in a pretoken.
9320             # If this happens, it may be necessary to split the pretoken.
9321 7 50       36 if ($error) {
9322 0         0 if (DEVEL_MODE) {
9323             $self->Fault(<<EOM);
9324             unexpected error condition returned by inverse_pretoken_map
9325             EOM
9326             }
9327             $self->warning(
9328 0         0 "Possible tokinization error..please check this line\n");
9329             }
9330              
9331             # Check for accidental formatting of a markup language doc...
9332             # Formatting will be skipped if we set _html_tag_count_ and
9333             # also set a warning of any kind.
9334 7         19 my $is_html_tag;
9335 7   33     34 my $is_first_string =
9336             $i_beg == 0 && $self->[_last_line_number_] == 1;
9337              
9338             # html comment '<!...' of any type
9339 7 50 33     102 if ( $str =~ /^<\s*!/ ) {
    50          
    50          
9340 0         0 $is_html_tag = 1;
9341 0 0       0 if ($is_first_string) {
9342 0         0 $self->warning(
9343             "looks like a markup language, continuing error checks\n"
9344             );
9345             }
9346             }
9347              
9348             # html end tag, something like </h1>
9349             elsif ( $str =~ /^<\s*\/\w+\s*>$/ ) {
9350 0         0 $is_html_tag = 1;
9351             }
9352              
9353             # xml prolog?
9354             elsif ( $str =~ /^<\?xml\s.*\?>$/i && $is_first_string ) {
9355 0         0 $is_html_tag = 1;
9356 0         0 $self->warning(
9357             "looks like a markup language, continuing error checks\n");
9358             }
9359             else {
9360             ## doesn't look like a markup tag
9361             }
9362              
9363 7 50       29 if ($is_html_tag) {
9364 0         0 $self->[_html_tag_count_]++;
9365             }
9366              
9367             # count blanks on inside of brackets
9368 7         17 my $blank_count = 0;
9369 7 100       38 $blank_count++ if ( $str =~ /<\s+/ );
9370 7 100       36 $blank_count++ if ( $str =~ /\s+>/ );
9371              
9372             # Now let's see where we stand....
9373             # OK if math op not possible
9374 7 100       30 if ( $expecting == TERM ) {
    50          
    50          
    0          
9375             }
9376              
9377             elsif ($is_html_tag) {
9378             }
9379              
9380             # OK if there are no more than 2 non-blank pre-tokens inside
9381             # (not possible to write 2 token math between < and >)
9382             # This catches most common cases
9383             elsif ( $i <= $i_beg + 3 + $blank_count ) {
9384              
9385             # No longer any need to document this common case
9386             ## $self->write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
9387             }
9388              
9389             # OK if there is some kind of identifier inside
9390             # print $fh <tvg::INPUT>;
9391             elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
9392 0         0 $self->write_diagnostics("ANGLE (contains identifier): $str\n");
9393             }
9394              
9395             # Not sure..
9396             else {
9397              
9398             # Let's try a Brace Test: any braces inside must balance
9399 0         0 my $br = 0;
9400 0         0 while ( $str =~ /\{/g ) { $br++ }
  0         0  
9401 0         0 while ( $str =~ /\}/g ) { $br-- }
  0         0  
9402 0         0 my $sb = 0;
9403 0         0 while ( $str =~ /\[/g ) { $sb++ }
  0         0  
9404 0         0 while ( $str =~ /\]/g ) { $sb-- }
  0         0  
9405 0         0 my $pr = 0;
9406 0         0 while ( $str =~ /\(/g ) { $pr++ }
  0         0  
9407 0         0 while ( $str =~ /\)/g ) { $pr-- }
  0         0  
9408              
9409             # if braces do not balance - not angle operator
9410 0 0 0     0 if ( $br || $sb || $pr ) {
      0        
9411 0         0 $i = $i_beg;
9412 0         0 $type = '<';
9413 0         0 $self->write_diagnostics(
9414             "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
9415             }
9416              
9417             # we should keep doing more checks here...to be continued
9418             # Tentatively accepting this as a valid angle operator.
9419             # There are lots more things that can be checked.
9420             else {
9421 0         0 $self->write_diagnostics(
9422             "ANGLE-Guessing yes: $str expecting=$expecting\n");
9423 0         0 $self->write_logfile_entry(
9424             "Guessing angle operator here: $str\n");
9425             }
9426             }
9427             }
9428              
9429             # didn't find ending >
9430             else {
9431 0 0       0 if ( $expecting == TERM ) {
9432 0         0 $self->warning("No ending > for angle operator\n");
9433             }
9434             }
9435             }
9436 7         36 return ( $i, $type );
9437             } ## end sub find_angle_operator_termination
9438              
9439             sub scan_number_do {
9440              
9441             # scan a number in any of the formats that Perl accepts
9442             # Underbars (_) are allowed in decimal numbers.
9443             # input parameters -
9444             # $input_line - the string to scan
9445             # $i - pre_token index to start scanning
9446             # $rtoken_map - reference to the pre_token map giving starting
9447             # character position in $input_line of token $i
9448             # output parameters -
9449             # $i - last pre_token index of the number just scanned
9450             # number - the number (characters); or undef if not a number
9451              
9452 629     629 0 1661 my ( $self, $input_line, $i, $rtoken_map, $input_type, $max_token_index ) =
9453             @_;
9454 629         1156 my $pos_beg = $rtoken_map->[$i];
9455 629         959 my $pos;
9456 629         982 my $i_begin = $i;
9457 629         1062 my $number = undef;
9458 629         1092 my $type = $input_type;
9459              
9460 629         1439 my $first_char = substr( $input_line, $pos_beg, 1 );
9461              
9462             # Look for bad starting characters; Shouldn't happen..
9463 629 50       2807 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9464 0         0 if (DEVEL_MODE) {
9465             $self->Fault(<<EOM);
9466             Program bug - scan_number given bad first character = '$first_char'
9467             EOM
9468             }
9469 0         0 return ( $i, $type, $number );
9470             }
9471              
9472             # handle v-string without leading 'v' character ('Two Dot' rule)
9473             # (vstring.t)
9474             # Here is the format prior to including underscores:
9475             ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9476 629         1994 pos($input_line) = $pos_beg;
9477 629 50       3072 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9478 0         0 $pos = pos($input_line);
9479 0         0 my $numc = $pos - $pos_beg;
9480 0         0 $number = substr( $input_line, $pos_beg, $numc );
9481 0         0 $type = 'v';
9482 0         0 $self->report_v_string($number);
9483             }
9484              
9485             # handle octal, hex, binary
9486 629 50       1642 if ( !defined($number) ) {
9487 629         2375 pos($input_line) = $pos_beg;
9488              
9489             # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9490             # For reference, the format prior to hex floating point is:
9491             # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9492             # (hex) (octal) (binary)
9493 629 100       2489 if (
9494             $input_line =~ m{
9495              
9496             \G[+-]?0( # leading [signed] 0
9497              
9498             # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9499             ([xX][0-9a-fA-F_]* # X and optional leading digits
9500             (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9501             [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9502             [0-9a-fA-F_]*) # optional Additional exponent digits
9503              
9504             # or hex integer
9505             |([xX][0-9a-fA-F_]+)
9506              
9507             # or octal fraction
9508             |([oO]?[0-7_]+ # string of octal digits
9509             (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9510             [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9511             [0-7_]*) # Additional exponent digits with underscores
9512              
9513             # or octal integer
9514             |([oO]?[0-7_]+) # string of octal digits
9515              
9516             # or a binary float
9517             |([bB][01_]* # 'b' with string of binary digits
9518             (\.([01][01_]*)?)? # optional decimal and fraction
9519             [Pp][+-]?[01] # Required exponent indicator, no underscore
9520             [01_]*) # additional exponent bits
9521              
9522             # or binary integer
9523             |([bB][01_]+) # 'b' with string of binary digits
9524              
9525             )}gx
9526             )
9527             {
9528 72         132 $pos = pos($input_line);
9529 72         125 my $numc = $pos - $pos_beg;
9530 72         150 $number = substr( $input_line, $pos_beg, $numc );
9531 72         132 $type = 'n';
9532             }
9533             }
9534              
9535             # handle decimal
9536 629 100       1504 if ( !defined($number) ) {
9537 557         1136 pos($input_line) = $pos_beg;
9538              
9539 557 50       2830 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9540 557         1066 $pos = pos($input_line);
9541              
9542             # watch out for things like 0..40 which would give 0. by this;
9543 557 100 100     2054 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9544             && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9545             {
9546 37         76 $pos--;
9547             }
9548 557         1006 my $numc = $pos - $pos_beg;
9549 557         1078 $number = substr( $input_line, $pos_beg, $numc );
9550 557         1063 $type = 'n';
9551             }
9552             }
9553              
9554             # filter out non-numbers like e + - . e2 .e3 +e6
9555             # the rule: at least one digit, and any 'e' must be preceded by a digit
9556 629 100 66     3397 if (
      66        
9557             $number !~ /\d/ # no digits
9558             || ( $number =~ /^(.*)[eE]/
9559             && $1 !~ /\d/ ) # or no digits before the 'e'
9560             )
9561             {
9562 303         498 $number = undef;
9563 303         540 $type = $input_type;
9564 303         1269 return ( $i, $type, $number );
9565             }
9566              
9567             # Found a number; now we must convert back from character position
9568             # to pre_token index. An error here implies user syntax error.
9569             # An example would be an invalid octal number like '009'.
9570 326         585 my $error;
9571 326         861 ( $i, $error ) =
9572             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9573 326 50       989 if ($error) { $self->warning("Possibly invalid number\n") }
  0         0  
9574              
9575 326         1285 return ( $i, $type, $number );
9576             } ## end sub scan_number_do
9577              
9578             sub inverse_pretoken_map {
9579              
9580             # Starting with the current pre_token index $i, scan forward until
9581             # finding the index of the next pre_token whose position is $pos.
9582 2170     2170 0 5101 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9583 2170         3561 my $error = 0;
9584              
9585 2170         5449 while ( ++$i <= $max_token_index ) {
9586              
9587 4040 100       9699 if ( $pos <= $rtoken_map->[$i] ) {
9588              
9589             # Let the calling routine handle errors in which we do not
9590             # land on a pre-token boundary. It can happen by running
9591             # perltidy on some non-perl scripts, for example.
9592 2135 50       5495 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
  0         0  
9593 2135         3655 $i--;
9594 2135         4032 last;
9595             }
9596             }
9597 2170         5339 return ( $i, $error );
9598             } ## end sub inverse_pretoken_map
9599              
9600             sub find_here_doc {
9601              
9602             # find the target of a here document, if any
9603             # input parameters:
9604             # $i - token index of the second < of <<
9605             # ($i must be less than the last token index if this is called)
9606             # output parameters:
9607             # $found_target = 0 didn't find target; =1 found target
9608             # HERE_TARGET - the target string (may be empty string)
9609             # $i - unchanged if not here doc,
9610             # or index of the last token of the here target
9611             # $saw_error - flag noting unbalanced quote on here target
9612 9     9 0 44 my ( $self, $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9613              
9614 9         22 my $ibeg = $i;
9615 9         23 my $found_target = 0;
9616 9         27 my $here_doc_target = EMPTY_STRING;
9617 9         22 my $here_quote_character = EMPTY_STRING;
9618 9         26 my $saw_error = 0;
9619 9         29 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9620 9         26 $next_token = $rtokens->[ $i + 1 ];
9621              
9622             # perl allows a backslash before the target string (heredoc.t)
9623 9         24 my $backslash = 0;
9624 9 50       40 if ( $next_token eq '\\' ) {
9625 0         0 $backslash = 1;
9626 0         0 $next_token = $rtokens->[ $i + 2 ];
9627             }
9628              
9629 9         50 ( $next_nonblank_token, $i_next_nonblank ) =
9630             find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9631              
9632 9 100 33     88 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
    50          
    50          
9633              
9634 6         17 my $in_quote = 1;
9635 6         13 my $quote_depth = 0;
9636 6         15 my $quote_pos = 0;
9637 6         14 my $quoted_string;
9638              
9639             (
9640 6         38 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9641             $quoted_string
9642             )
9643             = $self->follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9644             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9645              
9646 6 50       37 if ($in_quote) { # didn't find end of quote, so no target found
9647 0         0 $i = $ibeg;
9648 0 0       0 if ( $expecting == TERM ) {
9649 0         0 $self->warning(
9650             "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9651             );
9652 0         0 $saw_error = 1;
9653             }
9654             }
9655             else { # found ending quote
9656 6         19 $found_target = 1;
9657              
9658 6         11 my $tokj;
9659 6         28 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9660 6         20 $tokj = $rtokens->[$j];
9661              
9662             # we have to remove any backslash before the quote character
9663             # so that the here-doc-target exactly matches this string
9664             next
9665 6 0 33     37 if ( $tokj eq "\\"
      33        
9666             && $j < $i - 1
9667             && $rtokens->[ $j + 1 ] eq $here_quote_character );
9668 6         25 $here_doc_target .= $tokj;
9669             }
9670             }
9671             }
9672              
9673             elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9674 0         0 $found_target = 1;
9675 0         0 $self->write_logfile_entry(
9676             "found blank here-target after <<; suggest using \"\"\n");
9677 0         0 $i = $ibeg;
9678             }
9679             elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
9680              
9681 3         9 my $here_doc_expected;
9682 3 50       20 if ( $expecting == UNKNOWN ) {
9683 0         0 $here_doc_expected = $self->guess_if_here_doc($next_token);
9684             }
9685             else {
9686 3         11 $here_doc_expected = 1;
9687             }
9688              
9689 3 50       13 if ($here_doc_expected) {
9690 3         11 $found_target = 1;
9691 3         7 $here_doc_target = $next_token;
9692 3         8 $i = $ibeg + 1;
9693             }
9694              
9695             }
9696             else {
9697              
9698 0 0       0 if ( $expecting == TERM ) {
9699 0         0 $found_target = 1;
9700 0         0 $self->write_logfile_entry("Note: bare here-doc operator <<\n");
9701             }
9702             else {
9703 0         0 $i = $ibeg;
9704             }
9705             }
9706              
9707             # patch to neglect any prepended backslash
9708 9 50 33     68 if ( $found_target && $backslash ) { $i++ }
  0         0  
9709              
9710 9         53 return ( $found_target, $here_doc_target, $here_quote_character, $i,
9711             $saw_error );
9712             } ## end sub find_here_doc
9713              
9714             sub do_quote {
9715              
9716             # follow (or continue following) quoted string(s)
9717             # $in_quote return code:
9718             # 0 - ok, found end
9719             # 1 - still must find end of quote whose target is $quote_character
9720             # 2 - still looking for end of first of two quotes
9721             #
9722             # Returns updated strings:
9723             # $quoted_string_1 = quoted string seen while in_quote=1
9724             # $quoted_string_2 = quoted string seen while in_quote=2
9725             my (
9726              
9727 2768     2768 0 7959 $self,
9728             $i,
9729             $in_quote,
9730             $quote_character,
9731             $quote_pos,
9732             $quote_depth,
9733             $quoted_string_1,
9734             $quoted_string_2,
9735             $rtokens,
9736             $rtoken_map,
9737             $max_token_index,
9738              
9739             ) = @_;
9740              
9741 2768         4093 my $quoted_string;
9742 2768 100       6353 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
9743 29         58 my $ibeg = $i;
9744             (
9745 29         154 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9746             $quoted_string
9747             )
9748             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
9749             $quote_character, $quote_pos, $quote_depth, $max_token_index );
9750 29         86 $quoted_string_2 .= $quoted_string;
9751 29 50       101 if ( $in_quote == 1 ) {
9752 29 50       110 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
  0         0  
9753 29         71 $quote_character = EMPTY_STRING;
9754             }
9755             else {
9756 0         0 $quoted_string_2 .= "\n";
9757             }
9758             }
9759              
9760 2768 50       5979 if ( $in_quote == 1 ) { # one (more) quote to follow
9761 2768         4425 my $ibeg = $i;
9762             (
9763 2768         7652 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9764             $quoted_string
9765             )
9766             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
9767             $quote_character, $quote_pos, $quote_depth, $max_token_index );
9768 2768         5952 $quoted_string_1 .= $quoted_string;
9769 2768 100       6210 if ( $in_quote == 1 ) {
9770 183         375 $quoted_string_1 .= "\n";
9771             }
9772             }
9773             return (
9774              
9775 2768         9569 $i,
9776             $in_quote,
9777             $quote_character,
9778             $quote_pos,
9779             $quote_depth,
9780             $quoted_string_1,
9781             $quoted_string_2,
9782              
9783             );
9784             } ## end sub do_quote
9785              
9786             sub follow_quoted_string {
9787              
9788             # scan for a specific token, skipping escaped characters
9789             # if the quote character is blank, use the first non-blank character
9790             # input parameters:
9791             # $rtokens = reference to the array of tokens
9792             # $i = the token index of the first character to search
9793             # $in_quote = number of quoted strings being followed
9794             # $beginning_tok = the starting quote character
9795             # $quote_pos = index to check next for alphanumeric delimiter
9796             # output parameters:
9797             # $i = the token index of the ending quote character
9798             # $in_quote = decremented if found end, unchanged if not
9799             # $beginning_tok = the starting quote character
9800             # $quote_pos = index to check next for alphanumeric delimiter
9801             # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9802             # $quoted_string = the text of the quote (without quotation tokens)
9803             my (
9804              
9805 2814     2814 0 6539 $self,
9806             $i_beg,
9807             $in_quote,
9808             $rtokens,
9809             $beginning_tok,
9810             $quote_pos,
9811             $quote_depth,
9812             $max_token_index,
9813              
9814             ) = @_;
9815              
9816 2814         4447 my ( $tok, $end_tok );
9817 2814         4690 my $i = $i_beg - 1;
9818 2814         4248 my $quoted_string = EMPTY_STRING;
9819              
9820 2814         4024 0 && do {
9821             print {*STDOUT}
9822             "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9823             };
9824              
9825             # get the corresponding end token
9826 2814 100       13276 if ( $beginning_tok !~ /^\s*$/ ) {
9827 183         560 $end_tok = matching_end_token($beginning_tok);
9828             }
9829              
9830             # a blank token means we must find and use the first non-blank one
9831             else {
9832 2631 100       6567 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9833              
9834 2631         6092 while ( $i < $max_token_index ) {
9835 2631         4996 $tok = $rtokens->[ ++$i ];
9836              
9837 2631 50       8026 if ( $tok !~ /^\s*$/ ) {
9838              
9839 2631 50 66     7838 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9840 0         0 $i = $max_token_index;
9841             }
9842             else {
9843              
9844 2631 100       5746 if ( length($tok) > 1 ) {
9845 1 50       5 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
  1         2  
9846 1         5 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9847             }
9848             else {
9849 2630         5328 $beginning_tok = $tok;
9850 2630         4067 $quote_pos = 0;
9851             }
9852 2631         6289 $end_tok = matching_end_token($beginning_tok);
9853 2631         4642 $quote_depth = 1;
9854 2631         4694 last;
9855             }
9856             }
9857             else {
9858 0         0 $allow_quote_comments = 1;
9859             }
9860             }
9861             }
9862              
9863             # There are two different loops which search for the ending quote
9864             # character. In the rare case of an alphanumeric quote delimiter, we
9865             # have to look through alphanumeric tokens character-by-character, since
9866             # the pre-tokenization process combines multiple alphanumeric
9867             # characters, whereas for a non-alphanumeric delimiter, only tokens of
9868             # length 1 can match.
9869              
9870             #----------------------------------------------------------------
9871             # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9872             # "quote_pos" is the position the current word to begin searching
9873             #----------------------------------------------------------------
9874 2814 100       7448 if ( $beginning_tok =~ /\w/ ) {
9875              
9876             # Note this because it is not recommended practice except
9877             # for obfuscated perl contests
9878 1 50       5 if ( $in_quote == 1 ) {
9879 1         7 $self->write_logfile_entry(
9880             "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9881             }
9882              
9883             # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9884 1         12 while ( $i <= $max_token_index ) {
9885              
9886 4 100 66     15 if ( $quote_pos == 0 || ( $i < 0 ) ) {
9887 3         17 $tok = $rtokens->[ ++$i ];
9888              
9889 3 100       10 if ( $tok eq '\\' ) {
9890              
9891             # retain backslash unless it hides the end token
9892 1 50       26 $quoted_string .= $tok
9893             unless $rtokens->[ $i + 1 ] eq $end_tok;
9894 1         2 $quote_pos++;
9895 1 50       6 last if ( $i >= $max_token_index );
9896 1         7 $tok = $rtokens->[ ++$i ];
9897             }
9898             }
9899 4         6 my $old_pos = $quote_pos;
9900              
9901 4         8 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9902              
9903 4 100       11 if ( $quote_pos > 0 ) {
9904              
9905 1         5 $quoted_string .=
9906             substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9907              
9908             # NOTE: any quote modifiers will be at the end of '$tok'. If we
9909             # wanted to check them, this is the place to get them. But
9910             # this quote form is rarely used in practice, so it isn't
9911             # worthwhile.
9912              
9913 1         1 $quote_depth--;
9914              
9915 1 50       5 if ( $quote_depth == 0 ) {
9916 1         2 $in_quote--;
9917 1         2 last;
9918             }
9919             }
9920             else {
9921 3 50       8 if ( $old_pos <= length($tok) ) {
9922 3         10 $quoted_string .= substr( $tok, $old_pos );
9923             }
9924             }
9925             }
9926             }
9927              
9928             #-----------------------------------------------------------------------
9929             # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9930             #-----------------------------------------------------------------------
9931             else {
9932              
9933 2813         6776 while ( $i < $max_token_index ) {
9934 10798         16451 $tok = $rtokens->[ ++$i ];
9935              
9936 10798 100       24351 if ( $tok eq $end_tok ) {
    100          
    100          
9937 2620         4169 $quote_depth--;
9938              
9939 2620 100       6051 if ( $quote_depth == 0 ) {
9940 2619         3745 $in_quote--;
9941 2619         4151 last;
9942             }
9943             }
9944             elsif ( $tok eq $beginning_tok ) {
9945 1         12 $quote_depth++;
9946             }
9947             elsif ( $tok eq '\\' ) {
9948              
9949             # retain backslash unless it hides the beginning or end token
9950 376         1003 $tok = $rtokens->[ ++$i ];
9951 376 100 100     2060 $quoted_string .= '\\'
9952             if ( $tok ne $end_tok && $tok ne $beginning_tok );
9953             }
9954             else {
9955             ## nothing special
9956             }
9957 8179         15531 $quoted_string .= $tok;
9958             }
9959             }
9960 2814 50       6485 if ( $i > $max_token_index ) { $i = $max_token_index }
  0         0  
9961             return (
9962              
9963 2814         10866 $i,
9964             $in_quote,
9965             $beginning_tok,
9966             $quote_pos,
9967             $quote_depth,
9968             $quoted_string,
9969              
9970             );
9971             } ## end sub follow_quoted_string
9972              
9973             sub indicate_error {
9974 0     0 0 0 my ( $self, $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9975 0         0 $self->interrupt_logfile();
9976 0         0 $self->warning($msg);
9977 0         0 $self->write_error_indicator_pair( $line_number, $input_line, $pos,
9978             $carrat );
9979 0         0 $self->resume_logfile();
9980 0         0 return;
9981             } ## end sub indicate_error
9982              
9983             sub write_error_indicator_pair {
9984 0     0 0 0 my ( $self, $line_number, $input_line, $pos, $carrat ) = @_;
9985 0         0 my ( $offset, $numbered_line, $underline ) =
9986             make_numbered_line( $line_number, $input_line, $pos );
9987 0         0 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9988 0         0 $self->warning( $numbered_line . "\n" );
9989 0         0 $underline =~ s/\s*$//;
9990 0         0 $self->warning( $underline . "\n" );
9991 0         0 return;
9992             } ## end sub write_error_indicator_pair
9993              
9994             sub make_numbered_line {
9995              
9996             # Given an input line, its line number, and a character position of
9997             # interest, create a string not longer than 80 characters of the form
9998             # $lineno: sub_string
9999             # such that the sub_string of $str contains the position of interest
10000             #
10001             # Here is an example of what we want, in this case we add trailing
10002             # '...' because the line is long.
10003             #
10004             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10005             #
10006             # Here is another example, this time in which we used leading '...'
10007             # because of excessive length:
10008             #
10009             # 2: ... er of the World Wide Web Consortium's
10010             #
10011             # input parameters are:
10012             # $lineno = line number
10013             # $str = the text of the line
10014             # $pos = position of interest (the error) : 0 = first character
10015             #
10016             # We return :
10017             # - $offset = an offset which corrects the position in case we only
10018             # display part of a line, such that $pos-$offset is the effective
10019             # position from the start of the displayed line.
10020             # - $numbered_line = the numbered line as above,
10021             # - $underline = a blank 'underline' which is all spaces with the same
10022             # number of characters as the numbered line.
10023              
10024 0     0 0 0 my ( $lineno, $str, $pos ) = @_;
10025 0 0       0 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
10026 0         0 my $excess = length($str) - $offset - 68;
10027 0 0       0 my $numc = ( $excess > 0 ) ? 68 : undef;
10028              
10029 0 0       0 if ( defined($numc) ) {
10030 0 0       0 if ( $offset == 0 ) {
10031 0         0 $str = substr( $str, $offset, $numc - 4 ) . " ...";
10032             }
10033             else {
10034 0         0 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
10035             }
10036             }
10037             else {
10038              
10039 0 0       0 if ( $offset == 0 ) {
10040             }
10041             else {
10042 0         0 $str = "... " . substr( $str, $offset + 4 );
10043             }
10044             }
10045              
10046 0         0 my $numbered_line = sprintf( "%d: ", $lineno );
10047 0         0 $offset -= length($numbered_line);
10048 0         0 $numbered_line .= $str;
10049 0         0 my $underline = SPACE x length($numbered_line);
10050 0         0 return ( $offset, $numbered_line, $underline );
10051             } ## end sub make_numbered_line
10052              
10053             sub write_on_underline {
10054              
10055             # The "underline" is a string that shows where an error is; it starts
10056             # out as a string of blanks with the same length as the numbered line of
10057             # code above it, and we have to add marking to show where an error is.
10058             # In the example below, we want to write the string '--^' just below
10059             # the line of bad code:
10060             #
10061             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10062             # ---^
10063             # We are given the current underline string, plus a position and a
10064             # string to write on it.
10065             #
10066             # In the above example, there will be 2 calls to do this:
10067             # First call: $pos=19, pos_chr=^
10068             # Second call: $pos=16, pos_chr=---
10069             #
10070             # This is a trivial thing to do with substr, but there is some
10071             # checking to do.
10072              
10073 0     0 0 0 my ( $underline, $pos, $pos_chr ) = @_;
10074              
10075             # check for error..shouldn't happen
10076 0 0 0     0 if ( $pos < 0 || $pos > length($underline) ) {
10077 0         0 return $underline;
10078             }
10079 0         0 my $excess = length($pos_chr) + $pos - length($underline);
10080 0 0       0 if ( $excess > 0 ) {
10081 0         0 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
10082             }
10083 0         0 substr( $underline, $pos, length($pos_chr), $pos_chr );
10084 0         0 return ($underline);
10085             } ## end sub write_on_underline
10086              
10087             sub pre_tokenize {
10088              
10089 6195     6195 0 12861 my ( $str, $max_tokens_wanted ) = @_;
10090              
10091             # Input parameters:
10092             # $str = string to be parsed
10093             # $max_tokens_wanted > 0 to stop on reaching this many tokens.
10094             # = undef or 0 means get all tokens
10095              
10096             # Break a string, $str, into a sequence of preliminary tokens (pre-tokens).
10097             # We look for these types of tokens:
10098             # words (type='w'), example: 'max_tokens_wanted'
10099             # digits (type = 'd'), example: '0755'
10100             # whitespace (type = 'b'), example: ' '
10101             # single character punct (type = char) example: '='
10102              
10103             # Later operations will combine one or more of these pre-tokens into final
10104             # tokens. We cannot do better than this yet because we might be in a
10105             # quoted string or pattern.
10106              
10107             # An advantage of doing this pre-tokenization step is that it keeps almost
10108             # all of the regex parsing very simple and localized right here. A
10109             # disadvantage is that in some extremely rare instances we will have to go
10110             # back and split a pre-token.
10111              
10112             # Return parameters:
10113 6195         10273 my @tokens = (); # array of the tokens themselves
10114 6195         12682 my @token_map = (0); # string position of start of each token
10115 6195         10091 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
10116              
10117 6195 100       13121 if ( !$max_tokens_wanted ) { $max_tokens_wanted = -1 }
  5910         9137  
10118              
10119 6195         14214 while ( $max_tokens_wanted-- ) {
10120              
10121 82230 100       229074 if (
10122             $str =~ m{
10123             \G(
10124             (\s+) # type 'b' = whitespace - this must come before \W
10125             | (\W) # or type=char = single-character, non-whitespace punct
10126             | (\d+) # or type 'd' = sequence of digits - must come before \w
10127             | (\w+) # or type 'w' = words not starting with a digit
10128             )
10129             }gcx
10130             )
10131             {
10132 76179         161064 push @tokens, $1;
10133 76179 100       186872 push @type,
    100          
    100          
10134             defined($2) ? 'b' : defined($3) ? $1 : defined($4) ? 'd' : 'w';
10135 76179         150863 push @token_map, pos($str);
10136             }
10137              
10138             # that's all..
10139             else {
10140 6051         37545 return ( \@tokens, \@token_map, \@type );
10141             }
10142             }
10143              
10144 144         917 return ( \@tokens, \@token_map, \@type );
10145             } ## end sub pre_tokenize
10146              
10147             sub show_tokens {
10148              
10149             # this is an old debug routine
10150             # not called, but saved for reference
10151 0     0 0 0 my ( $rtokens, $rtoken_map ) = @_;
10152 0         0 my $num = scalar( @{$rtokens} );
  0         0  
10153              
10154 0         0 foreach my $i ( 0 .. $num - 1 ) {
10155 0         0 my $len = length( $rtokens->[$i] );
10156 0         0 print {*STDOUT} "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
  0         0  
10157             }
10158 0         0 return;
10159             } ## end sub show_tokens
10160              
10161             { ## closure for sub matching end token
10162             my %matching_end_token;
10163              
10164             BEGIN {
10165 39     39   62235 %matching_end_token = (
10166             '{' => '}',
10167             '(' => ')',
10168             '[' => ']',
10169             '<' => '>',
10170             );
10171             } ## end BEGIN
10172              
10173             sub matching_end_token {
10174              
10175             # return closing character for a pattern
10176 2998     2998 0 4991 my $beginning_token = shift;
10177 2998 100       7656 if ( $matching_end_token{$beginning_token} ) {
10178 373         882 return $matching_end_token{$beginning_token};
10179             }
10180 2625         5924 return ($beginning_token);
10181             } ## end sub matching_end_token
10182             }
10183              
10184             sub dump_token_types {
10185 0     0 0   my ( $class, $fh ) = @_;
10186              
10187             # This should be the latest list of token types in use
10188             # adding NEW_TOKENS: add a comment here
10189 0           $fh->print(<<'END_OF_LIST');
10190              
10191             Here is a list of the token types currently used for lines of type 'CODE'.
10192             For the following tokens, the "type" of a token is just the token itself.
10193              
10194             .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
10195             ( ) <= >= == =~ !~ != ++ -- /= x=
10196             ... **= <<= >>= &&= ||= //= <=>
10197             , + - / * | % ! x ~ = \ ? : . < > ^ &
10198              
10199             The following additional token types are defined:
10200              
10201             type meaning
10202             b blank (white space)
10203             { indent: opening structural curly brace or square bracket or paren
10204             (code block, anonymous hash reference, or anonymous array reference)
10205             } outdent: right structural curly brace or square bracket or paren
10206             [ left non-structural square bracket (enclosing an array index)
10207             ] right non-structural square bracket
10208             ( left non-structural paren (all but a list right of an =)
10209             ) right non-structural paren
10210             L left non-structural curly brace (enclosing a key)
10211             R right non-structural curly brace
10212             ; terminal semicolon
10213             f indicates a semicolon in a "for" statement
10214             h here_doc operator <<
10215             # a comment
10216             Q indicates a quote or pattern
10217             q indicates a qw quote block
10218             k a perl keyword
10219             C user-defined constant or constant function (with void prototype = ())
10220             U user-defined function taking parameters
10221             G user-defined function taking block parameter (like grep/map/eval)
10222             S sub definition (reported as type 'i' in older versions)
10223             P package definition (reported as type 'i' in older versions)
10224             t type indicater such as %,$,@,*,&,sub
10225             w bare word (perhaps a subroutine call)
10226             i identifier of some type (with leading %, $, @, *, &, sub, -> )
10227             n a number
10228             v a v-string
10229             F a file test operator (like -e)
10230             Y File handle
10231             Z identifier in indirect object slot: may be file handle, object
10232             J LABEL: code block label
10233             j LABEL after next, last, redo, goto
10234             p unary +
10235             m unary -
10236             pp pre-increment operator ++
10237             mm pre-decrement operator --
10238             A : used as attribute separator
10239            
10240             Here are the '_line_type' codes used internally:
10241             SYSTEM - system-specific code before hash-bang line
10242             CODE - line of perl code (including comments)
10243             POD_START - line starting pod, such as '=head'
10244             POD - pod documentation text
10245             POD_END - last line of pod section, '=cut'
10246             HERE - text of here-document
10247             HERE_END - last line of here-doc (target word)
10248             FORMAT - format section
10249             FORMAT_END - last line of format section, '.'
10250             SKIP - code skipping section
10251             SKIP_END - last line of code skipping section, '#>>V'
10252             DATA_START - __DATA__ line
10253             DATA - unidentified text following __DATA__
10254             END_START - __END__ line
10255             END - unidentified text following __END__
10256             ERROR - we are in big trouble, probably not a perl script
10257             END_OF_LIST
10258              
10259 0           return;
10260             } ## end sub dump_token_types
10261              
10262             BEGIN {
10263              
10264             # These names are used in error messages
10265 39     39   373 @opening_brace_names = qw# '{' '[' '(' '?' #;
10266 39         294 @closing_brace_names = qw# '}' ']' ')' ':' #;
10267              
10268 39         161 my @q;
10269              
10270 39         262 my @digraphs = qw(
10271             .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
10272             <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
10273             );
10274 39         692 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
10275              
10276 39         237 @q = qw(
10277             . : < > * & | / - = + - % ^ ! x ~
10278             );
10279 39         380 @can_start_digraph{@q} = (1) x scalar(@q);
10280              
10281 39         183 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
10282 39         294 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
10283              
10284 39         114 my @tetragraphs = qw( <<>> );
10285 39         194 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
10286              
10287             # make a hash of all valid token types for self-checking the tokenizer
10288             # (adding NEW_TOKENS : select a new character and add to this list)
10289             # fix for c250: added new token type 'P' and 'S'
10290 39         764 my @valid_token_types = qw#
10291             A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P S
10292             { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
10293             #;
10294 39         313 push( @valid_token_types, @digraphs );
10295 39         228 push( @valid_token_types, @trigraphs );
10296 39         101 push( @valid_token_types, @tetragraphs );
10297 39         95 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
10298 39         1734 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
10299              
10300             # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
10301 39         260 my @file_test_operators =
10302             qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
10303 39         542 @is_file_test_operator{@file_test_operators} =
10304             (1) x scalar(@file_test_operators);
10305              
10306             # these functions have prototypes of the form (&), so when they are
10307             # followed by a block, that block MAY BE followed by an operator.
10308             # Smartmatch operator ~~ may be followed by anonymous hash or array ref
10309 39         178 @q = qw( do eval );
10310 39         186 @is_block_operator{@q} = (1) x scalar(@q);
10311              
10312             # these functions allow an identifier in the indirect object slot
10313 39         112 @q = qw( print printf sort exec system say);
10314 39         222 @is_indirect_object_taker{@q} = (1) x scalar(@q);
10315              
10316             # Note: 'field' will be added by sub check_options if --use-feature=class
10317 39         128 @q = qw(my our state);
10318 39         149 @is_my_our_state{@q} = (1) x scalar(@q);
10319              
10320             # These tokens may precede a code block
10321             # patched for SWITCH/CASE/CATCH. Actually these could be removed
10322             # now and we could let the extended-syntax coding handle them.
10323             # Added 'default' for Switch::Plain.
10324             # Note: 'ADJUST' will be added by sub check_options if --use-feature=class
10325 39         255 @q =
10326             qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
10327             unless do while until eval for foreach map grep sort
10328             switch case given when default catch try finally);
10329 39         812 @is_code_block_token{@q} = (1) x scalar(@q);
10330              
10331             # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
10332             # to contrast it with the block types in '%is_zero_continuation_block_type'
10333 39         208 @q = qw( sort map grep eval do );
10334 39         160 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
10335              
10336 39         99 @q = qw( sort map grep );
10337 39         154 @is_sort_map_grep{@q} = (1) x scalar(@q);
10338              
10339 39         85 %is_grep_alias = ();
10340              
10341             # I'll build the list of keywords incrementally
10342 39         93 my @Keywords = ();
10343              
10344             # keywords and tokens after which a value or pattern is expected,
10345             # but not an operator. In other words, these should consume terms
10346             # to their right, or at least they are not expected to be followed
10347             # immediately by operators.
10348 39         1547 my @value_requestor = qw(
10349             AUTOLOAD
10350             BEGIN
10351             CHECK
10352             DESTROY
10353             END
10354             EQ
10355             GE
10356             GT
10357             INIT
10358             LE
10359             LT
10360             NE
10361             UNITCHECK
10362             abs
10363             accept
10364             alarm
10365             and
10366             atan2
10367             bind
10368             binmode
10369             bless
10370             break
10371             caller
10372             chdir
10373             chmod
10374             chomp
10375             chop
10376             chown
10377             chr
10378             chroot
10379             close
10380             closedir
10381             cmp
10382             connect
10383             continue
10384             cos
10385             crypt
10386             dbmclose
10387             dbmopen
10388             defined
10389             delete
10390             die
10391             dump
10392             each
10393             else
10394             elsif
10395             eof
10396             eq
10397             evalbytes
10398             exec
10399             exists
10400             exit
10401             exp
10402             fc
10403             fcntl
10404             fileno
10405             flock
10406             for
10407             foreach
10408             formline
10409             ge
10410             getc
10411             getgrgid
10412             getgrnam
10413             gethostbyaddr
10414             gethostbyname
10415             getnetbyaddr
10416             getnetbyname
10417             getpeername
10418             getpgrp
10419             getpriority
10420             getprotobyname
10421             getprotobynumber
10422             getpwnam
10423             getpwuid
10424             getservbyname
10425             getservbyport
10426             getsockname
10427             getsockopt
10428             glob
10429             gmtime
10430             goto
10431             grep
10432             gt
10433             hex
10434             if
10435             index
10436             int
10437             ioctl
10438             join
10439             keys
10440             kill
10441             last
10442             lc
10443             lcfirst
10444             le
10445             length
10446             link
10447             listen
10448             local
10449             localtime
10450             lock
10451             log
10452             lstat
10453             lt
10454             map
10455             mkdir
10456             msgctl
10457             msgget
10458             msgrcv
10459             msgsnd
10460             my
10461             ne
10462             next
10463             no
10464             not
10465             oct
10466             open
10467             opendir
10468             or
10469             ord
10470             our
10471             pack
10472             pipe
10473             pop
10474             pos
10475             print
10476             printf
10477             prototype
10478             push
10479             quotemeta
10480             rand
10481             read
10482             readdir
10483             readlink
10484             readline
10485             readpipe
10486             recv
10487             redo
10488             ref
10489             rename
10490             require
10491             reset
10492             return
10493             reverse
10494             rewinddir
10495             rindex
10496             rmdir
10497             scalar
10498             seek
10499             seekdir
10500             select
10501             semctl
10502             semget
10503             semop
10504             send
10505             sethostent
10506             setnetent
10507             setpgrp
10508             setpriority
10509             setprotoent
10510             setservent
10511             setsockopt
10512             shift
10513             shmctl
10514             shmget
10515             shmread
10516             shmwrite
10517             shutdown
10518             sin
10519             sleep
10520             socket
10521             socketpair
10522             sort
10523             splice
10524             split
10525             sprintf
10526             sqrt
10527             srand
10528             stat
10529             state
10530             study
10531             substr
10532             symlink
10533             syscall
10534             sysopen
10535             sysread
10536             sysseek
10537             system
10538             syswrite
10539             tell
10540             telldir
10541             tie
10542             tied
10543             truncate
10544             uc
10545             ucfirst
10546             umask
10547             undef
10548             unless
10549             unlink
10550             unpack
10551             unshift
10552             untie
10553             until
10554             use
10555             utime
10556             values
10557             vec
10558             waitpid
10559             warn
10560             while
10561             write
10562             xor
10563              
10564             switch
10565             case
10566             default
10567             given
10568             when
10569             err
10570             say
10571             isa
10572              
10573             catch
10574              
10575             );
10576              
10577             # Note: 'ADJUST', 'field' are added by sub check_options
10578             # if --use-feature=class
10579              
10580             # patched above for SWITCH/CASE given/when err say
10581             # 'err' is a fairly safe addition.
10582             # Added 'default' for Switch::Plain. Note that we could also have
10583             # a separate set of keywords to include if we see 'use Switch::Plain'
10584 39         1627 push( @Keywords, @value_requestor );
10585              
10586             # These are treated the same but are not keywords:
10587 39         148 my @extra_vr = qw(
10588             constant
10589             vars
10590             );
10591 39         281 push( @value_requestor, @extra_vr );
10592              
10593 39         4772 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10594              
10595             # this list contains keywords which do not look for arguments,
10596             # so that they might be followed by an operator, or at least
10597             # not a term.
10598 39         372 my @operator_requestor = qw(
10599             endgrent
10600             endhostent
10601             endnetent
10602             endprotoent
10603             endpwent
10604             endservent
10605             fork
10606             getgrent
10607             gethostent
10608             getlogin
10609             getnetent
10610             getppid
10611             getprotoent
10612             getpwent
10613             getservent
10614             setgrent
10615             setpwent
10616             time
10617             times
10618             wait
10619             wantarray
10620             );
10621              
10622 39         151 push( @Keywords, @operator_requestor );
10623              
10624             # These are treated the same but are not considered keywords:
10625 39         93 my @extra_or = qw(
10626             STDERR
10627             STDIN
10628             STDOUT
10629             );
10630              
10631 39         121 push( @operator_requestor, @extra_or );
10632              
10633 39         777 @expecting_operator_token{@operator_requestor} =
10634             (1) x scalar(@operator_requestor);
10635              
10636             # these token TYPES expect trailing operator but not a term
10637             # note: ++ and -- are post-increment and decrement, 'C' = constant
10638 39         170 my @operator_requestor_types = qw( ++ -- C <> q );
10639              
10640             # NOTE: This hash is available but not currently used
10641 39         178 @expecting_operator_types{@operator_requestor_types} =
10642             (1) x scalar(@operator_requestor_types);
10643              
10644             # these token TYPES consume values (terms)
10645             # note: pp and mm are pre-increment and decrement
10646             # f=semicolon in for, F=file test operator
10647 39         768 my @value_requestor_type = qw#
10648             L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10649             **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10650             <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10651             f F pp mm Y p m U J G j >> << ^ t
10652             ~. ^. |. &. ^.= |.= &.=
10653             #;
10654 39         281 push( @value_requestor_type, ',' )
10655             ; # (perl doesn't like a ',' in a qw block)
10656              
10657             # NOTE: This hash is available but not currently used
10658 39         1025 @expecting_term_types{@value_requestor_type} =
10659             (1) x scalar(@value_requestor_type);
10660              
10661             # Note: the following valid token types are not assigned here to
10662             # hashes requesting to be followed by values or terms, but are
10663             # instead currently hard-coded into sub operator_expected:
10664             # ) -> :: Q R Z ] b h i k n v w } #
10665              
10666             # For simple syntax checking, it is nice to have a list of operators which
10667             # will really be unhappy if not followed by a term. This includes most
10668             # of the above...
10669 39         1292 @really_want_term{@value_requestor_type} =
10670             (1) x scalar(@value_requestor_type);
10671              
10672             # with these exceptions...
10673 39         182 delete $really_want_term{'U'}; # user sub, depends on prototype
10674 39         105 delete $really_want_term{'F'}; # file test works on $_ if no following term
10675 39         88 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10676             # let perl do it
10677 39         262 @q = qw(q qq qx qr s y tr m);
10678 39         239 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10679              
10680             # Note added 'qw' here
10681 39         153 @q = qw(q qq qw qx qr s y tr m);
10682 39         194 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10683              
10684             # Note: 'class' will be added by sub check_options if -use-feature=class
10685 39         125 @q = qw(package);
10686 39         164 @is_package{@q} = (1) x scalar(@q);
10687              
10688 39         101 @q = qw( if elsif unless );
10689 39         114 @is_if_elsif_unless{@q} = (1) x scalar(@q);
10690              
10691 39         108 @q = qw( ; t );
10692 39         158 @is_semicolon_or_t{@q} = (1) x scalar(@q);
10693              
10694 39         129 @q = qw( if elsif unless case when );
10695 39         168 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10696              
10697             # Hash of other possible line endings which may occur.
10698             # Keep these coordinated with the regex where this is used.
10699             # Note: chr(13) = chr(015)="\r".
10700 39         152 @q = ( chr(13), chr(29), chr(26) );
10701 39         189 @other_line_endings{@q} = (1) x scalar(@q);
10702              
10703             # These keywords are handled specially in the tokenizer code:
10704 39         134 my @special_keywords = qw(
10705             do
10706             eval
10707             format
10708             m
10709             package
10710             q
10711             qq
10712             qr
10713             qw
10714             qx
10715             s
10716             sub
10717             tr
10718             y
10719             );
10720 39         387 push( @Keywords, @special_keywords );
10721              
10722             # Keywords after which list formatting may be used
10723             # WARNING: do not include |map|grep|eval or perl may die on
10724             # syntax errors (map1.t).
10725 39         623 my @keyword_taking_list = qw(
10726             and
10727             chmod
10728             chomp
10729             chop
10730             chown
10731             dbmopen
10732             die
10733             elsif
10734             exec
10735             fcntl
10736             for
10737             foreach
10738             formline
10739             getsockopt
10740             if
10741             index
10742             ioctl
10743             join
10744             kill
10745             local
10746             msgctl
10747             msgrcv
10748             msgsnd
10749             my
10750             open
10751             or
10752             our
10753             pack
10754             print
10755             printf
10756             push
10757             read
10758             readpipe
10759             recv
10760             return
10761             reverse
10762             rindex
10763             seek
10764             select
10765             semctl
10766             semget
10767             send
10768             setpriority
10769             setsockopt
10770             shmctl
10771             shmget
10772             shmread
10773             shmwrite
10774             socket
10775             socketpair
10776             sort
10777             splice
10778             split
10779             sprintf
10780             state
10781             substr
10782             syscall
10783             sysopen
10784             sysread
10785             sysseek
10786             system
10787             syswrite
10788             tie
10789             unless
10790             unlink
10791             unpack
10792             unshift
10793             until
10794             vec
10795             warn
10796             while
10797             given
10798             when
10799             );
10800              
10801             # NOTE: This hash is available but not currently used
10802 39         1307 @is_keyword_taking_list{@keyword_taking_list} =
10803             (1) x scalar(@keyword_taking_list);
10804              
10805             # perl functions which may be unary operators.
10806              
10807             # This list is used to decide if a pattern delimited by slashes, /pattern/,
10808             # can follow one of these keywords.
10809 39         218 @q = qw(
10810             chomp eof eval fc lc pop shift uc undef
10811             );
10812              
10813 39         269 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10814             (1) x scalar(@q);
10815              
10816             # These are keywords for which an arg may optionally be omitted. They are
10817             # currently only used to disambiguate a ? used as a ternary from one used
10818             # as a (deprecated) pattern delimiter. In the future, they might be used
10819             # to give a warning about ambiguous syntax before a /.
10820             # Note: split has been omitted (see note below).
10821 39         499 my @keywords_taking_optional_arg = qw(
10822             abs
10823             alarm
10824             caller
10825             chdir
10826             chomp
10827             chop
10828             chr
10829             chroot
10830             close
10831             cos
10832             defined
10833             die
10834             eof
10835             eval
10836             evalbytes
10837             exit
10838             exp
10839             fc
10840             getc
10841             glob
10842             gmtime
10843             hex
10844             int
10845             last
10846             lc
10847             lcfirst
10848             length
10849             localtime
10850             log
10851             lstat
10852             mkdir
10853             next
10854             oct
10855             ord
10856             pop
10857             pos
10858             print
10859             printf
10860             prototype
10861             quotemeta
10862             rand
10863             readline
10864             readlink
10865             readpipe
10866             redo
10867             ref
10868             require
10869             reset
10870             reverse
10871             rmdir
10872             say
10873             select
10874             shift
10875             sin
10876             sleep
10877             sqrt
10878             srand
10879             stat
10880             study
10881             tell
10882             uc
10883             ucfirst
10884             umask
10885             undef
10886             unlink
10887             warn
10888             write
10889             );
10890 39         1074 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10891             (1) x scalar(@keywords_taking_optional_arg);
10892              
10893             # This list is used to decide if a pattern delimited by question marks,
10894             # ?pattern?, can follow one of these keywords. Note that from perl 5.22
10895             # on, a ?pattern? is not recognized, so we can be much more strict than
10896             # with a /pattern/. Note that 'split' is not in this list. In current
10897             # versions of perl a question following split must be a ternary, but
10898             # in older versions it could be a pattern. The guessing algorithm will
10899             # decide. We are combining two lists here to simplify the test.
10900 39         772 @q = ( @keywords_taking_optional_arg, @operator_requestor );
10901 39         1574 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10902             (1) x scalar(@q);
10903              
10904             # These are not used in any way yet
10905             # my @unused_keywords = qw(
10906             # __FILE__
10907             # __LINE__
10908             # __PACKAGE__
10909             # );
10910              
10911             # The list of keywords was originally extracted from function 'keyword' in
10912             # perl file toke.c version 5.005.03, using this utility, plus a
10913             # little editing: (file getkwd.pl):
10914             # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10915             # Add 'get' prefix where necessary, then split into the above lists.
10916             # This list should be updated as necessary.
10917             # The list should not contain these special variables:
10918             # ARGV DATA ENV SIG STDERR STDIN STDOUT
10919             # __DATA__ __END__
10920              
10921 39         9059 @is_keyword{@Keywords} = (1) x scalar(@Keywords);
10922             } ## end BEGIN
10923             1;