File Coverage

blib/lib/Perl/Tidy/Tokenizer.pm
Criterion Covered Total %
statement 2334 3199 72.9
branch 1118 1694 66.0
condition 517 874 59.1
subroutine 160 191 83.7
pod 0 146 0.0
total 4129 6104 67.6


line stmt bran cond sub pod time code
1             #####################################################################
2             #
3             # The Perl::Tidy::Tokenizer package is essentially a filter which
4             # reads lines of perl source code from a source object and provides
5             # corresponding tokenized lines through its get_line() method. Lines
6             # flow from the source_object to the caller like this:
7             #
8             # source_object --> Tokenizer --> calling routine
9             # get_line() get_line() line_of_tokens
10             #
11             # The source object can be a STRING ref, an ARRAY ref, or an object with a
12             # get_line() method which supplies one line (a character string) perl call.
13             # The Tokenizer returns a reference to a data structure 'line_of_tokens'
14             # containing one tokenized line for each call to its get_line() method.
15             #
16             # NOTE: This is not a real class. Only one tokenizer my be used.
17             #
18             ########################################################################
19              
20             package Perl::Tidy::Tokenizer;
21 39     39   306 use strict;
  39         109  
  39         1322  
22 39     39   250 use warnings;
  39         105  
  39         1140  
23 39     39   220 use English qw( -no_match_vars );
  39         127  
  39         283  
24              
25             our $VERSION = '20230909';
26              
27 39     39   14644 use Carp;
  39         116  
  39         2461  
28              
29 39     39   311 use constant DEVEL_MODE => 0;
  39         132  
  39         2491  
30 39     39   275 use constant EMPTY_STRING => q{};
  39         157  
  39         2267  
31 39     39   289 use constant SPACE => q{ };
  39         117  
  39         2522  
32              
33             # Decimal values of some ascii characters for quick checks
34 39     39   286 use constant ORD_TAB => 9;
  39         98  
  39         2044  
35 39     39   265 use constant ORD_SPACE => 32;
  39         127  
  39         2163  
36 39     39   297 use constant ORD_PRINTABLE_MIN => 33;
  39         123  
  39         2033  
37 39     39   279 use constant ORD_PRINTABLE_MAX => 126;
  39         100  
  39         7971  
38              
39             # GLOBAL VARIABLES which change during tokenization:
40             # These could also be stored in $self but it is more convenient and
41             # efficient to make them global lexical variables.
42             # INITIALIZER: sub prepare_for_a_new_file
43             my (
44              
45             $brace_depth,
46             $context,
47             $current_package,
48             $last_nonblank_block_type,
49             $last_nonblank_token,
50             $last_nonblank_type,
51             $next_sequence_number,
52             $paren_depth,
53             $rbrace_context,
54             $rbrace_package,
55             $rbrace_structural_type,
56             $rbrace_type,
57             $rcurrent_depth,
58             $rcurrent_sequence_number,
59             $rdepth_array,
60             $ris_block_function,
61             $ris_block_list_function,
62             $ris_constant,
63             $ris_user_function,
64             $rnested_statement_type,
65             $rnested_ternary_flag,
66             $rparen_semicolon_count,
67             $rparen_vars,
68             $rparen_type,
69             $rsaw_function_definition,
70             $rsaw_use_module,
71             $rsquare_bracket_structural_type,
72             $rsquare_bracket_type,
73             $rstarting_line_of_current_depth,
74             $rtotal_depth,
75             $ruser_function_prototype,
76             $square_bracket_depth,
77             $statement_type,
78             $total_depth,
79             );
80              
81             my (
82              
83             # GLOBAL CONSTANTS for routines in this package,
84             # INITIALIZER: BEGIN block.
85             %can_start_digraph,
86             %expecting_operator_token,
87             %expecting_operator_types,
88             %expecting_term_token,
89             %expecting_term_types,
90             %is_block_operator,
91             %is_digraph,
92             %is_file_test_operator,
93             %is_if_elsif_unless,
94             %is_if_elsif_unless_case_when,
95             %is_indirect_object_taker,
96             %is_keyword_rejecting_question_as_pattern_delimiter,
97             %is_keyword_rejecting_slash_as_pattern_delimiter,
98             %is_keyword_taking_list,
99             %is_keyword_taking_optional_arg,
100             %is_q_qq_qw_qx_qr_s_y_tr_m,
101             %is_q_qq_qx_qr_s_y_tr_m,
102             %is_semicolon_or_t,
103             %is_sort_map_grep,
104             %is_sort_map_grep_eval_do,
105             %is_tetragraph,
106             %is_trigraph,
107             %is_valid_token_type,
108             %other_line_endings,
109             %really_want_term,
110             @closing_brace_names,
111             @opening_brace_names,
112              
113             # GLOBAL CONSTANT hash lookup table of operator expected values
114             # INITIALIZER: BEGIN block
115             %op_expected_table,
116              
117             # GLOBAL VARIABLES which are constant after being configured.
118             # INITIALIZER: BEGIN block and modified by sub check_options
119             %is_code_block_token,
120             %is_keyword,
121             %is_my_our_state,
122             %is_package,
123              
124             # INITIALIZER: sub check_options
125             $code_skipping_pattern_begin,
126             $code_skipping_pattern_end,
127             $rOpts_code_skipping,
128             $rOpts_code_skipping_begin,
129             %is_END_DATA_format_sub,
130             %is_grep_alias,
131             %is_sub,
132             $guess_if_method,
133             );
134              
135             # possible values of operator_expected()
136 39     39   301 use constant TERM => -1;
  39         1477  
  39         2274  
137 39     39   265 use constant UNKNOWN => 0;
  39         138  
  39         2074  
138 39     39   257 use constant OPERATOR => 1;
  39         138  
  39         2357  
139              
140             # possible values of context
141 39     39   320 use constant SCALAR_CONTEXT => -1;
  39         156  
  39         2332  
142 39     39   313 use constant UNKNOWN_CONTEXT => 0;
  39         100  
  39         2013  
143 39     39   265 use constant LIST_CONTEXT => 1;
  39         111  
  39         2251  
144              
145             # Maximum number of little messages; probably need not be changed.
146 39     39   339 use constant MAX_NAG_MESSAGES => 6;
  39         140  
  39         8660  
147              
148 0         0 BEGIN {
149              
150             # Array index names for $self.
151             # Do not combine with other BEGIN blocks (c101).
152 39     39   284081 my $i = 0;
153             use constant {
154 39         26373 _rhere_target_list_ => $i++,
155             _in_here_doc_ => $i++,
156             _here_doc_target_ => $i++,
157             _here_quote_character_ => $i++,
158             _in_data_ => $i++,
159             _in_end_ => $i++,
160             _in_format_ => $i++,
161             _in_error_ => $i++,
162             _in_trouble_ => $i++,
163             _warning_count_ => $i++,
164             _html_tag_count_ => $i++,
165             _in_pod_ => $i++,
166             _in_skipped_ => $i++,
167             _in_attribute_list_ => $i++,
168             _in_quote_ => $i++,
169             _quote_target_ => $i++,
170             _line_start_quote_ => $i++,
171             _starting_level_ => $i++,
172             _know_starting_level_ => $i++,
173             _tabsize_ => $i++,
174             _indent_columns_ => $i++,
175             _look_for_hash_bang_ => $i++,
176             _trim_qw_ => $i++,
177             _continuation_indentation_ => $i++,
178             _outdent_labels_ => $i++,
179             _last_line_number_ => $i++,
180             _saw_perl_dash_P_ => $i++,
181             _saw_perl_dash_w_ => $i++,
182             _saw_use_strict_ => $i++,
183             _saw_v_string_ => $i++,
184             _saw_brace_error_ => $i++,
185             _hit_bug_ => $i++,
186             _look_for_autoloader_ => $i++,
187             _look_for_selfloader_ => $i++,
188             _saw_autoloader_ => $i++,
189             _saw_selfloader_ => $i++,
190             _saw_hash_bang_ => $i++,
191             _saw_end_ => $i++,
192             _saw_data_ => $i++,
193             _saw_negative_indentation_ => $i++,
194             _started_tokenizing_ => $i++,
195             _debugger_object_ => $i++,
196             _diagnostics_object_ => $i++,
197             _logger_object_ => $i++,
198             _unexpected_error_count_ => $i++,
199             _started_looking_for_here_target_at_ => $i++,
200             _nearly_matched_here_target_at_ => $i++,
201             _line_of_text_ => $i++,
202             _rlower_case_labels_at_ => $i++,
203             _extended_syntax_ => $i++,
204             _maximum_level_ => $i++,
205             _true_brace_error_count_ => $i++,
206             _rOpts_maximum_level_errors_ => $i++,
207             _rOpts_maximum_unexpected_errors_ => $i++,
208             _rOpts_logfile_ => $i++,
209             _rOpts_ => $i++,
210             _rinput_lines_ => $i++,
211             _input_line_index_next_ => $i++,
212 39     39   294 };
  39         111  
213             } ## end BEGIN
214              
215             { ## closure for subs to count instances
216              
217             # methods to count instances
218             my $_count = 0;
219 0     0 0 0 sub get_count { return $_count; }
220 561     561   2473 sub _increment_count { return ++$_count }
221 561     561   1217 sub _decrement_count { return --$_count }
222             }
223              
224             sub DESTROY {
225 561     561   1441 my $self = shift;
226 561         3087 $self->_decrement_count();
227 561         8561 return;
228             }
229              
230             sub AUTOLOAD {
231              
232             # Catch any undefined sub calls so that we are sure to get
233             # some diagnostic information. This sub should never be called
234             # except for a programming error.
235 0     0   0 our $AUTOLOAD;
236 0 0       0 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
237 0         0 my ( $pkg, $fname, $lno ) = caller();
238 0         0 my $my_package = __PACKAGE__;
239 0         0 print {*STDERR} <<EOM;
  0         0  
240             ======================================================================
241             Error detected in package '$my_package', version $VERSION
242             Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
243             Called from package: '$pkg'
244             Called from File '$fname' at line '$lno'
245             This error is probably due to a recent programming change
246             ======================================================================
247             EOM
248 0         0 exit 1;
249             } ## end sub AUTOLOAD
250              
251             sub Die {
252 0     0 0 0 my ($msg) = @_;
253 0         0 Perl::Tidy::Die($msg);
254 0         0 croak "unexpected return from Perl::Tidy::Die";
255             }
256              
257             sub Fault {
258 0     0 0 0 my ( $self, $msg ) = @_;
259              
260             # This routine is called for errors that really should not occur
261             # except if there has been a bug introduced by a recent program change.
262             # Please add comments at calls to Fault to explain why the call
263             # should not occur, and where to look to fix it.
264 0         0 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
265 0         0 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
266 0         0 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
267 0         0 my $pkg = __PACKAGE__;
268              
269             # Catch potential error of Fault not called as a method
270 0         0 my $input_stream_name;
271 0 0       0 if ( !ref($self) ) {
272 0         0 $msg = "Fault not called as a method - please fix\n";
273 0 0 0     0 if ( $self && length($self) < 200 ) { $msg .= $self }
  0         0  
274 0         0 $self = undef;
275 0         0 $input_stream_name = "(UNKNOWN)";
276             }
277             else {
278 0         0 $input_stream_name = $self->get_input_stream_name();
279             }
280              
281 0         0 Die(<<EOM);
282             ==============================================================================
283             While operating on input stream with name: '$input_stream_name'
284             A fault was detected at line $line0 of sub '$subroutine1'
285             in file '$filename1'
286             which was called from line $line1 of sub '$subroutine2'
287             Message: '$msg'
288             This is probably an error introduced by a recent programming change.
289             $pkg reports VERSION='$VERSION'.
290             ==============================================================================
291             EOM
292              
293             # We shouldn't get here, but this return is to keep Perl-Critic from
294             # complaining.
295 0         0 return;
296             } ## end sub Fault
297              
298             sub make_code_skipping_pattern {
299 1118     1118 0 3400 my ( $rOpts, $opt_name, $default ) = @_;
300 1118         2594 my $param = $rOpts->{$opt_name};
301 1118 100       3016 if ( !$param ) { $param = $default }
  1116         2054  
302 1118         4614 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
303 1118 50       5200 if ( $param !~ /^#/ ) {
304 0         0 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
305             }
306 1118         3484 my $pattern = '^\s*' . $param . '\b';
307 1118 50       3342 if ( Perl::Tidy::Formatter::bad_pattern($pattern) ) {
308 0         0 Die(
309             "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
310             );
311             }
312 1118         3777 return $pattern;
313             } ## end sub make_code_skipping_pattern
314              
315             sub check_options {
316              
317             # Check Tokenizer parameters
318 559     559 0 1751 my $rOpts = shift;
319              
320 559         2161 %is_sub = ();
321 559         1809 $is_sub{'sub'} = 1;
322              
323 559         4658 %is_END_DATA_format_sub = (
324             '__END__' => 1,
325             '__DATA__' => 1,
326             'format' => 1,
327             'sub' => 1,
328             );
329              
330             # Install any aliases to 'sub'
331 559 100       2355 if ( $rOpts->{'sub-alias-list'} ) {
332              
333             # Note that any 'sub-alias-list' has been preprocessed to
334             # be a trimmed, space-separated list which includes 'sub'
335             # for example, it might be 'sub method fun'
336 3         32 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
337 3         17 foreach my $word (@sub_alias_list) {
338 11         24 $is_sub{$word} = 1;
339 11         22 $is_END_DATA_format_sub{$word} = 1;
340             }
341             }
342              
343             # Set global flag to say if we have to guess if bareword 'method' is
344             # a sub when 'method' is in %is_sub. This will be true unless:
345             # (1) the user entered 'method' as sub alias, or
346             # (2) the user set --use-feature=class
347             # In these two cases we can assume that 'method' is a sub alias.
348 559         1323 $guess_if_method = 1;
349 559 100       2104 if ( $is_sub{'method'} ) { $guess_if_method = 0 }
  2         6  
350              
351             #------------------------------------------------
352             # Update hash values for any -use-feature options
353             #------------------------------------------------
354              
355 559         1381 my $use_feature_class = 1;
356 559 50       2180 if ( $rOpts->{'use-feature'} ) {
357 0 0       0 if ( $rOpts->{'use-feature'} =~ /\bnoclass\b/ ) {
    0          
358 0         0 $use_feature_class = 0;
359             }
360             elsif ( $rOpts->{'use-feature'} =~ /\bclass\b/ ) {
361 0         0 $guess_if_method = 0;
362             }
363             else {
364             ## neither 'class' nor 'noclass' seen so use default
365             }
366             }
367              
368             # These are the main updates for this option. There are additional
369             # changes elsewhere, usually indicated with a comment 'rt145706'
370              
371             # Update hash values for use_feature=class, added for rt145706
372             # see 'perlclass.pod'
373              
374             # IMPORTANT: We are changing global hash values initially set in a BEGIN
375             # block. Values must be defined (true or false) for each of these new
376             # words whether true or false. Otherwise, programs using the module which
377             # change options between runs (such as test code) will have
378             # incorrect settings and fail.
379              
380             # There are 4 new keywords:
381              
382             # 'class' - treated specially as generalization of 'package'
383             # Note: we must not set 'class' to be a keyword to avoid problems
384             # with older uses.
385 559         1828 $is_package{'class'} = $use_feature_class;
386              
387             # 'method' - treated like sub using the sub-alias-list option
388             # Note: we must not set 'method' to be a keyword to avoid problems
389             # with older uses.
390 559 50       1851 if ($use_feature_class) {
391 559         1649 $is_sub{'method'} = 1;
392 559         1615 $is_END_DATA_format_sub{'method'} = 1;
393             }
394              
395             # 'field' - added as a keyword, and works like 'my'
396 559         1606 $is_keyword{'field'} = $use_feature_class;
397 559         1614 $is_my_our_state{'field'} = $use_feature_class;
398              
399             # 'ADJUST' - added as a keyword and works like 'BEGIN'
400             # TODO: if ADJUST gets a paren list, this will need to be updated
401 559         1429 $is_keyword{'ADJUST'} = $use_feature_class;
402 559         1478 $is_code_block_token{'ADJUST'} = $use_feature_class;
403              
404 559         2079 %is_grep_alias = ();
405 559 50       2077 if ( $rOpts->{'grep-alias-list'} ) {
406              
407             # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
408             # space-separated list
409 559         5709 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
410 559         5057 @{is_grep_alias}{@q} = (1) x scalar(@q);
411             }
412              
413 559         2000 $rOpts_code_skipping = $rOpts->{'code-skipping'};
414 559         1449 $rOpts_code_skipping_begin = $rOpts->{'code-skipping-begin'};
415 559         2862 $code_skipping_pattern_begin =
416             make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
417 559         2116 $code_skipping_pattern_end =
418             make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
419              
420 559         2161 return;
421             } ## end sub check_options
422              
423             sub new {
424              
425 561     561 0 5686 my ( $class, @args ) = @_;
426              
427             # Note: 'tabs' and 'indent_columns' are temporary and should be
428             # removed asap
429 561         9472 my %defaults = (
430             source_object => undef,
431             debugger_object => undef,
432             diagnostics_object => undef,
433             logger_object => undef,
434             starting_level => undef,
435             indent_columns => 4,
436             tabsize => 8,
437             look_for_hash_bang => 0,
438             trim_qw => 1,
439             look_for_autoloader => 1,
440             look_for_selfloader => 1,
441             starting_line_number => 1,
442             extended_syntax => 0,
443             rOpts => {},
444             );
445 561         7570 my %args = ( %defaults, @args );
446              
447             # we are given an object with a get_line() method to supply source lines
448 561         2327 my $source_object = $args{source_object};
449 561         1952 my $rOpts = $args{rOpts};
450              
451             # Check call args
452 561 50       2165 if ( !defined($source_object) ) {
453 0         0 Die(
454             "Perl::Tidy::Tokenizer::new called without a 'source_object' parameter\n"
455             );
456             }
457 561 50       2187 if ( !ref($source_object) ) {
458 0         0 Die(<<EOM);
459             sub Perl::Tidy::Tokenizer::new received a 'source_object' parameter which is not a reference;
460             'source_object' must be a reference to a STRING, ARRAY, or object with a 'getline' method
461             EOM
462             }
463              
464             # Tokenizer state data is as follows:
465             # _rhere_target_list_ reference to list of here-doc targets
466             # _here_doc_target_ the target string for a here document
467             # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
468             # to determine if interpolation is done
469             # _quote_target_ character we seek if chasing a quote
470             # _line_start_quote_ line where we started looking for a long quote
471             # _in_here_doc_ flag indicating if we are in a here-doc
472             # _in_pod_ flag set if we are in pod documentation
473             # _in_skipped_ flag set if we are in a skipped section
474             # _in_error_ flag set if we saw severe error (binary in script)
475             # _in_trouble_ set if we saw a troublesome lexical like 'my sub s'
476             # _warning_count_ number of calls to logger sub warning
477             # _html_tag_count_ number of apparent html tags seen (indicates html)
478             # _in_data_ flag set if we are in __DATA__ section
479             # _in_end_ flag set if we are in __END__ section
480             # _in_format_ flag set if we are in a format description
481             # _in_attribute_list_ flag telling if we are looking for attributes
482             # _in_quote_ flag telling if we are chasing a quote
483             # _starting_level_ indentation level of first line
484             # _diagnostics_object_ place to write debugging information
485             # _unexpected_error_count_ error count used to limit output
486             # _lower_case_labels_at_ line numbers where lower case labels seen
487             # _hit_bug_ program bug detected
488              
489 561         1432 my $self = [];
490 561         1775 $self->[_rhere_target_list_] = [];
491 561         1427 $self->[_in_here_doc_] = 0;
492 561         1651 $self->[_here_doc_target_] = EMPTY_STRING;
493 561         1423 $self->[_here_quote_character_] = EMPTY_STRING;
494 561         1480 $self->[_in_data_] = 0;
495 561         1531 $self->[_in_end_] = 0;
496 561         1549 $self->[_in_format_] = 0;
497 561         1350 $self->[_in_error_] = 0;
498 561         1376 $self->[_in_trouble_] = 0;
499 561         1224 $self->[_warning_count_] = 0;
500 561         1318 $self->[_html_tag_count_] = 0;
501 561         1309 $self->[_in_pod_] = 0;
502 561         1383 $self->[_in_skipped_] = 0;
503 561         1221 $self->[_in_attribute_list_] = 0;
504 561         1260 $self->[_in_quote_] = 0;
505 561         1468 $self->[_quote_target_] = EMPTY_STRING;
506 561         1348 $self->[_line_start_quote_] = -1;
507 561         1436 $self->[_starting_level_] = $args{starting_level};
508 561         1839 $self->[_know_starting_level_] = defined( $args{starting_level} );
509 561         1548 $self->[_tabsize_] = $args{tabsize};
510 561         1871 $self->[_indent_columns_] = $args{indent_columns};
511 561         1347 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
512 561         1461 $self->[_trim_qw_] = $args{trim_qw};
513 561         1461 $self->[_continuation_indentation_] = $args{continuation_indentation};
514 561         1390 $self->[_outdent_labels_] = $args{outdent_labels};
515 561         1515 $self->[_last_line_number_] = $args{starting_line_number} - 1;
516 561         1329 $self->[_saw_perl_dash_P_] = 0;
517 561         1475 $self->[_saw_perl_dash_w_] = 0;
518 561         1195 $self->[_saw_use_strict_] = 0;
519 561         1187 $self->[_saw_v_string_] = 0;
520 561         1214 $self->[_saw_brace_error_] = 0;
521 561         1221 $self->[_hit_bug_] = 0;
522 561         1240 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
523 561         1543 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
524 561         1428 $self->[_saw_autoloader_] = 0;
525 561         1190 $self->[_saw_selfloader_] = 0;
526 561         1282 $self->[_saw_hash_bang_] = 0;
527 561         1163 $self->[_saw_end_] = 0;
528 561         1188 $self->[_saw_data_] = 0;
529 561         1190 $self->[_saw_negative_indentation_] = 0;
530 561         1438 $self->[_started_tokenizing_] = 0;
531 561         1343 $self->[_debugger_object_] = $args{debugger_object};
532 561         1335 $self->[_diagnostics_object_] = $args{diagnostics_object};
533 561         1312 $self->[_logger_object_] = $args{logger_object};
534 561         1407 $self->[_unexpected_error_count_] = 0;
535 561         1242 $self->[_started_looking_for_here_target_at_] = 0;
536 561         1228 $self->[_nearly_matched_here_target_at_] = undef;
537 561         1219 $self->[_line_of_text_] = EMPTY_STRING;
538 561         1469 $self->[_rlower_case_labels_at_] = undef;
539 561         1497 $self->[_extended_syntax_] = $args{extended_syntax};
540 561         1515 $self->[_maximum_level_] = 0;
541 561         1234 $self->[_true_brace_error_count_] = 0;
542 561         1425 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
543             $self->[_rOpts_maximum_unexpected_errors_] =
544 561         1486 $rOpts->{'maximum-unexpected-errors'};
545 561         1572 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
546 561         1573 $self->[_rOpts_] = $rOpts;
547              
548             # These vars are used for guessing indentation and must be positive
549 561 50       2116 $self->[_tabsize_] = 8 if ( !$self->[_tabsize_] );
550 561 100       2111 $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
551              
552 561         1480 bless $self, $class;
553              
554 561         3744 $self->prepare_for_a_new_file($source_object);
555 561         3477 $self->find_starting_indentation_level();
556              
557             # This is not a full class yet, so die if an attempt is made to
558             # create more than one object.
559              
560 561 50       2747 if ( _increment_count() > 1 ) {
561 0         0 confess
562             "Attempt to create more than 1 object in $class, which is not a true class yet\n";
563             }
564              
565 561         6418 return $self;
566              
567             } ## end sub new
568              
569             # Called externally
570             sub get_unexpected_error_count {
571 4     4 0 17 my ($self) = @_;
572 4         18 return $self->[_unexpected_error_count_];
573             }
574              
575             # Called externally
576             sub is_keyword {
577 2791     2791 0 5126 my ($str) = @_;
578 2791         10244 return $is_keyword{$str};
579             }
580              
581             #----------------------------------------------------------------
582             # Line input routines, previously handled by the LineBuffer class
583             #----------------------------------------------------------------
584             sub make_source_array {
585              
586 561     561 0 1788 my ( $self, $line_source_object ) = @_;
587              
588             # Convert the source into an array of lines
589 561         1332 my $rinput_lines = [];
590              
591 561         1529 my $rsource = ref($line_source_object);
592              
593 561 50       3719 if ( !$rsource ) {
    50          
    50          
594              
595             # shouldn't happen: this should have been checked in sub new
596 0         0 $self->Fault(<<EOM);
597             sub Perl::Tidy::Tokenizer::new received a 'source_object' parameter which is not a reference;
598             'source_object' must be a reference to a STRING, ARRAY, or object with a 'getline' method
599             EOM
600             }
601              
602             # handle an ARRAY ref
603             elsif ( $rsource eq 'ARRAY' ) {
604 0         0 $rinput_lines = $line_source_object;
605             }
606              
607             # handle a SCALAR ref
608             elsif ( $rsource eq 'SCALAR' ) {
609 561         974 my @lines = split /^/, ${$line_source_object};
  561         9070  
610 561         2442 $rinput_lines = \@lines;
611             }
612              
613             # handle an object - must have a get_line method
614             else {
615 0         0 while ( my $line = $line_source_object->get_line() ) {
616 0         0 push( @{$rinput_lines}, $line );
  0         0  
617             }
618             }
619              
620 561         2245 $self->[_rinput_lines_] = $rinput_lines;
621 561         1574 $self->[_input_line_index_next_] = 0;
622 561         1310 return;
623             } ## end sub make_source_array
624              
625             sub peek_ahead {
626 1234     1234 0 2926 my ( $self, $buffer_index ) = @_;
627              
628             # look $buffer_index lines ahead of the current location without disturbing
629             # the input
630 1234         2014 my $line;
631 1234         2429 my $rinput_lines = $self->[_rinput_lines_];
632 1234         2534 my $line_index = $buffer_index + $self->[_input_line_index_next_];
633 1234 100       2091 if ( $line_index < @{$rinput_lines} ) {
  1234         3595  
634 1222         2607 $line = $rinput_lines->[$line_index];
635             }
636 1234         4657 return $line;
637             } ## end sub peek_ahead
638              
639             #-----------------------------------------
640             # interface to Perl::Tidy::Logger routines
641             #-----------------------------------------
642             sub warning {
643              
644 0     0 0 0 my ( $self, $msg ) = @_;
645              
646 0         0 my $logger_object = $self->[_logger_object_];
647 0         0 $self->[_warning_count_]++;
648 0 0       0 if ($logger_object) {
649 0         0 my $msg_line_number = $self->[_last_line_number_];
650 0         0 $logger_object->warning( $msg, $msg_line_number );
651             }
652 0         0 return;
653             } ## end sub warning
654              
655             sub get_input_stream_name {
656              
657 0     0 0 0 my $self = shift;
658              
659 0         0 my $input_stream_name = EMPTY_STRING;
660 0         0 my $logger_object = $self->[_logger_object_];
661 0 0       0 if ($logger_object) {
662 0         0 $input_stream_name = $logger_object->get_input_stream_name();
663             }
664 0         0 return $input_stream_name;
665             } ## end sub get_input_stream_name
666              
667             sub complain {
668              
669 32     32 0 112 my ( $self, $msg ) = @_;
670              
671 32         80 my $logger_object = $self->[_logger_object_];
672 32 50       101 if ($logger_object) {
673 32         72 my $input_line_number = $self->[_last_line_number_];
674 32         185 $logger_object->complain( $msg, $input_line_number );
675             }
676 32         82 return;
677             } ## end sub complain
678              
679             sub write_logfile_entry {
680              
681 1857     1857 0 4409 my ( $self, $msg ) = @_;
682              
683 1857         3529 my $logger_object = $self->[_logger_object_];
684 1857 100       4484 if ($logger_object) {
685 1851         5579 $logger_object->write_logfile_entry($msg);
686             }
687 1857         5683 return;
688             } ## end sub write_logfile_entry
689              
690             sub interrupt_logfile {
691              
692 0     0 0 0 my $self = shift;
693              
694 0         0 my $logger_object = $self->[_logger_object_];
695 0 0       0 if ($logger_object) {
696 0         0 $logger_object->interrupt_logfile();
697             }
698 0         0 return;
699             } ## end sub interrupt_logfile
700              
701             sub resume_logfile {
702              
703 0     0 0 0 my $self = shift;
704              
705 0         0 my $logger_object = $self->[_logger_object_];
706 0 0       0 if ($logger_object) {
707 0         0 $logger_object->resume_logfile();
708             }
709 0         0 return;
710             } ## end sub resume_logfile
711              
712             sub brace_warning {
713 0     0 0 0 my ( $self, $msg ) = @_;
714 0         0 $self->[_saw_brace_error_]++;
715              
716 0         0 my $logger_object = $self->[_logger_object_];
717 0 0       0 if ($logger_object) {
718 0         0 my $msg_line_number = $self->[_last_line_number_];
719 0         0 $logger_object->brace_warning( $msg, $msg_line_number );
720             }
721 0         0 return;
722             } ## end sub brace_warning
723              
724             sub increment_brace_error {
725              
726             # This is same as sub brace_warning but without a message
727 0     0 0 0 my $self = shift;
728 0         0 $self->[_saw_brace_error_]++;
729              
730 0         0 my $logger_object = $self->[_logger_object_];
731 0 0       0 if ($logger_object) {
732 0         0 $logger_object->increment_brace_error();
733             }
734 0         0 return;
735             } ## end sub increment_brace_error
736              
737             sub get_saw_brace_error {
738 0     0 0 0 my $self = shift;
739 0         0 return $self->[_saw_brace_error_];
740             } ## end sub get_saw_brace_error
741              
742             sub report_definite_bug {
743 0     0 0 0 my $self = shift;
744 0         0 $self->[_hit_bug_] = 1;
745 0         0 my $logger_object = $self->[_logger_object_];
746 0 0       0 if ($logger_object) {
747 0         0 $logger_object->report_definite_bug();
748             }
749 0         0 return;
750             } ## end sub report_definite_bug
751              
752             #-------------------------------------
753             # Interface to Perl::Tidy::Diagnostics
754             #-------------------------------------
755             sub write_diagnostics {
756 0     0 0 0 my ( $self, $msg ) = @_;
757 0         0 my $input_line_number = $self->[_last_line_number_];
758 0         0 my $diagnostics_object = $self->[_diagnostics_object_];
759 0 0       0 if ($diagnostics_object) {
760 0         0 $diagnostics_object->write_diagnostics( $msg, $input_line_number );
761             }
762 0         0 return;
763             } ## end sub write_diagnostics
764              
765             sub report_tokenization_errors {
766              
767 561     561 0 1990 my ($self) = @_;
768              
769             # Report any tokenization errors and return a flag '$severe_error'.
770             # Set $severe_error = 1 if the tokenization errors are so severe that
771             # the formatter should not attempt to format the file. Instead, it will
772             # just output the file verbatim.
773              
774             # set severe error flag if tokenizer has encountered file reading problems
775             # (i.e. unexpected binary characters)
776             # or code which may not be formatted correctly (such as 'my sub q')
777             # The difference between _in_error_ and _in_trouble_ is that
778             # _in_error_ stops the tokenizer immediately whereas
779             # _in_trouble_ lets the tokenizer finish so that all errors are seen
780             # Both block formatting and cause the input stream to be output verbatim.
781 561   33     3403 my $severe_error = $self->[_in_error_] || $self->[_in_trouble_];
782              
783             # And do not format if it looks like an html file (c209)
784 561   33     4248 $severe_error ||= $self->[_html_tag_count_] && $self->[_warning_count_];
      33        
785              
786             # Inform the logger object on length of input stream
787 561         1544 my $logger_object = $self->[_logger_object_];
788 561 100       1985 if ($logger_object) {
789 559         1486 my $last_line_number = $self->[_last_line_number_];
790 559         2955 $logger_object->set_last_input_line_number($last_line_number);
791             }
792              
793 561         1645 my $maxle = $self->[_rOpts_maximum_level_errors_];
794 561         1410 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
795 561 50       1835 $maxle = 1 unless defined($maxle);
796 561 50       1898 $maxue = 0 unless defined($maxue);
797              
798 561         2396 my $level = get_indentation_level();
799 561 50       2263 if ( $level != $self->[_starting_level_] ) {
800 0         0 $self->warning("final indentation level: $level\n");
801 0         0 my $level_diff = $self->[_starting_level_] - $level;
802 0 0       0 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
  0         0  
803              
804             # Set severe error flag if the level error is greater than 1.
805             # The formatter can function for any level error but it is probably
806             # best not to attempt formatting for a high level error.
807 0 0 0     0 if ( $maxle >= 0 && $level_diff > $maxle ) {
808 0         0 $severe_error = 1;
809 0         0 $self->warning(<<EOM);
810             Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
811             EOM
812             }
813             }
814              
815 561         3127 $self->check_final_nesting_depths();
816              
817             # Likewise, large numbers of brace errors usually indicate non-perl
818             # scripts, so set the severe error flag at a low number. This is similar
819             # to the level check, but different because braces may balance but be
820             # incorrectly interlaced.
821 561 50       2903 if ( $self->[_true_brace_error_count_] > 2 ) {
822 0         0 $severe_error = 1;
823             }
824              
825 561 50 66     2606 if ( $self->[_look_for_hash_bang_]
826             && !$self->[_saw_hash_bang_] )
827             {
828 0         0 $self->warning(
829             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
830             }
831              
832 561 50       1969 if ( $self->[_in_format_] ) {
833 0         0 $self->warning("hit EOF while in format description\n");
834             }
835              
836 561 50       2105 if ( $self->[_in_skipped_] ) {
837 0         0 $self->write_logfile_entry(
838             "hit EOF while in lines skipped with --code-skipping\n");
839             }
840              
841 561 50       2028 if ( $self->[_in_pod_] ) {
842              
843             # Just write log entry if this is after __END__ or __DATA__
844             # because this happens to often, and it is not likely to be
845             # a parsing error.
846 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
847 0         0 $self->write_logfile_entry(
848             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
849             );
850             }
851              
852             else {
853 0         0 $self->complain(
854             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
855             );
856             }
857              
858             }
859              
860 561 50       1933 if ( $self->[_in_here_doc_] ) {
861 0         0 $severe_error = 1;
862 0         0 my $here_doc_target = $self->[_here_doc_target_];
863 0         0 my $started_looking_for_here_target_at =
864             $self->[_started_looking_for_here_target_at_];
865 0 0       0 if ($here_doc_target) {
866 0         0 $self->warning(
867             "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
868             );
869             }
870             else {
871 0         0 $self->warning(<<EOM);
872             Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
873             (Perl will match to the end of file but this may not be intended).
874             EOM
875             }
876 0         0 my $nearly_matched_here_target_at =
877             $self->[_nearly_matched_here_target_at_];
878 0 0       0 if ($nearly_matched_here_target_at) {
879 0         0 $self->warning(
880             "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
881             );
882             }
883             }
884              
885             # Something is seriously wrong if we ended inside a quote
886 561 50       2337 if ( $self->[_in_quote_] ) {
887 0         0 $severe_error = 1;
888 0         0 my $line_start_quote = $self->[_line_start_quote_];
889 0         0 my $quote_target = $self->[_quote_target_];
890 0 0       0 my $what =
891             ( $self->[_in_attribute_list_] )
892             ? "attribute list"
893             : "quote/pattern";
894 0         0 $self->warning(
895             "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
896             );
897             }
898              
899 561 50       1995 if ( $self->[_hit_bug_] ) {
900 0         0 $severe_error = 1;
901             }
902              
903             # Multiple "unexpected" type tokenization errors usually indicate parsing
904             # non-perl scripts, or that something is seriously wrong, so we should
905             # avoid formatting them. This can happen for example if we run perltidy on
906             # a shell script or an html file. But unfortunately this check can
907             # interfere with some extended syntaxes, such as RPerl, so it has to be off
908             # by default.
909 561         1463 my $ue_count = $self->[_unexpected_error_count_];
910 561 50 33     2529 if ( $maxue > 0 && $ue_count > $maxue ) {
911 0         0 $self->warning(<<EOM);
912             Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
913             EOM
914 0         0 $severe_error = 1;
915             }
916              
917 561 100       1927 if ( !$self->[_saw_perl_dash_w_] ) {
918 545 50       3621 if ( $] < 5.006 ) {
919 0         0 $self->write_logfile_entry("Suggest including '-w parameter'\n");
920             }
921             else {
922 545         1894 $self->write_logfile_entry("Suggest including 'use warnings;'\n");
923             }
924             }
925              
926 561 50       5161 if ( $self->[_saw_perl_dash_P_] ) {
927 0         0 $self->write_logfile_entry(
928             "Use of -P parameter for defines is discouraged\n");
929             }
930              
931 561 100       2472 if ( !$self->[_saw_use_strict_] ) {
932 547         1675 $self->write_logfile_entry("Suggest including 'use strict;'\n");
933             }
934              
935             # it is suggested that labels have at least one upper case character
936             # for legibility and to avoid code breakage as new keywords are introduced
937 561 100       3932 if ( $self->[_rlower_case_labels_at_] ) {
938 12         44 my @lower_case_labels_at = @{ $self->[_rlower_case_labels_at_] };
  12         46  
939 12         48 $self->write_logfile_entry(
940             "Suggest using upper case characters in label(s)\n");
941 12         54 local $LIST_SEPARATOR = ')(';
942 12         112 $self->write_logfile_entry(
943             " defined at line(s): (@lower_case_labels_at)\n");
944             }
945 561         2365 return $severe_error;
946             } ## end sub report_tokenization_errors
947              
948             sub report_v_string {
949              
950             # warn if this version can't handle v-strings
951 2     2 0 9 my ( $self, $tok ) = @_;
952 2 50       8 if ( !$self->[_saw_v_string_] ) {
953 2         6 $self->[_saw_v_string_] = $self->[_last_line_number_];
954             }
955 2 50       10 if ( $] < 5.006 ) {
956 0         0 $self->warning(
957             "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
958             );
959             }
960 2         7 return;
961             } ## end sub report_v_string
962              
963             sub is_valid_token_type {
964 3     3 0 7 my ($type) = @_;
965 3         15 return $is_valid_token_type{$type};
966             }
967              
968             sub log_numbered_msg {
969 167     167 0 480 my ( $self, $msg ) = @_;
970              
971             # write input line number + message to logfile
972 167         350 my $input_line_number = $self->[_last_line_number_];
973 167         896 $self->write_logfile_entry("Line $input_line_number: $msg");
974 167         359 return;
975             } ## end sub log_numbered_msg
976              
977             sub get_line {
978              
979 8221     8221 0 15270 my $self = shift;
980              
981             # Read the next input line and tokenize it
982             # Returns:
983             # $line_of_tokens = ref to hash of info for the tokenized line
984              
985             # USES GLOBAL VARIABLES:
986             # $brace_depth, $square_bracket_depth, $paren_depth
987              
988             # get the next line from the input array
989 8221         12202 my $input_line;
990 8221         14231 my $rinput_lines = $self->[_rinput_lines_];
991 8221         13022 my $line_index = $self->[_input_line_index_next_];
992 8221 100       11827 if ( $line_index < @{$rinput_lines} ) {
  8221         19167  
993 7660         16116 $input_line = $rinput_lines->[ $line_index++ ];
994 7660         12420 $self->[_input_line_index_next_] = $line_index;
995             }
996              
997 8221         14154 $self->[_line_of_text_] = $input_line;
998              
999 8221 100       19934 return if ( !defined($input_line) );
1000              
1001 7660         12922 my $input_line_number = ++$self->[_last_line_number_];
1002              
1003             # Find and remove what characters terminate this line, including any
1004             # control r
1005 7660         12011 my $input_line_separator = EMPTY_STRING;
1006 7660 100       20193 if ( chomp($input_line) ) {
1007 7659         17040 $input_line_separator = $INPUT_RECORD_SEPARATOR;
1008             }
1009              
1010             # The first test here very significantly speeds things up, but be sure to
1011             # keep the regex and hash %other_line_endings the same.
1012 7660 100       23205 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
1013 24 50       218 if ( $input_line =~ s/([\r\035\032])+$// ) {
1014 24         71 $input_line_separator = $1 . $input_line_separator;
1015             }
1016             }
1017              
1018             # for backwards compatibility we keep the line text terminated with
1019             # a newline character
1020 7660         15945 $input_line .= "\n";
1021 7660         13464 $self->[_line_of_text_] = $input_line;
1022              
1023             # create a data structure describing this line which will be
1024             # returned to the caller.
1025              
1026             # _line_type codes are:
1027             # SYSTEM - system-specific code before hash-bang line
1028             # CODE - line of perl code (including comments)
1029             # POD_START - line starting pod, such as '=head'
1030             # POD - pod documentation text
1031             # POD_END - last line of pod section, '=cut'
1032             # HERE - text of here-document
1033             # HERE_END - last line of here-doc (target word)
1034             # FORMAT - format section
1035             # FORMAT_END - last line of format section, '.'
1036             # SKIP - code skipping section
1037             # SKIP_END - last line of code skipping section, '#>>V'
1038             # DATA_START - __DATA__ line
1039             # DATA - unidentified text following __DATA__
1040             # END_START - __END__ line
1041             # END - unidentified text following __END__
1042             # ERROR - we are in big trouble, probably not a perl script
1043              
1044             # Other variables:
1045             # _curly_brace_depth - depth of curly braces at start of line
1046             # _square_bracket_depth - depth of square brackets at start of line
1047             # _paren_depth - depth of parens at start of line
1048             # _starting_in_quote - this line continues a multi-line quote
1049             # (so don't trim leading blanks!)
1050             # _ending_in_quote - this line ends in a multi-line quote
1051             # (so don't trim trailing blanks!)
1052 7660         42656 my $line_of_tokens = {
1053             _line_type => 'EOF',
1054             _line_text => $input_line,
1055             _line_number => $input_line_number,
1056             _guessed_indentation_level => 0,
1057             _curly_brace_depth => $brace_depth,
1058             _square_bracket_depth => $square_bracket_depth,
1059             _paren_depth => $paren_depth,
1060             _quote_character => EMPTY_STRING,
1061             ## Skip these needless initializations for efficiency:
1062             ## _rtoken_type => undef,
1063             ## _rtokens => undef,
1064             ## _rlevels => undef,
1065             ## _rblock_type => undef,
1066             ## _rtype_sequence => undef,
1067             ## _rci_levels => undef,
1068             ## _starting_in_quote => 0,
1069             ## _ending_in_quote => 0,
1070             };
1071              
1072             # must print line unchanged if we are in a here document
1073 7660 100       38934 if ( $self->[_in_here_doc_] ) {
    100          
    100          
    100          
    50          
    100          
    100          
1074              
1075 24         86 $line_of_tokens->{_line_type} = 'HERE';
1076 24         63 my $here_doc_target = $self->[_here_doc_target_];
1077 24         58 my $here_quote_character = $self->[_here_quote_character_];
1078 24         40 my $candidate_target = $input_line;
1079 24         53 chomp $candidate_target;
1080              
1081             # Handle <<~ targets, which are indicated here by a leading space on
1082             # the here quote character
1083 24 100       94 if ( $here_quote_character =~ /^\s/ ) {
1084 4         18 $candidate_target =~ s/^\s*//;
1085             }
1086 24 100       67 if ( $candidate_target eq $here_doc_target ) {
1087 9         47 $self->[_nearly_matched_here_target_at_] = undef;
1088 9         30 $line_of_tokens->{_line_type} = 'HERE_END';
1089 9         53 $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
1090              
1091 9         54 my $rhere_target_list = $self->[_rhere_target_list_];
1092 9 50       24 if ( @{$rhere_target_list} ) { # there can be multiple here targets
  9         48  
1093             ( $here_doc_target, $here_quote_character ) =
1094 0         0 @{ shift @{$rhere_target_list} };
  0         0  
  0         0  
1095 0         0 $self->[_here_doc_target_] = $here_doc_target;
1096 0         0 $self->[_here_quote_character_] = $here_quote_character;
1097 0         0 $self->log_numbered_msg(
1098             "Entering HERE document $here_doc_target\n");
1099 0         0 $self->[_nearly_matched_here_target_at_] = undef;
1100 0         0 $self->[_started_looking_for_here_target_at_] =
1101             $input_line_number;
1102             }
1103             else {
1104 9         33 $self->[_in_here_doc_] = 0;
1105 9         26 $self->[_here_doc_target_] = EMPTY_STRING;
1106 9         30 $self->[_here_quote_character_] = EMPTY_STRING;
1107             }
1108             }
1109              
1110             # check for error of extra whitespace
1111             # note for PERL6: leading whitespace is allowed
1112             else {
1113 15         142 $candidate_target =~ s/\s*$//;
1114 15         66 $candidate_target =~ s/^\s*//;
1115 15 50       60 if ( $candidate_target eq $here_doc_target ) {
1116 0         0 $self->[_nearly_matched_here_target_at_] = $input_line_number;
1117             }
1118             }
1119 24         99 return $line_of_tokens;
1120             }
1121              
1122             # Print line unchanged if we are in a format section
1123             elsif ( $self->[_in_format_] ) {
1124              
1125 3 100       23 if ( $input_line =~ /^\.[\s#]*$/ ) {
1126              
1127             # Decrement format depth count at a '.' after a 'format'
1128 1         3 $self->[_in_format_]--;
1129              
1130             # This is the end when count reaches 0
1131 1 50       7 if ( !$self->[_in_format_] ) {
1132 1         5 $self->log_numbered_msg("Exiting format section\n");
1133 1         7 $line_of_tokens->{_line_type} = 'FORMAT_END';
1134              
1135             # Make the tokenizer mark an opening brace which follows
1136             # as a code block. Fixes issue c202/t032.
1137 1         5 $last_nonblank_token = ';';
1138 1         3 $last_nonblank_type = ';';
1139             }
1140             }
1141             else {
1142 2         5 $line_of_tokens->{_line_type} = 'FORMAT';
1143 2 50       11 if ( $input_line =~ /^\s*format\s+\w+/ ) {
1144              
1145             # Increment format depth count at a 'format' within a 'format'
1146             # This is a simple way to handle nested formats (issue c019).
1147 0         0 $self->[_in_format_]++;
1148             }
1149             }
1150 3         11 return $line_of_tokens;
1151             }
1152              
1153             # must print line unchanged if we are in pod documentation
1154             elsif ( $self->[_in_pod_] ) {
1155              
1156 47         164 $line_of_tokens->{_line_type} = 'POD';
1157 47 100       225 if ( $input_line =~ /^=cut/ ) {
1158 20         95 $line_of_tokens->{_line_type} = 'POD_END';
1159 20         104 $self->log_numbered_msg("Exiting POD section\n");
1160 20         92 $self->[_in_pod_] = 0;
1161             }
1162 47 50 33     150 if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
1163 0         0 $self->warning(
1164             "Hash-bang in pod can cause older versions of perl to fail! \n"
1165             );
1166             }
1167              
1168 47         165 return $line_of_tokens;
1169             }
1170              
1171             # print line unchanged if in skipped section
1172             elsif ( $self->[_in_skipped_] ) {
1173              
1174 8         18 $line_of_tokens->{_line_type} = 'SKIP';
1175 8 100       90 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
    50          
1176 2         7 $line_of_tokens->{_line_type} = 'SKIP_END';
1177 2         10 $self->log_numbered_msg("Exiting code-skipping section\n");
1178 2         5 $self->[_in_skipped_] = 0;
1179             }
1180             elsif ( $input_line =~ /$code_skipping_pattern_begin/ ) {
1181              
1182             # warn of duplicate starting comment lines, git #118
1183 0         0 my $lno = $self->[_in_skipped_];
1184 0         0 $self->warning(
1185             "Already in code-skipping section which started at line $lno\n"
1186             );
1187             }
1188             else {
1189             ## ok - not a code-skipping control line
1190             }
1191 8         31 return $line_of_tokens;
1192             }
1193              
1194             # must print line unchanged if we have seen a severe error (i.e., we
1195             # are seeing illegal tokens and cannot continue. Syntax errors do
1196             # not pass this route). Calling routine can decide what to do, but
1197             # the default can be to just pass all lines as if they were after __END__
1198             elsif ( $self->[_in_error_] ) {
1199 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1200 0         0 return $line_of_tokens;
1201             }
1202              
1203             # print line unchanged if we are __DATA__ section
1204             elsif ( $self->[_in_data_] ) {
1205              
1206             # ...but look for POD
1207             # Note that the _in_data and _in_end flags remain set
1208             # so that we return to that state after seeing the
1209             # end of a pod section
1210 1 50 33     8 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1211 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1212 0         0 $self->log_numbered_msg("Entering POD section\n");
1213 0         0 $self->[_in_pod_] = 1;
1214 0         0 return $line_of_tokens;
1215             }
1216             else {
1217 1         4 $line_of_tokens->{_line_type} = 'DATA';
1218 1         4 return $line_of_tokens;
1219             }
1220             }
1221              
1222             # print line unchanged if we are in __END__ section
1223             elsif ( $self->[_in_end_] ) {
1224              
1225             # ...but look for POD
1226             # Note that the _in_data and _in_end flags remain set
1227             # so that we return to that state after seeing the
1228             # end of a pod section
1229 48 100 66     347 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1230 6         29 $line_of_tokens->{_line_type} = 'POD_START';
1231 6         31 $self->log_numbered_msg("Entering POD section\n");
1232 6         34 $self->[_in_pod_] = 1;
1233 6         69 return $line_of_tokens;
1234             }
1235             else {
1236 42         73 $line_of_tokens->{_line_type} = 'END';
1237 42         139 return $line_of_tokens;
1238             }
1239             }
1240             else {
1241             ## ok
1242             }
1243              
1244             # check for a hash-bang line if we haven't seen one
1245 7529 100 100     33761 if ( !$self->[_saw_hash_bang_]
      66        
1246             && substr( $input_line, 0, 2 ) eq '#!'
1247             && $input_line =~ /^\#\!.*perl\b/ )
1248             {
1249 15         53 $self->[_saw_hash_bang_] = $input_line_number;
1250              
1251             # check for -w and -P flags
1252 15 50       102 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1253 0         0 $self->[_saw_perl_dash_P_] = 1;
1254             }
1255              
1256 15 100       97 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1257 8         26 $self->[_saw_perl_dash_w_] = 1;
1258             }
1259              
1260 15 100 33     137 if (
      66        
      100        
      66        
1261             $input_line_number > 1
1262              
1263             # leave any hash bang in a BEGIN block alone
1264             # i.e. see 'debugger-duck_type.t'
1265             && !(
1266             $last_nonblank_block_type
1267             && $last_nonblank_block_type eq 'BEGIN'
1268             )
1269             && !$self->[_look_for_hash_bang_]
1270              
1271             # Try to avoid giving a false alarm at a simple comment.
1272             # These look like valid hash-bang lines:
1273              
1274             #!/usr/bin/perl -w
1275             #! /usr/bin/perl -w
1276             #!c:\perl\bin\perl.exe
1277              
1278             # These are comments:
1279             #! I love perl
1280             #! sunos does not yet provide a /usr/bin/perl
1281              
1282             # Comments typically have multiple spaces, which suggests
1283             # the filter
1284             && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1285             )
1286             {
1287              
1288             # this is helpful for VMS systems; we may have accidentally
1289             # tokenized some DCL commands
1290 1 50       5 if ( $self->[_started_tokenizing_] ) {
1291 0         0 $self->warning(
1292             "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1293             );
1294             }
1295             else {
1296 1         6 $self->complain("Useless hash-bang after line 1\n");
1297             }
1298             }
1299              
1300             # Report the leading hash-bang as a system line
1301             # This will prevent -dac from deleting it
1302             else {
1303 14         51 $line_of_tokens->{_line_type} = 'SYSTEM';
1304 14         79 return $line_of_tokens;
1305             }
1306             }
1307              
1308             # wait for a hash-bang before parsing if the user invoked us with -x
1309 7515 100 100     18601 if ( $self->[_look_for_hash_bang_]
1310             && !$self->[_saw_hash_bang_] )
1311             {
1312 5         13 $line_of_tokens->{_line_type} = 'SYSTEM';
1313 5         19 return $line_of_tokens;
1314             }
1315              
1316             # a first line of the form ': #' will be marked as SYSTEM
1317             # since lines of this form may be used by tcsh
1318 7510 50 66     19881 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1319 0         0 $line_of_tokens->{_line_type} = 'SYSTEM';
1320 0         0 return $line_of_tokens;
1321             }
1322              
1323             # now we know that it is ok to tokenize the line...
1324             # the line tokenizer will modify any of these private variables:
1325             # _rhere_target_list_
1326             # _in_data_
1327             # _in_end_
1328             # _in_format_
1329             # _in_error_
1330             # _in_skipped_
1331             # _in_pod_
1332             # _in_quote_
1333 7510         23419 $self->tokenize_this_line($line_of_tokens);
1334              
1335             # Now finish defining the return structure and return it
1336 7510         15892 $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1337              
1338             # handle severe error (binary data in script)
1339 7510 50       16987 if ( $self->[_in_error_] ) {
1340 0         0 $self->[_in_quote_] = 0; # to avoid any more messages
1341 0         0 $self->warning("Giving up after error\n");
1342 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1343 0         0 reset_indentation_level(0); # avoid error messages
1344 0         0 return $line_of_tokens;
1345             }
1346              
1347             # handle start of pod documentation
1348 7510 100       15976 if ( $self->[_in_pod_] ) {
1349              
1350             # This gets tricky..above a __DATA__ or __END__ section, perl
1351             # accepts '=cut' as the start of pod section. But afterwards,
1352             # only pod utilities see it and they may ignore an =cut without
1353             # leading =head. In any case, this isn't good.
1354 14 50       84 if ( $input_line =~ /^=cut\b/ ) {
1355 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1356 0         0 $self->complain("=cut while not in pod ignored\n");
1357 0         0 $self->[_in_pod_] = 0;
1358 0         0 $line_of_tokens->{_line_type} = 'POD_END';
1359             }
1360             else {
1361 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1362 0         0 if ( !DEVEL_MODE ) {
1363 0         0 $self->warning(
1364             "=cut starts a pod section .. this can fool pod utilities.\n"
1365             );
1366             }
1367 0         0 $self->log_numbered_msg("Entering POD section\n");
1368             }
1369             }
1370              
1371             else {
1372 14         70 $line_of_tokens->{_line_type} = 'POD_START';
1373 14         76 $self->log_numbered_msg("Entering POD section\n");
1374             }
1375              
1376 14         94 return $line_of_tokens;
1377             }
1378              
1379             # handle start of skipped section
1380 7496 100       14876 if ( $self->[_in_skipped_] ) {
1381              
1382 2         8 $line_of_tokens->{_line_type} = 'SKIP';
1383 2         14 $self->log_numbered_msg("Entering code-skipping section\n");
1384 2         10 return $line_of_tokens;
1385             }
1386              
1387             # see if this line contains here doc targets
1388 7494         12459 my $rhere_target_list = $self->[_rhere_target_list_];
1389 7494 100       10782 if ( @{$rhere_target_list} ) {
  7494         17331  
1390              
1391             my ( $here_doc_target, $here_quote_character ) =
1392 9         52 @{ shift @{$rhere_target_list} };
  9         25  
  9         43  
1393 9         32 $self->[_in_here_doc_] = 1;
1394 9         31 $self->[_here_doc_target_] = $here_doc_target;
1395 9         24 $self->[_here_quote_character_] = $here_quote_character;
1396 9         69 $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1397 9         34 $self->[_started_looking_for_here_target_at_] = $input_line_number;
1398             }
1399              
1400             # NOTE: __END__ and __DATA__ statements are written unformatted
1401             # because they can theoretically contain additional characters
1402             # which are not tokenized (and cannot be read with <DATA> either!).
1403 7494 100       20350 if ( $self->[_in_data_] ) {
    100          
1404 1         4 $line_of_tokens->{_line_type} = 'DATA_START';
1405 1         6 $self->log_numbered_msg("Starting __DATA__ section\n");
1406 1         4 $self->[_saw_data_] = 1;
1407              
1408             # keep parsing after __DATA__ if use SelfLoader was seen
1409 1 50       5 if ( $self->[_saw_selfloader_] ) {
1410 0         0 $self->[_in_data_] = 0;
1411 0         0 $self->log_numbered_msg(
1412             "SelfLoader seen, continuing; -nlsl deactivates\n");
1413             }
1414              
1415 1         5 return $line_of_tokens;
1416             }
1417              
1418             elsif ( $self->[_in_end_] ) {
1419 6         50 $line_of_tokens->{_line_type} = 'END_START';
1420 6         31 $self->log_numbered_msg("Starting __END__ section\n");
1421 6         22 $self->[_saw_end_] = 1;
1422              
1423             # keep parsing after __END__ if use AutoLoader was seen
1424 6 50       71 if ( $self->[_saw_autoloader_] ) {
1425 0         0 $self->[_in_end_] = 0;
1426 0         0 $self->log_numbered_msg(
1427             "AutoLoader seen, continuing; -nlal deactivates\n");
1428             }
1429 6         28 return $line_of_tokens;
1430             }
1431             else {
1432             ## ok: not in __END__ or __DATA__
1433             }
1434              
1435             # now, finally, we know that this line is type 'CODE'
1436 7487         14137 $line_of_tokens->{_line_type} = 'CODE';
1437              
1438             # remember if we have seen any real code
1439 7487 100 100     23773 if ( !$self->[_started_tokenizing_]
      100        
1440             && $input_line !~ /^\s*$/
1441             && $input_line !~ /^\s*#/ )
1442             {
1443 557         2855 $self->[_started_tokenizing_] = 1;
1444             }
1445              
1446 7487 100       16073 if ( $self->[_debugger_object_] ) {
1447 7         36 $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1448             }
1449              
1450             # Note: if keyword 'format' occurs in this line code, it is still CODE
1451             # (keyword 'format' need not start a line)
1452 7487 100       15537 if ( $self->[_in_format_] ) {
1453 1         7 $self->log_numbered_msg("Entering format section\n");
1454             }
1455              
1456 7487 100 100     29299 if ( $self->[_in_quote_]
    100 100        
1457             and ( $self->[_line_start_quote_] < 0 ) )
1458             {
1459              
1460             #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1461 49 100       431 if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1462 48         140 $self->[_line_start_quote_] = $input_line_number;
1463 48         361 $self->log_numbered_msg(
1464             "Start multi-line quote or pattern ending in $quote_target\n");
1465             }
1466             }
1467             elsif ( ( $self->[_line_start_quote_] >= 0 )
1468             && !$self->[_in_quote_] )
1469             {
1470 48         175 $self->[_line_start_quote_] = -1;
1471 48         200 $self->log_numbered_msg("End of multi-line quote or pattern\n");
1472             }
1473             else {
1474             ## ok
1475             }
1476              
1477             # we are returning a line of CODE
1478 7487         29946 return $line_of_tokens;
1479             } ## end sub get_line
1480              
1481             sub find_starting_indentation_level {
1482              
1483             # We need to find the indentation level of the first line of the
1484             # script being formatted. Often it will be zero for an entire file,
1485             # but if we are formatting a local block of code (within an editor for
1486             # example) it may not be zero. The user may specify this with the
1487             # -sil=n parameter but normally doesn't so we have to guess.
1488             #
1489 561     561 0 1796 my ($self) = @_;
1490 561         1370 my $starting_level = 0;
1491              
1492             # use value if given as parameter
1493 561 100       2942 if ( $self->[_know_starting_level_] ) {
    100          
1494 1         4 $starting_level = $self->[_starting_level_];
1495             }
1496              
1497             # if we know there is a hash_bang line, the level must be zero
1498             elsif ( $self->[_look_for_hash_bang_] ) {
1499 1         4 $self->[_know_starting_level_] = 1;
1500             }
1501              
1502             # otherwise figure it out from the input file
1503             else {
1504 559         1222 my $line;
1505 559         1209 my $i = 0;
1506              
1507             # keep looking at lines until we find a hash bang or piece of code
1508             # ( or, for now, an =pod line)
1509 559         1298 my $msg = EMPTY_STRING;
1510 559         1143 my $in_code_skipping;
1511 559         3067 while ( $line = $self->peek_ahead( $i++ ) ) {
1512              
1513             # if first line is #! then assume starting level is zero
1514 870 100 100     5504 if ( $i == 1 && $line =~ /^\#\!/ ) {
1515 13         42 $starting_level = 0;
1516 13         51 last;
1517             }
1518              
1519             # ignore lines fenced off with code-skipping comments
1520 857 100       5043 if ( $line =~ /^\s*#/ ) {
1521 296 50       1103 if ( !$in_code_skipping ) {
1522 296 50 33     3102 if ( $rOpts_code_skipping
1523             && $line =~ /$code_skipping_pattern_begin/ )
1524             {
1525 0         0 $in_code_skipping = 1;
1526             }
1527             }
1528             else {
1529 0 0       0 if ( $line =~ /$code_skipping_pattern_end/ ) {
1530 0         0 $in_code_skipping = 0;
1531             }
1532             }
1533 296         1054 next;
1534             }
1535 561 50       2583 next if ($in_code_skipping);
1536              
1537 561 100       3613 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1538              
1539 543         2793 $starting_level = $self->guess_old_indentation_level($line);
1540 543         1485 last;
1541             }
1542 559         2559 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1543 559         3140 $self->write_logfile_entry("$msg");
1544             }
1545 561         2207 $self->[_starting_level_] = $starting_level;
1546 561         3775 reset_indentation_level($starting_level);
1547 561         1082 return;
1548             } ## end sub find_starting_indentation_level
1549              
1550             sub guess_old_indentation_level {
1551 543     543 0 1812 my ( $self, $line ) = @_;
1552              
1553             # Guess the indentation level of an input line.
1554             #
1555             # For the first line of code this result will define the starting
1556             # indentation level. It will mainly be non-zero when perltidy is applied
1557             # within an editor to a local block of code.
1558             #
1559             # This is an impossible task in general because we can't know what tabs
1560             # meant for the old script and how many spaces were used for one
1561             # indentation level in the given input script. For example it may have
1562             # been previously formatted with -i=7 -et=3. But we can at least try to
1563             # make sure that perltidy guesses correctly if it is applied repeatedly to
1564             # a block of code within an editor, so that the block stays at the same
1565             # level when perltidy is applied repeatedly.
1566             #
1567             # USES GLOBAL VARIABLES: (none)
1568 543         1315 my $level = 0;
1569              
1570             # find leading tabs, spaces, and any statement label
1571 543         1094 my $spaces = 0;
1572 543 50       4279 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1573              
1574             # If there are leading tabs, we use the tab scheme for this run, if
1575             # any, so that the code will remain stable when editing.
1576 543 100       2518 if ($1) { $spaces += length($1) * $self->[_tabsize_] }
  2         10  
1577              
1578 543 100       2143 if ($2) { $spaces += length($2) }
  79         294  
1579              
1580             # correct for outdented labels
1581 543 50 66     2345 if ( $3 && $self->[_outdent_labels_] ) {
1582 1         3 $spaces += $self->[_continuation_indentation_];
1583             }
1584             }
1585              
1586             # compute indentation using the value of -i for this run.
1587             # If -i=0 is used for this run (which is possible) it doesn't matter
1588             # what we do here but we'll guess that the old run used 4 spaces per level.
1589 543         1612 my $indent_columns = $self->[_indent_columns_];
1590 543 50       1830 $indent_columns = 4 if ( !$indent_columns );
1591 543         1961 $level = int( $spaces / $indent_columns );
1592 543         1430 return ($level);
1593             } ## end sub guess_old_indentation_level
1594              
1595             # This is a currently unused debug routine
1596             sub dump_functions {
1597              
1598 0     0 0 0 my $fh = *STDOUT;
1599 0         0 foreach my $pkg ( keys %{$ris_user_function} ) {
  0         0  
1600 0         0 $fh->print("\nnon-constant subs in package $pkg\n");
1601              
1602 0         0 foreach my $sub ( keys %{ $ris_user_function->{$pkg} } ) {
  0         0  
1603 0         0 my $msg = EMPTY_STRING;
1604 0 0       0 if ( $ris_block_list_function->{$pkg}{$sub} ) {
1605 0         0 $msg = 'block_list';
1606             }
1607              
1608 0 0       0 if ( $ris_block_function->{$pkg}{$sub} ) {
1609 0         0 $msg = 'block';
1610             }
1611 0         0 $fh->print("$sub $msg\n");
1612             }
1613             }
1614              
1615 0         0 foreach my $pkg ( keys %{$ris_constant} ) {
  0         0  
1616 0         0 $fh->print("\nconstants and constant subs in package $pkg\n");
1617              
1618 0         0 foreach my $sub ( keys %{ $ris_constant->{$pkg} } ) {
  0         0  
1619 0         0 $fh->print("$sub\n");
1620             }
1621             }
1622 0         0 return;
1623             } ## end sub dump_functions
1624              
1625             sub prepare_for_a_new_file {
1626              
1627 561     561 0 1975 my ( $self, $source_object ) = @_;
1628              
1629             # copy the source object lines to an array of lines
1630 561         3055 $self->make_source_array($source_object);
1631              
1632             # previous tokens needed to determine what to expect next
1633 561         1786 $last_nonblank_token = ';'; # the only possible starting state which
1634 561         1384 $last_nonblank_type = ';'; # will make a leading brace a code block
1635 561         1328 $last_nonblank_block_type = EMPTY_STRING;
1636              
1637             # scalars for remembering statement types across multiple lines
1638 561         1348 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1639              
1640             # scalars for remembering where we are in the file
1641 561         1355 $current_package = "main";
1642 561         1265 $context = UNKNOWN_CONTEXT;
1643              
1644             # hashes used to remember function information
1645 561         1932 $ris_constant = {}; # user-defined constants
1646 561         1856 $ris_user_function = {}; # user-defined functions
1647 561         1611 $ruser_function_prototype = {}; # their prototypes
1648 561         1591 $ris_block_function = {};
1649 561         1531 $ris_block_list_function = {};
1650 561         1419 $rsaw_function_definition = {};
1651 561         1323 $rsaw_use_module = {};
1652              
1653             # variables used to track depths of various containers
1654             # and report nesting errors
1655 561         1137 $paren_depth = 0;
1656 561         1179 $brace_depth = 0;
1657 561         1090 $square_bracket_depth = 0;
1658 561         2693 $rcurrent_depth = [ (0) x scalar @closing_brace_names ];
1659 561         1323 $total_depth = 0;
1660 561         2316 $rtotal_depth = [];
1661 561         2114 $rcurrent_sequence_number = [];
1662 561         1240 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1663              
1664 561         1882 $rparen_type = [];
1665 561         1629 $rparen_semicolon_count = [];
1666 561         2379 $rparen_vars = [];
1667 561         1862 $rbrace_type = [];
1668 561         1711 $rbrace_structural_type = [];
1669 561         1517 $rbrace_context = [];
1670 561         1707 $rbrace_package = [];
1671 561         1596 $rsquare_bracket_type = [];
1672 561         1460 $rsquare_bracket_structural_type = [];
1673 561         3510 $rdepth_array = [];
1674 561         1235 $rnested_ternary_flag = [];
1675 561         3692 $rnested_statement_type = [];
1676 561         3116 $rstarting_line_of_current_depth = [];
1677              
1678 561         1773 $rparen_type->[$paren_depth] = EMPTY_STRING;
1679 561         1583 $rparen_semicolon_count->[$paren_depth] = 0;
1680 561         1488 $rparen_vars->[$paren_depth] = [];
1681 561         1579 $rbrace_type->[$brace_depth] = ';'; # identify opening brace as code block
1682 561         1535 $rbrace_structural_type->[$brace_depth] = EMPTY_STRING;
1683 561         1413 $rbrace_context->[$brace_depth] = UNKNOWN_CONTEXT;
1684 561         1376 $rbrace_package->[$paren_depth] = $current_package;
1685 561         1366 $rsquare_bracket_type->[$square_bracket_depth] = EMPTY_STRING;
1686 561         1309 $rsquare_bracket_structural_type->[$square_bracket_depth] = EMPTY_STRING;
1687              
1688 561         2697 initialize_tokenizer_state();
1689 561         1134 return;
1690             } ## end sub prepare_for_a_new_file
1691              
1692             { ## closure for sub tokenize_this_line
1693              
1694 39     39   397 use constant BRACE => 0;
  39         98  
  39         2686  
1695 39     39   307 use constant SQUARE_BRACKET => 1;
  39         109  
  39         2316  
1696 39     39   288 use constant PAREN => 2;
  39         97  
  39         2227  
1697 39     39   296 use constant QUESTION_COLON => 3;
  39         87  
  39         86024  
1698              
1699             # TV1: scalars for processing one LINE.
1700             # Re-initialized on each entry to sub tokenize_this_line.
1701             my (
1702             $block_type, $container_type, $expecting,
1703             $i, $i_tok, $input_line,
1704             $input_line_number, $last_nonblank_i, $max_token_index,
1705             $next_tok, $next_type, $peeked_ahead,
1706             $prototype, $rhere_target_list, $rtoken_map,
1707             $rtoken_type, $rtokens, $tok,
1708             $type, $type_sequence, $indent_flag,
1709             );
1710              
1711             # TV2: refs to ARRAYS for processing one LINE
1712             # Re-initialized on each call.
1713             my $routput_token_list = []; # stack of output token indexes
1714             my $routput_token_type = []; # token types
1715             my $routput_block_type = []; # types of code block
1716             my $routput_container_type = []; # paren types, such as if, elsif, ..
1717             my $routput_type_sequence = []; # nesting sequential number
1718             my $routput_indent_flag = []; #
1719              
1720             # TV3: SCALARS for quote variables. These are initialized with a
1721             # subroutine call and continually updated as lines are processed.
1722             my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1723             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1724              
1725             # TV4: SCALARS for multi-line identifiers and
1726             # statements. These are initialized with a subroutine call
1727             # and continually updated as lines are processed.
1728             my ( $id_scan_state, $identifier, $want_paren );
1729              
1730             # TV5: SCALARS for tracking indentation level.
1731             # Initialized once and continually updated as lines are
1732             # processed.
1733             my (
1734             $nesting_token_string, $nesting_block_string,
1735             $nesting_block_flag, $level_in_tokenizer,
1736             );
1737              
1738             # TV6: SCALARS for remembering several previous
1739             # tokens. Initialized once and continually updated as
1740             # lines are processed.
1741             my (
1742             $last_nonblank_container_type, $last_nonblank_type_sequence,
1743             $last_last_nonblank_token, $last_last_nonblank_type,
1744             $last_nonblank_prototype,
1745             );
1746              
1747             # ----------------------------------------------------------------
1748             # beginning of tokenizer variable access and manipulation routines
1749             # ----------------------------------------------------------------
1750              
1751             sub initialize_tokenizer_state {
1752              
1753             # GV1: initialized once
1754             # TV1: initialized on each call
1755             # TV2: initialized on each call
1756             # TV3:
1757 561     561 0 1251 $in_quote = 0;
1758 561         1371 $quote_type = 'Q';
1759 561         1218 $quote_character = EMPTY_STRING;
1760 561         1224 $quote_pos = 0;
1761 561         1177 $quote_depth = 0;
1762 561         1247 $quoted_string_1 = EMPTY_STRING;
1763 561         1247 $quoted_string_2 = EMPTY_STRING;
1764 561         1183 $allowed_quote_modifiers = EMPTY_STRING;
1765              
1766             # TV4:
1767 561         1144 $id_scan_state = EMPTY_STRING;
1768 561         1186 $identifier = EMPTY_STRING;
1769 561         1194 $want_paren = EMPTY_STRING;
1770              
1771             # TV5:
1772 561         1361 $nesting_token_string = EMPTY_STRING;
1773 561         1229 $nesting_block_string = '1'; # initially in a block
1774 561         1069 $nesting_block_flag = 1;
1775 561         1107 $level_in_tokenizer = 0;
1776              
1777             # TV6:
1778 561         1232 $last_nonblank_container_type = EMPTY_STRING;
1779 561         1155 $last_nonblank_type_sequence = EMPTY_STRING;
1780 561         1275 $last_last_nonblank_token = ';';
1781 561         1163 $last_last_nonblank_type = ';';
1782 561         1181 $last_nonblank_prototype = EMPTY_STRING;
1783 561         1154 return;
1784             } ## end sub initialize_tokenizer_state
1785              
1786             sub save_tokenizer_state {
1787              
1788             # Global variables:
1789 0     0 0 0 my $rGV1 = [
1790             $brace_depth,
1791             $context,
1792             $current_package,
1793             $last_nonblank_block_type,
1794             $last_nonblank_token,
1795             $last_nonblank_type,
1796             $next_sequence_number,
1797             $paren_depth,
1798             $rbrace_context,
1799             $rbrace_package,
1800             $rbrace_structural_type,
1801             $rbrace_type,
1802             $rcurrent_depth,
1803             $rcurrent_sequence_number,
1804             $rdepth_array,
1805             $ris_block_function,
1806             $ris_block_list_function,
1807             $ris_constant,
1808             $ris_user_function,
1809             $rnested_statement_type,
1810             $rnested_ternary_flag,
1811             $rparen_semicolon_count,
1812             $rparen_vars,
1813             $rparen_type,
1814             $rsaw_function_definition,
1815             $rsaw_use_module,
1816             $rsquare_bracket_structural_type,
1817             $rsquare_bracket_type,
1818             $rstarting_line_of_current_depth,
1819             $rtotal_depth,
1820             $ruser_function_prototype,
1821             $square_bracket_depth,
1822             $statement_type,
1823             $total_depth,
1824              
1825             ];
1826              
1827             # Tokenizer closure variables:
1828 0         0 my $rTV1 = [
1829             $block_type, $container_type, $expecting,
1830             $i, $i_tok, $input_line,
1831             $input_line_number, $last_nonblank_i, $max_token_index,
1832             $next_tok, $next_type, $peeked_ahead,
1833             $prototype, $rhere_target_list, $rtoken_map,
1834             $rtoken_type, $rtokens, $tok,
1835             $type, $type_sequence, $indent_flag,
1836             ];
1837              
1838 0         0 my $rTV2 = [
1839             $routput_token_list, $routput_token_type,
1840             $routput_block_type, $routput_container_type,
1841             $routput_type_sequence, $routput_indent_flag,
1842             ];
1843              
1844 0         0 my $rTV3 = [
1845             $in_quote, $quote_type,
1846             $quote_character, $quote_pos,
1847             $quote_depth, $quoted_string_1,
1848             $quoted_string_2, $allowed_quote_modifiers,
1849             ];
1850              
1851 0         0 my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1852              
1853 0         0 my $rTV5 = [
1854             $nesting_token_string, $nesting_block_string,
1855             $nesting_block_flag, $level_in_tokenizer,
1856             ];
1857              
1858 0         0 my $rTV6 = [
1859             $last_nonblank_container_type, $last_nonblank_type_sequence,
1860             $last_last_nonblank_token, $last_last_nonblank_type,
1861             $last_nonblank_prototype,
1862             ];
1863 0         0 return [ $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1864             } ## end sub save_tokenizer_state
1865              
1866             sub restore_tokenizer_state {
1867 0     0 0 0 my ($rstate) = @_;
1868 0         0 my ( $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
  0         0  
1869              
1870             (
1871             $brace_depth,
1872             $context,
1873             $current_package,
1874             $last_nonblank_block_type,
1875             $last_nonblank_token,
1876             $last_nonblank_type,
1877             $next_sequence_number,
1878             $paren_depth,
1879             $rbrace_context,
1880             $rbrace_package,
1881             $rbrace_structural_type,
1882             $rbrace_type,
1883             $rcurrent_depth,
1884             $rcurrent_sequence_number,
1885             $rdepth_array,
1886             $ris_block_function,
1887             $ris_block_list_function,
1888             $ris_constant,
1889             $ris_user_function,
1890             $rnested_statement_type,
1891             $rnested_ternary_flag,
1892             $rparen_semicolon_count,
1893             $rparen_vars,
1894             $rparen_type,
1895             $rsaw_function_definition,
1896             $rsaw_use_module,
1897             $rsquare_bracket_structural_type,
1898             $rsquare_bracket_type,
1899             $rstarting_line_of_current_depth,
1900             $rtotal_depth,
1901             $ruser_function_prototype,
1902             $square_bracket_depth,
1903             $statement_type,
1904             $total_depth,
1905              
1906 0         0 ) = @{$rGV1};
  0         0  
1907              
1908             (
1909             $block_type, $container_type, $expecting,
1910             $i, $i_tok, $input_line,
1911             $input_line_number, $last_nonblank_i, $max_token_index,
1912             $next_tok, $next_type, $peeked_ahead,
1913             $prototype, $rhere_target_list, $rtoken_map,
1914             $rtoken_type, $rtokens, $tok,
1915             $type, $type_sequence, $indent_flag,
1916 0         0 ) = @{$rTV1};
  0         0  
1917              
1918             (
1919             $routput_token_list, $routput_token_type,
1920             $routput_block_type, $routput_container_type,
1921             $routput_type_sequence, $routput_indent_flag,
1922 0         0 ) = @{$rTV2};
  0         0  
1923              
1924             (
1925             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1926             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1927 0         0 ) = @{$rTV3};
  0         0  
1928              
1929 0         0 ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
  0         0  
1930              
1931             (
1932             $nesting_token_string, $nesting_block_string,
1933             $nesting_block_flag, $level_in_tokenizer,
1934 0         0 ) = @{$rTV5};
  0         0  
1935              
1936             (
1937             $last_nonblank_container_type, $last_nonblank_type_sequence,
1938             $last_last_nonblank_token, $last_last_nonblank_type,
1939             $last_nonblank_prototype,
1940 0         0 ) = @{$rTV6};
  0         0  
1941 0         0 return;
1942             } ## end sub restore_tokenizer_state
1943              
1944             sub split_pretoken {
1945              
1946 8     8 0 19 my ( $self, $numc ) = @_;
1947              
1948             # Split the leading $numc characters from the current token (at index=$i)
1949             # which is pre-type 'w' and insert the remainder back into the pretoken
1950             # stream with appropriate settings. Since we are splitting a pre-type 'w',
1951             # there are three cases, depending on if the remainder starts with a digit:
1952             # Case 1: remainder is type 'd', all digits
1953             # Case 2: remainder is type 'd' and type 'w': digits and other characters
1954             # Case 3: remainder is type 'w'
1955              
1956             # Examples, for $numc=1:
1957             # $tok => $tok_0 $tok_1 $tok_2
1958             # 'x10' => 'x' '10' # case 1
1959             # 'x10if' => 'x' '10' 'if' # case 2
1960             # '0ne => 'O' 'ne' # case 3
1961              
1962             # where:
1963             # $tok_1 is a possible string of digits (pre-type 'd')
1964             # $tok_2 is a possible word (pre-type 'w')
1965              
1966             # return 1 if successful
1967             # return undef if error (shouldn't happen)
1968              
1969             # Calling routine should update '$type' and '$tok' if successful.
1970              
1971 8         18 my $pretoken = $rtokens->[$i];
1972 8 50 33     80 if ( $pretoken
      33        
1973             && length($pretoken) > $numc
1974             && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1975             {
1976              
1977             # Split $tok into up to 3 tokens:
1978 8         21 my $tok_0 = substr( $pretoken, 0, $numc );
1979 8 50       28 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1980 8 50       27 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1981              
1982 8         12 my $len_0 = length($tok_0);
1983 8         13 my $len_1 = length($tok_1);
1984 8         14 my $len_2 = length($tok_2);
1985              
1986 8         16 my $pre_type_0 = 'w';
1987 8         13 my $pre_type_1 = 'd';
1988 8         14 my $pre_type_2 = 'w';
1989              
1990 8         16 my $pos_0 = $rtoken_map->[$i];
1991 8         14 my $pos_1 = $pos_0 + $len_0;
1992 8         13 my $pos_2 = $pos_1 + $len_1;
1993              
1994 8         13 my $isplice = $i + 1;
1995              
1996             # Splice in any digits
1997 8 100       24 if ($len_1) {
1998 5         12 splice @{$rtoken_map}, $isplice, 0, $pos_1;
  5         22  
1999 5         18 splice @{$rtokens}, $isplice, 0, $tok_1;
  5         12  
2000 5         11 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
  5         11  
2001 5         11 $max_token_index++;
2002 5         10 $isplice++;
2003             }
2004              
2005             # Splice in any trailing word
2006 8 100       20 if ($len_2) {
2007 4         5 splice @{$rtoken_map}, $isplice, 0, $pos_2;
  4         11  
2008 4         11 splice @{$rtokens}, $isplice, 0, $tok_2;
  4         10  
2009 4         6 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
  4         12  
2010 4         6 $max_token_index++;
2011             }
2012              
2013 8         18 $rtokens->[$i] = $tok_0;
2014 8         28 return 1;
2015             }
2016             else {
2017              
2018             # Shouldn't get here
2019 0         0 if (DEVEL_MODE) {
2020             $self->Fault(<<EOM);
2021             While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
2022             EOM
2023             }
2024             }
2025 0         0 return;
2026             } ## end sub split_pretoken
2027              
2028             sub get_indentation_level {
2029 561     561 0 1551 return $level_in_tokenizer;
2030             }
2031              
2032             sub reset_indentation_level {
2033 561     561 0 1458 $level_in_tokenizer = shift;
2034 561         1135 return;
2035             }
2036              
2037             sub peeked_ahead {
2038 252     252 0 490 my $flag = shift;
2039 252 100       680 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
2040 252         606 return $peeked_ahead;
2041             }
2042              
2043             # ------------------------------------------------------------
2044             # end of tokenizer variable access and manipulation routines
2045             # ------------------------------------------------------------
2046              
2047             #------------------------------
2048             # beginning of tokenizer hashes
2049             #------------------------------
2050              
2051             my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
2052              
2053             # These block types terminate statements and do not need a trailing
2054             # semicolon
2055             # patched for SWITCH/CASE/
2056             # NOTE: not currently used but may be used in the future
2057             my %is_zero_continuation_block_type;
2058             my @q;
2059             @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
2060             if elsif else unless while until for foreach switch case given when);
2061             @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
2062              
2063             my %is_logical_container;
2064             @q = qw(if elsif unless while and or err not && ! || for foreach);
2065             @is_logical_container{@q} = (1) x scalar(@q);
2066              
2067             my %is_binary_type;
2068             @q = qw(|| &&);
2069             @is_binary_type{@q} = (1) x scalar(@q);
2070              
2071             my %is_binary_keyword;
2072             @q = qw(and or err eq ne cmp);
2073             @is_binary_keyword{@q} = (1) x scalar(@q);
2074              
2075             # 'L' is token for opening { at hash key
2076             my %is_opening_type;
2077             @q = qw< L { ( [ >;
2078             @is_opening_type{@q} = (1) x scalar(@q);
2079              
2080             my %is_opening_or_ternary_type;
2081             push @q, '?';
2082             @is_opening_or_ternary_type{@q} = (1) x scalar(@q);
2083              
2084             # 'R' is token for closing } at hash key
2085             my %is_closing_type;
2086             @q = qw< R } ) ] >;
2087             @is_closing_type{@q} = (1) x scalar(@q);
2088              
2089             my %is_closing_or_ternary_type;
2090             push @q, ':';
2091             @is_closing_or_ternary_type{@q} = (1) x scalar(@q);
2092              
2093             my %is_redo_last_next_goto;
2094             @q = qw(redo last next goto);
2095             @is_redo_last_next_goto{@q} = (1) x scalar(@q);
2096              
2097             my %is_use_require;
2098             @q = qw(use require);
2099             @is_use_require{@q} = (1) x scalar(@q);
2100              
2101             # This hash holds the array index in $self for these keywords:
2102             # Fix for issue c035: removed 'format' from this hash
2103             my %is_END_DATA = (
2104             '__END__' => _in_end_,
2105             '__DATA__' => _in_data_,
2106             );
2107              
2108             my %is_list_end_type;
2109             @q = qw( ; { } );
2110             push @q, ',';
2111             @is_list_end_type{@q} = (1) x scalar(@q);
2112              
2113             # original ref: camel 3 p 147,
2114             # but perl may accept undocumented flags
2115             # perl 5.10 adds 'p' (preserve)
2116             # Perl version 5.22 added 'n'
2117             # From http://perldoc.perl.org/perlop.html we have
2118             # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2119             # s/PATTERN/REPLACEMENT/msixpodualngcer
2120             # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2121             # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2122             # qr/STRING/msixpodualn
2123             my %quote_modifiers = (
2124             's' => '[msixpodualngcer]',
2125             'y' => '[cdsr]',
2126             'tr' => '[cdsr]',
2127             'm' => '[msixpodualngc]',
2128             'qr' => '[msixpodualn]',
2129             'q' => EMPTY_STRING,
2130             'qq' => EMPTY_STRING,
2131             'qw' => EMPTY_STRING,
2132             'qx' => EMPTY_STRING,
2133             );
2134              
2135             # table showing how many quoted things to look for after quote operator..
2136             # s, y, tr have 2 (pattern and replacement)
2137             # others have 1 (pattern only)
2138             my %quote_items = (
2139             's' => 2,
2140             'y' => 2,
2141             'tr' => 2,
2142             'm' => 1,
2143             'qr' => 1,
2144             'q' => 1,
2145             'qq' => 1,
2146             'qw' => 1,
2147             'qx' => 1,
2148             );
2149              
2150             my %is_for_foreach;
2151             @q = qw(for foreach);
2152             @is_for_foreach{@q} = (1) x scalar(@q);
2153              
2154             # These keywords may introduce blocks after parenthesized expressions,
2155             # in the form:
2156             # keyword ( .... ) { BLOCK }
2157             # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
2158             # NOTE for --use-feature=class: if ADJUST blocks eventually take a
2159             # parameter list, then ADJUST might need to be added to this list (see
2160             # perlclass.pod)
2161             my %is_blocktype_with_paren;
2162             @q =
2163             qw(if elsif unless while until for foreach switch case given when catch);
2164             @is_blocktype_with_paren{@q} = (1) x scalar(@q);
2165              
2166             my %is_case_default;
2167             @q = qw(case default);
2168             @is_case_default{@q} = (1) x scalar(@q);
2169              
2170             #------------------------
2171             # end of tokenizer hashes
2172             #------------------------
2173              
2174             # ------------------------------------------------------------
2175             # beginning of various scanner interface routines
2176             # ------------------------------------------------------------
2177             sub scan_replacement_text {
2178              
2179             # check for here-docs in replacement text invoked by
2180             # a substitution operator with executable modifier 'e'.
2181             #
2182             # given:
2183             # $replacement_text
2184             # return:
2185             # $rht = reference to any here-doc targets
2186 0     0 0 0 my ( $self, $replacement_text ) = @_;
2187              
2188             # quick check
2189 0 0       0 return if ( $replacement_text !~ /<</ );
2190              
2191 0         0 $self->write_logfile_entry(
2192             "scanning replacement text for here-doc targets\n");
2193              
2194             # save the logger object for error messages
2195 0         0 my $logger_object = $self->[_logger_object_];
2196              
2197             # save all lexical variables
2198 0         0 my $rstate = save_tokenizer_state();
2199 0         0 _decrement_count(); # avoid error check for multiple tokenizers
2200              
2201             # make a new tokenizer
2202 0         0 my $tokenizer = Perl::Tidy::Tokenizer->new(
2203             source_object => \$replacement_text,
2204             logger_object => $logger_object,
2205             starting_line_number => $input_line_number,
2206             );
2207              
2208             # scan the replacement text
2209 0         0 while ( $tokenizer->get_line() ) { }
2210              
2211             # remove any here doc targets
2212 0         0 my $rht = undef;
2213 0 0       0 if ( $tokenizer->[_in_here_doc_] ) {
2214 0         0 $rht = [];
2215 0         0 push @{$rht},
  0         0  
2216             [
2217             $tokenizer->[_here_doc_target_],
2218             $tokenizer->[_here_quote_character_]
2219             ];
2220 0 0       0 if ( $tokenizer->[_rhere_target_list_] ) {
2221 0         0 push @{$rht}, @{ $tokenizer->[_rhere_target_list_] };
  0         0  
  0         0  
2222 0         0 $tokenizer->[_rhere_target_list_] = undef;
2223             }
2224 0         0 $tokenizer->[_in_here_doc_] = undef;
2225             }
2226              
2227             # now its safe to report errors
2228 0         0 my $severe_error = $tokenizer->report_tokenization_errors();
2229              
2230             # TODO: Could propagate a severe error up
2231              
2232             # restore all tokenizer lexical variables
2233 0         0 restore_tokenizer_state($rstate);
2234              
2235             # return the here doc targets
2236 0         0 return $rht;
2237             } ## end sub scan_replacement_text
2238              
2239             sub scan_bare_identifier {
2240 1672     1672 0 2959 my $self = shift;
2241 1672         5151 ( $i, $tok, $type, $prototype ) =
2242             $self->scan_bare_identifier_do( $input_line, $i, $tok, $type,
2243             $prototype, $rtoken_map, $max_token_index );
2244 1672         3466 return;
2245             } ## end sub scan_bare_identifier
2246              
2247             sub scan_identifier {
2248              
2249 486     486 0 861 my $self = shift;
2250              
2251             (
2252 486         1952 $i, $tok, $type, $id_scan_state, $identifier,
2253             my $split_pretoken_flag
2254             )
2255             = $self->scan_complex_identifier( $i, $id_scan_state, $identifier,
2256             $rtokens, $max_token_index, $expecting,
2257             $rparen_type->[$paren_depth] );
2258              
2259             # Check for signal to fix a special variable adjacent to a keyword,
2260             # such as '$^One$0'.
2261 486 100       1467 if ($split_pretoken_flag) {
2262              
2263             # Try to fix it by splitting the pretoken
2264 3 50 33     23 if ( $i > 0
      33        
2265             && $rtokens->[ $i - 1 ] eq '^'
2266             && $self->split_pretoken(1) )
2267             {
2268 3         10 $identifier = substr( $identifier, 0, 3 );
2269 3         5 $tok = $identifier;
2270             }
2271             else {
2272              
2273             # This shouldn't happen ...
2274 0         0 my $var = substr( $tok, 0, 3 );
2275 0         0 my $excess = substr( $tok, 3 );
2276 0         0 $self->interrupt_logfile();
2277 0         0 $self->warning(<<EOM);
2278             $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2279             A space may be needed after '$var'.
2280             EOM
2281 0         0 $self->resume_logfile();
2282             }
2283             }
2284 486         883 return;
2285             } ## end sub scan_identifier
2286              
2287 39     39   359 use constant VERIFY_FASTSCAN => 0;
  39         104  
  39         3977  
2288             my %fast_scan_context;
2289              
2290             BEGIN {
2291 39     39   50174 %fast_scan_context = (
2292             '$' => SCALAR_CONTEXT,
2293             '*' => SCALAR_CONTEXT,
2294             '@' => LIST_CONTEXT,
2295             '%' => LIST_CONTEXT,
2296             '&' => UNKNOWN_CONTEXT,
2297             );
2298             } ## end BEGIN
2299              
2300             sub scan_simple_identifier {
2301              
2302             # This is a wrapper for sub scan_identifier. It does a fast preliminary
2303             # scan for certain common identifiers:
2304             # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2305             # If it does not find one of these, or this is a restart, it calls the
2306             # original scanner directly.
2307              
2308             # This gives the same results as the full scanner in about 1/4 the
2309             # total runtime for a typical input stream.
2310              
2311             # Notation:
2312             # $var * 2
2313             # ^^ ^
2314             # || |
2315             # || ---- $i_next [= next nonblank pretoken ]
2316             # |----$i_plus_1 [= a bareword ]
2317             # ---$i_begin [= a sigil]
2318              
2319 4791     4791 0 7125 my $self = shift;
2320              
2321 4791         6836 my $i_begin = $i;
2322 4791         7241 my $tok_begin = $tok;
2323 4791         7307 my $i_plus_1 = $i + 1;
2324 4791         6793 my $fast_scan_type;
2325              
2326             #-------------------------------------------------------
2327             # Do full scan for anything following a pointer, such as
2328             # $cref->&*; # a postderef
2329             #-------------------------------------------------------
2330 4791 100 66     27369 if ( $last_nonblank_token eq '->' ) {
    100 66        
    50 33        
      0        
      33        
2331              
2332             }
2333              
2334             #------------------------------
2335             # quick scan with leading sigil
2336             #------------------------------
2337             elsif ( !$id_scan_state
2338             && $i_plus_1 <= $max_token_index
2339             && $fast_scan_context{$tok} )
2340             {
2341 4678         8438 $context = $fast_scan_context{$tok};
2342              
2343             # look for $var, @var, ...
2344 4678 100 100     11163 if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
    100 66        
2345 4390         6957 my $pretype_next = EMPTY_STRING;
2346 4390 100       9434 if ( $i_plus_1 < $max_token_index ) {
2347 4274         6657 my $i_next = $i_plus_1 + 1;
2348 4274 100 100     13230 if ( $rtoken_type->[$i_next] eq 'b'
2349             && $i_next < $max_token_index )
2350             {
2351 1707         3085 $i_next += 1;
2352             }
2353 4274         7308 $pretype_next = $rtoken_type->[$i_next];
2354             }
2355 4390 100 100     16207 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2356              
2357             # Found type 'i' like '$var', '@var', or '%var'
2358 4282         8389 $identifier = $tok . $rtokens->[$i_plus_1];
2359 4282         6509 $tok = $identifier;
2360 4282         6973 $type = 'i';
2361 4282         6136 $i = $i_plus_1;
2362 4282         7289 $fast_scan_type = $type;
2363             }
2364             }
2365              
2366             # Look for @{ or %{ .
2367             # But we must let the full scanner handle things ${ because it may
2368             # keep going to get a complete identifier like '${#}' .
2369             elsif (
2370             $rtoken_type->[$i_plus_1] eq '{'
2371             && ( $tok_begin eq '@'
2372             || $tok_begin eq '%' )
2373             )
2374             {
2375              
2376 30         70 $identifier = $tok;
2377 30         64 $type = 't';
2378 30         58 $fast_scan_type = $type;
2379             }
2380             else {
2381             ## out of tricks
2382             }
2383             }
2384              
2385             #---------------------------
2386             # Quick scan with leading ->
2387             # Look for ->[ and ->{
2388             #---------------------------
2389             elsif (
2390             $tok eq '->'
2391             && $i < $max_token_index
2392             && ( $rtokens->[$i_plus_1] eq '{'
2393             || $rtokens->[$i_plus_1] eq '[' )
2394             )
2395             {
2396 0         0 $type = $tok;
2397 0         0 $fast_scan_type = $type;
2398 0         0 $identifier = $tok;
2399 0         0 $context = UNKNOWN_CONTEXT;
2400             }
2401             else {
2402             ## out of tricks
2403             }
2404              
2405             #--------------------------------------
2406             # Verify correctness during development
2407             #--------------------------------------
2408 4791         6827 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2409              
2410             # We will call the full method
2411             my $identifier_simple = $identifier;
2412             my $tok_simple = $tok;
2413             my $i_simple = $i;
2414             my $context_simple = $context;
2415              
2416             $tok = $tok_begin;
2417             $i = $i_begin;
2418             $self->scan_identifier();
2419              
2420             if ( $tok ne $tok_simple
2421             || $type ne $fast_scan_type
2422             || $i != $i_simple
2423             || $identifier ne $identifier_simple
2424             || $id_scan_state
2425             || $context ne $context_simple )
2426             {
2427             print {*STDERR} <<EOM;
2428             scan_simple_identifier differs from scan_identifier:
2429             simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2430             full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2431             EOM
2432             }
2433             }
2434              
2435             #-------------------------------------------------
2436             # call full scanner if fast method did not succeed
2437             #-------------------------------------------------
2438 4791 100       10064 if ( !$fast_scan_type ) {
2439 479         1641 $self->scan_identifier();
2440             }
2441 4791         8394 return;
2442             } ## end sub scan_simple_identifier
2443              
2444             sub method_ok_here {
2445              
2446             # Return:
2447             # false if this is definitely an invalid method declaration
2448             # true otherwise (even if not sure)
2449              
2450             # We are trying to avoid problems with old uses of 'method'
2451             # when --use-feature=class is set (rt145706).
2452             # For example, this should cause a return of 'false':
2453              
2454             # method paint => sub {
2455             # return;
2456             # };
2457              
2458 6     6 0 15 my $self = shift;
2459              
2460             # from do_scan_sub:
2461 6         16 my $i_beg = $i + 1;
2462 6         13 my $pos_beg = $rtoken_map->[$i_beg];
2463 6         21 pos($input_line) = $pos_beg;
2464              
2465             # TEST 1: look a valid sub NAME
2466 6 50       43 if (
2467             $input_line =~ m{\G\s*
2468             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2469             (\w+) # NAME - required
2470             }gcx
2471             )
2472             {
2473             # For possible future use..
2474 6         21 my $subname = $2;
2475 6 50       22 my $package = $1 ? $1 : EMPTY_STRING;
2476             }
2477             else {
2478 0         0 return;
2479             }
2480              
2481             # TEST 2: look for invalid characters after name, such as here:
2482             # method paint => sub {
2483             # ...
2484             # }
2485 6         15 my $next_char = EMPTY_STRING;
2486 6 100       30 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  5         13  
2487 6 100 66     37 if ( !$next_char || $next_char eq '#' ) {
2488 1         7 ( $next_char, my $i_next ) =
2489             $self->find_next_nonblank_token( $max_token_index,
2490             $rtokens, $max_token_index );
2491             }
2492              
2493 6 50       22 if ( !$next_char ) {
2494              
2495             # out of characters - give up
2496 0         0 return;
2497             }
2498              
2499             # Possibly valid next token types:
2500             # '(' could start prototype or signature
2501             # ':' could start ATTRIBUTE
2502             # '{' cold start BLOCK
2503             # ';' or '}' could end a statement
2504 6 100       28 if ( $next_char !~ /^[\(\:\{\;\}]/ ) {
2505              
2506             # This does not match use feature 'class' syntax
2507 3         21 return;
2508             }
2509              
2510             # We will stop here and assume that this is valid syntax for
2511             # use feature 'class'.
2512 3         15 return 1;
2513             } ## end sub method_ok_here
2514              
2515             sub class_ok_here {
2516              
2517             # Return:
2518             # false if this is definitely an invalid class declaration
2519             # true otherwise (even if not sure)
2520              
2521             # We are trying to avoid problems with old uses of 'class'
2522             # when --use-feature=class is set (rt145706). We look ahead
2523             # see if this use of 'class' is obviously inconsistent with
2524             # the syntax of use feature 'class'. This allows the default
2525             # setting --use-feature=class to work for old syntax too.
2526              
2527             # Valid class declarations look like
2528             # class NAME ?ATTRS ?VERSION ?BLOCK
2529             # where ATTRS VERSION and BLOCK are optional
2530              
2531             # For example, this should produce a return of 'false':
2532             #
2533             # class ExtendsBasicAttributes is BasicAttributes{
2534              
2535 6     6 0 12 my $self = shift;
2536              
2537             # TEST 1: class stmt can only go where a new statment can start
2538 6 50       16 if ( !new_statement_ok() ) { return }
  0         0  
2539              
2540 6         17 my $i_beg = $i + 1;
2541 6         29 my $pos_beg = $rtoken_map->[$i_beg];
2542 6         18 pos($input_line) = $pos_beg;
2543              
2544             # TEST 2: look for a valid NAME
2545 6 50       39 if (
2546             $input_line =~ m{\G\s*
2547             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2548             (\w+) # NAME - required
2549             }gcx
2550             )
2551             {
2552             # For possible future use..
2553 6         17 my $subname = $2;
2554 6 100       21 my $package = $1 ? $1 : EMPTY_STRING;
2555             }
2556             else {
2557 0         0 return;
2558             }
2559              
2560             # TEST 3: look for valid characters after NAME
2561 6         12 my $next_char = EMPTY_STRING;
2562 6 100       22 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  5         11  
2563 6 100 66     32 if ( !$next_char || $next_char eq '#' ) {
2564 1         4 ( $next_char, my $i_next ) =
2565             $self->find_next_nonblank_token( $max_token_index,
2566             $rtokens, $max_token_index );
2567             }
2568 6 50       15 if ( !$next_char ) {
2569              
2570             # out of characters - give up
2571 0         0 return;
2572             }
2573              
2574             # Must see one of: ATTRIBUTE, VERSION, BLOCK, or end stmt
2575              
2576             # Possibly valid next token types:
2577             # ':' could start ATTRIBUTE
2578             # '\d' could start VERSION
2579             # '{' cold start BLOCK
2580             # ';' could end a statement
2581             # '}' could end statement but would be strange
2582              
2583 6 100       19 if ( $next_char !~ /^[\:\d\{\;\}]/ ) {
2584              
2585             # This does not match use feature 'class' syntax
2586 2         9 return;
2587             }
2588              
2589             # We will stop here and assume that this is valid syntax for
2590             # use feature 'class'.
2591 4         20 return 1;
2592             } ## end sub class_ok_here
2593              
2594             sub scan_id {
2595 331     331 0 704 my $self = shift;
2596 331         1302 ( $i, $tok, $type, $id_scan_state ) =
2597             $self->scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2598             $id_scan_state, $max_token_index );
2599 331         832 return;
2600             } ## end sub scan_id
2601              
2602             sub scan_number {
2603 629     629 0 1053 my $self = shift;
2604 629         964 my $number;
2605 629         1770 ( $i, $type, $number ) =
2606             $self->scan_number_do( $input_line, $i, $rtoken_map, $type,
2607             $max_token_index );
2608 629         1424 return $number;
2609             } ## end sub scan_number
2610              
2611 39     39   359 use constant VERIFY_FASTNUM => 0;
  39         131  
  39         32266  
2612              
2613             sub scan_number_fast {
2614              
2615             # This is a wrapper for sub scan_number. It does a fast preliminary
2616             # scan for a simple integer. It calls the original scan_number if it
2617             # does not find one.
2618              
2619 2277     2277 0 3469 my $self = shift;
2620 2277         3559 my $i_begin = $i;
2621 2277         3709 my $tok_begin = $tok;
2622 2277         3147 my $number;
2623              
2624             #---------------------------------
2625             # Quick check for (signed) integer
2626             #---------------------------------
2627              
2628             # This will be the string of digits:
2629 2277         3369 my $i_d = $i;
2630 2277         3822 my $tok_d = $tok;
2631 2277         3912 my $typ_d = $rtoken_type->[$i_d];
2632              
2633             # check for signed integer
2634 2277         3636 my $sign = EMPTY_STRING;
2635 2277 50 66     8699 if ( $typ_d ne 'd'
      66        
      33        
2636             && ( $typ_d eq '+' || $typ_d eq '-' )
2637             && $i_d < $max_token_index )
2638             {
2639 343         585 $sign = $tok_d;
2640 343         583 $i_d++;
2641 343         606 $tok_d = $rtokens->[$i_d];
2642 343         599 $typ_d = $rtoken_type->[$i_d];
2643             }
2644              
2645             # Handle integers
2646 2277 100 100     16884 if (
      100        
2647             $typ_d eq 'd'
2648             && (
2649             $i_d == $max_token_index
2650             || ( $i_d < $max_token_index
2651             && $rtoken_type->[ $i_d + 1 ] ne '.'
2652             && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2653             )
2654             )
2655             {
2656             # Let let full scanner handle multi-digit integers beginning with
2657             # '0' because there could be error messages. For example, '009' is
2658             # not a valid number.
2659              
2660 1715 100 100     7269 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2661 1658         3208 $number = $sign . $tok_d;
2662 1658         2561 $type = 'n';
2663 1658         2802 $i = $i_d;
2664             }
2665             }
2666              
2667             #--------------------------------------
2668             # Verify correctness during development
2669             #--------------------------------------
2670 2277         3241 if ( VERIFY_FASTNUM && defined($number) ) {
2671              
2672             # We will call the full method
2673             my $type_simple = $type;
2674             my $i_simple = $i;
2675             my $number_simple = $number;
2676              
2677             $tok = $tok_begin;
2678             $i = $i_begin;
2679             $number = $self->scan_number();
2680              
2681             if ( $type ne $type_simple
2682             || ( $i != $i_simple && $i <= $max_token_index )
2683             || $number ne $number_simple )
2684             {
2685             print {*STDERR} <<EOM;
2686             scan_number_fast differs from scan_number:
2687             simple: i=$i_simple, type=$type_simple, number=$number_simple
2688             full: i=$i, type=$type, number=$number
2689             EOM
2690             }
2691             }
2692              
2693             #----------------------------------------
2694             # call full scanner if may not be integer
2695             #----------------------------------------
2696 2277 100       5066 if ( !defined($number) ) {
2697 619         1692 $number = $self->scan_number();
2698             }
2699 2277         5488 return $number;
2700             } ## end sub scan_number_fast
2701              
2702             # a sub to warn if token found where term expected
2703             sub error_if_expecting_TERM {
2704 9     9 0 23 my $self = shift;
2705 9 50       31 if ( $expecting == TERM ) {
2706 9 50       32 if ( $really_want_term{$last_nonblank_type} ) {
2707 0         0 $self->report_unexpected( $tok, "term", $i_tok,
2708             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2709 0         0 return 1;
2710             }
2711             }
2712 9         20 return;
2713             } ## end sub error_if_expecting_TERM
2714              
2715             # a sub to warn if token found where operator expected
2716             sub error_if_expecting_OPERATOR {
2717 769     769 0 1577 my ( $self, $thing ) = @_;
2718 769 50       1859 if ( $expecting == OPERATOR ) {
2719 0 0       0 if ( !defined($thing) ) { $thing = $tok }
  0         0  
2720 0         0 $self->report_unexpected( $thing, "operator", $i_tok,
2721             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2722 0 0       0 if ( $i_tok == 0 ) {
2723 0         0 $self->interrupt_logfile();
2724 0         0 $self->warning("Missing ';' or ',' above?\n");
2725 0         0 $self->resume_logfile();
2726             }
2727 0         0 return 1;
2728             }
2729 769         1404 return;
2730             } ## end sub error_if_expecting_OPERATOR
2731              
2732             # ------------------------------------------------------------
2733             # end scanner interfaces
2734             # ------------------------------------------------------------
2735              
2736             #------------------
2737             # Tokenization subs
2738             #------------------
2739             sub do_GREATER_THAN_SIGN {
2740              
2741 31     31 0 100 my $self = shift;
2742              
2743             # '>'
2744 31 50       120 $self->error_if_expecting_TERM()
2745             if ( $expecting == TERM );
2746 31         73 return;
2747             } ## end sub do_GREATER_THAN_SIGN
2748              
2749             sub do_VERTICAL_LINE {
2750              
2751 4     4 0 9 my $self = shift;
2752              
2753             # '|'
2754 4 50       13 $self->error_if_expecting_TERM()
2755             if ( $expecting == TERM );
2756 4         9 return;
2757             } ## end sub do_VERTICAL_LINE
2758              
2759             # An identifier in possible indirect object location followed by any of
2760             # these tokens: -> , ; } (plus others) is not an indirect object. Fix c257.
2761             my %Z_test_hash;
2762              
2763             BEGIN {
2764 39     39   384 my @qZ = qw#
2765             -> ; } ) ]
2766             => =~ = == !~ || >= != *= .. && |= .= -= += <= %=
2767             ^= &&= ||= //= <=>
2768             #;
2769 39         168 push @qZ, ',';
2770 39         352195 @{Z_test_hash}{@qZ} = (1) x scalar(@qZ);
2771             }
2772              
2773             sub do_DOLLAR_SIGN {
2774              
2775 4036     4036 0 6699 my $self = shift;
2776              
2777             # '$'
2778             # start looking for a scalar
2779 4036 50       8890 $self->error_if_expecting_OPERATOR("Scalar")
2780             if ( $expecting == OPERATOR );
2781 4036         12581 $self->scan_simple_identifier();
2782              
2783 4036 100       9110 if ( $identifier eq '$^W' ) {
2784 1         3 $self->[_saw_perl_dash_w_] = 1;
2785             }
2786              
2787             # Check for identifier in indirect object slot
2788             # (vorboard.pl, sort.t). Something like:
2789             # /^(print|printf|sort|exec|system)$/
2790 4036 100 66     32229 if (
      100        
      100        
      66        
      66        
2791             $is_indirect_object_taker{$last_nonblank_token}
2792             && $last_nonblank_type eq 'k'
2793             || ( ( $last_nonblank_token eq '(' )
2794             && $is_indirect_object_taker{ $rparen_type->[$paren_depth] } )
2795             || ( $last_nonblank_type eq 'w'
2796             || $last_nonblank_type eq 'U' ) # possible object
2797             )
2798             {
2799              
2800             # An identifier followed by '->' is not indirect object;
2801             # fixes b1175, b1176. Fix c257: Likewise for other tokens like
2802             # comma, semicolon, closing brace, and single space.
2803 98         640 my ( $next_nonblank_token, $i_next ) =
2804             $self->find_next_noncomment_token( $i, $rtokens,
2805             $max_token_index );
2806 98 100       396 $type = 'Z' if ( !$Z_test_hash{$next_nonblank_token} );
2807             }
2808 4036         6659 return;
2809             } ## end sub do_DOLLAR_SIGN
2810              
2811             sub do_LEFT_PARENTHESIS {
2812              
2813 2125     2125 0 3962 my $self = shift;
2814              
2815             # '('
2816 2125         3452 ++$paren_depth;
2817              
2818             # variable to enable check for brace after closing paren (c230)
2819 2125         3802 my $want_brace = EMPTY_STRING;
2820              
2821 2125 100       6374 if ($want_paren) {
    100          
2822 240         564 $container_type = $want_paren;
2823 240         502 $want_brace = $want_paren;
2824 240         501 $want_paren = EMPTY_STRING;
2825             }
2826             elsif ( $statement_type =~ /^sub\b/ ) {
2827 14         39 $container_type = $statement_type;
2828             }
2829             else {
2830 1871         3217 $container_type = $last_nonblank_token;
2831              
2832             # We can check for a syntax error here of unexpected '(',
2833             # but this is going to get messy...
2834 1871 100 100     7703 if (
2835             $expecting == OPERATOR
2836              
2837             # Be sure this is not a method call of the form
2838             # &method(...), $method->(..), &{method}(...),
2839             # $ref[2](list) is ok & short for $ref[2]->(list)
2840             # NOTE: at present, braces in something like &{ xxx }
2841             # are not marked as a block, we might have a method call.
2842             # Added ')' to fix case c017, something like ()()()
2843             && $last_nonblank_token !~ /^(?:[\]\}\)\&]|\-\>)/
2844             )
2845             {
2846              
2847             # ref: camel 3 p 703.
2848 3 50       10 if ( $last_last_nonblank_token eq 'do' ) {
2849 0         0 $self->complain(
2850             "do SUBROUTINE is deprecated; consider & or -> notation\n"
2851             );
2852             }
2853             else {
2854              
2855             # if this is an empty list, (), then it is not an
2856             # error; for example, we might have a constant pi and
2857             # invoke it with pi() or just pi;
2858 3         8 my ( $next_nonblank_token, $i_next ) =
2859             $self->find_next_nonblank_token( $i, $rtokens,
2860             $max_token_index );
2861              
2862             # Patch for c029: give up error check if
2863             # a side comment follows
2864 3 50 33     13 if ( $next_nonblank_token ne ')'
2865             && $next_nonblank_token ne '#' )
2866             {
2867 0         0 my $hint;
2868              
2869 0         0 $self->error_if_expecting_OPERATOR('(');
2870              
2871 0 0       0 if ( $last_nonblank_type eq 'C' ) {
    0          
2872 0         0 $hint =
2873             "$last_nonblank_token has a void prototype\n";
2874             }
2875             elsif ( $last_nonblank_type eq 'i' ) {
2876 0 0 0     0 if ( $i_tok > 0
2877             && $last_nonblank_token =~ /^\$/ )
2878             {
2879 0         0 $hint =
2880             "Do you mean '$last_nonblank_token->(' ?\n";
2881             }
2882             }
2883             else {
2884             ## no hint
2885             }
2886 0 0       0 if ($hint) {
2887 0         0 $self->interrupt_logfile();
2888 0         0 $self->warning($hint);
2889 0         0 $self->resume_logfile();
2890             }
2891             } ## end if ( $next_nonblank_token...
2892             } ## end else [ if ( $last_last_nonblank_token...
2893             } ## end if ( $expecting == OPERATOR...
2894             }
2895              
2896 2125         7273 ( $type_sequence, $indent_flag ) =
2897             $self->increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2898              
2899             # propagate types down through nested parens
2900             # for example: the second paren in 'if ((' would be structural
2901             # since the first is.
2902              
2903 2125 100       5491 if ( $last_nonblank_token eq '(' ) {
2904 61         178 $type = $last_nonblank_type;
2905             }
2906              
2907             # We exclude parens as structural after a ',' because it
2908             # causes subtle problems with continuation indentation for
2909             # something like this, where the first 'or' will not get
2910             # indented.
2911             #
2912             # assert(
2913             # __LINE__,
2914             # ( not defined $check )
2915             # or ref $check
2916             # or $check eq "new"
2917             # or $check eq "old",
2918             # );
2919             #
2920             # Likewise, we exclude parens where a statement can start
2921             # because of problems with continuation indentation, like
2922             # these:
2923             #
2924             # ($firstline =~ /^#\!.*perl/)
2925             # and (print $File::Find::name, "\n")
2926             # and (return 1);
2927             #
2928             # (ref($usage_fref) =~ /CODE/)
2929             # ? &$usage_fref
2930             # : (&blast_usage, &blast_params, &blast_general_params);
2931              
2932             else {
2933 2064         3480 $type = '{';
2934             }
2935              
2936 2125 50       5224 if ( $last_nonblank_type eq ')' ) {
2937 0         0 $self->warning(
2938             "Syntax error? found token '$last_nonblank_type' then '('\n");
2939             }
2940              
2941             # git #105: Copy container type and want-brace flag at ') (';
2942             # propagate the container type onward so that any subsequent brace gets
2943             # correctly marked. I have implemented this as a general rule, which
2944             # should be safe, but if necessary it could be restricted to certain
2945             # container statement types such as 'for'.
2946 2125 100       5036 if ( $last_nonblank_token eq ')' ) {
2947 1         2 my $rvars = $rparen_vars->[$paren_depth];
2948 1 50       7 if ( defined($rvars) ) {
2949 1         4 $container_type = $rparen_type->[$paren_depth];
2950 1         4 ( my $type_lp, $want_brace ) = @{$rvars};
  1         5  
2951             }
2952             }
2953              
2954 2125         4171 $rparen_type->[$paren_depth] = $container_type;
2955 2125         6030 $rparen_vars->[$paren_depth] = [ $type, $want_brace ];
2956 2125         3998 $rparen_semicolon_count->[$paren_depth] = 0;
2957              
2958 2125         3791 return;
2959              
2960             } ## end sub do_LEFT_PARENTHESIS
2961              
2962             sub do_RIGHT_PARENTHESIS {
2963              
2964 2125     2125 0 4387 my $self = shift;
2965              
2966             # ')'
2967 2125         7368 ( $type_sequence, $indent_flag ) =
2968             $self->decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2969              
2970 2125         4669 my $rvars = $rparen_vars->[$paren_depth];
2971 2125 50       5248 if ( defined($rvars) ) {
2972 2125         3442 my ( $type_lp, $want_brace ) = @{$rvars};
  2125         4750  
2973 2125 50 33     8903 if ( $type_lp && $type_lp eq '{' ) {
2974 2125         4131 $type = '}';
2975             }
2976             }
2977              
2978 2125         4221 $container_type = $rparen_type->[$paren_depth];
2979              
2980             # restore statement type as 'sub' at closing paren of a signature
2981             # so that a subsequent ':' is identified as an attribute
2982 2125 100       6106 if ( $container_type =~ /^sub\b/ ) {
2983 24         61 $statement_type = $container_type;
2984             }
2985              
2986             # /^(for|foreach)$/
2987 2125 100       6147 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } ) {
2988 69         187 my $num_sc = $rparen_semicolon_count->[$paren_depth];
2989 69 50 66     448 if ( $num_sc > 0 && $num_sc != 2 ) {
2990 0         0 $self->warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2991             }
2992             }
2993              
2994 2125 50       4939 if ( $paren_depth > 0 ) { $paren_depth-- }
  2125         3254  
2995 2125         3617 return;
2996             } ## end sub do_RIGHT_PARENTHESIS
2997              
2998             sub do_COMMA {
2999              
3000 3075     3075 0 5176 my $self = shift;
3001              
3002             # ','
3003 3075 100 33     10478 if ( $last_nonblank_type eq ',' ) {
    50          
3004 10         49 $self->complain("Repeated ','s \n");
3005             }
3006              
3007             # Note that we have to check both token and type here because a
3008             # comma following a qw list can have last token='(' but type = 'q'
3009             elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
3010 0         0 $self->warning("Unexpected leading ',' after a '('\n");
3011             }
3012             else {
3013             ## ok: no complaints needed
3014             }
3015              
3016             # patch for operator_expected: note if we are in the list (use.t)
3017 3075 100       6466 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  6         12  
3018 3075         4636 return;
3019              
3020             } ## end sub do_COMMA
3021              
3022             sub do_SEMICOLON {
3023              
3024 2448     2448 0 4603 my $self = shift;
3025              
3026             # ';'
3027 2448         4116 $context = UNKNOWN_CONTEXT;
3028 2448         3929 $statement_type = EMPTY_STRING;
3029 2448         4511 $want_paren = EMPTY_STRING;
3030              
3031             # /^(for|foreach)$/
3032 2448 100       6758 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } )
3033             { # mark ; in for loop
3034              
3035             # Be careful: we do not want a semicolon such as the
3036             # following to be included:
3037             #
3038             # for (sort {strcoll($a,$b);} keys %investments) {
3039              
3040 35 100 66     250 if ( $brace_depth == $rdepth_array->[PAREN][BRACE][$paren_depth]
3041             && $square_bracket_depth ==
3042             $rdepth_array->[PAREN][SQUARE_BRACKET][$paren_depth] )
3043             {
3044              
3045 34         68 $type = 'f';
3046 34         72 $rparen_semicolon_count->[$paren_depth]++;
3047             }
3048             }
3049 2448         4045 return;
3050             } ## end sub do_SEMICOLON
3051              
3052             sub do_QUOTATION_MARK {
3053              
3054 1125     1125 0 2189 my $self = shift;
3055              
3056             # '"'
3057 1125 50       2920 $self->error_if_expecting_OPERATOR("String")
3058             if ( $expecting == OPERATOR );
3059 1125         1874 $in_quote = 1;
3060 1125         1958 $type = 'Q';
3061 1125         1832 $allowed_quote_modifiers = EMPTY_STRING;
3062 1125         1911 return;
3063             } ## end sub do_QUOTATION_MARK
3064              
3065             sub do_APOSTROPHE {
3066              
3067 1164     1164 0 2158 my $self = shift;
3068              
3069             # "'"
3070 1164 50       2813 $self->error_if_expecting_OPERATOR("String")
3071             if ( $expecting == OPERATOR );
3072 1164         1847 $in_quote = 1;
3073 1164         1922 $type = 'Q';
3074 1164         1918 $allowed_quote_modifiers = EMPTY_STRING;
3075 1164         1898 return;
3076             } ## end sub do_APOSTROPHE
3077              
3078             sub do_BACKTICK {
3079              
3080 0     0 0 0 my $self = shift;
3081              
3082             # '`'
3083 0 0       0 $self->error_if_expecting_OPERATOR("String")
3084             if ( $expecting == OPERATOR );
3085 0         0 $in_quote = 1;
3086 0         0 $type = 'Q';
3087 0         0 $allowed_quote_modifiers = EMPTY_STRING;
3088 0         0 return;
3089             } ## end sub do_BACKTICK
3090              
3091             sub do_SLASH {
3092              
3093 207     207 0 450 my $self = shift;
3094              
3095             # '/'
3096 207         365 my $is_pattern;
3097              
3098             # a pattern cannot follow certain keywords which take optional
3099             # arguments, like 'shift' and 'pop'. See also '?'.
3100 207 50 66     1036 if (
    50          
3101             $last_nonblank_type eq 'k'
3102             && $is_keyword_rejecting_slash_as_pattern_delimiter{
3103             $last_nonblank_token}
3104             )
3105             {
3106 0         0 $is_pattern = 0;
3107             }
3108             elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
3109 0         0 my $msg;
3110 0         0 ( $is_pattern, $msg ) =
3111             $self->guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
3112             $max_token_index );
3113              
3114 0 0       0 if ($msg) {
3115 0         0 $self->write_diagnostics("DIVIDE:$msg\n");
3116 0         0 $self->write_logfile_entry($msg);
3117             }
3118             }
3119 207         435 else { $is_pattern = ( $expecting == TERM ) }
3120              
3121 207 100       504 if ($is_pattern) {
3122 78         171 $in_quote = 1;
3123 78         159 $type = 'Q';
3124 78         148 $allowed_quote_modifiers = '[msixpodualngc]';
3125             }
3126             else { # not a pattern; check for a /= token
3127              
3128 129 50       382 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
3129 0         0 $i++;
3130 0         0 $tok = '/=';
3131 0         0 $type = $tok;
3132             }
3133              
3134             #DEBUG - collecting info on what tokens follow a divide
3135             # for development of guessing algorithm
3136             ## if (
3137             ## $self->is_possible_numerator( $i, $rtokens,
3138             ## $max_token_index ) < 0
3139             ## )
3140             ## {
3141             ## $self->write_diagnostics("DIVIDE? $input_line\n");
3142             ## }
3143             }
3144 207         393 return;
3145             } ## end sub do_SLASH
3146              
3147             sub do_LEFT_CURLY_BRACKET {
3148              
3149 1668     1668 0 3237 my $self = shift;
3150              
3151             # '{'
3152             # if we just saw a ')', we will label this block with
3153             # its type. We need to do this to allow sub
3154             # code_block_type to determine if this brace starts a
3155             # code block or anonymous hash. (The type of a paren
3156             # pair is the preceding token, such as 'if', 'else',
3157             # etc).
3158 1668         3248 $container_type = EMPTY_STRING;
3159              
3160             # ATTRS: for a '{' following an attribute list, reset
3161             # things to look like we just saw a sub name
3162             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3163 1668 100 100     15477 if ( substr( $statement_type, 0, 3 ) eq 'sub' ) {
    100 66        
    50 33        
    100          
    50          
3164 34         93 $last_nonblank_token = $statement_type;
3165 34         75 $last_nonblank_type = 'S'; # c250 change
3166 34         71 $statement_type = EMPTY_STRING;
3167             }
3168             elsif ( substr( $statement_type, 0, 7 ) eq 'package' ) {
3169 4         9 $last_nonblank_token = $statement_type;
3170 4         7 $last_nonblank_type = 'P'; # c250 change
3171 4         8 $statement_type = EMPTY_STRING;
3172             }
3173              
3174             # patch for SWITCH/CASE: hide these keywords from an immediately
3175             # following opening brace
3176             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
3177             && $statement_type eq $last_nonblank_token )
3178             {
3179 0         0 $last_nonblank_token = ";";
3180             }
3181              
3182             elsif ( $last_nonblank_token eq ')' ) {
3183 241         694 $last_nonblank_token = $rparen_type->[ $paren_depth + 1 ];
3184              
3185             # defensive move in case of a nesting error (pbug.t)
3186             # in which this ')' had no previous '('
3187             # this nesting error will have been caught
3188 241 50       757 if ( !defined($last_nonblank_token) ) {
3189 0         0 $last_nonblank_token = 'if';
3190             }
3191              
3192             # Syntax check at '){'
3193 241 100       1095 if ( $is_blocktype_with_paren{$last_nonblank_token} ) {
3194              
3195 227         564 my $rvars = $rparen_vars->[ $paren_depth + 1 ];
3196 227 50       766 if ( defined($rvars) ) {
3197 227         430 my ( $type_lp, $want_brace ) = @{$rvars};
  227         593  
3198              
3199             # Now verify that this is not a trailing form
3200 227 50       765 if ( !$want_brace ) {
3201 0         0 $self->warning(
3202             "syntax error at ') {', unexpected '{' after closing ')' of a trailing '$last_nonblank_token'\n"
3203             );
3204             }
3205             }
3206             }
3207             else {
3208 14 50       75 if ( $self->[_extended_syntax_] ) {
3209              
3210             # we append a trailing () to mark this as an unknown
3211             # block type. This allows perltidy to format some
3212             # common extensions of perl syntax.
3213             # This is used by sub code_block_type
3214 14         49 $last_nonblank_token .= '()';
3215             }
3216             else {
3217 0         0 my $list =
3218             join( SPACE, sort keys %is_blocktype_with_paren );
3219 0         0 $self->warning(
3220             "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
3221             );
3222             }
3223             }
3224             }
3225              
3226             # patch for paren-less for/foreach glitch, part 2.
3227             # see note below under 'qw'
3228             elsif ($last_nonblank_token eq 'qw'
3229             && $is_for_foreach{$want_paren} )
3230             {
3231 0         0 $last_nonblank_token = $want_paren;
3232 0 0       0 if ( $last_last_nonblank_token eq $want_paren ) {
3233 0         0 $self->warning(
3234             "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
3235             );
3236              
3237             }
3238 0         0 $want_paren = EMPTY_STRING;
3239             }
3240             else {
3241             ## ok: not special
3242             }
3243              
3244             # now identify which of the three possible types of
3245             # curly braces we have: hash index container, anonymous
3246             # hash reference, or code block.
3247              
3248             # non-structural (hash index) curly brace pair
3249             # get marked 'L' and 'R'
3250 1668 100       4981 if ( is_non_structural_brace() ) {
3251 367         767 $type = 'L';
3252              
3253             # patch for SWITCH/CASE:
3254             # allow paren-less identifier after 'when'
3255             # if the brace is preceded by a space
3256 367 0 33     1346 if ( $statement_type eq 'when'
      33        
      0        
      0        
3257             && $last_nonblank_type eq 'i'
3258             && $last_last_nonblank_type eq 'k'
3259             && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
3260             {
3261 0         0 $type = '{';
3262 0         0 $block_type = $statement_type;
3263             }
3264             }
3265              
3266             # code and anonymous hash have the same type, '{', but are
3267             # distinguished by 'block_type',
3268             # which will be blank for an anonymous hash
3269             else {
3270              
3271 1301         4506 $block_type =
3272             $self->code_block_type( $i_tok, $rtokens, $rtoken_type,
3273             $max_token_index );
3274              
3275             # patch to promote bareword type to function taking block
3276 1301 100 100     6023 if ( $block_type
      66        
3277             && $last_nonblank_type eq 'w'
3278             && $last_nonblank_i >= 0 )
3279             {
3280 34 50       144 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
3281             $routput_token_type->[$last_nonblank_i] =
3282 34 100       166 $is_grep_alias{$block_type} ? 'k' : 'G';
3283             }
3284             }
3285              
3286             # patch for SWITCH/CASE: if we find a stray opening block brace
3287             # where we might accept a 'case' or 'when' block, then take it
3288 1301 100 100     5285 if ( $statement_type eq 'case'
3289             || $statement_type eq 'when' )
3290             {
3291 38 100 66     237 if ( !$block_type || $block_type eq '}' ) {
3292 4         9 $block_type = $statement_type;
3293             }
3294             }
3295             }
3296              
3297 1668         3905 $rbrace_type->[ ++$brace_depth ] = $block_type;
3298              
3299             # Patch for CLASS BLOCK definitions: do not update the package for the
3300             # current depth if this is a BLOCK type definition.
3301             # TODO: should make 'class' separate from 'package' and only do
3302             # this for 'class'
3303 1668 100       5418 $rbrace_package->[$brace_depth] = $current_package
3304             if ( substr( $block_type, 0, 8 ) ne 'package ' );
3305              
3306 1668         3744 $rbrace_structural_type->[$brace_depth] = $type;
3307 1668         2981 $rbrace_context->[$brace_depth] = $context;
3308 1668         4984 ( $type_sequence, $indent_flag ) =
3309             $self->increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3310 1668         3542 return;
3311             } ## end sub do_LEFT_CURLY_BRACKET
3312              
3313             sub do_RIGHT_CURLY_BRACKET {
3314              
3315 1668     1668 0 3525 my $self = shift;
3316              
3317             # '}'
3318 1668         3575 $block_type = $rbrace_type->[$brace_depth];
3319 1668 100       4247 if ($block_type) { $statement_type = EMPTY_STRING }
  972         1811  
3320 1668 100       4020 if ( defined( $rbrace_package->[$brace_depth] ) ) {
3321 1664         3245 $current_package = $rbrace_package->[$brace_depth];
3322             }
3323              
3324             # can happen on brace error (caught elsewhere)
3325             else {
3326             }
3327 1668         5599 ( $type_sequence, $indent_flag ) =
3328             $self->decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3329              
3330 1668 100       5515 if ( $rbrace_structural_type->[$brace_depth] eq 'L' ) {
3331 367         792 $type = 'R';
3332             }
3333              
3334             # propagate type information for 'do' and 'eval' blocks, and also
3335             # for smartmatch operator. This is necessary to enable us to know
3336             # if an operator or term is expected next.
3337 1668 100       4690 if ( $is_block_operator{$block_type} ) {
3338 83         252 $tok = $block_type;
3339             }
3340              
3341 1668         2960 $context = $rbrace_context->[$brace_depth];
3342 1668 50       4002 if ( $brace_depth > 0 ) { $brace_depth--; }
  1668         2573  
3343 1668         2840 return;
3344             } ## end sub do_RIGHT_CURLY_BRACKET
3345              
3346             sub do_AMPERSAND {
3347              
3348 126     126 0 361 my $self = shift;
3349              
3350             # '&' = maybe sub call? start looking
3351             # We have to check for sub call unless we are sure we
3352             # are expecting an operator. This example from s2p
3353             # got mistaken as a q operator in an early version:
3354             # print BODY &q(<<'EOT');
3355 126 100       447 if ( $expecting != OPERATOR ) {
3356              
3357             # But only look for a sub call if we are expecting a term or
3358             # if there is no existing space after the &.
3359             # For example we probably don't want & as sub call here:
3360             # Fcntl::S_IRUSR & $mode;
3361 107 100 66     484 if ( $expecting == TERM || $next_type ne 'b' ) {
3362 104         337 $self->scan_simple_identifier();
3363             }
3364             }
3365             else {
3366             }
3367 126         279 return;
3368             } ## end sub do_AMPERSAND
3369              
3370             sub do_LESS_THAN_SIGN {
3371              
3372 29     29 0 104 my $self = shift;
3373              
3374             # '<' - angle operator or less than?
3375 29 100       138 if ( $expecting != OPERATOR ) {
3376 8         57 ( $i, $type ) =
3377             $self->find_angle_operator_termination( $input_line, $i,
3378             $rtoken_map, $expecting, $max_token_index );
3379              
3380             ## This message is not very helpful and quite confusing if the above
3381             ## routine decided not to write a message with the line number.
3382             ## if ( $type eq '<' && $expecting == TERM ) {
3383             ## $self->error_if_expecting_TERM();
3384             ## $self->interrupt_logfile();
3385             ## $self->warning("Unterminated <> operator?\n");
3386             ## $self->resume_logfile();
3387             ## }
3388              
3389             }
3390             else {
3391             }
3392 29         69 return;
3393             } ## end sub do_LESS_THAN_SIGN
3394              
3395             sub do_QUESTION_MARK {
3396              
3397 187     187 0 538 my $self = shift;
3398              
3399             # '?' = conditional or starting pattern?
3400 187         418 my $is_pattern;
3401              
3402             # Patch for rt #126965
3403             # a pattern cannot follow certain keywords which take optional
3404             # arguments, like 'shift' and 'pop'. See also '/'.
3405 187 100 66     1759 if (
    100          
    100          
3406             $last_nonblank_type eq 'k'
3407             && $is_keyword_rejecting_question_as_pattern_delimiter{
3408             $last_nonblank_token}
3409             )
3410             {
3411 1         4 $is_pattern = 0;
3412             }
3413              
3414             # patch for RT#131288, user constant function without prototype
3415             # last type is 'U' followed by ?.
3416             elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
3417 1         4 $is_pattern = 0;
3418             }
3419             elsif ( $expecting == UNKNOWN ) {
3420              
3421             # In older versions of Perl, a bare ? can be a pattern
3422             # delimiter. In perl version 5.22 this was
3423             # dropped, but we have to support it in order to format
3424             # older programs. See:
3425             ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
3426             # For example, the following line worked
3427             # at one time:
3428             # ?(.*)? && (print $1,"\n");
3429             # In current versions it would have to be written with slashes:
3430             # /(.*)/ && (print $1,"\n");
3431 11         25 my $msg;
3432 11         76 ( $is_pattern, $msg ) =
3433             $self->guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
3434             $max_token_index );
3435              
3436 11 50       46 if ($msg) { $self->write_logfile_entry($msg) }
  11         43  
3437             }
3438 174         522 else { $is_pattern = ( $expecting == TERM ) }
3439              
3440 187 50       547 if ($is_pattern) {
3441 0         0 $in_quote = 1;
3442 0         0 $type = 'Q';
3443 0         0 $allowed_quote_modifiers = '[msixpodualngc]';
3444             }
3445             else {
3446 187         723 ( $type_sequence, $indent_flag ) =
3447             $self->increase_nesting_depth( QUESTION_COLON,
3448             $rtoken_map->[$i_tok] );
3449             }
3450 187         488 return;
3451             } ## end sub do_QUESTION_MARK
3452              
3453             sub do_STAR {
3454              
3455 238     238 0 512 my $self = shift;
3456              
3457             # '*' = typeglob, or multiply?
3458 238 50 66     898 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
3459 0 0 0     0 if ( $next_type ne 'b'
      0        
3460             && $next_type ne '('
3461             && $next_type ne '#' ) # Fix c036
3462             {
3463 0         0 $expecting = TERM;
3464             }
3465             }
3466 238 100       629 if ( $expecting == TERM ) {
3467 21         91 $self->scan_simple_identifier();
3468             }
3469             else {
3470              
3471 217 50       824 if ( $rtokens->[ $i + 1 ] eq '=' ) {
    100          
3472 0         0 $tok = '*=';
3473 0         0 $type = $tok;
3474 0         0 $i++;
3475             }
3476             elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
3477 36         103 $tok = '**';
3478 36         84 $type = $tok;
3479 36         68 $i++;
3480 36 50       127 if ( $rtokens->[ $i + 1 ] eq '=' ) {
3481 0         0 $tok = '**=';
3482 0         0 $type = $tok;
3483 0         0 $i++;
3484             }
3485             }
3486             else {
3487             ## not multiple characters
3488             }
3489             }
3490 238         438 return;
3491             } ## end sub do_STAR
3492              
3493             sub do_DOT {
3494              
3495 150     150 0 333 my $self = shift;
3496              
3497             # '.' = what kind of . ?
3498 150 100       463 if ( $expecting != OPERATOR ) {
3499 10         51 $self->scan_number();
3500 10 100       43 if ( $type eq '.' ) {
3501 2 50       8 $self->error_if_expecting_TERM()
3502             if ( $expecting == TERM );
3503             }
3504             }
3505             else {
3506             }
3507 150         289 return;
3508             } ## end sub do_DOT
3509              
3510             sub do_COLON {
3511              
3512 271     271 0 810 my $self = shift;
3513              
3514             # ':' = label, ternary, attribute, ?
3515              
3516             # if this is the first nonblank character, call it a label
3517             # since perl seems to just swallow it
3518 271 50 66     3691 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
    100 66        
    100 66        
    100 66        
    100          
3519 0         0 $type = 'J';
3520             }
3521              
3522             # ATTRS: check for a ':' which introduces an attribute list
3523             # either after a 'sub' keyword or within a paren list
3524             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3525             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
3526 22         51 $type = 'A';
3527 22         53 $self->[_in_attribute_list_] = 1;
3528             }
3529              
3530             # Within a signature, unless we are in a ternary. For example,
3531             # from 't/filter_example.t':
3532             # method foo4 ( $class: $bar ) { $class->bar($bar) }
3533             elsif ( $rparen_type->[$paren_depth] =~ /^sub\b/
3534             && !is_balanced_closing_container(QUESTION_COLON) )
3535             {
3536 1         5 $type = 'A';
3537 1         3 $self->[_in_attribute_list_] = 1;
3538             }
3539              
3540             # check for scalar attribute, such as
3541             # my $foo : shared = 1;
3542             elsif ($is_my_our_state{$statement_type}
3543             && $rcurrent_depth->[QUESTION_COLON] == 0 )
3544             {
3545 15         38 $type = 'A';
3546 15         42 $self->[_in_attribute_list_] = 1;
3547             }
3548              
3549             # Look for Switch::Plain syntax if an error would otherwise occur
3550             # here. Note that we do not need to check if the extended syntax
3551             # flag is set because otherwise an error would occur, and we would
3552             # then have to output a message telling the user to set the
3553             # extended syntax flag to avoid the error.
3554             # case 1: {
3555             # default: {
3556             # default:
3557             # Note that the line 'default:' will be parsed as a label elsewhere.
3558             elsif ( $is_case_default{$statement_type}
3559             && !is_balanced_closing_container(QUESTION_COLON) )
3560             {
3561             # mark it as a perltidy label type
3562 46         103 $type = 'J';
3563             }
3564              
3565             # otherwise, it should be part of a ?/: operator
3566             else {
3567 187         834 ( $type_sequence, $indent_flag ) =
3568             $self->decrease_nesting_depth( QUESTION_COLON,
3569             $rtoken_map->[$i_tok] );
3570 187 50       1093 if ( $last_nonblank_token eq '?' ) {
3571 0         0 $self->warning("Syntax error near ? :\n");
3572             }
3573             }
3574 271         541 return;
3575             } ## end sub do_COLON
3576              
3577             sub do_PLUS_SIGN {
3578              
3579 227     227 0 428 my $self = shift;
3580              
3581             # '+' = what kind of plus?
3582 227 100       851 if ( $expecting == TERM ) {
    100          
3583 13         49 my $number = $self->scan_number_fast();
3584              
3585             # unary plus is safest assumption if not a number
3586 13 50       42 if ( !defined($number) ) { $type = 'p'; }
  13         41  
3587             }
3588             elsif ( $expecting == OPERATOR ) {
3589             }
3590             else {
3591 3 100       14 if ( $next_type eq 'w' ) { $type = 'p' }
  2         3  
3592             }
3593 227         389 return;
3594             } ## end sub do_PLUS_SIGN
3595              
3596             sub do_AT_SIGN {
3597              
3598 438     438 0 1153 my $self = shift;
3599              
3600             # '@' = sigil for array?
3601 438 50       1478 $self->error_if_expecting_OPERATOR("Array")
3602             if ( $expecting == OPERATOR );
3603 438         1791 $self->scan_simple_identifier();
3604 438         918 return;
3605             } ## end sub do_AT_SIGN
3606              
3607             sub do_PERCENT_SIGN {
3608              
3609 202     202 0 576 my $self = shift;
3610              
3611             # '%' = hash or modulo?
3612             # first guess is hash if no following blank or paren
3613 202 50       711 if ( $expecting == UNKNOWN ) {
3614 0 0 0     0 if ( $next_type ne 'b' && $next_type ne '(' ) {
3615 0         0 $expecting = TERM;
3616             }
3617             }
3618 202 100       655 if ( $expecting == TERM ) {
3619 192         794 $self->scan_simple_identifier();
3620             }
3621 202         503 return;
3622             } ## end sub do_PERCENT_SIGN
3623              
3624             sub do_LEFT_SQUARE_BRACKET {
3625              
3626 594     594 0 1188 my $self = shift;
3627              
3628             # '['
3629 594         2453 $rsquare_bracket_type->[ ++$square_bracket_depth ] =
3630             $last_nonblank_token;
3631 594         2168 ( $type_sequence, $indent_flag ) =
3632             $self->increase_nesting_depth( SQUARE_BRACKET,
3633             $rtoken_map->[$i_tok] );
3634              
3635             # It may seem odd, but structural square brackets have
3636             # type '{' and '}'. This simplifies the indentation logic.
3637 594 100       1841 if ( !is_non_structural_brace() ) {
3638 287         733 $type = '{';
3639             }
3640 594         1440 $rsquare_bracket_structural_type->[$square_bracket_depth] = $type;
3641 594         1105 return;
3642             } ## end sub do_LEFT_SQUARE_BRACKET
3643              
3644             sub do_RIGHT_SQUARE_BRACKET {
3645              
3646 594     594 0 2351 my $self = shift;
3647              
3648             # ']'
3649 594         2166 ( $type_sequence, $indent_flag ) =
3650             $self->decrease_nesting_depth( SQUARE_BRACKET,
3651             $rtoken_map->[$i_tok] );
3652              
3653 594 100       2061 if ( $rsquare_bracket_structural_type->[$square_bracket_depth] eq '{' )
3654             {
3655 287         1810 $type = '}';
3656             }
3657              
3658             # propagate type information for smartmatch operator. This is
3659             # necessary to enable us to know if an operator or term is expected
3660             # next.
3661 594 100       1710 if ( $rsquare_bracket_type->[$square_bracket_depth] eq '~~' ) {
3662 20         42 $tok = $rsquare_bracket_type->[$square_bracket_depth];
3663             }
3664              
3665 594 50       1567 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
  594         956  
3666 594         989 return;
3667             } ## end sub do_RIGHT_SQUARE_BRACKET
3668              
3669             sub do_MINUS_SIGN {
3670              
3671 441     441 0 902 my $self = shift;
3672              
3673             # '-' = what kind of minus?
3674 441 100 100     2843 if ( ( $expecting != OPERATOR )
    100          
    100          
3675             && $is_file_test_operator{$next_tok} )
3676             {
3677 10         44 my ( $next_nonblank_token, $i_next ) =
3678             $self->find_next_nonblank_token( $i + 1, $rtokens,
3679             $max_token_index );
3680              
3681             # check for a quoted word like "-w=>xx";
3682             # it is sufficient to just check for a following '='
3683 10 50       82 if ( $next_nonblank_token eq '=' ) {
3684 0         0 $type = 'm';
3685             }
3686             else {
3687 10         23 $i++;
3688 10         29 $tok .= $next_tok;
3689 10         24 $type = 'F';
3690             }
3691             }
3692             elsif ( $expecting == TERM ) {
3693 330         1051 my $number = $self->scan_number_fast();
3694              
3695             # maybe part of bareword token? unary is safest
3696 330 100       877 if ( !defined($number) ) { $type = 'm'; }
  288         519  
3697              
3698             }
3699             elsif ( $expecting == OPERATOR ) {
3700             }
3701             else {
3702              
3703 4 50       14 if ( $next_type eq 'w' ) {
3704 4         20 $type = 'm';
3705             }
3706             }
3707 441         763 return;
3708             } ## end sub do_MINUS_SIGN
3709              
3710             sub do_CARAT_SIGN {
3711              
3712 12     12 0 25 my $self = shift;
3713              
3714             # '^'
3715             # check for special variables like ${^WARNING_BITS}
3716 12 100       44 if ( $expecting == TERM ) {
3717              
3718 5 50 33     58 if ( $last_nonblank_token eq '{'
      33        
3719             && ( $next_tok !~ /^\d/ )
3720             && ( $next_tok =~ /^\w/ ) )
3721             {
3722              
3723 5 100       19 if ( $next_tok eq 'W' ) {
3724 1         4 $self->[_saw_perl_dash_w_] = 1;
3725             }
3726 5         12 $tok = $tok . $next_tok;
3727 5         10 $i = $i + 1;
3728 5         11 $type = 'w';
3729              
3730             # Optional coding to try to catch syntax errors. This can
3731             # be removed if it ever causes incorrect warning messages.
3732             # The '{^' should be preceded by either by a type or '$#'
3733             # Examples:
3734             # $#{^CAPTURE} ok
3735             # *${^LAST_FH}{NAME} ok
3736             # @{^HOWDY} ok
3737             # $hash{^HOWDY} error
3738              
3739             # Note that a type sigil '$' may be tokenized as 'Z'
3740             # after something like 'print', so allow type 'Z'
3741 5 0 33     23 if ( $last_last_nonblank_type ne 't'
      33        
3742             && $last_last_nonblank_type ne 'Z'
3743             && $last_last_nonblank_token ne '$#' )
3744             {
3745 0         0 $self->warning("Possible syntax error near '{^'\n");
3746             }
3747             }
3748              
3749             else {
3750 0 0       0 if ( !$self->error_if_expecting_TERM() ) {
3751              
3752             # Something like this is valid but strange:
3753             # undef ^I;
3754 0         0 $self->complain("The '^' seems unusual here\n");
3755             }
3756             }
3757             }
3758 12         26 return;
3759             } ## end sub do_CARAT_SIGN
3760              
3761             sub do_DOUBLE_COLON {
3762              
3763 9     9 0 16 my $self = shift;
3764              
3765             # '::' = probably a sub call
3766 9         35 $self->scan_bare_identifier();
3767 9         19 return;
3768             } ## end sub do_DOUBLE_COLON
3769              
3770             sub do_LEFT_SHIFT {
3771              
3772 7     7 0 30 my $self = shift;
3773              
3774             # '<<' = maybe a here-doc?
3775 7 50       33 if ( $expecting != OPERATOR ) {
3776 7         24 my ( $found_target, $here_doc_target, $here_quote_character,
3777             $saw_error );
3778             (
3779 7         54 $found_target, $here_doc_target, $here_quote_character, $i,
3780             $saw_error
3781             )
3782             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3783             $max_token_index );
3784              
3785 7 50       30 if ($found_target) {
    0          
3786 7         16 push @{$rhere_target_list},
  7         34  
3787             [ $here_doc_target, $here_quote_character ];
3788 7         26 $type = 'h';
3789 7 50       69 if ( length($here_doc_target) > 80 ) {
    50          
    100          
3790 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3791 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3792             }
3793             elsif ( !$here_doc_target ) {
3794 0 0       0 $self->warning(
3795             'Use of bare << to mean <<"" is deprecated' . "\n" )
3796             if ( !$here_quote_character );
3797             }
3798             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3799 2         13 $self->complain(
3800             "Unconventional here-target: '$here_doc_target'\n");
3801             }
3802             else {
3803             ## ok: nothing to complain about
3804             }
3805             }
3806             elsif ( $expecting == TERM ) {
3807 0 0       0 if ( !$saw_error ) {
3808              
3809             # shouldn't happen..arriving here implies an error in
3810             # the logic in sub 'find_here_doc'
3811 0         0 if (DEVEL_MODE) {
3812             $self->Fault(<<EOM);
3813             Program bug; didn't find here doc target
3814             EOM
3815             }
3816             $self->warning(
3817 0         0 "Possible program error: didn't find here doc target\n"
3818             );
3819 0         0 $self->report_definite_bug();
3820             }
3821             }
3822              
3823             # target not found, expecting == UNKNOWN
3824             else {
3825             # assume it is a shift
3826             }
3827             }
3828             else {
3829             }
3830 7         20 return;
3831             } ## end sub do_LEFT_SHIFT
3832              
3833             sub do_NEW_HERE_DOC {
3834              
3835             # '<<~' = a here-doc, new type added in v26
3836              
3837 2     2 0 9 my $self = shift;
3838              
3839             return
3840 2 50       10 if ( $i >= $max_token_index ); # here-doc not possible if end of line
3841 2 50       11 if ( $expecting != OPERATOR ) {
3842 2         8 my ( $found_target, $here_doc_target, $here_quote_character,
3843             $saw_error );
3844             (
3845 2         14 $found_target, $here_doc_target, $here_quote_character, $i,
3846             $saw_error
3847             )
3848             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3849             $max_token_index );
3850              
3851 2 50       9 if ($found_target) {
    0          
3852              
3853 2 50       18 if ( length($here_doc_target) > 80 ) {
    50          
3854 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3855 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3856             }
3857             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3858 0         0 $self->complain(
3859             "Unconventional here-target: '$here_doc_target'\n");
3860             }
3861             else {
3862             ## ok: nothing to complain about
3863             }
3864              
3865             # Note that we put a leading space on the here quote
3866             # character indicate that it may be preceded by spaces
3867 2         8 $here_quote_character = SPACE . $here_quote_character;
3868 2         5 push @{$rhere_target_list},
  2         9  
3869             [ $here_doc_target, $here_quote_character ];
3870 2         5 $type = 'h';
3871             }
3872              
3873             # target not found ..
3874             elsif ( $expecting == TERM ) {
3875 0 0       0 if ( !$saw_error ) {
3876              
3877             # shouldn't happen..arriving here implies an error in
3878             # the logic in sub 'find_here_doc'
3879 0         0 if (DEVEL_MODE) {
3880             $self->Fault(<<EOM);
3881             Program bug; didn't find here doc target
3882             EOM
3883             }
3884             $self->warning(
3885 0         0 "Possible program error: didn't find here doc target\n"
3886             );
3887 0         0 $self->report_definite_bug();
3888             }
3889             }
3890              
3891             # Target not found, expecting==UNKNOWN
3892             else {
3893 0         0 $self->warning("didn't find here doc target after '<<~'\n");
3894             }
3895             }
3896             else {
3897 0         0 $self->error_if_expecting_OPERATOR();
3898             }
3899 2         6 return;
3900             } ## end sub do_NEW_HERE_DOC
3901              
3902             sub do_POINTER {
3903              
3904             # '->'
3905 886     886 0 1568 return;
3906             }
3907              
3908             sub do_PLUS_PLUS {
3909              
3910 46     46 0 182 my $self = shift;
3911              
3912             # '++'
3913             # type = 'pp' for pre-increment, '++' for post-increment
3914 46 100       264 if ( $expecting == OPERATOR ) { $type = '++' }
  37 100       144  
3915 7         22 elsif ( $expecting == TERM ) { $type = 'pp' }
3916              
3917             # handle ( $expecting == UNKNOWN )
3918             else {
3919              
3920             # look ahead ..
3921 2         8 my ( $next_nonblank_token, $i_next ) =
3922             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3923              
3924             # Fix for c042: look past a side comment
3925 2 50       22 if ( $next_nonblank_token eq '#' ) {
3926 0         0 ( $next_nonblank_token, $i_next ) =
3927             $self->find_next_nonblank_token( $max_token_index,
3928             $rtokens, $max_token_index );
3929             }
3930              
3931 2 50       11 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
  0         0  
3932             }
3933 46         113 return;
3934             } ## end sub do_PLUS_PLUS
3935              
3936             sub do_FAT_COMMA {
3937              
3938 1025     1025 0 1809 my $self = shift;
3939              
3940             # '=>'
3941 1025 50       2465 if ( $last_nonblank_type eq $tok ) {
3942 0         0 $self->complain("Repeated '=>'s \n");
3943             }
3944              
3945             # patch for operator_expected: note if we are in the list (use.t)
3946             # TODO: make version numbers a new token type
3947 1025 100       2364 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  18         42  
3948 1025         1717 return;
3949             } ## end sub do_FAT_COMMA
3950              
3951             sub do_MINUS_MINUS {
3952              
3953 2     2 0 5 my $self = shift;
3954              
3955             # '--'
3956             # type = 'mm' for pre-decrement, '--' for post-decrement
3957              
3958 2 50       15 if ( $expecting == OPERATOR ) { $type = '--' }
  0 50       0  
3959 2         7 elsif ( $expecting == TERM ) { $type = 'mm' }
3960              
3961             # handle ( $expecting == UNKNOWN )
3962             else {
3963              
3964             # look ahead ..
3965 0         0 my ( $next_nonblank_token, $i_next ) =
3966             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3967              
3968             # Fix for c042: look past a side comment
3969 0 0       0 if ( $next_nonblank_token eq '#' ) {
3970 0         0 ( $next_nonblank_token, $i_next ) =
3971             $self->find_next_nonblank_token( $max_token_index,
3972             $rtokens, $max_token_index );
3973             }
3974              
3975 0 0       0 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
  0         0  
3976             }
3977              
3978 2         5 return;
3979             } ## end sub do_MINUS_MINUS
3980              
3981             sub do_LOGICAL_AND {
3982              
3983 58     58 0 136 my $self = shift;
3984              
3985             # '&&'
3986 58 50 33     247 $self->error_if_expecting_TERM()
3987             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3988 58         121 return;
3989             } ## end sub do_LOGICAL_AND
3990              
3991             sub do_LOGICAL_OR {
3992              
3993 74     74 0 170 my $self = shift;
3994              
3995             # '||'
3996 74 100 66     378 $self->error_if_expecting_TERM()
3997             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3998 74         133 return;
3999             } ## end sub do_LOGICAL_OR
4000              
4001             sub do_SLASH_SLASH {
4002              
4003 10     10 0 22 my $self = shift;
4004              
4005             # '//'
4006 10 100       31 $self->error_if_expecting_TERM()
4007             if ( $expecting == TERM );
4008 10         20 return;
4009             } ## end sub do_SLASH_SLASH
4010              
4011             sub do_DIGITS {
4012              
4013 1934     1934 0 3195 my $self = shift;
4014              
4015             # 'd' = string of digits
4016 1934 50       4255 $self->error_if_expecting_OPERATOR("Number")
4017             if ( $expecting == OPERATOR );
4018              
4019 1934         5147 my $number = $self->scan_number_fast();
4020 1934 50       4490 if ( !defined($number) ) {
4021              
4022             # shouldn't happen - we should always get a number
4023 0         0 if (DEVEL_MODE) {
4024             $self->Fault(<<EOM);
4025             non-number beginning with digit--program bug
4026             EOM
4027             }
4028             $self->warning(
4029 0         0 "Unexpected error condition: non-number beginning with digit\n"
4030             );
4031 0         0 $self->report_definite_bug();
4032             }
4033 1934         3108 return;
4034             } ## end sub do_DIGITS
4035              
4036             sub do_ATTRIBUTE_LIST {
4037              
4038 39     39 0 121 my ( $self, $next_nonblank_token ) = @_;
4039              
4040             # Called at a bareword encountered while in an attribute list
4041             # returns 'is_attribute':
4042             # true if attribute found
4043             # false if an attribute (continue parsing bareword)
4044              
4045             # treat bare word followed by open paren like qw(
4046 39 100       120 if ( $next_nonblank_token eq '(' ) {
4047              
4048             # For something like:
4049             # : prototype($$)
4050             # we should let do_scan_sub see it so that it can see
4051             # the prototype. All other attributes get parsed as a
4052             # quoted string.
4053 18 100       85 if ( $tok eq 'prototype' ) {
4054 2         6 $id_scan_state = 'prototype';
4055              
4056             # start just after the word 'prototype'
4057 2         8 my $i_beg = $i + 1;
4058 2         23 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
4059             {
4060             input_line => $input_line,
4061             i => $i,
4062             i_beg => $i_beg,
4063             tok => $tok,
4064             type => $type,
4065             rtokens => $rtokens,
4066             rtoken_map => $rtoken_map,
4067             id_scan_state => $id_scan_state,
4068             max_token_index => $max_token_index,
4069             }
4070             );
4071              
4072             # If successful, mark as type 'q' to be consistent
4073             # with other attributes. Type 'w' would also work.
4074 2 50       12 if ( $i > $i_beg ) {
4075 2         5 $type = 'q';
4076 2         10 return 1;
4077             }
4078              
4079             # If not successful, continue and parse as a quote.
4080             }
4081              
4082             # All other attribute lists must be parsed as quotes
4083             # (see 'signatures.t' for good examples)
4084 16         58 $in_quote = $quote_items{'q'};
4085 16         47 $allowed_quote_modifiers = $quote_modifiers{'q'};
4086 16         35 $type = 'q';
4087 16         30 $quote_type = 'q';
4088 16         40 return 1;
4089             }
4090              
4091             # handle bareword not followed by open paren
4092             else {
4093 21         66 $type = 'w';
4094 21         55 return 1;
4095             }
4096              
4097             # attribute not found
4098 0         0 return;
4099             } ## end sub do_ATTRIBUTE_LIST
4100              
4101             sub do_QUOTED_BAREWORD {
4102              
4103 786     786 0 1475 my $self = shift;
4104              
4105             # find type of a bareword followed by a '=>'
4106 786 100       4092 if ( $ris_constant->{$current_package}{$tok} ) {
    50          
    50          
4107 14         37 $type = 'C';
4108             }
4109             elsif ( $ris_user_function->{$current_package}{$tok} ) {
4110 0         0 $type = 'U';
4111 0         0 $prototype = $ruser_function_prototype->{$current_package}{$tok};
4112             }
4113             elsif ( $tok =~ /^v\d+$/ ) {
4114 0         0 $type = 'v';
4115 0         0 $self->report_v_string($tok);
4116             }
4117             else {
4118              
4119             # Bareword followed by a fat comma - see 'git18.in'
4120             # If tok is something like 'x17' then it could
4121             # actually be operator x followed by number 17.
4122             # For example, here:
4123             # 123x17 => [ 792, 1224 ],
4124             # (a key of 123 repeated 17 times, perhaps not
4125             # what was intended). We will mark x17 as type
4126             # 'n' and it will be split. If the previous token
4127             # was also a bareword then it is not very clear is
4128             # going on. In this case we will not be sure that
4129             # an operator is expected, so we just mark it as a
4130             # bareword. Perl is a little murky in what it does
4131             # with stuff like this, and its behavior can change
4132             # over time. Something like
4133             # a x18 => [792, 1224], will compile as
4134             # a key with 18 a's. But something like
4135             # push @array, a x18;
4136             # is a syntax error.
4137 772 100 66     2813 if (
      33        
      66        
4138             $expecting == OPERATOR
4139             && substr( $tok, 0, 1 ) eq 'x'
4140             && ( length($tok) == 1
4141             || substr( $tok, 1, 1 ) =~ /^\d/ )
4142             )
4143             {
4144 3         6 $type = 'n';
4145 3 50       11 if ( $self->split_pretoken(1) ) {
4146 3         6 $type = 'x';
4147 3         5 $tok = 'x';
4148             }
4149             }
4150             else {
4151              
4152             # git #18
4153 769         1334 $type = 'w';
4154 769         2034 $self->error_if_expecting_OPERATOR();
4155             }
4156             }
4157 786         1276 return;
4158             } ## end sub do_QUOTED_BAREWORD
4159              
4160             sub do_X_OPERATOR {
4161              
4162 17     17 0 41 my $self = shift;
4163              
4164 17 100       73 if ( $tok eq 'x' ) {
4165 15 50       86 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
4166 0         0 $tok = 'x=';
4167 0         0 $type = $tok;
4168 0         0 $i++;
4169             }
4170             else {
4171 15         40 $type = 'x';
4172             }
4173             }
4174             else {
4175              
4176             # Split a pretoken like 'x10' into 'x' and '10'.
4177             # Note: In previous versions of perltidy it was marked
4178             # as a number, $type = 'n', and fixed downstream by the
4179             # Formatter.
4180 2         4 $type = 'n';
4181 2 50       8 if ( $self->split_pretoken(1) ) {
4182 2         5 $type = 'x';
4183 2         5 $tok = 'x';
4184             }
4185             }
4186 17         39 return;
4187             } ## end sub do_X_OPERATOR
4188              
4189             sub do_USE_CONSTANT {
4190              
4191 16     16 0 47 my $self = shift;
4192              
4193 16         64 $self->scan_bare_identifier();
4194 16         85 my ( $next_nonblank_tok2, $i_next2 ) =
4195             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4196              
4197 16 50       89 if ($next_nonblank_tok2) {
4198              
4199 16 100       59 if ( $is_keyword{$next_nonblank_tok2} ) {
4200              
4201             # Assume qw is used as a quote and okay, as in:
4202             # use constant qw{ DEBUG 0 };
4203             # Not worth trying to parse for just a warning
4204              
4205             # NOTE: This warning is deactivated because recent
4206             # versions of perl do not complain here, but
4207             # the coding is retained for reference.
4208 1         2 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
4209             $self->warning(
4210             "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
4211             );
4212             }
4213             }
4214              
4215             else {
4216 15         336 $ris_constant->{$current_package}{$next_nonblank_tok2} = 1;
4217             }
4218             }
4219 16         50 return;
4220             } ## end sub do_USE_CONSTANT
4221              
4222             sub do_KEYWORD {
4223              
4224 2641     2641 0 4493 my $self = shift;
4225              
4226             # found a keyword - set any associated flags
4227 2641         4460 $type = 'k';
4228              
4229             # Since for and foreach may not be followed immediately
4230             # by an opening paren, we have to remember which keyword
4231             # is associated with the next '('
4232             # Previously, before update c230 : if ( $is_for_foreach{$tok} ) {
4233             ##(if elsif unless while until for foreach switch case given when catch)
4234 2641 100       7473 if ( $is_blocktype_with_paren{$tok} ) {
4235 395 100       1381 if ( new_statement_ok() ) {
4236 309         714 $want_paren = $tok;
4237             }
4238             }
4239              
4240             # recognize 'use' statements, which are special
4241 2641 100 100     18746 if ( $is_use_require{$tok} ) {
    100 66        
    100          
    100          
    100          
    100          
4242 175         349 $statement_type = $tok;
4243 175 50       577 $self->error_if_expecting_OPERATOR()
4244             if ( $expecting == OPERATOR );
4245             }
4246              
4247             # remember my and our to check for trailing ": shared"
4248             elsif ( $is_my_our_state{$tok} ) {
4249 628         1360 $statement_type = $tok;
4250             }
4251              
4252             # Check for misplaced 'elsif' and 'else', but allow isolated
4253             # else or elsif blocks to be formatted. This is indicated
4254             # by a last noblank token of ';'
4255             elsif ( $tok eq 'elsif' ) {
4256 29 50 66     208 if (
4257             $last_nonblank_token ne ';'
4258              
4259             ## !~ /^(if|elsif|unless)$/
4260             && !$is_if_elsif_unless{$last_nonblank_block_type}
4261             )
4262             {
4263 0         0 $self->warning(
4264             "expecting '$tok' to follow one of 'if|elsif|unless'\n");
4265             }
4266             }
4267             elsif ( $tok eq 'else' ) {
4268              
4269             # patched for SWITCH/CASE
4270 44 50 66     646 if (
      66        
4271             $last_nonblank_token ne ';'
4272              
4273             ## !~ /^(if|elsif|unless|case|when)$/
4274             && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
4275              
4276             # patch to avoid an unwanted error message for
4277             # the case of a parenless 'case' (RT 105484):
4278             # switch ( 1 ) { case x { 2 } else { } }
4279             ## !~ /^(if|elsif|unless|case|when)$/
4280             && !$is_if_elsif_unless_case_when{$statement_type}
4281             )
4282             {
4283 0         0 $self->warning(
4284             "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
4285             );
4286             }
4287             }
4288              
4289             # patch for SWITCH/CASE if 'case' and 'when are
4290             # treated as keywords. Also 'default' for Switch::Plain
4291             elsif ($tok eq 'when'
4292             || $tok eq 'case'
4293             || $tok eq 'default' )
4294             {
4295 56         115 $statement_type = $tok; # next '{' is block
4296             }
4297              
4298             # feature 'err' was removed in Perl 5.10. So mark this as
4299             # a bareword unless an operator is expected (see c158).
4300             elsif ( $tok eq 'err' ) {
4301 1 50       6 if ( $expecting != OPERATOR ) { $type = 'w' }
  1         2  
4302             }
4303             else {
4304             ## no special treatment needed
4305             }
4306              
4307 2641         4748 return;
4308             } ## end sub do_KEYWORD
4309              
4310             sub do_QUOTE_OPERATOR {
4311              
4312 202     202 0 471 my $self = shift;
4313              
4314 202 50       724 if ( $expecting == OPERATOR ) {
4315              
4316             # Be careful not to call an error for a qw quote
4317             # where a parenthesized list is allowed. For example,
4318             # it could also be a for/foreach construct such as
4319             #
4320             # foreach my $key qw\Uno Due Tres Quadro\ {
4321             # print "Set $key\n";
4322             # }
4323             #
4324              
4325             # Or it could be a function call.
4326             # NOTE: Braces in something like &{ xxx } are not
4327             # marked as a block, we might have a method call.
4328             # &method(...), $method->(..), &{method}(...),
4329             # $ref[2](list) is ok & short for $ref[2]->(list)
4330             #
4331             # See notes in 'sub code_block_type' and
4332             # 'sub is_non_structural_brace'
4333              
4334             my $paren_list_possible = $tok eq 'qw'
4335             && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
4336 0   0     0 || $is_for_foreach{$want_paren} );
4337              
4338 0 0       0 if ( !$paren_list_possible ) {
4339 0         0 $self->error_if_expecting_OPERATOR();
4340             }
4341             }
4342 202         553 $in_quote = $quote_items{$tok};
4343 202         533 $allowed_quote_modifiers = $quote_modifiers{$tok};
4344              
4345             # All quote types are 'Q' except possibly qw quotes.
4346             # qw quotes are special in that they may generally be trimmed
4347             # of leading and trailing whitespace. So they are given a
4348             # separate type, 'q', unless requested otherwise.
4349 202 100 66     1078 $type =
4350             ( $tok eq 'qw' && $self->[_trim_qw_] )
4351             ? 'q'
4352             : 'Q';
4353 202         424 $quote_type = $type;
4354 202         390 return;
4355             } ## end sub do_QUOTE_OPERATOR
4356              
4357             sub do_UNKNOWN_BAREWORD {
4358              
4359 957     957 0 2249 my ( $self, $next_nonblank_token ) = @_;
4360              
4361 957         3352 $self->scan_bare_identifier();
4362              
4363 957 100 100     3732 if ( $statement_type eq 'use'
4364             && $last_nonblank_token eq 'use' )
4365             {
4366 108         517 $rsaw_use_module->{$current_package}->{$tok} = 1;
4367             }
4368              
4369 957 100       2534 if ( $type eq 'w' ) {
4370              
4371 932 50       2556 if ( $expecting == OPERATOR ) {
4372              
4373             # Patch to avoid error message for RPerl overloaded
4374             # operator functions: use overload
4375             # '+' => \&sse_add,
4376             # '-' => \&sse_sub,
4377             # '*' => \&sse_mul,
4378             # '/' => \&sse_div;
4379             # TODO: this could eventually be generalized
4380 0 0 0     0 if ( $rsaw_use_module->{$current_package}->{'RPerl'}
    0          
    0          
4381             && $tok =~ /^sse_(mul|div|add|sub)$/ )
4382             {
4383              
4384             }
4385              
4386             # Fix part 1 for git #63 in which a comment falls
4387             # between an -> and the following word. An
4388             # alternate fix would be to change operator_expected
4389             # to return an UNKNOWN for this type.
4390             elsif ( $last_nonblank_type eq '->' ) {
4391              
4392             }
4393              
4394             # don't complain about possible indirect object
4395             # notation.
4396             # For example:
4397             # package main;
4398             # sub new($) { ... }
4399             # $b = new A::; # calls A::new
4400             # $c = new A; # same thing but suspicious
4401             # This will call A::new but we have a 'new' in
4402             # main:: which looks like a constant.
4403             #
4404             elsif ( $last_nonblank_type eq 'C' ) {
4405 0 0       0 if ( $tok !~ /::$/ ) {
4406 0         0 $self->complain(<<EOM);
4407             Expecting operator after '$last_nonblank_token' but found bare word '$tok'
4408             Maybe indirectet object notation?
4409             EOM
4410             }
4411             }
4412             else {
4413 0         0 $self->error_if_expecting_OPERATOR("bareword");
4414             }
4415             }
4416              
4417             # mark bare words immediately followed by a paren as
4418             # functions
4419 932         2563 $next_tok = $rtokens->[ $i + 1 ];
4420 932 100       2623 if ( $next_tok eq '(' ) {
4421              
4422             # Patch for issue c151, where we are processing a snippet and
4423             # have not seen that SPACE is a constant. In this case 'x' is
4424             # probably an operator. The only disadvantage with an incorrect
4425             # guess is that the space after it may be incorrect. For example
4426             # $str .= SPACE x ( 16 - length($str) ); See also b1410.
4427 276 50 33     1554 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
  0 50       0  
4428              
4429             # Fix part 2 for git #63. Leave type as 'w' to keep
4430             # the type the same as if the -> were not separated
4431 276         580 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
4432              
4433             # not a special case
4434             else { }
4435              
4436             }
4437              
4438             # underscore after file test operator is file handle
4439 932 50 66     2723 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4440 0         0 $type = 'Z';
4441             }
4442              
4443             # patch for SWITCH/CASE if 'case' and 'when are
4444             # not treated as keywords:
4445 932 50 33     4849 if (
      33        
      33        
4446             ( $tok eq 'case' && $rbrace_type->[$brace_depth] eq 'switch' )
4447             || ( $tok eq 'when'
4448             && $rbrace_type->[$brace_depth] eq 'given' )
4449             )
4450             {
4451 0         0 $statement_type = $tok; # next '{' is block
4452 0         0 $type = 'k'; # for keyword syntax coloring
4453             }
4454 932 100       2708 if ( $next_nonblank_token eq '(' ) {
4455              
4456             # patch for SWITCH/CASE if switch and given not keywords
4457             # Switch is not a perl 5 keyword, but we will gamble
4458             # and mark switch followed by paren as a keyword. This
4459             # is only necessary to get html syntax coloring nice,
4460             # and does not commit this as being a switch/case.
4461 241 50 33     1877 if ( $tok eq 'switch' || $tok eq 'given' ) {
    50 33        
4462 0         0 $type = 'k'; # for keyword syntax coloring
4463             }
4464              
4465             # mark 'x' as operator for something like this (see b1410)
4466             # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
4467             elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
4468 0         0 $type = 'x';
4469             }
4470             else {
4471             ## not a special case
4472             }
4473             }
4474             }
4475 957         1795 return;
4476             } ## end sub do_UNKNOWN_BAREWORD
4477              
4478             sub sub_attribute_ok_here {
4479              
4480 35     35 0 142 my ( $self, $tok_kw, $next_nonblank_token, $i_next ) = @_;
4481              
4482             # Decide if 'sub :' can be the start of a sub attribute list.
4483             # We will decide based on if the colon is followed by a
4484             # bareword which is not a keyword.
4485             # Changed inext+1 to inext to fixed case b1190.
4486 35         78 my $sub_attribute_ok_here;
4487 35 50 66     197 if ( $is_sub{$tok_kw}
      66        
4488             && $expecting != OPERATOR
4489             && $next_nonblank_token eq ':' )
4490             {
4491 3         17 my ( $nn_nonblank_token, $i_nn ) =
4492             $self->find_next_nonblank_token( $i_next, $rtokens,
4493             $max_token_index );
4494             $sub_attribute_ok_here =
4495             $nn_nonblank_token =~ /^\w/
4496             && $nn_nonblank_token !~ /^\d/
4497 3   66     50 && !$is_keyword{$nn_nonblank_token};
4498             }
4499 35         222 return $sub_attribute_ok_here;
4500             } ## end sub sub_attribute_ok_here
4501              
4502             sub do_BAREWORD {
4503              
4504 5832     5832 0 10830 my ( $self, $is_END_or_DATA ) = @_;
4505              
4506             # handle a bareword token:
4507             # returns
4508             # true if this token ends the current line
4509             # false otherwise
4510              
4511 5832         16261 my ( $next_nonblank_token, $i_next ) =
4512             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4513              
4514             # a bare word immediately followed by :: is not a keyword;
4515             # use $tok_kw when testing for keywords to avoid a mistake
4516 5832         11025 my $tok_kw = $tok;
4517 5832 100 100     17319 if ( $rtokens->[ $i + 1 ] eq ':'
4518             && $rtokens->[ $i + 2 ] eq ':' )
4519             {
4520 266         617 $tok_kw .= '::';
4521             }
4522              
4523 5832 100       13183 if ( $self->[_in_attribute_list_] ) {
4524 39         206 my $is_attribute = $self->do_ATTRIBUTE_LIST($next_nonblank_token);
4525 39 50       148 return if ($is_attribute);
4526             }
4527              
4528             #----------------------------------------
4529             # Starting final if-elsif- chain of tests
4530             #----------------------------------------
4531              
4532             # This is the return flag:
4533             # true => this is the last token on the line
4534             # false => keep tokenizing the line
4535 5793         8750 my $is_last;
4536              
4537             # The following blocks of code must update these vars:
4538             # $type - the final token type, must always be set
4539              
4540             # In addition, if additional pretokens are added:
4541             # $tok - the final token
4542             # $i - the index of the last pretoken
4543              
4544             # They may also need to check and set various flags
4545              
4546             # Scan a bare word following a -> as an identifier; it could
4547             # have a long package name. Fixes c037, c041.
4548 5793 100 100     94991 if ( $last_nonblank_token eq '->' ) {
    100 100        
    100 66        
    100 66        
    100 100        
    100 66        
    100 66        
    50 100        
    50 100        
    100 33        
    100 0        
    100 0        
    100 33        
    100 0        
    100 0        
    100 66        
    100 100        
    100 100        
      100        
      66        
      66        
      66        
4549 669         2500 $self->scan_bare_identifier();
4550              
4551             # a bareward after '->' gets type 'i'
4552 669         1243 $type = 'i';
4553             }
4554              
4555             # Quote a word followed by => operator
4556             # unless the word __END__ or __DATA__ and the only word on
4557             # the line.
4558             elsif ( !$is_END_or_DATA
4559             && $next_nonblank_token eq '='
4560             && $rtokens->[ $i_next + 1 ] eq '>' )
4561             {
4562 786         2500 $self->do_QUOTED_BAREWORD();
4563             }
4564              
4565             # quote a bare word within braces..like xxx->{s}; note that we
4566             # must be sure this is not a structural brace, to avoid
4567             # mistaking {s} in the following for a quoted bare word:
4568             # for(@[){s}bla}BLA}
4569             # Also treat q in something like var{-q} as a bare word, not
4570             # a quote operator
4571             elsif (
4572             $next_nonblank_token eq '}'
4573             && (
4574             $last_nonblank_type eq 'L'
4575             || ( $last_nonblank_type eq 'm'
4576             && $last_last_nonblank_type eq 'L' )
4577             )
4578             )
4579             {
4580 100         263 $type = 'w';
4581             }
4582              
4583             # handle operator x (now we know it isn't $x=)
4584             elsif (
4585             $expecting == OPERATOR
4586             && substr( $tok, 0, 1 ) eq 'x'
4587             && ( length($tok) == 1
4588             || substr( $tok, 1, 1 ) =~ /^\d/ )
4589             )
4590             {
4591 17         137 $self->do_X_OPERATOR();
4592             }
4593             elsif ( $tok_kw eq 'CORE::' ) {
4594 3         9 $type = $tok = $tok_kw;
4595 3         6 $i += 2;
4596             }
4597             elsif ( ( $tok eq 'strict' )
4598             and ( $last_nonblank_token eq 'use' ) )
4599             {
4600 14         40 $self->[_saw_use_strict_] = 1;
4601 14         83 $self->scan_bare_identifier();
4602             }
4603              
4604             elsif ( ( $tok eq 'warnings' )
4605             and ( $last_nonblank_token eq 'use' ) )
4606             {
4607 7         22 $self->[_saw_perl_dash_w_] = 1;
4608              
4609             # scan as identifier, so that we pick up something like:
4610             # use warnings::register
4611 7         28 $self->scan_bare_identifier();
4612             }
4613              
4614             elsif (
4615             $tok eq 'AutoLoader'
4616             && $self->[_look_for_autoloader_]
4617             && (
4618             $last_nonblank_token eq 'use'
4619              
4620             # these regexes are from AutoSplit.pm, which we want
4621             # to mimic
4622             || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4623             || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4624             )
4625             )
4626             {
4627 0         0 $self->write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4628 0         0 $self->[_saw_autoloader_] = 1;
4629 0         0 $self->[_look_for_autoloader_] = 0;
4630 0         0 $self->scan_bare_identifier();
4631             }
4632              
4633             elsif (
4634             $tok eq 'SelfLoader'
4635             && $self->[_look_for_selfloader_]
4636             && ( $last_nonblank_token eq 'use'
4637             || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4638             || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4639             )
4640             {
4641 0         0 $self->write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4642 0         0 $self->[_saw_selfloader_] = 1;
4643 0         0 $self->[_look_for_selfloader_] = 0;
4644 0         0 $self->scan_bare_identifier();
4645             }
4646              
4647             elsif ( ( $tok eq 'constant' )
4648             and ( $last_nonblank_token eq 'use' ) )
4649             {
4650 16         88 $self->do_USE_CONSTANT();
4651             }
4652              
4653             # various quote operators
4654             elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4655 202         993 $self->do_QUOTE_OPERATOR();
4656             }
4657              
4658             # check for a statement label
4659             elsif (
4660             ( $next_nonblank_token eq ':' )
4661             && ( $rtokens->[ $i_next + 1 ] ne ':' )
4662             && ( $i_next <= $max_token_index ) # colon on same line
4663              
4664             # like 'sub : lvalue' ?
4665             && !$self->sub_attribute_ok_here( $tok_kw, $next_nonblank_token,
4666             $i_next )
4667             && new_statement_ok()
4668             )
4669             {
4670 33 100       181 if ( $tok !~ /[A-Z]/ ) {
4671 15         32 push @{ $self->[_rlower_case_labels_at_] }, $input_line_number;
  15         58  
4672             }
4673 33         117 $type = 'J';
4674 33         104 $tok .= ':';
4675 33         68 $i = $i_next;
4676             }
4677              
4678             # 'sub' or other sub alias
4679             elsif ( $is_sub{$tok_kw} ) {
4680              
4681             # Update for --use-feature=class (rt145706):
4682             # We have to be extra careful to avoid misparsing other uses of
4683             # 'method' in older scripts.
4684 302 100 100     1459 if ( $tok_kw eq 'method' && $guess_if_method ) {
4685 10 100 66     102 if ( $expecting == OPERATOR
      100        
4686             || $next_nonblank_token !~ /^[\w\:]/
4687             || !$self->method_ok_here() )
4688             {
4689 7         28 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4690             }
4691             else {
4692 3         182 initialize_subname();
4693 3         18 $self->scan_id();
4694             }
4695             }
4696             else {
4697 292 50       895 $self->error_if_expecting_OPERATOR()
4698             if ( $expecting == OPERATOR );
4699 292         1143 initialize_subname();
4700 292         1063 $self->scan_id();
4701             }
4702             }
4703              
4704             # 'package'
4705             elsif ( $is_package{$tok_kw} ) {
4706              
4707             # Update for --use-feature=class (rt145706):
4708             # We have to be extra careful because 'class' may be used for other
4709             # purposes on older code; i.e.
4710             # class($x) - valid sub call
4711             # package($x) - error
4712 30 100       95 if ( $tok_kw eq 'class' ) {
4713 8 100 66     68 if ( $expecting == OPERATOR
      100        
4714             || $next_nonblank_token !~ /^[\w\:]/
4715             || !$self->class_ok_here() )
4716             {
4717 4         16 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4718             }
4719 4         12 else { $self->scan_id() }
4720             }
4721             else {
4722 22 50       65 $self->error_if_expecting_OPERATOR()
4723             if ( $expecting == OPERATOR );
4724 22         86 $self->scan_id();
4725             }
4726             }
4727              
4728             # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4729             # more restrictive. Require a new statement to be ok here.
4730             elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4731 1         3 $type = ';'; # make tokenizer look for TERM next
4732 1         4 $self->[_in_format_] = 1;
4733 1         2 $is_last = 1; ## is last token on this line
4734             }
4735              
4736             # Note on token types for format, __DATA__, __END__:
4737             # It simplifies things to give these type ';', so that when we
4738             # start rescanning we will be expecting a token of type TERM.
4739             # We will switch to type 'k' before outputting the tokens.
4740             elsif ( $is_END_DATA{$tok_kw} ) {
4741 7         28 $type = ';'; # make tokenizer look for TERM next
4742              
4743             # Remember that we are in one of these three sections
4744 7         26 $self->[ $is_END_DATA{$tok_kw} ] = 1;
4745 7         12 $is_last = 1; ## is last token on this line
4746             }
4747              
4748             elsif ( $is_keyword{$tok_kw} ) {
4749 2641         8749 $self->do_KEYWORD();
4750             }
4751              
4752             # check for inline label following
4753             # /^(redo|last|next|goto)$/
4754             elsif (( $last_nonblank_type eq 'k' )
4755             && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4756             {
4757 19         54 $type = 'j';
4758             }
4759              
4760             # something else --
4761             else {
4762 946         3254 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4763             }
4764              
4765 5793         11682 return $is_last;
4766              
4767             } ## end sub do_BAREWORD
4768              
4769             sub do_FOLLOW_QUOTE {
4770              
4771 2768     2768 0 4389 my $self = shift;
4772              
4773             # Continue following a quote on a new line
4774 2768         4571 $type = $quote_type;
4775              
4776 2768 100       4066 if ( !@{$routput_token_list} ) { # initialize if continuation line
  2768         7093  
4777 184         384 push( @{$routput_token_list}, $i );
  184         398  
4778 184         475 $routput_token_type->[$i] = $type;
4779              
4780             }
4781              
4782             # scan for the end of the quote or pattern
4783             (
4784 2768         8854 $i,
4785             $in_quote,
4786             $quote_character,
4787             $quote_pos,
4788             $quote_depth,
4789             $quoted_string_1,
4790             $quoted_string_2,
4791              
4792             ) = $self->do_quote(
4793              
4794             $i,
4795             $in_quote,
4796             $quote_character,
4797             $quote_pos,
4798             $quote_depth,
4799             $quoted_string_1,
4800             $quoted_string_2,
4801             $rtokens,
4802             $rtoken_map,
4803             $max_token_index,
4804              
4805             );
4806              
4807             # all done if we didn't find it
4808 2768 100       6930 if ($in_quote) { return }
  183         398  
4809              
4810             # save pattern and replacement text for rescanning
4811 2585         4350 my $qs1 = $quoted_string_1;
4812              
4813             # re-initialize for next search
4814 2585         4052 $quote_character = EMPTY_STRING;
4815 2585         3704 $quote_pos = 0;
4816 2585         3974 $quote_type = 'Q';
4817 2585         3831 $quoted_string_1 = EMPTY_STRING;
4818 2585         3911 $quoted_string_2 = EMPTY_STRING;
4819 2585 100       6397 if ( ++$i > $max_token_index ) { return }
  116         376  
4820              
4821             # look for any modifiers
4822 2469 100       5335 if ($allowed_quote_modifiers) {
4823              
4824             # check for exact quote modifiers
4825 144 100       750 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4826 30         63 my $str = $rtokens->[$i];
4827 30         55 my $saw_modifier_e;
4828 30         521 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4829 47         110 my $pos = pos($str);
4830 47         111 my $char = substr( $str, $pos - 1, 1 );
4831 47   66     301 $saw_modifier_e ||= ( $char eq 'e' );
4832             }
4833              
4834             # For an 'e' quote modifier we must scan the replacement
4835             # text for here-doc targets...
4836             # but if the modifier starts a new line we can skip
4837             # this because either the here doc will be fully
4838             # contained in the replacement text (so we can
4839             # ignore it) or Perl will not find it.
4840             # See test 'here2.in'.
4841 30 50 66     160 if ( $saw_modifier_e && $i_tok >= 0 ) {
4842              
4843 0         0 my $rht = $self->scan_replacement_text($qs1);
4844              
4845             # Change type from 'Q' to 'h' for quotes with
4846             # here-doc targets so that the formatter (see sub
4847             # process_line_of_CODE) will not make any line
4848             # breaks after this point.
4849 0 0       0 if ($rht) {
4850 0         0 push @{$rhere_target_list}, @{$rht};
  0         0  
  0         0  
4851 0         0 $type = 'h';
4852 0 0       0 if ( $i_tok < 0 ) {
4853 0         0 my $ilast = $routput_token_list->[-1];
4854 0         0 $routput_token_type->[$ilast] = $type;
4855             }
4856             }
4857             }
4858              
4859 30 50       110 if ( defined( pos($str) ) ) {
4860              
4861             # matched
4862 30 50       97 if ( pos($str) == length($str) ) {
4863 30 50       128 if ( ++$i > $max_token_index ) { return }
  0         0  
4864             }
4865              
4866             # Looks like a joined quote modifier
4867             # and keyword, maybe something like
4868             # s/xxx/yyy/gefor @k=...
4869             # Example is "galgen.pl". Would have to split
4870             # the word and insert a new token in the
4871             # pre-token list. This is so rare that I haven't
4872             # done it. Will just issue a warning citation.
4873              
4874             # This error might also be triggered if my quote
4875             # modifier characters are incomplete
4876             else {
4877 0         0 $self->warning(<<EOM);
4878              
4879             Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4880             Please put a space between quote modifiers and trailing keywords.
4881             EOM
4882              
4883             # print "token $rtokens->[$i]\n";
4884             # my $num = length($str) - pos($str);
4885             # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4886             # print "continuing with new token $rtokens->[$i]\n";
4887              
4888             # skipping past this token does least damage
4889 0 0       0 if ( ++$i > $max_token_index ) { return }
  0         0  
4890             }
4891             }
4892             else {
4893              
4894             # example file: rokicki4.pl
4895             # This error might also be triggered if my quote
4896             # modifier characters are incomplete
4897 0         0 $self->write_logfile_entry(
4898             "Note: found word $str at quote modifier location\n");
4899             }
4900             }
4901              
4902             # re-initialize
4903 144         298 $allowed_quote_modifiers = EMPTY_STRING;
4904             }
4905 2469         4483 return;
4906             } ## end sub do_FOLLOW_QUOTE
4907              
4908             # ------------------------------------------------------------
4909             # begin hash of code for handling most token types
4910             # ------------------------------------------------------------
4911             my $tokenization_code = {
4912              
4913             '>' => \&do_GREATER_THAN_SIGN,
4914             '|' => \&do_VERTICAL_LINE,
4915             '$' => \&do_DOLLAR_SIGN,
4916             '(' => \&do_LEFT_PARENTHESIS,
4917             ')' => \&do_RIGHT_PARENTHESIS,
4918             ',' => \&do_COMMA,
4919             ';' => \&do_SEMICOLON,
4920             '"' => \&do_QUOTATION_MARK,
4921             "'" => \&do_APOSTROPHE,
4922             '`' => \&do_BACKTICK,
4923             '/' => \&do_SLASH,
4924             '{' => \&do_LEFT_CURLY_BRACKET,
4925             '}' => \&do_RIGHT_CURLY_BRACKET,
4926             '&' => \&do_AMPERSAND,
4927             '<' => \&do_LESS_THAN_SIGN,
4928             '?' => \&do_QUESTION_MARK,
4929             '*' => \&do_STAR,
4930             '.' => \&do_DOT,
4931             ':' => \&do_COLON,
4932             '+' => \&do_PLUS_SIGN,
4933             '@' => \&do_AT_SIGN,
4934             '%' => \&do_PERCENT_SIGN,
4935             '[' => \&do_LEFT_SQUARE_BRACKET,
4936             ']' => \&do_RIGHT_SQUARE_BRACKET,
4937             '-' => \&do_MINUS_SIGN,
4938             '^' => \&do_CARAT_SIGN,
4939             '::' => \&do_DOUBLE_COLON,
4940             '<<' => \&do_LEFT_SHIFT,
4941             '<<~' => \&do_NEW_HERE_DOC,
4942             '->' => \&do_POINTER,
4943             '++' => \&do_PLUS_PLUS,
4944             '=>' => \&do_FAT_COMMA,
4945             '--' => \&do_MINUS_MINUS,
4946             '&&' => \&do_LOGICAL_AND,
4947             '||' => \&do_LOGICAL_OR,
4948             '//' => \&do_SLASH_SLASH,
4949              
4950             # No special code for these types yet, but syntax checks
4951             # could be added.
4952             ## '!' => undef,
4953             ## '!=' => undef,
4954             ## '!~' => undef,
4955             ## '%=' => undef,
4956             ## '&&=' => undef,
4957             ## '&=' => undef,
4958             ## '+=' => undef,
4959             ## '-=' => undef,
4960             ## '..' => undef,
4961             ## '..' => undef,
4962             ## '...' => undef,
4963             ## '.=' => undef,
4964             ## '<<=' => undef,
4965             ## '<=' => undef,
4966             ## '<=>' => undef,
4967             ## '<>' => undef,
4968             ## '=' => undef,
4969             ## '==' => undef,
4970             ## '=~' => undef,
4971             ## '>=' => undef,
4972             ## '>>' => undef,
4973             ## '>>=' => undef,
4974             ## '\\' => undef,
4975             ## '^=' => undef,
4976             ## '|=' => undef,
4977             ## '||=' => undef,
4978             ## '//=' => undef,
4979             ## '~' => undef,
4980             ## '~~' => undef,
4981             ## '!~~' => undef,
4982              
4983             };
4984              
4985             # ------------------------------------------------------------
4986             # end hash of code for handling individual token types
4987             # ------------------------------------------------------------
4988              
4989 39     39   412 use constant DEBUG_TOKENIZE => 0;
  39         157  
  39         134030  
4990              
4991             sub tokenize_this_line {
4992              
4993             # This routine breaks a line of perl code into tokens which are of use in
4994             # indentation and reformatting. One of my goals has been to define tokens
4995             # such that a newline may be inserted between any pair of tokens without
4996             # changing or invalidating the program. This version comes close to this,
4997             # although there are necessarily a few exceptions which must be caught by
4998             # the formatter. Many of these involve the treatment of bare words.
4999             #
5000             # The tokens and their types are returned in arrays. See previous
5001             # routine for their names.
5002             #
5003             # See also the array "valid_token_types" in the BEGIN section for an
5004             # up-to-date list.
5005             #
5006             # To simplify things, token types are either a single character, or they
5007             # are identical to the tokens themselves.
5008             #
5009             # As a debugging aid, the -D flag creates a file containing a side-by-side
5010             # comparison of the input string and its tokenization for each line of a file.
5011             # This is an invaluable debugging aid.
5012             #
5013             # In addition to tokens, and some associated quantities, the tokenizer
5014             # also returns flags indication any special line types. These include
5015             # quotes, here_docs, formats.
5016             #
5017             # -----------------------------------------------------------------------
5018             #
5019             # How to add NEW_TOKENS:
5020             #
5021             # New token types will undoubtedly be needed in the future both to keep up
5022             # with changes in perl and to help adapt the tokenizer to other applications.
5023             #
5024             # Here are some notes on the minimal steps. I wrote these notes while
5025             # adding the 'v' token type for v-strings, which are things like version
5026             # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
5027             # can use your editor to search for the string "NEW_TOKENS" to find the
5028             # appropriate sections to change):
5029             #
5030             # *. Try to talk somebody else into doing it! If not, ..
5031             #
5032             # *. Make a backup of your current version in case things don't work out!
5033             #
5034             # *. Think of a new, unused character for the token type, and add to
5035             # the array @valid_token_types in the BEGIN section of this package.
5036             # For example, I used 'v' for v-strings.
5037             #
5038             # *. Implement coding to recognize the $type of the token in this routine.
5039             # This is the hardest part, and is best done by imitating or modifying
5040             # some of the existing coding. For example, to recognize v-strings, I
5041             # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
5042             # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
5043             #
5044             # *. Update sub operator_expected. This update is critically important but
5045             # the coding is trivial. Look at the comments in that routine for help.
5046             # For v-strings, which should behave like numbers, I just added 'v' to the
5047             # regex used to handle numbers and strings (types 'n' and 'Q').
5048             #
5049             # *. Implement a 'bond strength' rule in sub set_bond_strengths in
5050             # Perl::Tidy::Formatter for breaking lines around this token type. You can
5051             # skip this step and take the default at first, then adjust later to get
5052             # desired results. For adding type 'v', I looked at sub bond_strength and
5053             # saw that number type 'n' was using default strengths, so I didn't do
5054             # anything. I may tune it up someday if I don't like the way line
5055             # breaks with v-strings look.
5056             #
5057             # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
5058             # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
5059             # and saw that type 'n' used spaces on both sides, so I just added 'v'
5060             # to the array @spaces_both_sides.
5061             #
5062             # *. Update HtmlWriter package so that users can colorize the token as
5063             # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
5064             # that package. For v-strings, I initially chose to use a default color
5065             # equal to the default for numbers, but it might be nice to change that
5066             # eventually.
5067             #
5068             # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
5069             #
5070             # *. Run lots and lots of debug tests. Start with special files designed
5071             # to test the new token type. Run with the -D flag to create a .DEBUG
5072             # file which shows the tokenization. When these work ok, test as many old
5073             # scripts as possible. Start with all of the '.t' files in the 'test'
5074             # directory of the distribution file. Compare .tdy output with previous
5075             # version and updated version to see the differences. Then include as
5076             # many more files as possible. My own technique has been to collect a huge
5077             # number of perl scripts (thousands!) into one directory and run perltidy
5078             # *, then run diff between the output of the previous version and the
5079             # current version.
5080             #
5081             # *. For another example, search for the smartmatch operator '~~'
5082             # with your editor to see where updates were made for it.
5083             #
5084             # -----------------------------------------------------------------------
5085              
5086 7510     7510 0 15198 my ( $self, $line_of_tokens ) = @_;
5087 7510         14684 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
5088              
5089             # Extract line number for use in error messages
5090 7510         12413 $input_line_number = $line_of_tokens->{_line_number};
5091              
5092             # Check for pod documentation
5093 7510 100 66     20124 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
5094             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
5095             {
5096              
5097             # Must not be in multi-line quote
5098             # and must not be in an equation
5099 14         38 my $blank_after_Z = 1;
5100 14 50 33     109 if (
5101             !$in_quote
5102             && ( $self->operator_expected( '=', 'b', $blank_after_Z ) ==
5103             TERM )
5104             )
5105             {
5106 14         38 $self->[_in_pod_] = 1;
5107 14         34 return;
5108             }
5109             }
5110              
5111 7496         13911 $input_line = $untrimmed_input_line;
5112              
5113 7496         13780 chomp $input_line;
5114              
5115             # Set a flag to indicate if we might be at an __END__ or __DATA__ line
5116             # This will be used below to avoid quoting a bare word followed by
5117             # a fat comma.
5118 7496         10905 my $is_END_or_DATA;
5119              
5120             # Reinitialize the multi-line quote flag
5121 7496 100 100     19124 if ( $in_quote && $quote_type eq 'Q' ) {
5122 47         148 $line_of_tokens->{_starting_in_quote} = 1;
5123             }
5124             else {
5125 7449         12676 $line_of_tokens->{_starting_in_quote} = 0;
5126              
5127             # Trim start of this line unless we are continuing a quoted line.
5128             # Do not trim end because we might end in a quote (test: deken4.pl)
5129             # Perl::Tidy::Formatter will delete needless trailing blanks
5130 7449 100       34073 if ( !length($input_line) ) {
    100          
5131              
5132             # line is empty
5133             }
5134             elsif ( $input_line =~ m/\S/g ) {
5135              
5136             # There are $spaces blank characters before a nonblank character
5137 6643         13170 my $spaces = pos($input_line) - 1;
5138 6643 100       15364 if ( $spaces > 0 ) {
5139              
5140             # Trim the leading spaces
5141 3541         9010 $input_line = substr( $input_line, $spaces );
5142              
5143             # Find actual space count if there are leading tabs
5144 3541 100 66     12825 if (
5145             ord( substr( $untrimmed_input_line, 0, 1 ) ) == ORD_TAB
5146             && $untrimmed_input_line =~ /^(\t+)/ )
5147             {
5148 213         455 my $tabsize = $self->[_tabsize_];
5149 213         605 $spaces += length($1) * ( $tabsize - 1 );
5150             }
5151              
5152             # Calculate a guessed level for nonblank lines to avoid
5153             # calls to sub guess_old_indentation_level()
5154 3541         7194 my $indent_columns = $self->[_indent_columns_];
5155             $line_of_tokens->{_guessed_indentation_level} =
5156 3541         10218 int( $spaces / $indent_columns );
5157             }
5158             }
5159             else {
5160              
5161             # line has all blank characters
5162 9         37 $input_line = EMPTY_STRING;
5163             }
5164              
5165             }
5166              
5167 7496 100       16380 if ( !$in_quote ) {
5168              
5169             # Optimize handling of a blank line
5170 7312 100       16299 if ( !length($input_line) ) {
5171 806         2275 $line_of_tokens->{_line_type} = 'CODE';
5172 806         1835 $line_of_tokens->{_rtokens} = [];
5173 806         2061 $line_of_tokens->{_rtoken_type} = [];
5174 806         1774 $line_of_tokens->{_rlevels} = [];
5175 806         1679 $line_of_tokens->{_rci_levels} = [];
5176 806         1670 $line_of_tokens->{_rblock_type} = [];
5177 806         1703 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5178 806         2236 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5179 806         1694 return;
5180             }
5181              
5182             # Check comments
5183 6506 100       15435 if ( substr( $input_line, 0, 1 ) eq '#' ) {
5184              
5185             # and check for skipped section
5186 788 50 66     4623 if (
      66        
      66        
5187             (
5188             substr( $input_line, 0, 4 ) eq '#<<V'
5189             || $rOpts_code_skipping_begin
5190             )
5191             && $rOpts_code_skipping
5192             && $input_line =~ /$code_skipping_pattern_begin/
5193             )
5194             {
5195 2         9 $self->[_in_skipped_] = $self->[_last_line_number_];
5196 2         6 return;
5197             }
5198              
5199             # Optional fast processing of a block comment
5200 786         1766 $line_of_tokens->{_line_type} = 'CODE';
5201 786         3294 $line_of_tokens->{_rtokens} = [$input_line];
5202 786         2248 $line_of_tokens->{_rtoken_type} = ['#'];
5203 786         2108 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
5204 786         1847 $line_of_tokens->{_rci_levels} = [0];
5205 786         2066 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
5206 786         1798 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5207 786         2612 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5208 786         1771 return;
5209             }
5210              
5211             # Look for __END__ or __DATA__ lines
5212 5718 100 100     16018 if ( substr( $input_line, 0, 1 ) eq '_'
5213             && $input_line =~ /^__(END|DATA)__\s*$/ )
5214             {
5215 7         18 $is_END_or_DATA = 1;
5216             }
5217             }
5218              
5219             # update the copy of the line for use in error messages
5220             # This must be exactly what we give the pre_tokenizer
5221 5902         10954 $self->[_line_of_text_] = $input_line;
5222              
5223             # re-initialize for the main loop
5224 5902         14186 $routput_token_list = []; # stack of output token indexes
5225 5902         17994 $routput_token_type = []; # token types
5226 5902         16534 $routput_block_type = []; # types of code block
5227 5902         15616 $routput_container_type = []; # paren types, such as if, elsif, ..
5228 5902         14729 $routput_type_sequence = []; # nesting sequential number
5229              
5230 5902         9635 $rhere_target_list = [];
5231              
5232 5902         9766 $tok = $last_nonblank_token;
5233 5902         8996 $type = $last_nonblank_type;
5234 5902         9079 $prototype = $last_nonblank_prototype;
5235 5902         8845 $last_nonblank_i = -1;
5236 5902         8940 $block_type = $last_nonblank_block_type;
5237 5902         8807 $container_type = $last_nonblank_container_type;
5238 5902         8618 $type_sequence = $last_nonblank_type_sequence;
5239 5902         8371 $indent_flag = 0;
5240 5902         8496 $peeked_ahead = 0;
5241              
5242 5902         18210 $self->tokenizer_main_loop($is_END_or_DATA);
5243              
5244             #-----------------------------------------------
5245             # all done tokenizing this line ...
5246             # now prepare the final list of tokens and types
5247             #-----------------------------------------------
5248 5902         18260 $self->tokenizer_wrapup_line($line_of_tokens);
5249              
5250 5902         10832 return;
5251             } ## end sub tokenize_this_line
5252              
5253             sub tokenizer_main_loop {
5254              
5255 5902     5902 0 11845 my ( $self, $is_END_or_DATA ) = @_;
5256              
5257             #---------------------------------
5258             # Break one input line into tokens
5259             #---------------------------------
5260              
5261             # Input parameter:
5262             # $is_END_or_DATA is true for a __END__ or __DATA__ line
5263              
5264             # start by breaking the line into pre-tokens
5265 5902         14524 ( $rtokens, $rtoken_map, $rtoken_type ) = pre_tokenize($input_line);
5266              
5267 5902         24026 $max_token_index = scalar( @{$rtokens} ) - 1;
  5902         10827  
5268 5902         8960 push( @{$rtokens}, SPACE, SPACE, SPACE )
  5902         14800  
5269             ; # extra whitespace simplifies logic
5270 5902         8787 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
  5902         12660  
5271 5902         8480 push( @{$rtoken_type}, 'b', 'b', 'b' );
  5902         12400  
5272              
5273             # initialize for main loop
5274 5902         8449 if (0) { #<<< this is not necessary
5275             foreach my $ii ( 0 .. $max_token_index + 3 ) {
5276             $routput_token_type->[$ii] = EMPTY_STRING;
5277             $routput_block_type->[$ii] = EMPTY_STRING;
5278             $routput_container_type->[$ii] = EMPTY_STRING;
5279             $routput_type_sequence->[$ii] = EMPTY_STRING;
5280             $routput_indent_flag->[$ii] = 0;
5281             }
5282             }
5283              
5284 5902         8715 $i = -1;
5285 5902         8894 $i_tok = -1;
5286              
5287             #-----------------------
5288             # main tokenization loop
5289             #-----------------------
5290              
5291             # we are looking at each pre-token of one line and combining them
5292             # into tokens
5293 5902         13541 while ( ++$i <= $max_token_index ) {
5294              
5295             # continue looking for the end of a quote
5296 50796 100       87393 if ($in_quote) {
5297 2768         9005 $self->do_FOLLOW_QUOTE();
5298 2768 100 100     11289 last if ( $in_quote || $i > $max_token_index );
5299             }
5300              
5301 50497 100 100     137316 if ( $type ne 'b' && $type ne 'CORE::' ) {
5302              
5303             # try to catch some common errors
5304 35283 100 100     75902 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
5305              
5306 1592 100       4839 if ( $last_nonblank_token eq 'eq' ) {
    50          
5307 9         59 $self->complain("Should 'eq' be '==' here ?\n");
5308             }
5309             elsif ( $last_nonblank_token eq 'ne' ) {
5310 0         0 $self->complain("Should 'ne' be '!=' here ?\n");
5311             }
5312             else {
5313             # that's all
5314             }
5315             }
5316              
5317             # fix c090, only rotate vars if a new token will be stored
5318 35283 100       62955 if ( $i_tok >= 0 ) {
5319              
5320 29519         44184 $last_last_nonblank_token = $last_nonblank_token;
5321 29519         40593 $last_last_nonblank_type = $last_nonblank_type;
5322              
5323 29519         41735 $last_nonblank_prototype = $prototype;
5324 29519         39544 $last_nonblank_block_type = $block_type;
5325 29519         39689 $last_nonblank_container_type = $container_type;
5326 29519         39763 $last_nonblank_type_sequence = $type_sequence;
5327 29519         38589 $last_nonblank_i = $i_tok;
5328              
5329             # Fix part #3 for git82: propagate type 'Z' though L-R pair
5330 29519 100 100     61846 if ( !( $type eq 'R' && $last_nonblank_type eq 'Z' ) ) {
5331 29518         40827 $last_nonblank_token = $tok;
5332 29518         40279 $last_nonblank_type = $type;
5333             }
5334             }
5335              
5336             # Patch for c030: Fix things in case a '->' got separated from
5337             # the subsequent identifier by a side comment. We need the
5338             # last_nonblank_token to have a leading -> to avoid triggering
5339             # an operator expected error message at the next '('. See also
5340             # fix for git #63.
5341 35283 100       64498 if ( $last_last_nonblank_token eq '->' ) {
5342 885 100 66     5104 if ( $last_nonblank_type eq 'w'
5343             || $last_nonblank_type eq 'i' )
5344             {
5345 674         1690 $last_nonblank_token = '->' . $last_nonblank_token;
5346 674         1334 $last_nonblank_type = 'i';
5347             }
5348             }
5349             }
5350              
5351             # store previous token type
5352 50497 100       86378 if ( $i_tok >= 0 ) {
5353 44733         86656 $routput_token_type->[$i_tok] = $type;
5354 44733         73265 $routput_block_type->[$i_tok] = $block_type;
5355 44733         73133 $routput_container_type->[$i_tok] = $container_type;
5356 44733         70103 $routput_type_sequence->[$i_tok] = $type_sequence;
5357 44733         67550 $routput_indent_flag->[$i_tok] = $indent_flag;
5358             }
5359              
5360             # get the next pre-token and type
5361             # $tok and $type will be modified to make the output token
5362 50497         79628 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
5363 50497         75952 my $pre_type = $type = $rtoken_type->[$i]; # and type
5364              
5365             # remember the starting index of this token; we will be updating $i
5366 50497         67464 $i_tok = $i;
5367              
5368             # re-initialize various flags for the next output token
5369             (
5370              
5371 50497         89767 $block_type,
5372             $container_type,
5373             $type_sequence,
5374             $indent_flag,
5375             $prototype,
5376             )
5377             = (
5378              
5379             EMPTY_STRING,
5380             EMPTY_STRING,
5381             EMPTY_STRING,
5382             0,
5383             EMPTY_STRING,
5384             );
5385              
5386             # this pre-token will start an output token
5387 50497         62928 push( @{$routput_token_list}, $i_tok );
  50497         85383  
5388              
5389             # The search for the full token ends in one of 5 main END NODES:
5390              
5391             #-----------------------
5392             # END NODE 1: whitespace
5393             #-----------------------
5394 50497 100       106731 next if ( $pre_type eq 'b' );
5395              
5396             #----------------------
5397             # END NODE 2: a comment
5398             #----------------------
5399 35141 100       60791 last if ( $pre_type eq '#' );
5400              
5401             # continue gathering identifier if necessary
5402 34813 100       61058 if ($id_scan_state) {
5403              
5404 17 100 66     144 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
5405 10         56 $self->scan_id();
5406             }
5407             else {
5408 7         36 $self->scan_identifier();
5409             }
5410              
5411 17 100       71 if ($id_scan_state) {
5412              
5413             # Still scanning ...
5414             # Check for side comment between sub and prototype (c061)
5415              
5416             # done if nothing left to scan on this line
5417 1 50       3 last if ( $i > $max_token_index );
5418              
5419 1         5 my ( $next_nonblank_token, $i_next ) =
5420             find_next_nonblank_token_on_this_line( $i, $rtokens,
5421             $max_token_index );
5422              
5423             # done if it was just some trailing space
5424 1 50       4 last if ( $i_next > $max_token_index );
5425              
5426             # something remains on the line ... must be a side comment
5427 1         3 next;
5428             }
5429              
5430 16 100 100     135 next if ( ( $i > 0 ) || $type );
5431              
5432             # didn't find any token; start over
5433 7         23 $type = $pre_type;
5434 7         17 $tok = $pre_tok;
5435             }
5436              
5437             #-----------------------------------------------------------
5438             # Combine pre-tokens into digraphs and trigraphs if possible
5439             #-----------------------------------------------------------
5440              
5441             # See if we can make a digraph...
5442             # The following tokens are excluded and handled specially:
5443             # '/=' is excluded because the / might start a pattern.
5444             # 'x=' is excluded since it might be $x=, with $ on previous line
5445             # '**' and *= might be typeglobs of punctuation variables
5446             # I have allowed tokens starting with <, such as <=,
5447             # because I don't think these could be valid angle operators.
5448             # test file: storrs4.pl
5449 34803 100 100     108213 if ( $can_start_digraph{$tok}
      100        
5450             && $i < $max_token_index
5451             && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
5452             {
5453              
5454 2559         4945 my $combine_ok = 1;
5455 2559         5337 my $test_tok = $tok . $rtokens->[ $i + 1 ];
5456              
5457             # check for special cases which cannot be combined
5458              
5459             # '//' must be defined_or operator if an operator is expected.
5460             # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
5461             # could be migrated here for clarity
5462              
5463             # Patch for RT#102371, misparsing a // in the following snippet:
5464             # state $b //= ccc();
5465             # The solution is to always accept the digraph (or trigraph)
5466             # after type 'Z' (possible file handle). The reason is that
5467             # sub operator_expected gives TERM expected here, which is
5468             # wrong in this case.
5469 2559 100 66     7033 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
5470              
5471             # note that here $tok = '/' and the next tok and type is '/'
5472 16         33 my $blank_after_Z;
5473 16         56 $expecting =
5474             $self->operator_expected( $tok, '/', $blank_after_Z );
5475              
5476             # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
5477 16 100       53 $combine_ok = 0 if ( $expecting == TERM );
5478             }
5479              
5480             # Patch for RT #114359: mis-parsing of "print $x ** 0.5;
5481             # Accept the digraphs '**' only after type 'Z'
5482             # Otherwise postpone the decision.
5483 2559 100       6047 if ( $test_tok eq '**' ) {
5484 39 100       126 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
  37         79  
5485             }
5486              
5487 2559 50 66     16219 if (
      66        
      33        
5488              
5489             # still ok to combine?
5490             $combine_ok
5491              
5492             && ( $test_tok ne '/=' ) # might be pattern
5493             && ( $test_tok ne 'x=' ) # might be $x
5494             && ( $test_tok ne '*=' ) # typeglob?
5495              
5496             # Moved above as part of fix for
5497             # RT #114359: Missparsing of "print $x ** 0.5;
5498             # && ( $test_tok ne '**' ) # typeglob?
5499             )
5500             {
5501 2518         4014 $tok = $test_tok;
5502 2518         3621 $i++;
5503              
5504             # Now try to assemble trigraphs. Note that all possible
5505             # perl trigraphs can be constructed by appending a character
5506             # to a digraph.
5507 2518         4293 $test_tok = $tok . $rtokens->[ $i + 1 ];
5508              
5509 2518 100       6095 if ( $is_trigraph{$test_tok} ) {
5510 76         186 $tok = $test_tok;
5511 76         166 $i++;
5512             }
5513              
5514             # The only current tetragraph is the double diamond operator
5515             # and its first three characters are NOT a trigraph, so
5516             # we do can do a special test for it
5517             else {
5518 2442 100       5773 if ( $test_tok eq '<<>' ) {
5519 1         4 $test_tok .= $rtokens->[ $i + 2 ];
5520 1 50       4 if ( $is_tetragraph{$test_tok} ) {
5521 1         2 $tok = $test_tok;
5522 1         5 $i += 2;
5523             }
5524             }
5525             }
5526             }
5527             }
5528              
5529 34803         49416 $type = $tok;
5530 34803         57425 $next_tok = $rtokens->[ $i + 1 ];
5531 34803         51733 $next_type = $rtoken_type->[ $i + 1 ];
5532              
5533             # expecting an operator here? first try table lookup, then function
5534 34803         64390 $expecting = $op_expected_table{$last_nonblank_type};
5535 34803 100       67575 if ( !defined($expecting) ) {
5536 8391   100     20217 my $blank_after_Z = $last_nonblank_type eq 'Z'
5537             && ( $i == 0 || $rtoken_type->[ $i - 1 ] eq 'b' );
5538 8391         22469 $expecting =
5539             $self->operator_expected( $tok, $next_type, $blank_after_Z );
5540             }
5541              
5542 34803         45843 DEBUG_TOKENIZE && do {
5543             local $LIST_SEPARATOR = ')(';
5544             my @debug_list = (
5545             $last_nonblank_token, $tok,
5546             $next_tok, $brace_depth,
5547             $rbrace_type->[$brace_depth], $paren_depth,
5548             $rparen_type->[$paren_depth],
5549             );
5550             print {*STDOUT} "TOKENIZE:(@debug_list)\n";
5551             };
5552              
5553             # We have the next token, $tok.
5554             # Now we have to examine this token and decide what it is
5555             # and define its $type
5556              
5557             #------------------------
5558             # END NODE 3: a bare word
5559             #------------------------
5560 34803 100       64558 if ( $pre_type eq 'w' ) {
5561 5832         17547 my $is_last = $self->do_BAREWORD($is_END_or_DATA);
5562 5832 100       12147 last if ($is_last);
5563 5824         14552 next;
5564             }
5565              
5566             # Turn off attribute list on first non-blank, non-bareword.
5567             # Added '#' to fix c038 (later moved above).
5568 28971   100     55711 $self->[_in_attribute_list_] &&= 0;
5569              
5570             #-------------------------------
5571             # END NODE 4: a string of digits
5572             #-------------------------------
5573 28971 100       51696 if ( $pre_type eq 'd' ) {
5574 1934         7387 $self->do_DIGITS();
5575 1934         4437 next;
5576             }
5577              
5578             #------------------------------------------
5579             # END NODE 5: everything else (punctuation)
5580             #------------------------------------------
5581 27037         54229 my $code = $tokenization_code->{$tok};
5582 27037 100       51112 if ($code) {
5583 25304         76515 $code->($self);
5584 25304 100       65819 redo if $in_quote;
5585             }
5586             } ## End main tokenizer loop
5587              
5588             # Store the final token
5589 5902 100       13059 if ( $i_tok >= 0 ) {
5590 5764         12319 $routput_token_type->[$i_tok] = $type;
5591 5764         10529 $routput_block_type->[$i_tok] = $block_type;
5592 5764         10587 $routput_container_type->[$i_tok] = $container_type;
5593 5764         9909 $routput_type_sequence->[$i_tok] = $type_sequence;
5594 5764         10314 $routput_indent_flag->[$i_tok] = $indent_flag;
5595             }
5596              
5597             # Remember last nonblank values
5598 5902 100 100     21637 if ( $type ne 'b' && $type ne '#' ) {
5599              
5600 5429         8824 $last_last_nonblank_token = $last_nonblank_token;
5601 5429         8361 $last_last_nonblank_type = $last_nonblank_type;
5602              
5603 5429         8198 $last_nonblank_prototype = $prototype;
5604 5429         8068 $last_nonblank_block_type = $block_type;
5605 5429         7625 $last_nonblank_container_type = $container_type;
5606 5429         8285 $last_nonblank_type_sequence = $type_sequence;
5607              
5608 5429         7533 $last_nonblank_token = $tok;
5609 5429         8101 $last_nonblank_type = $type;
5610             }
5611              
5612             # reset indentation level if necessary at a sub or package
5613             # in an attempt to recover from a nesting error
5614 5902 50       12236 if ( $level_in_tokenizer < 0 ) {
5615 0 0       0 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
5616 0         0 reset_indentation_level(0);
5617 0         0 $self->brace_warning("resetting level to 0 at $1 $2\n");
5618             }
5619             }
5620              
5621 5902         9728 $self->[_in_quote_] = $in_quote;
5622 5902 100       13400 $self->[_quote_target_] =
5623             $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
5624 5902         11102 $self->[_rhere_target_list_] = $rhere_target_list;
5625              
5626 5902         9941 return;
5627             } ## end sub tokenizer_main_loop
5628              
5629             sub tokenizer_wrapup_line {
5630 5902     5902 0 11199 my ( $self, $line_of_tokens ) = @_;
5631              
5632             #---------------------------------------------------------
5633             # Package a line of tokens for shipping back to the caller
5634             #---------------------------------------------------------
5635              
5636             # Arrays to hold token values for this line:
5637 5902         10506 my ( @levels, @block_type, @type_sequence, @token_type, @tokens );
5638              
5639 5902         14536 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5640              
5641             # Remember starting nesting block string
5642 5902         9907 my $nesting_block_string_0 = $nesting_block_string;
5643              
5644             #-----------------
5645             # Loop over tokens
5646             #-----------------
5647             # $i is the index of the pretoken which starts this full token
5648 5902         8535 foreach my $i ( @{$routput_token_list} ) {
  5902         12720  
5649              
5650 50681         73141 my $type_i = $routput_token_type->[$i];
5651              
5652             #----------------------------------------
5653             # Section 1. Handle a non-sequenced token
5654             #----------------------------------------
5655 50681 100       80911 if ( !$routput_type_sequence->[$i] ) {
5656              
5657             #-------------------------------
5658             # Section 1.1. types ';' and 't'
5659             #-------------------------------
5660             # - output anonymous 'sub' as keyword (type 'k')
5661             # - output __END__, __DATA__, and format as type 'k' instead
5662             # of ';' to make html colors correct, etc.
5663 41533 100       90328 if ( $is_semicolon_or_t{$type_i} ) {
    50          
5664 2674         6026 my $tok_i = $rtokens->[$i];
5665 2674 100       7668 if ( $is_END_DATA_format_sub{$tok_i} ) {
5666 172         461 $type_i = 'k';
5667             }
5668             }
5669              
5670             #----------------------------------------------
5671             # Section 1.2. Check for an invalid token type.
5672             #----------------------------------------------
5673             # This can happen by running perltidy on non-scripts although
5674             # it could also be bug introduced by programming change. Perl
5675             # silently accepts a 032 (^Z) and takes it as the end
5676             elsif ( !$is_valid_token_type{$type_i} ) {
5677 0         0 my $val = ord($type_i);
5678 0         0 $self->warning(
5679             "unexpected character decimal $val ($type_i) in script\n"
5680             );
5681 0         0 $self->[_in_error_] = 1;
5682             }
5683             else {
5684             ## ok - valid token type other than ; and t
5685             }
5686              
5687             #----------------------------------------------------
5688             # Section 1.3. Store values for a non-sequenced token
5689             #----------------------------------------------------
5690 41533         66315 push( @levels, $level_in_tokenizer );
5691 41533         61537 push( @block_type, EMPTY_STRING );
5692 41533         60375 push( @type_sequence, EMPTY_STRING );
5693 41533         77073 push( @token_type, $type_i );
5694              
5695             }
5696              
5697             #------------------------------------
5698             # Section 2. Handle a sequenced token
5699             # One of { [ ( ? : ) ] }
5700             #------------------------------------
5701             else {
5702              
5703             # $level_i is the level we will store. Levels of braces are
5704             # set so that the leading braces have a HIGHER level than their
5705             # CONTENTS, which is convenient for indentation.
5706 9148         13733 my $level_i = $level_in_tokenizer;
5707              
5708             # $tok_i is the PRE-token. It only equals the token for symbols
5709 9148         14676 my $tok_i = $rtokens->[$i];
5710              
5711             # $routput_indent_flag->[$i] indicates that we need a change
5712             # in level at a nested ternary, as follows
5713             # 1 => at a nested ternary ?
5714             # -1 => at a nested ternary :
5715             # 0 => otherwise
5716              
5717             #--------------------------------------------
5718             # Section 2.1 Handle a level-increasing token
5719             #--------------------------------------------
5720 9148 100       24398 if ( $is_opening_or_ternary_type{$type_i} ) {
    50          
5721              
5722 4574 100       9431 if ( $type_i eq '?' ) {
5723              
5724 187 100       1010 if ( $routput_indent_flag->[$i] > 0 ) {
5725 8         16 $level_in_tokenizer++;
5726              
5727             # break BEFORE '?' in a nested ternary
5728 8         24 $level_i = $level_in_tokenizer;
5729 8         21 $nesting_block_string .= "$nesting_block_flag";
5730              
5731             }
5732             }
5733             else {
5734              
5735 4387         7363 $nesting_token_string .= $tok_i;
5736              
5737 4387 100 100     12162 if ( $type_i eq '{' || $type_i eq 'L' ) {
5738              
5739 4080         5925 $level_in_tokenizer++;
5740              
5741 4080 100       7935 if ( $routput_block_type->[$i] ) {
5742 972         1905 $nesting_block_flag = 1;
5743 972         1992 $nesting_block_string .= '1';
5744             }
5745             else {
5746 3108         4892 $nesting_block_flag = 0;
5747 3108         5303 $nesting_block_string .= '0';
5748             }
5749             }
5750             }
5751             }
5752              
5753             #---------------------------------------------
5754             # Section 2.2. Handle a level-decreasing token
5755             #---------------------------------------------
5756             elsif ( $is_closing_or_ternary_type{$type_i} ) {
5757              
5758 4574 100       10311 if ( $type_i ne ':' ) {
5759 4387         8285 my $char = chop $nesting_token_string;
5760 4387 50       11603 if ( $char ne $matching_start_token{$tok_i} ) {
5761 0         0 $nesting_token_string .= $char . $tok_i;
5762             }
5763             }
5764              
5765 4574 100 100     14693 if (
      100        
      100        
5766             $type_i eq '}'
5767             || $type_i eq 'R'
5768              
5769             # only the second and higher ? : have levels
5770             || $type_i eq ':' && $routput_indent_flag->[$i] < 0
5771             )
5772             {
5773              
5774 4088         6339 $level_i = --$level_in_tokenizer;
5775              
5776 4088 50       8200 if ( $level_in_tokenizer < 0 ) {
5777 0 0       0 if ( !$self->[_saw_negative_indentation_] ) {
5778 0         0 $self->[_saw_negative_indentation_] = 1;
5779 0         0 $self->warning(
5780             "Starting negative indentation\n");
5781             }
5782             }
5783              
5784             # restore previous level values
5785 4088 50       8381 if ( length($nesting_block_string) > 1 )
5786             { # true for valid script
5787 4088         6409 chop $nesting_block_string;
5788 4088         8150 $nesting_block_flag =
5789             substr( $nesting_block_string, -1 ) eq '1';
5790             }
5791              
5792             }
5793             }
5794              
5795             #-----------------------------------------------------
5796             # Section 2.3. Unexpected sequenced token type - error
5797             #-----------------------------------------------------
5798             else {
5799              
5800             # The tokenizer should only be assigning sequence numbers
5801             # to types { [ ( ? ) ] } :
5802 0         0 DEVEL_MODE && $self->Fault(<<EOM);
5803             unexpected sequence number on token type $type_i with pre-tok=$tok_i
5804             EOM
5805             }
5806              
5807             #------------------------------------------------
5808             # Section 2.4. Store values for a sequenced token
5809             #------------------------------------------------
5810              
5811             # The starting nesting block string, which is used in any .LOG
5812             # output, should include the first token of the line
5813 9148 100       18805 if ( !@levels ) {
5814 1574         2941 $nesting_block_string_0 = $nesting_block_string;
5815             }
5816              
5817             # Store values for a sequenced token
5818 9148         16779 push( @levels, $level_i );
5819 9148         17473 push( @block_type, $routput_block_type->[$i] );
5820 9148         15415 push( @type_sequence, $routput_type_sequence->[$i] );
5821 9148         19071 push( @token_type, $type_i );
5822              
5823             }
5824             } ## End loop to over tokens
5825              
5826             #---------------------
5827             # Post-loop operations
5828             #---------------------
5829              
5830 5902         14720 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string_0;
5831              
5832             # Form and store the tokens
5833 5902 50       13591 if (@levels) {
5834              
5835 5902         8349 my $im = shift @{$routput_token_list};
  5902         10793  
5836 5902         10121 my $offset = $rtoken_map->[$im];
5837 5902         8476 foreach my $i ( @{$routput_token_list} ) {
  5902         10506  
5838 44779         61730 my $numc = $rtoken_map->[$i] - $offset;
5839 44779         79618 push( @tokens, substr( $input_line, $offset, $numc ) );
5840 44779         56961 $offset += $numc;
5841              
5842 44779         60753 if ( DEVEL_MODE && $numc <= 0 ) {
5843              
5844             # Should not happen unless @{$rtoken_map} is corrupted
5845             $self->Fault(
5846             "number of characters is '$numc' but should be >0\n");
5847             }
5848             }
5849              
5850             # Form and store the final token of this line
5851 5902         11997 my $numc = length($input_line) - $offset;
5852 5902         12223 push( @tokens, substr( $input_line, $offset, $numc ) );
5853              
5854 5902         9022 if ( DEVEL_MODE && $numc <= 0 ) {
5855             $self->Fault(
5856             "Number of Characters is '$numc' but should be >0\n");
5857             }
5858             }
5859              
5860             # NOTE: This routine returns ci=0. Eventually '_rci_levels' can be
5861             # removed. The ci values are computed later by sub Formatter::set_ci.
5862 5902         19678 my @ci_levels = (0) x scalar(@levels);
5863              
5864             # Wrap up this line of tokens for shipping to the Formatter
5865 5902         13822 $line_of_tokens->{_rtoken_type} = \@token_type;
5866 5902         11320 $line_of_tokens->{_rtokens} = \@tokens;
5867 5902         11345 $line_of_tokens->{_rblock_type} = \@block_type;
5868 5902         11386 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5869 5902         18923 $line_of_tokens->{_rlevels} = \@levels;
5870 5902         11777 $line_of_tokens->{_rci_levels} = \@ci_levels;
5871              
5872 5902         15726 return;
5873             } ## end sub tokenizer_wrapup_line
5874              
5875             } ## end tokenize_this_line
5876              
5877             #######################################################################
5878             # Tokenizer routines which assist in identifying token types
5879             #######################################################################
5880              
5881             # Define Global '%op_expected_table'
5882             # = hash table of operator expected values based on last nonblank token
5883              
5884             # exceptions to perl's weird parsing rules after type 'Z'
5885             my %is_weird_parsing_rule_exception;
5886              
5887             my %is_paren_dollar;
5888              
5889             my %is_n_v;
5890              
5891             BEGIN {
5892              
5893             # Always expecting TERM following these types:
5894             # note: this is identical to '@value_requestor_type' defined later.
5895             # Fix for c250: add new type 'P' for package (expecting VERSION or {}
5896             # after package NAMESPACE, so expecting TERM)
5897             # Fix for c250: add new type 'S' for sub (not expecting operator)
5898 39     39   917 my @q = qw(
5899             ; ! + x & ? F J - p / Y : % f U ~ A G j L P S * . | ^ < = [ m { \ > t
5900             || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5901             &= // >> ~. &. |. ^.
5902             ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5903             );
5904 39         4593 push @q, ',';
5905 39         133 push @q, '('; # for completeness, not currently a token type
5906 39         75 push @q, '->'; # was previously in UNKNOWN
5907 39         1325 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5908              
5909             # Always UNKNOWN following these types;
5910             # previously had '->' in this list for c030
5911 39         209 @q = qw( w );
5912 39         120 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5913              
5914             # Always expecting OPERATOR ...
5915             # 'n' and 'v' are currently excluded because they might be VERSION numbers
5916             # 'i' is currently excluded because it might be a package
5917             # 'q' is currently excluded because it might be a prototype
5918             # Fix for c030: removed '->' from this list:
5919             # Fix for c250: added 'i' because new type 'P' was added
5920 39         115 @q = qw( -- C h R ++ ] Q <> i ); ## n v q );
5921 39         75 push @q, ')';
5922 39         255 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5923              
5924             # Fix for git #62: added '*' and '%'
5925 39         98 @q = qw( < ? * % );
5926 39         139 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5927              
5928 39         87 @q = qw<) $>;
5929 39         101 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5930              
5931 39         95 @q = qw( n v );
5932 39         1347 @{is_n_v}{@q} = (1) x scalar(@q);
5933              
5934             } ## end BEGIN
5935              
5936 39     39   334 use constant DEBUG_OPERATOR_EXPECTED => 0;
  39         125  
  39         86972  
5937              
5938             sub operator_expected {
5939              
5940             # Returns a parameter indicating what types of tokens can occur next
5941              
5942             # Call format:
5943             # $op_expected =
5944             # $self->operator_expected( $tok, $next_type, $blank_after_Z );
5945             # where
5946             # $tok is the current token
5947             # $next_type is the type of the next token (blank or not)
5948             # $blank_after_Z = flag for guessing after a type 'Z':
5949             # true if $tok follows type 'Z' with intermediate blank
5950             # false if $tok follows type 'Z' with no intermediate blank
5951             # ignored if $tok does not follow type 'Z'
5952              
5953             # Many perl symbols have two or more meanings. For example, '<<'
5954             # can be a shift operator or a here-doc operator. The
5955             # interpretation of these symbols depends on the current state of
5956             # the tokenizer, which may either be expecting a term or an
5957             # operator. For this example, a << would be a shift if an OPERATOR
5958             # is expected, and a here-doc if a TERM is expected. This routine
5959             # is called to make this decision for any current token. It returns
5960             # one of three possible values:
5961             #
5962             # OPERATOR - operator expected (or at least, not a term)
5963             # UNKNOWN - can't tell
5964             # TERM - a term is expected (or at least, not an operator)
5965             #
5966             # The decision is based on what has been seen so far. This
5967             # information is stored in the "$last_nonblank_type" and
5968             # "$last_nonblank_token" variables. For example, if the
5969             # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5970             # if $last_nonblank_type is 'n' (numeric), we are expecting an
5971             # OPERATOR.
5972             #
5973             # If a UNKNOWN is returned, the calling routine must guess. A major
5974             # goal of this tokenizer is to minimize the possibility of returning
5975             # UNKNOWN, because a wrong guess can spoil the formatting of a
5976             # script.
5977             #
5978             # Adding NEW_TOKENS: it is critically important that this routine be
5979             # updated to allow it to determine if an operator or term is to be
5980             # expected after the new token. Doing this simply involves adding
5981             # the new token character to one of the regexes in this routine or
5982             # to one of the hash lists
5983             # that it uses, which are initialized in the BEGIN section.
5984             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5985             # $statement_type
5986              
5987             # When possible, token types should be selected such that we can determine
5988             # the 'operator_expected' value by a simple hash lookup. If there are
5989             # exceptions, that is an indication that a new type is needed.
5990              
5991 8421     8421 0 17898 my ( $self, $tok, $next_type, $blank_after_Z ) = @_;
5992              
5993             #--------------------------------------------
5994             # Section 1: Table lookup will get most cases
5995             #--------------------------------------------
5996              
5997             # Many types are can be obtained by a table lookup. This typically handles
5998             # more than half of the calls. For speed, the caller may try table lookup
5999             # first before calling this sub.
6000 8421         13581 my $op_expected = $op_expected_table{$last_nonblank_type};
6001 8421 100       17183 if ( defined($op_expected) ) {
6002             DEBUG_OPERATOR_EXPECTED
6003 24         47 && print {*STDOUT}
6004             "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6005 24         89 return $op_expected;
6006             }
6007              
6008             #---------------------------------------------
6009             # Section 2: Handle special cases if necessary
6010             #---------------------------------------------
6011              
6012             # Types 'k', '}' and 'Z' depend on context
6013             # Types 'n', 'v', 'q' also depend on context.
6014              
6015             # identifier...
6016             # Fix for c250: removed coding for type 'i' because 'i' and new type 'P'
6017             # are now done by hash table lookup
6018              
6019             # keyword...
6020 8397 100       25394 if ( $last_nonblank_type eq 'k' ) {
    100          
    100          
    100          
    100          
6021              
6022             # keywords expecting OPERATOR:
6023 2644 100       12562 if ( $expecting_operator_token{$last_nonblank_token} ) {
    100          
6024 7         23 $op_expected = OPERATOR;
6025             }
6026              
6027             # keywords expecting TERM:
6028             elsif ( $expecting_term_token{$last_nonblank_token} ) {
6029              
6030             # Exceptions from TERM:
6031              
6032             # // may follow perl functions which may be unary operators
6033             # see test file dor.t (defined or);
6034 2538 100 100     10982 if (
    50 100        
      66        
6035             $tok eq '/'
6036             && $next_type eq '/'
6037             && $is_keyword_rejecting_slash_as_pattern_delimiter{
6038             $last_nonblank_token}
6039             )
6040             {
6041 1         4 $op_expected = OPERATOR;
6042             }
6043              
6044             # Patch to allow a ? following 'split' to be a deprecated pattern
6045             # delimiter. This patch is coordinated with the omission of split
6046             # from the list
6047             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
6048             # will force perltidy to guess.
6049             elsif ($tok eq '?'
6050             && $last_nonblank_token eq 'split' )
6051             {
6052 0         0 $op_expected = UNKNOWN;
6053             }
6054             else {
6055 2537         4479 $op_expected = TERM;
6056             }
6057             }
6058             else {
6059 99         244 $op_expected = TERM;
6060             }
6061             } ## end type 'k'
6062              
6063             # closing container token...
6064              
6065             # Note that the actual token for type '}' may also be a ')'.
6066              
6067             # Also note that $last_nonblank_token is not the token corresponding to
6068             # $last_nonblank_type when the type is a closing container. In that
6069             # case it is the token before the corresponding opening container token.
6070             # So for example, for this snippet
6071             # $a = do { BLOCK } / 2;
6072             # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
6073              
6074             elsif ( $last_nonblank_type eq '}' ) {
6075 3590         6531 $op_expected = UNKNOWN;
6076              
6077             # handle something after 'do' and 'eval'
6078 3590 100 66     19311 if ( $is_block_operator{$last_nonblank_token} ) {
    100          
    100          
6079              
6080             # something like $a = do { BLOCK } / 2;
6081 82         246 $op_expected = OPERATOR; # block mode following }
6082             }
6083              
6084             # $last_nonblank_token =~ /^(\)|\$|\-\>)/
6085             elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
6086             || substr( $last_nonblank_token, 0, 2 ) eq '->' )
6087             {
6088 2122         3753 $op_expected = OPERATOR;
6089 2122 50       5117 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
  0         0  
6090             }
6091              
6092             # Check for smartmatch operator before preceding brace or square
6093             # bracket. For example, at the ? after the ] in the following
6094             # expressions we are expecting an operator:
6095             #
6096             # qr/3/ ~~ ['1234'] ? 1 : 0;
6097             # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
6098             elsif ( $last_nonblank_token eq '~~' ) {
6099 20         47 $op_expected = OPERATOR;
6100             }
6101              
6102             # A right brace here indicates the end of a simple block. All
6103             # non-structural right braces have type 'R' all braces associated with
6104             # block operator keywords have been given those keywords as
6105             # "last_nonblank_token" and caught above. (This statement is order
6106             # dependent, and must come after checking $last_nonblank_token).
6107             else {
6108              
6109             # patch for dor.t (defined or).
6110 1366 50 33     6277 if ( $tok eq '/'
    100 33        
6111             && $next_type eq '/'
6112             && $last_nonblank_token eq ']' )
6113             {
6114 0         0 $op_expected = OPERATOR;
6115             }
6116              
6117             # Patch for RT #116344: misparse a ternary operator after an
6118             # anonymous hash, like this:
6119             # return ref {} ? 1 : 0;
6120             # The right brace should really be marked type 'R' in this case,
6121             # and it is safest to return an UNKNOWN here. Expecting a TERM will
6122             # cause the '?' to always be interpreted as a pattern delimiter
6123             # rather than introducing a ternary operator.
6124             elsif ( $tok eq '?' ) {
6125 1         3 $op_expected = UNKNOWN;
6126             }
6127             else {
6128 1365         2320 $op_expected = TERM;
6129             }
6130             }
6131             } ## end type '}'
6132              
6133             # number or v-string...
6134             # An exception is for VERSION numbers a 'use' statement. It has the format
6135             # use Module VERSION LIST
6136             # We could avoid this exception by writing a special sub to parse 'use'
6137             # statements and perhaps mark these numbers with a new type V (for VERSION)
6138             ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
6139             elsif ( $is_n_v{$last_nonblank_type} ) {
6140 1985         3257 $op_expected = OPERATOR;
6141 1985 100       4413 if ( $statement_type eq 'use' ) {
6142 11         43 $op_expected = UNKNOWN;
6143             }
6144             }
6145              
6146             # quote...
6147             # TODO: labeled prototype words would better be given type 'A' or maybe
6148             # 'J'; not 'q'; or maybe mark as type 'Y'?
6149             elsif ( $last_nonblank_type eq 'q' ) {
6150 137 50       621 if ( $last_nonblank_token eq 'prototype' ) {
    100          
6151 0         0 $op_expected = TERM;
6152             }
6153              
6154             # update for --use-feature=class (rt145706):
6155             # Look for class VERSION after possible attribute, as in
6156             # class Example::Subclass : isa(Example::Base) 1.345 { ... }
6157             elsif ( $statement_type =~ /^package\b/ ) {
6158 3         6 $op_expected = TERM;
6159             }
6160              
6161             # everything else
6162             else {
6163 134         343 $op_expected = OPERATOR;
6164             }
6165             }
6166              
6167             # file handle or similar
6168             elsif ( $last_nonblank_type eq 'Z' ) {
6169              
6170             # angle.t
6171 40 100 33     603 if ( $last_nonblank_token =~ /^\w/ ) {
    50 66        
    100          
    50          
    100          
6172 2         5 $op_expected = UNKNOWN;
6173             }
6174              
6175             # Exception to weird parsing rules for 'x(' ... see case b1205:
6176             # In something like 'print $vv x(...' the x is an operator;
6177             # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
6178             # otherwise x follows the weird parsing rules.
6179             elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
6180 0         0 $op_expected = OPERATOR;
6181             }
6182              
6183             # The 'weird parsing rules' of next section do not work for '<' and '?'
6184             # It is best to mark them as unknown. Test case:
6185             # print $fh <DATA>;
6186             elsif ( $is_weird_parsing_rule_exception{$tok} ) {
6187 4         13 $op_expected = UNKNOWN;
6188             }
6189              
6190             # For possible file handle like "$a", Perl uses weird parsing rules.
6191             # For example:
6192             # print $a/2,"/hi"; - division
6193             # print $a / 2,"/hi"; - division
6194             # print $a/ 2,"/hi"; - division
6195             # print $a /2,"/hi"; - pattern (and error)!
6196             # Some examples where this logic works okay, for '&','*','+':
6197             # print $fh &xsi_protos(@mods);
6198             # my $x = new $CompressClass *FH;
6199             # print $OUT +( $count % 15 ? ", " : "\n\t" );
6200             elsif ($blank_after_Z
6201             && $next_type ne 'b' )
6202             {
6203 0         0 $op_expected = TERM;
6204             }
6205              
6206             # Note that '?' and '<' have been moved above
6207             # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
6208             elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
6209              
6210             # Do not complain in 'use' statements, which have special syntax.
6211             # For example, from RT#130344:
6212             # use lib $FindBin::Bin . '/lib';
6213 9 50       29 if ( $statement_type ne 'use' ) {
6214 9         37 $self->complain(
6215             "operator in possible indirect object location not recommended\n"
6216             );
6217             }
6218 9         19 $op_expected = OPERATOR;
6219             }
6220              
6221             # all other cases
6222             else {
6223 25         77 $op_expected = UNKNOWN;
6224             }
6225             }
6226              
6227             # anything else...
6228             else {
6229 1         2 $op_expected = UNKNOWN;
6230             }
6231              
6232             DEBUG_OPERATOR_EXPECTED
6233 8397         11127 && print {*STDOUT}
6234             "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6235              
6236 8397         15924 return $op_expected;
6237              
6238             } ## end sub operator_expected
6239              
6240             sub new_statement_ok {
6241              
6242             # Returns:
6243             # true if a new statement can begin here
6244             # false otherwise
6245              
6246             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6247             # $brace_depth, $rbrace_type
6248              
6249             # Uses:
6250             # - See if a 'class' statement can occur here
6251             # - See if a keyword begins at a new statement; i.e. is an 'if' a
6252             # block if or a trailing if? Also see if 'format' starts a statement.
6253             # - Decide if a ':' is part of a statement label (not a ternary)
6254              
6255             # Curly braces are tricky because some small blocks do not get marked as
6256             # blocks..
6257              
6258             # if it follows an opening curly brace..
6259 435 100 66 435 0 2321 if ( $last_nonblank_token eq '{' ) {
    100          
6260              
6261             # The safe thing is to return true in all cases because:
6262             # - a ternary ':' cannot occur here
6263             # - an 'if' here, for example, cannot be a trailing if
6264             # See test case c231 for an example.
6265             # This works but could be improved, if necessary, by returning
6266             # 'false' at obvious non-blocks.
6267 59         220 return 1;
6268             }
6269              
6270             # if it follows a closing code block curly brace..
6271             elsif ($last_nonblank_token eq '}'
6272             && $last_nonblank_type eq $last_nonblank_token )
6273             {
6274              
6275             # a new statement can follow certain closing block braces ...
6276             # FIXME: The following has worked well but returns true in some cases
6277             # where it really should not. We could fix this by either excluding
6278             # certain blocks, like sort/map/grep/eval/asub or by just including
6279             # certain blocks.
6280 99         377 return $rbrace_type->[$brace_depth];
6281             }
6282              
6283             # otherwise, it is a label if and only if it follows a ';' (real or fake)
6284             # or another label
6285             else {
6286 277   100     1718 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
6287             }
6288             } ## end sub new_statement_ok
6289              
6290             sub code_block_type {
6291              
6292             # Decide if this is a block of code, and its type.
6293             # Must be called only when $type = $token = '{'
6294             # The problem is to distinguish between the start of a block of code
6295             # and the start of an anonymous hash reference
6296             # Returns "" if not code block, otherwise returns 'last_nonblank_token'
6297             # to indicate the type of code block. (For example, 'last_nonblank_token'
6298             # might be 'if' for an if block, 'else' for an else block, etc).
6299             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6300             # $last_nonblank_block_type, $brace_depth, $rbrace_type
6301              
6302             # handle case of multiple '{'s
6303              
6304             # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
6305              
6306 1301     1301 0 3333 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6307 1301 100 66     16557 if ( $last_nonblank_token eq '{'
    100 66        
    100 66        
    100 100        
    100 66        
    100 66        
    50          
    50          
    100          
    100          
    100          
6308             && $last_nonblank_type eq $last_nonblank_token )
6309             {
6310              
6311             # opening brace where a statement may appear is probably
6312             # a code block but might be and anonymous hash reference
6313 90 50       342 if ( $rbrace_type->[$brace_depth] ) {
6314 90         314 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6315             $max_token_index );
6316             }
6317              
6318             # cannot start a code block within an anonymous hash
6319             else {
6320 0         0 return EMPTY_STRING;
6321             }
6322             }
6323              
6324             elsif ( $last_nonblank_token eq ';' ) {
6325              
6326             # an opening brace where a statement may appear is probably
6327             # a code block but might be and anonymous hash reference
6328 48         276 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6329             $max_token_index );
6330             }
6331              
6332             # handle case of '}{'
6333             elsif ($last_nonblank_token eq '}'
6334             && $last_nonblank_type eq $last_nonblank_token )
6335             {
6336              
6337             # a } { situation ...
6338             # could be hash reference after code block..(blktype1.t)
6339 9 50       28 if ($last_nonblank_block_type) {
6340 9         44 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6341             $max_token_index );
6342             }
6343              
6344             # must be a block if it follows a closing hash reference
6345             else {
6346 0         0 return $last_nonblank_token;
6347             }
6348             }
6349              
6350             #--------------------------------------------------------------
6351             # NOTE: braces after type characters start code blocks, but for
6352             # simplicity these are not identified as such. See also
6353             # sub is_non_structural_brace.
6354             #--------------------------------------------------------------
6355              
6356             ## elsif ( $last_nonblank_type eq 't' ) {
6357             ## return $last_nonblank_token;
6358             ## }
6359              
6360             # brace after label:
6361             elsif ( $last_nonblank_type eq 'J' ) {
6362 34         113 return $last_nonblank_token;
6363             }
6364              
6365             # otherwise, look at previous token. This must be a code block if
6366             # it follows any of these:
6367             # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6368             elsif ($is_code_block_token{$last_nonblank_token}
6369             || $is_grep_alias{$last_nonblank_token} )
6370             {
6371              
6372             # Bug Patch: Note that the opening brace after the 'if' in the following
6373             # snippet is an anonymous hash ref and not a code block!
6374             # print 'hi' if { x => 1, }->{x};
6375             # We can identify this situation because the last nonblank type
6376             # will be a keyword (instead of a closing paren)
6377 480 50 33     2515 if (
      66        
6378             $last_nonblank_type eq 'k'
6379             && ( $last_nonblank_token eq 'if'
6380             || $last_nonblank_token eq 'unless' )
6381             )
6382             {
6383 0         0 return EMPTY_STRING;
6384             }
6385             else {
6386 480         1811 return $last_nonblank_token;
6387             }
6388             }
6389              
6390             # or a sub or package BLOCK
6391             # Fixed for c250 to include new package type 'P', and change 'i' to 'S'
6392             elsif (
6393             $last_nonblank_type eq 'P'
6394             || $last_nonblank_type eq 'S'
6395             || ( $last_nonblank_type eq 't'
6396             && substr( $last_nonblank_token, 0, 3 ) eq 'sub' )
6397             )
6398             {
6399 294         929 return $last_nonblank_token;
6400             }
6401              
6402             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6403 0         0 return $statement_type;
6404             }
6405              
6406             # user-defined subs with block parameters (like grep/map/eval)
6407             elsif ( $last_nonblank_type eq 'G' ) {
6408 0         0 return $last_nonblank_token;
6409             }
6410              
6411             # check bareword
6412             elsif ( $last_nonblank_type eq 'w' ) {
6413              
6414             # check for syntax 'use MODULE LIST'
6415             # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6416 22 100       89 return EMPTY_STRING if ( $statement_type eq 'use' );
6417              
6418 21         122 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6419             $max_token_index );
6420             }
6421              
6422             # Patch for bug # RT #94338 reported by Daniel Trizen
6423             # for-loop in a parenthesized block-map triggering an error message:
6424             # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6425             # Check for a code block within a parenthesized function call
6426             elsif ( $last_nonblank_token eq '(' ) {
6427 81         215 my $paren_type = $rparen_type->[$paren_depth];
6428              
6429             # /^(map|grep|sort)$/
6430 81 100 66     439 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6431              
6432             # We will mark this as a code block but use type 't' instead
6433             # of the name of the containing function. This will allow for
6434             # correct parsing but will usually produce better formatting.
6435             # Braces with block type 't' are not broken open automatically
6436             # in the formatter as are other code block types, and this usually
6437             # works best.
6438 1         5 return 't'; # (Not $paren_type)
6439             }
6440             else {
6441 80         264 return EMPTY_STRING;
6442             }
6443             }
6444              
6445             # handle unknown syntax ') {'
6446             # we previously appended a '()' to mark this case
6447             elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6448 14         117 return $last_nonblank_token;
6449             }
6450              
6451             # anything else must be anonymous hash reference
6452             else {
6453 229         724 return EMPTY_STRING;
6454             }
6455             } ## end sub code_block_type
6456              
6457             sub decide_if_code_block {
6458              
6459             # USES GLOBAL VARIABLES: $last_nonblank_token
6460 168     168 0 481 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6461              
6462 168         580 my ( $next_nonblank_token, $i_next ) =
6463             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
6464              
6465             # we are at a '{' where a statement may appear.
6466             # We must decide if this brace starts an anonymous hash or a code
6467             # block.
6468             # return "" if anonymous hash, and $last_nonblank_token otherwise
6469              
6470             # initialize to be code BLOCK
6471 168         498 my $code_block_type = $last_nonblank_token;
6472              
6473             # Check for the common case of an empty anonymous hash reference:
6474             # Maybe something like sub { { } }
6475 168 100       607 if ( $next_nonblank_token eq '}' ) {
6476 5         11 $code_block_type = EMPTY_STRING;
6477             }
6478              
6479             else {
6480              
6481             # To guess if this '{' is an anonymous hash reference, look ahead
6482             # and test as follows:
6483             #
6484             # it is a hash reference if next come:
6485             # - a string or digit followed by a comma or =>
6486             # - bareword followed by =>
6487             # otherwise it is a code block
6488             #
6489             # Examples of anonymous hash ref:
6490             # {'aa',};
6491             # {1,2}
6492             #
6493             # Examples of code blocks:
6494             # {1; print "hello\n", 1;}
6495             # {$a,1};
6496              
6497             # We are only going to look ahead one more (nonblank/comment) line.
6498             # Strange formatting could cause a bad guess, but that's unlikely.
6499 163         378 my @pre_types;
6500             my @pre_tokens;
6501              
6502             # Ignore the rest of this line if it is a side comment
6503 163 100       493 if ( $next_nonblank_token ne '#' ) {
6504 139         585 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
  139         806  
6505 139         418 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
  139         757  
6506             }
6507              
6508             # Here 20 is arbitrary but generous, and prevents wasting lots of time
6509             # in mangled files
6510 163         708 my ( $rpre_tokens, $rpre_types ) =
6511             $self->peek_ahead_for_n_nonblank_pre_tokens(20);
6512 163 100 66     619 if ( defined($rpre_types) && @{$rpre_types} ) {
  155         1554  
6513 155         294 push @pre_types, @{$rpre_types};
  155         620  
6514 155         333 push @pre_tokens, @{$rpre_tokens};
  155         731  
6515             }
6516              
6517             # put a sentinel token to simplify stopping the search
6518 163         439 push @pre_types, '}';
6519 163         368 push @pre_types, '}';
6520              
6521 163         320 my $jbeg = 0;
6522 163 100       550 $jbeg = 1 if $pre_types[0] eq 'b';
6523              
6524             # first look for one of these
6525             # - bareword
6526             # - bareword with leading -
6527             # - digit
6528             # - quoted string
6529 163         303 my $j = $jbeg;
6530 163 100 33     1165 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
    100          
    100          
    50          
6531              
6532             # find the closing quote; don't worry about escapes
6533 1         3 my $quote_mark = $pre_types[$j];
6534 1         4 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6535 1 50       6 if ( $pre_types[$k] eq $quote_mark ) {
6536 1         3 $j = $k + 1;
6537             ##my $next = $pre_types[$j];
6538 1         3 last;
6539             }
6540             }
6541             }
6542             elsif ( $pre_types[$j] eq 'd' ) {
6543 8         15 $j++;
6544             }
6545             elsif ( $pre_types[$j] eq 'w' ) {
6546 71         180 $j++;
6547             }
6548             elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6549 0         0 $j++;
6550             }
6551             else {
6552             # none of the above
6553             }
6554 163 100       509 if ( $j > $jbeg ) {
6555              
6556 80 100       329 $j++ if $pre_types[$j] eq 'b';
6557              
6558             # Patched for RT #95708
6559 80 100 33     716 if (
      66        
      66        
6560              
6561             # it is a comma which is not a pattern delimiter except for qw
6562             (
6563             $pre_types[$j] eq ','
6564             ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6565             && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6566             )
6567              
6568             # or a =>
6569             || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6570             )
6571             {
6572 18         45 $code_block_type = EMPTY_STRING;
6573             }
6574             }
6575              
6576 163 100       541 if ($code_block_type) {
6577              
6578             # Patch for cases b1085 b1128: It is uncertain if this is a block.
6579             # If this brace follows a bareword, then append a space as a signal
6580             # to the formatter that this may not be a block brace. To find the
6581             # corresponding code in Formatter.pm search for 'b1085'.
6582 145 100       1258 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6583             }
6584             }
6585              
6586 168         608 return $code_block_type;
6587             } ## end sub decide_if_code_block
6588              
6589             sub report_unexpected {
6590              
6591             # report unexpected token type and show where it is
6592             # USES GLOBAL VARIABLES: (none)
6593 0     0 0 0 my ( $self, $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6594             $rpretoken_type, $input_line )
6595             = @_;
6596              
6597 0 0       0 if ( ++$self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6598 0         0 my $msg = "found $found where $expecting expected";
6599 0         0 my $pos = $rpretoken_map->[$i_tok];
6600 0         0 $self->interrupt_logfile();
6601 0         0 my $input_line_number = $self->[_last_line_number_];
6602 0         0 my ( $offset, $numbered_line, $underline ) =
6603             make_numbered_line( $input_line_number, $input_line, $pos );
6604 0         0 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6605              
6606 0         0 my $trailer = EMPTY_STRING;
6607 0 0 0     0 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6608 0         0 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6609 0         0 my $num;
6610 0 0       0 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6611 0         0 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6612             }
6613             else {
6614 0         0 $num = $pos - $pos_prev;
6615             }
6616 0 0       0 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
  0         0  
  0         0  
6617              
6618             $underline =
6619 0         0 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6620 0         0 $trailer = " (previous token underlined)";
6621             }
6622 0         0 $underline =~ s/\s+$//;
6623 0         0 $self->warning( $numbered_line . "\n" );
6624 0         0 $self->warning( $underline . "\n" );
6625 0         0 $self->warning( $msg . $trailer . "\n" );
6626 0         0 $self->resume_logfile();
6627             }
6628 0         0 return;
6629             } ## end sub report_unexpected
6630              
6631             my %is_sigil_or_paren;
6632             my %is_R_closing_sb;
6633              
6634             BEGIN {
6635              
6636 39     39   308 my @q = qw< $ & % * @ ) >;
6637 39         317 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6638              
6639 39         153 @q = qw(R ]);
6640 39         87832 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6641             } ## end BEGIN
6642              
6643             sub is_non_structural_brace {
6644              
6645             # Decide if a brace or bracket is structural or non-structural
6646             # by looking at the previous token and type
6647             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6648              
6649             # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6650             # Tentatively deactivated because it caused the wrong operator expectation
6651             # for this code:
6652             # $user = @vars[1] / 100;
6653             # Must update sub operator_expected before re-implementing.
6654             # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6655             # return 0;
6656             # }
6657              
6658             #--------------------------------------------------------------
6659             # NOTE: braces after type characters start code blocks, but for
6660             # simplicity these are not identified as such. See also
6661             # sub code_block_type
6662             #--------------------------------------------------------------
6663              
6664             ##if ($last_nonblank_type eq 't') {return 0}
6665              
6666             # otherwise, it is non-structural if it is decorated
6667             # by type information.
6668             # For example, the '{' here is non-structural: ${xxx}
6669             # Removed '::' to fix c074
6670             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6671             return (
6672             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6673             $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6674             || substr( $last_nonblank_token, 0, 2 ) eq '->'
6675              
6676             # or if we follow a hash or array closing curly brace or bracket
6677             # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6678             # because the first '}' would have been given type 'R'
6679             ##|| $last_nonblank_type =~ /^([R\]])$/
6680 2262   66 2262 0 14710 || $is_R_closing_sb{$last_nonblank_type}
6681             );
6682             } ## end sub is_non_structural_brace
6683              
6684             #######################################################################
6685             # Tokenizer routines for tracking container nesting depths
6686             #######################################################################
6687              
6688             # The following routines keep track of nesting depths of the nesting
6689             # types, ( [ { and ?. This is necessary for determining the indentation
6690             # level, and also for debugging programs. Not only do they keep track of
6691             # nesting depths of the individual brace types, but they check that each
6692             # of the other brace types is balanced within matching pairs. For
6693             # example, if the program sees this sequence:
6694             #
6695             # { ( ( ) }
6696             #
6697             # then it can determine that there is an extra left paren somewhere
6698             # between the { and the }. And so on with every other possible
6699             # combination of outer and inner brace types. For another
6700             # example:
6701             #
6702             # ( [ ..... ] ] )
6703             #
6704             # which has an extra ] within the parens.
6705             #
6706             # The brace types have indexes 0 .. 3 which are indexes into
6707             # the matrices.
6708             #
6709             # The pair ? : are treated as just another nesting type, with ? acting
6710             # as the opening brace and : acting as the closing brace.
6711             #
6712             # The matrix
6713             #
6714             # $rdepth_array->[$a][$b][ $rcurrent_depth->[$a] ] = $rcurrent_depth->[$b];
6715             #
6716             # saves the nesting depth of brace type $b (where $b is either of the other
6717             # nesting types) when brace type $a enters a new depth. When this depth
6718             # decreases, a check is made that the current depth of brace types $b is
6719             # unchanged, or otherwise there must have been an error. This can
6720             # be very useful for localizing errors, particularly when perl runs to
6721             # the end of a large file (such as this one) and announces that there
6722             # is a problem somewhere.
6723             #
6724             # A numerical sequence number is maintained for every nesting type,
6725             # so that each matching pair can be uniquely identified in a simple
6726             # way.
6727              
6728             sub increase_nesting_depth {
6729 4574     4574 0 9142 my ( $self, $aa, $pos ) = @_;
6730              
6731             # USES GLOBAL VARIABLES: $rcurrent_depth,
6732             # $rcurrent_sequence_number, $rdepth_array,
6733             # $rstarting_line_of_current_depth, $statement_type
6734 4574         8234 my $cd_aa = ++$rcurrent_depth->[$aa];
6735 4574         6591 $total_depth++;
6736 4574         9142 $rtotal_depth->[$aa][$cd_aa] = $total_depth;
6737 4574         7334 my $input_line_number = $self->[_last_line_number_];
6738 4574         7766 my $input_line = $self->[_line_of_text_];
6739              
6740             # Sequence numbers increment by number of items. This keeps
6741             # a unique set of numbers but still allows the relative location
6742             # of any type to be determined.
6743              
6744             # make a new unique sequence number
6745 4574         7684 my $seqno = $next_sequence_number++;
6746              
6747 4574         8626 $rcurrent_sequence_number->[$aa][$cd_aa] = $seqno;
6748              
6749 4574         14863 $rstarting_line_of_current_depth->[$aa][$cd_aa] =
6750             [ $input_line_number, $input_line, $pos ];
6751              
6752 4574         14300 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6753 18296 100       33421 next if ( $bb == $aa );
6754 13722         26829 $rdepth_array->[$aa][$bb][$cd_aa] = $rcurrent_depth->[$bb];
6755             }
6756              
6757             # set a flag for indenting a nested ternary statement
6758 4574         8023 my $indent = 0;
6759 4574 100       10649 if ( $aa == QUESTION_COLON ) {
6760 187         718 $rnested_ternary_flag->[$cd_aa] = 0;
6761 187 100       723 if ( $cd_aa > 1 ) {
6762 17 100       151 if ( $rnested_ternary_flag->[ $cd_aa - 1 ] == 0 ) {
6763 16         66 my $pdepth = $rtotal_depth->[$aa][ $cd_aa - 1 ];
6764 16 100       68 if ( $pdepth == $total_depth - 1 ) {
6765 8         18 $indent = 1;
6766 8         22 $rnested_ternary_flag->[ $cd_aa - 1 ] = -1;
6767             }
6768             }
6769             }
6770             }
6771              
6772             # Fix part #1 for git82: save last token type for propagation of type 'Z'
6773 4574         15681 $rnested_statement_type->[$aa][$cd_aa] =
6774             [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6775 4574         7742 $statement_type = EMPTY_STRING;
6776 4574         12467 return ( $seqno, $indent );
6777             } ## end sub increase_nesting_depth
6778              
6779             sub is_balanced_closing_container {
6780              
6781             # Return true if a closing container can go here without error
6782             # Return false if not
6783 47     47 0 150 my ($aa) = @_;
6784              
6785             # cannot close if there was no opening
6786 47         90 my $cd_aa = $rcurrent_depth->[$aa];
6787 47 100       209 return if ( $cd_aa <= 0 );
6788              
6789             # check that any other brace types $bb contained within would be balanced
6790 8         26 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6791 8 50       25 next if ( $bb == $aa );
6792             return
6793 8 50       60 if ( $rdepth_array->[$aa][$bb][$cd_aa] != $rcurrent_depth->[$bb] );
6794             }
6795              
6796             # OK, everything will be balanced
6797 0         0 return 1;
6798             } ## end sub is_balanced_closing_container
6799              
6800             sub decrease_nesting_depth {
6801              
6802 4574     4574 0 9174 my ( $self, $aa, $pos ) = @_;
6803              
6804             # USES GLOBAL VARIABLES: $rcurrent_depth,
6805             # $rcurrent_sequence_number, $rdepth_array, $rstarting_line_of_current_depth
6806             # $statement_type
6807 4574         7043 my $seqno = 0;
6808 4574         7210 my $input_line_number = $self->[_last_line_number_];
6809 4574         7971 my $input_line = $self->[_line_of_text_];
6810              
6811 4574         6950 my $outdent = 0;
6812 4574         6565 $total_depth--;
6813 4574         7947 my $cd_aa = $rcurrent_depth->[$aa];
6814 4574 50       9273 if ( $cd_aa > 0 ) {
6815              
6816             # set a flag for un-indenting after seeing a nested ternary statement
6817 4574         8396 $seqno = $rcurrent_sequence_number->[$aa][$cd_aa];
6818 4574 100       9598 if ( $aa == QUESTION_COLON ) {
6819 187         608 $outdent = $rnested_ternary_flag->[$cd_aa];
6820             }
6821              
6822             # Fix part #2 for git82: use saved type for propagation of type 'Z'
6823             # through type L-R braces. Perl seems to allow ${bareword}
6824             # as an indirect object, but nothing much more complex than that.
6825             ( $statement_type, my $saved_type, my $saved_token ) =
6826 4574         6905 @{ $rnested_statement_type->[$aa][ $rcurrent_depth->[$aa] ] };
  4574         12872  
6827 4574 50 100     16045 if ( $aa == BRACE
      66        
      66        
6828             && $saved_type eq 'Z'
6829             && $last_nonblank_type eq 'w'
6830             && $rbrace_structural_type->[$brace_depth] eq 'L' )
6831             {
6832 1         3 $last_nonblank_type = $saved_type;
6833             }
6834              
6835             # check that any brace types $bb contained within are balanced
6836 4574         12633 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6837 18296 100       32271 next if ( $bb == $aa );
6838              
6839 13722 50       31984 if ( $rdepth_array->[$aa][$bb][$cd_aa] != $rcurrent_depth->[$bb] ) {
6840 0         0 my $diff =
6841             $rcurrent_depth->[$bb] - $rdepth_array->[$aa][$bb][$cd_aa];
6842              
6843             # don't whine too many times
6844 0         0 my $saw_brace_error = $self->get_saw_brace_error();
6845 0 0 0     0 if (
      0        
6846             $saw_brace_error <= MAX_NAG_MESSAGES
6847              
6848             # if too many closing types have occurred, we probably
6849             # already caught this error
6850             && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6851             )
6852             {
6853 0         0 $self->interrupt_logfile();
6854 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
6855 0         0 my $sl = $rsl->[0];
6856 0         0 my $rel = [ $input_line_number, $input_line, $pos ];
6857 0         0 my $el = $rel->[0];
6858 0         0 my ($ess);
6859              
6860 0 0 0     0 if ( $diff == 1 || $diff == -1 ) {
6861 0         0 $ess = EMPTY_STRING;
6862             }
6863             else {
6864 0         0 $ess = 's';
6865             }
6866 0 0       0 my $bname =
6867             ( $diff > 0 )
6868             ? $opening_brace_names[$bb]
6869             : $closing_brace_names[$bb];
6870 0         0 $self->write_error_indicator_pair( @{$rsl}, '^' );
  0         0  
6871 0         0 my $msg = <<"EOM";
6872             Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6873             EOM
6874              
6875 0 0       0 if ( $diff > 0 ) {
6876 0         0 my $rml =
6877             $rstarting_line_of_current_depth->[$bb]
6878             [ $rcurrent_depth->[$bb] ];
6879 0         0 my $ml = $rml->[0];
6880 0         0 $msg .=
6881             " The most recent un-matched $bname is on line $ml\n";
6882 0         0 $self->write_error_indicator_pair( @{$rml}, '^' );
  0         0  
6883             }
6884 0         0 $self->write_error_indicator_pair( @{$rel}, '^' );
  0         0  
6885 0         0 $self->warning($msg);
6886 0         0 $self->resume_logfile();
6887             }
6888 0         0 $self->increment_brace_error();
6889             }
6890             }
6891 4574         8439 $rcurrent_depth->[$aa]--;
6892             }
6893             else {
6894              
6895 0         0 my $saw_brace_error = $self->get_saw_brace_error();
6896 0 0       0 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6897 0         0 my $msg = <<"EOM";
6898             There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6899             EOM
6900 0         0 $self->indicate_error( $msg, $input_line_number, $input_line, $pos,
6901             '^' );
6902             }
6903 0         0 $self->increment_brace_error();
6904              
6905             # keep track of errors in braces alone (ignoring ternary nesting errors)
6906 0 0       0 $self->[_true_brace_error_count_]++
6907             if ( $closing_brace_names[$aa] ne "':'" );
6908             }
6909 4574         12458 return ( $seqno, $outdent );
6910             } ## end sub decrease_nesting_depth
6911              
6912             sub check_final_nesting_depths {
6913              
6914             # USES GLOBAL VARIABLES: $rcurrent_depth, $rstarting_line_of_current_depth
6915 561     561 0 1456 my $self = shift;
6916              
6917 561         2504 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6918              
6919 2244         3897 my $cd_aa = $rcurrent_depth->[$aa];
6920 2244 50       5206 if ($cd_aa) {
6921 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
6922 0         0 my $sl = $rsl->[0];
6923 0         0 my $msg = <<"EOM";
6924             Final nesting depth of $opening_brace_names[$aa]s is $cd_aa
6925             The most recent un-matched $opening_brace_names[$aa] is on line $sl
6926             EOM
6927 0         0 $self->indicate_error( $msg, @{$rsl}, '^' );
  0         0  
6928 0         0 $self->increment_brace_error();
6929             }
6930             }
6931 561         1613 return;
6932             } ## end sub check_final_nesting_depths
6933              
6934             #######################################################################
6935             # Tokenizer routines for looking ahead in input stream
6936             #######################################################################
6937              
6938             sub peek_ahead_for_n_nonblank_pre_tokens {
6939              
6940             # returns next n pretokens if they exist
6941             # returns undef's if hits eof without seeing any pretokens
6942             # USES GLOBAL VARIABLES: (none)
6943 170     170 0 430 my ( $self, $max_pretokens ) = @_;
6944 170         321 my $line;
6945 170         310 my $i = 0;
6946 170         405 my ( $rpre_tokens, $rmap, $rpre_types );
6947              
6948 170         562 while ( $line = $self->peek_ahead( $i++ ) ) {
6949 182         990 $line =~ s/^\s*//; # trim leading blanks
6950 182 100       705 next if ( length($line) <= 0 ); # skip blank
6951 176 100       642 next if ( $line =~ /^#/ ); # skip comment
6952 162         514 ( $rpre_tokens, $rmap, $rpre_types ) =
6953             pre_tokenize( $line, $max_pretokens );
6954 162         469 last;
6955             }
6956 170         620 return ( $rpre_tokens, $rpre_types );
6957             } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6958              
6959             # look ahead for next non-blank, non-comment line of code
6960             sub peek_ahead_for_nonblank_token {
6961              
6962             # USES GLOBAL VARIABLES: (none)
6963 125     125 0 386 my ( $self, $rtokens, $max_token_index ) = @_;
6964 125         246 my $line;
6965 125         262 my $i = 0;
6966              
6967 125         511 while ( $line = $self->peek_ahead( $i++ ) ) {
6968 169         995 $line =~ s/^\s*//; # trim leading blanks
6969 169 100       660 next if ( length($line) <= 0 ); # skip blank
6970 144 100       682 next if ( $line =~ /^#/ ); # skip comment
6971              
6972             # Updated from 2 to 3 to get trigraphs, added for case b1175
6973 123         466 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6974 123         429 my $j = $max_token_index + 1;
6975              
6976 123         297 foreach my $tok ( @{$rtok} ) {
  123         364  
6977 355 100       1398 last if ( $tok =~ "\n" );
6978 320         835 $rtokens->[ ++$j ] = $tok;
6979             }
6980 123         491 last;
6981             }
6982 125         360 return;
6983             } ## end sub peek_ahead_for_nonblank_token
6984              
6985             #######################################################################
6986             # Tokenizer guessing routines for ambiguous situations
6987             #######################################################################
6988              
6989             sub guess_if_pattern_or_conditional {
6990              
6991             # this routine is called when we have encountered a ? following an
6992             # unknown bareword, and we must decide if it starts a pattern or not
6993             # input parameters:
6994             # $i - token index of the ? starting possible pattern
6995             # output parameters:
6996             # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6997             # msg = a warning or diagnostic message
6998             # USES GLOBAL VARIABLES: $last_nonblank_token
6999              
7000 11     11 0 51 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
7001 11         31 my $is_pattern = 0;
7002 11         55 my $msg = "guessing that ? after $last_nonblank_token starts a ";
7003              
7004 11 50       46 if ( $i >= $max_token_index ) {
7005 0         0 $msg .= "conditional (no end to pattern found on the line)\n";
7006             }
7007             else {
7008 11         29 my $ibeg = $i;
7009 11         29 $i = $ibeg + 1;
7010 11         33 my $next_token = $rtokens->[$i]; # first token after ?
7011              
7012             # look for a possible ending ? on this line..
7013 11         27 my $in_quote = 1;
7014 11         23 my $quote_depth = 0;
7015 11         23 my $quote_character = EMPTY_STRING;
7016 11         25 my $quote_pos = 0;
7017 11         25 my $quoted_string;
7018             (
7019              
7020 11         57 $i,
7021             $in_quote,
7022             $quote_character,
7023             $quote_pos,
7024             $quote_depth,
7025             $quoted_string,
7026              
7027             ) = $self->follow_quoted_string(
7028              
7029             $ibeg,
7030             $in_quote,
7031             $rtokens,
7032             $quote_character,
7033             $quote_pos,
7034             $quote_depth,
7035             $max_token_index,
7036              
7037             );
7038              
7039 11 50       71 if ($in_quote) {
7040              
7041             # we didn't find an ending ? on this line,
7042             # so we bias towards conditional
7043 11         32 $is_pattern = 0;
7044 11         38 $msg .= "conditional (no ending ? on this line)\n";
7045              
7046             # we found an ending ?, so we bias towards a pattern
7047             }
7048             else {
7049              
7050             # Watch out for an ending ? in quotes, like this
7051             # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
7052 0         0 my $s_quote = 0;
7053 0         0 my $d_quote = 0;
7054 0         0 my $colons = 0;
7055 0         0 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
7056 0         0 my $tok = $rtokens->[$ii];
7057 0 0       0 if ( $tok eq ":" ) { $colons++ }
  0         0  
7058 0 0       0 if ( $tok eq "'" ) { $s_quote++ }
  0         0  
7059 0 0       0 if ( $tok eq '"' ) { $d_quote++ }
  0         0  
7060             }
7061 0 0 0     0 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
    0 0        
7062 0         0 $is_pattern = 0;
7063 0         0 $msg .= "found ending ? but unbalanced quote chars\n";
7064             }
7065             elsif (
7066             $self->pattern_expected( $i, $rtokens, $max_token_index ) >= 0 )
7067             {
7068 0         0 $is_pattern = 1;
7069 0         0 $msg .= "pattern (found ending ? and pattern expected)\n";
7070             }
7071             else {
7072 0         0 $msg .= "pattern (uncertain, but found ending ?)\n";
7073             }
7074             }
7075             }
7076 11         41 return ( $is_pattern, $msg );
7077             } ## end sub guess_if_pattern_or_conditional
7078              
7079             my %is_known_constant;
7080             my %is_known_function;
7081              
7082             BEGIN {
7083              
7084             # Constants like 'pi' in Trig.pm are common
7085 39     39   266 my @q = qw(pi pi2 pi4 pip2 pip4);
7086 39         265 @{is_known_constant}{@q} = (1) x scalar(@q);
7087              
7088             # parenless calls of 'ok' are common
7089 39         140 @q = qw( ok );
7090 39         70432 @{is_known_function}{@q} = (1) x scalar(@q);
7091             } ## end BEGIN
7092              
7093             sub guess_if_pattern_or_division {
7094              
7095             # this routine is called when we have encountered a / following an
7096             # unknown bareword, and we must decide if it starts a pattern or is a
7097             # division
7098             # input parameters:
7099             # $i - token index of the / starting possible pattern
7100             # output parameters:
7101             # $is_pattern = 0 if probably division, =1 if probably a pattern
7102             # msg = a warning or diagnostic message
7103             # USES GLOBAL VARIABLES: $last_nonblank_token
7104 0     0 0 0 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
7105 0         0 my $is_pattern = 0;
7106 0         0 my $msg = "guessing that / after $last_nonblank_token starts a ";
7107              
7108 0 0       0 if ( $i >= $max_token_index ) {
7109 0         0 $msg .= "division (no end to pattern found on the line)\n";
7110             }
7111             else {
7112 0         0 my $ibeg = $i;
7113 0         0 my $divide_possible =
7114             $self->is_possible_numerator( $i, $rtokens, $max_token_index );
7115              
7116 0 0       0 if ( $divide_possible < 0 ) {
7117 0         0 $msg = "pattern (division not possible here)\n";
7118 0         0 $is_pattern = 1;
7119 0         0 return ( $is_pattern, $msg );
7120             }
7121              
7122 0         0 $i = $ibeg + 1;
7123 0         0 my $next_token = $rtokens->[$i]; # first token after slash
7124              
7125             # One of the things we can look at is the spacing around the slash.
7126             # There # are four possible spacings around the first slash:
7127             #
7128             # return pi/two;#/; -/-
7129             # return pi/ two;#/; -/+
7130             # return pi / two;#/; +/+
7131             # return pi /two;#/; +/- <-- possible pattern
7132             #
7133             # Spacing rule: a space before the slash but not after the slash
7134             # usually indicates a pattern. We can use this to break ties.
7135              
7136 0   0     0 my $is_pattern_by_spacing =
7137             ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
7138              
7139             # look for a possible ending / on this line..
7140 0         0 my $in_quote = 1;
7141 0         0 my $quote_depth = 0;
7142 0         0 my $quote_character = EMPTY_STRING;
7143 0         0 my $quote_pos = 0;
7144 0         0 my $quoted_string;
7145             (
7146 0         0 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
7147             $quoted_string
7148             )
7149             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
7150             $quote_character, $quote_pos, $quote_depth, $max_token_index );
7151              
7152 0 0       0 if ($in_quote) {
7153              
7154             # we didn't find an ending / on this line, so we bias towards
7155             # division
7156 0 0       0 if ( $divide_possible >= 0 ) {
7157 0         0 $is_pattern = 0;
7158 0         0 $msg .= "division (no ending / on this line)\n";
7159             }
7160             else {
7161              
7162             # assuming a multi-line pattern ... this is risky, but division
7163             # does not seem possible. If this fails, it would either be due
7164             # to a syntax error in the code, or the division_expected logic
7165             # needs to be fixed.
7166 0         0 $msg = "multi-line pattern (division not possible)\n";
7167 0         0 $is_pattern = 1;
7168             }
7169             }
7170              
7171             # we found an ending /, so we bias slightly towards a pattern
7172             else {
7173              
7174 0         0 my $pattern_expected =
7175             $self->pattern_expected( $i, $rtokens, $max_token_index );
7176              
7177 0 0       0 if ( $pattern_expected >= 0 ) {
7178              
7179             # pattern looks possible...
7180 0 0       0 if ( $divide_possible >= 0 ) {
7181              
7182             # Both pattern and divide can work here...
7183              
7184             # Increase weight of divide if a pure number follows
7185 0         0 $divide_possible += $next_token =~ /^\d+$/;
7186              
7187             # Check for known constants in the numerator, like 'pi'
7188 0 0       0 if ( $is_known_constant{$last_nonblank_token} ) {
    0          
    0          
    0          
7189 0         0 $msg .=
7190             "division (pattern works too but saw known constant '$last_nonblank_token')\n";
7191 0         0 $is_pattern = 0;
7192             }
7193              
7194             # A very common bare word in pattern expressions is 'ok'
7195             elsif ( $is_known_function{$last_nonblank_token} ) {
7196 0         0 $msg .=
7197             "pattern (division works too but saw '$last_nonblank_token')\n";
7198 0         0 $is_pattern = 1;
7199             }
7200              
7201             # If one rule is more definite, use it
7202             elsif ( $divide_possible > $pattern_expected ) {
7203 0         0 $msg .=
7204             "division (more likely based on following tokens)\n";
7205 0         0 $is_pattern = 0;
7206             }
7207              
7208             # otherwise, use the spacing rule
7209             elsif ($is_pattern_by_spacing) {
7210 0         0 $msg .=
7211             "pattern (guess on spacing, but division possible too)\n";
7212 0         0 $is_pattern = 1;
7213             }
7214             else {
7215 0         0 $msg .=
7216             "division (guess on spacing, but pattern is possible too)\n";
7217 0         0 $is_pattern = 0;
7218             }
7219             }
7220              
7221             # divide_possible < 0 means divide can not work here
7222             else {
7223 0         0 $is_pattern = 1;
7224 0         0 $msg .= "pattern (division not possible)\n";
7225             }
7226             }
7227              
7228             # pattern does not look possible...
7229             else {
7230              
7231 0 0       0 if ( $divide_possible >= 0 ) {
7232 0         0 $is_pattern = 0;
7233 0         0 $msg .= "division (pattern not possible)\n";
7234             }
7235              
7236             # Neither pattern nor divide look possible...go by spacing
7237             else {
7238 0 0       0 if ($is_pattern_by_spacing) {
7239 0         0 $msg .= "pattern (guess on spacing)\n";
7240 0         0 $is_pattern = 1;
7241             }
7242             else {
7243 0         0 $msg .= "division (guess on spacing)\n";
7244 0         0 $is_pattern = 0;
7245             }
7246             }
7247             }
7248             }
7249             }
7250 0         0 return ( $is_pattern, $msg );
7251             } ## end sub guess_if_pattern_or_division
7252              
7253             # try to resolve here-doc vs. shift by looking ahead for
7254             # non-code or the end token (currently only looks for end token)
7255             # returns 1 if it is probably a here doc, 0 if not
7256             sub guess_if_here_doc {
7257              
7258 0     0 0 0 my ( $self, $next_token ) = @_;
7259              
7260             # This is how many lines we will search for a target as part of the
7261             # guessing strategy. It is a constant because there is probably
7262             # little reason to change it.
7263             # USES GLOBAL VARIABLES: $current_package $ris_constant,
7264 0         0 my $HERE_DOC_WINDOW = 40;
7265              
7266 0         0 my $here_doc_expected = 0;
7267 0         0 my $line;
7268 0         0 my $k = 0;
7269 0         0 my $msg = "checking <<";
7270              
7271 0         0 while ( $line = $self->peek_ahead( $k++ ) ) {
7272 0         0 chomp $line;
7273              
7274 0 0       0 if ( $line =~ /^$next_token$/ ) {
7275 0         0 $msg .= " -- found target $next_token ahead $k lines\n";
7276 0         0 $here_doc_expected = 1; # got it
7277 0         0 last;
7278             }
7279 0 0       0 last if ( $k >= $HERE_DOC_WINDOW );
7280             }
7281              
7282 0 0       0 if ( !$here_doc_expected ) {
7283              
7284 0 0       0 if ( !defined($line) ) {
7285 0         0 $here_doc_expected = -1; # hit eof without seeing target
7286 0         0 $msg .= " -- must be shift; target $next_token not in file\n";
7287              
7288             }
7289             else { # still unsure..taking a wild guess
7290              
7291 0 0       0 if ( !$ris_constant->{$current_package}{$next_token} ) {
7292 0         0 $here_doc_expected = 1;
7293 0         0 $msg .=
7294             " -- guessing it's a here-doc ($next_token not a constant)\n";
7295             }
7296             else {
7297 0         0 $msg .=
7298             " -- guessing it's a shift ($next_token is a constant)\n";
7299             }
7300             }
7301             }
7302 0         0 $self->write_logfile_entry($msg);
7303 0         0 return $here_doc_expected;
7304             } ## end sub guess_if_here_doc
7305              
7306             #######################################################################
7307             # Tokenizer Routines for scanning identifiers and related items
7308             #######################################################################
7309              
7310             sub scan_bare_identifier_do {
7311              
7312             # this routine is called to scan a token starting with an alphanumeric
7313             # variable or package separator, :: or '.
7314             # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
7315             # $last_nonblank_type, $rparen_type, $paren_depth
7316              
7317 1672     1672 0 4993 my ( $self, $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7318             $max_token_index )
7319             = @_;
7320 1672         2747 my $i_begin = $i;
7321 1672         2911 my $package = undef;
7322              
7323 1672         2599 my $i_beg = $i;
7324              
7325             # we have to back up one pretoken at a :: since each : is one pretoken
7326 1672 100       4507 if ( $tok eq '::' ) { $i_beg-- }
  9         15  
7327 1672 50       3941 if ( $tok eq '->' ) { $i_beg-- }
  0         0  
7328 1672         3076 my $pos_beg = $rtoken_map->[$i_beg];
7329 1672         5048 pos($input_line) = $pos_beg;
7330              
7331             # Examples:
7332             # A::B::C
7333             # A::
7334             # ::A
7335             # A'B
7336 1672 50       12109 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7337              
7338 1672         3348 my $pos = pos($input_line);
7339 1672         2832 my $numc = $pos - $pos_beg;
7340 1672         3627 $tok = substr( $input_line, $pos_beg, $numc );
7341              
7342             # type 'w' includes anything without leading type info
7343             # ($,%,@,*) including something like abc::def::ghi
7344 1672         2810 $type = 'w';
7345              
7346 1672         2824 my $sub_name = EMPTY_STRING;
7347 1672 100       4796 if ( defined($2) ) { $sub_name = $2; }
  1667         3207  
7348 1672 100       4253 if ( defined($1) ) {
7349 274         671 $package = $1;
7350              
7351             # patch: don't allow isolated package name which just ends
7352             # in the old style package separator (single quote). Example:
7353             # use CGI':all';
7354 274 50 66     1093 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7355 0         0 $pos--;
7356             }
7357              
7358 274         788 $package =~ s/\'/::/g;
7359 274 100       808 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  9         25  
7360 274         1213 $package =~ s/::$//;
7361             }
7362             else {
7363 1398         2646 $package = $current_package;
7364              
7365             # patched for c043, part 1: keyword does not follow '->'
7366 1398 50 66     5514 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7367 0         0 $type = 'k';
7368             }
7369             }
7370              
7371             # if it is a bareword.. patched for c043, part 2: not following '->'
7372 1672 100 66     7677 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7373              
7374             # check for v-string with leading 'v' type character
7375             # (This seems to have precedence over filehandle, type 'Y')
7376 1003 100 66     14705 if ( $tok =~ /^v\d[_\d]*$/ ) {
    100 66        
    100 66        
    50          
    50          
    100          
    100          
7377              
7378             # we only have the first part - something like 'v101' -
7379             # look for more
7380 2 50       17 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7381 2         8 $pos = pos($input_line);
7382 2         4 $numc = $pos - $pos_beg;
7383 2         6 $tok = substr( $input_line, $pos_beg, $numc );
7384             }
7385 2         6 $type = 'v';
7386              
7387             # warn if this version can't handle v-strings
7388 2         12 $self->report_v_string($tok);
7389             }
7390              
7391             elsif ( $ris_constant->{$package}{$sub_name} ) {
7392 12         52 $type = 'C';
7393             }
7394              
7395             # bareword after sort has implied empty prototype; for example:
7396             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7397             # This has priority over whatever the user has specified.
7398             elsif ($last_nonblank_token eq 'sort'
7399             && $last_nonblank_type eq 'k' )
7400             {
7401 1         4 $type = 'Z';
7402             }
7403              
7404             # Note: strangely, perl does not seem to really let you create
7405             # functions which act like eval and do, in the sense that eval
7406             # and do may have operators following the final }, but any operators
7407             # that you create with prototype (&) apparently do not allow
7408             # trailing operators, only terms. This seems strange.
7409             # If this ever changes, here is the update
7410             # to make perltidy behave accordingly:
7411              
7412             # elsif ( $ris_block_function->{$package}{$tok} ) {
7413             # $tok='eval'; # patch to do braces like eval - doesn't work
7414             # $type = 'k';
7415             #}
7416             # TODO: This could become a separate type to allow for different
7417             # future behavior:
7418             elsif ( $ris_block_function->{$package}{$sub_name} ) {
7419 0         0 $type = 'G';
7420             }
7421             elsif ( $ris_block_list_function->{$package}{$sub_name} ) {
7422 0         0 $type = 'G';
7423             }
7424             elsif ( $ris_user_function->{$package}{$sub_name} ) {
7425 6         16 $type = 'U';
7426 6         18 $prototype = $ruser_function_prototype->{$package}{$sub_name};
7427             }
7428              
7429             # check for indirect object
7430             elsif (
7431              
7432             # added 2001-03-27: must not be followed immediately by '('
7433             # see fhandle.t
7434             ( $input_line !~ m/\G\(/gc )
7435              
7436             # and
7437             && (
7438              
7439             # preceded by keyword like 'print', 'printf' and friends
7440             $is_indirect_object_taker{$last_nonblank_token}
7441              
7442             # or preceded by something like 'print(' or 'printf('
7443             || (
7444             ( $last_nonblank_token eq '(' )
7445             && $is_indirect_object_taker{
7446             $rparen_type->[$paren_depth]
7447             }
7448              
7449             )
7450             )
7451             )
7452             {
7453              
7454             # may not be indirect object unless followed by a space;
7455             # updated 2021-01-16 to consider newline to be a space.
7456             # updated for case b990 to look for either ';' or space
7457 4 50 33     130 if ( pos($input_line) == length($input_line)
7458             || $input_line =~ m/\G[;\s]/gc )
7459             {
7460 4         16 $type = 'Y';
7461              
7462             # Abandon Hope ...
7463             # Perl's indirect object notation is a very bad
7464             # thing and can cause subtle bugs, especially for
7465             # beginning programmers. And I haven't even been
7466             # able to figure out a sane warning scheme which
7467             # doesn't get in the way of good scripts.
7468              
7469             # Complain if a filehandle has any lower case
7470             # letters. This is suggested good practice.
7471             # Use 'sub_name' because something like
7472             # main::MYHANDLE is ok for filehandle
7473 4 100       22 if ( $sub_name =~ /[a-z]/ ) {
7474              
7475             # could be bug caused by older perltidy if
7476             # followed by '('
7477 1 50       5 if ( $input_line =~ m/\G\s*\(/gc ) {
7478 1         7 $self->complain(
7479             "Caution: unknown word '$tok' in indirect object slot\n"
7480             );
7481             }
7482             }
7483             }
7484              
7485             # bareword not followed by a space -- may not be filehandle
7486             # (may be function call defined in a 'use' statement)
7487             else {
7488 0         0 $type = 'Z';
7489             }
7490             }
7491              
7492             # none of the above special types
7493             else {
7494             }
7495             }
7496              
7497             # Now we must convert back from character position
7498             # to pre_token index.
7499             # I don't think an error flag can occur here ..but who knows
7500 1672         2924 my $error;
7501 1672         5133 ( $i, $error ) =
7502             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7503 1672 50       4632 if ($error) {
7504 0         0 $self->warning(
7505             "scan_bare_identifier: Possibly invalid tokenization\n");
7506             }
7507             }
7508              
7509             # no match but line not blank - could be syntax error
7510             # perl will take '::' alone without complaint
7511             else {
7512 0         0 $type = 'w';
7513              
7514             # change this warning to log message if it becomes annoying
7515 0         0 $self->warning("didn't find identifier after leading ::\n");
7516             }
7517 1672         6874 return ( $i, $tok, $type, $prototype );
7518             } ## end sub scan_bare_identifier_do
7519              
7520             sub scan_id_do {
7521              
7522             # This is the new scanner and will eventually replace scan_identifier.
7523             # Only type 'sub' and 'package' are implemented.
7524             # Token types $ * % @ & -> are not yet implemented.
7525             #
7526             # Scan identifier following a type token.
7527             # The type of call depends on $id_scan_state: $id_scan_state = ''
7528             # for starting call, in which case $tok must be the token defining
7529             # the type.
7530             #
7531             # If the type token is the last nonblank token on the line, a value
7532             # of $id_scan_state = $tok is returned, indicating that further
7533             # calls must be made to get the identifier. If the type token is
7534             # not the last nonblank token on the line, the identifier is
7535             # scanned and handled and a value of '' is returned.
7536              
7537 331     331 0 1229 my ( $self, $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7538             $max_token_index )
7539             = @_;
7540 39     39   476 use constant DEBUG_NSCAN => 0;
  39         93  
  39         48307  
7541 331         626 my $type = EMPTY_STRING;
7542 331         725 my ( $i_beg, $pos_beg );
7543              
7544             #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7545             #my ($a,$b,$c) = caller;
7546             #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7547              
7548             # on re-entry, start scanning at first token on the line
7549 331 100       1002 if ($id_scan_state) {
7550 10         40 $i_beg = $i;
7551 10         26 $type = EMPTY_STRING;
7552             }
7553              
7554             # on initial entry, start scanning just after type token
7555             else {
7556 321         606 $i_beg = $i + 1;
7557 321         599 $id_scan_state = $tok;
7558 321         653 $type = 't';
7559             }
7560              
7561             # find $i_beg = index of next nonblank token,
7562             # and handle empty lines
7563 331         586 my $blank_line = 0;
7564 331         751 my $next_nonblank_token = $rtokens->[$i_beg];
7565 331 100       933 if ( $i_beg > $max_token_index ) {
7566 2         8 $blank_line = 1;
7567             }
7568             else {
7569              
7570             # only a '#' immediately after a '$' is not a comment
7571 329 50       983 if ( $next_nonblank_token eq '#' ) {
7572 0 0       0 if ( $tok ne '$' ) {
7573 0         0 $blank_line = 1;
7574             }
7575             }
7576              
7577 329 100       1505 if ( $next_nonblank_token =~ /^\s/ ) {
7578 309         1327 ( $next_nonblank_token, $i_beg ) =
7579             find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7580             $max_token_index );
7581 309 100       1728 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7582 4         12 $blank_line = 1;
7583             }
7584             }
7585             }
7586              
7587             # handle non-blank line; identifier, if any, must follow
7588 331 100       1041 if ( !$blank_line ) {
7589              
7590 325 100       1019 if ( $is_sub{$id_scan_state} ) {
    50          
7591 299         3299 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
7592             {
7593             input_line => $input_line,
7594             i => $i,
7595             i_beg => $i_beg,
7596             tok => $tok,
7597             type => $type,
7598             rtokens => $rtokens,
7599             rtoken_map => $rtoken_map,
7600             id_scan_state => $id_scan_state,
7601             max_token_index => $max_token_index,
7602             }
7603             );
7604             }
7605              
7606             elsif ( $is_package{$id_scan_state} ) {
7607 26         102 ( $i, $tok, $type ) =
7608             $self->do_scan_package( $input_line, $i, $i_beg, $tok, $type,
7609             $rtokens, $rtoken_map, $max_token_index );
7610 26         64 $id_scan_state = EMPTY_STRING;
7611             }
7612              
7613             else {
7614 0         0 $self->warning("invalid token in scan_id: $tok\n");
7615 0         0 $id_scan_state = EMPTY_STRING;
7616             }
7617             }
7618              
7619 331 50 33     1921 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
      66        
7620              
7621             # shouldn't happen:
7622 0         0 if (DEVEL_MODE) {
7623             $self->Fault(<<EOM);
7624             Program bug in scan_id: undefined type but scan_state=$id_scan_state
7625             EOM
7626             }
7627             $self->warning(
7628 0         0 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7629             );
7630 0         0 $self->report_definite_bug();
7631             }
7632              
7633 331         541 DEBUG_NSCAN && do {
7634             print {*STDOUT}
7635             "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7636             };
7637 331         1390 return ( $i, $tok, $type, $id_scan_state );
7638             } ## end sub scan_id_do
7639              
7640             sub check_prototype {
7641 137     137 0 419 my ( $proto, $package, $subname ) = @_;
7642 137 50       451 return if ( !defined($package) );
7643 137 50       442 return if ( !defined($subname) );
7644 137 100       393 if ( defined($proto) ) {
7645 34         157 $proto =~ s/^\s*\(\s*//;
7646 34         131 $proto =~ s/\s*\)$//;
7647 34 100       96 if ($proto) {
7648 5         20 $ris_user_function->{$package}{$subname} = 1;
7649 5         18 $ruser_function_prototype->{$package}{$subname} = "($proto)";
7650              
7651             # prototypes containing '&' must be treated specially..
7652 5 100       36 if ( $proto =~ /\&/ ) {
7653              
7654             # right curly braces of prototypes ending in
7655             # '&' may be followed by an operator
7656 1 50       22 if ( $proto =~ /\&$/ ) {
7657 0         0 $ris_block_function->{$package}{$subname} = 1;
7658             }
7659              
7660             # right curly braces of prototypes NOT ending in
7661             # '&' may NOT be followed by an operator
7662             else {
7663 1         4 $ris_block_list_function->{$package}{$subname} = 1;
7664             }
7665             }
7666             }
7667             else {
7668 29         107 $ris_constant->{$package}{$subname} = 1;
7669             }
7670             }
7671             else {
7672 103         392 $ris_user_function->{$package}{$subname} = 1;
7673             }
7674 137         324 return;
7675             } ## end sub check_prototype
7676              
7677             sub do_scan_package {
7678              
7679             # do_scan_package parses a package name
7680             # it is called with $i_beg equal to the index of the first nonblank
7681             # token following a 'package' token.
7682             # USES GLOBAL VARIABLES: $current_package,
7683              
7684             # package NAMESPACE
7685             # package NAMESPACE VERSION
7686             # package NAMESPACE BLOCK
7687             # package NAMESPACE VERSION BLOCK
7688             #
7689             # If VERSION is provided, package sets the $VERSION variable in the given
7690             # namespace to a version object with the VERSION provided. VERSION must be
7691             # a "strict" style version number as defined by the version module: a
7692             # positive decimal number (integer or decimal-fraction) without
7693             # exponentiation or else a dotted-decimal v-string with a leading 'v'
7694             # character and at least three components.
7695             # reference http://perldoc.perl.org/functions/package.html
7696              
7697             my (
7698 26     26 0 83 $self, $input_line, $i,
7699             $i_beg, $tok, $type,
7700             $rtokens, $rtoken_map, $max_token_index
7701             ) = @_;
7702 26         54 my $package = undef;
7703 26         51 my $pos_beg = $rtoken_map->[$i_beg];
7704 26         78 pos($input_line) = $pos_beg;
7705              
7706             # handle non-blank line; package name, if any, must follow
7707 26 50       193 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7708 26         73 $package = $1;
7709 26 50 33     161 $package = ( defined($1) && $1 ) ? $1 : 'main';
7710 26         70 $package =~ s/\'/::/g;
7711 26 50       78 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
7712 26         55 $package =~ s/::$//;
7713 26         50 my $pos = pos($input_line);
7714 26         54 my $numc = $pos - $pos_beg;
7715 26         72 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7716 26         45 $type = 'P'; # Fix for c250, previously 'i'
7717              
7718             # Now we must convert back from character position
7719             # to pre_token index.
7720             # I don't think an error flag can occur here ..but ?
7721 26         47 my $error;
7722 26         95 ( $i, $error ) =
7723             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7724 26 50       76 if ($error) { $self->warning("Possibly invalid package\n") }
  0         0  
7725 26         64 $current_package = $package;
7726              
7727             # we should now have package NAMESPACE
7728             # now expecting VERSION, BLOCK, or ; to follow ...
7729             # package NAMESPACE VERSION
7730             # package NAMESPACE BLOCK
7731             # package NAMESPACE VERSION BLOCK
7732 26         85 my ( $next_nonblank_token, $i_next ) =
7733             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
7734              
7735             # check that something recognizable follows, but do not parse.
7736             # A VERSION number will be parsed later as a number or v-string in the
7737             # normal way. What is important is to set the statement type if
7738             # everything looks okay so that the operator_expected() routine
7739             # knows that the number is in a package statement.
7740             # Examples of valid primitive tokens that might follow are:
7741             # 1235 . ; { } v3 v
7742             # FIX: added a '#' since a side comment may also follow
7743             # Added ':' for class attributes (for --use-feature=class, rt145706)
7744 26 50       111 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#\:])|v\d|\d+$/ ) {
7745 26         66 $statement_type = $tok;
7746             }
7747             else {
7748 0         0 $self->warning(
7749             "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7750             );
7751             }
7752             }
7753              
7754             # no match but line not blank --
7755             # could be a label with name package, like package: , for example.
7756             else {
7757 0         0 $type = 'k';
7758             }
7759              
7760 26         96 return ( $i, $tok, $type );
7761             } ## end sub do_scan_package
7762              
7763             { ## begin closure for sub scan_complex_identifier
7764              
7765 39     39   399 use constant DEBUG_SCAN_ID => 0;
  39         91  
  39         5113  
7766              
7767             # Constant hash:
7768             my %is_special_variable_char;
7769              
7770             BEGIN {
7771              
7772             # These are the only characters which can (currently) form special
7773             # variables, like $^W: (issue c066).
7774 39     39   422 my @q =
7775             qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7776 39         146390 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7777             } ## end BEGIN
7778              
7779             # These are the possible states for this scanner:
7780             my $scan_state_SIGIL = '$';
7781             my $scan_state_ALPHA = 'A';
7782             my $scan_state_COLON = ':';
7783             my $scan_state_LPAREN = '(';
7784             my $scan_state_RPAREN = ')';
7785             my $scan_state_AMPERSAND = '&';
7786             my $scan_state_SPLIT = '^';
7787              
7788             # Only these non-blank states may be returned to caller:
7789             my %is_returnable_scan_state = (
7790             $scan_state_SIGIL => 1,
7791             $scan_state_AMPERSAND => 1,
7792             );
7793              
7794             # USES GLOBAL VARIABLES:
7795             # $context, $last_nonblank_token, $last_nonblank_type
7796              
7797             #-----------
7798             # call args:
7799             #-----------
7800             my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7801             $expecting, $container_type );
7802              
7803             #-------------------------------------------
7804             # my variables, re-initialized on each call:
7805             #-------------------------------------------
7806             my $i_begin; # starting index $i
7807             my $type; # returned identifier type
7808             my $tok_begin; # starting token
7809             my $tok; # returned token
7810             my $id_scan_state_begin; # starting scan state
7811             my $identifier_begin; # starting identifier
7812             my $i_save; # a last good index, in case of error
7813             my $message; # hold error message for log file
7814             my $tok_is_blank;
7815             my $last_tok_is_blank;
7816             my $in_prototype_or_signature;
7817             my $saw_alpha;
7818             my $saw_type;
7819             my $allow_tick;
7820              
7821             sub initialize_my_scan_id_vars {
7822              
7823             # Initialize all 'my' vars on entry
7824 486     486 0 873 $i_begin = $i;
7825 486         1894 $type = EMPTY_STRING;
7826 486         922 $tok_begin = $rtokens->[$i_begin];
7827 486         1126 $tok = $tok_begin;
7828 486 50       1611 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
  0         0  
7829 486         849 $id_scan_state_begin = $id_scan_state;
7830 486         821 $identifier_begin = $identifier;
7831 486         831 $i_save = undef;
7832              
7833 486         855 $message = EMPTY_STRING;
7834 486         822 $tok_is_blank = undef; # a flag to speed things up
7835 486         738 $last_tok_is_blank = undef;
7836              
7837 486   100     1850 $in_prototype_or_signature =
7838             $container_type && $container_type =~ /^sub\b/;
7839              
7840             # these flags will be used to help figure out the type:
7841 486         755 $saw_alpha = undef;
7842 486         751 $saw_type = undef;
7843              
7844             # allow old package separator (') except in 'use' statement
7845 486         929 $allow_tick = ( $last_nonblank_token ne 'use' );
7846 486         824 return;
7847             } ## end sub initialize_my_scan_id_vars
7848              
7849             #----------------------------------
7850             # Routines for handling scan states
7851             #----------------------------------
7852             sub do_id_scan_state_dollar {
7853              
7854 514     514 0 882 my $self = shift;
7855              
7856             # We saw a sigil, now looking to start a variable name
7857 514 100 66     4189 if ( $tok eq '$' ) {
    100 33        
    100          
    50          
    50          
    100          
    100          
    100          
    100          
7858              
7859 31         93 $identifier .= $tok;
7860              
7861             # we've got a punctuation variable if end of line (punct.t)
7862 31 50       162 if ( $i == $max_token_index ) {
7863 0         0 $type = 'i';
7864 0         0 $id_scan_state = EMPTY_STRING;
7865             }
7866             }
7867             elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7868 253         454 $saw_alpha = 1;
7869 253         517 $identifier .= $tok;
7870              
7871             # now need :: except for special digit vars like '$1' (c208)
7872 253 100       853 $id_scan_state = $tok =~ /^\d/ ? EMPTY_STRING : $scan_state_COLON;
7873             }
7874             elsif ( $tok eq '::' ) {
7875 16         51 $id_scan_state = $scan_state_ALPHA;
7876 16         40 $identifier .= $tok;
7877             }
7878              
7879             # POSTDEFREF ->@ ->% ->& ->*
7880             elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7881 0         0 $identifier .= $tok;
7882             }
7883             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7884 0         0 $saw_alpha = 1;
7885 0         0 $id_scan_state = $scan_state_COLON; # now need ::
7886 0         0 $identifier .= $tok;
7887              
7888             # Perl will accept leading digits in identifiers,
7889             # although they may not always produce useful results.
7890             # Something like $main::0 is ok. But this also works:
7891             #
7892             # sub howdy::123::bubba{ print "bubba $54321!\n" }
7893             # howdy::123::bubba();
7894             #
7895             }
7896             elsif ( $tok eq '#' ) {
7897              
7898 99         259 my $is_punct_var = $identifier eq '$$';
7899              
7900             # side comment or identifier?
7901 99 100 66     959 if (
      66        
      66        
      33        
7902              
7903             # A '#' starts a comment if it follows a space. For example,
7904             # the following is equivalent to $ans=40.
7905             # my $ #
7906             # ans = 40;
7907             !$last_tok_is_blank
7908              
7909             # a # inside a prototype or signature can only start a
7910             # comment
7911             && !$in_prototype_or_signature
7912              
7913             # these are valid punctuation vars: *# %# @# $#
7914             # May also be '$#array' or POSTDEFREF ->$#
7915             && ( $identifier =~ /^[\%\@\$\*]$/
7916             || $identifier =~ /\$$/ )
7917              
7918             # but a '#' after '$$' is a side comment; see c147
7919             && !$is_punct_var
7920              
7921             )
7922             {
7923 95         221 $identifier .= $tok; # keep same state, a $ could follow
7924             }
7925             else {
7926              
7927             # otherwise it is a side comment
7928 4 50       20 if ( $identifier eq '->' ) { }
    50          
    50          
7929 0         0 elsif ($is_punct_var) { $type = 'i' }
7930 4         7 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7931 0         0 else { $type = 'i' }
7932 4         7 $i = $i_save;
7933 4         8 $id_scan_state = EMPTY_STRING;
7934             }
7935             }
7936              
7937             elsif ( $tok eq '{' ) {
7938              
7939             # check for something like ${#} or ${?}, where ? is a special char
7940 38 100 100     596 if (
      66        
      100        
      100        
7941             (
7942             $identifier eq '$'
7943             || $identifier eq '@'
7944             || $identifier eq '$#'
7945             )
7946             && $i + 2 <= $max_token_index
7947             && $rtokens->[ $i + 2 ] eq '}'
7948             && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7949             )
7950             {
7951 1         4 my $next2 = $rtokens->[ $i + 2 ];
7952 1         6 my $next1 = $rtokens->[ $i + 1 ];
7953 1         6 $identifier .= $tok . $next1 . $next2;
7954 1         3 $i += 2;
7955 1         2 $id_scan_state = EMPTY_STRING;
7956             }
7957             else {
7958              
7959             # skip something like ${xxx} or ->{
7960 37         86 $id_scan_state = EMPTY_STRING;
7961              
7962             # if this is the first token of a line, any tokens for this
7963             # identifier have already been accumulated
7964 37 100 66     163 if ( $identifier eq '$' || $i == 0 ) {
7965 26         49 $identifier = EMPTY_STRING;
7966             }
7967 37         77 $i = $i_save;
7968             }
7969             }
7970              
7971             # space ok after leading $ % * & @
7972             elsif ( $tok =~ /^\s*$/ ) {
7973              
7974 20         58 $tok_is_blank = 1;
7975              
7976             # note: an id with a leading '&' does not actually come this way
7977 20 50       80 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
    0          
7978              
7979 20 100       69 if ( length($identifier) > 1 ) {
7980 8         15 $id_scan_state = EMPTY_STRING;
7981 8         16 $i = $i_save;
7982 8         20 $type = 'i'; # probably punctuation variable
7983             }
7984             else {
7985              
7986             # fix c139: trim line-ending type 't'
7987 12 100       60 if ( $i == $max_token_index ) {
    100          
7988 1         3 $i = $i_save;
7989 1         3 $type = 't';
7990             }
7991              
7992             # spaces after $'s are common, and space after @
7993             # is harmless, so only complain about space
7994             # after other type characters. Space after $ and
7995             # @ will be removed in formatting. Report space
7996             # after % and * because they might indicate a
7997             # parsing error. In other words '% ' might be a
7998             # modulo operator. Delete this warning if it
7999             # gets annoying.
8000             elsif ( $identifier !~ /^[\@\$]$/ ) {
8001 1         5 $message =
8002             "Space in identifier, following $identifier\n";
8003             }
8004             else {
8005             ## ok: silently accept space after '$' and '@' sigils
8006             }
8007             }
8008             }
8009              
8010             elsif ( $identifier eq '->' ) {
8011              
8012             # space after '->' is ok except at line end ..
8013             # so trim line-ending in type '->' (fixes c139)
8014 0 0       0 if ( $i == $max_token_index ) {
8015 0         0 $i = $i_save;
8016 0         0 $type = '->';
8017             }
8018             }
8019              
8020             # stop at space after something other than -> or sigil
8021             # Example of what can arrive here:
8022             # eval { $MyClass->$$ };
8023             else {
8024 0         0 $id_scan_state = EMPTY_STRING;
8025 0         0 $i = $i_save;
8026 0         0 $type = 'i';
8027             }
8028             }
8029             elsif ( $tok eq '^' ) {
8030              
8031             # check for some special variables like $^ $^W
8032 11 50       50 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
8033 11         32 $identifier .= $tok;
8034 11         23 $type = 'i';
8035              
8036             # There may be one more character, not a space, after the ^
8037 11         31 my $next1 = $rtokens->[ $i + 1 ];
8038 11         29 my $chr = substr( $next1, 0, 1 );
8039 11 100       45 if ( $is_special_variable_char{$chr} ) {
8040              
8041             # It is something like $^W
8042             # Test case (c066) : $^Oeq'linux'
8043 9         19 $i++;
8044 9         19 $identifier .= $next1;
8045              
8046             # If pretoken $next1 is more than one character long,
8047             # set a flag indicating that it needs to be split.
8048 9 100       38 $id_scan_state =
8049             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8050             }
8051             else {
8052              
8053             # it is just $^
8054             # Simple test case (c065): '$aa=$^if($bb)';
8055 2         4 $id_scan_state = EMPTY_STRING;
8056             }
8057             }
8058             else {
8059 0         0 $id_scan_state = EMPTY_STRING;
8060 0         0 $i = $i_save;
8061             }
8062             }
8063             else { # something else
8064              
8065 46 100 66     418 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
    100 66        
    100          
    50          
    0          
    0          
8066              
8067             # We might be in an extrusion of
8068             # sub foo2 ( $first, $, $third ) {
8069             # looking at a line starting with a comma, like
8070             # $
8071             # ,
8072             # in this case the comma ends the signature variable
8073             # '$' which will have been previously marked type 't'
8074             # rather than 'i'.
8075 3 100       12 if ( $i == $i_begin ) {
8076 1         4 $identifier = EMPTY_STRING;
8077 1         12 $type = EMPTY_STRING;
8078             }
8079              
8080             # at a # we have to mark as type 't' because more may
8081             # follow, otherwise, in a signature we can let '$' be an
8082             # identifier here for better formatting.
8083             # See 'mangle4.in' for a test case.
8084             else {
8085 2         4 $type = 'i';
8086 2 50 33     14 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
8087 0         0 $type = 't';
8088             }
8089 2         42 $i = $i_save;
8090             }
8091 3         8 $id_scan_state = EMPTY_STRING;
8092             }
8093              
8094             # check for various punctuation variables
8095             elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
8096 35         110 $identifier .= $tok;
8097             }
8098              
8099             # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
8100             elsif ($tok eq '*'
8101             && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
8102             {
8103 6         14 $identifier .= $tok;
8104             }
8105              
8106             elsif ( $identifier eq '$#' ) {
8107              
8108 2 50       12 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
  0 50       0  
  0         0  
8109              
8110             # perl seems to allow just these: $#: $#- $#+
8111             elsif ( $tok =~ /^[\:\-\+]$/ ) {
8112 0         0 $type = 'i';
8113 0         0 $identifier .= $tok;
8114             }
8115             else {
8116 2         6 $i = $i_save;
8117 2         10 $self->write_logfile_entry(
8118             'Use of $# is deprecated' . "\n" );
8119             }
8120             }
8121             elsif ( $identifier eq '$$' ) {
8122              
8123             # perl does not allow references to punctuation
8124             # variables without braces. For example, this
8125             # won't work:
8126             # $:=\4;
8127             # $a = $$:;
8128             # You would have to use
8129             # $a = ${$:};
8130              
8131             # '$$' alone is punctuation variable for PID
8132 0         0 $i = $i_save;
8133 0 0       0 if ( $tok eq '{' ) { $type = 't' }
  0         0  
8134 0         0 else { $type = 'i' }
8135             }
8136             elsif ( $identifier eq '->' ) {
8137 0         0 $i = $i_save;
8138             }
8139             else {
8140 0         0 $i = $i_save;
8141 0 0       0 if ( length($identifier) == 1 ) {
8142 0         0 $identifier = EMPTY_STRING;
8143             }
8144             }
8145 46         109 $id_scan_state = EMPTY_STRING;
8146             }
8147 514         910 return;
8148             } ## end sub do_id_scan_state_dollar
8149              
8150             sub do_id_scan_state_alpha {
8151              
8152 113     113 0 226 my $self = shift;
8153              
8154             # looking for alphanumeric after ::
8155 113         365 $tok_is_blank = $tok =~ /^\s*$/;
8156              
8157 113 100 33     472 if ( $tok =~ /^\w/ ) { # found it
    50 66        
    50 33        
    50          
8158 100         199 $identifier .= $tok;
8159 100         201 $id_scan_state = $scan_state_COLON; # now need ::
8160 100         165 $saw_alpha = 1;
8161             }
8162             elsif ( $tok eq "'" && $allow_tick ) {
8163 0         0 $identifier .= $tok;
8164 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8165 0         0 $saw_alpha = 1;
8166             }
8167             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8168 0         0 $id_scan_state = $scan_state_LPAREN;
8169 0         0 $identifier .= $tok;
8170             }
8171             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8172 0         0 $id_scan_state = $scan_state_RPAREN;
8173 0         0 $identifier .= $tok;
8174             }
8175             else {
8176 13         22 $id_scan_state = EMPTY_STRING;
8177 13         25 $i = $i_save;
8178             }
8179 113         220 return;
8180             } ## end sub do_id_scan_state_alpha
8181              
8182             sub do_id_scan_state_colon {
8183              
8184 434     434 0 792 my $self = shift;
8185              
8186             # looking for possible :: after alphanumeric
8187              
8188 434         1508 $tok_is_blank = $tok =~ /^\s*$/;
8189              
8190 434 100 66     3383 if ( $tok eq '::' ) { # got it
    100 66        
    100 66        
    50          
    50          
8191 97         173 $identifier .= $tok;
8192 97         196 $id_scan_state = $scan_state_ALPHA; # now require alpha
8193             }
8194             elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
8195 20         53 $identifier .= $tok;
8196 20         62 $id_scan_state = $scan_state_COLON; # now need ::
8197 20         38 $saw_alpha = 1;
8198             }
8199             elsif ( $tok eq "'" && $allow_tick ) { # tick
8200              
8201 12 50       37 if ( $is_keyword{$identifier} ) {
8202 0         0 $id_scan_state = EMPTY_STRING; # that's all
8203 0         0 $i = $i_save;
8204             }
8205             else {
8206 12         22 $identifier .= $tok;
8207             }
8208             }
8209             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8210 0         0 $id_scan_state = $scan_state_LPAREN;
8211 0         0 $identifier .= $tok;
8212             }
8213             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8214 0         0 $id_scan_state = $scan_state_RPAREN;
8215 0         0 $identifier .= $tok;
8216             }
8217             else {
8218 305         579 $id_scan_state = EMPTY_STRING; # that's all
8219 305         526 $i = $i_save;
8220             }
8221 434         687 return;
8222             } ## end sub do_id_scan_state_colon
8223              
8224             sub do_id_scan_state_left_paren {
8225              
8226 0     0 0 0 my $self = shift;
8227              
8228             # looking for possible '(' of a prototype
8229              
8230 0 0       0 if ( $tok eq '(' ) { # got it
    0          
8231 0         0 $identifier .= $tok;
8232 0         0 $id_scan_state = $scan_state_RPAREN; # now find the end of it
8233             }
8234             elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
8235 0         0 $identifier .= $tok;
8236 0         0 $tok_is_blank = 1;
8237             }
8238             else {
8239 0         0 $id_scan_state = EMPTY_STRING; # that's all - no prototype
8240 0         0 $i = $i_save;
8241             }
8242 0         0 return;
8243             } ## end sub do_id_scan_state_left_paren
8244              
8245             sub do_id_scan_state_right_paren {
8246              
8247 0     0 0 0 my $self = shift;
8248              
8249             # looking for a ')' of prototype to close a '('
8250              
8251 0         0 $tok_is_blank = $tok =~ /^\s*$/;
8252              
8253 0 0       0 if ( $tok eq ')' ) { # got it
    0          
8254 0         0 $identifier .= $tok;
8255 0         0 $id_scan_state = EMPTY_STRING; # all done
8256             }
8257             elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
8258 0         0 $identifier .= $tok;
8259             }
8260             else { # probable error in script, but keep going
8261 0         0 warning("Unexpected '$tok' while seeking end of prototype\n");
8262 0         0 $identifier .= $tok;
8263             }
8264 0         0 return;
8265             } ## end sub do_id_scan_state_right_paren
8266              
8267             sub do_id_scan_state_ampersand {
8268              
8269 105     105 0 267 my $self = shift;
8270              
8271             # Starting sub call after seeing an '&'
8272              
8273 105 100 33     705 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
    50          
    100          
    50          
    50          
    0          
8274 88         209 $id_scan_state = $scan_state_COLON; # now need ::
8275 88         164 $saw_alpha = 1;
8276 88         200 $identifier .= $tok;
8277             }
8278             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
8279 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8280 0         0 $saw_alpha = 1;
8281 0         0 $identifier .= $tok;
8282             }
8283             elsif ( $tok =~ /^\s*$/ ) { # allow space
8284 2         5 $tok_is_blank = 1;
8285              
8286             # fix c139: trim line-ending type 't'
8287 2 50 33     14 if ( length($identifier) == 1 && $i == $max_token_index ) {
8288 2         3 $i = $i_save;
8289 2         7 $type = 't';
8290             }
8291             }
8292             elsif ( $tok eq '::' ) { # leading ::
8293 0         0 $id_scan_state = $scan_state_ALPHA; # accept alpha next
8294 0         0 $identifier .= $tok;
8295             }
8296             elsif ( $tok eq '{' ) {
8297 15 50 33     74 if ( $identifier eq '&' || $i == 0 ) {
8298 15         33 $identifier = EMPTY_STRING;
8299             }
8300 15         39 $i = $i_save;
8301 15         28 $id_scan_state = EMPTY_STRING;
8302             }
8303             elsif ( $tok eq '^' ) {
8304 0 0       0 if ( $identifier eq '&' ) {
8305              
8306             # Special variable (c066)
8307 0         0 $identifier .= $tok;
8308 0         0 $type = '&';
8309              
8310             # There may be one more character, not a space, after the ^
8311 0         0 my $next1 = $rtokens->[ $i + 1 ];
8312 0         0 my $chr = substr( $next1, 0, 1 );
8313 0 0       0 if ( $is_special_variable_char{$chr} ) {
8314              
8315             # It is something like &^O
8316 0         0 $i++;
8317 0         0 $identifier .= $next1;
8318              
8319             # If pretoken $next1 is more than one character long,
8320             # set a flag indicating that it needs to be split.
8321 0 0       0 $id_scan_state =
8322             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8323             }
8324             else {
8325              
8326             # it is &^
8327 0         0 $id_scan_state = EMPTY_STRING;
8328             }
8329             }
8330             else {
8331 0         0 $identifier = EMPTY_STRING;
8332 0         0 $i = $i_save;
8333             }
8334             }
8335             else {
8336              
8337             # punctuation variable?
8338             # testfile: cunningham4.pl
8339             #
8340             # We have to be careful here. If we are in an unknown state,
8341             # we will reject the punctuation variable. In the following
8342             # example the '&' is a binary operator but we are in an unknown
8343             # state because there is no sigil on 'Prima', so we don't
8344             # know what it is. But it is a bad guess that
8345             # '&~' is a function variable.
8346             # $self->{text}->{colorMap}->[
8347             # Prima::PodView::COLOR_CODE_FOREGROUND
8348             # & ~tb::COLOR_INDEX ] =
8349             # $sec->{ColorCode}
8350              
8351             # Fix for case c033: a '#' here starts a side comment
8352 0 0 0     0 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
      0        
8353 0         0 $identifier .= $tok;
8354             }
8355             else {
8356 0         0 $identifier = EMPTY_STRING;
8357 0         0 $i = $i_save;
8358 0         0 $type = '&';
8359             }
8360 0         0 $id_scan_state = EMPTY_STRING;
8361             }
8362 105         197 return;
8363             } ## end sub do_id_scan_state_ampersand
8364              
8365             #-------------------
8366             # hash of scanner subs
8367             #-------------------
8368             my $scan_identifier_code = {
8369             $scan_state_SIGIL => \&do_id_scan_state_dollar,
8370             $scan_state_ALPHA => \&do_id_scan_state_alpha,
8371             $scan_state_COLON => \&do_id_scan_state_colon,
8372             $scan_state_LPAREN => \&do_id_scan_state_left_paren,
8373             $scan_state_RPAREN => \&do_id_scan_state_right_paren,
8374             $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8375             };
8376              
8377             sub scan_complex_identifier {
8378              
8379             # This routine assembles tokens into identifiers. It maintains a
8380             # scan state, id_scan_state. It updates id_scan_state based upon
8381             # current id_scan_state and token, and returns an updated
8382             # id_scan_state and the next index after the identifier.
8383              
8384             # This routine now serves a a backup for sub scan_simple_identifier
8385             # which handles most identifiers.
8386              
8387             # Note that $self must be a 'my' variable and not be a closure
8388             # variables like the other args. Otherwise it will not get
8389             # deleted by a DESTROY call at the end of a file. Then an
8390             # attempt to create multiple tokenizers can occur when multiple
8391             # files are processed, causing an error.
8392              
8393             (
8394 486     486 0 1753 my $self, $i, $id_scan_state, $identifier, $rtokens,
8395             $max_token_index, $expecting, $container_type
8396             ) = @_;
8397              
8398             # return flag telling caller to split the pretoken
8399 486         2534 my $split_pretoken_flag;
8400              
8401             #-------------------
8402             # Initialize my vars
8403             #-------------------
8404              
8405 486         1742 initialize_my_scan_id_vars();
8406              
8407             #--------------------------------------------------------
8408             # get started by defining a type and a state if necessary
8409             #--------------------------------------------------------
8410              
8411 486 100       1177 if ( !$id_scan_state ) {
8412 479         776 $context = UNKNOWN_CONTEXT;
8413              
8414             # fixup for digraph
8415 479 50       1314 if ( $tok eq '>' ) {
8416 0         0 $tok = '->';
8417 0         0 $tok_begin = $tok;
8418             }
8419 479         826 $identifier = $tok;
8420              
8421 479 100 100     3072 if ( $last_nonblank_token eq '->' ) {
    100 100        
    100 0        
    50          
    0          
    0          
    0          
    0          
8422 6         15 $identifier = '->' . $identifier;
8423 6         13 $id_scan_state = $scan_state_SIGIL;
8424             }
8425             elsif ( $tok eq '$' || $tok eq '*' ) {
8426 293         535 $id_scan_state = $scan_state_SIGIL;
8427 293         571 $context = SCALAR_CONTEXT;
8428             }
8429             elsif ( $tok eq '%' || $tok eq '@' ) {
8430 77         186 $id_scan_state = $scan_state_SIGIL;
8431 77         159 $context = LIST_CONTEXT;
8432             }
8433             elsif ( $tok eq '&' ) {
8434 103         241 $id_scan_state = $scan_state_AMPERSAND;
8435             }
8436             elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8437 0         0 $saw_alpha = 0; # 'sub' is considered type info here
8438 0         0 $id_scan_state = $scan_state_SIGIL;
8439 0         0 $identifier .=
8440             SPACE; # need a space to separate sub from sub name
8441             }
8442             elsif ( $tok eq '::' ) {
8443 0         0 $id_scan_state = $scan_state_ALPHA;
8444             }
8445             elsif ( $tok =~ /^\w/ ) {
8446 0         0 $id_scan_state = $scan_state_COLON;
8447 0         0 $saw_alpha = 1;
8448             }
8449             elsif ( $tok eq '->' ) {
8450 0         0 $id_scan_state = $scan_state_SIGIL;
8451             }
8452             else {
8453              
8454             # shouldn't happen: bad call parameter
8455 0         0 my $msg =
8456             "Program bug detected: scan_complex_identifier received bad starting token = '$tok'\n";
8457 0         0 if (DEVEL_MODE) { $self->Fault($msg) }
8458 0 0       0 if ( !$self->[_in_error_] ) {
8459 0         0 warning($msg);
8460 0         0 $self->[_in_error_] = 1;
8461             }
8462 0         0 $id_scan_state = EMPTY_STRING;
8463              
8464             # emergency return
8465 0         0 goto RETURN;
8466             }
8467 479         908 $saw_type = !$saw_alpha;
8468             }
8469             else {
8470 7         19 $i--;
8471 7         34 $saw_alpha = ( $tok =~ /^\w/ );
8472 7         20 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8473              
8474             # check for a valid starting state
8475 7         11 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8476             $self->Fault(<<EOM);
8477             Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8478             EOM
8479             }
8480             }
8481              
8482             #------------------------------
8483             # loop to gather the identifier
8484             #------------------------------
8485              
8486 486         844 $i_save = $i;
8487              
8488 486   100     2118 while ( $i < $max_token_index && $id_scan_state ) {
8489              
8490             # Be sure we have code to handle this state before we proceed
8491 1169         2392 my $code = $scan_identifier_code->{$id_scan_state};
8492 1169 100       2391 if ( !$code ) {
8493              
8494 3 50       12 if ( $id_scan_state eq $scan_state_SPLIT ) {
8495             ## OK: this is the signal to exit and split the pretoken
8496             }
8497              
8498             # unknown state - should not happen
8499             else {
8500 0         0 if (DEVEL_MODE) {
8501             $self->Fault(<<EOM);
8502             Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8503             Scan state at sub entry was '$id_scan_state_begin'
8504             EOM
8505             }
8506 0         0 $id_scan_state = EMPTY_STRING;
8507 0         0 $i = $i_save;
8508             }
8509 3         5 last;
8510             }
8511              
8512             # Remember the starting index for progress check below
8513 1166         1627 my $i_start_loop = $i;
8514              
8515 1166         1613 $last_tok_is_blank = $tok_is_blank;
8516 1166 100       1987 if ($tok_is_blank) { $tok_is_blank = undef }
  11         24  
8517 1155         1579 else { $i_save = $i }
8518              
8519 1166         1947 $tok = $rtokens->[ ++$i ];
8520              
8521             # patch to make digraph :: if necessary
8522 1166 100 100     2942 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8523 113         244 $tok = '::';
8524 113         203 $i++;
8525             }
8526              
8527 1166         3480 $code->($self);
8528              
8529             # check for forward progress: a decrease in the index $i
8530             # implies that scanning has finished
8531 1166 100       3807 last if ( $i <= $i_start_loop );
8532              
8533             } ## end of main loop
8534              
8535             #-------------
8536             # Check result
8537             #-------------
8538              
8539             # Be sure a valid state is returned
8540 486 100       1367 if ($id_scan_state) {
8541              
8542 20 100       89 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8543              
8544 13 100       56 if ( $id_scan_state eq $scan_state_SPLIT ) {
8545 3         7 $split_pretoken_flag = 1;
8546             }
8547              
8548 13 50       56 if ( $id_scan_state eq $scan_state_RPAREN ) {
8549 0         0 warning(
8550             "Hit end of line while seeking ) to end prototype\n");
8551             }
8552              
8553 13         29 $id_scan_state = EMPTY_STRING;
8554             }
8555              
8556             # Patch: the deprecated variable $# does not combine with anything
8557             # on the next line.
8558 20 50       73 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
  0         0  
8559             }
8560              
8561             # Be sure the token index is valid
8562 486 50       1331 if ( $i < 0 ) { $i = 0 }
  0         0  
8563              
8564             # Be sure a token type is defined
8565 486 100       1246 if ( !$type ) {
8566              
8567 458 100       1088 if ($saw_type) {
    100          
8568              
8569 452 100 33     2118 if ($saw_alpha) {
    50 66        
    100 66        
      33        
8570              
8571             # The type without the -> should be the same as with the -> so
8572             # that if they get separated we get the same bond strengths,
8573             # etc. See b1234
8574 348 50 33     1359 if ( $identifier =~ /^->/
      33        
8575             && $last_nonblank_type eq 'w'
8576             && substr( $identifier, 2, 1 ) =~ /^\w/ )
8577             {
8578 0         0 $type = 'w';
8579             }
8580 348         746 else { $type = 'i' }
8581             }
8582             elsif ( $identifier eq '->' ) {
8583 0         0 $type = '->';
8584             }
8585             elsif (
8586             ( length($identifier) > 1 )
8587              
8588             # In something like '@$=' we have an identifier '@$'
8589             # In something like '$${' we have type '$$' (and only
8590             # part of an identifier)
8591             && !( $identifier =~ /\$$/ && $tok eq '{' )
8592              
8593             ## && ( $identifier !~ /^(sub |package )$/ )
8594             && $identifier ne 'sub '
8595             && $identifier ne 'package '
8596             )
8597             {
8598 53         187 $type = 'i';
8599             }
8600 51         104 else { $type = 't' }
8601             }
8602             elsif ($saw_alpha) {
8603              
8604             # type 'w' includes anything without leading type info
8605             # ($,%,@,*) including something like abc::def::ghi
8606 5         9 $type = 'w';
8607              
8608             # Fix for b1337, if restarting scan after line break between
8609             # '->' or sigil and identifier name, use type 'i'
8610 5 50 33     35 if ( $id_scan_state_begin
8611             && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8612             {
8613 5         10 $type = 'i';
8614             }
8615             }
8616             else {
8617 1         3 $type = EMPTY_STRING;
8618             } # this can happen on a restart
8619             }
8620              
8621             # See if we formed an identifier...
8622 486 100       1171 if ($identifier) {
8623 444         796 $tok = $identifier;
8624 444 100       1113 if ($message) { $self->write_logfile_entry($message) }
  1         8  
8625             }
8626              
8627             # did not find an identifier, back up
8628             else {
8629 42         90 $tok = $tok_begin;
8630 42         87 $i = $i_begin;
8631             }
8632              
8633             RETURN:
8634              
8635 486         746 DEBUG_SCAN_ID && do {
8636             my ( $a, $b, $c ) = caller;
8637             print {*STDOUT}
8638             "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8639             print {*STDOUT}
8640             "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8641             };
8642 486         2283 return ( $i, $tok, $type, $id_scan_state, $identifier,
8643             $split_pretoken_flag );
8644             } ## end sub scan_complex_identifier
8645             } ## end closure for sub scan_complex_identifier
8646              
8647             { ## closure for sub do_scan_sub
8648              
8649             my %warn_if_lexical;
8650              
8651             BEGIN {
8652              
8653             # lexical subs with these names can cause parsing errors in this version
8654 39     39   324 my @q = qw( m q qq qr qw qx s tr y );
8655 39         3504 @{warn_if_lexical}{@q} = (1) x scalar(@q);
8656             } ## end BEGIN
8657              
8658             # saved package and subnames in case prototype is on separate line
8659             my ( $package_saved, $subname_saved );
8660              
8661             # initialize subname each time a new 'sub' keyword is encountered
8662             sub initialize_subname {
8663 295     295 0 664 $package_saved = EMPTY_STRING;
8664 295         597 $subname_saved = EMPTY_STRING;
8665 295         521 return;
8666             }
8667              
8668             use constant {
8669 39         92987 SUB_CALL => 1,
8670             PAREN_CALL => 2,
8671             PROTOTYPE_CALL => 3,
8672 39     39   461 };
  39         128  
8673              
8674             sub do_scan_sub {
8675              
8676             # do_scan_sub parses a sub name and prototype.
8677              
8678             # At present there are three basic CALL TYPES which are
8679             # distinguished by the starting value of '$tok':
8680             # 1. $tok='sub', id_scan_state='sub'
8681             # it is called with $i_beg equal to the index of the first nonblank
8682             # token following a 'sub' token.
8683             # 2. $tok='(', id_scan_state='sub',
8684             # it is called with $i_beg equal to the index of a '(' which may
8685             # start a prototype.
8686             # 3. $tok='prototype', id_scan_state='prototype'
8687             # it is called with $i_beg equal to the index of a '(' which is
8688             # preceded by ': prototype' and has $id_scan_state eq 'prototype'
8689              
8690             # Examples:
8691              
8692             # A single type 1 call will get both the sub and prototype
8693             # sub foo1 ( $$ ) { }
8694             # ^
8695              
8696             # The subname will be obtained with a 'sub' call
8697             # The prototype on line 2 will be obtained with a '(' call
8698             # sub foo1
8699             # ^ <---call type 1
8700             # ( $$ ) { }
8701             # ^ <---call type 2
8702              
8703             # The subname will be obtained with a 'sub' call
8704             # The prototype will be obtained with a 'prototype' call
8705             # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8706             # ^ <---type 1 ^ <---type 3
8707              
8708             # TODO: add future error checks to be sure we have a valid
8709             # sub name. For example, 'sub &doit' is wrong. Also, be sure
8710             # a name is given if and only if a non-anonymous sub is
8711             # appropriate.
8712             # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8713             # $rsaw_function_definition,
8714             # $statement_type
8715              
8716 301     301 0 848 my ( $self, $rinput_hash ) = @_;
8717              
8718 301         708 my $input_line = $rinput_hash->{input_line};
8719 301         645 my $i = $rinput_hash->{i};
8720 301         577 my $i_beg = $rinput_hash->{i_beg};
8721 301         635 my $tok = $rinput_hash->{tok};
8722 301         631 my $type = $rinput_hash->{type};
8723 301         587 my $rtokens = $rinput_hash->{rtokens};
8724 301         548 my $rtoken_map = $rinput_hash->{rtoken_map};
8725 301         1567 my $id_scan_state = $rinput_hash->{id_scan_state};
8726 301         614 my $max_token_index = $rinput_hash->{max_token_index};
8727              
8728 301         518 my $i_entry = $i;
8729              
8730             # Determine the CALL TYPE
8731             # 1=sub
8732             # 2=(
8733             # 3=prototype
8734 301 100       1085 my $call_type =
    100          
8735             $tok eq 'prototype' ? PROTOTYPE_CALL
8736             : $tok eq '(' ? PAREN_CALL
8737             : SUB_CALL;
8738              
8739 301         530 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
8740 301         584 my $subname = $subname_saved;
8741 301         535 my $package = $package_saved;
8742 301         570 my $proto = undef;
8743 301         522 my $attrs = undef;
8744 301         518 my $match;
8745              
8746 301         603 my $pos_beg = $rtoken_map->[$i_beg];
8747 301         972 pos($input_line) = $pos_beg;
8748              
8749             # Look for the sub NAME if this is a SUB call
8750 301 100 100     2785 if (
8751             $call_type == SUB_CALL
8752             && $input_line =~ m{\G\s*
8753             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
8754             (\w+) # NAME - required
8755             }gcx
8756             )
8757             {
8758 122         307 $match = 1;
8759 122         336 $subname = $2;
8760              
8761 122   33     479 my $is_lexical_sub =
8762             $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8763 122 0 33     434 if ( $is_lexical_sub && $1 ) {
8764 0         0 $self->warning("'my' sub $subname cannot be in package '$1'\n");
8765 0         0 $is_lexical_sub = 0;
8766             }
8767              
8768 122 50       362 if ($is_lexical_sub) {
8769              
8770             # lexical subs use the block sequence number as a package name
8771 0         0 my $seqno =
8772             $rcurrent_sequence_number->[BRACE]
8773             [ $rcurrent_depth->[BRACE] ];
8774 0 0       0 $seqno = 1 if ( !defined($seqno) );
8775 0         0 $package = $seqno;
8776 0 0       0 if ( $warn_if_lexical{$subname} ) {
8777 0         0 $self->warning(
8778             "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8779             );
8780              
8781             # This may end badly, it is safest to block formatting
8782             # For an example, see perl527/lexsub.t (issue c203)
8783 0         0 $self->[_in_trouble_] = 1;
8784             }
8785             }
8786             else {
8787 122 100 66     825 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8788 122         400 $package =~ s/\'/::/g;
8789 122 50       440 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
8790 122         317 $package =~ s/::$//;
8791             }
8792              
8793 122         284 my $pos = pos($input_line);
8794 122         287 my $numc = $pos - $pos_beg;
8795 122         403 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
8796 122         300 $type = 'S'; ## Fix for c250, was 'i';
8797              
8798             # remember the sub name in case another call is needed to
8799             # get the prototype
8800 122         251 $package_saved = $package;
8801 122         247 $subname_saved = $subname;
8802             }
8803              
8804             # Now look for PROTO ATTRS for all call types
8805             # Look for prototype/attributes which are usually on the same
8806             # line as the sub name but which might be on a separate line.
8807             # For example, we might have an anonymous sub with attributes,
8808             # or a prototype on a separate line from its sub name
8809              
8810             # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8811             # does not look like a prototype, we assume it is a SIGNATURE and we
8812             # will stop and let the the standard tokenizer handle it. In
8813             # particular, we stop if we see any nested parens, braces, or commas.
8814             # Also note, a valid prototype cannot contain any alphabetic character
8815             # -- see https://perldoc.perl.org/perlsub
8816             # But it appears that an underscore is valid in a prototype, so the
8817             # regex below uses [A-Za-z] rather than \w
8818             # This is the old regex which has been replaced:
8819             # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
8820 301         1109 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8821 301 100 100     2946 if (
      66        
8822             $input_line =~ m{\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
8823             (\s*:)? # ATTRS leading ':'
8824             }gcx
8825             && ( $1 || $2 )
8826             )
8827             {
8828 45         117 $proto = $1;
8829 45         89 $attrs = $2;
8830              
8831             # Append the prototype to the starting token if it is 'sub' or
8832             # 'prototype'. This is not necessary but for compatibility with
8833             # previous versions when the -csc flag is used:
8834 45 100 100     326 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
    100 100        
8835 24         64 $tok .= $proto;
8836             }
8837              
8838             # If we just entered the sub at an opening paren on this call, not
8839             # a following :prototype, label it with the previous token. This is
8840             # necessary to propagate the sub name to its opening block.
8841             elsif ( $call_type == PAREN_CALL ) {
8842 2         5 $tok = $last_nonblank_token;
8843             }
8844             else {
8845             }
8846              
8847 45   100     167 $match ||= 1;
8848              
8849             # Patch part #1 to fixes cases b994 and b1053:
8850             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8851             # 'sub : lvalue { ...'
8852 45         94 $type = 'S'; ## C250, was 'i';
8853 45 100 100     209 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
  2         6  
8854             }
8855              
8856 301 100       866 if ($match) {
8857              
8858             # ATTRS: if there are attributes, back up and let the ':' be
8859             # found later by the scanner.
8860 137         299 my $pos = pos($input_line);
8861 137 100       398 if ($attrs) {
8862 15         33 $pos -= length($attrs);
8863             }
8864              
8865 137         303 my $next_nonblank_token = $tok;
8866              
8867             # catch case of line with leading ATTR ':' after anonymous sub
8868 137 100 100     545 if ( $pos == $pos_beg && $tok eq ':' ) {
8869 1         2 $type = 'A';
8870 1         3 $self->[_in_attribute_list_] = 1;
8871             }
8872              
8873             # Otherwise, if we found a match we must convert back from
8874             # string position to the pre_token index for continued parsing.
8875             else {
8876              
8877             # I don't think an error flag can occur here ..but ?
8878 136         245 my $error;
8879 136         539 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8880             $max_token_index );
8881 136 50       486 if ($error) { $self->warning("Possibly invalid sub\n") }
  0         0  
8882              
8883             # Patch part #2 to fixes cases b994 and b1053:
8884             # Do not let spaces be part of the token of an anonymous sub
8885             # keyword which we marked as type 'k' above...i.e. for
8886             # something like:
8887             # 'sub : lvalue { ...'
8888             # Back up and let it be parsed as a blank
8889 136 50 66     606 if ( $type eq 'k'
      66        
      33        
8890             && $attrs
8891             && $i > $i_entry
8892             && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8893             {
8894 2         6 $i--;
8895             }
8896              
8897             # check for multiple definitions of a sub
8898 136         404 ( $next_nonblank_token, my $i_next ) =
8899             find_next_nonblank_token_on_this_line( $i, $rtokens,
8900             $max_token_index );
8901             }
8902              
8903 137 100       785 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8904             { # skip blank or side comment
8905 7         79 my ( $rpre_tokens, $rpre_types ) =
8906             $self->peek_ahead_for_n_nonblank_pre_tokens(1);
8907 7 50 33     45 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
  7         34  
8908 7         37 $next_nonblank_token = $rpre_tokens->[0];
8909             }
8910             else {
8911 0         0 $next_nonblank_token = '}';
8912             }
8913             }
8914              
8915             # See what's next...
8916 137 100       667 if ( $next_nonblank_token eq '{' ) {
    100          
    50          
    100          
    50          
    0          
8917 105 100       325 if ($subname) {
8918              
8919             # Check for multiple definitions of a sub, but
8920             # it is ok to have multiple sub BEGIN, etc,
8921             # so we do not complain if name is all caps
8922 95 50 33     556 if ( $rsaw_function_definition->{$subname}{$package}
8923             && $subname !~ /^[A-Z]+$/ )
8924             {
8925             my $lno =
8926 0         0 $rsaw_function_definition->{$subname}{$package};
8927 0 0       0 if ( $package =~ /^\d/ ) {
8928 0         0 $self->warning(
8929             "already saw definition of lexical 'sub $subname' at line $lno\n"
8930             );
8931              
8932             }
8933             else {
8934 0         0 if ( !DEVEL_MODE ) {
8935 0         0 $self->warning(
8936             "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8937             );
8938             }
8939             }
8940             }
8941 95         380 $rsaw_function_definition->{$subname}{$package} =
8942             $self->[_last_line_number_];
8943             }
8944             }
8945             elsif ( $next_nonblank_token eq ';' ) {
8946             }
8947             elsif ( $next_nonblank_token eq '}' ) {
8948             }
8949              
8950             # ATTRS - if an attribute list follows, remember the name
8951             # of the sub so the next opening brace can be labeled.
8952             # Setting 'statement_type' causes any ':'s to introduce
8953             # attributes.
8954             elsif ( $next_nonblank_token eq ':' ) {
8955 16 100       60 if ( $call_type == SUB_CALL ) {
8956 14 100       70 $statement_type =
8957             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8958             }
8959             }
8960              
8961             # if we stopped before an open paren ...
8962             elsif ( $next_nonblank_token eq '(' ) {
8963              
8964             # If we DID NOT see this paren above then it must be on the
8965             # next line so we will set a flag to come back here and see if
8966             # it is a PROTOTYPE
8967              
8968             # Otherwise, we assume it is a SIGNATURE rather than a
8969             # PROTOTYPE and let the normal tokenizer handle it as a list
8970 15 100       53 if ( !$saw_opening_paren ) {
8971 4         966 $id_scan_state = 'sub'; # we must come back to get proto
8972             }
8973 15 50       60 if ( $call_type == SUB_CALL ) {
8974 15 50       64 $statement_type =
8975             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8976             }
8977             }
8978              
8979             # something else..
8980             elsif ($next_nonblank_token) {
8981              
8982 0 0 0     0 if ( $rinput_hash->{tok} eq 'method' && $call_type == SUB_CALL )
8983             {
8984             # For a method call, silently ignore this error (rt145706)
8985             # to avoid needless warnings. Example which can produce it:
8986             # test(method Pack (), "method");
8987              
8988             # TODO: scan for use feature 'class' and:
8989             # - if we saw 'use feature 'class' then issue the warning.
8990             # - if we did not see use feature 'class' then issue the
8991             # warning and suggest turning off --use-feature=class
8992             }
8993             else {
8994 0 0       0 $subname = EMPTY_STRING unless defined($subname);
8995 0         0 $self->warning(
8996             "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8997             );
8998             }
8999             }
9000              
9001             # EOF technically ok
9002             else {
9003             }
9004              
9005 137         478 check_prototype( $proto, $package, $subname );
9006             }
9007              
9008             # no match to either sub name or prototype, but line not blank
9009             else {
9010              
9011             }
9012 301         1857 return ( $i, $tok, $type, $id_scan_state );
9013             } ## end sub do_scan_sub
9014             }
9015              
9016             #########################################################################
9017             # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
9018             #########################################################################
9019              
9020             sub find_next_nonblank_token {
9021 6160     6160 0 12190 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9022              
9023             # Returns the next nonblank token after the token at index $i
9024             # To skip past a side comment, and any subsequent block comments
9025             # and blank lines, call with i=$max_token_index
9026              
9027             # Skip any ending blank (fix c258). It would be cleaner if caller passed
9028             # $rtoken_map, so we could check for type 'b', and avoid a regex test, but
9029             # benchmarking shows that this test does not take significant time. So
9030             # that would be a nice update but not essential. Also note that ending
9031             # blanks will not occur for text previously processed by perltidy.
9032 6160 100 100     19364 if ( $i == $max_token_index - 1
9033             && $rtokens->[$max_token_index] =~ /^\s+$/ )
9034             {
9035 9         26 $i++;
9036             }
9037              
9038 6160 100       12811 if ( $i >= $max_token_index ) {
9039 127 100       714 if ( !peeked_ahead() ) {
9040 125         431 peeked_ahead(1);
9041 125         796 $self->peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
9042             }
9043             }
9044              
9045 6160         10772 my $next_nonblank_token = $rtokens->[ ++$i ];
9046 6160 50 33     21916 return ( SPACE, $i )
9047             if ( !defined($next_nonblank_token) || !length($next_nonblank_token) );
9048              
9049             # Quick test for nonblank ascii char. Note that we just have to
9050             # examine the first character here.
9051 6160         12367 my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
9052 6160 100 66     24256 if ( $ord >= ORD_PRINTABLE_MIN
    50 33        
    0          
9053             && $ord <= ORD_PRINTABLE_MAX )
9054             {
9055 2353         8029 return ( $next_nonblank_token, $i );
9056             }
9057              
9058             # Quick test to skip over an ascii space or tab
9059             elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
9060 3807         7029 $next_nonblank_token = $rtokens->[ ++$i ];
9061 3807 50       8178 return ( SPACE, $i ) unless defined($next_nonblank_token);
9062             }
9063              
9064             # Slow test to skip over something else identified as whitespace
9065             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9066 0         0 $next_nonblank_token = $rtokens->[ ++$i ];
9067 0 0       0 return ( SPACE, $i ) unless defined($next_nonblank_token);
9068             }
9069             else {
9070             ## at nonblank
9071             }
9072              
9073             # We should be at a nonblank now
9074 3807         11932 return ( $next_nonblank_token, $i );
9075             } ## end sub find_next_nonblank_token
9076              
9077             sub find_next_noncomment_token {
9078 98     98 0 351 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9079              
9080             # Given the current character position, look ahead past any comments
9081             # and blank lines and return the next token, including digraphs and
9082             # trigraphs.
9083              
9084 98         369 my ( $next_nonblank_token, $i_next ) =
9085             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9086              
9087             # skip past any side comment
9088 98 50       580 if ( $next_nonblank_token eq '#' ) {
9089 0         0 ( $next_nonblank_token, $i_next ) =
9090             $self->find_next_nonblank_token( $i_next, $rtokens,
9091             $max_token_index );
9092             }
9093              
9094             # check for a digraph
9095 98 50 33     873 if ( $next_nonblank_token
      33        
9096             && $next_nonblank_token ne SPACE
9097             && defined( $rtokens->[ $i_next + 1 ] ) )
9098             {
9099 98         307 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9100 98 100       425 if ( $is_digraph{$test2} ) {
9101 15         73 $next_nonblank_token = $test2;
9102 15         51 $i_next = $i_next + 1;
9103              
9104             # check for a trigraph
9105 15 50       72 if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
9106 15         52 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9107 15 50       69 if ( $is_trigraph{$test3} ) {
9108 0         0 $next_nonblank_token = $test3;
9109 0         0 $i_next = $i_next + 1;
9110             }
9111             }
9112             }
9113             }
9114              
9115 98         327 return ( $next_nonblank_token, $i_next );
9116             } ## end sub find_next_noncomment_token
9117              
9118             sub is_possible_numerator {
9119              
9120             # Look at the next non-comment character and decide if it could be a
9121             # numerator. Return
9122             # 1 - yes
9123             # 0 - can't tell
9124             # -1 - no
9125              
9126 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9127 0         0 my $is_possible_numerator = 0;
9128              
9129 0         0 my $next_token = $rtokens->[ $i + 1 ];
9130 0 0       0 if ( $next_token eq '=' ) { $i++; } # handle /=
  0         0  
9131 0         0 my ( $next_nonblank_token, $i_next ) =
9132             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9133              
9134 0 0       0 if ( $next_nonblank_token eq '#' ) {
9135 0         0 ( $next_nonblank_token, $i_next ) =
9136             $self->find_next_nonblank_token( $max_token_index, $rtokens,
9137             $max_token_index );
9138             }
9139              
9140 0 0       0 if ( $next_nonblank_token =~ / [ \( \$ \w \. \@ ] /x ) {
    0          
9141 0         0 $is_possible_numerator = 1;
9142             }
9143             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9144 0         0 $is_possible_numerator = 0;
9145             }
9146             else {
9147 0         0 $is_possible_numerator = -1;
9148             }
9149              
9150 0         0 return $is_possible_numerator;
9151             } ## end sub is_possible_numerator
9152              
9153             { ## closure for sub pattern_expected
9154             my %pattern_test;
9155              
9156             BEGIN {
9157              
9158             # List of tokens which may follow a pattern. Note that we will not
9159             # have formed digraphs at this point, so we will see '&' instead of
9160             # '&&' and '|' instead of '||'
9161              
9162             # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
9163 39     39   307 my @q = qw( & && | || ? : + - * and or while if unless);
9164 39         152 push @q, ')', '}', ']', '>', ',', ';';
9165 39         182620 @{pattern_test}{@q} = (1) x scalar(@q);
9166             } ## end BEGIN
9167              
9168             sub pattern_expected {
9169              
9170             # This a filter for a possible pattern.
9171             # It looks at the token after a possible pattern and tries to
9172             # determine if that token could end a pattern.
9173             # returns -
9174             # 1 - yes
9175             # 0 - can't tell
9176             # -1 - no
9177 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9178 0         0 my $is_pattern = 0;
9179              
9180 0         0 my $next_token = $rtokens->[ $i + 1 ];
9181 0 0       0 if ( $next_token =~ /^[msixpodualgc]/ ) {
9182 0         0 $i++;
9183             } # skip possible modifier
9184 0         0 my ( $next_nonblank_token, $i_next ) =
9185             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9186              
9187 0 0       0 if ( $pattern_test{$next_nonblank_token} ) {
9188 0         0 $is_pattern = 1;
9189             }
9190             else {
9191              
9192             # Added '#' to fix issue c044
9193 0 0 0     0 if ( $next_nonblank_token =~ /^\s*$/
9194             || $next_nonblank_token eq '#' )
9195             {
9196 0         0 $is_pattern = 0;
9197             }
9198             else {
9199 0         0 $is_pattern = -1;
9200             }
9201             }
9202 0         0 return $is_pattern;
9203             } ## end sub pattern_expected
9204             }
9205              
9206             sub find_next_nonblank_token_on_this_line {
9207 455     455 0 1081 my ( $i, $rtokens, $max_token_index ) = @_;
9208 455         820 my $next_nonblank_token;
9209              
9210 455 100       1158 if ( $i < $max_token_index ) {
9211 447         1060 $next_nonblank_token = $rtokens->[ ++$i ];
9212              
9213 447 100       1987 if ( $next_nonblank_token =~ /^\s*$/ ) {
9214              
9215 120 100       522 if ( $i < $max_token_index ) {
9216 118         321 $next_nonblank_token = $rtokens->[ ++$i ];
9217             }
9218             }
9219             }
9220             else {
9221 8         33 $next_nonblank_token = EMPTY_STRING;
9222             }
9223 455         1449 return ( $next_nonblank_token, $i );
9224             } ## end sub find_next_nonblank_token_on_this_line
9225              
9226             sub find_angle_operator_termination {
9227              
9228             # We are looking at a '<' and want to know if it is an angle operator.
9229             # We are to return:
9230             # $i = pretoken index of ending '>' if found, current $i otherwise
9231             # $type = 'Q' if found, '>' otherwise
9232 8     8 0 34 my ( $self, $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index )
9233             = @_;
9234 8         20 my $i = $i_beg;
9235 8         17 my $type = '<';
9236 8         39 pos($input_line) = 1 + $rtoken_map->[$i];
9237              
9238 8         20 my $filter;
9239              
9240             # we just have to find the next '>' if a term is expected
9241 8 100       33 if ( $expecting == TERM ) { $filter = '[\>]' }
  6 50       18  
9242              
9243             # we have to guess if we don't know what is expected
9244 2         5 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
9245              
9246             # shouldn't happen - we shouldn't be here if operator is expected
9247             else {
9248 0         0 if (DEVEL_MODE) {
9249             $self->Fault(<<EOM);
9250             Bad call to find_angle_operator_termination
9251             EOM
9252             }
9253 0         0 return ( $i, $type );
9254             }
9255              
9256             # To illustrate what we might be looking at, in case we are
9257             # guessing, here are some examples of valid angle operators
9258             # (or file globs):
9259             # <tmp_imp/*>
9260             # <FH>
9261             # <$fh>
9262             # <*.c *.h>
9263             # <_>
9264             # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
9265             # <${PREFIX}*img*.$IMAGE_TYPE>
9266             # <img*.$IMAGE_TYPE>
9267             # <Timg*.$IMAGE_TYPE>
9268             # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
9269             #
9270             # Here are some examples of lines which do not have angle operators:
9271             # return unless $self->[2]++ < $#{$self->[1]};
9272             # < 2 || @$t >
9273             #
9274             # the following line from dlister.pl caused trouble:
9275             # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
9276             #
9277             # If the '<' starts an angle operator, it must end on this line and
9278             # it must not have certain characters like ';' and '=' in it. I use
9279             # this to limit the testing. This filter should be improved if
9280             # possible.
9281              
9282 8 50       213 if ( $input_line =~ /($filter)/g ) {
9283              
9284 8 50       44 if ( $1 eq '>' ) {
9285              
9286             # We MAY have found an angle operator termination if we get
9287             # here, but we need to do more to be sure we haven't been
9288             # fooled.
9289 8         21 my $pos = pos($input_line);
9290              
9291 8         24 my $pos_beg = $rtoken_map->[$i];
9292 8         33 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
9293              
9294             # Test for '<' after possible filehandle, issue c103
9295             # print $fh <>; # syntax error
9296             # print $fh <DATA>; # ok
9297             # print $fh < DATA>; # syntax error at '>'
9298             # print STDERR < DATA>; # ok, prints word 'DATA'
9299             # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
9300 8 100       38 if ( $last_nonblank_type eq 'Z' ) {
9301              
9302             # $str includes brackets; something like '<DATA>'
9303 1 0 33     6 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
9304             && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
9305             {
9306 0         0 return ( $i, $type );
9307             }
9308             }
9309              
9310             # Reject if the closing '>' follows a '-' as in:
9311             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
9312 8 100       43 if ( $expecting eq UNKNOWN ) {
9313 2         8 my $check = substr( $input_line, $pos - 2, 1 );
9314 2 100       7 if ( $check eq '-' ) {
9315 1         19 return ( $i, $type );
9316             }
9317             }
9318              
9319             ######################################debug#####
9320             #$self->write_diagnostics( "ANGLE? :$str\n");
9321             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
9322             ######################################debug#####
9323 7         22 $type = 'Q';
9324 7         64 my $error;
9325 7         42 ( $i, $error ) =
9326             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9327              
9328             # It may be possible that a quote ends midway in a pretoken.
9329             # If this happens, it may be necessary to split the pretoken.
9330 7 50       36 if ($error) {
9331 0         0 if (DEVEL_MODE) {
9332             $self->Fault(<<EOM);
9333             unexpected error condition returned by inverse_pretoken_map
9334             EOM
9335             }
9336             $self->warning(
9337 0         0 "Possible tokinization error..please check this line\n");
9338             }
9339              
9340             # Check for accidental formatting of a markup language doc...
9341             # Formatting will be skipped if we set _html_tag_count_ and
9342             # also set a warning of any kind.
9343 7         20 my $is_html_tag;
9344 7   33     45 my $is_first_string =
9345             $i_beg == 0 && $self->[_last_line_number_] == 1;
9346              
9347             # html comment '<!...' of any type
9348 7 50 33     94 if ( $str =~ /^<\s*!/ ) {
    50          
    50          
9349 0         0 $is_html_tag = 1;
9350 0 0       0 if ($is_first_string) {
9351 0         0 $self->warning(
9352             "looks like a markup language, continuing error checks\n"
9353             );
9354             }
9355             }
9356              
9357             # html end tag, something like </h1>
9358             elsif ( $str =~ /^<\s*\/\w+\s*>$/ ) {
9359 0         0 $is_html_tag = 1;
9360             }
9361              
9362             # xml prolog?
9363             elsif ( $str =~ /^<\?xml\s.*\?>$/i && $is_first_string ) {
9364 0         0 $is_html_tag = 1;
9365 0         0 $self->warning(
9366             "looks like a markup language, continuing error checks\n");
9367             }
9368             else {
9369             ## doesn't look like a markup tag
9370             }
9371              
9372 7 50       29 if ($is_html_tag) {
9373 0         0 $self->[_html_tag_count_]++;
9374             }
9375              
9376             # count blanks on inside of brackets
9377 7         17 my $blank_count = 0;
9378 7 100       45 $blank_count++ if ( $str =~ /<\s+/ );
9379 7 100       39 $blank_count++ if ( $str =~ /\s+>/ );
9380              
9381             # Now let's see where we stand....
9382             # OK if math op not possible
9383 7 100       35 if ( $expecting == TERM ) {
    50          
    50          
    0          
9384             }
9385              
9386             elsif ($is_html_tag) {
9387             }
9388              
9389             # OK if there are no more than 2 non-blank pre-tokens inside
9390             # (not possible to write 2 token math between < and >)
9391             # This catches most common cases
9392             elsif ( $i <= $i_beg + 3 + $blank_count ) {
9393              
9394             # No longer any need to document this common case
9395             ## $self->write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
9396             }
9397              
9398             # OK if there is some kind of identifier inside
9399             # print $fh <tvg::INPUT>;
9400             elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
9401 0         0 $self->write_diagnostics("ANGLE (contains identifier): $str\n");
9402             }
9403              
9404             # Not sure..
9405             else {
9406              
9407             # Let's try a Brace Test: any braces inside must balance
9408 0         0 my $br = 0;
9409 0         0 while ( $str =~ /\{/g ) { $br++ }
  0         0  
9410 0         0 while ( $str =~ /\}/g ) { $br-- }
  0         0  
9411 0         0 my $sb = 0;
9412 0         0 while ( $str =~ /\[/g ) { $sb++ }
  0         0  
9413 0         0 while ( $str =~ /\]/g ) { $sb-- }
  0         0  
9414 0         0 my $pr = 0;
9415 0         0 while ( $str =~ /\(/g ) { $pr++ }
  0         0  
9416 0         0 while ( $str =~ /\)/g ) { $pr-- }
  0         0  
9417              
9418             # if braces do not balance - not angle operator
9419 0 0 0     0 if ( $br || $sb || $pr ) {
      0        
9420 0         0 $i = $i_beg;
9421 0         0 $type = '<';
9422 0         0 $self->write_diagnostics(
9423             "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
9424             }
9425              
9426             # we should keep doing more checks here...to be continued
9427             # Tentatively accepting this as a valid angle operator.
9428             # There are lots more things that can be checked.
9429             else {
9430 0         0 $self->write_diagnostics(
9431             "ANGLE-Guessing yes: $str expecting=$expecting\n");
9432 0         0 $self->write_logfile_entry(
9433             "Guessing angle operator here: $str\n");
9434             }
9435             }
9436             }
9437              
9438             # didn't find ending >
9439             else {
9440 0 0       0 if ( $expecting == TERM ) {
9441 0         0 $self->warning("No ending > for angle operator\n");
9442             }
9443             }
9444             }
9445 7         37 return ( $i, $type );
9446             } ## end sub find_angle_operator_termination
9447              
9448             sub scan_number_do {
9449              
9450             # scan a number in any of the formats that Perl accepts
9451             # Underbars (_) are allowed in decimal numbers.
9452             # input parameters -
9453             # $input_line - the string to scan
9454             # $i - pre_token index to start scanning
9455             # $rtoken_map - reference to the pre_token map giving starting
9456             # character position in $input_line of token $i
9457             # output parameters -
9458             # $i - last pre_token index of the number just scanned
9459             # number - the number (characters); or undef if not a number
9460              
9461 629     629 0 1994 my ( $self, $input_line, $i, $rtoken_map, $input_type, $max_token_index ) =
9462             @_;
9463 629         1157 my $pos_beg = $rtoken_map->[$i];
9464 629         996 my $pos;
9465 629         985 my $i_begin = $i;
9466 629         999 my $number = undef;
9467 629         999 my $type = $input_type;
9468              
9469 629         2584 my $first_char = substr( $input_line, $pos_beg, 1 );
9470              
9471             # Look for bad starting characters; Shouldn't happen..
9472 629 50       2759 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9473 0         0 if (DEVEL_MODE) {
9474             $self->Fault(<<EOM);
9475             Program bug - scan_number given bad first character = '$first_char'
9476             EOM
9477             }
9478 0         0 return ( $i, $type, $number );
9479             }
9480              
9481             # handle v-string without leading 'v' character ('Two Dot' rule)
9482             # (vstring.t)
9483             # Here is the format prior to including underscores:
9484             ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9485 629         1871 pos($input_line) = $pos_beg;
9486 629 50       3071 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9487 0         0 $pos = pos($input_line);
9488 0         0 my $numc = $pos - $pos_beg;
9489 0         0 $number = substr( $input_line, $pos_beg, $numc );
9490 0         0 $type = 'v';
9491 0         0 $self->report_v_string($number);
9492             }
9493              
9494             # handle octal, hex, binary
9495 629 50       1618 if ( !defined($number) ) {
9496 629         1266 pos($input_line) = $pos_beg;
9497              
9498             # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9499             # For reference, the format prior to hex floating point is:
9500             # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9501             # (hex) (octal) (binary)
9502 629 100       2375 if (
9503             $input_line =~ m{
9504              
9505             \G[+-]?0( # leading [signed] 0
9506              
9507             # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9508             ([xX][0-9a-fA-F_]* # X and optional leading digits
9509             (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9510             [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9511             [0-9a-fA-F_]*) # optional Additional exponent digits
9512              
9513             # or hex integer
9514             |([xX][0-9a-fA-F_]+)
9515              
9516             # or octal fraction
9517             |([oO]?[0-7_]+ # string of octal digits
9518             (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9519             [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9520             [0-7_]*) # Additional exponent digits with underscores
9521              
9522             # or octal integer
9523             |([oO]?[0-7_]+) # string of octal digits
9524              
9525             # or a binary float
9526             |([bB][01_]* # 'b' with string of binary digits
9527             (\.([01][01_]*)?)? # optional decimal and fraction
9528             [Pp][+-]?[01] # Required exponent indicator, no underscore
9529             [01_]*) # additional exponent bits
9530              
9531             # or binary integer
9532             |([bB][01_]+) # 'b' with string of binary digits
9533              
9534             )}gx
9535             )
9536             {
9537 72         150 $pos = pos($input_line);
9538 72         120 my $numc = $pos - $pos_beg;
9539 72         151 $number = substr( $input_line, $pos_beg, $numc );
9540 72         133 $type = 'n';
9541             }
9542             }
9543              
9544             # handle decimal
9545 629 100       1487 if ( !defined($number) ) {
9546 557         1037 pos($input_line) = $pos_beg;
9547              
9548 557 50       2679 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9549 557         1032 $pos = pos($input_line);
9550              
9551             # watch out for things like 0..40 which would give 0. by this;
9552 557 100 100     1934 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9553             && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9554             {
9555 37         70 $pos--;
9556             }
9557 557         983 my $numc = $pos - $pos_beg;
9558 557         1041 $number = substr( $input_line, $pos_beg, $numc );
9559 557         981 $type = 'n';
9560             }
9561             }
9562              
9563             # filter out non-numbers like e + - . e2 .e3 +e6
9564             # the rule: at least one digit, and any 'e' must be preceded by a digit
9565 629 100 66     3254 if (
      66        
9566             $number !~ /\d/ # no digits
9567             || ( $number =~ /^(.*)[eE]/
9568             && $1 !~ /\d/ ) # or no digits before the 'e'
9569             )
9570             {
9571 303         515 $number = undef;
9572 303         526 $type = $input_type;
9573 303         1253 return ( $i, $type, $number );
9574             }
9575              
9576             # Found a number; now we must convert back from character position
9577             # to pre_token index. An error here implies user syntax error.
9578             # An example would be an invalid octal number like '009'.
9579 326         610 my $error;
9580 326         872 ( $i, $error ) =
9581             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9582 326 50       916 if ($error) { $self->warning("Possibly invalid number\n") }
  0         0  
9583              
9584 326         1167 return ( $i, $type, $number );
9585             } ## end sub scan_number_do
9586              
9587             sub inverse_pretoken_map {
9588              
9589             # Starting with the current pre_token index $i, scan forward until
9590             # finding the index of the next pre_token whose position is $pos.
9591 2167     2167 0 5110 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9592 2167         3867 my $error = 0;
9593              
9594 2167         5630 while ( ++$i <= $max_token_index ) {
9595              
9596 4035 100       9661 if ( $pos <= $rtoken_map->[$i] ) {
9597              
9598             # Let the calling routine handle errors in which we do not
9599             # land on a pre-token boundary. It can happen by running
9600             # perltidy on some non-perl scripts, for example.
9601 2132 50       5297 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
  0         0  
9602 2132         3128 $i--;
9603 2132         3666 last;
9604             }
9605             }
9606 2167         5445 return ( $i, $error );
9607             } ## end sub inverse_pretoken_map
9608              
9609             sub find_here_doc {
9610              
9611             # find the target of a here document, if any
9612             # input parameters:
9613             # $i - token index of the second < of <<
9614             # ($i must be less than the last token index if this is called)
9615             # output parameters:
9616             # $found_target = 0 didn't find target; =1 found target
9617             # HERE_TARGET - the target string (may be empty string)
9618             # $i - unchanged if not here doc,
9619             # or index of the last token of the here target
9620             # $saw_error - flag noting unbalanced quote on here target
9621 9     9 0 42 my ( $self, $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9622              
9623 9         24 my $ibeg = $i;
9624 9         26 my $found_target = 0;
9625 9         26 my $here_doc_target = EMPTY_STRING;
9626 9         25 my $here_quote_character = EMPTY_STRING;
9627 9         21 my $saw_error = 0;
9628 9         22 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9629 9         29 $next_token = $rtokens->[ $i + 1 ];
9630              
9631             # perl allows a backslash before the target string (heredoc.t)
9632 9         20 my $backslash = 0;
9633 9 50       42 if ( $next_token eq '\\' ) {
9634 0         0 $backslash = 1;
9635 0         0 $next_token = $rtokens->[ $i + 2 ];
9636             }
9637              
9638 9         54 ( $next_nonblank_token, $i_next_nonblank ) =
9639             find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9640              
9641 9 100 33     78 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
    50          
    50          
9642              
9643 6         18 my $in_quote = 1;
9644 6         12 my $quote_depth = 0;
9645 6         16 my $quote_pos = 0;
9646 6         10 my $quoted_string;
9647              
9648             (
9649 6         36 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9650             $quoted_string
9651             )
9652             = $self->follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9653             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9654              
9655 6 50       27 if ($in_quote) { # didn't find end of quote, so no target found
9656 0         0 $i = $ibeg;
9657 0 0       0 if ( $expecting == TERM ) {
9658 0         0 $self->warning(
9659             "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9660             );
9661 0         0 $saw_error = 1;
9662             }
9663             }
9664             else { # found ending quote
9665 6         14 $found_target = 1;
9666              
9667 6         14 my $tokj;
9668 6         28 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9669 6         20 $tokj = $rtokens->[$j];
9670              
9671             # we have to remove any backslash before the quote character
9672             # so that the here-doc-target exactly matches this string
9673             next
9674 6 0 33     35 if ( $tokj eq "\\"
      33        
9675             && $j < $i - 1
9676             && $rtokens->[ $j + 1 ] eq $here_quote_character );
9677 6         27 $here_doc_target .= $tokj;
9678             }
9679             }
9680             }
9681              
9682             elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9683 0         0 $found_target = 1;
9684 0         0 $self->write_logfile_entry(
9685             "found blank here-target after <<; suggest using \"\"\n");
9686 0         0 $i = $ibeg;
9687             }
9688             elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
9689              
9690 3         7 my $here_doc_expected;
9691 3 50       16 if ( $expecting == UNKNOWN ) {
9692 0         0 $here_doc_expected = $self->guess_if_here_doc($next_token);
9693             }
9694             else {
9695 3         8 $here_doc_expected = 1;
9696             }
9697              
9698 3 50       14 if ($here_doc_expected) {
9699 3         8 $found_target = 1;
9700 3         7 $here_doc_target = $next_token;
9701 3         7 $i = $ibeg + 1;
9702             }
9703              
9704             }
9705             else {
9706              
9707 0 0       0 if ( $expecting == TERM ) {
9708 0         0 $found_target = 1;
9709 0         0 $self->write_logfile_entry("Note: bare here-doc operator <<\n");
9710             }
9711             else {
9712 0         0 $i = $ibeg;
9713             }
9714             }
9715              
9716             # patch to neglect any prepended backslash
9717 9 50 33     70 if ( $found_target && $backslash ) { $i++ }
  0         0  
9718              
9719 9         49 return ( $found_target, $here_doc_target, $here_quote_character, $i,
9720             $saw_error );
9721             } ## end sub find_here_doc
9722              
9723             sub do_quote {
9724              
9725             # follow (or continue following) quoted string(s)
9726             # $in_quote return code:
9727             # 0 - ok, found end
9728             # 1 - still must find end of quote whose target is $quote_character
9729             # 2 - still looking for end of first of two quotes
9730             #
9731             # Returns updated strings:
9732             # $quoted_string_1 = quoted string seen while in_quote=1
9733             # $quoted_string_2 = quoted string seen while in_quote=2
9734             my (
9735              
9736 2768     2768 0 8691 $self,
9737             $i,
9738             $in_quote,
9739             $quote_character,
9740             $quote_pos,
9741             $quote_depth,
9742             $quoted_string_1,
9743             $quoted_string_2,
9744             $rtokens,
9745             $rtoken_map,
9746             $max_token_index,
9747              
9748             ) = @_;
9749              
9750 2768         4242 my $quoted_string;
9751 2768 100       6470 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
9752 29         70 my $ibeg = $i;
9753             (
9754 29         138 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9755             $quoted_string
9756             )
9757             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
9758             $quote_character, $quote_pos, $quote_depth, $max_token_index );
9759 29         72 $quoted_string_2 .= $quoted_string;
9760 29 50       89 if ( $in_quote == 1 ) {
9761 29 50       124 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
  0         0  
9762 29         61 $quote_character = EMPTY_STRING;
9763             }
9764             else {
9765 0         0 $quoted_string_2 .= "\n";
9766             }
9767             }
9768              
9769 2768 50       6124 if ( $in_quote == 1 ) { # one (more) quote to follow
9770 2768         4273 my $ibeg = $i;
9771             (
9772 2768         7826 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9773             $quoted_string
9774             )
9775             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
9776             $quote_character, $quote_pos, $quote_depth, $max_token_index );
9777 2768         5949 $quoted_string_1 .= $quoted_string;
9778 2768 100       6305 if ( $in_quote == 1 ) {
9779 183         377 $quoted_string_1 .= "\n";
9780             }
9781             }
9782             return (
9783              
9784 2768         9533 $i,
9785             $in_quote,
9786             $quote_character,
9787             $quote_pos,
9788             $quote_depth,
9789             $quoted_string_1,
9790             $quoted_string_2,
9791              
9792             );
9793             } ## end sub do_quote
9794              
9795             sub follow_quoted_string {
9796              
9797             # scan for a specific token, skipping escaped characters
9798             # if the quote character is blank, use the first non-blank character
9799             # input parameters:
9800             # $rtokens = reference to the array of tokens
9801             # $i = the token index of the first character to search
9802             # $in_quote = number of quoted strings being followed
9803             # $beginning_tok = the starting quote character
9804             # $quote_pos = index to check next for alphanumeric delimiter
9805             # output parameters:
9806             # $i = the token index of the ending quote character
9807             # $in_quote = decremented if found end, unchanged if not
9808             # $beginning_tok = the starting quote character
9809             # $quote_pos = index to check next for alphanumeric delimiter
9810             # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9811             # $quoted_string = the text of the quote (without quotation tokens)
9812             my (
9813              
9814 2814     2814 0 6525 $self,
9815             $i_beg,
9816             $in_quote,
9817             $rtokens,
9818             $beginning_tok,
9819             $quote_pos,
9820             $quote_depth,
9821             $max_token_index,
9822              
9823             ) = @_;
9824              
9825 2814         4945 my ( $tok, $end_tok );
9826 2814         4730 my $i = $i_beg - 1;
9827 2814         4423 my $quoted_string = EMPTY_STRING;
9828              
9829 2814         4031 0 && do {
9830             print {*STDOUT}
9831             "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9832             };
9833              
9834             # get the corresponding end token
9835 2814 100       13169 if ( $beginning_tok !~ /^\s*$/ ) {
9836 183         541 $end_tok = matching_end_token($beginning_tok);
9837             }
9838              
9839             # a blank token means we must find and use the first non-blank one
9840             else {
9841 2631 100       6187 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9842              
9843 2631         6049 while ( $i < $max_token_index ) {
9844 2631         5315 $tok = $rtokens->[ ++$i ];
9845              
9846 2631 50       7998 if ( $tok !~ /^\s*$/ ) {
9847              
9848 2631 50 66     7666 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9849 0         0 $i = $max_token_index;
9850             }
9851             else {
9852              
9853 2631 100       5747 if ( length($tok) > 1 ) {
9854 1 50       5 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
  1         4  
9855 1         4 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9856             }
9857             else {
9858 2630         4654 $beginning_tok = $tok;
9859 2630         4125 $quote_pos = 0;
9860             }
9861 2631         6377 $end_tok = matching_end_token($beginning_tok);
9862 2631         4615 $quote_depth = 1;
9863 2631         4750 last;
9864             }
9865             }
9866             else {
9867 0         0 $allow_quote_comments = 1;
9868             }
9869             }
9870             }
9871              
9872             # There are two different loops which search for the ending quote
9873             # character. In the rare case of an alphanumeric quote delimiter, we
9874             # have to look through alphanumeric tokens character-by-character, since
9875             # the pre-tokenization process combines multiple alphanumeric
9876             # characters, whereas for a non-alphanumeric delimiter, only tokens of
9877             # length 1 can match.
9878              
9879             #----------------------------------------------------------------
9880             # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9881             # "quote_pos" is the position the current word to begin searching
9882             #----------------------------------------------------------------
9883 2814 100       7797 if ( $beginning_tok =~ /\w/ ) {
9884              
9885             # Note this because it is not recommended practice except
9886             # for obfuscated perl contests
9887 1 50       4 if ( $in_quote == 1 ) {
9888 1         9 $self->write_logfile_entry(
9889             "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9890             }
9891              
9892             # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9893 1         5 while ( $i <= $max_token_index ) {
9894              
9895 4 100 66     17 if ( $quote_pos == 0 || ( $i < 0 ) ) {
9896 3         7 $tok = $rtokens->[ ++$i ];
9897              
9898 3 100       8 if ( $tok eq '\\' ) {
9899              
9900             # retain backslash unless it hides the end token
9901 1 50       54 $quoted_string .= $tok
9902             unless $rtokens->[ $i + 1 ] eq $end_tok;
9903 1         8 $quote_pos++;
9904 1 50       6 last if ( $i >= $max_token_index );
9905 1         5 $tok = $rtokens->[ ++$i ];
9906             }
9907             }
9908 4         8 my $old_pos = $quote_pos;
9909              
9910 4         8 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9911              
9912 4 100       11 if ( $quote_pos > 0 ) {
9913              
9914 1         16 $quoted_string .=
9915             substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9916              
9917             # NOTE: any quote modifiers will be at the end of '$tok'. If we
9918             # wanted to check them, this is the place to get them. But
9919             # this quote form is rarely used in practice, so it isn't
9920             # worthwhile.
9921              
9922 1         3 $quote_depth--;
9923              
9924 1 50       4 if ( $quote_depth == 0 ) {
9925 1         3 $in_quote--;
9926 1         3 last;
9927             }
9928             }
9929             else {
9930 3 50       7 if ( $old_pos <= length($tok) ) {
9931 3         9 $quoted_string .= substr( $tok, $old_pos );
9932             }
9933             }
9934             }
9935             }
9936              
9937             #-----------------------------------------------------------------------
9938             # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9939             #-----------------------------------------------------------------------
9940             else {
9941              
9942 2813         6490 while ( $i < $max_token_index ) {
9943 10798         16323 $tok = $rtokens->[ ++$i ];
9944              
9945 10798 100       24279 if ( $tok eq $end_tok ) {
    100          
    100          
9946 2620         4208 $quote_depth--;
9947              
9948 2620 100       5936 if ( $quote_depth == 0 ) {
9949 2619         3860 $in_quote--;
9950 2619         4131 last;
9951             }
9952             }
9953             elsif ( $tok eq $beginning_tok ) {
9954 1         3 $quote_depth++;
9955             }
9956             elsif ( $tok eq '\\' ) {
9957              
9958             # retain backslash unless it hides the beginning or end token
9959 376         939 $tok = $rtokens->[ ++$i ];
9960 376 100 100     1960 $quoted_string .= '\\'
9961             if ( $tok ne $end_tok && $tok ne $beginning_tok );
9962             }
9963             else {
9964             ## nothing special
9965             }
9966 8179         15255 $quoted_string .= $tok;
9967             }
9968             }
9969 2814 50       6171 if ( $i > $max_token_index ) { $i = $max_token_index }
  0         0  
9970             return (
9971              
9972 2814         10673 $i,
9973             $in_quote,
9974             $beginning_tok,
9975             $quote_pos,
9976             $quote_depth,
9977             $quoted_string,
9978              
9979             );
9980             } ## end sub follow_quoted_string
9981              
9982             sub indicate_error {
9983 0     0 0 0 my ( $self, $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9984 0         0 $self->interrupt_logfile();
9985 0         0 $self->warning($msg);
9986 0         0 $self->write_error_indicator_pair( $line_number, $input_line, $pos,
9987             $carrat );
9988 0         0 $self->resume_logfile();
9989 0         0 return;
9990             } ## end sub indicate_error
9991              
9992             sub write_error_indicator_pair {
9993 0     0 0 0 my ( $self, $line_number, $input_line, $pos, $carrat ) = @_;
9994 0         0 my ( $offset, $numbered_line, $underline ) =
9995             make_numbered_line( $line_number, $input_line, $pos );
9996 0         0 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9997 0         0 $self->warning( $numbered_line . "\n" );
9998 0         0 $underline =~ s/\s*$//;
9999 0         0 $self->warning( $underline . "\n" );
10000 0         0 return;
10001             } ## end sub write_error_indicator_pair
10002              
10003             sub make_numbered_line {
10004              
10005             # Given an input line, its line number, and a character position of
10006             # interest, create a string not longer than 80 characters of the form
10007             # $lineno: sub_string
10008             # such that the sub_string of $str contains the position of interest
10009             #
10010             # Here is an example of what we want, in this case we add trailing
10011             # '...' because the line is long.
10012             #
10013             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10014             #
10015             # Here is another example, this time in which we used leading '...'
10016             # because of excessive length:
10017             #
10018             # 2: ... er of the World Wide Web Consortium's
10019             #
10020             # input parameters are:
10021             # $lineno = line number
10022             # $str = the text of the line
10023             # $pos = position of interest (the error) : 0 = first character
10024             #
10025             # We return :
10026             # - $offset = an offset which corrects the position in case we only
10027             # display part of a line, such that $pos-$offset is the effective
10028             # position from the start of the displayed line.
10029             # - $numbered_line = the numbered line as above,
10030             # - $underline = a blank 'underline' which is all spaces with the same
10031             # number of characters as the numbered line.
10032              
10033 0     0 0 0 my ( $lineno, $str, $pos ) = @_;
10034 0 0       0 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
10035 0         0 my $excess = length($str) - $offset - 68;
10036 0 0       0 my $numc = ( $excess > 0 ) ? 68 : undef;
10037              
10038 0 0       0 if ( defined($numc) ) {
10039 0 0       0 if ( $offset == 0 ) {
10040 0         0 $str = substr( $str, $offset, $numc - 4 ) . " ...";
10041             }
10042             else {
10043 0         0 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
10044             }
10045             }
10046             else {
10047              
10048 0 0       0 if ( $offset == 0 ) {
10049             }
10050             else {
10051 0         0 $str = "... " . substr( $str, $offset + 4 );
10052             }
10053             }
10054              
10055 0         0 my $numbered_line = sprintf( "%d: ", $lineno );
10056 0         0 $offset -= length($numbered_line);
10057 0         0 $numbered_line .= $str;
10058 0         0 my $underline = SPACE x length($numbered_line);
10059 0         0 return ( $offset, $numbered_line, $underline );
10060             } ## end sub make_numbered_line
10061              
10062             sub write_on_underline {
10063              
10064             # The "underline" is a string that shows where an error is; it starts
10065             # out as a string of blanks with the same length as the numbered line of
10066             # code above it, and we have to add marking to show where an error is.
10067             # In the example below, we want to write the string '--^' just below
10068             # the line of bad code:
10069             #
10070             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10071             # ---^
10072             # We are given the current underline string, plus a position and a
10073             # string to write on it.
10074             #
10075             # In the above example, there will be 2 calls to do this:
10076             # First call: $pos=19, pos_chr=^
10077             # Second call: $pos=16, pos_chr=---
10078             #
10079             # This is a trivial thing to do with substr, but there is some
10080             # checking to do.
10081              
10082 0     0 0 0 my ( $underline, $pos, $pos_chr ) = @_;
10083              
10084             # check for error..shouldn't happen
10085 0 0 0     0 if ( $pos < 0 || $pos > length($underline) ) {
10086 0         0 return $underline;
10087             }
10088 0         0 my $excess = length($pos_chr) + $pos - length($underline);
10089 0 0       0 if ( $excess > 0 ) {
10090 0         0 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
10091             }
10092 0         0 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
10093 0         0 return ($underline);
10094             } ## end sub write_on_underline
10095              
10096             sub pre_tokenize {
10097              
10098 6187     6187 0 12313 my ( $str, $max_tokens_wanted ) = @_;
10099              
10100             # Input parameters:
10101             # $str = string to be parsed
10102             # $max_tokens_wanted > 0 to stop on reaching this many tokens.
10103             # = undef or 0 means get all tokens
10104              
10105             # Break a string, $str, into a sequence of preliminary tokens (pre-tokens).
10106             # We look for these types of tokens:
10107             # words (type='w'), example: 'max_tokens_wanted'
10108             # digits (type = 'd'), example: '0755'
10109             # whitespace (type = 'b'), example: ' '
10110             # single character punct (type = char) example: '='
10111              
10112             # Later operations will combine one or more of these pre-tokens into final
10113             # tokens. We cannot do better than this yet because we might be in a
10114             # quoted string or pattern.
10115              
10116             # An advantage of doing this pre-tokenization step is that it keeps almost
10117             # all of the regex parsing very simple and localized right here. A
10118             # disadvantage is that in some extremely rare instances we will have to go
10119             # back and split a pre-token.
10120              
10121             # Return parameters:
10122 6187         10336 my @tokens = (); # array of the tokens themselves
10123 6187         12732 my @token_map = (0); # string position of start of each token
10124 6187         9681 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
10125              
10126 6187 100       13303 if ( !$max_tokens_wanted ) { $max_tokens_wanted = -1 }
  5902         9192  
10127              
10128 6187         14169 while ( $max_tokens_wanted-- ) {
10129              
10130 82185 100       227575 if (
10131             $str =~ m{
10132             \G(
10133             (\s+) # type 'b' = whitespace - this must come before \W
10134             | (\W) # or type=char = single-character, non-whitespace punct
10135             | (\d+) # or type 'd' = sequence of digits - must come before \w
10136             | (\w+) # or type 'w' = words not starting with a digit
10137             )
10138             }gcx
10139             )
10140             {
10141 76142         159576 push @tokens, $1;
10142 76142 100       185396 push @type,
    100          
    100          
10143             defined($2) ? 'b' : defined($3) ? $1 : defined($4) ? 'd' : 'w';
10144 76142         147633 push @token_map, pos($str);
10145             }
10146              
10147             # that's all..
10148             else {
10149 6043         36598 return ( \@tokens, \@token_map, \@type );
10150             }
10151             }
10152              
10153 144         1041 return ( \@tokens, \@token_map, \@type );
10154             } ## end sub pre_tokenize
10155              
10156             sub show_tokens {
10157              
10158             # this is an old debug routine
10159             # not called, but saved for reference
10160 0     0 0 0 my ( $rtokens, $rtoken_map ) = @_;
10161 0         0 my $num = scalar( @{$rtokens} );
  0         0  
10162              
10163 0         0 foreach my $i ( 0 .. $num - 1 ) {
10164 0         0 my $len = length( $rtokens->[$i] );
10165 0         0 print {*STDOUT} "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
  0         0  
10166             }
10167 0         0 return;
10168             } ## end sub show_tokens
10169              
10170             { ## closure for sub matching end token
10171             my %matching_end_token;
10172              
10173             BEGIN {
10174 39     39   62257 %matching_end_token = (
10175             '{' => '}',
10176             '(' => ')',
10177             '[' => ']',
10178             '<' => '>',
10179             );
10180             } ## end BEGIN
10181              
10182             sub matching_end_token {
10183              
10184             # return closing character for a pattern
10185 2998     2998 0 5029 my $beginning_token = shift;
10186 2998 100       7663 if ( $matching_end_token{$beginning_token} ) {
10187 373         982 return $matching_end_token{$beginning_token};
10188             }
10189 2625         6068 return ($beginning_token);
10190             } ## end sub matching_end_token
10191             }
10192              
10193             sub dump_token_types {
10194 0     0 0   my ( $class, $fh ) = @_;
10195              
10196             # This should be the latest list of token types in use
10197             # adding NEW_TOKENS: add a comment here
10198 0           $fh->print(<<'END_OF_LIST');
10199              
10200             Here is a list of the token types currently used for lines of type 'CODE'.
10201             For the following tokens, the "type" of a token is just the token itself.
10202              
10203             .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
10204             ( ) <= >= == =~ !~ != ++ -- /= x=
10205             ... **= <<= >>= &&= ||= //= <=>
10206             , + - / * | % ! x ~ = \ ? : . < > ^ &
10207              
10208             The following additional token types are defined:
10209              
10210             type meaning
10211             b blank (white space)
10212             { indent: opening structural curly brace or square bracket or paren
10213             (code block, anonymous hash reference, or anonymous array reference)
10214             } outdent: right structural curly brace or square bracket or paren
10215             [ left non-structural square bracket (enclosing an array index)
10216             ] right non-structural square bracket
10217             ( left non-structural paren (all but a list right of an =)
10218             ) right non-structural paren
10219             L left non-structural curly brace (enclosing a key)
10220             R right non-structural curly brace
10221             ; terminal semicolon
10222             f indicates a semicolon in a "for" statement
10223             h here_doc operator <<
10224             # a comment
10225             Q indicates a quote or pattern
10226             q indicates a qw quote block
10227             k a perl keyword
10228             C user-defined constant or constant function (with void prototype = ())
10229             U user-defined function taking parameters
10230             G user-defined function taking block parameter (like grep/map/eval)
10231             S sub definition (reported as type 'i' in older versions)
10232             P package definition (reported as type 'i' in older versions)
10233             t type indicater such as %,$,@,*,&,sub
10234             w bare word (perhaps a subroutine call)
10235             i identifier of some type (with leading %, $, @, *, &, sub, -> )
10236             n a number
10237             v a v-string
10238             F a file test operator (like -e)
10239             Y File handle
10240             Z identifier in indirect object slot: may be file handle, object
10241             J LABEL: code block label
10242             j LABEL after next, last, redo, goto
10243             p unary +
10244             m unary -
10245             pp pre-increment operator ++
10246             mm pre-decrement operator --
10247             A : used as attribute separator
10248            
10249             Here are the '_line_type' codes used internally:
10250             SYSTEM - system-specific code before hash-bang line
10251             CODE - line of perl code (including comments)
10252             POD_START - line starting pod, such as '=head'
10253             POD - pod documentation text
10254             POD_END - last line of pod section, '=cut'
10255             HERE - text of here-document
10256             HERE_END - last line of here-doc (target word)
10257             FORMAT - format section
10258             FORMAT_END - last line of format section, '.'
10259             SKIP - code skipping section
10260             SKIP_END - last line of code skipping section, '#>>V'
10261             DATA_START - __DATA__ line
10262             DATA - unidentified text following __DATA__
10263             END_START - __END__ line
10264             END - unidentified text following __END__
10265             ERROR - we are in big trouble, probably not a perl script
10266             END_OF_LIST
10267              
10268 0           return;
10269             } ## end sub dump_token_types
10270              
10271             BEGIN {
10272              
10273             # These names are used in error messages
10274 39     39   304 @opening_brace_names = qw# '{' '[' '(' '?' #;
10275 39         113 @closing_brace_names = qw# '}' ']' ')' ':' #;
10276              
10277 39         106 my @q;
10278              
10279 39         286 my @digraphs = qw(
10280             .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
10281             <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
10282             );
10283 39         899 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
10284              
10285 39         428 @q = qw(
10286             . : < > * & | / - = + - % ^ ! x ~
10287             );
10288 39         366 @can_start_digraph{@q} = (1) x scalar(@q);
10289              
10290 39         151 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
10291 39         254 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
10292              
10293 39         135 my @tetragraphs = qw( <<>> );
10294 39         138 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
10295              
10296             # make a hash of all valid token types for self-checking the tokenizer
10297             # (adding NEW_TOKENS : select a new character and add to this list)
10298             # fix for c250: added new token type 'P' and 'S'
10299 39         625 my @valid_token_types = qw#
10300             A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P S
10301             { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
10302             #;
10303 39         520 push( @valid_token_types, @digraphs );
10304 39         319 push( @valid_token_types, @trigraphs );
10305 39         105 push( @valid_token_types, @tetragraphs );
10306 39         116 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
10307 39         1597 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
10308              
10309             # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
10310 39         285 my @file_test_operators =
10311             qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
10312 39         807 @is_file_test_operator{@file_test_operators} =
10313             (1) x scalar(@file_test_operators);
10314              
10315             # these functions have prototypes of the form (&), so when they are
10316             # followed by a block, that block MAY BE followed by an operator.
10317             # Smartmatch operator ~~ may be followed by anonymous hash or array ref
10318 39         167 @q = qw( do eval );
10319 39         177 @is_block_operator{@q} = (1) x scalar(@q);
10320              
10321             # these functions allow an identifier in the indirect object slot
10322 39         111 @q = qw( print printf sort exec system say);
10323 39         242 @is_indirect_object_taker{@q} = (1) x scalar(@q);
10324              
10325             # Note: 'field' will be added by sub check_options if --use-feature=class
10326 39         100 @q = qw(my our state);
10327 39         190 @is_my_our_state{@q} = (1) x scalar(@q);
10328              
10329             # These tokens may precede a code block
10330             # patched for SWITCH/CASE/CATCH. Actually these could be removed
10331             # now and we could let the extended-syntax coding handle them.
10332             # Added 'default' for Switch::Plain.
10333             # Note: 'ADJUST' will be added by sub check_options if --use-feature=class
10334 39         233 @q =
10335             qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
10336             unless do while until eval for foreach map grep sort
10337             switch case given when default catch try finally);
10338 39         681 @is_code_block_token{@q} = (1) x scalar(@q);
10339              
10340             # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
10341             # to contrast it with the block types in '%is_zero_continuation_block_type'
10342 39         181 @q = qw( sort map grep eval do );
10343 39         131 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
10344              
10345 39         124 @q = qw( sort map grep );
10346 39         132 @is_sort_map_grep{@q} = (1) x scalar(@q);
10347              
10348 39         90 %is_grep_alias = ();
10349              
10350             # I'll build the list of keywords incrementally
10351 39         92 my @Keywords = ();
10352              
10353             # keywords and tokens after which a value or pattern is expected,
10354             # but not an operator. In other words, these should consume terms
10355             # to their right, or at least they are not expected to be followed
10356             # immediately by operators.
10357 39         1441 my @value_requestor = qw(
10358             AUTOLOAD
10359             BEGIN
10360             CHECK
10361             DESTROY
10362             END
10363             EQ
10364             GE
10365             GT
10366             INIT
10367             LE
10368             LT
10369             NE
10370             UNITCHECK
10371             abs
10372             accept
10373             alarm
10374             and
10375             atan2
10376             bind
10377             binmode
10378             bless
10379             break
10380             caller
10381             chdir
10382             chmod
10383             chomp
10384             chop
10385             chown
10386             chr
10387             chroot
10388             close
10389             closedir
10390             cmp
10391             connect
10392             continue
10393             cos
10394             crypt
10395             dbmclose
10396             dbmopen
10397             defined
10398             delete
10399             die
10400             dump
10401             each
10402             else
10403             elsif
10404             eof
10405             eq
10406             evalbytes
10407             exec
10408             exists
10409             exit
10410             exp
10411             fc
10412             fcntl
10413             fileno
10414             flock
10415             for
10416             foreach
10417             formline
10418             ge
10419             getc
10420             getgrgid
10421             getgrnam
10422             gethostbyaddr
10423             gethostbyname
10424             getnetbyaddr
10425             getnetbyname
10426             getpeername
10427             getpgrp
10428             getpriority
10429             getprotobyname
10430             getprotobynumber
10431             getpwnam
10432             getpwuid
10433             getservbyname
10434             getservbyport
10435             getsockname
10436             getsockopt
10437             glob
10438             gmtime
10439             goto
10440             grep
10441             gt
10442             hex
10443             if
10444             index
10445             int
10446             ioctl
10447             join
10448             keys
10449             kill
10450             last
10451             lc
10452             lcfirst
10453             le
10454             length
10455             link
10456             listen
10457             local
10458             localtime
10459             lock
10460             log
10461             lstat
10462             lt
10463             map
10464             mkdir
10465             msgctl
10466             msgget
10467             msgrcv
10468             msgsnd
10469             my
10470             ne
10471             next
10472             no
10473             not
10474             oct
10475             open
10476             opendir
10477             or
10478             ord
10479             our
10480             pack
10481             pipe
10482             pop
10483             pos
10484             print
10485             printf
10486             prototype
10487             push
10488             quotemeta
10489             rand
10490             read
10491             readdir
10492             readlink
10493             readline
10494             readpipe
10495             recv
10496             redo
10497             ref
10498             rename
10499             require
10500             reset
10501             return
10502             reverse
10503             rewinddir
10504             rindex
10505             rmdir
10506             scalar
10507             seek
10508             seekdir
10509             select
10510             semctl
10511             semget
10512             semop
10513             send
10514             sethostent
10515             setnetent
10516             setpgrp
10517             setpriority
10518             setprotoent
10519             setservent
10520             setsockopt
10521             shift
10522             shmctl
10523             shmget
10524             shmread
10525             shmwrite
10526             shutdown
10527             sin
10528             sleep
10529             socket
10530             socketpair
10531             sort
10532             splice
10533             split
10534             sprintf
10535             sqrt
10536             srand
10537             stat
10538             state
10539             study
10540             substr
10541             symlink
10542             syscall
10543             sysopen
10544             sysread
10545             sysseek
10546             system
10547             syswrite
10548             tell
10549             telldir
10550             tie
10551             tied
10552             truncate
10553             uc
10554             ucfirst
10555             umask
10556             undef
10557             unless
10558             unlink
10559             unpack
10560             unshift
10561             untie
10562             until
10563             use
10564             utime
10565             values
10566             vec
10567             waitpid
10568             warn
10569             while
10570             write
10571             xor
10572              
10573             switch
10574             case
10575             default
10576             given
10577             when
10578             err
10579             say
10580             isa
10581              
10582             catch
10583              
10584             );
10585              
10586             # Note: 'ADJUST', 'field' are added by sub check_options
10587             # if --use-feature=class
10588              
10589             # patched above for SWITCH/CASE given/when err say
10590             # 'err' is a fairly safe addition.
10591             # Added 'default' for Switch::Plain. Note that we could also have
10592             # a separate set of keywords to include if we see 'use Switch::Plain'
10593 39         1984 push( @Keywords, @value_requestor );
10594              
10595             # These are treated the same but are not keywords:
10596 39         166 my @extra_vr = qw(
10597             constant
10598             vars
10599             );
10600 39         327 push( @value_requestor, @extra_vr );
10601              
10602 39         4733 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10603              
10604             # this list contains keywords which do not look for arguments,
10605             # so that they might be followed by an operator, or at least
10606             # not a term.
10607 39         250 my @operator_requestor = qw(
10608             endgrent
10609             endhostent
10610             endnetent
10611             endprotoent
10612             endpwent
10613             endservent
10614             fork
10615             getgrent
10616             gethostent
10617             getlogin
10618             getnetent
10619             getppid
10620             getprotoent
10621             getpwent
10622             getservent
10623             setgrent
10624             setpwent
10625             time
10626             times
10627             wait
10628             wantarray
10629             );
10630              
10631 39         174 push( @Keywords, @operator_requestor );
10632              
10633             # These are treated the same but are not considered keywords:
10634 39         88 my @extra_or = qw(
10635             STDERR
10636             STDIN
10637             STDOUT
10638             );
10639              
10640 39         111 push( @operator_requestor, @extra_or );
10641              
10642 39         874 @expecting_operator_token{@operator_requestor} =
10643             (1) x scalar(@operator_requestor);
10644              
10645             # these token TYPES expect trailing operator but not a term
10646             # note: ++ and -- are post-increment and decrement, 'C' = constant
10647 39         202 my @operator_requestor_types = qw( ++ -- C <> q );
10648              
10649             # NOTE: This hash is available but not currently used
10650 39         184 @expecting_operator_types{@operator_requestor_types} =
10651             (1) x scalar(@operator_requestor_types);
10652              
10653             # these token TYPES consume values (terms)
10654             # note: pp and mm are pre-increment and decrement
10655             # f=semicolon in for, F=file test operator
10656 39         713 my @value_requestor_type = qw#
10657             L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10658             **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10659             <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10660             f F pp mm Y p m U J G j >> << ^ t
10661             ~. ^. |. &. ^.= |.= &.=
10662             #;
10663 39         196 push( @value_requestor_type, ',' )
10664             ; # (perl doesn't like a ',' in a qw block)
10665              
10666             # NOTE: This hash is available but not currently used
10667 39         1555 @expecting_term_types{@value_requestor_type} =
10668             (1) x scalar(@value_requestor_type);
10669              
10670             # Note: the following valid token types are not assigned here to
10671             # hashes requesting to be followed by values or terms, but are
10672             # instead currently hard-coded into sub operator_expected:
10673             # ) -> :: Q R Z ] b h i k n v w } #
10674              
10675             # For simple syntax checking, it is nice to have a list of operators which
10676             # will really be unhappy if not followed by a term. This includes most
10677             # of the above...
10678 39         952 @really_want_term{@value_requestor_type} =
10679             (1) x scalar(@value_requestor_type);
10680              
10681             # with these exceptions...
10682 39         154 delete $really_want_term{'U'}; # user sub, depends on prototype
10683 39         95 delete $really_want_term{'F'}; # file test works on $_ if no following term
10684 39         77 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10685             # let perl do it
10686 39         138 @q = qw(q qq qx qr s y tr m);
10687 39         195 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10688              
10689             # Note added 'qw' here
10690 39         150 @q = qw(q qq qw qx qr s y tr m);
10691 39         188 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10692              
10693             # Note: 'class' will be added by sub check_options if -use-feature=class
10694 39         129 @q = qw(package);
10695 39         129 @is_package{@q} = (1) x scalar(@q);
10696              
10697 39         115 @q = qw( if elsif unless );
10698 39         150 @is_if_elsif_unless{@q} = (1) x scalar(@q);
10699              
10700 39         99 @q = qw( ; t );
10701 39         117 @is_semicolon_or_t{@q} = (1) x scalar(@q);
10702              
10703 39         135 @q = qw( if elsif unless case when );
10704 39         173 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10705              
10706             # Hash of other possible line endings which may occur.
10707             # Keep these coordinated with the regex where this is used.
10708             # Note: chr(13) = chr(015)="\r".
10709 39         120 @q = ( chr(13), chr(29), chr(26) );
10710 39         210 @other_line_endings{@q} = (1) x scalar(@q);
10711              
10712             # These keywords are handled specially in the tokenizer code:
10713 39         307 my @special_keywords = qw(
10714             do
10715             eval
10716             format
10717             m
10718             package
10719             q
10720             qq
10721             qr
10722             qw
10723             qx
10724             s
10725             sub
10726             tr
10727             y
10728             );
10729 39         454 push( @Keywords, @special_keywords );
10730              
10731             # Keywords after which list formatting may be used
10732             # WARNING: do not include |map|grep|eval or perl may die on
10733             # syntax errors (map1.t).
10734 39         415 my @keyword_taking_list = qw(
10735             and
10736             chmod
10737             chomp
10738             chop
10739             chown
10740             dbmopen
10741             die
10742             elsif
10743             exec
10744             fcntl
10745             for
10746             foreach
10747             formline
10748             getsockopt
10749             if
10750             index
10751             ioctl
10752             join
10753             kill
10754             local
10755             msgctl
10756             msgrcv
10757             msgsnd
10758             my
10759             open
10760             or
10761             our
10762             pack
10763             print
10764             printf
10765             push
10766             read
10767             readpipe
10768             recv
10769             return
10770             reverse
10771             rindex
10772             seek
10773             select
10774             semctl
10775             semget
10776             send
10777             setpriority
10778             setsockopt
10779             shmctl
10780             shmget
10781             shmread
10782             shmwrite
10783             socket
10784             socketpair
10785             sort
10786             splice
10787             split
10788             sprintf
10789             state
10790             substr
10791             syscall
10792             sysopen
10793             sysread
10794             sysseek
10795             system
10796             syswrite
10797             tie
10798             unless
10799             unlink
10800             unpack
10801             unshift
10802             until
10803             vec
10804             warn
10805             while
10806             given
10807             when
10808             );
10809              
10810             # NOTE: This hash is available but not currently used
10811 39         1437 @is_keyword_taking_list{@keyword_taking_list} =
10812             (1) x scalar(@keyword_taking_list);
10813              
10814             # perl functions which may be unary operators.
10815              
10816             # This list is used to decide if a pattern delimited by slashes, /pattern/,
10817             # can follow one of these keywords.
10818 39         281 @q = qw(
10819             chomp eof eval fc lc pop shift uc undef
10820             );
10821              
10822 39         300 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10823             (1) x scalar(@q);
10824              
10825             # These are keywords for which an arg may optionally be omitted. They are
10826             # currently only used to disambiguate a ? used as a ternary from one used
10827             # as a (deprecated) pattern delimiter. In the future, they might be used
10828             # to give a warning about ambiguous syntax before a /.
10829             # Note: split has been omitted (see note below).
10830 39         423 my @keywords_taking_optional_arg = qw(
10831             abs
10832             alarm
10833             caller
10834             chdir
10835             chomp
10836             chop
10837             chr
10838             chroot
10839             close
10840             cos
10841             defined
10842             die
10843             eof
10844             eval
10845             evalbytes
10846             exit
10847             exp
10848             fc
10849             getc
10850             glob
10851             gmtime
10852             hex
10853             int
10854             last
10855             lc
10856             lcfirst
10857             length
10858             localtime
10859             log
10860             lstat
10861             mkdir
10862             next
10863             oct
10864             ord
10865             pop
10866             pos
10867             print
10868             printf
10869             prototype
10870             quotemeta
10871             rand
10872             readline
10873             readlink
10874             readpipe
10875             redo
10876             ref
10877             require
10878             reset
10879             reverse
10880             rmdir
10881             say
10882             select
10883             shift
10884             sin
10885             sleep
10886             sqrt
10887             srand
10888             stat
10889             study
10890             tell
10891             uc
10892             ucfirst
10893             umask
10894             undef
10895             unlink
10896             warn
10897             write
10898             );
10899 39         975 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10900             (1) x scalar(@keywords_taking_optional_arg);
10901              
10902             # This list is used to decide if a pattern delimited by question marks,
10903             # ?pattern?, can follow one of these keywords. Note that from perl 5.22
10904             # on, a ?pattern? is not recognized, so we can be much more strict than
10905             # with a /pattern/. Note that 'split' is not in this list. In current
10906             # versions of perl a question following split must be a ternary, but
10907             # in older versions it could be a pattern. The guessing algorithm will
10908             # decide. We are combining two lists here to simplify the test.
10909 39         949 @q = ( @keywords_taking_optional_arg, @operator_requestor );
10910 39         1423 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10911             (1) x scalar(@q);
10912              
10913             # These are not used in any way yet
10914             # my @unused_keywords = qw(
10915             # __FILE__
10916             # __LINE__
10917             # __PACKAGE__
10918             # );
10919              
10920             # The list of keywords was originally extracted from function 'keyword' in
10921             # perl file toke.c version 5.005.03, using this utility, plus a
10922             # little editing: (file getkwd.pl):
10923             # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10924             # Add 'get' prefix where necessary, then split into the above lists.
10925             # This list should be updated as necessary.
10926             # The list should not contain these special variables:
10927             # ARGV DATA ENV SIG STDERR STDIN STDOUT
10928             # __DATA__ __END__
10929              
10930 39         10120 @is_keyword{@Keywords} = (1) x scalar(@Keywords);
10931             } ## end BEGIN
10932             1;