File Coverage

blib/lib/Perl/Tidy/Tokenizer.pm
Criterion Covered Total %
statement 2315 3295 70.2
branch 1098 1748 62.8
condition 523 948 55.1
subroutine 160 192 83.3
pod 0 147 0.0
total 4096 6330 64.7


line stmt bran cond sub pod time code
1             #####################################################################
2             #
3             # The Perl::Tidy::Tokenizer package is essentially a filter which
4             # reads lines of perl source code from a source object and provides
5             # corresponding tokenized lines through its get_line() method. Lines
6             # flow from the source_object to the caller like this:
7             #
8             # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9             # get_line() get_line() get_line() line_of_tokens
10             #
11             # The source object can be any object with a get_line() method which
12             # supplies one line (a character string) perl call.
13             # The LineBuffer object is created by the Tokenizer.
14             # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15             # containing one tokenized line for each call to its get_line() method.
16             #
17             # WARNING: This is not a real class. Only one tokenizer my be used.
18             #
19             ########################################################################
20              
21             package Perl::Tidy::Tokenizer;
22 38     38   289 use strict;
  38         108  
  38         1231  
23 38     38   204 use warnings;
  38         90  
  38         1379  
24 38     38   238 use English qw( -no_match_vars );
  38         119  
  38         234  
25              
26             our $VERSION = '20230701';
27              
28 38     38   30891 use Perl::Tidy::LineBuffer;
  38         124  
  38         1176  
29 38     38   295 use Carp;
  38         112  
  38         2242  
30              
31 38     38   257 use constant DEVEL_MODE => 0;
  38         114  
  38         2142  
32 38     38   254 use constant EMPTY_STRING => q{};
  38         104  
  38         1796  
33 38     38   325 use constant SPACE => q{ };
  38         120  
  38         2281  
34              
35             # Decimal values of some ascii characters for quick checks
36 38     38   284 use constant ORD_TAB => 9;
  38         84  
  38         2092  
37 38     38   270 use constant ORD_SPACE => 32;
  38         97  
  38         1957  
38 38     38   268 use constant ORD_PRINTABLE_MIN => 33;
  38         103  
  38         2012  
39 38     38   291 use constant ORD_PRINTABLE_MAX => 126;
  38         98  
  38         7241  
40              
41             # GLOBAL VARIABLES which change during tokenization:
42             # These could also be stored in $self but it is more convenient and
43             # efficient to make them global lexical variables.
44             # INITIALIZER: sub prepare_for_a_new_file
45             my (
46              
47             $brace_depth,
48             $context,
49             $current_package,
50             $last_nonblank_block_type,
51             $last_nonblank_token,
52             $last_nonblank_type,
53             $next_sequence_number,
54             $paren_depth,
55             $rbrace_context,
56             $rbrace_package,
57             $rbrace_structural_type,
58             $rbrace_type,
59             $rcurrent_depth,
60             $rcurrent_sequence_number,
61             $rdepth_array,
62             $ris_block_function,
63             $ris_block_list_function,
64             $ris_constant,
65             $ris_user_function,
66             $rnested_statement_type,
67             $rnested_ternary_flag,
68             $rparen_semicolon_count,
69             $rparen_structural_type,
70             $rparen_type,
71             $rsaw_function_definition,
72             $rsaw_use_module,
73             $rsquare_bracket_structural_type,
74             $rsquare_bracket_type,
75             $rstarting_line_of_current_depth,
76             $rtotal_depth,
77             $ruser_function_prototype,
78             $square_bracket_depth,
79             $statement_type,
80             $total_depth,
81             );
82              
83             my (
84              
85             # GLOBAL CONSTANTS for routines in this package,
86             # INITIALIZER: BEGIN block.
87             %can_start_digraph,
88             %expecting_operator_token,
89             %expecting_operator_types,
90             %expecting_term_token,
91             %expecting_term_types,
92             %is_block_operator,
93             %is_comma_question_colon,
94             %is_digraph,
95             %is_file_test_operator,
96             %is_if_elsif_unless,
97             %is_if_elsif_unless_case_when,
98             %is_indirect_object_taker,
99             %is_keyword_rejecting_question_as_pattern_delimiter,
100             %is_keyword_rejecting_slash_as_pattern_delimiter,
101             %is_keyword_taking_list,
102             %is_keyword_taking_optional_arg,
103             %is_q_qq_qw_qx_qr_s_y_tr_m,
104             %is_q_qq_qx_qr_s_y_tr_m,
105             %is_semicolon_or_t,
106             %is_sort_map_grep,
107             %is_sort_map_grep_eval_do,
108             %is_tetragraph,
109             %is_trigraph,
110             %is_valid_token_type,
111             %other_line_endings,
112             %really_want_term,
113             @closing_brace_names,
114             @opening_brace_names,
115              
116             # GLOBAL VARIABLES which are constant after being configured.
117             # INITIALIZER: BEGIN block and modified by sub check_options
118             %is_code_block_token,
119             %is_keyword,
120             %is_my_our_state,
121             %is_package,
122              
123             # INITIALIZER: sub check_options
124             $code_skipping_pattern_begin,
125             $code_skipping_pattern_end,
126             $rOpts_code_skipping,
127             %is_END_DATA_format_sub,
128             %is_grep_alias,
129             %is_sub,
130             );
131              
132             # possible values of operator_expected()
133 38     38   321 use constant TERM => -1;
  38         145  
  38         2163  
134 38     38   274 use constant UNKNOWN => 0;
  38         104  
  38         1986  
135 38     38   316 use constant OPERATOR => 1;
  38         84  
  38         2046  
136              
137             # possible values of context
138 38     38   246 use constant SCALAR_CONTEXT => -1;
  38         127  
  38         5617  
139 38     38   274 use constant UNKNOWN_CONTEXT => 0;
  38         82  
  38         2882  
140 38     38   278 use constant LIST_CONTEXT => 1;
  38         112  
  38         1896  
141              
142             # Maximum number of little messages; probably need not be changed.
143 38     38   266 use constant MAX_NAG_MESSAGES => 6;
  38         115  
  38         8286  
144              
145 0         0 BEGIN {
146              
147             # Array index names for $self.
148             # Do not combine with other BEGIN blocks (c101).
149 38     38   249989 my $i = 0;
150             use constant {
151 38         23422 _rhere_target_list_ => $i++,
152             _in_here_doc_ => $i++,
153             _here_doc_target_ => $i++,
154             _here_quote_character_ => $i++,
155             _in_data_ => $i++,
156             _in_end_ => $i++,
157             _in_format_ => $i++,
158             _in_error_ => $i++,
159             _in_trouble_ => $i++,
160             _warning_count_ => $i++,
161             _html_tag_count_ => $i++,
162             _in_pod_ => $i++,
163             _in_skipped_ => $i++,
164             _in_attribute_list_ => $i++,
165             _in_quote_ => $i++,
166             _quote_target_ => $i++,
167             _line_start_quote_ => $i++,
168             _starting_level_ => $i++,
169             _know_starting_level_ => $i++,
170             _tabsize_ => $i++,
171             _indent_columns_ => $i++,
172             _look_for_hash_bang_ => $i++,
173             _trim_qw_ => $i++,
174             _continuation_indentation_ => $i++,
175             _outdent_labels_ => $i++,
176             _last_line_number_ => $i++,
177             _saw_perl_dash_P_ => $i++,
178             _saw_perl_dash_w_ => $i++,
179             _saw_use_strict_ => $i++,
180             _saw_v_string_ => $i++,
181             _saw_brace_error_ => $i++,
182             _hit_bug_ => $i++,
183             _look_for_autoloader_ => $i++,
184             _look_for_selfloader_ => $i++,
185             _saw_autoloader_ => $i++,
186             _saw_selfloader_ => $i++,
187             _saw_hash_bang_ => $i++,
188             _saw_end_ => $i++,
189             _saw_data_ => $i++,
190             _saw_negative_indentation_ => $i++,
191             _started_tokenizing_ => $i++,
192             _line_buffer_object_ => $i++,
193             _debugger_object_ => $i++,
194             _diagnostics_object_ => $i++,
195             _logger_object_ => $i++,
196             _unexpected_error_count_ => $i++,
197             _started_looking_for_here_target_at_ => $i++,
198             _nearly_matched_here_target_at_ => $i++,
199             _line_of_text_ => $i++,
200             _rlower_case_labels_at_ => $i++,
201             _extended_syntax_ => $i++,
202             _maximum_level_ => $i++,
203             _true_brace_error_count_ => $i++,
204             _rOpts_maximum_level_errors_ => $i++,
205             _rOpts_maximum_unexpected_errors_ => $i++,
206             _rOpts_logfile_ => $i++,
207             _rOpts_ => $i++,
208             _calculate_ci_ => $i++,
209 38     38   335 };
  38         113  
210             } ## end BEGIN
211              
212             { ## closure for subs to count instances
213              
214             # methods to count instances
215             my $_count = 0;
216 0     0 0 0 sub get_count { return $_count; }
217 556     556   2540 sub _increment_count { return ++$_count }
218 556     556   1311 sub _decrement_count { return --$_count }
219             }
220              
221             sub DESTROY {
222 556     556   1412 my $self = shift;
223 556         2961 $self->_decrement_count();
224 556         7795 return;
225             }
226              
227             sub AUTOLOAD {
228              
229             # Catch any undefined sub calls so that we are sure to get
230             # some diagnostic information. This sub should never be called
231             # except for a programming error.
232 0     0   0 our $AUTOLOAD;
233 0 0       0 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
234 0         0 my ( $pkg, $fname, $lno ) = caller();
235 0         0 my $my_package = __PACKAGE__;
236 0         0 print STDERR <<EOM;
237             ======================================================================
238             Error detected in package '$my_package', version $VERSION
239             Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
240             Called from package: '$pkg'
241             Called from File '$fname' at line '$lno'
242             This error is probably due to a recent programming change
243             ======================================================================
244             EOM
245 0         0 exit 1;
246             } ## end sub AUTOLOAD
247              
248             sub Die {
249 0     0 0 0 my ($msg) = @_;
250 0         0 Perl::Tidy::Die($msg);
251 0         0 croak "unexpected return from Perl::Tidy::Die";
252             }
253              
254             sub Fault {
255 0     0 0 0 my ( $self, $msg ) = @_;
256              
257             # This routine is called for errors that really should not occur
258             # except if there has been a bug introduced by a recent program change.
259             # Please add comments at calls to Fault to explain why the call
260             # should not occur, and where to look to fix it.
261 0         0 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
262 0         0 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
263 0         0 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
264 0         0 my $pkg = __PACKAGE__;
265              
266             # Catch potential error of Fault not called as a method
267 0         0 my $input_stream_name;
268 0 0       0 if ( !ref($self) ) {
269 0         0 $msg = "Fault not called as a method - please fix\n";
270 0 0 0     0 if ( $self && length($self) < 200 ) { $msg .= $self }
  0         0  
271 0         0 $self = undef;
272 0         0 $input_stream_name = "(UNKNOWN)";
273             }
274             else {
275 0         0 $input_stream_name = $self->get_input_stream_name();
276             }
277              
278 0         0 Die(<<EOM);
279             ==============================================================================
280             While operating on input stream with name: '$input_stream_name'
281             A fault was detected at line $line0 of sub '$subroutine1'
282             in file '$filename1'
283             which was called from line $line1 of sub '$subroutine2'
284             Message: '$msg'
285             This is probably an error introduced by a recent programming change.
286             $pkg reports VERSION='$VERSION'.
287             ==============================================================================
288             EOM
289              
290             # We shouldn't get here, but this return is to keep Perl-Critic from
291             # complaining.
292 0         0 return;
293             } ## end sub Fault
294              
295             sub bad_pattern {
296              
297             # See if a pattern will compile. We have to use a string eval here,
298             # but it should be safe because the pattern has been constructed
299             # by this program.
300 1108     1108 0 2598 my ($pattern) = @_;
301 1108         82436 my $ok = eval "'##'=~/$pattern/";
302 1108   33     10116 return !defined($ok) || $EVAL_ERROR;
303             } ## end sub bad_pattern
304              
305             sub make_code_skipping_pattern {
306 1108     1108 0 3389 my ( $rOpts, $opt_name, $default ) = @_;
307 1108         2756 my $param = $rOpts->{$opt_name};
308 1108 100       2973 unless ($param) { $param = $default }
  1106         2203  
309 1108         4601 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
310 1108 50       4777 if ( $param !~ /^#/ ) {
311 0         0 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
312             }
313 1108         3863 my $pattern = '^\s*' . $param . '\b';
314 1108 50       3357 if ( bad_pattern($pattern) ) {
315 0         0 Die(
316             "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
317             );
318             }
319 1108         3945 return $pattern;
320             } ## end sub make_code_skipping_pattern
321              
322             sub check_options {
323              
324             # Check Tokenizer parameters
325 554     554 0 1543 my $rOpts = shift;
326              
327 554         2188 %is_sub = ();
328 554         1834 $is_sub{'sub'} = 1;
329              
330 554         4129 %is_END_DATA_format_sub = (
331             '__END__' => 1,
332             '__DATA__' => 1,
333             'format' => 1,
334             'sub' => 1,
335             );
336              
337             # Install any aliases to 'sub'
338 554 50       2205 if ( $rOpts->{'sub-alias-list'} ) {
339              
340             # Note that any 'sub-alias-list' has been preprocessed to
341             # be a trimmed, space-separated list which includes 'sub'
342             # for example, it might be 'sub method fun'
343 554         4373 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
344 554         2355 foreach my $word (@sub_alias_list) {
345 1114         2421 $is_sub{$word} = 1;
346 1114         2684 $is_END_DATA_format_sub{$word} = 1;
347             }
348             }
349              
350             #------------------------------------------------
351             # Update hash values for any -use-feature options
352             #------------------------------------------------
353 554         4419 my $use_feature_class = $rOpts->{'use-feature'} =~ /\bclass\b/;
354              
355             # These are the main updates for this option. There are additional
356             # changes elsewhere, usually indicated with a comment 'rt145706'
357              
358             # Update hash values for use_feature=class, added for rt145706
359             # see 'perlclass.pod'
360              
361             # IMPORTANT: We are changing global hash values initially set in a BEGIN
362             # block. Values must be defined (true or false) for each of these new
363             # words whether true or false. Otherwise, programs using the module which
364             # change options between runs (such as test code) will have
365             # incorrect settings and fail.
366              
367             # There are 4 new keywords:
368              
369             # 'class' - treated specially as generalization of 'package'
370             # Note: we must not set 'class' to be a keyword to avoid problems
371             # with older uses.
372 554         2475 $is_package{'class'} = $use_feature_class;
373              
374             # 'method' - treated like sub using the sub-alias-list option
375             # Note: we must not set 'method' to be a keyword to avoid problems
376             # with older uses.
377              
378             # 'field' - added as a keyword, and works like 'my'
379 554         1764 $is_keyword{'field'} = $use_feature_class;
380 554         1511 $is_my_our_state{'field'} = $use_feature_class;
381              
382             # 'ADJUST' - added as a keyword and works like 'BEGIN'
383             # TODO: if ADJUST gets a paren list, this will need to be updated
384 554         1587 $is_keyword{'ADJUST'} = $use_feature_class;
385 554         1587 $is_code_block_token{'ADJUST'} = $use_feature_class;
386              
387 554         2142 %is_grep_alias = ();
388 554 50       1964 if ( $rOpts->{'grep-alias-list'} ) {
389              
390             # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
391             # space-separated list
392 554         3797 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
393 554         4306 @{is_grep_alias}{@q} = (1) x scalar(@q);
394             }
395              
396 554         1678 $rOpts_code_skipping = $rOpts->{'code-skipping'};
397 554         2823 $code_skipping_pattern_begin =
398             make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
399 554         2105 $code_skipping_pattern_end =
400             make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
401              
402 554         2752 return;
403             } ## end sub check_options
404              
405             sub new {
406              
407 556     556 0 5789 my ( $class, @args ) = @_;
408              
409             # Note: 'tabs' and 'indent_columns' are temporary and should be
410             # removed asap
411 556         8851 my %defaults = (
412             source_object => undef,
413             debugger_object => undef,
414             diagnostics_object => undef,
415             logger_object => undef,
416             starting_level => undef,
417             indent_columns => 4,
418             tabsize => 8,
419             look_for_hash_bang => 0,
420             trim_qw => 1,
421             look_for_autoloader => 1,
422             look_for_selfloader => 1,
423             starting_line_number => 1,
424             extended_syntax => 0,
425             rOpts => {},
426             );
427 556         7890 my %args = ( %defaults, @args );
428              
429             # we are given an object with a get_line() method to supply source lines
430 556         2259 my $source_object = $args{source_object};
431 556         1627 my $rOpts = $args{rOpts};
432              
433             # we create another object with a get_line() and peek_ahead() method
434 556         4097 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
435              
436             # Tokenizer state data is as follows:
437             # _rhere_target_list_ reference to list of here-doc targets
438             # _here_doc_target_ the target string for a here document
439             # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
440             # to determine if interpolation is done
441             # _quote_target_ character we seek if chasing a quote
442             # _line_start_quote_ line where we started looking for a long quote
443             # _in_here_doc_ flag indicating if we are in a here-doc
444             # _in_pod_ flag set if we are in pod documentation
445             # _in_skipped_ flag set if we are in a skipped section
446             # _in_error_ flag set if we saw severe error (binary in script)
447             # _in_trouble_ set if we saw a troublesome lexical like 'my sub s'
448             # _warning_count_ number of calls to logger sub warning
449             # _html_tag_count_ number of apparent html tags seen (indicates html)
450             # _in_data_ flag set if we are in __DATA__ section
451             # _in_end_ flag set if we are in __END__ section
452             # _in_format_ flag set if we are in a format description
453             # _in_attribute_list_ flag telling if we are looking for attributes
454             # _in_quote_ flag telling if we are chasing a quote
455             # _starting_level_ indentation level of first line
456             # _line_buffer_object_ object with get_line() method to supply source code
457             # _diagnostics_object_ place to write debugging information
458             # _unexpected_error_count_ error count used to limit output
459             # _lower_case_labels_at_ line numbers where lower case labels seen
460             # _hit_bug_ program bug detected
461              
462 556         1621 my $self = [];
463 556         1726 $self->[_rhere_target_list_] = [];
464 556         1298 $self->[_in_here_doc_] = 0;
465 556         1545 $self->[_here_doc_target_] = EMPTY_STRING;
466 556         1449 $self->[_here_quote_character_] = EMPTY_STRING;
467 556         1718 $self->[_in_data_] = 0;
468 556         1645 $self->[_in_end_] = 0;
469 556         1413 $self->[_in_format_] = 0;
470 556         1334 $self->[_in_error_] = 0;
471 556         1334 $self->[_in_trouble_] = 0;
472 556         1345 $self->[_warning_count_] = 0;
473 556         1363 $self->[_html_tag_count_] = 0;
474 556         1346 $self->[_in_pod_] = 0;
475 556         1336 $self->[_in_skipped_] = 0;
476 556         1215 $self->[_in_attribute_list_] = 0;
477 556         1313 $self->[_in_quote_] = 0;
478 556         1446 $self->[_quote_target_] = EMPTY_STRING;
479 556         1249 $self->[_line_start_quote_] = -1;
480 556         1302 $self->[_starting_level_] = $args{starting_level};
481 556         1658 $self->[_know_starting_level_] = defined( $args{starting_level} );
482 556         1484 $self->[_tabsize_] = $args{tabsize};
483 556         1547 $self->[_indent_columns_] = $args{indent_columns};
484 556         1506 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
485 556         1387 $self->[_trim_qw_] = $args{trim_qw};
486 556         1423 $self->[_continuation_indentation_] = $args{continuation_indentation};
487 556         1385 $self->[_outdent_labels_] = $args{outdent_labels};
488 556         1622 $self->[_last_line_number_] = $args{starting_line_number} - 1;
489 556         1163 $self->[_saw_perl_dash_P_] = 0;
490 556         1372 $self->[_saw_perl_dash_w_] = 0;
491 556         1205 $self->[_saw_use_strict_] = 0;
492 556         1280 $self->[_saw_v_string_] = 0;
493 556         1311 $self->[_saw_brace_error_] = 0;
494 556         1179 $self->[_hit_bug_] = 0;
495 556         1254 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
496 556         1559 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
497 556         1322 $self->[_saw_autoloader_] = 0;
498 556         1196 $self->[_saw_selfloader_] = 0;
499 556         1163 $self->[_saw_hash_bang_] = 0;
500 556         1200 $self->[_saw_end_] = 0;
501 556         1238 $self->[_saw_data_] = 0;
502 556         1250 $self->[_saw_negative_indentation_] = 0;
503 556         1408 $self->[_started_tokenizing_] = 0;
504 556         1270 $self->[_line_buffer_object_] = $line_buffer_object;
505 556         1283 $self->[_debugger_object_] = $args{debugger_object};
506 556         1477 $self->[_diagnostics_object_] = $args{diagnostics_object};
507 556         1297 $self->[_logger_object_] = $args{logger_object};
508 556         1275 $self->[_unexpected_error_count_] = 0;
509 556         1220 $self->[_started_looking_for_here_target_at_] = 0;
510 556         1286 $self->[_nearly_matched_here_target_at_] = undef;
511 556         1600 $self->[_line_of_text_] = EMPTY_STRING;
512 556         1351 $self->[_rlower_case_labels_at_] = undef;
513 556         1284 $self->[_extended_syntax_] = $args{extended_syntax};
514 556         1414 $self->[_maximum_level_] = 0;
515 556         1335 $self->[_true_brace_error_count_] = 0;
516 556         1364 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
517             $self->[_rOpts_maximum_unexpected_errors_] =
518 556         1500 $rOpts->{'maximum-unexpected-errors'};
519 556         1449 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
520 556         1302 $self->[_rOpts_] = $rOpts;
521              
522             # -exp=ci0 and -exp=ci1 turn on the tokenizer ci calculation for testing.
523             # See comments in sub Perl::Tidy::Formatter::set_ci.
524 556         1305 my $calculate_ci = 0; # current default
525 556 50 33     2581 if ( $rOpts->{'experimental'} && $rOpts->{'experimental'} =~ /\bci(\d+)\b/ )
526             {
527 0   0     0 $calculate_ci = ( $1 == 0 || $1 == 1 );
528             }
529 556         1320 $self->[_calculate_ci_] = $calculate_ci;
530              
531             # These vars are used for guessing indentation and must be positive
532 556 50       2202 $self->[_tabsize_] = 8 if ( !$self->[_tabsize_] );
533 556 100       1933 $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
534              
535 556         1375 bless $self, $class;
536              
537 556         3387 $self->prepare_for_a_new_file();
538 556         3241 $self->find_starting_indentation_level();
539              
540             # This is not a full class yet, so die if an attempt is made to
541             # create more than one object.
542              
543 556 50       2876 if ( _increment_count() > 1 ) {
544 0         0 confess
545             "Attempt to create more than 1 object in $class, which is not a true class yet\n";
546             }
547              
548 556         6064 return $self;
549              
550             } ## end sub new
551              
552             # Called externally
553             sub get_unexpected_error_count {
554 4     4 0 16 my ($self) = @_;
555 4         17 return $self->[_unexpected_error_count_];
556             }
557              
558             # Called externally
559             sub is_keyword {
560 2766     2766 0 4959 my ($str) = @_;
561 2766         9828 return $is_keyword{$str};
562             }
563              
564             #-----------------------------------------
565             # interface to Perl::Tidy::Logger routines
566             #-----------------------------------------
567             sub warning {
568              
569 0     0 0 0 my ( $self, $msg ) = @_;
570              
571 0         0 my $logger_object = $self->[_logger_object_];
572 0         0 $self->[_warning_count_]++;
573 0 0       0 if ($logger_object) {
574 0         0 my $msg_line_number = $self->[_last_line_number_];
575 0         0 $logger_object->warning( $msg, $msg_line_number );
576             }
577 0         0 return;
578             } ## end sub warning
579              
580             sub get_input_stream_name {
581              
582 0     0 0 0 my $self = shift;
583              
584 0         0 my $input_stream_name = EMPTY_STRING;
585 0         0 my $logger_object = $self->[_logger_object_];
586 0 0       0 if ($logger_object) {
587 0         0 $input_stream_name = $logger_object->get_input_stream_name();
588             }
589 0         0 return $input_stream_name;
590             } ## end sub get_input_stream_name
591              
592             sub complain {
593              
594 32     32 0 99 my ( $self, $msg ) = @_;
595              
596 32         72 my $logger_object = $self->[_logger_object_];
597 32 50       101 if ($logger_object) {
598 32         85 my $input_line_number = $self->[_last_line_number_];
599 32         166 $logger_object->complain( $msg, $input_line_number );
600             }
601 32         81 return;
602             } ## end sub complain
603              
604             sub write_logfile_entry {
605              
606 1840     1840 0 4254 my ( $self, $msg ) = @_;
607              
608 1840         3502 my $logger_object = $self->[_logger_object_];
609 1840 100       4597 if ($logger_object) {
610 1834         5296 $logger_object->write_logfile_entry($msg);
611             }
612 1840         5323 return;
613             } ## end sub write_logfile_entry
614              
615             sub interrupt_logfile {
616              
617 0     0 0 0 my $self = shift;
618              
619 0         0 my $logger_object = $self->[_logger_object_];
620 0 0       0 if ($logger_object) {
621 0         0 $logger_object->interrupt_logfile();
622             }
623 0         0 return;
624             } ## end sub interrupt_logfile
625              
626             sub resume_logfile {
627              
628 0     0 0 0 my $self = shift;
629              
630 0         0 my $logger_object = $self->[_logger_object_];
631 0 0       0 if ($logger_object) {
632 0         0 $logger_object->resume_logfile();
633             }
634 0         0 return;
635             } ## end sub resume_logfile
636              
637             sub brace_warning {
638 0     0 0 0 my ( $self, $msg ) = @_;
639 0         0 $self->[_saw_brace_error_]++;
640              
641 0         0 my $logger_object = $self->[_logger_object_];
642 0 0       0 if ($logger_object) {
643 0         0 my $msg_line_number = $self->[_last_line_number_];
644 0         0 $logger_object->brace_warning( $msg, $msg_line_number );
645             }
646 0         0 return;
647             } ## end sub brace_warning
648              
649             sub increment_brace_error {
650              
651             # This is same as sub brace_warning but without a message
652 0     0 0 0 my $self = shift;
653 0         0 $self->[_saw_brace_error_]++;
654              
655 0         0 my $logger_object = $self->[_logger_object_];
656 0 0       0 if ($logger_object) {
657 0         0 $logger_object->increment_brace_error();
658             }
659 0         0 return;
660             } ## end sub increment_brace_error
661              
662             sub get_saw_brace_error {
663 0     0 0 0 my $self = shift;
664 0         0 return $self->[_saw_brace_error_];
665             } ## end sub get_saw_brace_error
666              
667             sub report_definite_bug {
668 0     0 0 0 my $self = shift;
669 0         0 $self->[_hit_bug_] = 1;
670 0         0 my $logger_object = $self->[_logger_object_];
671 0 0       0 if ($logger_object) {
672 0         0 $logger_object->report_definite_bug();
673             }
674 0         0 return;
675             } ## end sub report_definite_bug
676              
677             #-------------------------------------
678             # Interface to Perl::Tidy::Diagnostics
679             #-------------------------------------
680             sub write_diagnostics {
681 0     0 0 0 my ( $self, $msg ) = @_;
682 0         0 my $input_line_number = $self->[_last_line_number_];
683 0         0 my $diagnostics_object = $self->[_diagnostics_object_];
684 0 0       0 if ($diagnostics_object) {
685 0         0 $diagnostics_object->write_diagnostics( $msg, $input_line_number );
686             }
687 0         0 return;
688             } ## end sub write_diagnostics
689              
690             sub report_tokenization_errors {
691              
692 556     556 0 1891 my ($self) = @_;
693              
694             # Report any tokenization errors and return a flag '$severe_error'.
695             # Set $severe_error = 1 if the tokenization errors are so severe that
696             # the formatter should not attempt to format the file. Instead, it will
697             # just output the file verbatim.
698              
699             # set severe error flag if tokenizer has encountered file reading problems
700             # (i.e. unexpected binary characters)
701             # or code which may not be formatted correctly (such as 'my sub q')
702             # The difference between _in_error_ and _in_trouble_ is that
703             # _in_error_ stops the tokenizer immediately whereas
704             # _in_trouble_ lets the tokenizer finish so that all errors are seen
705             # Both block formatting and cause the input stream to be output verbatim.
706 556   33     3617 my $severe_error = $self->[_in_error_] || $self->[_in_trouble_];
707              
708             # And do not format if it looks like an html file (c209)
709 556   33     4142 $severe_error ||= $self->[_html_tag_count_] && $self->[_warning_count_];
      33        
710              
711             # Inform the logger object on length of input stream
712 556         1467 my $logger_object = $self->[_logger_object_];
713 556 100       2044 if ($logger_object) {
714 554         1469 my $last_line_number = $self->[_last_line_number_];
715 554         2873 $logger_object->set_last_input_line_number($last_line_number);
716             }
717              
718 556         1507 my $maxle = $self->[_rOpts_maximum_level_errors_];
719 556         1569 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
720 556 50       1783 $maxle = 1 unless defined($maxle);
721 556 50       1761 $maxue = 0 unless defined($maxue);
722              
723 556         2386 my $level = get_indentation_level();
724 556 50       2174 if ( $level != $self->[_starting_level_] ) {
725 0         0 $self->warning("final indentation level: $level\n");
726 0         0 my $level_diff = $self->[_starting_level_] - $level;
727 0 0       0 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
  0         0  
728              
729             # Set severe error flag if the level error is greater than 1.
730             # The formatter can function for any level error but it is probably
731             # best not to attempt formatting for a high level error.
732 0 0 0     0 if ( $maxle >= 0 && $level_diff > $maxle ) {
733 0         0 $severe_error = 1;
734 0         0 $self->warning(<<EOM);
735             Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
736             EOM
737             }
738             }
739              
740 556         3163 $self->check_final_nesting_depths();
741              
742             # Likewise, large numbers of brace errors usually indicate non-perl
743             # scripts, so set the severe error flag at a low number. This is similar
744             # to the level check, but different because braces may balance but be
745             # incorrectly interlaced.
746 556 50       2824 if ( $self->[_true_brace_error_count_] > 2 ) {
747 0         0 $severe_error = 1;
748             }
749              
750 556 50 66     2634 if ( $self->[_look_for_hash_bang_]
751             && !$self->[_saw_hash_bang_] )
752             {
753 0         0 $self->warning(
754             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
755             }
756              
757 556 50       1904 if ( $self->[_in_format_] ) {
758 0         0 $self->warning("hit EOF while in format description\n");
759             }
760              
761 556 50       1997 if ( $self->[_in_skipped_] ) {
762 0         0 $self->write_logfile_entry(
763             "hit EOF while in lines skipped with --code-skipping\n");
764             }
765              
766 556 50       1995 if ( $self->[_in_pod_] ) {
767              
768             # Just write log entry if this is after __END__ or __DATA__
769             # because this happens to often, and it is not likely to be
770             # a parsing error.
771 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
772 0         0 $self->write_logfile_entry(
773             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
774             );
775             }
776              
777             else {
778 0         0 $self->complain(
779             "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
780             );
781             }
782              
783             }
784              
785 556 50       2027 if ( $self->[_in_here_doc_] ) {
786 0         0 $severe_error = 1;
787 0         0 my $here_doc_target = $self->[_here_doc_target_];
788 0         0 my $started_looking_for_here_target_at =
789             $self->[_started_looking_for_here_target_at_];
790 0 0       0 if ($here_doc_target) {
791 0         0 $self->warning(
792             "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
793             );
794             }
795             else {
796 0         0 $self->warning(<<EOM);
797             Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
798             (Perl will match to the end of file but this may not be intended).
799             EOM
800             }
801 0         0 my $nearly_matched_here_target_at =
802             $self->[_nearly_matched_here_target_at_];
803 0 0       0 if ($nearly_matched_here_target_at) {
804 0         0 $self->warning(
805             "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
806             );
807             }
808             }
809              
810             # Something is seriously wrong if we ended inside a quote
811 556 50       2020 if ( $self->[_in_quote_] ) {
812 0         0 $severe_error = 1;
813 0         0 my $line_start_quote = $self->[_line_start_quote_];
814 0         0 my $quote_target = $self->[_quote_target_];
815 0 0       0 my $what =
816             ( $self->[_in_attribute_list_] )
817             ? "attribute list"
818             : "quote/pattern";
819 0         0 $self->warning(
820             "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
821             );
822             }
823              
824 556 50       2289 if ( $self->[_hit_bug_] ) {
825 0         0 $severe_error = 1;
826             }
827              
828             # Multiple "unexpected" type tokenization errors usually indicate parsing
829             # non-perl scripts, or that something is seriously wrong, so we should
830             # avoid formatting them. This can happen for example if we run perltidy on
831             # a shell script or an html file. But unfortunately this check can
832             # interfere with some extended syntaxes, such as RPerl, so it has to be off
833             # by default.
834 556         1420 my $ue_count = $self->[_unexpected_error_count_];
835 556 50 33     2447 if ( $maxue > 0 && $ue_count > $maxue ) {
836 0         0 $self->warning(<<EOM);
837             Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
838             EOM
839 0         0 $severe_error = 1;
840             }
841              
842 556 100       1997 unless ( $self->[_saw_perl_dash_w_] ) {
843 540 50       2199 if ( $] < 5.006 ) {
844 0         0 $self->write_logfile_entry("Suggest including '-w parameter'\n");
845             }
846             else {
847 540         1905 $self->write_logfile_entry("Suggest including 'use warnings;'\n");
848             }
849             }
850              
851 556 50       3412 if ( $self->[_saw_perl_dash_P_] ) {
852 0         0 $self->write_logfile_entry(
853             "Use of -P parameter for defines is discouraged\n");
854             }
855              
856 556 100       2241 unless ( $self->[_saw_use_strict_] ) {
857 542         1592 $self->write_logfile_entry("Suggest including 'use strict;'\n");
858             }
859              
860             # it is suggested that labels have at least one upper case character
861             # for legibility and to avoid code breakage as new keywords are introduced
862 556 100       3352 if ( $self->[_rlower_case_labels_at_] ) {
863 12         33 my @lower_case_labels_at = @{ $self->[_rlower_case_labels_at_] };
  12         36  
864 12         51 $self->write_logfile_entry(
865             "Suggest using upper case characters in label(s)\n");
866 12         94 local $LIST_SEPARATOR = ')(';
867 12         81 $self->write_logfile_entry(
868             " defined at line(s): (@lower_case_labels_at)\n");
869             }
870 556         2154 return $severe_error;
871             } ## end sub report_tokenization_errors
872              
873             sub report_v_string {
874              
875             # warn if this version can't handle v-strings
876 2     2 0 10 my ( $self, $tok ) = @_;
877 2 50       10 unless ( $self->[_saw_v_string_] ) {
878 2         6 $self->[_saw_v_string_] = $self->[_last_line_number_];
879             }
880 2 50       8 if ( $] < 5.006 ) {
881 0         0 $self->warning(
882             "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
883             );
884             }
885 2         6 return;
886             } ## end sub report_v_string
887              
888             sub is_valid_token_type {
889 3     3 0 7 my ($type) = @_;
890 3         13 return $is_valid_token_type{$type};
891             }
892              
893             sub log_numbered_msg {
894 165     165 0 817 my ( $self, $msg ) = @_;
895              
896             # write input line number + message to logfile
897 165         364 my $input_line_number = $self->[_last_line_number_];
898 165         780 $self->write_logfile_entry("Line $input_line_number: $msg");
899 165         407 return;
900             } ## end sub log_numbered_msg
901              
902             # returns the next tokenized line
903             sub get_line {
904              
905 8205     8205 0 15740 my $self = shift;
906              
907             # USES GLOBAL VARIABLES:
908             # $brace_depth, $square_bracket_depth, $paren_depth
909              
910 8205         25794 my $input_line = $self->[_line_buffer_object_]->get_line();
911 8205         16028 $self->[_line_of_text_] = $input_line;
912              
913 8205 100       19855 return unless ($input_line);
914              
915 7649         12591 my $input_line_number = ++$self->[_last_line_number_];
916              
917             # Find and remove what characters terminate this line, including any
918             # control r
919 7649         11649 my $input_line_separator = EMPTY_STRING;
920 7649 50       21229 if ( chomp($input_line) ) {
921 7649         17473 $input_line_separator = $INPUT_RECORD_SEPARATOR;
922             }
923              
924             # The first test here very significantly speeds things up, but be sure to
925             # keep the regex and hash %other_line_endings the same.
926 7649 100       23927 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
927 24 50       357 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
928 24         82 $input_line_separator = $2 . $input_line_separator;
929             }
930             }
931              
932             # for backwards compatibility we keep the line text terminated with
933             # a newline character
934 7649         15528 $input_line .= "\n";
935 7649         13520 $self->[_line_of_text_] = $input_line;
936              
937             # create a data structure describing this line which will be
938             # returned to the caller.
939              
940             # _line_type codes are:
941             # SYSTEM - system-specific code before hash-bang line
942             # CODE - line of perl code (including comments)
943             # POD_START - line starting pod, such as '=head'
944             # POD - pod documentation text
945             # POD_END - last line of pod section, '=cut'
946             # HERE - text of here-document
947             # HERE_END - last line of here-doc (target word)
948             # FORMAT - format section
949             # FORMAT_END - last line of format section, '.'
950             # SKIP - code skipping section
951             # SKIP_END - last line of code skipping section, '#>>V'
952             # DATA_START - __DATA__ line
953             # DATA - unidentified text following __DATA__
954             # END_START - __END__ line
955             # END - unidentified text following __END__
956             # ERROR - we are in big trouble, probably not a perl script
957              
958             # Other variables:
959             # _curly_brace_depth - depth of curly braces at start of line
960             # _square_bracket_depth - depth of square brackets at start of line
961             # _paren_depth - depth of parens at start of line
962             # _starting_in_quote - this line continues a multi-line quote
963             # (so don't trim leading blanks!)
964             # _ending_in_quote - this line ends in a multi-line quote
965             # (so don't trim trailing blanks!)
966 7649         42598 my $line_of_tokens = {
967             _line_type => 'EOF',
968             _line_text => $input_line,
969             _line_number => $input_line_number,
970             _guessed_indentation_level => 0,
971             _curly_brace_depth => $brace_depth,
972             _square_bracket_depth => $square_bracket_depth,
973             _paren_depth => $paren_depth,
974             _quote_character => EMPTY_STRING,
975             ## Skip these needless initializations for efficiency:
976             ## _rtoken_type => undef,
977             ## _rtokens => undef,
978             ## _rlevels => undef,
979             ## _rblock_type => undef,
980             ## _rtype_sequence => undef,
981             ## _rci_levels => undef,
982             ## _starting_in_quote => 0,
983             ## _ending_in_quote => 0,
984             };
985              
986             # must print line unchanged if we are in a here document
987 7649 100       38786 if ( $self->[_in_here_doc_] ) {
    100          
    100          
    100          
    50          
    100          
    100          
988              
989 24         82 $line_of_tokens->{_line_type} = 'HERE';
990 24         56 my $here_doc_target = $self->[_here_doc_target_];
991 24         63 my $here_quote_character = $self->[_here_quote_character_];
992 24         62 my $candidate_target = $input_line;
993 24         45 chomp $candidate_target;
994              
995             # Handle <<~ targets, which are indicated here by a leading space on
996             # the here quote character
997 24 100       111 if ( $here_quote_character =~ /^\s/ ) {
998 4         16 $candidate_target =~ s/^\s*//;
999             }
1000 24 100       87 if ( $candidate_target eq $here_doc_target ) {
1001 9         52 $self->[_nearly_matched_here_target_at_] = undef;
1002 9         43 $line_of_tokens->{_line_type} = 'HERE_END';
1003 9         65 $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
1004              
1005 9         45 my $rhere_target_list = $self->[_rhere_target_list_];
1006 9 50       42 if ( @{$rhere_target_list} ) { # there can be multiple here targets
  9         41  
1007             ( $here_doc_target, $here_quote_character ) =
1008 0         0 @{ shift @{$rhere_target_list} };
  0         0  
  0         0  
1009 0         0 $self->[_here_doc_target_] = $here_doc_target;
1010 0         0 $self->[_here_quote_character_] = $here_quote_character;
1011 0         0 $self->log_numbered_msg(
1012             "Entering HERE document $here_doc_target\n");
1013 0         0 $self->[_nearly_matched_here_target_at_] = undef;
1014 0         0 $self->[_started_looking_for_here_target_at_] =
1015             $input_line_number;
1016             }
1017             else {
1018 9         32 $self->[_in_here_doc_] = 0;
1019 9         31 $self->[_here_doc_target_] = EMPTY_STRING;
1020 9         31 $self->[_here_quote_character_] = EMPTY_STRING;
1021             }
1022             }
1023              
1024             # check for error of extra whitespace
1025             # note for PERL6: leading whitespace is allowed
1026             else {
1027 15         147 $candidate_target =~ s/\s*$//;
1028 15         72 $candidate_target =~ s/^\s*//;
1029 15 50       64 if ( $candidate_target eq $here_doc_target ) {
1030 0         0 $self->[_nearly_matched_here_target_at_] = $input_line_number;
1031             }
1032             }
1033 24         103 return $line_of_tokens;
1034             }
1035              
1036             # Print line unchanged if we are in a format section
1037             elsif ( $self->[_in_format_] ) {
1038              
1039 3 100       15 if ( $input_line =~ /^\.[\s#]*$/ ) {
1040              
1041             # Decrement format depth count at a '.' after a 'format'
1042 1         5 $self->[_in_format_]--;
1043              
1044             # This is the end when count reaches 0
1045 1 50       5 if ( !$self->[_in_format_] ) {
1046 1         4 $self->log_numbered_msg("Exiting format section\n");
1047 1         3 $line_of_tokens->{_line_type} = 'FORMAT_END';
1048              
1049             # Make the tokenizer mark an opening brace which follows
1050             # as a code block. Fixes issue c202/t032.
1051 1         3 $last_nonblank_token = ';';
1052 1         3 $last_nonblank_type = ';';
1053             }
1054             }
1055             else {
1056 2         4 $line_of_tokens->{_line_type} = 'FORMAT';
1057 2 50       9 if ( $input_line =~ /^\s*format\s+\w+/ ) {
1058              
1059             # Increment format depth count at a 'format' within a 'format'
1060             # This is a simple way to handle nested formats (issue c019).
1061 0         0 $self->[_in_format_]++;
1062             }
1063             }
1064 3         11 return $line_of_tokens;
1065             }
1066              
1067             # must print line unchanged if we are in pod documentation
1068             elsif ( $self->[_in_pod_] ) {
1069              
1070 44         117 $line_of_tokens->{_line_type} = 'POD';
1071 44 100       267 if ( $input_line =~ /^=cut/ ) {
1072 19         61 $line_of_tokens->{_line_type} = 'POD_END';
1073 19         68 $self->log_numbered_msg("Exiting POD section\n");
1074 19         51 $self->[_in_pod_] = 0;
1075             }
1076 44 50 33     144 if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
1077 0         0 $self->warning(
1078             "Hash-bang in pod can cause older versions of perl to fail! \n"
1079             );
1080             }
1081              
1082 44         152 return $line_of_tokens;
1083             }
1084              
1085             # print line unchanged if in skipped section
1086             elsif ( $self->[_in_skipped_] ) {
1087              
1088 8         27 $line_of_tokens->{_line_type} = 'SKIP';
1089 8 100       102 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
    50          
1090 2         10 $line_of_tokens->{_line_type} = 'SKIP_END';
1091 2         10 $self->log_numbered_msg("Exiting code-skipping section\n");
1092 2         10 $self->[_in_skipped_] = 0;
1093             }
1094             elsif ( $input_line =~ /$code_skipping_pattern_begin/ ) {
1095              
1096             # warn of duplicate starting comment lines, git #118
1097 0         0 my $lno = $self->[_in_skipped_];
1098 0         0 $self->warning(
1099             "Already in code-skipping section which started at line $lno\n"
1100             );
1101             }
1102 8         32 return $line_of_tokens;
1103             }
1104              
1105             # must print line unchanged if we have seen a severe error (i.e., we
1106             # are seeing illegal tokens and cannot continue. Syntax errors do
1107             # not pass this route). Calling routine can decide what to do, but
1108             # the default can be to just pass all lines as if they were after __END__
1109             elsif ( $self->[_in_error_] ) {
1110 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1111 0         0 return $line_of_tokens;
1112             }
1113              
1114             # print line unchanged if we are __DATA__ section
1115             elsif ( $self->[_in_data_] ) {
1116              
1117             # ...but look for POD
1118             # Note that the _in_data and _in_end flags remain set
1119             # so that we return to that state after seeing the
1120             # end of a pod section
1121 1 50 33     10 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1122 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1123 0         0 $self->log_numbered_msg("Entering POD section\n");
1124 0         0 $self->[_in_pod_] = 1;
1125 0         0 return $line_of_tokens;
1126             }
1127             else {
1128 1         3 $line_of_tokens->{_line_type} = 'DATA';
1129 1         5 return $line_of_tokens;
1130             }
1131             }
1132              
1133             # print line unchanged if we are in __END__ section
1134             elsif ( $self->[_in_end_] ) {
1135              
1136             # ...but look for POD
1137             # Note that the _in_data and _in_end flags remain set
1138             # so that we return to that state after seeing the
1139             # end of a pod section
1140 48 100 66     273 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1141 6         29 $line_of_tokens->{_line_type} = 'POD_START';
1142 6         23 $self->log_numbered_msg("Entering POD section\n");
1143 6         42 $self->[_in_pod_] = 1;
1144 6         30 return $line_of_tokens;
1145             }
1146             else {
1147 42         85 $line_of_tokens->{_line_type} = 'END';
1148 42         132 return $line_of_tokens;
1149             }
1150             }
1151              
1152             # check for a hash-bang line if we haven't seen one
1153 7521 100       16727 if ( !$self->[_saw_hash_bang_] ) {
1154 6973 100       20727 if ( $input_line =~ /^\#\!.*perl\b/ ) {
1155 15         55 $self->[_saw_hash_bang_] = $input_line_number;
1156              
1157             # check for -w and -P flags
1158 15 50       86 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1159 0         0 $self->[_saw_perl_dash_P_] = 1;
1160             }
1161              
1162 15 100       94 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1163 8         27 $self->[_saw_perl_dash_w_] = 1;
1164             }
1165              
1166 15 100 33     123 if (
      66        
      100        
      66        
1167             $input_line_number > 1
1168              
1169             # leave any hash bang in a BEGIN block alone
1170             # i.e. see 'debugger-duck_type.t'
1171             && !(
1172             $last_nonblank_block_type
1173             && $last_nonblank_block_type eq 'BEGIN'
1174             )
1175             && !$self->[_look_for_hash_bang_]
1176              
1177             # Try to avoid giving a false alarm at a simple comment.
1178             # These look like valid hash-bang lines:
1179              
1180             #!/usr/bin/perl -w
1181             #! /usr/bin/perl -w
1182             #!c:\perl\bin\perl.exe
1183              
1184             # These are comments:
1185             #! I love perl
1186             #! sunos does not yet provide a /usr/bin/perl
1187              
1188             # Comments typically have multiple spaces, which suggests
1189             # the filter
1190             && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1191             )
1192             {
1193              
1194             # this is helpful for VMS systems; we may have accidentally
1195             # tokenized some DCL commands
1196 1 50       4 if ( $self->[_started_tokenizing_] ) {
1197 0         0 $self->warning(
1198             "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1199             );
1200             }
1201             else {
1202 1         6 $self->complain("Useless hash-bang after line 1\n");
1203             }
1204             }
1205              
1206             # Report the leading hash-bang as a system line
1207             # This will prevent -dac from deleting it
1208             else {
1209 14         50 $line_of_tokens->{_line_type} = 'SYSTEM';
1210 14         131 return $line_of_tokens;
1211             }
1212             }
1213             }
1214              
1215             # wait for a hash-bang before parsing if the user invoked us with -x
1216 7507 100 100     18409 if ( $self->[_look_for_hash_bang_]
1217             && !$self->[_saw_hash_bang_] )
1218             {
1219 5         9 $line_of_tokens->{_line_type} = 'SYSTEM';
1220 5         17 return $line_of_tokens;
1221             }
1222              
1223             # a first line of the form ': #' will be marked as SYSTEM
1224             # since lines of this form may be used by tcsh
1225 7502 50 66     19912 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1226 0         0 $line_of_tokens->{_line_type} = 'SYSTEM';
1227 0         0 return $line_of_tokens;
1228             }
1229              
1230             # now we know that it is ok to tokenize the line...
1231             # the line tokenizer will modify any of these private variables:
1232             # _rhere_target_list_
1233             # _in_data_
1234             # _in_end_
1235             # _in_format_
1236             # _in_error_
1237             # _in_skipped_
1238             # _in_pod_
1239             # _in_quote_
1240 7502         24455 $self->tokenize_this_line($line_of_tokens);
1241              
1242             # Now finish defining the return structure and return it
1243 7502         15635 $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1244              
1245             # handle severe error (binary data in script)
1246 7502 50       17287 if ( $self->[_in_error_] ) {
1247 0         0 $self->[_in_quote_] = 0; # to avoid any more messages
1248 0         0 $self->warning("Giving up after error\n");
1249 0         0 $line_of_tokens->{_line_type} = 'ERROR';
1250 0         0 reset_indentation_level(0); # avoid error messages
1251 0         0 return $line_of_tokens;
1252             }
1253              
1254             # handle start of pod documentation
1255 7502 100       15698 if ( $self->[_in_pod_] ) {
1256              
1257             # This gets tricky..above a __DATA__ or __END__ section, perl
1258             # accepts '=cut' as the start of pod section. But afterwards,
1259             # only pod utilities see it and they may ignore an =cut without
1260             # leading =head. In any case, this isn't good.
1261 13 50       58 if ( $input_line =~ /^=cut\b/ ) {
1262 0 0 0     0 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1263 0         0 $self->complain("=cut while not in pod ignored\n");
1264 0         0 $self->[_in_pod_] = 0;
1265 0         0 $line_of_tokens->{_line_type} = 'POD_END';
1266             }
1267             else {
1268 0         0 $line_of_tokens->{_line_type} = 'POD_START';
1269 0         0 $self->warning(
1270             "=cut starts a pod section .. this can fool pod utilities.\n"
1271             ) unless (DEVEL_MODE);
1272 0         0 $self->log_numbered_msg("Entering POD section\n");
1273             }
1274             }
1275              
1276             else {
1277 13         38 $line_of_tokens->{_line_type} = 'POD_START';
1278 13         58 $self->log_numbered_msg("Entering POD section\n");
1279             }
1280              
1281 13         62 return $line_of_tokens;
1282             }
1283              
1284             # handle start of skipped section
1285 7489 100       15155 if ( $self->[_in_skipped_] ) {
1286              
1287 2         6 $line_of_tokens->{_line_type} = 'SKIP';
1288 2         12 $self->log_numbered_msg("Entering code-skipping section\n");
1289 2         8 return $line_of_tokens;
1290             }
1291              
1292             # see if this line contains here doc targets
1293 7487         11942 my $rhere_target_list = $self->[_rhere_target_list_];
1294 7487 100       10474 if ( @{$rhere_target_list} ) {
  7487         17069  
1295              
1296             my ( $here_doc_target, $here_quote_character ) =
1297 9         39 @{ shift @{$rhere_target_list} };
  9         24  
  9         43  
1298 9         38 $self->[_in_here_doc_] = 1;
1299 9         36 $self->[_here_doc_target_] = $here_doc_target;
1300 9         30 $self->[_here_quote_character_] = $here_quote_character;
1301 9         70 $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1302 9         36 $self->[_started_looking_for_here_target_at_] = $input_line_number;
1303             }
1304              
1305             # NOTE: __END__ and __DATA__ statements are written unformatted
1306             # because they can theoretically contain additional characters
1307             # which are not tokenized (and cannot be read with <DATA> either!).
1308 7487 100       20623 if ( $self->[_in_data_] ) {
    100          
1309 1         3 $line_of_tokens->{_line_type} = 'DATA_START';
1310 1         6 $self->log_numbered_msg("Starting __DATA__ section\n");
1311 1         3 $self->[_saw_data_] = 1;
1312              
1313             # keep parsing after __DATA__ if use SelfLoader was seen
1314 1 50       3 if ( $self->[_saw_selfloader_] ) {
1315 0         0 $self->[_in_data_] = 0;
1316 0         0 $self->log_numbered_msg(
1317             "SelfLoader seen, continuing; -nlsl deactivates\n");
1318             }
1319              
1320 1         6 return $line_of_tokens;
1321             }
1322              
1323             elsif ( $self->[_in_end_] ) {
1324 6         31 $line_of_tokens->{_line_type} = 'END_START';
1325 6         32 $self->log_numbered_msg("Starting __END__ section\n");
1326 6         15 $self->[_saw_end_] = 1;
1327              
1328             # keep parsing after __END__ if use AutoLoader was seen
1329 6 50       23 if ( $self->[_saw_autoloader_] ) {
1330 0         0 $self->[_in_end_] = 0;
1331 0         0 $self->log_numbered_msg(
1332             "AutoLoader seen, continuing; -nlal deactivates\n");
1333             }
1334 6         29 return $line_of_tokens;
1335             }
1336              
1337             # now, finally, we know that this line is type 'CODE'
1338 7480         13980 $line_of_tokens->{_line_type} = 'CODE';
1339              
1340             # remember if we have seen any real code
1341 7480 100 100     23649 if ( !$self->[_started_tokenizing_]
      100        
1342             && $input_line !~ /^\s*$/
1343             && $input_line !~ /^\s*#/ )
1344             {
1345 553         2078 $self->[_started_tokenizing_] = 1;
1346             }
1347              
1348 7480 100       16071 if ( $self->[_debugger_object_] ) {
1349 7         33 $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1350             }
1351              
1352             # Note: if keyword 'format' occurs in this line code, it is still CODE
1353             # (keyword 'format' need not start a line)
1354 7480 100       15466 if ( $self->[_in_format_] ) {
1355 1         7 $self->log_numbered_msg("Entering format section\n");
1356             }
1357              
1358 7480 100 100     29097 if ( $self->[_in_quote_]
    100 100        
1359             and ( $self->[_line_start_quote_] < 0 ) )
1360             {
1361              
1362             #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1363 49 100       413 if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1364 48         132 $self->[_line_start_quote_] = $input_line_number;
1365 48         328 $self->log_numbered_msg(
1366             "Start multi-line quote or pattern ending in $quote_target\n");
1367             }
1368             }
1369             elsif ( ( $self->[_line_start_quote_] >= 0 )
1370             && !$self->[_in_quote_] )
1371             {
1372 48         209 $self->[_line_start_quote_] = -1;
1373 48         252 $self->log_numbered_msg("End of multi-line quote or pattern\n");
1374             }
1375              
1376             # we are returning a line of CODE
1377 7480         29692 return $line_of_tokens;
1378             } ## end sub get_line
1379              
1380             sub find_starting_indentation_level {
1381              
1382             # We need to find the indentation level of the first line of the
1383             # script being formatted. Often it will be zero for an entire file,
1384             # but if we are formatting a local block of code (within an editor for
1385             # example) it may not be zero. The user may specify this with the
1386             # -sil=n parameter but normally doesn't so we have to guess.
1387             #
1388 556     556 0 1740 my ($self) = @_;
1389 556         1355 my $starting_level = 0;
1390              
1391             # use value if given as parameter
1392 556 100       2861 if ( $self->[_know_starting_level_] ) {
    100          
1393 1         2 $starting_level = $self->[_starting_level_];
1394             }
1395              
1396             # if we know there is a hash_bang line, the level must be zero
1397             elsif ( $self->[_look_for_hash_bang_] ) {
1398 1         3 $self->[_know_starting_level_] = 1;
1399             }
1400              
1401             # otherwise figure it out from the input file
1402             else {
1403 554         1189 my $line;
1404 554         1171 my $i = 0;
1405              
1406             # keep looking at lines until we find a hash bang or piece of code
1407 554         1258 my $msg = EMPTY_STRING;
1408 554         3560 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
1409              
1410             # if first line is #! then assume starting level is zero
1411 866 100 100     4973 if ( $i == 1 && $line =~ /^\#\!/ ) {
1412 13         51 $starting_level = 0;
1413 13         33 last;
1414             }
1415 853 100       4654 next if ( $line =~ /^\s*#/ ); # skip past comments
1416 557 100       3847 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1417 539         2658 $starting_level = $self->guess_old_indentation_level($line);
1418 539         1369 last;
1419             }
1420 554         2620 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1421 554         2980 $self->write_logfile_entry("$msg");
1422             }
1423 556         2236 $self->[_starting_level_] = $starting_level;
1424 556         3524 reset_indentation_level($starting_level);
1425 556         1186 return;
1426             } ## end sub find_starting_indentation_level
1427              
1428             sub guess_old_indentation_level {
1429 539     539 0 1870 my ( $self, $line ) = @_;
1430              
1431             # Guess the indentation level of an input line.
1432             #
1433             # For the first line of code this result will define the starting
1434             # indentation level. It will mainly be non-zero when perltidy is applied
1435             # within an editor to a local block of code.
1436             #
1437             # This is an impossible task in general because we can't know what tabs
1438             # meant for the old script and how many spaces were used for one
1439             # indentation level in the given input script. For example it may have
1440             # been previously formatted with -i=7 -et=3. But we can at least try to
1441             # make sure that perltidy guesses correctly if it is applied repeatedly to
1442             # a block of code within an editor, so that the block stays at the same
1443             # level when perltidy is applied repeatedly.
1444             #
1445             # USES GLOBAL VARIABLES: (none)
1446 539         1257 my $level = 0;
1447              
1448             # find leading tabs, spaces, and any statement label
1449 539         1146 my $spaces = 0;
1450 539 50       4304 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1451              
1452             # If there are leading tabs, we use the tab scheme for this run, if
1453             # any, so that the code will remain stable when editing.
1454 539 100       2602 if ($1) { $spaces += length($1) * $self->[_tabsize_] }
  2         10  
1455              
1456 539 100       2040 if ($2) { $spaces += length($2) }
  77         292  
1457              
1458             # correct for outdented labels
1459 539 50 66     2419 if ( $3 && $self->[_outdent_labels_] ) {
1460 1         6 $spaces += $self->[_continuation_indentation_];
1461             }
1462             }
1463              
1464             # compute indentation using the value of -i for this run.
1465             # If -i=0 is used for this run (which is possible) it doesn't matter
1466             # what we do here but we'll guess that the old run used 4 spaces per level.
1467 539         1651 my $indent_columns = $self->[_indent_columns_];
1468 539 50       1730 $indent_columns = 4 if ( !$indent_columns );
1469 539         2206 $level = int( $spaces / $indent_columns );
1470 539         1449 return ($level);
1471             } ## end sub guess_old_indentation_level
1472              
1473             # This is a currently unused debug routine
1474             sub dump_functions {
1475              
1476 0     0 0 0 my $fh = *STDOUT;
1477 0         0 foreach my $pkg ( keys %{$ris_user_function} ) {
  0         0  
1478 0         0 $fh->print("\nnon-constant subs in package $pkg\n");
1479              
1480 0         0 foreach my $sub ( keys %{ $ris_user_function->{$pkg} } ) {
  0         0  
1481 0         0 my $msg = EMPTY_STRING;
1482 0 0       0 if ( $ris_block_list_function->{$pkg}{$sub} ) {
1483 0         0 $msg = 'block_list';
1484             }
1485              
1486 0 0       0 if ( $ris_block_function->{$pkg}{$sub} ) {
1487 0         0 $msg = 'block';
1488             }
1489 0         0 $fh->print("$sub $msg\n");
1490             }
1491             }
1492              
1493 0         0 foreach my $pkg ( keys %{$ris_constant} ) {
  0         0  
1494 0         0 $fh->print("\nconstants and constant subs in package $pkg\n");
1495              
1496 0         0 foreach my $sub ( keys %{ $ris_constant->{$pkg} } ) {
  0         0  
1497 0         0 $fh->print("$sub\n");
1498             }
1499             }
1500 0         0 return;
1501             } ## end sub dump_functions
1502              
1503             sub prepare_for_a_new_file {
1504              
1505 556     556 0 1359 my $self = shift;
1506              
1507             # previous tokens needed to determine what to expect next
1508 556         1662 $last_nonblank_token = ';'; # the only possible starting state which
1509 556         1438 $last_nonblank_type = ';'; # will make a leading brace a code block
1510 556         1353 $last_nonblank_block_type = EMPTY_STRING;
1511              
1512             # scalars for remembering statement types across multiple lines
1513 556         1409 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1514              
1515             # scalars for remembering where we are in the file
1516 556         1343 $current_package = "main";
1517 556         1212 $context = UNKNOWN_CONTEXT;
1518              
1519             # hashes used to remember function information
1520 556         1915 $ris_constant = {}; # user-defined constants
1521 556         1613 $ris_user_function = {}; # user-defined functions
1522 556         1550 $ruser_function_prototype = {}; # their prototypes
1523 556         1706 $ris_block_function = {};
1524 556         1687 $ris_block_list_function = {};
1525 556         1405 $rsaw_function_definition = {};
1526 556         1418 $rsaw_use_module = {};
1527              
1528             # variables used to track depths of various containers
1529             # and report nesting errors
1530 556         1283 $paren_depth = 0;
1531 556         1124 $brace_depth = 0;
1532 556         1033 $square_bracket_depth = 0;
1533 556         2434 $rcurrent_depth = [ (0) x scalar @closing_brace_names ];
1534 556         1179 $total_depth = 0;
1535 556         2172 $rtotal_depth = [];
1536 556         1942 $rcurrent_sequence_number = [];
1537 556         1347 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1538              
1539 556         1973 $rparen_type = [];
1540 556         1635 $rparen_semicolon_count = [];
1541 556         1847 $rparen_structural_type = [];
1542 556         1744 $rbrace_type = [];
1543 556         1706 $rbrace_structural_type = [];
1544 556         1462 $rbrace_context = [];
1545 556         1623 $rbrace_package = [];
1546 556         1491 $rsquare_bracket_type = [];
1547 556         1430 $rsquare_bracket_structural_type = [];
1548 556         3467 $rdepth_array = [];
1549 556         1200 $rnested_ternary_flag = [];
1550 556         3879 $rnested_statement_type = [];
1551 556         3130 $rstarting_line_of_current_depth = [];
1552              
1553 556         1744 $rparen_type->[$paren_depth] = EMPTY_STRING;
1554 556         1388 $rparen_semicolon_count->[$paren_depth] = 0;
1555 556         1379 $rparen_structural_type->[$brace_depth] = EMPTY_STRING;
1556 556         1400 $rbrace_type->[$brace_depth] = ';'; # identify opening brace as code block
1557 556         1360 $rbrace_structural_type->[$brace_depth] = EMPTY_STRING;
1558 556         1270 $rbrace_context->[$brace_depth] = UNKNOWN_CONTEXT;
1559 556         1413 $rbrace_package->[$paren_depth] = $current_package;
1560 556         1426 $rsquare_bracket_type->[$square_bracket_depth] = EMPTY_STRING;
1561 556         1296 $rsquare_bracket_structural_type->[$square_bracket_depth] = EMPTY_STRING;
1562              
1563 556         2790 initialize_tokenizer_state();
1564 556         1158 return;
1565             } ## end sub prepare_for_a_new_file
1566              
1567             { ## closure for sub tokenize_this_line
1568              
1569 38     38   375 use constant BRACE => 0;
  38         88  
  38         2682  
1570 38     38   293 use constant SQUARE_BRACKET => 1;
  38         81  
  38         2187  
1571 38     38   352 use constant PAREN => 2;
  38         132  
  38         2409  
1572 38     38   307 use constant QUESTION_COLON => 3;
  38         127  
  38         87603  
1573              
1574             # TV1: scalars for processing one LINE.
1575             # Re-initialized on each entry to sub tokenize_this_line.
1576             my (
1577             $block_type, $container_type, $expecting,
1578             $i, $i_tok, $input_line,
1579             $input_line_number, $last_nonblank_i, $max_token_index,
1580             $next_tok, $next_type, $peeked_ahead,
1581             $prototype, $rhere_target_list, $rtoken_map,
1582             $rtoken_type, $rtokens, $tok,
1583             $type, $type_sequence, $indent_flag,
1584             );
1585              
1586             # TV2: refs to ARRAYS for processing one LINE
1587             # Re-initialized on each call.
1588             my $routput_token_list = []; # stack of output token indexes
1589             my $routput_token_type = []; # token types
1590             my $routput_block_type = []; # types of code block
1591             my $routput_container_type = []; # paren types, such as if, elsif, ..
1592             my $routput_type_sequence = []; # nesting sequential number
1593             my $routput_indent_flag = []; #
1594              
1595             # TV3: SCALARS for quote variables. These are initialized with a
1596             # subroutine call and continually updated as lines are processed.
1597             my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1598             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1599              
1600             # TV4: SCALARS for multi-line identifiers and
1601             # statements. These are initialized with a subroutine call
1602             # and continually updated as lines are processed.
1603             my ( $id_scan_state, $identifier, $want_paren );
1604              
1605             # TV5: SCALARS for tracking indentation level.
1606             # Initialized once and continually updated as lines are
1607             # processed.
1608             my (
1609             $nesting_token_string, $nesting_type_string,
1610             $nesting_block_string, $nesting_block_flag,
1611             $nesting_list_string, $nesting_list_flag,
1612             $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1613             $in_statement_continuation, $level_in_tokenizer,
1614             $slevel_in_tokenizer, $rslevel_stack,
1615             );
1616              
1617             # TV6: SCALARS for remembering several previous
1618             # tokens. Initialized once and continually updated as
1619             # lines are processed.
1620             my (
1621             $last_nonblank_container_type, $last_nonblank_type_sequence,
1622             $last_last_nonblank_token, $last_last_nonblank_type,
1623             $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1624             $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1625             );
1626              
1627             # ----------------------------------------------------------------
1628             # beginning of tokenizer variable access and manipulation routines
1629             # ----------------------------------------------------------------
1630              
1631             sub initialize_tokenizer_state {
1632              
1633             # GV1: initialized once
1634             # TV1: initialized on each call
1635             # TV2: initialized on each call
1636             # TV3:
1637 556     556 0 1264 $in_quote = 0;
1638 556         1335 $quote_type = 'Q';
1639 556         1140 $quote_character = EMPTY_STRING;
1640 556         1101 $quote_pos = 0;
1641 556         1072 $quote_depth = 0;
1642 556         1163 $quoted_string_1 = EMPTY_STRING;
1643 556         1144 $quoted_string_2 = EMPTY_STRING;
1644 556         1171 $allowed_quote_modifiers = EMPTY_STRING;
1645              
1646             # TV4:
1647 556         1065 $id_scan_state = EMPTY_STRING;
1648 556         1196 $identifier = EMPTY_STRING;
1649 556         1299 $want_paren = EMPTY_STRING;
1650              
1651             # TV5:
1652 556         1303 $nesting_token_string = EMPTY_STRING;
1653 556         1199 $nesting_type_string = EMPTY_STRING;
1654 556         1219 $nesting_block_string = '1'; # initially in a block
1655 556         1008 $nesting_block_flag = 1;
1656 556         1094 $nesting_list_string = '0'; # initially not in a list
1657 556         1058 $nesting_list_flag = 0; # initially not in a list
1658 556         1098 $ci_string_in_tokenizer = EMPTY_STRING;
1659 556         1164 $continuation_string_in_tokenizer = "0";
1660 556         1039 $in_statement_continuation = 0;
1661 556         1087 $level_in_tokenizer = 0;
1662 556         1082 $slevel_in_tokenizer = 0;
1663 556         1676 $rslevel_stack = [];
1664              
1665             # TV6:
1666 556         1285 $last_nonblank_container_type = EMPTY_STRING;
1667 556         1281 $last_nonblank_type_sequence = EMPTY_STRING;
1668 556         1342 $last_last_nonblank_token = ';';
1669 556         1194 $last_last_nonblank_type = ';';
1670 556         1154 $last_last_nonblank_block_type = EMPTY_STRING;
1671 556         1087 $last_last_nonblank_container_type = EMPTY_STRING;
1672 556         1093 $last_last_nonblank_type_sequence = EMPTY_STRING;
1673 556         1045 $last_nonblank_prototype = EMPTY_STRING;
1674 556         1106 return;
1675             } ## end sub initialize_tokenizer_state
1676              
1677             sub save_tokenizer_state {
1678              
1679             # Global variables:
1680 0     0 0 0 my $rGV1 = [
1681             $brace_depth,
1682             $context,
1683             $current_package,
1684             $last_nonblank_block_type,
1685             $last_nonblank_token,
1686             $last_nonblank_type,
1687             $next_sequence_number,
1688             $paren_depth,
1689             $rbrace_context,
1690             $rbrace_package,
1691             $rbrace_structural_type,
1692             $rbrace_type,
1693             $rcurrent_depth,
1694             $rcurrent_sequence_number,
1695             $rdepth_array,
1696             $ris_block_function,
1697             $ris_block_list_function,
1698             $ris_constant,
1699             $ris_user_function,
1700             $rnested_statement_type,
1701             $rnested_ternary_flag,
1702             $rparen_semicolon_count,
1703             $rparen_structural_type,
1704             $rparen_type,
1705             $rsaw_function_definition,
1706             $rsaw_use_module,
1707             $rsquare_bracket_structural_type,
1708             $rsquare_bracket_type,
1709             $rstarting_line_of_current_depth,
1710             $rtotal_depth,
1711             $ruser_function_prototype,
1712             $square_bracket_depth,
1713             $statement_type,
1714             $total_depth,
1715              
1716             ];
1717              
1718             # Tokenizer closure variables:
1719 0         0 my $rTV1 = [
1720             $block_type, $container_type, $expecting,
1721             $i, $i_tok, $input_line,
1722             $input_line_number, $last_nonblank_i, $max_token_index,
1723             $next_tok, $next_type, $peeked_ahead,
1724             $prototype, $rhere_target_list, $rtoken_map,
1725             $rtoken_type, $rtokens, $tok,
1726             $type, $type_sequence, $indent_flag,
1727             ];
1728              
1729 0         0 my $rTV2 = [
1730             $routput_token_list, $routput_token_type,
1731             $routput_block_type, $routput_container_type,
1732             $routput_type_sequence, $routput_indent_flag,
1733             ];
1734              
1735 0         0 my $rTV3 = [
1736             $in_quote, $quote_type,
1737             $quote_character, $quote_pos,
1738             $quote_depth, $quoted_string_1,
1739             $quoted_string_2, $allowed_quote_modifiers,
1740             ];
1741              
1742 0         0 my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1743              
1744 0         0 my $rTV5 = [
1745             $nesting_token_string, $nesting_type_string,
1746             $nesting_block_string, $nesting_block_flag,
1747             $nesting_list_string, $nesting_list_flag,
1748             $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1749             $in_statement_continuation, $level_in_tokenizer,
1750             $slevel_in_tokenizer, $rslevel_stack,
1751             ];
1752              
1753 0         0 my $rTV6 = [
1754             $last_nonblank_container_type,
1755             $last_nonblank_type_sequence,
1756             $last_last_nonblank_token,
1757             $last_last_nonblank_type,
1758             $last_last_nonblank_block_type,
1759             $last_last_nonblank_container_type,
1760             $last_last_nonblank_type_sequence,
1761             $last_nonblank_prototype,
1762             ];
1763 0         0 return [ $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1764             } ## end sub save_tokenizer_state
1765              
1766             sub restore_tokenizer_state {
1767 0     0 0 0 my ($rstate) = @_;
1768 0         0 my ( $rGV1, $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
  0         0  
1769              
1770             (
1771             $brace_depth,
1772             $context,
1773             $current_package,
1774             $last_nonblank_block_type,
1775             $last_nonblank_token,
1776             $last_nonblank_type,
1777             $next_sequence_number,
1778             $paren_depth,
1779             $rbrace_context,
1780             $rbrace_package,
1781             $rbrace_structural_type,
1782             $rbrace_type,
1783             $rcurrent_depth,
1784             $rcurrent_sequence_number,
1785             $rdepth_array,
1786             $ris_block_function,
1787             $ris_block_list_function,
1788             $ris_constant,
1789             $ris_user_function,
1790             $rnested_statement_type,
1791             $rnested_ternary_flag,
1792             $rparen_semicolon_count,
1793             $rparen_structural_type,
1794             $rparen_type,
1795             $rsaw_function_definition,
1796             $rsaw_use_module,
1797             $rsquare_bracket_structural_type,
1798             $rsquare_bracket_type,
1799             $rstarting_line_of_current_depth,
1800             $rtotal_depth,
1801             $ruser_function_prototype,
1802             $square_bracket_depth,
1803             $statement_type,
1804             $total_depth,
1805              
1806 0         0 ) = @{$rGV1};
  0         0  
1807              
1808             (
1809             $block_type, $container_type, $expecting,
1810             $i, $i_tok, $input_line,
1811             $input_line_number, $last_nonblank_i, $max_token_index,
1812             $next_tok, $next_type, $peeked_ahead,
1813             $prototype, $rhere_target_list, $rtoken_map,
1814             $rtoken_type, $rtokens, $tok,
1815             $type, $type_sequence, $indent_flag,
1816 0         0 ) = @{$rTV1};
  0         0  
1817              
1818             (
1819             $routput_token_list, $routput_token_type,
1820             $routput_block_type, $routput_container_type,
1821             $routput_type_sequence, $routput_indent_flag,
1822 0         0 ) = @{$rTV2};
  0         0  
1823              
1824             (
1825             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1826             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1827 0         0 ) = @{$rTV3};
  0         0  
1828              
1829 0         0 ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
  0         0  
1830              
1831             (
1832             $nesting_token_string, $nesting_type_string,
1833             $nesting_block_string, $nesting_block_flag,
1834             $nesting_list_string, $nesting_list_flag,
1835             $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1836             $in_statement_continuation, $level_in_tokenizer,
1837             $slevel_in_tokenizer, $rslevel_stack,
1838 0         0 ) = @{$rTV5};
  0         0  
1839              
1840             (
1841             $last_nonblank_container_type,
1842             $last_nonblank_type_sequence,
1843             $last_last_nonblank_token,
1844             $last_last_nonblank_type,
1845             $last_last_nonblank_block_type,
1846             $last_last_nonblank_container_type,
1847             $last_last_nonblank_type_sequence,
1848             $last_nonblank_prototype,
1849 0         0 ) = @{$rTV6};
  0         0  
1850 0         0 return;
1851             } ## end sub restore_tokenizer_state
1852              
1853             sub split_pretoken {
1854              
1855 8     8 0 18 my ( $self, $numc ) = @_;
1856              
1857             # Split the leading $numc characters from the current token (at index=$i)
1858             # which is pre-type 'w' and insert the remainder back into the pretoken
1859             # stream with appropriate settings. Since we are splitting a pre-type 'w',
1860             # there are three cases, depending on if the remainder starts with a digit:
1861             # Case 1: remainder is type 'd', all digits
1862             # Case 2: remainder is type 'd' and type 'w': digits and other characters
1863             # Case 3: remainder is type 'w'
1864              
1865             # Examples, for $numc=1:
1866             # $tok => $tok_0 $tok_1 $tok_2
1867             # 'x10' => 'x' '10' # case 1
1868             # 'x10if' => 'x' '10' 'if' # case 2
1869             # '0ne => 'O' 'ne' # case 3
1870              
1871             # where:
1872             # $tok_1 is a possible string of digits (pre-type 'd')
1873             # $tok_2 is a possible word (pre-type 'w')
1874              
1875             # return 1 if successful
1876             # return undef if error (shouldn't happen)
1877              
1878             # Calling routine should update '$type' and '$tok' if successful.
1879              
1880 8         21 my $pretoken = $rtokens->[$i];
1881 8 50 33     75 if ( $pretoken
      33        
1882             && length($pretoken) > $numc
1883             && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1884             {
1885              
1886             # Split $tok into up to 3 tokens:
1887 8         22 my $tok_0 = substr( $pretoken, 0, $numc );
1888 8 50       29 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1889 8 50       27 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1890              
1891 8         17 my $len_0 = length($tok_0);
1892 8         14 my $len_1 = length($tok_1);
1893 8         13 my $len_2 = length($tok_2);
1894              
1895 8         14 my $pre_type_0 = 'w';
1896 8         14 my $pre_type_1 = 'd';
1897 8         12 my $pre_type_2 = 'w';
1898              
1899 8         16 my $pos_0 = $rtoken_map->[$i];
1900 8         25 my $pos_1 = $pos_0 + $len_0;
1901 8         19 my $pos_2 = $pos_1 + $len_1;
1902              
1903 8         15 my $isplice = $i + 1;
1904              
1905             # Splice in any digits
1906 8 100       20 if ($len_1) {
1907 5         10 splice @{$rtoken_map}, $isplice, 0, $pos_1;
  5         14  
1908 5         10 splice @{$rtokens}, $isplice, 0, $tok_1;
  5         13  
1909 5         11 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
  5         10  
1910 5         10 $max_token_index++;
1911 5         8 $isplice++;
1912             }
1913              
1914             # Splice in any trailing word
1915 8 100       22 if ($len_2) {
1916 4         7 splice @{$rtoken_map}, $isplice, 0, $pos_2;
  4         11  
1917 4         21 splice @{$rtokens}, $isplice, 0, $tok_2;
  4         9  
1918 4         8 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
  4         8  
1919 4         6 $max_token_index++;
1920             }
1921              
1922 8         18 $rtokens->[$i] = $tok_0;
1923 8         31 return 1;
1924             }
1925             else {
1926              
1927             # Shouldn't get here
1928 0         0 if (DEVEL_MODE) {
1929             $self->Fault(<<EOM);
1930             While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1931             EOM
1932             }
1933             }
1934 0         0 return;
1935             } ## end sub split_pretoken
1936              
1937             sub get_indentation_level {
1938 556     556 0 1528 return $level_in_tokenizer;
1939             }
1940              
1941             sub reset_indentation_level {
1942 556     556 0 1553 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1943 556         1062 push @{$rslevel_stack}, $slevel_in_tokenizer;
  556         1478  
1944 556         1056 return;
1945             }
1946              
1947             sub peeked_ahead {
1948 232     232 0 429 my $flag = shift;
1949 232 100       588 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1950 232         534 return $peeked_ahead;
1951             }
1952              
1953             # ------------------------------------------------------------
1954             # end of tokenizer variable access and manipulation routines
1955             # ------------------------------------------------------------
1956              
1957             #------------------------------
1958             # beginning of tokenizer hashes
1959             #------------------------------
1960              
1961             my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1962              
1963             # These block types terminate statements and do not need a trailing
1964             # semicolon
1965             # patched for SWITCH/CASE/
1966             my %is_zero_continuation_block_type;
1967             my @q;
1968             @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1969             if elsif else unless while until for foreach switch case given when);
1970             @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1971              
1972             my %is_logical_container;
1973             @q = qw(if elsif unless while and or err not && ! || for foreach);
1974             @is_logical_container{@q} = (1) x scalar(@q);
1975              
1976             my %is_binary_type;
1977             @q = qw(|| &&);
1978             @is_binary_type{@q} = (1) x scalar(@q);
1979              
1980             my %is_binary_keyword;
1981             @q = qw(and or err eq ne cmp);
1982             @is_binary_keyword{@q} = (1) x scalar(@q);
1983              
1984             # 'L' is token for opening { at hash key
1985             my %is_opening_type;
1986             @q = qw< L { ( [ >;
1987             @is_opening_type{@q} = (1) x scalar(@q);
1988              
1989             my %is_opening_or_ternary_type;
1990             push @q, '?';
1991             @is_opening_or_ternary_type{@q} = (1) x scalar(@q);
1992              
1993             # 'R' is token for closing } at hash key
1994             my %is_closing_type;
1995             @q = qw< R } ) ] >;
1996             @is_closing_type{@q} = (1) x scalar(@q);
1997              
1998             my %is_closing_or_ternary_type;
1999             push @q, ':';
2000             @is_closing_or_ternary_type{@q} = (1) x scalar(@q);
2001              
2002             my %is_redo_last_next_goto;
2003             @q = qw(redo last next goto);
2004             @is_redo_last_next_goto{@q} = (1) x scalar(@q);
2005              
2006             my %is_use_require;
2007             @q = qw(use require);
2008             @is_use_require{@q} = (1) x scalar(@q);
2009              
2010             # This hash holds the array index in $self for these keywords:
2011             # Fix for issue c035: removed 'format' from this hash
2012             my %is_END_DATA = (
2013             '__END__' => _in_end_,
2014             '__DATA__' => _in_data_,
2015             );
2016              
2017             my %is_list_end_type;
2018             @q = qw( ; { } );
2019             push @q, ',';
2020             @is_list_end_type{@q} = (1) x scalar(@q);
2021              
2022             # original ref: camel 3 p 147,
2023             # but perl may accept undocumented flags
2024             # perl 5.10 adds 'p' (preserve)
2025             # Perl version 5.22 added 'n'
2026             # From http://perldoc.perl.org/perlop.html we have
2027             # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2028             # s/PATTERN/REPLACEMENT/msixpodualngcer
2029             # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2030             # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2031             # qr/STRING/msixpodualn
2032             my %quote_modifiers = (
2033             's' => '[msixpodualngcer]',
2034             'y' => '[cdsr]',
2035             'tr' => '[cdsr]',
2036             'm' => '[msixpodualngc]',
2037             'qr' => '[msixpodualn]',
2038             'q' => EMPTY_STRING,
2039             'qq' => EMPTY_STRING,
2040             'qw' => EMPTY_STRING,
2041             'qx' => EMPTY_STRING,
2042             );
2043              
2044             # table showing how many quoted things to look for after quote operator..
2045             # s, y, tr have 2 (pattern and replacement)
2046             # others have 1 (pattern only)
2047             my %quote_items = (
2048             's' => 2,
2049             'y' => 2,
2050             'tr' => 2,
2051             'm' => 1,
2052             'qr' => 1,
2053             'q' => 1,
2054             'qq' => 1,
2055             'qw' => 1,
2056             'qx' => 1,
2057             );
2058              
2059             my %is_for_foreach;
2060             @q = qw(for foreach);
2061             @is_for_foreach{@q} = (1) x scalar(@q);
2062              
2063             # These keywords may introduce blocks after parenthesized expressions,
2064             # in the form:
2065             # keyword ( .... ) { BLOCK }
2066             # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
2067             # NOTE for --use-feature=class: if ADJUST blocks eventually take a
2068             # parameter list, then ADJUST might need to be added to this list (see
2069             # perlclass.pod)
2070             my %is_blocktype_with_paren;
2071             @q =
2072             qw(if elsif unless while until for foreach switch case given when catch);
2073             @is_blocktype_with_paren{@q} = (1) x scalar(@q);
2074              
2075             my %is_case_default;
2076             @q = qw(case default);
2077             @is_case_default{@q} = (1) x scalar(@q);
2078              
2079             #------------------------
2080             # end of tokenizer hashes
2081             #------------------------
2082              
2083             # ------------------------------------------------------------
2084             # beginning of various scanner interface routines
2085             # ------------------------------------------------------------
2086             sub scan_replacement_text {
2087              
2088             # check for here-docs in replacement text invoked by
2089             # a substitution operator with executable modifier 'e'.
2090             #
2091             # given:
2092             # $replacement_text
2093             # return:
2094             # $rht = reference to any here-doc targets
2095 0     0 0 0 my ( $self, $replacement_text ) = @_;
2096              
2097             # quick check
2098 0 0       0 return unless ( $replacement_text =~ /<</ );
2099              
2100 0         0 $self->write_logfile_entry(
2101             "scanning replacement text for here-doc targets\n");
2102              
2103             # save the logger object for error messages
2104 0         0 my $logger_object = $self->[_logger_object_];
2105              
2106             # save all lexical variables
2107 0         0 my $rstate = save_tokenizer_state();
2108 0         0 _decrement_count(); # avoid error check for multiple tokenizers
2109              
2110             # make a new tokenizer
2111 0         0 my $rOpts = {};
2112 0         0 my $source_object = Perl::Tidy::LineSource->new(
2113             input_file => \$replacement_text,
2114             rOpts => $rOpts,
2115             );
2116 0         0 my $tokenizer = Perl::Tidy::Tokenizer->new(
2117             source_object => $source_object,
2118             logger_object => $logger_object,
2119             starting_line_number => $input_line_number,
2120             );
2121              
2122             # scan the replacement text
2123 0         0 1 while ( $tokenizer->get_line() );
2124              
2125             # remove any here doc targets
2126 0         0 my $rht = undef;
2127 0 0       0 if ( $tokenizer->[_in_here_doc_] ) {
2128 0         0 $rht = [];
2129 0         0 push @{$rht},
  0         0  
2130             [
2131             $tokenizer->[_here_doc_target_],
2132             $tokenizer->[_here_quote_character_]
2133             ];
2134 0 0       0 if ( $tokenizer->[_rhere_target_list_] ) {
2135 0         0 push @{$rht}, @{ $tokenizer->[_rhere_target_list_] };
  0         0  
  0         0  
2136 0         0 $tokenizer->[_rhere_target_list_] = undef;
2137             }
2138 0         0 $tokenizer->[_in_here_doc_] = undef;
2139             }
2140              
2141             # now its safe to report errors
2142 0         0 my $severe_error = $tokenizer->report_tokenization_errors();
2143              
2144             # TODO: Could propagate a severe error up
2145              
2146             # restore all tokenizer lexical variables
2147 0         0 restore_tokenizer_state($rstate);
2148              
2149             # return the here doc targets
2150 0         0 return $rht;
2151             } ## end sub scan_replacement_text
2152              
2153             sub scan_bare_identifier {
2154 1672     1672 0 3074 my $self = shift;
2155 1672         5135 ( $i, $tok, $type, $prototype ) =
2156             $self->scan_bare_identifier_do( $input_line, $i, $tok, $type,
2157             $prototype, $rtoken_map, $max_token_index );
2158 1672         3496 return;
2159             } ## end sub scan_bare_identifier
2160              
2161             sub scan_identifier {
2162              
2163 486     486 0 923 my $self = shift;
2164              
2165             (
2166 486         2936 $i, $tok, $type, $id_scan_state, $identifier,
2167             my $split_pretoken_flag
2168             )
2169             = $self->scan_complex_identifier( $i, $id_scan_state, $identifier,
2170             $rtokens, $max_token_index, $expecting,
2171             $rparen_type->[$paren_depth] );
2172              
2173             # Check for signal to fix a special variable adjacent to a keyword,
2174             # such as '$^One$0'.
2175 486 100       1587 if ($split_pretoken_flag) {
2176              
2177             # Try to fix it by splitting the pretoken
2178 3 50 33     34 if ( $i > 0
      33        
2179             && $rtokens->[ $i - 1 ] eq '^'
2180             && $self->split_pretoken(1) )
2181             {
2182 3         8 $identifier = substr( $identifier, 0, 3 );
2183 3         6 $tok = $identifier;
2184             }
2185             else {
2186              
2187             # This shouldn't happen ...
2188 0         0 my $var = substr( $tok, 0, 3 );
2189 0         0 my $excess = substr( $tok, 3 );
2190 0         0 $self->interrupt_logfile();
2191 0         0 $self->warning(<<EOM);
2192             $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2193             A space may be needed after '$var'.
2194             EOM
2195 0         0 $self->resume_logfile();
2196             }
2197             }
2198 486         908 return;
2199             } ## end sub scan_identifier
2200              
2201 38     38   484 use constant VERIFY_FASTSCAN => 0;
  38         95  
  38         3780  
2202             my %fast_scan_context;
2203              
2204             BEGIN {
2205 38     38   45804 %fast_scan_context = (
2206             '$' => SCALAR_CONTEXT,
2207             '*' => SCALAR_CONTEXT,
2208             '@' => LIST_CONTEXT,
2209             '%' => LIST_CONTEXT,
2210             '&' => UNKNOWN_CONTEXT,
2211             );
2212             } ## end BEGIN
2213              
2214             sub scan_simple_identifier {
2215              
2216             # This is a wrapper for sub scan_identifier. It does a fast preliminary
2217             # scan for certain common identifiers:
2218             # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2219             # If it does not find one of these, or this is a restart, it calls the
2220             # original scanner directly.
2221              
2222             # This gives the same results as the full scanner in about 1/4 the
2223             # total runtime for a typical input stream.
2224              
2225             # Notation:
2226             # $var * 2
2227             # ^^ ^
2228             # || |
2229             # || ---- $i_next [= next nonblank pretoken ]
2230             # |----$i_plus_1 [= a bareword ]
2231             # ---$i_begin [= a sigil]
2232              
2233 4779     4779 0 7198 my $self = shift;
2234              
2235 4779         6976 my $i_begin = $i;
2236 4779         7512 my $tok_begin = $tok;
2237 4779         7187 my $i_plus_1 = $i + 1;
2238 4779         6847 my $fast_scan_type;
2239              
2240             #-------------------------------------------------------
2241             # Do full scan for anything following a pointer, such as
2242             # $cref->&*; # a postderef
2243             #-------------------------------------------------------
2244 4779 100 66     27116 if ( $last_nonblank_token eq '->' ) {
    100 66        
    50 33        
      0        
      33        
2245              
2246             }
2247              
2248             #------------------------------
2249             # quick scan with leading sigil
2250             #------------------------------
2251             elsif ( !$id_scan_state
2252             && $i_plus_1 <= $max_token_index
2253             && $fast_scan_context{$tok} )
2254             {
2255 4666         8780 $context = $fast_scan_context{$tok};
2256              
2257             # look for $var, @var, ...
2258 4666 100 100     11470 if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
    100 66        
2259 4378         7355 my $pretype_next = EMPTY_STRING;
2260 4378 100       9429 if ( $i_plus_1 < $max_token_index ) {
2261 4262         7113 my $i_next = $i_plus_1 + 1;
2262 4262 100 100     13989 if ( $rtoken_type->[$i_next] eq 'b'
2263             && $i_next < $max_token_index )
2264             {
2265 1699         2902 $i_next += 1;
2266             }
2267 4262         7283 $pretype_next = $rtoken_type->[$i_next];
2268             }
2269 4378 100 100     15813 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2270              
2271             # Found type 'i' like '$var', '@var', or '%var'
2272 4270         8548 $identifier = $tok . $rtokens->[$i_plus_1];
2273 4270         6820 $tok = $identifier;
2274 4270         6984 $type = 'i';
2275 4270         6107 $i = $i_plus_1;
2276 4270         7201 $fast_scan_type = $type;
2277             }
2278             }
2279              
2280             # Look for @{ or %{ .
2281             # But we must let the full scanner handle things ${ because it may
2282             # keep going to get a complete identifier like '${#}' .
2283             elsif (
2284             $rtoken_type->[$i_plus_1] eq '{'
2285             && ( $tok_begin eq '@'
2286             || $tok_begin eq '%' )
2287             )
2288             {
2289              
2290 30         81 $identifier = $tok;
2291 30         63 $type = 't';
2292 30         62 $fast_scan_type = $type;
2293             }
2294             }
2295              
2296             #---------------------------
2297             # Quick scan with leading ->
2298             # Look for ->[ and ->{
2299             #---------------------------
2300             elsif (
2301             $tok eq '->'
2302             && $i < $max_token_index
2303             && ( $rtokens->[$i_plus_1] eq '{'
2304             || $rtokens->[$i_plus_1] eq '[' )
2305             )
2306             {
2307 0         0 $type = $tok;
2308 0         0 $fast_scan_type = $type;
2309 0         0 $identifier = $tok;
2310 0         0 $context = UNKNOWN_CONTEXT;
2311             }
2312              
2313             #--------------------------------------
2314             # Verify correctness during development
2315             #--------------------------------------
2316 4779         6730 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2317              
2318             # We will call the full method
2319             my $identifier_simple = $identifier;
2320             my $tok_simple = $tok;
2321             my $i_simple = $i;
2322             my $context_simple = $context;
2323              
2324             $tok = $tok_begin;
2325             $i = $i_begin;
2326             $self->scan_identifier();
2327              
2328             if ( $tok ne $tok_simple
2329             || $type ne $fast_scan_type
2330             || $i != $i_simple
2331             || $identifier ne $identifier_simple
2332             || $id_scan_state
2333             || $context ne $context_simple )
2334             {
2335             print STDERR <<EOM;
2336             scan_simple_identifier differs from scan_identifier:
2337             simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2338             full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2339             EOM
2340             }
2341             }
2342              
2343             #-------------------------------------------------
2344             # call full scanner if fast method did not succeed
2345             #-------------------------------------------------
2346 4779 100       10150 if ( !$fast_scan_type ) {
2347 479         1798 $self->scan_identifier();
2348             }
2349 4779         7961 return;
2350             } ## end sub scan_simple_identifier
2351              
2352             sub method_ok_here {
2353              
2354             # Return:
2355             # false if this is definitely an invalid method declaration
2356             # true otherwise (even if not sure)
2357              
2358             # We are trying to avoid problems with old uses of 'method'
2359             # when --use-feature=class is set (rt145706).
2360             # For example, this should cause a return of 'false':
2361              
2362             # method paint => sub {
2363             # return;
2364             # };
2365              
2366 8     8 0 21 my $self = shift;
2367              
2368             # from do_scan_sub:
2369 8         18 my $i_beg = $i + 1;
2370 8         22 my $pos_beg = $rtoken_map->[$i_beg];
2371 8         24 pos($input_line) = $pos_beg;
2372              
2373             # TEST 1: look a valid sub NAME
2374 8 50       54 if (
2375             $input_line =~ m/\G\s*
2376             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2377             (\w+) # NAME - required
2378             /gcx
2379             )
2380             {
2381             # For possible future use..
2382 8         25 my $subname = $2;
2383 8 50       33 my $package = $1 ? $1 : EMPTY_STRING;
2384             }
2385             else {
2386 0         0 return;
2387             }
2388              
2389             # TEST 2: look for invalid characters after name, such as here:
2390             # method paint => sub {
2391             # ...
2392             # }
2393 8         19 my $next_char = EMPTY_STRING;
2394 8 100       38 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  7         21  
2395 8 100 66     48 if ( !$next_char || $next_char eq '#' ) {
2396 1         14 ( $next_char, my $i_next ) =
2397             $self->find_next_nonblank_token( $max_token_index,
2398             $rtokens, $max_token_index );
2399             }
2400              
2401 8 50       27 if ( !$next_char ) {
2402              
2403             # out of characters - give up
2404 0         0 return;
2405             }
2406              
2407             # Possibly valid next token types:
2408             # '(' could start prototype or signature
2409             # ':' could start ATTRIBUTE
2410             # '{' cold start BLOCK
2411             # ';' or '}' could end a statement
2412 8 100       30 if ( $next_char !~ /^[\(\:\{\;\}]/ ) {
2413              
2414             # This does not match use feature 'class' syntax
2415 3         13 return;
2416             }
2417              
2418             # We will stop here and assume that this is valid syntax for
2419             # use feature 'class'.
2420 5         25 return 1;
2421             } ## end sub method_ok_here
2422              
2423             sub class_ok_here {
2424              
2425             # Return:
2426             # false if this is definitely an invalid class declaration
2427             # true otherwise (even if not sure)
2428              
2429             # We are trying to avoid problems with old uses of 'class'
2430             # when --use-feature=class is set (rt145706). We look ahead
2431             # see if this use of 'class' is obviously inconsistent with
2432             # the syntax of use feature 'class'. This allows the default
2433             # setting --use-feature=class to work for old syntax too.
2434              
2435             # Valid class declarations look like
2436             # class NAME ?ATTRS ?VERSION ?BLOCK
2437             # where ATTRS VERSION and BLOCK are optional
2438              
2439             # For example, this should produce a return of 'false':
2440             #
2441             # class ExtendsBasicAttributes is BasicAttributes{
2442              
2443 6     6 0 12 my $self = shift;
2444              
2445             # TEST 1: class stmt can only go where a new statment can start
2446 6 50       18 if ( !new_statement_ok() ) { return }
  0         0  
2447              
2448 6         12 my $i_beg = $i + 1;
2449 6         15 my $pos_beg = $rtoken_map->[$i_beg];
2450 6         19 pos($input_line) = $pos_beg;
2451              
2452             # TEST 2: look for a valid NAME
2453 6 50       35 if (
2454             $input_line =~ m/\G\s*
2455             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2456             (\w+) # NAME - required
2457             /gcx
2458             )
2459             {
2460             # For possible future use..
2461 6         20 my $subname = $2;
2462 6 100       20 my $package = $1 ? $1 : EMPTY_STRING;
2463             }
2464             else {
2465 0         0 return;
2466             }
2467              
2468             # TEST 3: look for valid characters after NAME
2469 6         10 my $next_char = EMPTY_STRING;
2470 6 100       20 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
  5         18  
2471 6 100 66     27 if ( !$next_char || $next_char eq '#' ) {
2472 1         4 ( $next_char, my $i_next ) =
2473             $self->find_next_nonblank_token( $max_token_index,
2474             $rtokens, $max_token_index );
2475             }
2476 6 50       17 if ( !$next_char ) {
2477              
2478             # out of characters - give up
2479 0         0 return;
2480             }
2481              
2482             # Must see one of: ATTRIBUTE, VERSION, BLOCK, or end stmt
2483              
2484             # Possibly valid next token types:
2485             # ':' could start ATTRIBUTE
2486             # '\d' could start VERSION
2487             # '{' cold start BLOCK
2488             # ';' could end a statement
2489             # '}' could end statement but would be strange
2490              
2491 6 100       22 if ( $next_char !~ /^[\:\d\{\;\}]/ ) {
2492              
2493             # This does not match use feature 'class' syntax
2494 2         9 return;
2495             }
2496              
2497             # We will stop here and assume that this is valid syntax for
2498             # use feature 'class'.
2499 4         17 return 1;
2500             } ## end sub class_ok_here
2501              
2502             sub scan_id {
2503 330     330 0 706 my $self = shift;
2504 330         1299 ( $i, $tok, $type, $id_scan_state ) =
2505             $self->scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2506             $id_scan_state, $max_token_index );
2507 330         818 return;
2508             } ## end sub scan_id
2509              
2510             sub scan_number {
2511 629     629 0 1134 my $self = shift;
2512 629         1021 my $number;
2513 629         1848 ( $i, $type, $number ) =
2514             $self->scan_number_do( $input_line, $i, $rtoken_map, $type,
2515             $max_token_index );
2516 629         1530 return $number;
2517             } ## end sub scan_number
2518              
2519 38     38   1630 use constant VERIFY_FASTNUM => 0;
  38         98  
  38         351150  
2520              
2521             sub scan_number_fast {
2522              
2523             # This is a wrapper for sub scan_number. It does a fast preliminary
2524             # scan for a simple integer. It calls the original scan_number if it
2525             # does not find one.
2526              
2527 2272     2272 0 3565 my $self = shift;
2528 2272         3440 my $i_begin = $i;
2529 2272         3499 my $tok_begin = $tok;
2530 2272         3248 my $number;
2531              
2532             #---------------------------------
2533             # Quick check for (signed) integer
2534             #---------------------------------
2535              
2536             # This will be the string of digits:
2537 2272         3355 my $i_d = $i;
2538 2272         3453 my $tok_d = $tok;
2539 2272         4254 my $typ_d = $rtoken_type->[$i_d];
2540              
2541             # check for signed integer
2542 2272         3666 my $sign = EMPTY_STRING;
2543 2272 50 66     6962 if ( $typ_d ne 'd'
      66        
      33        
2544             && ( $typ_d eq '+' || $typ_d eq '-' )
2545             && $i_d < $max_token_index )
2546             {
2547 343         595 $sign = $tok_d;
2548 343         532 $i_d++;
2549 343         592 $tok_d = $rtokens->[$i_d];
2550 343         614 $typ_d = $rtoken_type->[$i_d];
2551             }
2552              
2553             # Handle integers
2554 2272 100 100     16419 if (
      100        
2555             $typ_d eq 'd'
2556             && (
2557             $i_d == $max_token_index
2558             || ( $i_d < $max_token_index
2559             && $rtoken_type->[ $i_d + 1 ] ne '.'
2560             && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2561             )
2562             )
2563             {
2564             # Let let full scanner handle multi-digit integers beginning with
2565             # '0' because there could be error messages. For example, '009' is
2566             # not a valid number.
2567              
2568 1710 100 100     7451 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2569 1653         3136 $number = $sign . $tok_d;
2570 1653         2652 $type = 'n';
2571 1653         2682 $i = $i_d;
2572             }
2573             }
2574              
2575             #--------------------------------------
2576             # Verify correctness during development
2577             #--------------------------------------
2578 2272         3278 if ( VERIFY_FASTNUM && defined($number) ) {
2579              
2580             # We will call the full method
2581             my $type_simple = $type;
2582             my $i_simple = $i;
2583             my $number_simple = $number;
2584              
2585             $tok = $tok_begin;
2586             $i = $i_begin;
2587             $number = $self->scan_number();
2588              
2589             if ( $type ne $type_simple
2590             || ( $i != $i_simple && $i <= $max_token_index )
2591             || $number ne $number_simple )
2592             {
2593             print STDERR <<EOM;
2594             scan_number_fast differs from scan_number:
2595             simple: i=$i_simple, type=$type_simple, number=$number_simple
2596             full: i=$i, type=$type, number=$number
2597             EOM
2598             }
2599             }
2600              
2601             #----------------------------------------
2602             # call full scanner if may not be integer
2603             #----------------------------------------
2604 2272 100       4978 if ( !defined($number) ) {
2605 619         1585 $number = $self->scan_number();
2606             }
2607 2272         5263 return $number;
2608             } ## end sub scan_number_fast
2609              
2610             # a sub to warn if token found where term expected
2611             sub error_if_expecting_TERM {
2612 9     9 0 19 my $self = shift;
2613 9 50       29 if ( $expecting == TERM ) {
2614 9 50       30 if ( $really_want_term{$last_nonblank_type} ) {
2615 0         0 $self->report_unexpected( $tok, "term", $i_tok,
2616             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2617 0         0 return 1;
2618             }
2619             }
2620 9         18 return;
2621             } ## end sub error_if_expecting_TERM
2622              
2623             # a sub to warn if token found where operator expected
2624             sub error_if_expecting_OPERATOR {
2625 769     769 0 1666 my ( $self, $thing ) = @_;
2626 769 50       1833 if ( $expecting == OPERATOR ) {
2627 0 0       0 if ( !defined($thing) ) { $thing = $tok }
  0         0  
2628 0         0 $self->report_unexpected( $thing, "operator", $i_tok,
2629             $last_nonblank_i, $rtoken_map, $rtoken_type, $input_line );
2630 0 0       0 if ( $i_tok == 0 ) {
2631 0         0 $self->interrupt_logfile();
2632 0         0 $self->warning("Missing ';' or ',' above?\n");
2633 0         0 $self->resume_logfile();
2634             }
2635 0         0 return 1;
2636             }
2637 769         1517 return;
2638             } ## end sub error_if_expecting_OPERATOR
2639              
2640             # ------------------------------------------------------------
2641             # end scanner interfaces
2642             # ------------------------------------------------------------
2643              
2644             #------------------
2645             # Tokenization subs
2646             #------------------
2647             sub do_GREATER_THAN_SIGN {
2648              
2649 31     31 0 84 my $self = shift;
2650              
2651             # '>'
2652 31 50       118 $self->error_if_expecting_TERM()
2653             if ( $expecting == TERM );
2654 31         71 return;
2655             } ## end sub do_GREATER_THAN_SIGN
2656              
2657             sub do_VERTICAL_LINE {
2658              
2659 4     4 0 10 my $self = shift;
2660              
2661             # '|'
2662 4 50       12 $self->error_if_expecting_TERM()
2663             if ( $expecting == TERM );
2664 4         7 return;
2665             } ## end sub do_VERTICAL_LINE
2666              
2667             sub do_DOLLAR_SIGN {
2668              
2669 4024     4024 0 7182 my $self = shift;
2670              
2671             # '$'
2672             # start looking for a scalar
2673 4024 50       9035 $self->error_if_expecting_OPERATOR("Scalar")
2674             if ( $expecting == OPERATOR );
2675 4024         12174 $self->scan_simple_identifier();
2676              
2677 4024 100       9213 if ( $identifier eq '$^W' ) {
2678 1         4 $self->[_saw_perl_dash_w_] = 1;
2679             }
2680              
2681             # Check for identifier in indirect object slot
2682             # (vorboard.pl, sort.t). Something like:
2683             # /^(print|printf|sort|exec|system)$/
2684 4024 100 66     31930 if (
      100        
      100        
      66        
      66        
2685             $is_indirect_object_taker{$last_nonblank_token}
2686             && $last_nonblank_type eq 'k'
2687             || ( ( $last_nonblank_token eq '(' )
2688             && $is_indirect_object_taker{ $rparen_type->[$paren_depth] } )
2689             || ( $last_nonblank_type eq 'w'
2690             || $last_nonblank_type eq 'U' ) # possible object
2691             )
2692             {
2693              
2694             # An identifier followed by '->' is not indirect object;
2695             # fixes b1175, b1176
2696 98         647 my ( $next_nonblank_type, $i_next ) =
2697             $self->find_next_noncomment_type( $i, $rtokens,
2698             $max_token_index );
2699 98 100       379 $type = 'Z' if ( $next_nonblank_type ne '->' );
2700             }
2701 4024         6532 return;
2702             } ## end sub do_DOLLAR_SIGN
2703              
2704             sub do_LEFT_PARENTHESIS {
2705              
2706 2121     2121 0 4408 my $self = shift;
2707              
2708             # '('
2709 2121         3533 ++$paren_depth;
2710 2121         4350 $rparen_semicolon_count->[$paren_depth] = 0;
2711 2121 100       6387 if ($want_paren) {
    100          
2712 68         193 $container_type = $want_paren;
2713 68         164 $want_paren = EMPTY_STRING;
2714             }
2715             elsif ( $statement_type =~ /^sub\b/ ) {
2716 14         42 $container_type = $statement_type;
2717             }
2718             else {
2719 2039         3570 $container_type = $last_nonblank_token;
2720              
2721             # We can check for a syntax error here of unexpected '(',
2722             # but this is going to get messy...
2723 2039 100 100     7969 if (
2724             $expecting == OPERATOR
2725              
2726             # Be sure this is not a method call of the form
2727             # &method(...), $method->(..), &{method}(...),
2728             # $ref[2](list) is ok & short for $ref[2]->(list)
2729             # NOTE: at present, braces in something like &{ xxx }
2730             # are not marked as a block, we might have a method call.
2731             # Added ')' to fix case c017, something like ()()()
2732             && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2733             )
2734             {
2735              
2736             # ref: camel 3 p 703.
2737 3 50       11 if ( $last_last_nonblank_token eq 'do' ) {
2738 0         0 $self->complain(
2739             "do SUBROUTINE is deprecated; consider & or -> notation\n"
2740             );
2741             }
2742             else {
2743              
2744             # if this is an empty list, (), then it is not an
2745             # error; for example, we might have a constant pi and
2746             # invoke it with pi() or just pi;
2747 3         10 my ( $next_nonblank_token, $i_next ) =
2748             $self->find_next_nonblank_token( $i, $rtokens,
2749             $max_token_index );
2750              
2751             # Patch for c029: give up error check if
2752             # a side comment follows
2753 3 50 33     20 if ( $next_nonblank_token ne ')'
2754             && $next_nonblank_token ne '#' )
2755             {
2756 0         0 my $hint;
2757              
2758 0         0 $self->error_if_expecting_OPERATOR('(');
2759              
2760 0 0       0 if ( $last_nonblank_type eq 'C' ) {
    0          
2761 0         0 $hint =
2762             "$last_nonblank_token has a void prototype\n";
2763             }
2764             elsif ( $last_nonblank_type eq 'i' ) {
2765 0 0 0     0 if ( $i_tok > 0
2766             && $last_nonblank_token =~ /^\$/ )
2767             {
2768 0         0 $hint =
2769             "Do you mean '$last_nonblank_token->(' ?\n";
2770             }
2771             }
2772 0 0       0 if ($hint) {
2773 0         0 $self->interrupt_logfile();
2774 0         0 $self->warning($hint);
2775 0         0 $self->resume_logfile();
2776             }
2777             } ## end if ( $next_nonblank_token...
2778             } ## end else [ if ( $last_last_nonblank_token...
2779             } ## end if ( $expecting == OPERATOR...
2780             }
2781              
2782             # Do not update container type at ') ('; fix for git #105. This will
2783             # propagate the container type onward so that any subsequent brace gets
2784             # correctly marked. I have implemented this as a general rule, which
2785             # should be safe, but if necessary it could be restricted to certain
2786             # container statement types such as 'for'.
2787 2121 100       6272 $rparen_type->[$paren_depth] = $container_type
2788             if ( $last_nonblank_token ne ')' );
2789              
2790 2121         6422 ( $type_sequence, $indent_flag ) =
2791             $self->increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2792              
2793             # propagate types down through nested parens
2794             # for example: the second paren in 'if ((' would be structural
2795             # since the first is.
2796              
2797 2121 100       5635 if ( $last_nonblank_token eq '(' ) {
2798 61         233 $type = $last_nonblank_type;
2799             }
2800              
2801             # We exclude parens as structural after a ',' because it
2802             # causes subtle problems with continuation indentation for
2803             # something like this, where the first 'or' will not get
2804             # indented.
2805             #
2806             # assert(
2807             # __LINE__,
2808             # ( not defined $check )
2809             # or ref $check
2810             # or $check eq "new"
2811             # or $check eq "old",
2812             # );
2813             #
2814             # Likewise, we exclude parens where a statement can start
2815             # because of problems with continuation indentation, like
2816             # these:
2817             #
2818             # ($firstline =~ /^#\!.*perl/)
2819             # and (print $File::Find::name, "\n")
2820             # and (return 1);
2821             #
2822             # (ref($usage_fref) =~ /CODE/)
2823             # ? &$usage_fref
2824             # : (&blast_usage, &blast_params, &blast_general_params);
2825              
2826             else {
2827 2060         3641 $type = '{';
2828             }
2829              
2830 2121 50       5061 if ( $last_nonblank_type eq ')' ) {
2831 0         0 $self->warning(
2832             "Syntax error? found token '$last_nonblank_type' then '('\n");
2833             }
2834 2121         4115 $rparen_structural_type->[$paren_depth] = $type;
2835 2121         3477 return;
2836              
2837             } ## end sub do_LEFT_PARENTHESIS
2838              
2839             sub do_RIGHT_PARENTHESIS {
2840              
2841 2121     2121 0 4395 my $self = shift;
2842              
2843             # ')'
2844 2121         6662 ( $type_sequence, $indent_flag ) =
2845             $self->decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2846              
2847 2121 50       6339 if ( $rparen_structural_type->[$paren_depth] eq '{' ) {
2848 2121         3736 $type = '}';
2849             }
2850              
2851 2121         4074 $container_type = $rparen_type->[$paren_depth];
2852              
2853             # restore statement type as 'sub' at closing paren of a signature
2854             # so that a subsequent ':' is identified as an attribute
2855 2121 100       6167 if ( $container_type =~ /^sub\b/ ) {
2856 24         70 $statement_type = $container_type;
2857             }
2858              
2859             # /^(for|foreach)$/
2860 2121 100       5771 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } ) {
2861 69         238 my $num_sc = $rparen_semicolon_count->[$paren_depth];
2862 69 50 66     486 if ( $num_sc > 0 && $num_sc != 2 ) {
2863 0         0 $self->warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2864             }
2865             }
2866              
2867 2121 50       4920 if ( $paren_depth > 0 ) { $paren_depth-- }
  2121         3241  
2868 2121         3548 return;
2869             } ## end sub do_RIGHT_PARENTHESIS
2870              
2871             sub do_COMMA {
2872              
2873 3075     3075 0 5166 my $self = shift;
2874              
2875             # ','
2876 3075 100 33     11169 if ( $last_nonblank_type eq ',' ) {
    50          
2877 10         62 $self->complain("Repeated ','s \n");
2878             }
2879              
2880             # Note that we have to check both token and type here because a
2881             # comma following a qw list can have last token='(' but type = 'q'
2882             elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2883 0         0 $self->warning("Unexpected leading ',' after a '('\n");
2884             }
2885              
2886             # patch for operator_expected: note if we are in the list (use.t)
2887 3075 100       6430 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  6         15  
2888 3075         4684 return;
2889              
2890             } ## end sub do_COMMA
2891              
2892             sub do_SEMICOLON {
2893              
2894 2448     2448 0 4848 my $self = shift;
2895              
2896             # ';'
2897 2448         3965 $context = UNKNOWN_CONTEXT;
2898 2448         4033 $statement_type = EMPTY_STRING;
2899 2448         4028 $want_paren = EMPTY_STRING;
2900              
2901             # /^(for|foreach)$/
2902 2448 100       7086 if ( $is_for_foreach{ $rparen_type->[$paren_depth] } )
2903             { # mark ; in for loop
2904              
2905             # Be careful: we do not want a semicolon such as the
2906             # following to be included:
2907             #
2908             # for (sort {strcoll($a,$b);} keys %investments) {
2909              
2910 35 100 66     240 if ( $brace_depth == $rdepth_array->[PAREN][BRACE][$paren_depth]
2911             && $square_bracket_depth ==
2912             $rdepth_array->[PAREN][SQUARE_BRACKET][$paren_depth] )
2913             {
2914              
2915 34         68 $type = 'f';
2916 34         76 $rparen_semicolon_count->[$paren_depth]++;
2917             }
2918             }
2919 2448         4654 return;
2920             } ## end sub do_SEMICOLON
2921              
2922             sub do_QUOTATION_MARK {
2923              
2924 1124     1124 0 2276 my $self = shift;
2925              
2926             # '"'
2927 1124 50       2799 $self->error_if_expecting_OPERATOR("String")
2928             if ( $expecting == OPERATOR );
2929 1124         1838 $in_quote = 1;
2930 1124         1934 $type = 'Q';
2931 1124         1877 $allowed_quote_modifiers = EMPTY_STRING;
2932 1124         1824 return;
2933             } ## end sub do_QUOTATION_MARK
2934              
2935             sub do_APOSTROPHE {
2936              
2937 1160     1160 0 2243 my $self = shift;
2938              
2939             # "'"
2940 1160 50       2861 $self->error_if_expecting_OPERATOR("String")
2941             if ( $expecting == OPERATOR );
2942 1160         1944 $in_quote = 1;
2943 1160         1968 $type = 'Q';
2944 1160         1881 $allowed_quote_modifiers = EMPTY_STRING;
2945 1160         1879 return;
2946             } ## end sub do_APOSTROPHE
2947              
2948             sub do_BACKTICK {
2949              
2950 0     0 0 0 my $self = shift;
2951              
2952             # '`'
2953 0 0       0 $self->error_if_expecting_OPERATOR("String")
2954             if ( $expecting == OPERATOR );
2955 0         0 $in_quote = 1;
2956 0         0 $type = 'Q';
2957 0         0 $allowed_quote_modifiers = EMPTY_STRING;
2958 0         0 return;
2959             } ## end sub do_BACKTICK
2960              
2961             sub do_SLASH {
2962              
2963 207     207 0 477 my $self = shift;
2964              
2965             # '/'
2966 207         384 my $is_pattern;
2967              
2968             # a pattern cannot follow certain keywords which take optional
2969             # arguments, like 'shift' and 'pop'. See also '?'.
2970 207 50 66     1127 if (
    50          
2971             $last_nonblank_type eq 'k'
2972             && $is_keyword_rejecting_slash_as_pattern_delimiter{
2973             $last_nonblank_token}
2974             )
2975             {
2976 0         0 $is_pattern = 0;
2977             }
2978             elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2979 0         0 my $msg;
2980 0         0 ( $is_pattern, $msg ) =
2981             $self->guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2982             $max_token_index );
2983              
2984 0 0       0 if ($msg) {
2985 0         0 $self->write_diagnostics("DIVIDE:$msg\n");
2986 0         0 $self->write_logfile_entry($msg);
2987             }
2988             }
2989 207         711 else { $is_pattern = ( $expecting == TERM ) }
2990              
2991 207 100       498 if ($is_pattern) {
2992 78         152 $in_quote = 1;
2993 78         151 $type = 'Q';
2994 78         156 $allowed_quote_modifiers = '[msixpodualngc]';
2995             }
2996             else { # not a pattern; check for a /= token
2997              
2998 129 50       409 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2999 0         0 $i++;
3000 0         0 $tok = '/=';
3001 0         0 $type = $tok;
3002             }
3003              
3004             #DEBUG - collecting info on what tokens follow a divide
3005             # for development of guessing algorithm
3006             ## if (
3007             ## $self->is_possible_numerator( $i, $rtokens,
3008             ## $max_token_index ) < 0
3009             ## )
3010             ## {
3011             ## $self->write_diagnostics("DIVIDE? $input_line\n");
3012             ## }
3013             }
3014 207         429 return;
3015             } ## end sub do_SLASH
3016              
3017             sub do_LEFT_CURLY_BRACKET {
3018              
3019 1659     1659 0 3236 my $self = shift;
3020              
3021             # '{'
3022             # if we just saw a ')', we will label this block with
3023             # its type. We need to do this to allow sub
3024             # code_block_type to determine if this brace starts a
3025             # code block or anonymous hash. (The type of a paren
3026             # pair is the preceding token, such as 'if', 'else',
3027             # etc).
3028 1659         3006 $container_type = EMPTY_STRING;
3029              
3030             # ATTRS: for a '{' following an attribute list, reset
3031             # things to look like we just saw the sub name
3032             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3033 1659 100 100     13756 if ( $statement_type =~ /^(sub|package)\b/ ) {
    50 66        
    100 33        
    50          
3034 38         94 $last_nonblank_token = $statement_type;
3035 38         81 $last_nonblank_type = 'i';
3036 38         70 $statement_type = EMPTY_STRING;
3037             }
3038              
3039             # patch for SWITCH/CASE: hide these keywords from an immediately
3040             # following opening brace
3041             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
3042             && $statement_type eq $last_nonblank_token )
3043             {
3044 0         0 $last_nonblank_token = ";";
3045             }
3046              
3047             elsif ( $last_nonblank_token eq ')' ) {
3048 237         763 $last_nonblank_token = $rparen_type->[ $paren_depth + 1 ];
3049              
3050             # defensive move in case of a nesting error (pbug.t)
3051             # in which this ')' had no previous '('
3052             # this nesting error will have been caught
3053 237 50       770 if ( !defined($last_nonblank_token) ) {
3054 0         0 $last_nonblank_token = 'if';
3055             }
3056              
3057             # check for syntax error here;
3058 237 100       1041 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
3059 14 50       76 if ( $self->[_extended_syntax_] ) {
3060              
3061             # we append a trailing () to mark this as an unknown
3062             # block type. This allows perltidy to format some
3063             # common extensions of perl syntax.
3064             # This is used by sub code_block_type
3065 14         48 $last_nonblank_token .= '()';
3066             }
3067             else {
3068 0         0 my $list =
3069             join( SPACE, sort keys %is_blocktype_with_paren );
3070 0         0 $self->warning(
3071             "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
3072             );
3073             }
3074             }
3075             }
3076              
3077             # patch for paren-less for/foreach glitch, part 2.
3078             # see note below under 'qw'
3079             elsif ($last_nonblank_token eq 'qw'
3080             && $is_for_foreach{$want_paren} )
3081             {
3082 0         0 $last_nonblank_token = $want_paren;
3083 0 0       0 if ( $last_last_nonblank_token eq $want_paren ) {
3084 0         0 $self->warning(
3085             "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
3086             );
3087              
3088             }
3089 0         0 $want_paren = EMPTY_STRING;
3090             }
3091              
3092             # now identify which of the three possible types of
3093             # curly braces we have: hash index container, anonymous
3094             # hash reference, or code block.
3095              
3096             # non-structural (hash index) curly brace pair
3097             # get marked 'L' and 'R'
3098 1659 100       4549 if ( is_non_structural_brace() ) {
3099 363         865 $type = 'L';
3100              
3101             # patch for SWITCH/CASE:
3102             # allow paren-less identifier after 'when'
3103             # if the brace is preceded by a space
3104 363 0 33     1444 if ( $statement_type eq 'when'
      33        
      0        
      0        
3105             && $last_nonblank_type eq 'i'
3106             && $last_last_nonblank_type eq 'k'
3107             && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
3108             {
3109 0         0 $type = '{';
3110 0         0 $block_type = $statement_type;
3111             }
3112             }
3113              
3114             # code and anonymous hash have the same type, '{', but are
3115             # distinguished by 'block_type',
3116             # which will be blank for an anonymous hash
3117             else {
3118              
3119 1296         4418 $block_type =
3120             $self->code_block_type( $i_tok, $rtokens, $rtoken_type,
3121             $max_token_index );
3122              
3123             # patch to promote bareword type to function taking block
3124 1296 100 100     6063 if ( $block_type
      66        
3125             && $last_nonblank_type eq 'w'
3126             && $last_nonblank_i >= 0 )
3127             {
3128 34 50       181 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
3129             $routput_token_type->[$last_nonblank_i] =
3130 34 100       220 $is_grep_alias{$block_type} ? 'k' : 'G';
3131             }
3132             }
3133              
3134             # patch for SWITCH/CASE: if we find a stray opening block brace
3135             # where we might accept a 'case' or 'when' block, then take it
3136 1296 100 100     5549 if ( $statement_type eq 'case'
3137             || $statement_type eq 'when' )
3138             {
3139 38 100 66     219 if ( !$block_type || $block_type eq '}' ) {
3140 4         6 $block_type = $statement_type;
3141             }
3142             }
3143             }
3144              
3145 1659         3781 $rbrace_type->[ ++$brace_depth ] = $block_type;
3146              
3147             # Patch for CLASS BLOCK definitions: do not update the package for the
3148             # current depth if this is a BLOCK type definition.
3149             # TODO: should make 'class' separate from 'package' and only do
3150             # this for 'class'
3151 1659 100       5504 $rbrace_package->[$brace_depth] = $current_package
3152             if ( substr( $block_type, 0, 8 ) ne 'package ' );
3153              
3154 1659         3422 $rbrace_structural_type->[$brace_depth] = $type;
3155 1659         3263 $rbrace_context->[$brace_depth] = $context;
3156 1659         4768 ( $type_sequence, $indent_flag ) =
3157             $self->increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3158 1659         3342 return;
3159             } ## end sub do_LEFT_CURLY_BRACKET
3160              
3161             sub do_RIGHT_CURLY_BRACKET {
3162              
3163 1659     1659 0 3697 my $self = shift;
3164              
3165             # '}'
3166 1659         3560 $block_type = $rbrace_type->[$brace_depth];
3167 1659 100       4267 if ($block_type) { $statement_type = EMPTY_STRING }
  967         1999  
3168 1659 100       3848 if ( defined( $rbrace_package->[$brace_depth] ) ) {
3169 1655         3187 $current_package = $rbrace_package->[$brace_depth];
3170             }
3171              
3172             # can happen on brace error (caught elsewhere)
3173             else {
3174             }
3175 1659         5050 ( $type_sequence, $indent_flag ) =
3176             $self->decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
3177              
3178 1659 100       5435 if ( $rbrace_structural_type->[$brace_depth] eq 'L' ) {
3179 363         776 $type = 'R';
3180             }
3181              
3182             # propagate type information for 'do' and 'eval' blocks, and also
3183             # for smartmatch operator. This is necessary to enable us to know
3184             # if an operator or term is expected next.
3185 1659 100       4701 if ( $is_block_operator{$block_type} ) {
3186 83         224 $tok = $block_type;
3187             }
3188              
3189 1659         2914 $context = $rbrace_context->[$brace_depth];
3190 1659 50       3927 if ( $brace_depth > 0 ) { $brace_depth--; }
  1659         3304  
3191 1659         2764 return;
3192             } ## end sub do_RIGHT_CURLY_BRACKET
3193              
3194             sub do_AMPERSAND {
3195              
3196 126     126 0 326 my $self = shift;
3197              
3198             # '&' = maybe sub call? start looking
3199             # We have to check for sub call unless we are sure we
3200             # are expecting an operator. This example from s2p
3201             # got mistaken as a q operator in an early version:
3202             # print BODY &q(<<'EOT');
3203 126 100       380 if ( $expecting != OPERATOR ) {
3204              
3205             # But only look for a sub call if we are expecting a term or
3206             # if there is no existing space after the &.
3207             # For example we probably don't want & as sub call here:
3208             # Fcntl::S_IRUSR & $mode;
3209 107 100 66     487 if ( $expecting == TERM || $next_type ne 'b' ) {
3210 104         321 $self->scan_simple_identifier();
3211             }
3212             }
3213             else {
3214             }
3215 126         289 return;
3216             } ## end sub do_AMPERSAND
3217              
3218             sub do_LESS_THAN_SIGN {
3219              
3220 29     29 0 94 my $self = shift;
3221              
3222             # '<' - angle operator or less than?
3223 29 100       121 if ( $expecting != OPERATOR ) {
3224 8         55 ( $i, $type ) =
3225             $self->find_angle_operator_termination( $input_line, $i,
3226             $rtoken_map, $expecting, $max_token_index );
3227              
3228             ## This message is not very helpful and quite confusing if the above
3229             ## routine decided not to write a message with the line number.
3230             ## if ( $type eq '<' && $expecting == TERM ) {
3231             ## $self->error_if_expecting_TERM();
3232             ## $self->interrupt_logfile();
3233             ## $self->warning("Unterminated <> operator?\n");
3234             ## $self->resume_logfile();
3235             ## }
3236              
3237             }
3238             else {
3239             }
3240 29         340 return;
3241             } ## end sub do_LESS_THAN_SIGN
3242              
3243             sub do_QUESTION_MARK {
3244              
3245 187     187 0 553 my $self = shift;
3246              
3247             # '?' = conditional or starting pattern?
3248 187         380 my $is_pattern;
3249              
3250             # Patch for rt #126965
3251             # a pattern cannot follow certain keywords which take optional
3252             # arguments, like 'shift' and 'pop'. See also '/'.
3253 187 100 66     1522 if (
    100          
    100          
3254             $last_nonblank_type eq 'k'
3255             && $is_keyword_rejecting_question_as_pattern_delimiter{
3256             $last_nonblank_token}
3257             )
3258             {
3259 1         3 $is_pattern = 0;
3260             }
3261              
3262             # patch for RT#131288, user constant function without prototype
3263             # last type is 'U' followed by ?.
3264             elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
3265 1         3 $is_pattern = 0;
3266             }
3267             elsif ( $expecting == UNKNOWN ) {
3268              
3269             # In older versions of Perl, a bare ? can be a pattern
3270             # delimiter. In perl version 5.22 this was
3271             # dropped, but we have to support it in order to format
3272             # older programs. See:
3273             ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
3274             # For example, the following line worked
3275             # at one time:
3276             # ?(.*)? && (print $1,"\n");
3277             # In current versions it would have to be written with slashes:
3278             # /(.*)/ && (print $1,"\n");
3279 11         32 my $msg;
3280 11         69 ( $is_pattern, $msg ) =
3281             $self->guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
3282             $max_token_index );
3283              
3284 11 50       51 if ($msg) { $self->write_logfile_entry($msg) }
  11         48  
3285             }
3286 174         442 else { $is_pattern = ( $expecting == TERM ) }
3287              
3288 187 50       548 if ($is_pattern) {
3289 0         0 $in_quote = 1;
3290 0         0 $type = 'Q';
3291 0         0 $allowed_quote_modifiers = '[msixpodualngc]';
3292             }
3293             else {
3294 187         729 ( $type_sequence, $indent_flag ) =
3295             $self->increase_nesting_depth( QUESTION_COLON,
3296             $rtoken_map->[$i_tok] );
3297             }
3298 187         443 return;
3299             } ## end sub do_QUESTION_MARK
3300              
3301             sub do_STAR {
3302              
3303 238     238 0 484 my $self = shift;
3304              
3305             # '*' = typeglob, or multiply?
3306 238 50 66     841 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
3307 0 0 0     0 if ( $next_type ne 'b'
      0        
3308             && $next_type ne '('
3309             && $next_type ne '#' ) # Fix c036
3310             {
3311 0         0 $expecting = TERM;
3312             }
3313             }
3314 238 100       622 if ( $expecting == TERM ) {
3315 21         89 $self->scan_simple_identifier();
3316             }
3317             else {
3318              
3319 217 50       877 if ( $rtokens->[ $i + 1 ] eq '=' ) {
    100          
3320 0         0 $tok = '*=';
3321 0         0 $type = $tok;
3322 0         0 $i++;
3323             }
3324             elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
3325 36         102 $tok = '**';
3326 36         67 $type = $tok;
3327 36         61 $i++;
3328 36 50       123 if ( $rtokens->[ $i + 1 ] eq '=' ) {
3329 0         0 $tok = '**=';
3330 0         0 $type = $tok;
3331 0         0 $i++;
3332             }
3333             }
3334             }
3335 238         420 return;
3336             } ## end sub do_STAR
3337              
3338             sub do_DOT {
3339              
3340 150     150 0 329 my $self = shift;
3341              
3342             # '.' = what kind of . ?
3343 150 100       448 if ( $expecting != OPERATOR ) {
3344 10         39 $self->scan_number();
3345 10 100       29 if ( $type eq '.' ) {
3346 2 50       7 $self->error_if_expecting_TERM()
3347             if ( $expecting == TERM );
3348             }
3349             }
3350             else {
3351             }
3352 150         299 return;
3353             } ## end sub do_DOT
3354              
3355             sub do_COLON {
3356              
3357 271     271 0 718 my $self = shift;
3358              
3359             # ':' = label, ternary, attribute, ?
3360              
3361             # if this is the first nonblank character, call it a label
3362             # since perl seems to just swallow it
3363 271 50 66     3765 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
    100 66        
    100 66        
    100 66        
    100          
3364 0         0 $type = 'J';
3365             }
3366              
3367             # ATTRS: check for a ':' which introduces an attribute list
3368             # either after a 'sub' keyword or within a paren list
3369             # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3370             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
3371 22         60 $type = 'A';
3372 22         62 $self->[_in_attribute_list_] = 1;
3373             }
3374              
3375             # Within a signature, unless we are in a ternary. For example,
3376             # from 't/filter_example.t':
3377             # method foo4 ( $class: $bar ) { $class->bar($bar) }
3378             elsif ( $rparen_type->[$paren_depth] =~ /^sub\b/
3379             && !is_balanced_closing_container(QUESTION_COLON) )
3380             {
3381 1         3 $type = 'A';
3382 1         2 $self->[_in_attribute_list_] = 1;
3383             }
3384              
3385             # check for scalar attribute, such as
3386             # my $foo : shared = 1;
3387             elsif ($is_my_our_state{$statement_type}
3388             && $rcurrent_depth->[QUESTION_COLON] == 0 )
3389             {
3390 15         31 $type = 'A';
3391 15         30 $self->[_in_attribute_list_] = 1;
3392             }
3393              
3394             # Look for Switch::Plain syntax if an error would otherwise occur
3395             # here. Note that we do not need to check if the extended syntax
3396             # flag is set because otherwise an error would occur, and we would
3397             # then have to output a message telling the user to set the
3398             # extended syntax flag to avoid the error.
3399             # case 1: {
3400             # default: {
3401             # default:
3402             # Note that the line 'default:' will be parsed as a label elsewhere.
3403             elsif ( $is_case_default{$statement_type}
3404             && !is_balanced_closing_container(QUESTION_COLON) )
3405             {
3406             # mark it as a perltidy label type
3407 46         106 $type = 'J';
3408             }
3409              
3410             # otherwise, it should be part of a ?/: operator
3411             else {
3412 187         764 ( $type_sequence, $indent_flag ) =
3413             $self->decrease_nesting_depth( QUESTION_COLON,
3414             $rtoken_map->[$i_tok] );
3415 187 50       962 if ( $last_nonblank_token eq '?' ) {
3416 0         0 $self->warning("Syntax error near ? :\n");
3417             }
3418             }
3419 271         519 return;
3420             } ## end sub do_COLON
3421              
3422             sub do_PLUS_SIGN {
3423              
3424 227     227 0 499 my $self = shift;
3425              
3426             # '+' = what kind of plus?
3427 227 100       943 if ( $expecting == TERM ) {
    100          
3428 13         54 my $number = $self->scan_number_fast();
3429              
3430             # unary plus is safest assumption if not a number
3431 13 50       54 if ( !defined($number) ) { $type = 'p'; }
  13         26  
3432             }
3433             elsif ( $expecting == OPERATOR ) {
3434             }
3435             else {
3436 3 100       12 if ( $next_type eq 'w' ) { $type = 'p' }
  2         6  
3437             }
3438 227         429 return;
3439             } ## end sub do_PLUS_SIGN
3440              
3441             sub do_AT_SIGN {
3442              
3443 438     438 0 1211 my $self = shift;
3444              
3445             # '@' = sigil for array?
3446 438 50       1436 $self->error_if_expecting_OPERATOR("Array")
3447             if ( $expecting == OPERATOR );
3448 438         1616 $self->scan_simple_identifier();
3449 438         835 return;
3450             } ## end sub do_AT_SIGN
3451              
3452             sub do_PERCENT_SIGN {
3453              
3454 202     202 0 670 my $self = shift;
3455              
3456             # '%' = hash or modulo?
3457             # first guess is hash if no following blank or paren
3458 202 50       724 if ( $expecting == UNKNOWN ) {
3459 0 0 0     0 if ( $next_type ne 'b' && $next_type ne '(' ) {
3460 0         0 $expecting = TERM;
3461             }
3462             }
3463 202 100       660 if ( $expecting == TERM ) {
3464 192         710 $self->scan_simple_identifier();
3465             }
3466 202         494 return;
3467             } ## end sub do_PERCENT_SIGN
3468              
3469             sub do_LEFT_SQUARE_BRACKET {
3470              
3471 594     594 0 1257 my $self = shift;
3472              
3473             # '['
3474 594         1392 $rsquare_bracket_type->[ ++$square_bracket_depth ] =
3475             $last_nonblank_token;
3476 594         2025 ( $type_sequence, $indent_flag ) =
3477             $self->increase_nesting_depth( SQUARE_BRACKET,
3478             $rtoken_map->[$i_tok] );
3479              
3480             # It may seem odd, but structural square brackets have
3481             # type '{' and '}'. This simplifies the indentation logic.
3482 594 100       1900 if ( !is_non_structural_brace() ) {
3483 287         673 $type = '{';
3484             }
3485 594         1315 $rsquare_bracket_structural_type->[$square_bracket_depth] = $type;
3486 594         1076 return;
3487             } ## end sub do_LEFT_SQUARE_BRACKET
3488              
3489             sub do_RIGHT_SQUARE_BRACKET {
3490              
3491 594     594 0 1311 my $self = shift;
3492              
3493             # ']'
3494 594         2052 ( $type_sequence, $indent_flag ) =
3495             $self->decrease_nesting_depth( SQUARE_BRACKET,
3496             $rtoken_map->[$i_tok] );
3497              
3498 594 100       2119 if ( $rsquare_bracket_structural_type->[$square_bracket_depth] eq '{' )
3499             {
3500 287         642 $type = '}';
3501             }
3502              
3503             # propagate type information for smartmatch operator. This is
3504             # necessary to enable us to know if an operator or term is expected
3505             # next.
3506 594 100       1693 if ( $rsquare_bracket_type->[$square_bracket_depth] eq '~~' ) {
3507 20         43 $tok = $rsquare_bracket_type->[$square_bracket_depth];
3508             }
3509              
3510 594 50       1573 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
  594         963  
3511 594         1024 return;
3512             } ## end sub do_RIGHT_SQUARE_BRACKET
3513              
3514             sub do_MINUS_SIGN {
3515              
3516 441     441 0 916 my $self = shift;
3517              
3518             # '-' = what kind of minus?
3519 441 100 100     2991 if ( ( $expecting != OPERATOR )
    100          
    100          
3520             && $is_file_test_operator{$next_tok} )
3521             {
3522 10         52 my ( $next_nonblank_token, $i_next ) =
3523             $self->find_next_nonblank_token( $i + 1, $rtokens,
3524             $max_token_index );
3525              
3526             # check for a quoted word like "-w=>xx";
3527             # it is sufficient to just check for a following '='
3528 10 50       81 if ( $next_nonblank_token eq '=' ) {
3529 0         0 $type = 'm';
3530             }
3531             else {
3532 10         28 $i++;
3533 10         26 $tok .= $next_tok;
3534 10         37 $type = 'F';
3535             }
3536             }
3537             elsif ( $expecting == TERM ) {
3538 330         908 my $number = $self->scan_number_fast();
3539              
3540             # maybe part of bareword token? unary is safest
3541 330 100       907 if ( !defined($number) ) { $type = 'm'; }
  288         542  
3542              
3543             }
3544             elsif ( $expecting == OPERATOR ) {
3545             }
3546             else {
3547              
3548 4 50       17 if ( $next_type eq 'w' ) {
3549 4         10 $type = 'm';
3550             }
3551             }
3552 441         791 return;
3553             } ## end sub do_MINUS_SIGN
3554              
3555             sub do_CARAT_SIGN {
3556              
3557 12     12 0 25 my $self = shift;
3558              
3559             # '^'
3560             # check for special variables like ${^WARNING_BITS}
3561 12 100       35 if ( $expecting == TERM ) {
3562              
3563 5 50 33     52 if ( $last_nonblank_token eq '{'
      33        
3564             && ( $next_tok !~ /^\d/ )
3565             && ( $next_tok =~ /^\w/ ) )
3566             {
3567              
3568 5 100       25 if ( $next_tok eq 'W' ) {
3569 1         3 $self->[_saw_perl_dash_w_] = 1;
3570             }
3571 5         13 $tok = $tok . $next_tok;
3572 5         9 $i = $i + 1;
3573 5         12 $type = 'w';
3574              
3575             # Optional coding to try to catch syntax errors. This can
3576             # be removed if it ever causes incorrect warning messages.
3577             # The '{^' should be preceded by either by a type or '$#'
3578             # Examples:
3579             # $#{^CAPTURE} ok
3580             # *${^LAST_FH}{NAME} ok
3581             # @{^HOWDY} ok
3582             # $hash{^HOWDY} error
3583              
3584             # Note that a type sigil '$' may be tokenized as 'Z'
3585             # after something like 'print', so allow type 'Z'
3586 5 0 33     20 if ( $last_last_nonblank_type ne 't'
      33        
3587             && $last_last_nonblank_type ne 'Z'
3588             && $last_last_nonblank_token ne '$#' )
3589             {
3590 0         0 $self->warning("Possible syntax error near '{^'\n");
3591             }
3592             }
3593              
3594             else {
3595 0 0       0 unless ( $self->error_if_expecting_TERM() ) {
3596              
3597             # Something like this is valid but strange:
3598             # undef ^I;
3599 0         0 $self->complain("The '^' seems unusual here\n");
3600             }
3601             }
3602             }
3603 12         24 return;
3604             } ## end sub do_CARAT_SIGN
3605              
3606             sub do_DOUBLE_COLON {
3607              
3608 9     9 0 16 my $self = shift;
3609              
3610             # '::' = probably a sub call
3611 9         28 $self->scan_bare_identifier();
3612 9         17 return;
3613             } ## end sub do_DOUBLE_COLON
3614              
3615             sub do_LEFT_SHIFT {
3616              
3617 7     7 0 27 my $self = shift;
3618              
3619             # '<<' = maybe a here-doc?
3620              
3621             ## This check removed because it could be a deprecated here-doc with
3622             ## no specified target. See example in log 16 Sep 2020.
3623             ## return
3624             ## unless ( $i < $max_token_index )
3625             ## ; # here-doc not possible if end of line
3626              
3627 7 50       38 if ( $expecting != OPERATOR ) {
3628 7         23 my ( $found_target, $here_doc_target, $here_quote_character,
3629             $saw_error );
3630             (
3631 7         55 $found_target, $here_doc_target, $here_quote_character, $i,
3632             $saw_error
3633             )
3634             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3635             $max_token_index );
3636              
3637 7 50       35 if ($found_target) {
    0          
3638 7         17 push @{$rhere_target_list},
  7         27  
3639             [ $here_doc_target, $here_quote_character ];
3640 7         53 $type = 'h';
3641 7 50       87 if ( length($here_doc_target) > 80 ) {
    50          
    100          
3642 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3643 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3644             }
3645             elsif ( !$here_doc_target ) {
3646 0 0       0 $self->warning(
3647             'Use of bare << to mean <<"" is deprecated' . "\n" )
3648             unless ($here_quote_character);
3649             }
3650             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3651 2         12 $self->complain(
3652             "Unconventional here-target: '$here_doc_target'\n");
3653             }
3654             }
3655             elsif ( $expecting == TERM ) {
3656 0 0       0 unless ($saw_error) {
3657              
3658             # shouldn't happen..arriving here implies an error in
3659             # the logic in sub 'find_here_doc'
3660 0         0 if (DEVEL_MODE) {
3661             $self->Fault(<<EOM);
3662             Program bug; didn't find here doc target
3663             EOM
3664             }
3665             $self->warning(
3666 0         0 "Possible program error: didn't find here doc target\n"
3667             );
3668 0         0 $self->report_definite_bug();
3669             }
3670             }
3671             }
3672             else {
3673             }
3674 7         24 return;
3675             } ## end sub do_LEFT_SHIFT
3676              
3677             sub do_NEW_HERE_DOC {
3678              
3679             # '<<~' = a here-doc, new type added in v26
3680              
3681 2     2 0 9 my $self = shift;
3682              
3683             return
3684 2 50       12 unless ( $i < $max_token_index )
3685             ; # here-doc not possible if end of line
3686 2 50       11 if ( $expecting != OPERATOR ) {
3687 2         14 my ( $found_target, $here_doc_target, $here_quote_character,
3688             $saw_error );
3689             (
3690 2         12 $found_target, $here_doc_target, $here_quote_character, $i,
3691             $saw_error
3692             )
3693             = $self->find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3694             $max_token_index );
3695              
3696 2 50       8 if ($found_target) {
    0          
3697              
3698 2 50       19 if ( length($here_doc_target) > 80 ) {
    50          
3699 0         0 my $truncated = substr( $here_doc_target, 0, 80 );
3700 0         0 $self->complain("Long here-target: '$truncated' ...\n");
3701             }
3702             elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3703 0         0 $self->complain(
3704             "Unconventional here-target: '$here_doc_target'\n");
3705             }
3706              
3707             # Note that we put a leading space on the here quote
3708             # character indicate that it may be preceded by spaces
3709 2         6 $here_quote_character = SPACE . $here_quote_character;
3710 2         6 push @{$rhere_target_list},
  2         7  
3711             [ $here_doc_target, $here_quote_character ];
3712 2         7 $type = 'h';
3713             }
3714             elsif ( $expecting == TERM ) {
3715 0 0       0 unless ($saw_error) {
3716              
3717             # shouldn't happen..arriving here implies an error in
3718             # the logic in sub 'find_here_doc'
3719 0         0 if (DEVEL_MODE) {
3720             $self->Fault(<<EOM);
3721             Program bug; didn't find here doc target
3722             EOM
3723             }
3724             $self->warning(
3725 0         0 "Possible program error: didn't find here doc target\n"
3726             );
3727 0         0 $self->report_definite_bug();
3728             }
3729             }
3730             }
3731             else {
3732 0         0 $self->error_if_expecting_OPERATOR();
3733             }
3734 2         6 return;
3735             } ## end sub do_NEW_HERE_DOC
3736              
3737             sub do_POINTER {
3738              
3739             # '->'
3740 886     886 0 1594 return;
3741             }
3742              
3743             sub do_PLUS_PLUS {
3744              
3745 46     46 0 156 my $self = shift;
3746              
3747             # '++'
3748             # type = 'pp' for pre-increment, '++' for post-increment
3749 46 100       260 if ( $expecting == TERM ) { $type = 'pp' }
  7 100       18  
3750             elsif ( $expecting == UNKNOWN ) {
3751              
3752 2         9 my ( $next_nonblank_token, $i_next ) =
3753             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3754              
3755             # Fix for c042: look past a side comment
3756 2 50       11 if ( $next_nonblank_token eq '#' ) {
3757 0         0 ( $next_nonblank_token, $i_next ) =
3758             $self->find_next_nonblank_token( $max_token_index,
3759             $rtokens, $max_token_index );
3760             }
3761              
3762 2 50       8 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
  0         0  
3763             }
3764 46         137 return;
3765             } ## end sub do_PLUS_PLUS
3766              
3767             sub do_FAT_COMMA {
3768              
3769 1025     1025 0 1857 my $self = shift;
3770              
3771             # '=>'
3772 1025 50       2555 if ( $last_nonblank_type eq $tok ) {
3773 0         0 $self->complain("Repeated '=>'s \n");
3774             }
3775              
3776             # patch for operator_expected: note if we are in the list (use.t)
3777             # TODO: make version numbers a new token type
3778 1025 100       2448 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
  18         45  
3779 1025         1655 return;
3780             } ## end sub do_FAT_COMMA
3781              
3782             sub do_MINUS_MINUS {
3783              
3784 2     2 0 8 my $self = shift;
3785              
3786             # '--'
3787             # type = 'mm' for pre-decrement, '--' for post-decrement
3788              
3789 2 50       8 if ( $expecting == TERM ) { $type = 'mm' }
  2 0       9  
3790             elsif ( $expecting == UNKNOWN ) {
3791 0         0 my ( $next_nonblank_token, $i_next ) =
3792             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
3793              
3794             # Fix for c042: look past a side comment
3795 0 0       0 if ( $next_nonblank_token eq '#' ) {
3796 0         0 ( $next_nonblank_token, $i_next ) =
3797             $self->find_next_nonblank_token( $max_token_index,
3798             $rtokens, $max_token_index );
3799             }
3800              
3801 0 0       0 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
  0         0  
3802             }
3803 2         7 return;
3804             } ## end sub do_MINUS_MINUS
3805              
3806             sub do_LOGICAL_AND {
3807              
3808 58     58 0 137 my $self = shift;
3809              
3810             # '&&'
3811 58 50 33     236 $self->error_if_expecting_TERM()
3812             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3813 58         113 return;
3814             } ## end sub do_LOGICAL_AND
3815              
3816             sub do_LOGICAL_OR {
3817              
3818 74     74 0 198 my $self = shift;
3819              
3820             # '||'
3821 74 100 66     339 $self->error_if_expecting_TERM()
3822             if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3823 74         149 return;
3824             } ## end sub do_LOGICAL_OR
3825              
3826             sub do_SLASH_SLASH {
3827              
3828 10     10 0 22 my $self = shift;
3829              
3830             # '//'
3831 10 100       35 $self->error_if_expecting_TERM()
3832             if ( $expecting == TERM );
3833 10         17 return;
3834             } ## end sub do_SLASH_SLASH
3835              
3836             sub do_DIGITS {
3837              
3838 1929     1929 0 3241 my $self = shift;
3839              
3840             # 'd' = string of digits
3841 1929 50       4362 $self->error_if_expecting_OPERATOR("Number")
3842             if ( $expecting == OPERATOR );
3843              
3844 1929         4889 my $number = $self->scan_number_fast();
3845 1929 50       4792 if ( !defined($number) ) {
3846              
3847             # shouldn't happen - we should always get a number
3848 0         0 if (DEVEL_MODE) {
3849             $self->Fault(<<EOM);
3850             non-number beginning with digit--program bug
3851             EOM
3852             }
3853             $self->warning(
3854 0         0 "Unexpected error condition: non-number beginning with digit\n"
3855             );
3856 0         0 $self->report_definite_bug();
3857             }
3858 1929         4636 return;
3859             } ## end sub do_DIGITS
3860              
3861             sub do_ATTRIBUTE_LIST {
3862              
3863 39     39 0 109 my ( $self, $next_nonblank_token ) = @_;
3864              
3865             # Called at a bareword encountered while in an attribute list
3866             # returns 'is_attribute':
3867             # true if attribute found
3868             # false if an attribute (continue parsing bareword)
3869              
3870             # treat bare word followed by open paren like qw(
3871 39 100       130 if ( $next_nonblank_token eq '(' ) {
3872              
3873             # For something like:
3874             # : prototype($$)
3875             # we should let do_scan_sub see it so that it can see
3876             # the prototype. All other attributes get parsed as a
3877             # quoted string.
3878 18 100       62 if ( $tok eq 'prototype' ) {
3879 2         7 $id_scan_state = 'prototype';
3880              
3881             # start just after the word 'prototype'
3882 2         5 my $i_beg = $i + 1;
3883 2         21 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
3884             {
3885             input_line => $input_line,
3886             i => $i,
3887             i_beg => $i_beg,
3888             tok => $tok,
3889             type => $type,
3890             rtokens => $rtokens,
3891             rtoken_map => $rtoken_map,
3892             id_scan_state => $id_scan_state,
3893             max_token_index => $max_token_index,
3894             }
3895             );
3896              
3897             # If successful, mark as type 'q' to be consistent
3898             # with other attributes. Type 'w' would also work.
3899 2 50       20 if ( $i > $i_beg ) {
3900 2         5 $type = 'q';
3901 2         8 return 1;
3902             }
3903              
3904             # If not successful, continue and parse as a quote.
3905             }
3906              
3907             # All other attribute lists must be parsed as quotes
3908             # (see 'signatures.t' for good examples)
3909 16         50 $in_quote = $quote_items{'q'};
3910 16         43 $allowed_quote_modifiers = $quote_modifiers{'q'};
3911 16         32 $type = 'q';
3912 16         31 $quote_type = 'q';
3913 16         36 return 1;
3914             }
3915              
3916             # handle bareword not followed by open paren
3917             else {
3918 21         44 $type = 'w';
3919 21         52 return 1;
3920             }
3921              
3922             # attribute not found
3923 0         0 return;
3924             } ## end sub do_ATTRIBUTE_LIST
3925              
3926             sub do_QUOTED_BAREWORD {
3927              
3928 786     786 0 1424 my $self = shift;
3929              
3930             # find type of a bareword followed by a '=>'
3931 786 100       4006 if ( $ris_constant->{$current_package}{$tok} ) {
    50          
    50          
3932 14         34 $type = 'C';
3933             }
3934             elsif ( $ris_user_function->{$current_package}{$tok} ) {
3935 0         0 $type = 'U';
3936 0         0 $prototype = $ruser_function_prototype->{$current_package}{$tok};
3937             }
3938             elsif ( $tok =~ /^v\d+$/ ) {
3939 0         0 $type = 'v';
3940 0         0 $self->report_v_string($tok);
3941             }
3942             else {
3943              
3944             # Bareword followed by a fat comma - see 'git18.in'
3945             # If tok is something like 'x17' then it could
3946             # actually be operator x followed by number 17.
3947             # For example, here:
3948             # 123x17 => [ 792, 1224 ],
3949             # (a key of 123 repeated 17 times, perhaps not
3950             # what was intended). We will mark x17 as type
3951             # 'n' and it will be split. If the previous token
3952             # was also a bareword then it is not very clear is
3953             # going on. In this case we will not be sure that
3954             # an operator is expected, so we just mark it as a
3955             # bareword. Perl is a little murky in what it does
3956             # with stuff like this, and its behavior can change
3957             # over time. Something like
3958             # a x18 => [792, 1224], will compile as
3959             # a key with 18 a's. But something like
3960             # push @array, a x18;
3961             # is a syntax error.
3962 772 100 66     2739 if (
      33        
      66        
3963             $expecting == OPERATOR
3964             && substr( $tok, 0, 1 ) eq 'x'
3965             && ( length($tok) == 1
3966             || substr( $tok, 1, 1 ) =~ /^\d/ )
3967             )
3968             {
3969 3         5 $type = 'n';
3970 3 50       12 if ( $self->split_pretoken(1) ) {
3971 3         5 $type = 'x';
3972 3         7 $tok = 'x';
3973             }
3974             }
3975             else {
3976              
3977             # git #18
3978 769         1331 $type = 'w';
3979 769         1981 $self->error_if_expecting_OPERATOR();
3980             }
3981             }
3982 786         1257 return;
3983             } ## end sub do_QUOTED_BAREWORD
3984              
3985             sub do_X_OPERATOR {
3986              
3987 17     17 0 46 my $self = shift;
3988              
3989 17 100       64 if ( $tok eq 'x' ) {
3990 15 50       80 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3991 0         0 $tok = 'x=';
3992 0         0 $type = $tok;
3993 0         0 $i++;
3994             }
3995             else {
3996 15         39 $type = 'x';
3997             }
3998             }
3999             else {
4000              
4001             # Split a pretoken like 'x10' into 'x' and '10'.
4002             # Note: In previous versions of perltidy it was marked
4003             # as a number, $type = 'n', and fixed downstream by the
4004             # Formatter.
4005 2         6 $type = 'n';
4006 2 50       6 if ( $self->split_pretoken(1) ) {
4007 2         5 $type = 'x';
4008 2         4 $tok = 'x';
4009             }
4010             }
4011 17         32 return;
4012             } ## end sub do_X_OPERATOR
4013              
4014             sub do_USE_CONSTANT {
4015              
4016 16     16 0 36 my $self = shift;
4017              
4018 16         61 $self->scan_bare_identifier();
4019 16         92 my ( $next_nonblank_tok2, $i_next2 ) =
4020             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4021              
4022 16 50       74 if ($next_nonblank_tok2) {
4023              
4024 16 100       98 if ( $is_keyword{$next_nonblank_tok2} ) {
4025              
4026             # Assume qw is used as a quote and okay, as in:
4027             # use constant qw{ DEBUG 0 };
4028             # Not worth trying to parse for just a warning
4029              
4030             # NOTE: This warning is deactivated because recent
4031             # versions of perl do not complain here, but
4032             # the coding is retained for reference.
4033 1         2 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
4034             $self->warning(
4035             "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
4036             );
4037             }
4038             }
4039              
4040             else {
4041 15         54 $ris_constant->{$current_package}{$next_nonblank_tok2} = 1;
4042             }
4043             }
4044 16         65 return;
4045             } ## end sub do_USE_CONSTANT
4046              
4047             sub do_KEYWORD {
4048              
4049 2636     2636 0 4855 my $self = shift;
4050              
4051             # found a keyword - set any associated flags
4052 2636         4500 $type = 'k';
4053              
4054             # Since for and foreach may not be followed immediately
4055             # by an opening paren, we have to remember which keyword
4056             # is associated with the next '('
4057 2636 100 100     20838 if ( $is_for_foreach{$tok} ) {
    100 66        
    100          
    100          
    100          
    100          
    100          
4058 74 100       328 if ( new_statement_ok() ) {
4059 72         243 $want_paren = $tok;
4060             }
4061             }
4062              
4063             # recognize 'use' statements, which are special
4064             elsif ( $is_use_require{$tok} ) {
4065 175         419 $statement_type = $tok;
4066 175 50       529 $self->error_if_expecting_OPERATOR()
4067             if ( $expecting == OPERATOR );
4068             }
4069              
4070             # remember my and our to check for trailing ": shared"
4071             elsif ( $is_my_our_state{$tok} ) {
4072 628         1404 $statement_type = $tok;
4073             }
4074              
4075             # Check for misplaced 'elsif' and 'else', but allow isolated
4076             # else or elsif blocks to be formatted. This is indicated
4077             # by a last noblank token of ';'
4078             elsif ( $tok eq 'elsif' ) {
4079 27 50 66     211 if (
4080             $last_nonblank_token ne ';'
4081              
4082             ## !~ /^(if|elsif|unless)$/
4083             && !$is_if_elsif_unless{$last_nonblank_block_type}
4084             )
4085             {
4086 0         0 $self->warning(
4087             "expecting '$tok' to follow one of 'if|elsif|unless'\n");
4088             }
4089             }
4090             elsif ( $tok eq 'else' ) {
4091              
4092             # patched for SWITCH/CASE
4093 44 50 66     535 if (
      66        
4094             $last_nonblank_token ne ';'
4095              
4096             ## !~ /^(if|elsif|unless|case|when)$/
4097             && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
4098              
4099             # patch to avoid an unwanted error message for
4100             # the case of a parenless 'case' (RT 105484):
4101             # switch ( 1 ) { case x { 2 } else { } }
4102             ## !~ /^(if|elsif|unless|case|when)$/
4103             && !$is_if_elsif_unless_case_when{$statement_type}
4104             )
4105             {
4106 0         0 $self->warning(
4107             "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
4108             );
4109             }
4110             }
4111              
4112             # patch for SWITCH/CASE if 'case' and 'when are
4113             # treated as keywords. Also 'default' for Switch::Plain
4114             elsif ($tok eq 'when'
4115             || $tok eq 'case'
4116             || $tok eq 'default' )
4117             {
4118 56         129 $statement_type = $tok; # next '{' is block
4119             }
4120              
4121             # feature 'err' was removed in Perl 5.10. So mark this as
4122             # a bareword unless an operator is expected (see c158).
4123             elsif ( $tok eq 'err' ) {
4124 1 50       9 if ( $expecting != OPERATOR ) { $type = 'w' }
  1         2  
4125             }
4126              
4127 2636         4711 return;
4128             } ## end sub do_KEYWORD
4129              
4130             sub do_QUOTE_OPERATOR {
4131              
4132 202     202 0 457 my $self = shift;
4133              
4134 202 50       638 if ( $expecting == OPERATOR ) {
4135              
4136             # Be careful not to call an error for a qw quote
4137             # where a parenthesized list is allowed. For example,
4138             # it could also be a for/foreach construct such as
4139             #
4140             # foreach my $key qw\Uno Due Tres Quadro\ {
4141             # print "Set $key\n";
4142             # }
4143             #
4144              
4145             # Or it could be a function call.
4146             # NOTE: Braces in something like &{ xxx } are not
4147             # marked as a block, we might have a method call.
4148             # &method(...), $method->(..), &{method}(...),
4149             # $ref[2](list) is ok & short for $ref[2]->(list)
4150             #
4151             # See notes in 'sub code_block_type' and
4152             # 'sub is_non_structural_brace'
4153              
4154 0 0 0     0 unless (
      0        
4155             $tok eq 'qw'
4156             && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
4157             || $is_for_foreach{$want_paren} )
4158             )
4159             {
4160 0         0 $self->error_if_expecting_OPERATOR();
4161             }
4162             }
4163 202         556 $in_quote = $quote_items{$tok};
4164 202         554 $allowed_quote_modifiers = $quote_modifiers{$tok};
4165              
4166             # All quote types are 'Q' except possibly qw quotes.
4167             # qw quotes are special in that they may generally be trimmed
4168             # of leading and trailing whitespace. So they are given a
4169             # separate type, 'q', unless requested otherwise.
4170 202 100 66     987 $type =
4171             ( $tok eq 'qw' && $self->[_trim_qw_] )
4172             ? 'q'
4173             : 'Q';
4174 202         380 $quote_type = $type;
4175 202         374 return;
4176             } ## end sub do_QUOTE_OPERATOR
4177              
4178             sub do_UNKNOWN_BAREWORD {
4179              
4180 957     957 0 3120 my ( $self, $next_nonblank_token ) = @_;
4181              
4182 957         3149 $self->scan_bare_identifier();
4183              
4184 957 100 100     3494 if ( $statement_type eq 'use'
4185             && $last_nonblank_token eq 'use' )
4186             {
4187 108         422 $rsaw_use_module->{$current_package}->{$tok} = 1;
4188             }
4189              
4190 957 100       2602 if ( $type eq 'w' ) {
4191              
4192 932 50       2428 if ( $expecting == OPERATOR ) {
4193              
4194             # Patch to avoid error message for RPerl overloaded
4195             # operator functions: use overload
4196             # '+' => \&sse_add,
4197             # '-' => \&sse_sub,
4198             # '*' => \&sse_mul,
4199             # '/' => \&sse_div;
4200             # TODO: this could eventually be generalized
4201 0 0 0     0 if ( $rsaw_use_module->{$current_package}->{'RPerl'}
    0          
    0          
4202             && $tok =~ /^sse_(mul|div|add|sub)$/ )
4203             {
4204              
4205             }
4206              
4207             # Fix part 1 for git #63 in which a comment falls
4208             # between an -> and the following word. An
4209             # alternate fix would be to change operator_expected
4210             # to return an UNKNOWN for this type.
4211             elsif ( $last_nonblank_type eq '->' ) {
4212              
4213             }
4214              
4215             # don't complain about possible indirect object
4216             # notation.
4217             # For example:
4218             # package main;
4219             # sub new($) { ... }
4220             # $b = new A::; # calls A::new
4221             # $c = new A; # same thing but suspicious
4222             # This will call A::new but we have a 'new' in
4223             # main:: which looks like a constant.
4224             #
4225             elsif ( $last_nonblank_type eq 'C' ) {
4226 0 0       0 if ( $tok !~ /::$/ ) {
4227 0         0 $self->complain(<<EOM);
4228             Expecting operator after '$last_nonblank_token' but found bare word '$tok'
4229             Maybe indirectet object notation?
4230             EOM
4231             }
4232             }
4233             else {
4234 0         0 $self->error_if_expecting_OPERATOR("bareword");
4235             }
4236             }
4237              
4238             # mark bare words immediately followed by a paren as
4239             # functions
4240 932         2242 $next_tok = $rtokens->[ $i + 1 ];
4241 932 100       2535 if ( $next_tok eq '(' ) {
4242              
4243             # Patch for issue c151, where we are processing a snippet and
4244             # have not seen that SPACE is a constant. In this case 'x' is
4245             # probably an operator. The only disadvantage with an incorrect
4246             # guess is that the space after it may be incorrect. For example
4247             # $str .= SPACE x ( 16 - length($str) ); See also b1410.
4248 276 50 33     1583 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
  0 50       0  
4249              
4250             # Fix part 2 for git #63. Leave type as 'w' to keep
4251             # the type the same as if the -> were not separated
4252 276         567 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
4253              
4254             }
4255              
4256             # underscore after file test operator is file handle
4257 932 50 66     2757 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4258 0         0 $type = 'Z';
4259             }
4260              
4261             # patch for SWITCH/CASE if 'case' and 'when are
4262             # not treated as keywords:
4263 932 50 33     4691 if (
      33        
      33        
4264             ( $tok eq 'case' && $rbrace_type->[$brace_depth] eq 'switch' )
4265             || ( $tok eq 'when'
4266             && $rbrace_type->[$brace_depth] eq 'given' )
4267             )
4268             {
4269 0         0 $statement_type = $tok; # next '{' is block
4270 0         0 $type = 'k'; # for keyword syntax coloring
4271             }
4272 932 100       2334 if ( $next_nonblank_token eq '(' ) {
4273              
4274             # patch for SWITCH/CASE if switch and given not keywords
4275             # Switch is not a perl 5 keyword, but we will gamble
4276             # and mark switch followed by paren as a keyword. This
4277             # is only necessary to get html syntax coloring nice,
4278             # and does not commit this as being a switch/case.
4279 241 50 33     1941 if ( $tok eq 'switch' || $tok eq 'given' ) {
    50 33        
4280 0         0 $type = 'k'; # for keyword syntax coloring
4281             }
4282              
4283             # mark 'x' as operator for something like this (see b1410)
4284             # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
4285             elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
4286 0         0 $type = 'x';
4287             }
4288             }
4289             }
4290 957         1764 return;
4291             } ## end sub do_UNKNOWN_BAREWORD
4292              
4293             sub sub_attribute_ok_here {
4294              
4295 35     35 0 163 my ( $self, $tok_kw, $next_nonblank_token, $i_next ) = @_;
4296              
4297             # Decide if 'sub :' can be the start of a sub attribute list.
4298             # We will decide based on if the colon is followed by a
4299             # bareword which is not a keyword.
4300             # Changed inext+1 to inext to fixed case b1190.
4301 35         65 my $sub_attribute_ok_here;
4302 35 50 66     161 if ( $is_sub{$tok_kw}
      66        
4303             && $expecting != OPERATOR
4304             && $next_nonblank_token eq ':' )
4305             {
4306 3         12 my ( $nn_nonblank_token, $i_nn ) =
4307             $self->find_next_nonblank_token( $i_next, $rtokens,
4308             $max_token_index );
4309             $sub_attribute_ok_here =
4310             $nn_nonblank_token =~ /^\w/
4311             && $nn_nonblank_token !~ /^\d/
4312 3   66     29 && !$is_keyword{$nn_nonblank_token};
4313             }
4314 35         300 return $sub_attribute_ok_here;
4315             } ## end sub sub_attribute_ok_here
4316              
4317             sub do_BAREWORD {
4318              
4319 5826     5826 0 11547 my ( $self, $is_END_or_DATA ) = @_;
4320              
4321             # handle a bareword token:
4322             # returns
4323             # true if this token ends the current line
4324             # false otherwise
4325              
4326 5826         16287 my ( $next_nonblank_token, $i_next ) =
4327             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
4328              
4329             # a bare word immediately followed by :: is not a keyword;
4330             # use $tok_kw when testing for keywords to avoid a mistake
4331 5826         11407 my $tok_kw = $tok;
4332 5826 100 100     16815 if ( $rtokens->[ $i + 1 ] eq ':'
4333             && $rtokens->[ $i + 2 ] eq ':' )
4334             {
4335 266         613 $tok_kw .= '::';
4336             }
4337              
4338 5826 100       13224 if ( $self->[_in_attribute_list_] ) {
4339 39         187 my $is_attribute = $self->do_ATTRIBUTE_LIST($next_nonblank_token);
4340 39 50       119 return if ($is_attribute);
4341             }
4342              
4343             #----------------------------------------
4344             # Starting final if-elsif- chain of tests
4345             #----------------------------------------
4346              
4347             # This is the return flag:
4348             # true => this is the last token on the line
4349             # false => keep tokenizing the line
4350 5787         8778 my $is_last;
4351              
4352             # The following blocks of code must update these vars:
4353             # $type - the final token type, must always be set
4354              
4355             # In addition, if additional pretokens are added:
4356             # $tok - the final token
4357             # $i - the index of the last pretoken
4358              
4359             # They may also need to check and set various flags
4360              
4361             # Scan a bare word following a -> as an identifier; it could
4362             # have a long package name. Fixes c037, c041.
4363 5787 100 100     93750 if ( $last_nonblank_token eq '->' ) {
    100 100        
    100 66        
    100 66        
    100 100        
    100 66        
    100 66        
    50 100        
    50 100        
    100 33        
    100 0        
    100 0        
    100 33        
    100 0        
    100 0        
    100 66        
    100 100        
    100 100        
      100        
      66        
      66        
      66        
4364 669         2362 $self->scan_bare_identifier();
4365              
4366             # a bareward after '->' gets type 'i'
4367 669         1240 $type = 'i';
4368             }
4369              
4370             # Quote a word followed by => operator
4371             # unless the word __END__ or __DATA__ and the only word on
4372             # the line.
4373             elsif ( !$is_END_or_DATA
4374             && $next_nonblank_token eq '='
4375             && $rtokens->[ $i_next + 1 ] eq '>' )
4376             {
4377 786         2366 $self->do_QUOTED_BAREWORD();
4378             }
4379              
4380             # quote a bare word within braces..like xxx->{s}; note that we
4381             # must be sure this is not a structural brace, to avoid
4382             # mistaking {s} in the following for a quoted bare word:
4383             # for(@[){s}bla}BLA}
4384             # Also treat q in something like var{-q} as a bare word, not
4385             # a quote operator
4386             elsif (
4387             $next_nonblank_token eq '}'
4388             && (
4389             $last_nonblank_type eq 'L'
4390             || ( $last_nonblank_type eq 'm'
4391             && $last_last_nonblank_type eq 'L' )
4392             )
4393             )
4394             {
4395 100         242 $type = 'w';
4396             }
4397              
4398             # handle operator x (now we know it isn't $x=)
4399             elsif (
4400             $expecting == OPERATOR
4401             && substr( $tok, 0, 1 ) eq 'x'
4402             && ( length($tok) == 1
4403             || substr( $tok, 1, 1 ) =~ /^\d/ )
4404             )
4405             {
4406 17         108 $self->do_X_OPERATOR();
4407             }
4408             elsif ( $tok_kw eq 'CORE::' ) {
4409 3         25 $type = $tok = $tok_kw;
4410 3         6 $i += 2;
4411             }
4412             elsif ( ( $tok eq 'strict' )
4413             and ( $last_nonblank_token eq 'use' ) )
4414             {
4415 14         33 $self->[_saw_use_strict_] = 1;
4416 14         73 $self->scan_bare_identifier();
4417             }
4418              
4419             elsif ( ( $tok eq 'warnings' )
4420             and ( $last_nonblank_token eq 'use' ) )
4421             {
4422 7         24 $self->[_saw_perl_dash_w_] = 1;
4423              
4424             # scan as identifier, so that we pick up something like:
4425             # use warnings::register
4426 7         27 $self->scan_bare_identifier();
4427             }
4428              
4429             elsif (
4430             $tok eq 'AutoLoader'
4431             && $self->[_look_for_autoloader_]
4432             && (
4433             $last_nonblank_token eq 'use'
4434              
4435             # these regexes are from AutoSplit.pm, which we want
4436             # to mimic
4437             || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4438             || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4439             )
4440             )
4441             {
4442 0         0 $self->write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4443 0         0 $self->[_saw_autoloader_] = 1;
4444 0         0 $self->[_look_for_autoloader_] = 0;
4445 0         0 $self->scan_bare_identifier();
4446             }
4447              
4448             elsif (
4449             $tok eq 'SelfLoader'
4450             && $self->[_look_for_selfloader_]
4451             && ( $last_nonblank_token eq 'use'
4452             || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4453             || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4454             )
4455             {
4456 0         0 $self->write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4457 0         0 $self->[_saw_selfloader_] = 1;
4458 0         0 $self->[_look_for_selfloader_] = 0;
4459 0         0 $self->scan_bare_identifier();
4460             }
4461              
4462             elsif ( ( $tok eq 'constant' )
4463             and ( $last_nonblank_token eq 'use' ) )
4464             {
4465 16         73 $self->do_USE_CONSTANT();
4466             }
4467              
4468             # various quote operators
4469             elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4470 202         910 $self->do_QUOTE_OPERATOR();
4471             }
4472              
4473             # check for a statement label
4474             elsif (
4475             ( $next_nonblank_token eq ':' )
4476             && ( $rtokens->[ $i_next + 1 ] ne ':' )
4477             && ( $i_next <= $max_token_index ) # colon on same line
4478              
4479             # like 'sub : lvalue' ?
4480             && !$self->sub_attribute_ok_here( $tok_kw, $next_nonblank_token,
4481             $i_next )
4482             && label_ok()
4483             )
4484             {
4485 33 100       205 if ( $tok !~ /[A-Z]/ ) {
4486 15         35 push @{ $self->[_rlower_case_labels_at_] }, $input_line_number;
  15         52  
4487             }
4488 33         82 $type = 'J';
4489 33         87 $tok .= ':';
4490 33         67 $i = $i_next;
4491             }
4492              
4493             # 'sub' or other sub alias
4494             elsif ( $is_sub{$tok_kw} ) {
4495              
4496             # Update for --use-feature=class (rt145706):
4497             # We have to be extra careful to avoid misparsing other uses of
4498             # 'method' in older scripts.
4499 301 100       1105 if ( $tok_kw eq 'method' ) {
4500 12 100 66     132 if ( $expecting == OPERATOR
      100        
4501             || $next_nonblank_token !~ /^(\w|\:)/
4502             || !$self->method_ok_here() )
4503             {
4504 7         24 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4505             }
4506             else {
4507 5         23 initialize_subname();
4508 5         20 $self->scan_id();
4509             }
4510             }
4511             else {
4512 289 50       886 $self->error_if_expecting_OPERATOR()
4513             if ( $expecting == OPERATOR );
4514 289         1067 initialize_subname();
4515 289         1111 $self->scan_id();
4516             }
4517             }
4518              
4519             # 'package'
4520             elsif ( $is_package{$tok_kw} ) {
4521              
4522             # Update for --use-feature=class (rt145706):
4523             # We have to be extra careful because 'class' may be used for other
4524             # purposes on older code; i.e.
4525             # class($x) - valid sub call
4526             # package($x) - error
4527 30 100       101 if ( $tok_kw eq 'class' ) {
4528 8 100 66     75 if ( $expecting == OPERATOR
      100        
4529             || $next_nonblank_token !~ /^(\w|\:)/
4530             || !$self->class_ok_here() )
4531             {
4532 4         13 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4533             }
4534 4         15 else { $self->scan_id() }
4535             }
4536             else {
4537 22 50       63 $self->error_if_expecting_OPERATOR()
4538             if ( $expecting == OPERATOR );
4539 22         79 $self->scan_id();
4540             }
4541             }
4542              
4543             # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4544             # more restrictive. Require a new statement to be ok here.
4545             elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4546 1         3 $type = ';'; # make tokenizer look for TERM next
4547 1         3 $self->[_in_format_] = 1;
4548 1         2 $is_last = 1; ## is last token on this line
4549             }
4550              
4551             # Note on token types for format, __DATA__, __END__:
4552             # It simplifies things to give these type ';', so that when we
4553             # start rescanning we will be expecting a token of type TERM.
4554             # We will switch to type 'k' before outputting the tokens.
4555             elsif ( $is_END_DATA{$tok_kw} ) {
4556 7         16 $type = ';'; # make tokenizer look for TERM next
4557              
4558             # Remember that we are in one of these three sections
4559 7         21 $self->[ $is_END_DATA{$tok_kw} ] = 1;
4560 7         14 $is_last = 1; ## is last token on this line
4561             }
4562              
4563             elsif ( $is_keyword{$tok_kw} ) {
4564 2636         7923 $self->do_KEYWORD();
4565             }
4566              
4567             # check for inline label following
4568             # /^(redo|last|next|goto)$/
4569             elsif (( $last_nonblank_type eq 'k' )
4570             && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4571             {
4572 19         49 $type = 'j';
4573             }
4574              
4575             # something else --
4576             else {
4577 946         3129 $self->do_UNKNOWN_BAREWORD($next_nonblank_token);
4578             }
4579              
4580 5787         11267 return $is_last;
4581              
4582             } ## end sub do_BAREWORD
4583              
4584             sub do_FOLLOW_QUOTE {
4585              
4586 2763     2763 0 4475 my $self = shift;
4587              
4588             # Continue following a quote on a new line
4589 2763         4432 $type = $quote_type;
4590              
4591 2763 100       4041 unless ( @{$routput_token_list} ) { # initialize if continuation line
  2763         6809  
4592 184         470 push( @{$routput_token_list}, $i );
  184         431  
4593 184         436 $routput_token_type->[$i] = $type;
4594              
4595             }
4596              
4597             # scan for the end of the quote or pattern
4598             (
4599 2763         8883 $i,
4600             $in_quote,
4601             $quote_character,
4602             $quote_pos,
4603             $quote_depth,
4604             $quoted_string_1,
4605             $quoted_string_2,
4606              
4607             ) = $self->do_quote(
4608              
4609             $i,
4610             $in_quote,
4611             $quote_character,
4612             $quote_pos,
4613             $quote_depth,
4614             $quoted_string_1,
4615             $quoted_string_2,
4616             $rtokens,
4617             $rtoken_map,
4618             $max_token_index,
4619              
4620             );
4621              
4622             # all done if we didn't find it
4623 2763 100       6953 if ($in_quote) { return }
  183         367  
4624              
4625             # save pattern and replacement text for rescanning
4626 2580         4349 my $qs1 = $quoted_string_1;
4627              
4628             # re-initialize for next search
4629 2580         4075 $quote_character = EMPTY_STRING;
4630 2580         3820 $quote_pos = 0;
4631 2580         3844 $quote_type = 'Q';
4632 2580         3818 $quoted_string_1 = EMPTY_STRING;
4633 2580         3798 $quoted_string_2 = EMPTY_STRING;
4634 2580 100       5651 if ( ++$i > $max_token_index ) { return }
  116         320  
4635              
4636             # look for any modifiers
4637 2464 100       5334 if ($allowed_quote_modifiers) {
4638              
4639             # check for exact quote modifiers
4640 144 100       740 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4641 30         89 my $str = $rtokens->[$i];
4642 30         59 my $saw_modifier_e;
4643 30         501 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4644 47         118 my $pos = pos($str);
4645 47         111 my $char = substr( $str, $pos - 1, 1 );
4646 47   66     307 $saw_modifier_e ||= ( $char eq 'e' );
4647             }
4648              
4649             # For an 'e' quote modifier we must scan the replacement
4650             # text for here-doc targets...
4651             # but if the modifier starts a new line we can skip
4652             # this because either the here doc will be fully
4653             # contained in the replacement text (so we can
4654             # ignore it) or Perl will not find it.
4655             # See test 'here2.in'.
4656 30 50 66     161 if ( $saw_modifier_e && $i_tok >= 0 ) {
4657              
4658 0         0 my $rht = $self->scan_replacement_text($qs1);
4659              
4660             # Change type from 'Q' to 'h' for quotes with
4661             # here-doc targets so that the formatter (see sub
4662             # process_line_of_CODE) will not make any line
4663             # breaks after this point.
4664 0 0       0 if ($rht) {
4665 0         0 push @{$rhere_target_list}, @{$rht};
  0         0  
  0         0  
4666 0         0 $type = 'h';
4667 0 0       0 if ( $i_tok < 0 ) {
4668 0         0 my $ilast = $routput_token_list->[-1];
4669 0         0 $routput_token_type->[$ilast] = $type;
4670             }
4671             }
4672             }
4673              
4674 30 50       96 if ( defined( pos($str) ) ) {
4675              
4676             # matched
4677 30 50       100 if ( pos($str) == length($str) ) {
4678 30 50       128 if ( ++$i > $max_token_index ) { return }
  0         0  
4679             }
4680              
4681             # Looks like a joined quote modifier
4682             # and keyword, maybe something like
4683             # s/xxx/yyy/gefor @k=...
4684             # Example is "galgen.pl". Would have to split
4685             # the word and insert a new token in the
4686             # pre-token list. This is so rare that I haven't
4687             # done it. Will just issue a warning citation.
4688              
4689             # This error might also be triggered if my quote
4690             # modifier characters are incomplete
4691             else {
4692 0         0 $self->warning(<<EOM);
4693              
4694             Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4695             Please put a space between quote modifiers and trailing keywords.
4696             EOM
4697              
4698             # print "token $rtokens->[$i]\n";
4699             # my $num = length($str) - pos($str);
4700             # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4701             # print "continuing with new token $rtokens->[$i]\n";
4702              
4703             # skipping past this token does least damage
4704 0 0       0 if ( ++$i > $max_token_index ) { return }
  0         0  
4705             }
4706             }
4707             else {
4708              
4709             # example file: rokicki4.pl
4710             # This error might also be triggered if my quote
4711             # modifier characters are incomplete
4712 0         0 $self->write_logfile_entry(
4713             "Note: found word $str at quote modifier location\n");
4714             }
4715             }
4716              
4717             # re-initialize
4718 144         283 $allowed_quote_modifiers = EMPTY_STRING;
4719             }
4720 2464         4526 return;
4721             } ## end sub do_FOLLOW_QUOTE
4722              
4723             # ------------------------------------------------------------
4724             # begin hash of code for handling most token types
4725             # ------------------------------------------------------------
4726             my $tokenization_code = {
4727              
4728             '>' => \&do_GREATER_THAN_SIGN,
4729             '|' => \&do_VERTICAL_LINE,
4730             '$' => \&do_DOLLAR_SIGN,
4731             '(' => \&do_LEFT_PARENTHESIS,
4732             ')' => \&do_RIGHT_PARENTHESIS,
4733             ',' => \&do_COMMA,
4734             ';' => \&do_SEMICOLON,
4735             '"' => \&do_QUOTATION_MARK,
4736             "'" => \&do_APOSTROPHE,
4737             '`' => \&do_BACKTICK,
4738             '/' => \&do_SLASH,
4739             '{' => \&do_LEFT_CURLY_BRACKET,
4740             '}' => \&do_RIGHT_CURLY_BRACKET,
4741             '&' => \&do_AMPERSAND,
4742             '<' => \&do_LESS_THAN_SIGN,
4743             '?' => \&do_QUESTION_MARK,
4744             '*' => \&do_STAR,
4745             '.' => \&do_DOT,
4746             ':' => \&do_COLON,
4747             '+' => \&do_PLUS_SIGN,
4748             '@' => \&do_AT_SIGN,
4749             '%' => \&do_PERCENT_SIGN,
4750             '[' => \&do_LEFT_SQUARE_BRACKET,
4751             ']' => \&do_RIGHT_SQUARE_BRACKET,
4752             '-' => \&do_MINUS_SIGN,
4753             '^' => \&do_CARAT_SIGN,
4754             '::' => \&do_DOUBLE_COLON,
4755             '<<' => \&do_LEFT_SHIFT,
4756             '<<~' => \&do_NEW_HERE_DOC,
4757             '->' => \&do_POINTER,
4758             '++' => \&do_PLUS_PLUS,
4759             '=>' => \&do_FAT_COMMA,
4760             '--' => \&do_MINUS_MINUS,
4761             '&&' => \&do_LOGICAL_AND,
4762             '||' => \&do_LOGICAL_OR,
4763             '//' => \&do_SLASH_SLASH,
4764              
4765             # No special code for these types yet, but syntax checks
4766             # could be added.
4767             ## '!' => undef,
4768             ## '!=' => undef,
4769             ## '!~' => undef,
4770             ## '%=' => undef,
4771             ## '&&=' => undef,
4772             ## '&=' => undef,
4773             ## '+=' => undef,
4774             ## '-=' => undef,
4775             ## '..' => undef,
4776             ## '..' => undef,
4777             ## '...' => undef,
4778             ## '.=' => undef,
4779             ## '<<=' => undef,
4780             ## '<=' => undef,
4781             ## '<=>' => undef,
4782             ## '<>' => undef,
4783             ## '=' => undef,
4784             ## '==' => undef,
4785             ## '=~' => undef,
4786             ## '>=' => undef,
4787             ## '>>' => undef,
4788             ## '>>=' => undef,
4789             ## '\\' => undef,
4790             ## '^=' => undef,
4791             ## '|=' => undef,
4792             ## '||=' => undef,
4793             ## '//=' => undef,
4794             ## '~' => undef,
4795             ## '~~' => undef,
4796             ## '!~~' => undef,
4797              
4798             };
4799              
4800             # ------------------------------------------------------------
4801             # end hash of code for handling individual token types
4802             # ------------------------------------------------------------
4803              
4804 38     38   442 use constant DEBUG_TOKENIZE => 0;
  38         92  
  38         186316  
4805              
4806             sub tokenize_this_line {
4807              
4808             # This routine breaks a line of perl code into tokens which are of use in
4809             # indentation and reformatting. One of my goals has been to define tokens
4810             # such that a newline may be inserted between any pair of tokens without
4811             # changing or invalidating the program. This version comes close to this,
4812             # although there are necessarily a few exceptions which must be caught by
4813             # the formatter. Many of these involve the treatment of bare words.
4814             #
4815             # The tokens and their types are returned in arrays. See previous
4816             # routine for their names.
4817             #
4818             # See also the array "valid_token_types" in the BEGIN section for an
4819             # up-to-date list.
4820             #
4821             # To simplify things, token types are either a single character, or they
4822             # are identical to the tokens themselves.
4823             #
4824             # As a debugging aid, the -D flag creates a file containing a side-by-side
4825             # comparison of the input string and its tokenization for each line of a file.
4826             # This is an invaluable debugging aid.
4827             #
4828             # In addition to tokens, and some associated quantities, the tokenizer
4829             # also returns flags indication any special line types. These include
4830             # quotes, here_docs, formats.
4831             #
4832             # -----------------------------------------------------------------------
4833             #
4834             # How to add NEW_TOKENS:
4835             #
4836             # New token types will undoubtedly be needed in the future both to keep up
4837             # with changes in perl and to help adapt the tokenizer to other applications.
4838             #
4839             # Here are some notes on the minimal steps. I wrote these notes while
4840             # adding the 'v' token type for v-strings, which are things like version
4841             # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
4842             # can use your editor to search for the string "NEW_TOKENS" to find the
4843             # appropriate sections to change):
4844             #
4845             # *. Try to talk somebody else into doing it! If not, ..
4846             #
4847             # *. Make a backup of your current version in case things don't work out!
4848             #
4849             # *. Think of a new, unused character for the token type, and add to
4850             # the array @valid_token_types in the BEGIN section of this package.
4851             # For example, I used 'v' for v-strings.
4852             #
4853             # *. Implement coding to recognize the $type of the token in this routine.
4854             # This is the hardest part, and is best done by imitating or modifying
4855             # some of the existing coding. For example, to recognize v-strings, I
4856             # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4857             # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4858             #
4859             # *. Update sub operator_expected. This update is critically important but
4860             # the coding is trivial. Look at the comments in that routine for help.
4861             # For v-strings, which should behave like numbers, I just added 'v' to the
4862             # regex used to handle numbers and strings (types 'n' and 'Q').
4863             #
4864             # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4865             # Perl::Tidy::Formatter for breaking lines around this token type. You can
4866             # skip this step and take the default at first, then adjust later to get
4867             # desired results. For adding type 'v', I looked at sub bond_strength and
4868             # saw that number type 'n' was using default strengths, so I didn't do
4869             # anything. I may tune it up someday if I don't like the way line
4870             # breaks with v-strings look.
4871             #
4872             # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4873             # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
4874             # and saw that type 'n' used spaces on both sides, so I just added 'v'
4875             # to the array @spaces_both_sides.
4876             #
4877             # *. Update HtmlWriter package so that users can colorize the token as
4878             # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
4879             # that package. For v-strings, I initially chose to use a default color
4880             # equal to the default for numbers, but it might be nice to change that
4881             # eventually.
4882             #
4883             # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4884             #
4885             # *. Run lots and lots of debug tests. Start with special files designed
4886             # to test the new token type. Run with the -D flag to create a .DEBUG
4887             # file which shows the tokenization. When these work ok, test as many old
4888             # scripts as possible. Start with all of the '.t' files in the 'test'
4889             # directory of the distribution file. Compare .tdy output with previous
4890             # version and updated version to see the differences. Then include as
4891             # many more files as possible. My own technique has been to collect a huge
4892             # number of perl scripts (thousands!) into one directory and run perltidy
4893             # *, then run diff between the output of the previous version and the
4894             # current version.
4895             #
4896             # *. For another example, search for the smartmatch operator '~~'
4897             # with your editor to see where updates were made for it.
4898             #
4899             # -----------------------------------------------------------------------
4900              
4901 7502     7502 0 14957 my ( $self, $line_of_tokens ) = @_;
4902 7502         14326 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4903              
4904             # Extract line number for use in error messages
4905 7502         12476 $input_line_number = $line_of_tokens->{_line_number};
4906              
4907             # Check for pod documentation
4908 7502 100 66     21313 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4909             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4910             {
4911              
4912             # Must not be in multi-line quote
4913             # and must not be in an equation
4914 13 50 33     97 if ( !$in_quote
4915             && ( $self->operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4916             {
4917 13         35 $self->[_in_pod_] = 1;
4918 13         27 return;
4919             }
4920             }
4921              
4922 7489         14352 $input_line = $untrimmed_input_line;
4923              
4924 7489         14361 chomp $input_line;
4925              
4926             # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4927             # This will be used below to avoid quoting a bare word followed by
4928             # a fat comma.
4929 7489         11267 my $is_END_or_DATA;
4930              
4931             # Reinitialize the multi-line quote flag
4932 7489 100 100     18373 if ( $in_quote && $quote_type eq 'Q' ) {
4933 47         149 $line_of_tokens->{_starting_in_quote} = 1;
4934             }
4935             else {
4936 7442         13293 $line_of_tokens->{_starting_in_quote} = 0;
4937              
4938             # Trim start of this line unless we are continuing a quoted line.
4939             # Do not trim end because we might end in a quote (test: deken4.pl)
4940             # Perl::Tidy::Formatter will delete needless trailing blanks
4941 7442 100       33972 if ( !length($input_line) ) {
    100          
4942              
4943             # line is empty
4944             }
4945             elsif ( $input_line =~ m/\S/g ) {
4946              
4947             # There are $spaces blank characters before a nonblank character
4948 6636         13687 my $spaces = pos($input_line) - 1;
4949 6636 100       14945 if ( $spaces > 0 ) {
4950              
4951             # Trim the leading spaces
4952 3537         9417 $input_line = substr( $input_line, $spaces );
4953              
4954             # Find actual space count if there are leading tabs
4955 3537 100 66     12674 if (
4956             ord( substr( $untrimmed_input_line, 0, 1 ) ) == ORD_TAB
4957             && $untrimmed_input_line =~ /^(\t+)/ )
4958             {
4959 213         519 my $tabsize = $self->[_tabsize_];
4960 213         710 $spaces += length($1) * ( $tabsize - 1 );
4961             }
4962              
4963             # Calculate a guessed level for nonblank lines to avoid
4964             # calls to sub guess_old_indentation_level()
4965 3537         7773 my $indent_columns = $self->[_indent_columns_];
4966             $line_of_tokens->{_guessed_indentation_level} =
4967 3537         10094 int( $spaces / $indent_columns );
4968             }
4969             }
4970             else {
4971              
4972             # line has all blank characters
4973 9         43 $input_line = EMPTY_STRING;
4974             }
4975              
4976 7442   100     19810 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4977             && $input_line =~ /^__(END|DATA)__\s*$/;
4978             }
4979              
4980             # Optimize for a full-line comment.
4981 7489 100       15613 if ( !$in_quote ) {
4982 7305 100       16514 if ( substr( $input_line, 0, 1 ) eq '#' ) {
4983              
4984             # and check for skipped section
4985 788 100 66     6098 if ( $rOpts_code_skipping
4986             && $input_line =~ /$code_skipping_pattern_begin/ )
4987             {
4988 2         9 $self->[_in_skipped_] = $self->[_last_line_number_];
4989 2         6 return;
4990             }
4991              
4992             # Optional fast processing of a block comment
4993 786         2251 my $ci_string_sum =
4994             ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4995 786         1611 my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4996 786         1669 $line_of_tokens->{_line_type} = 'CODE';
4997 786         2380 $line_of_tokens->{_rtokens} = [$input_line];
4998 786         2100 $line_of_tokens->{_rtoken_type} = ['#'];
4999 786         2067 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
5000 786         1948 $line_of_tokens->{_rci_levels} = [$ci_string_i];
5001 786         2104 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
5002 786         1868 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5003 786         3789 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5004 786         1930 return;
5005             }
5006              
5007             # Optimize handling of a blank line
5008 6517 100       15214 if ( !length($input_line) ) {
5009 806         2186 $line_of_tokens->{_line_type} = 'CODE';
5010 806         1964 $line_of_tokens->{_rtokens} = [];
5011 806         1827 $line_of_tokens->{_rtoken_type} = [];
5012 806         1686 $line_of_tokens->{_rlevels} = [];
5013 806         1664 $line_of_tokens->{_rci_levels} = [];
5014 806         1646 $line_of_tokens->{_rblock_type} = [];
5015 806         1724 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5016 806         2440 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5017 806         1826 return;
5018             }
5019             }
5020              
5021             # update the copy of the line for use in error messages
5022             # This must be exactly what we give the pre_tokenizer
5023 5895         10694 $self->[_line_of_text_] = $input_line;
5024              
5025             # re-initialize for the main loop
5026 5895         14798 $routput_token_list = []; # stack of output token indexes
5027 5895         18180 $routput_token_type = []; # token types
5028 5895         15995 $routput_block_type = []; # types of code block
5029 5895         15340 $routput_container_type = []; # paren types, such as if, elsif, ..
5030 5895         13778 $routput_type_sequence = []; # nesting sequential number
5031              
5032 5895         9353 $rhere_target_list = [];
5033              
5034 5895         9768 $tok = $last_nonblank_token;
5035 5895         9253 $type = $last_nonblank_type;
5036 5895         9186 $prototype = $last_nonblank_prototype;
5037 5895         8683 $last_nonblank_i = -1;
5038 5895         9044 $block_type = $last_nonblank_block_type;
5039 5895         8700 $container_type = $last_nonblank_container_type;
5040 5895         8661 $type_sequence = $last_nonblank_type_sequence;
5041 5895         7959 $indent_flag = 0;
5042 5895         8202 $peeked_ahead = 0;
5043              
5044 5895         18362 $self->tokenizer_main_loop($is_END_or_DATA);
5045              
5046             #-----------------------------------------------
5047             # all done tokenizing this line ...
5048             # now prepare the final list of tokens and types
5049             #-----------------------------------------------
5050 5895 50       11704 if ( $self->[_calculate_ci_] ) {
5051 0         0 $self->OLD_tokenizer_wrapup_line($line_of_tokens);
5052             }
5053             else {
5054 5895         16321 $self->tokenizer_wrapup_line($line_of_tokens);
5055             }
5056              
5057 5895         11390 return;
5058             } ## end sub tokenize_this_line
5059              
5060             sub tokenizer_main_loop {
5061              
5062 5895     5895 0 12160 my ( $self, $is_END_or_DATA ) = @_;
5063              
5064             #---------------------------------
5065             # Break one input line into tokens
5066             #---------------------------------
5067              
5068             # Input parameter:
5069             # $is_END_or_DATA is true for a __END__ or __DATA__ line
5070              
5071             # start by breaking the line into pre-tokens
5072 5895         8758 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
5073 5895         13955 ( $rtokens, $rtoken_map, $rtoken_type ) =
5074             pre_tokenize( $input_line, $max_tokens_wanted );
5075              
5076 5895         23371 $max_token_index = scalar( @{$rtokens} ) - 1;
  5895         11233  
5077 5895         9064 push( @{$rtokens}, SPACE, SPACE, SPACE )
  5895         15096  
5078             ; # extra whitespace simplifies logic
5079 5895         8930 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
  5895         12648  
5080 5895         8783 push( @{$rtoken_type}, 'b', 'b', 'b' );
  5895         12945  
5081              
5082             # initialize for main loop
5083 5895         8486 if (0) { #<<< this is not necessary
5084             foreach my $ii ( 0 .. $max_token_index + 3 ) {
5085             $routput_token_type->[$ii] = EMPTY_STRING;
5086             $routput_block_type->[$ii] = EMPTY_STRING;
5087             $routput_container_type->[$ii] = EMPTY_STRING;
5088             $routput_type_sequence->[$ii] = EMPTY_STRING;
5089             $routput_indent_flag->[$ii] = 0;
5090             }
5091             }
5092              
5093 5895         8928 $i = -1;
5094 5895         8732 $i_tok = -1;
5095              
5096             #-----------------------------
5097             # begin main tokenization loop
5098             #-----------------------------
5099              
5100             # we are looking at each pre-token of one line and combining them
5101             # into tokens
5102 5895         13764 while ( ++$i <= $max_token_index ) {
5103              
5104             # continue looking for the end of a quote
5105 50693 100       90154 if ($in_quote) {
5106 2763         8506 $self->do_FOLLOW_QUOTE();
5107 2763 100 100     10572 last if ( $in_quote || $i > $max_token_index );
5108             }
5109              
5110 50394 100 100     138777 if ( $type ne 'b' && $tok ne 'CORE::' ) {
5111              
5112             # try to catch some common errors
5113 35221 100 100     76896 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
5114              
5115 1588 100       4938 if ( $last_nonblank_token eq 'eq' ) {
    50          
5116 9         71 $self->complain("Should 'eq' be '==' here ?\n");
5117             }
5118             elsif ( $last_nonblank_token eq 'ne' ) {
5119 0         0 $self->complain("Should 'ne' be '!=' here ?\n");
5120             }
5121             }
5122              
5123             # fix c090, only rotate vars if a new token will be stored
5124 35221 100       65611 if ( $i_tok >= 0 ) {
5125 29464         43857 $last_last_nonblank_token = $last_nonblank_token;
5126 29464         39870 $last_last_nonblank_type = $last_nonblank_type;
5127 29464         40270 $last_last_nonblank_block_type = $last_nonblank_block_type;
5128 29464         40287 $last_last_nonblank_container_type =
5129             $last_nonblank_container_type;
5130 29464         41364 $last_last_nonblank_type_sequence =
5131             $last_nonblank_type_sequence;
5132              
5133             # Fix part #3 for git82: propagate type 'Z' though L-R pair
5134 29464 100 100     62616 unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
5135 29461         40596 $last_nonblank_token = $tok;
5136 29461         39359 $last_nonblank_type = $type;
5137             }
5138 29464         41125 $last_nonblank_prototype = $prototype;
5139 29464         40938 $last_nonblank_block_type = $block_type;
5140 29464         40168 $last_nonblank_container_type = $container_type;
5141 29464         40860 $last_nonblank_type_sequence = $type_sequence;
5142 29464         39077 $last_nonblank_i = $i_tok;
5143             }
5144              
5145             # Patch for c030: Fix things in case a '->' got separated from
5146             # the subsequent identifier by a side comment. We need the
5147             # last_nonblank_token to have a leading -> to avoid triggering
5148             # an operator expected error message at the next '('. See also
5149             # fix for git #63.
5150 35221 100       65070 if ( $last_last_nonblank_token eq '->' ) {
5151 885 100 66     4976 if ( $last_nonblank_type eq 'w'
5152             || $last_nonblank_type eq 'i' )
5153             {
5154 674         2647 $last_nonblank_token = '->' . $last_nonblank_token;
5155 674         1395 $last_nonblank_type = 'i';
5156             }
5157             }
5158             }
5159              
5160             # store previous token type
5161 50394 100       86480 if ( $i_tok >= 0 ) {
5162 44637         85008 $routput_token_type->[$i_tok] = $type;
5163 44637         72322 $routput_block_type->[$i_tok] = $block_type;
5164 44637         72244 $routput_container_type->[$i_tok] = $container_type;
5165 44637         70708 $routput_type_sequence->[$i_tok] = $type_sequence;
5166 44637         69279 $routput_indent_flag->[$i_tok] = $indent_flag;
5167             }
5168              
5169             # get the next pre-token and type
5170             # $tok and $type will be modified to make the output token
5171 50394         81888 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
5172 50394         76515 my $pre_type = $type = $rtoken_type->[$i]; # and type
5173              
5174             # remember the starting index of this token; we will be updating $i
5175 50394         67556 $i_tok = $i;
5176              
5177             # re-initialize various flags for the next output token
5178 50394   100     90928 $block_type &&= EMPTY_STRING;
5179 50394   100     92555 $container_type &&= EMPTY_STRING;
5180 50394   100     98745 $type_sequence &&= EMPTY_STRING;
5181 50394   100     84653 $indent_flag &&= 0;
5182 50394   100     81402 $prototype &&= EMPTY_STRING;
5183              
5184             # this pre-token will start an output token
5185 50394         65002 push( @{$routput_token_list}, $i_tok );
  50394         87335  
5186              
5187             #--------------------------
5188             # handle a whitespace token
5189             #--------------------------
5190 50394 100       105920 next if ( $pre_type eq 'b' );
5191              
5192             #-----------------
5193             # handle a comment
5194             #-----------------
5195 35079 100       60510 last if ( $pre_type eq '#' );
5196              
5197             # continue gathering identifier if necessary
5198 34751 100       59634 if ($id_scan_state) {
5199              
5200 17 100 66     145 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
5201 10         38 $self->scan_id();
5202             }
5203             else {
5204 7         30 $self->scan_identifier();
5205             }
5206              
5207 17 100       80 if ($id_scan_state) {
5208              
5209             # Still scanning ...
5210             # Check for side comment between sub and prototype (c061)
5211              
5212             # done if nothing left to scan on this line
5213 1 50       8 last if ( $i > $max_token_index );
5214              
5215 1         6 my ( $next_nonblank_token, $i_next ) =
5216             find_next_nonblank_token_on_this_line( $i, $rtokens,
5217             $max_token_index );
5218              
5219             # done if it was just some trailing space
5220 1 50       4 last if ( $i_next > $max_token_index );
5221              
5222             # something remains on the line ... must be a side comment
5223 1         7 next;
5224             }
5225              
5226 16 100 100     125 next if ( ( $i > 0 ) || $type );
5227              
5228             # didn't find any token; start over
5229 7         26 $type = $pre_type;
5230 7         16 $tok = $pre_tok;
5231             }
5232              
5233             ## my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE;
5234 34741 100       75058 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
5235              
5236             #-----------------------------------------------------------
5237             # Combine pre-tokens into digraphs and trigraphs if possible
5238             #-----------------------------------------------------------
5239              
5240             # See if we can make a digraph...
5241             # The following tokens are excluded and handled specially:
5242             # '/=' is excluded because the / might start a pattern.
5243             # 'x=' is excluded since it might be $x=, with $ on previous line
5244             # '**' and *= might be typeglobs of punctuation variables
5245             # I have allowed tokens starting with <, such as <=,
5246             # because I don't think these could be valid angle operators.
5247             # test file: storrs4.pl
5248 34741 100 100     108014 if ( $can_start_digraph{$tok}
      100        
5249             && $i < $max_token_index
5250             && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
5251             {
5252              
5253 2555         4793 my $combine_ok = 1;
5254 2555         5222 my $test_tok = $tok . $rtokens->[ $i + 1 ];
5255              
5256             # check for special cases which cannot be combined
5257              
5258             # '//' must be defined_or operator if an operator is expected.
5259             # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
5260             # could be migrated here for clarity
5261              
5262             # Patch for RT#102371, misparsing a // in the following snippet:
5263             # state $b //= ccc();
5264             # The solution is to always accept the digraph (or trigraph)
5265             # after type 'Z' (possible file handle). The reason is that
5266             # sub operator_expected gives TERM expected here, which is
5267             # wrong in this case.
5268 2555 100 66     7252 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
5269              
5270             # note that here $tok = '/' and the next tok and type is '/'
5271 16         69 $expecting =
5272             $self->operator_expected( [ $prev_type, $tok, '/' ] );
5273              
5274             # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
5275 16 100       69 $combine_ok = 0 if ( $expecting == TERM );
5276             }
5277              
5278             # Patch for RT #114359: mis-parsing of "print $x ** 0.5;
5279             # Accept the digraphs '**' only after type 'Z'
5280             # Otherwise postpone the decision.
5281 2555 100       5581 if ( $test_tok eq '**' ) {
5282 39 100       131 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
  37         79  
5283             }
5284              
5285 2555 50 66     16304 if (
      66        
      33        
5286              
5287             # still ok to combine?
5288             $combine_ok
5289              
5290             && ( $test_tok ne '/=' ) # might be pattern
5291             && ( $test_tok ne 'x=' ) # might be $x
5292             && ( $test_tok ne '*=' ) # typeglob?
5293              
5294             # Moved above as part of fix for
5295             # RT #114359: Missparsing of "print $x ** 0.5;
5296             # && ( $test_tok ne '**' ) # typeglob?
5297             )
5298             {
5299 2514         4087 $tok = $test_tok;
5300 2514         3618 $i++;
5301              
5302             # Now try to assemble trigraphs. Note that all possible
5303             # perl trigraphs can be constructed by appending a character
5304             # to a digraph.
5305 2514         4327 $test_tok = $tok . $rtokens->[ $i + 1 ];
5306              
5307 2514 100       8726 if ( $is_trigraph{$test_tok} ) {
    100          
5308 76         183 $tok = $test_tok;
5309 76         181 $i++;
5310             }
5311              
5312             # The only current tetragraph is the double diamond operator
5313             # and its first three characters are not a trigraph, so
5314             # we do can do a special test for it
5315             elsif ( $test_tok eq '<<>' ) {
5316 1         3 $test_tok .= $rtokens->[ $i + 2 ];
5317 1 50       5 if ( $is_tetragraph{$test_tok} ) {
5318 1         2 $tok = $test_tok;
5319 1         3 $i += 2;
5320             }
5321             }
5322             }
5323             }
5324              
5325 34741         49705 $type = $tok;
5326 34741         57544 $next_tok = $rtokens->[ $i + 1 ];
5327 34741         53018 $next_type = $rtoken_type->[ $i + 1 ];
5328              
5329 34741         43615 DEBUG_TOKENIZE && do {
5330             local $LIST_SEPARATOR = ')(';
5331             my @debug_list = (
5332             $last_nonblank_token, $tok,
5333             $next_tok, $brace_depth,
5334             $rbrace_type->[$brace_depth], $paren_depth,
5335             $rparen_type->[$paren_depth],
5336             );
5337             print STDOUT "TOKENIZE:(@debug_list)\n";
5338             };
5339              
5340             # Turn off attribute list on first non-blank, non-bareword.
5341             # Added '#' to fix c038 (later moved above).
5342 34741 100 100     103304 if ( $pre_type ne 'w' && $self->[_in_attribute_list_] ) {
5343 39         90 $self->[_in_attribute_list_] = 0;
5344             }
5345              
5346             #--------------------------------------------------------
5347             # We have the next token, $tok.
5348             # Now we have to examine this token and decide what it is
5349             # and define its $type
5350             #
5351             # section 1: bare words
5352             #--------------------------------------------------------
5353              
5354 34741 100       72622 if ( $pre_type eq 'w' ) {
    100          
5355 5826         21455 $expecting =
5356             $self->operator_expected( [ $prev_type, $tok, $next_type ] );
5357 5826         18321 my $is_last = $self->do_BAREWORD($is_END_or_DATA);
5358 5826 100       17438 last if ($is_last);
5359             }
5360              
5361             #-----------------------------
5362             # section 2: strings of digits
5363             #-----------------------------
5364             elsif ( $pre_type eq 'd' ) {
5365 1929         7459 $expecting =
5366             $self->operator_expected( [ $prev_type, $tok, $next_type ] );
5367 1929         6483 $self->do_DIGITS();
5368             }
5369              
5370             #----------------------------
5371             # section 3: all other tokens
5372             #----------------------------
5373             else {
5374 26986         55688 my $code = $tokenization_code->{$tok};
5375 26986 100       51207 if ($code) {
5376 25261         86658 $expecting =
5377             $self->operator_expected(
5378             [ $prev_type, $tok, $next_type ] );
5379 25261         89188 $code->($self);
5380 25261 100       68527 redo if $in_quote;
5381             }
5382             }
5383             }
5384              
5385             # -----------------------------
5386             # end of main tokenization loop
5387             # -----------------------------
5388              
5389             # Store the final token
5390 5895 100       12912 if ( $i_tok >= 0 ) {
5391 5757         12608 $routput_token_type->[$i_tok] = $type;
5392 5757         10434 $routput_block_type->[$i_tok] = $block_type;
5393 5757         10051 $routput_container_type->[$i_tok] = $container_type;
5394 5757         10329 $routput_type_sequence->[$i_tok] = $type_sequence;
5395 5757         9833 $routput_indent_flag->[$i_tok] = $indent_flag;
5396             }
5397              
5398             # Remember last nonblank values
5399 5895 100 100     21242 if ( $type ne 'b' && $type ne '#' ) {
5400 5422         9194 $last_last_nonblank_token = $last_nonblank_token;
5401 5422         7841 $last_last_nonblank_type = $last_nonblank_type;
5402 5422         8268 $last_last_nonblank_block_type = $last_nonblank_block_type;
5403 5422         8269 $last_last_nonblank_container_type = $last_nonblank_container_type;
5404 5422         8162 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
5405 5422         7766 $last_nonblank_token = $tok;
5406 5422         7625 $last_nonblank_type = $type;
5407 5422         7866 $last_nonblank_block_type = $block_type;
5408 5422         7660 $last_nonblank_container_type = $container_type;
5409 5422         8001 $last_nonblank_type_sequence = $type_sequence;
5410 5422         8327 $last_nonblank_prototype = $prototype;
5411             }
5412              
5413             # reset indentation level if necessary at a sub or package
5414             # in an attempt to recover from a nesting error
5415 5895 50       12329 if ( $level_in_tokenizer < 0 ) {
5416 0 0       0 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
5417 0         0 reset_indentation_level(0);
5418 0         0 $self->brace_warning("resetting level to 0 at $1 $2\n");
5419             }
5420             }
5421              
5422 5895         9895 $self->[_in_quote_] = $in_quote;
5423 5895 100       13080 $self->[_quote_target_] =
5424             $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
5425 5895         10444 $self->[_rhere_target_list_] = $rhere_target_list;
5426              
5427 5895         10019 return;
5428             } ## end sub tokenizer_main_loop
5429              
5430             sub OLD_tokenizer_wrapup_line {
5431 0     0 0 0 my ( $self, $line_of_tokens ) = @_;
5432              
5433             #---------------------------------------------------------
5434             # Package a line of tokens for shipping back to the caller
5435             #---------------------------------------------------------
5436              
5437             # NOTE: This routine is retained for testing purposes only; it should
5438             # be removed by about 2025. Until then, it can be called for testing
5439             # with -exp=ci0 or -exp=ci1.
5440              
5441             # Most of the remaining work involves defining the two indentation
5442             # parameters that the formatter needs for each token:
5443             # - $level = structural indentation level and
5444             # - $ci_level = continuation indentation level
5445              
5446             # The method for setting the indentation level is straightforward.
5447             # But the method used to define the continuation indentation is
5448             # complicated because it has evolved over a long time by trial and
5449             # error. It could undoubtedly be simplified but it works okay as is.
5450              
5451             # Here is a brief description of how indentation is computed.
5452             # Perl::Tidy computes indentation as the sum of 2 terms:
5453             #
5454             # (1) structural indentation, such as if/else/elsif blocks
5455             # (2) continuation indentation, such as long parameter call lists.
5456             #
5457             # These are occasionally called primary and secondary indentation.
5458             #
5459             # Structural indentation is introduced by tokens of type '{',
5460             # although the actual tokens might be '{', '(', or '['. Structural
5461             # indentation is of two types: BLOCK and non-BLOCK. Default
5462             # structural indentation is 4 characters if the standard indentation
5463             # scheme is used.
5464             #
5465             # Continuation indentation is introduced whenever a line at BLOCK
5466             # level is broken before its termination. Default continuation
5467             # indentation is 2 characters in the standard indentation scheme.
5468             #
5469             # Both types of indentation may be nested arbitrarily deep and
5470             # interlaced. The distinction between the two is somewhat arbitrary.
5471             #
5472             # For each token, we will define two variables which would apply if
5473             # the current statement were broken just before that token, so that
5474             # that token started a new line:
5475             #
5476             # $level = the structural indentation level,
5477             # $ci_level = the continuation indentation level
5478             #
5479             # The total indentation will be $level * (4 spaces) + $ci_level * (2
5480             # spaces), assuming defaults. However, in some special cases it is
5481             # customary to modify $ci_level from this strict value.
5482             #
5483             # The total structural indentation is easy to compute by adding and
5484             # subtracting 1 from a saved value as types '{' and '}' are seen.
5485             # The running value of this variable is $level_in_tokenizer.
5486             #
5487             # The total continuation is much more difficult to compute, and
5488             # requires several variables. These variables are:
5489             #
5490             # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
5491             # each indentation level, if there are intervening open secondary
5492             # structures just prior to that level.
5493             # $continuation_string_in_tokenizer = a string of 1's and 0's
5494             # indicating if the last token at that level is "continued", meaning
5495             # that it is not the first token of an expression.
5496             # $nesting_block_string = a string of 1's and 0's indicating, for each
5497             # indentation level, if the level is of type BLOCK or not.
5498             # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
5499             # $nesting_list_string = a string of 1's and 0's indicating, for each
5500             # indentation level, if it is appropriate for list formatting.
5501             # If so, continuation indentation is used to indent long list items.
5502             # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
5503             # @{$rslevel_stack} = a stack of total nesting depths at each
5504             # structural indentation level, where "total nesting depth" means
5505             # the nesting depth that would occur if every nesting token
5506             # -- '{', '[', # and '(' -- , regardless of context, is used to
5507             # compute a nesting depth.
5508              
5509             # Notes on the Continuation Indentation
5510             #
5511             # There is a sort of chicken-and-egg problem with continuation
5512             # indentation. The formatter can't make decisions on line breaks
5513             # without knowing what 'ci' will be at arbitrary locations.
5514             #
5515             # But a problem with setting the continuation indentation (ci) here
5516             # in the tokenizer is that we do not know where line breaks will
5517             # actually be. As a result, we don't know if we should propagate
5518             # continuation indentation to higher levels of structure.
5519             #
5520             # For nesting of only structural indentation, we never need to do
5521             # this. For example, in a long if statement, like this
5522             #
5523             # if ( !$output_block_type[$i]
5524             # && ($in_statement_continuation) )
5525             # { <--outdented
5526             # do_something();
5527             # }
5528             #
5529             # the second line has ci but we do normally give the lines within
5530             # the BLOCK any ci. This would be true if we had blocks nested
5531             # arbitrarily deeply.
5532             #
5533             # But consider something like this, where we have created a break
5534             # after an opening paren on line 1, and the paren is not (currently)
5535             # a structural indentation token:
5536             #
5537             # my $file = $menubar->Menubutton(
5538             # qw/-text File -underline 0 -menuitems/ => [
5539             # [
5540             # Cascade => '~View',
5541             # -menuitems => [
5542             # ...
5543             #
5544             # The second line has ci, so it would seem reasonable to propagate
5545             # it down, giving the third line 1 ci + 1 indentation. This
5546             # suggests the following rule, which is currently used to
5547             # propagating ci down: if there are any non-structural opening
5548             # parens (or brackets, or braces), before an opening structural
5549             # brace, then ci is propagated down, and otherwise
5550             # not. The variable $intervening_secondary_structure contains this
5551             # information for the current token, and the string
5552             # "$ci_string_in_tokenizer" is a stack of previous values of this
5553             # variable.
5554              
5555 0         0 my @token_type = (); # stack of output token types
5556 0         0 my @block_type = (); # stack of output code block types
5557 0         0 my @type_sequence = (); # stack of output type sequence numbers
5558 0         0 my @tokens = (); # output tokens
5559 0         0 my @levels = (); # structural brace levels of output tokens
5560 0         0 my @ci_string = (); # string needed to compute continuation indentation
5561              
5562             # Count the number of '1's in the string (previously sub ones_count)
5563 0         0 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5564              
5565 0         0 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5566              
5567 0         0 my ( $ci_string_i, $level_i );
5568              
5569             #-----------------
5570             # Loop over tokens
5571             #-----------------
5572 0         0 my $rtoken_map_im;
5573 0         0 foreach my $i ( @{$routput_token_list} ) {
  0         0  
5574              
5575 0         0 my $type_i = $routput_token_type->[$i];
5576 0         0 $level_i = $level_in_tokenizer;
5577              
5578             # Quick handling of indentation levels for blanks and comments
5579 0 0 0     0 if ( $type_i eq 'b' || $type_i eq '#' ) {
5580 0         0 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5581             }
5582              
5583             # All other types
5584             else {
5585              
5586             # $tok_i is the PRE-token. It only equals the token for symbols
5587 0         0 my $tok_i = $rtokens->[$i];
5588              
5589             # Check for an invalid token type..
5590             # This can happen by running perltidy on non-scripts although
5591             # it could also be bug introduced by programming change. Perl
5592             # silently accepts a 032 (^Z) and takes it as the end
5593 0 0       0 if ( !$is_valid_token_type{$type_i} ) {
5594 0         0 my $val = ord($type_i);
5595 0         0 $self->warning(
5596             "unexpected character decimal $val ($type_i) in script\n"
5597             );
5598 0         0 $self->[_in_error_] = 1;
5599             }
5600              
5601             # $ternary_indentation_flag indicates that we need a change
5602             # in level at a nested ternary, as follows
5603             # 1 => at a nested ternary ?
5604             # -1 => at a nested ternary :
5605             # 0 => otherwise
5606 0         0 my $ternary_indentation_flag = $routput_indent_flag->[$i];
5607              
5608             #-------------------------------------------
5609             # Section 1: handle a level-increasing token
5610             #-------------------------------------------
5611             # set primary indentation levels based on structural braces
5612             # Note: these are set so that the leading braces have a HIGHER
5613             # level than their CONTENTS, which is convenient for indentation
5614             # Also, define continuation indentation for each token.
5615 0 0 0     0 if ( $type_i eq '{'
    0 0        
      0        
      0        
5616             || $type_i eq 'L'
5617             || $ternary_indentation_flag > 0 )
5618             {
5619              
5620             # if the difference between total nesting levels is not 1,
5621             # there are intervening non-structural nesting types between
5622             # this '{' and the previous unclosed '{'
5623 0         0 my $intervening_secondary_structure = 0;
5624 0 0       0 if ( @{$rslevel_stack} ) {
  0         0  
5625 0         0 $intervening_secondary_structure =
5626             $slevel_in_tokenizer - $rslevel_stack->[-1];
5627             }
5628              
5629             # save the current states
5630 0         0 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
  0         0  
5631 0         0 $level_in_tokenizer++;
5632              
5633             ##NOTE: _maximum_level_ does not seem to be needed now
5634 0 0       0 if ( $level_in_tokenizer > $self->[_maximum_level_] ) {
5635 0         0 $self->[_maximum_level_] = $level_in_tokenizer;
5636             }
5637              
5638 0 0       0 if ($ternary_indentation_flag) {
5639              
5640             # break BEFORE '?' in a nested ternary
5641 0 0       0 if ( $type_i eq '?' ) {
5642 0         0 $level_i = $level_in_tokenizer;
5643             }
5644              
5645 0         0 $nesting_block_string .= "$nesting_block_flag";
5646             } ## end if ($ternary_indentation_flag)
5647             else {
5648              
5649 0 0       0 if ( $routput_block_type->[$i] ) {
5650 0         0 $nesting_block_flag = 1;
5651 0         0 $nesting_block_string .= '1';
5652             }
5653             else {
5654 0         0 $nesting_block_flag = 0;
5655 0         0 $nesting_block_string .= '0';
5656             }
5657             }
5658              
5659             # we will use continuation indentation within containers
5660             # which are not blocks and not logical expressions
5661 0         0 my $bit = 0;
5662 0 0       0 if ( !$routput_block_type->[$i] ) {
5663              
5664             # propagate flag down at nested open parens
5665 0 0       0 if ( $routput_container_type->[$i] eq '(' ) {
5666 0 0       0 $bit = 1 if $nesting_list_flag;
5667             }
5668              
5669             # use list continuation if not a logical grouping
5670             # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5671             else {
5672             $bit = 1
5673             unless
5674 0 0       0 $is_logical_container{ $routput_container_type
5675             ->[$i] };
5676             }
5677             }
5678 0         0 $nesting_list_string .= $bit;
5679 0         0 $nesting_list_flag = $bit;
5680              
5681 0 0       0 $ci_string_in_tokenizer .=
5682             ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5683 0         0 $ci_string_sum =
5684             ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5685 0 0       0 $continuation_string_in_tokenizer .=
5686             ( $in_statement_continuation > 0 ) ? '1' : '0';
5687              
5688             # Sometimes we want to give an opening brace
5689             # continuation indentation, and sometimes not. For code
5690             # blocks, we don't do it, so that the leading '{' gets
5691             # outdented, like this:
5692             #
5693             # if ( !$output_block_type[$i]
5694             # && ($in_statement_continuation) )
5695             # { <--outdented
5696             #
5697             # For other types, we will give them continuation
5698             # indentation. For example, here is how a list looks
5699             # with the opening paren indented:
5700             #
5701             # @LoL =
5702             # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5703             # [ "homer", "marge", "bart" ], );
5704             #
5705             # This looks best when 'ci' is one-half of the
5706             # indentation (i.e., 2 and 4)
5707              
5708 0         0 my $total_ci = $ci_string_sum;
5709 0 0 0     0 if (
      0        
      0        
5710             !$routput_block_type->[$i] # patch: skip for BLOCK
5711             && ($in_statement_continuation)
5712             && !( $ternary_indentation_flag && $type_i eq ':' )
5713             )
5714             {
5715 0 0       0 $total_ci += $in_statement_continuation
5716             unless (
5717             substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5718             }
5719              
5720 0         0 $ci_string_i = $total_ci;
5721 0         0 $in_statement_continuation = 0;
5722             } ## end if ( $type_i eq '{' ||...})
5723              
5724             #-------------------------------------------
5725             # Section 2: handle a level-decreasing token
5726             #-------------------------------------------
5727             elsif ($type_i eq '}'
5728             || $type_i eq 'R'
5729             || $ternary_indentation_flag < 0 )
5730             {
5731              
5732             # only a nesting error in the script would prevent
5733             # popping here
5734 0 0       0 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
  0         0  
  0         0  
  0         0  
5735              
5736 0         0 $level_i = --$level_in_tokenizer;
5737              
5738 0 0       0 if ( $level_in_tokenizer < 0 ) {
5739 0 0       0 unless ( $self->[_saw_negative_indentation_] ) {
5740 0         0 $self->[_saw_negative_indentation_] = 1;
5741 0         0 $self->warning("Starting negative indentation\n");
5742             }
5743             }
5744              
5745             # restore previous level values
5746 0 0       0 if ( length($nesting_block_string) > 1 )
5747             { # true for valid script
5748 0         0 chop $nesting_block_string;
5749 0         0 $nesting_block_flag =
5750             substr( $nesting_block_string, -1 ) eq '1';
5751 0         0 chop $nesting_list_string;
5752 0         0 $nesting_list_flag =
5753             substr( $nesting_list_string, -1 ) eq '1';
5754              
5755 0         0 chop $ci_string_in_tokenizer;
5756 0         0 $ci_string_sum =
5757             ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5758              
5759 0         0 $in_statement_continuation =
5760             chop $continuation_string_in_tokenizer;
5761              
5762             # zero continuation flag at terminal BLOCK '}' which
5763             # ends a statement.
5764 0         0 my $block_type_i = $routput_block_type->[$i];
5765 0 0       0 if ($block_type_i) {
    0          
5766              
5767             # ...These include non-anonymous subs
5768             # note: could be sub ::abc { or sub 'abc
5769 0 0 0     0 if ( substr( $block_type_i, 0, 3 ) eq 'sub'
    0 0        
    0          
    0          
5770             && $block_type_i =~ m/^sub\s*/gc )
5771             {
5772              
5773             # note: older versions of perl require the /gc
5774             # modifier here or else the \G does not work.
5775 0 0       0 $in_statement_continuation = 0
5776             if ( $block_type_i =~ /\G('|::|\w)/gc );
5777             }
5778              
5779             # ...and include all block types except user subs
5780             # with block prototypes and these:
5781             # (sort|grep|map|do|eval)
5782             elsif (
5783             $is_zero_continuation_block_type{$block_type_i}
5784             )
5785             {
5786 0         0 $in_statement_continuation = 0;
5787             }
5788              
5789             # ..but these are not terminal types:
5790             # /^(sort|grep|map|do|eval)$/ )
5791             elsif ($is_sort_map_grep_eval_do{$block_type_i}
5792             || $is_grep_alias{$block_type_i} )
5793             {
5794             }
5795              
5796             # ..and a block introduced by a label
5797             # /^\w+\s*:$/gc ) {
5798             elsif ( $block_type_i =~ /:$/ ) {
5799 0         0 $in_statement_continuation = 0;
5800             }
5801              
5802             # user function with block prototype
5803             else {
5804 0         0 $in_statement_continuation = 0;
5805             }
5806             } ## end if ($block_type_i)
5807              
5808             # If we are in a list, then
5809             # we must set continuation indentation at the closing
5810             # paren of something like this (paren after $check):
5811             # assert(
5812             # __LINE__,
5813             # ( not defined $check )
5814             # or ref $check
5815             # or $check eq "new"
5816             # or $check eq "old",
5817             # );
5818             elsif ( $tok_i eq ')' ) {
5819             $in_statement_continuation = 1
5820             if (
5821             $is_list_end_type{
5822 0 0       0 $routput_container_type->[$i]
5823             }
5824             );
5825             ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5826             }
5827             } ## end if ( length($nesting_block_string...))
5828              
5829 0         0 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5830             } ## end elsif ( $type_i eq '}' ||...{)
5831              
5832             #-----------------------------------------
5833             # Section 3: handle a constant level token
5834             #-----------------------------------------
5835             else {
5836              
5837             # zero the continuation indentation at certain tokens so
5838             # that they will be at the same level as its container. For
5839             # commas, this simplifies the -lp indentation logic, which
5840             # counts commas. For ?: it makes them stand out.
5841 0 0 0     0 if (
5842             $nesting_list_flag
5843             ## $type_i =~ /^[,\?\:]$/
5844             && $is_comma_question_colon{$type_i}
5845             )
5846             {
5847 0         0 $in_statement_continuation = 0;
5848             }
5849              
5850             # Be sure binary operators get continuation indentation.
5851             # Note: the check on $nesting_block_flag is only needed
5852             # to add ci to binary operators following a 'try' block,
5853             # or similar extended syntax block operator (see c158).
5854 0 0 0     0 if (
      0        
      0        
      0        
5855             !$in_statement_continuation
5856             && ( $nesting_block_flag || $nesting_list_flag )
5857             && ( $type_i eq 'k' && $is_binary_keyword{$tok_i}
5858             || $is_binary_type{$type_i} )
5859             )
5860             {
5861 0         0 $in_statement_continuation = 1;
5862             }
5863              
5864             # continuation indentation is sum of any open ci from
5865             # previous levels plus the current level
5866 0         0 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5867              
5868             # update continuation flag ...
5869              
5870             # if we are in a BLOCK
5871 0 0       0 if ($nesting_block_flag) {
5872              
5873             # the next token after a ';' and label starts a new stmt
5874 0 0 0     0 if ( $type_i eq ';' || $type_i eq 'J' ) {
5875 0         0 $in_statement_continuation = 0;
5876             }
5877              
5878             # otherwise, we are continuing the current statement
5879             else {
5880 0         0 $in_statement_continuation = 1;
5881             }
5882             }
5883              
5884             # if we are not in a BLOCK..
5885             else {
5886              
5887             # do not use continuation indentation if not list
5888             # environment (could be within if/elsif clause)
5889 0 0 0     0 if ( !$nesting_list_flag ) {
    0          
5890 0         0 $in_statement_continuation = 0;
5891             }
5892              
5893             # otherwise, the token after a ',' starts a new term
5894              
5895             # Patch FOR RT#99961; no continuation after a ';'
5896             # This is needed because perltidy currently marks
5897             # a block preceded by a type character like % or @
5898             # as a non block, to simplify formatting. But these
5899             # are actually blocks and can have semicolons.
5900             # See code_block_type() and is_non_structural_brace().
5901             elsif ( $type_i eq ',' || $type_i eq ';' ) {
5902 0         0 $in_statement_continuation = 0;
5903             }
5904              
5905             # otherwise, we are continuing the current term
5906             else {
5907 0         0 $in_statement_continuation = 1;
5908             }
5909             } ## end else [ if ($nesting_block_flag)]
5910              
5911             } ## end else [ if ( $type_i eq '{' ||...})]
5912              
5913             #-------------------------------------------
5914             # Section 4: operations common to all levels
5915             #-------------------------------------------
5916              
5917             # set secondary nesting levels based on all containment token
5918             # types Note: these are set so that the nesting depth is the
5919             # depth of the PREVIOUS TOKEN, which is convenient for setting
5920             # the strength of token bonds
5921              
5922             # /^[L\{\(\[]$/
5923 0 0       0 if ( $is_opening_type{$type_i} ) {
    0          
5924 0         0 $slevel_in_tokenizer++;
5925 0         0 $nesting_token_string .= $tok_i;
5926 0         0 $nesting_type_string .= $type_i;
5927             }
5928              
5929             # /^[R\}\)\]]$/
5930             elsif ( $is_closing_type{$type_i} ) {
5931 0         0 $slevel_in_tokenizer--;
5932 0         0 my $char = chop $nesting_token_string;
5933              
5934 0 0       0 if ( $char ne $matching_start_token{$tok_i} ) {
5935 0         0 $nesting_token_string .= $char . $tok_i;
5936 0         0 $nesting_type_string .= $type_i;
5937             }
5938             else {
5939 0         0 chop $nesting_type_string;
5940             }
5941             }
5942              
5943             # apply token type patch:
5944             # - output anonymous 'sub' as keyword (type 'k')
5945             # - output __END__, __DATA__, and format as type 'k' instead
5946             # of ';' to make html colors correct, etc.
5947             # The following hash tests are equivalent to these older tests:
5948             # if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5949             # if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5950 0 0 0     0 if ( $is_END_DATA_format_sub{$tok_i}
5951             && $is_semicolon_or_t{$type_i} )
5952             {
5953 0         0 $type_i = 'k';
5954             }
5955             } ## end else [ if ( $type_i eq 'b' ||...)]
5956              
5957             #--------------------------------
5958             # Store the values for this token
5959             #--------------------------------
5960 0 0       0 push( @ci_string, $ci_string_i ? 1 : 0 ); # clip ci to 1
5961 0         0 push( @levels, $level_i );
5962 0         0 push( @block_type, $routput_block_type->[$i] );
5963 0         0 push( @type_sequence, $routput_type_sequence->[$i] );
5964 0         0 push( @token_type, $type_i );
5965              
5966             # Form and store the PREVIOUS token
5967 0 0       0 if ( defined($rtoken_map_im) ) {
5968 0         0 my $numc =
5969             $rtoken_map->[$i] - $rtoken_map_im; # how many characters
5970              
5971 0 0       0 if ( $numc > 0 ) {
5972 0         0 push( @tokens,
5973             substr( $input_line, $rtoken_map_im, $numc ) );
5974             }
5975             else {
5976              
5977             # Should not happen unless @{$rtoken_map} is corrupted
5978 0         0 DEVEL_MODE
5979             && $self->Fault(
5980             "number of characters is '$numc' but should be >0\n");
5981             }
5982             }
5983              
5984             # or grab some values for the leading token (needed for log output)
5985             else {
5986 0         0 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5987             }
5988              
5989 0         0 $rtoken_map_im = $rtoken_map->[$i];
5990             } ## end foreach my $i ( @{$routput_token_list...})
5991              
5992             #------------------------
5993             # End loop to over tokens
5994             #------------------------
5995              
5996             # Form and store the final token of this line
5997 0 0       0 if ( defined($rtoken_map_im) ) {
5998 0         0 my $numc = length($input_line) - $rtoken_map_im;
5999 0 0       0 if ( $numc > 0 ) {
6000 0         0 push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) );
6001             }
6002             else {
6003              
6004             # Should not happen unless @{$rtoken_map} is corrupted
6005 0         0 DEVEL_MODE
6006             && $self->Fault(
6007             "Number of Characters is '$numc' but should be >0\n");
6008             }
6009             }
6010              
6011             #----------------------------------------------------------
6012             # Wrap up this line of tokens for shipping to the Formatter
6013             #----------------------------------------------------------
6014 0         0 $line_of_tokens->{_rtoken_type} = \@token_type;
6015 0         0 $line_of_tokens->{_rtokens} = \@tokens;
6016 0         0 $line_of_tokens->{_rblock_type} = \@block_type;
6017 0         0 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
6018 0         0 $line_of_tokens->{_rlevels} = \@levels;
6019 0         0 $line_of_tokens->{_rci_levels} = \@ci_string;
6020              
6021 0         0 return;
6022             } ## end sub OLD_tokenizer_wrapup_line
6023              
6024             sub tokenizer_wrapup_line {
6025 5895     5895 0 11401 my ( $self, $line_of_tokens ) = @_;
6026              
6027             #---------------------------------------------------------
6028             # Package a line of tokens for shipping back to the caller
6029             #---------------------------------------------------------
6030              
6031             # Note: This is the new version of this routine. It does not compute
6032             # continuation indentation; it returns values ci=0. The ci values
6033             # are computed later by sub Formatter::set_ci.
6034              
6035             # Arrays to hold token values for this line:
6036 5895         10447 my @levels = (); # structural brace levels of output tokens
6037 5895         8740 my @block_type = (); # stack of output code block types
6038 5895         9314 my @type_sequence = (); # stack of output type sequence numbers
6039 5895         8435 my @token_type = (); # stack of output token types
6040 5895         8485 my @tokens = (); # output tokens
6041              
6042 5895         13816 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
6043              
6044             # Remember starting nesting block string
6045 5895         9470 my $nesting_block_string_0 = $nesting_block_string;
6046              
6047             #-----------------
6048             # Loop over tokens
6049             #-----------------
6050 5895         8279 my $rtoken_map_im;
6051              
6052             # $i is the index of the pretoken which starts this full token
6053 5895         9164 foreach my $i ( @{$routput_token_list} ) {
  5895         12814  
6054              
6055 50578         73883 my $type_i = $routput_token_type->[$i];
6056              
6057             #--------------------------------
6058             # 1. Handle a non-sequenced token
6059             #--------------------------------
6060 50578 100       79824 if ( !$routput_type_sequence->[$i] ) {
6061              
6062             # 1.1 types ';' and 't'
6063             # - output anonymous 'sub' as keyword (type 'k')
6064             # - output __END__, __DATA__, and format as type 'k' instead
6065             # of ';' to make html colors correct, etc.
6066 41456 100       96097 if ( $is_semicolon_or_t{$type_i} ) {
    50          
6067 2674         6262 my $tok_i = $rtokens->[$i];
6068 2674 100       7504 if ( $is_END_DATA_format_sub{$tok_i} ) {
6069 172         536 $type_i = 'k';
6070             }
6071             }
6072              
6073             # 1.2 Check for an invalid token type..
6074             # This can happen by running perltidy on non-scripts although
6075             # it could also be bug introduced by programming change. Perl
6076             # silently accepts a 032 (^Z) and takes it as the end
6077             elsif ( !$is_valid_token_type{$type_i} ) {
6078 0         0 my $val = ord($type_i);
6079 0         0 $self->warning(
6080             "unexpected character decimal $val ($type_i) in script\n"
6081             );
6082 0         0 $self->[_in_error_] = 1;
6083             }
6084              
6085             # Store values for a non-sequenced token
6086 41456         67147 push( @levels, $level_in_tokenizer );
6087 41456         62770 push( @block_type, EMPTY_STRING );
6088 41456         60594 push( @type_sequence, EMPTY_STRING );
6089 41456         78946 push( @token_type, $type_i );
6090              
6091             }
6092              
6093             #----------------------------
6094             # 2. Handle a sequenced token
6095             # One of { [ ( ? ) ] } :
6096             #----------------------------
6097             else {
6098              
6099             # $level_i is the level we will store. Levels of braces are
6100             # set so that the leading braces have a HIGHER level than their
6101             # CONTENTS, which is convenient for indentation.
6102 9122         13706 my $level_i = $level_in_tokenizer;
6103              
6104             # $tok_i is the PRE-token. It only equals the token for symbols
6105 9122         14365 my $tok_i = $rtokens->[$i];
6106              
6107             # $routput_indent_flag->[$i] indicates that we need a change
6108             # in level at a nested ternary, as follows
6109             # 1 => at a nested ternary ?
6110             # -1 => at a nested ternary :
6111             # 0 => otherwise
6112              
6113             #------------------------------------
6114             # 2.1 handle a level-increasing token
6115             #------------------------------------
6116 9122 100       24292 if ( $is_opening_or_ternary_type{$type_i} ) {
    50          
6117              
6118 4561 100       9089 if ( $type_i eq '?' ) {
6119              
6120 187 100       927 if ( $routput_indent_flag->[$i] > 0 ) {
6121 8         19 $level_in_tokenizer++;
6122              
6123             # break BEFORE '?' in a nested ternary
6124 8         14 $level_i = $level_in_tokenizer;
6125 8         21 $nesting_block_string .= "$nesting_block_flag";
6126              
6127             }
6128             }
6129             else {
6130              
6131 4374         7406 $nesting_token_string .= $tok_i;
6132              
6133 4374 100 100     12255 if ( $type_i eq '{' || $type_i eq 'L' ) {
6134              
6135 4067         5926 $level_in_tokenizer++;
6136              
6137 4067 100       8046 if ( $routput_block_type->[$i] ) {
6138 967         1964 $nesting_block_flag = 1;
6139 967         1939 $nesting_block_string .= '1';
6140             }
6141             else {
6142 3100         5260 $nesting_block_flag = 0;
6143 3100         5535 $nesting_block_string .= '0';
6144             }
6145             }
6146             }
6147             }
6148              
6149             #------------------------------------
6150             # 2.2 handle a level-decreasing token
6151             #------------------------------------
6152             elsif ( $is_closing_or_ternary_type{$type_i} ) {
6153              
6154 4561 100       10264 if ( $type_i ne ':' ) {
6155 4374         8216 my $char = chop $nesting_token_string;
6156 4374 50       11282 if ( $char ne $matching_start_token{$tok_i} ) {
6157 0         0 $nesting_token_string .= $char . $tok_i;
6158             }
6159             }
6160              
6161 4561 100 100     15286 if (
      100        
      100        
6162             $type_i eq '}'
6163             || $type_i eq 'R'
6164              
6165             # only the second and higher ? : have levels
6166             || $type_i eq ':' && $routput_indent_flag->[$i] < 0
6167             )
6168             {
6169              
6170 4075         6309 $level_i = --$level_in_tokenizer;
6171              
6172 4075 50       7994 if ( $level_in_tokenizer < 0 ) {
6173 0 0       0 unless ( $self->[_saw_negative_indentation_] ) {
6174 0         0 $self->[_saw_negative_indentation_] = 1;
6175 0         0 $self->warning(
6176             "Starting negative indentation\n");
6177             }
6178             }
6179              
6180             # restore previous level values
6181 4075 50       8824 if ( length($nesting_block_string) > 1 )
6182             { # true for valid script
6183 4075         5882 chop $nesting_block_string;
6184 4075         7973 $nesting_block_flag =
6185             substr( $nesting_block_string, -1 ) eq '1';
6186             }
6187              
6188             }
6189             }
6190              
6191             #-------------------------------------------------------
6192             # 2.3 Unexpected sequenced token type - shouldn't happen
6193             #-------------------------------------------------------
6194             else {
6195              
6196             # The tokenizer should only be assigning sequence numbers
6197             # to types { [ ( ? ) ] } :
6198 0         0 DEVEL_MODE && $self->Fault(<<EOM);
6199             unexpected sequence number on token type $type_i with pre-tok=$tok_i
6200             EOM
6201             }
6202              
6203             # The starting nesting block string, which is used in any .LOG
6204             # output, should include the first token of the line
6205 9122 100       18290 if ( !@levels ) {
6206 1574         2860 $nesting_block_string_0 = $nesting_block_string;
6207             }
6208              
6209             # Store values for a sequenced token
6210 9122         17077 push( @levels, $level_i );
6211 9122         17407 push( @block_type, $routput_block_type->[$i] );
6212 9122         15666 push( @type_sequence, $routput_type_sequence->[$i] );
6213 9122         19385 push( @token_type, $type_i );
6214              
6215             }
6216              
6217             }
6218              
6219             # End loop to over tokens
6220              
6221 5895         14565 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string_0;
6222              
6223             #--------------------------
6224             # Form and store the tokens
6225             #--------------------------
6226 5895 50       13742 if (@levels) {
6227              
6228 5895         8211 my $im = shift @{$routput_token_list};
  5895         11201  
6229 5895         10941 my $offset = $rtoken_map->[$im];
6230 5895         8588 foreach my $i ( @{$routput_token_list} ) {
  5895         11000  
6231 44683         58561 my $numc = $rtoken_map->[$i] - $offset;
6232 44683         80435 push( @tokens, substr( $input_line, $offset, $numc ) );
6233              
6234 44683         53844 if ( DEVEL_MODE && $numc <= 0 ) {
6235              
6236             # Should not happen unless @{$rtoken_map} is corrupted
6237             $self->Fault(
6238             "number of characters is '$numc' but should be >0\n");
6239             }
6240 44683         65556 $offset = $rtoken_map->[$i];
6241             }
6242              
6243             # Form and store the final token of this line
6244 5895         11594 my $numc = length($input_line) - $offset;
6245 5895         12837 push( @tokens, substr( $input_line, $offset, $numc ) );
6246              
6247 5895         9321 if ( DEVEL_MODE && $numc <= 0 ) {
6248             $self->Fault(
6249             "Number of Characters is '$numc' but should be >0\n");
6250             }
6251             }
6252              
6253             # This sub returns zero ci values
6254 5895         20735 my @ci_levels = (0) x scalar(@levels);
6255              
6256             #----------------------------------------------------------
6257             # Wrap up this line of tokens for shipping to the Formatter
6258             #----------------------------------------------------------
6259 5895         13743 $line_of_tokens->{_rtoken_type} = \@token_type;
6260 5895         11461 $line_of_tokens->{_rtokens} = \@tokens;
6261 5895         12090 $line_of_tokens->{_rblock_type} = \@block_type;
6262 5895         11047 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
6263 5895         18987 $line_of_tokens->{_rlevels} = \@levels;
6264 5895         11569 $line_of_tokens->{_rci_levels} = \@ci_levels;
6265              
6266 5895         16354 return;
6267             } ## end sub tokenizer_wrapup_line
6268              
6269             } ## end tokenize_this_line
6270              
6271             #######################################################################
6272             # Tokenizer routines which assist in identifying token types
6273             #######################################################################
6274              
6275             # hash lookup table of operator expected values
6276             my %op_expected_table;
6277              
6278             # exceptions to perl's weird parsing rules after type 'Z'
6279             my %is_weird_parsing_rule_exception;
6280              
6281             my %is_paren_dollar;
6282              
6283             my %is_n_v;
6284              
6285             BEGIN {
6286              
6287             # Always expecting TERM following these types:
6288             # note: this is identical to '@value_requestor_type' defined later.
6289 38     38   582 my @q = qw(
6290             ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
6291             || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
6292             &= // >> ~. &. |. ^.
6293             ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
6294             );
6295 38         161 push @q, ',';
6296 38         77 push @q, '('; # for completeness, not currently a token type
6297 38         71 push @q, '->'; # was previously in UNKNOWN
6298 38         2024 @{op_expected_table}{@q} = (TERM) x scalar(@q);
6299              
6300             # Always UNKNOWN following these types;
6301             # previously had '->' in this list for c030
6302 38         241 @q = qw( w );
6303 38         104 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
6304              
6305             # Always expecting OPERATOR ...
6306             # 'n' and 'v' are currently excluded because they might be VERSION numbers
6307             # 'i' is currently excluded because it might be a package
6308             # 'q' is currently excluded because it might be a prototype
6309             # Fix for c030: removed '->' from this list:
6310 38         128 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
6311 38         94 push @q, ')';
6312 38         229 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
6313              
6314             # Fix for git #62: added '*' and '%'
6315 38         109 @q = qw( < ? * % );
6316 38         112 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
6317              
6318 38         90 @q = qw<) $>;
6319 38         110 @{is_paren_dollar}{@q} = (1) x scalar(@q);
6320              
6321 38         87 @q = qw( n v );
6322 38         1338 @{is_n_v}{@q} = (1) x scalar(@q);
6323              
6324             } ## end BEGIN
6325              
6326 38     38   320 use constant DEBUG_OPERATOR_EXPECTED => 0;
  38         114  
  38         88134  
6327              
6328             sub operator_expected {
6329              
6330             # Returns a parameter indicating what types of tokens can occur next
6331              
6332             # Call format:
6333             # $op_expected =
6334             # $self->operator_expected( [ $prev_type, $tok, $next_type ] );
6335             # where
6336             # $prev_type is the type of the previous token (blank or not)
6337             # $tok is the current token
6338             # $next_type is the type of the next token (blank or not)
6339              
6340             # Many perl symbols have two or more meanings. For example, '<<'
6341             # can be a shift operator or a here-doc operator. The
6342             # interpretation of these symbols depends on the current state of
6343             # the tokenizer, which may either be expecting a term or an
6344             # operator. For this example, a << would be a shift if an OPERATOR
6345             # is expected, and a here-doc if a TERM is expected. This routine
6346             # is called to make this decision for any current token. It returns
6347             # one of three possible values:
6348             #
6349             # OPERATOR - operator expected (or at least, not a term)
6350             # UNKNOWN - can't tell
6351             # TERM - a term is expected (or at least, not an operator)
6352             #
6353             # The decision is based on what has been seen so far. This
6354             # information is stored in the "$last_nonblank_type" and
6355             # "$last_nonblank_token" variables. For example, if the
6356             # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
6357             # if $last_nonblank_type is 'n' (numeric), we are expecting an
6358             # OPERATOR.
6359             #
6360             # If a UNKNOWN is returned, the calling routine must guess. A major
6361             # goal of this tokenizer is to minimize the possibility of returning
6362             # UNKNOWN, because a wrong guess can spoil the formatting of a
6363             # script.
6364             #
6365             # Adding NEW_TOKENS: it is critically important that this routine be
6366             # updated to allow it to determine if an operator or term is to be
6367             # expected after the new token. Doing this simply involves adding
6368             # the new token character to one of the regexes in this routine or
6369             # to one of the hash lists
6370             # that it uses, which are initialized in the BEGIN section.
6371             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
6372             # $statement_type
6373              
6374             # When possible, token types should be selected such that we can determine
6375             # the 'operator_expected' value by a simple hash lookup. If there are
6376             # exceptions, that is an indication that a new type is needed.
6377              
6378 33045     33045 0 60074 my ( $self, $rarg ) = @_;
6379              
6380             #-------------
6381             # Table lookup
6382             #-------------
6383              
6384             # Many types are can be obtained by a table lookup given the previous type.
6385             # This typically handles half or more of the calls.
6386 33045         62265 my $op_expected = $op_expected_table{$last_nonblank_type};
6387 33045 100       62062 if ( defined($op_expected) ) {
6388 20534         25564 DEBUG_OPERATOR_EXPECTED
6389             && print STDOUT
6390             "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6391 20534         40456 return $op_expected;
6392             }
6393              
6394             #---------------------
6395             # Handle special cases
6396             #---------------------
6397              
6398 12511         18342 $op_expected = UNKNOWN;
6399 12511         16963 my ( $prev_type, $tok, $next_type ) = @{$rarg};
  12511         25718  
6400              
6401             # Types 'k', '}' and 'Z' depend on context
6402             # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context.
6403              
6404             # identifier...
6405 12511 100       39000 if ( $last_nonblank_type eq 'i' ) {
    100          
    100          
    100          
    100          
    100          
6406 4290         7486 $op_expected = OPERATOR;
6407              
6408             # TODO: it would be cleaner to make this a special type
6409             # expecting VERSION or {} after package NAMESPACE;
6410             # maybe mark these words as type 'Y'?
6411 4290 50 66     13231 if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
      66        
6412             && $statement_type =~ /^package\b/
6413             && $last_nonblank_token =~ /^package\b/ )
6414             {
6415 26         62 $op_expected = TERM;
6416             }
6417             }
6418              
6419             # keyword...
6420             elsif ( $last_nonblank_type eq 'k' ) {
6421 2629         4337 $op_expected = TERM;
6422 2629 100       12214 if ( $expecting_operator_token{$last_nonblank_token} ) {
    100          
6423 7         16 $op_expected = OPERATOR;
6424             }
6425             elsif ( $expecting_term_token{$last_nonblank_token} ) {
6426              
6427             # Exceptions from TERM:
6428              
6429             # // may follow perl functions which may be unary operators
6430             # see test file dor.t (defined or);
6431 2523 100 100     11981 if (
    50 100        
      66        
6432             $tok eq '/'
6433             && $next_type eq '/'
6434             && $is_keyword_rejecting_slash_as_pattern_delimiter{
6435             $last_nonblank_token}
6436             )
6437             {
6438 1         3 $op_expected = OPERATOR;
6439             }
6440              
6441             # Patch to allow a ? following 'split' to be a deprecated pattern
6442             # delimiter. This patch is coordinated with the omission of split
6443             # from the list
6444             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
6445             # will force perltidy to guess.
6446             elsif ($tok eq '?'
6447             && $last_nonblank_token eq 'split' )
6448             {
6449 0         0 $op_expected = UNKNOWN;
6450             }
6451             }
6452             } ## end type 'k'
6453              
6454             # closing container token...
6455              
6456             # Note that the actual token for type '}' may also be a ')'.
6457              
6458             # Also note that $last_nonblank_token is not the token corresponding to
6459             # $last_nonblank_type when the type is a closing container. In that
6460             # case it is the token before the corresponding opening container token.
6461             # So for example, for this snippet
6462             # $a = do { BLOCK } / 2;
6463             # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
6464              
6465             elsif ( $last_nonblank_type eq '}' ) {
6466 3448         5718 $op_expected = UNKNOWN;
6467              
6468             # handle something after 'do' and 'eval'
6469 3448 100 66     18814 if ( $is_block_operator{$last_nonblank_token} ) {
    100          
    100          
6470              
6471             # something like $a = do { BLOCK } / 2;
6472 82         241 $op_expected = OPERATOR; # block mode following }
6473             }
6474              
6475             # $last_nonblank_token =~ /^(\)|\$|\-\>)/
6476             elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
6477             || substr( $last_nonblank_token, 0, 2 ) eq '->' )
6478             {
6479 2021         3375 $op_expected = OPERATOR;
6480 2021 50       4952 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
  0         0  
6481             }
6482              
6483             # Check for smartmatch operator before preceding brace or square
6484             # bracket. For example, at the ? after the ] in the following
6485             # expressions we are expecting an operator:
6486             #
6487             # qr/3/ ~~ ['1234'] ? 1 : 0;
6488             # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
6489             elsif ( $last_nonblank_token eq '~~' ) {
6490 20         48 $op_expected = OPERATOR;
6491             }
6492              
6493             # A right brace here indicates the end of a simple block. All
6494             # non-structural right braces have type 'R' all braces associated with
6495             # block operator keywords have been given those keywords as
6496             # "last_nonblank_token" and caught above. (This statement is order
6497             # dependent, and must come after checking $last_nonblank_token).
6498             else {
6499              
6500             # patch for dor.t (defined or).
6501 1325 50 33     6006 if ( $tok eq '/'
    100 33        
6502             && $next_type eq '/'
6503             && $last_nonblank_token eq ']' )
6504             {
6505 0         0 $op_expected = OPERATOR;
6506             }
6507              
6508             # Patch for RT #116344: misparse a ternary operator after an
6509             # anonymous hash, like this:
6510             # return ref {} ? 1 : 0;
6511             # The right brace should really be marked type 'R' in this case,
6512             # and it is safest to return an UNKNOWN here. Expecting a TERM will
6513             # cause the '?' to always be interpreted as a pattern delimiter
6514             # rather than introducing a ternary operator.
6515             elsif ( $tok eq '?' ) {
6516 1         4 $op_expected = UNKNOWN;
6517             }
6518             else {
6519 1324         2469 $op_expected = TERM;
6520             }
6521             }
6522             } ## end type '}'
6523              
6524             # number or v-string...
6525             # An exception is for VERSION numbers a 'use' statement. It has the format
6526             # use Module VERSION LIST
6527             # We could avoid this exception by writing a special sub to parse 'use'
6528             # statements and perhaps mark these numbers with a new type V (for VERSION)
6529             ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
6530             elsif ( $is_n_v{$last_nonblank_type} ) {
6531 1916         3114 $op_expected = OPERATOR;
6532 1916 100       4350 if ( $statement_type eq 'use' ) {
6533 11         27 $op_expected = UNKNOWN;
6534             }
6535             }
6536              
6537             # quote...
6538             # TODO: labeled prototype words would better be given type 'A' or maybe
6539             # 'J'; not 'q'; or maybe mark as type 'Y'?
6540             elsif ( $last_nonblank_type eq 'q' ) {
6541 137         296 $op_expected = OPERATOR;
6542 137 50       631 if ( $last_nonblank_token eq 'prototype' ) {
    100          
6543 0         0 $op_expected = TERM;
6544             }
6545              
6546             # update for --use-feature=class (rt145706):
6547             # Look for class VERSION after possible attribute, as in
6548             # class Example::Subclass : isa(Example::Base) 1.345 { ... }
6549             elsif ( $statement_type =~ /^package\b/ ) {
6550 3         6 $op_expected = TERM;
6551             }
6552             }
6553              
6554             # file handle or similar
6555             elsif ( $last_nonblank_type eq 'Z' ) {
6556              
6557 90         193 $op_expected = UNKNOWN;
6558              
6559             # angle.t
6560 90 100 33     1199 if ( $last_nonblank_token =~ /^\w/ ) {
    50 100        
    100          
    100          
    100          
6561 2         7 $op_expected = UNKNOWN;
6562             }
6563              
6564             # Exception to weird parsing rules for 'x(' ... see case b1205:
6565             # In something like 'print $vv x(...' the x is an operator;
6566             # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
6567             # otherwise x follows the weird parsing rules.
6568             elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
6569 0         0 $op_expected = OPERATOR;
6570             }
6571              
6572             # The 'weird parsing rules' of next section do not work for '<' and '?'
6573             # It is best to mark them as unknown. Test case:
6574             # print $fh <DATA>;
6575             elsif ( $is_weird_parsing_rule_exception{$tok} ) {
6576 4         10 $op_expected = UNKNOWN;
6577             }
6578              
6579             # For possible file handle like "$a", Perl uses weird parsing rules.
6580             # For example:
6581             # print $a/2,"/hi"; - division
6582             # print $a / 2,"/hi"; - division
6583             # print $a/ 2,"/hi"; - division
6584             # print $a /2,"/hi"; - pattern (and error)!
6585             # Some examples where this logic works okay, for '&','*','+':
6586             # print $fh &xsi_protos(@mods);
6587             # my $x = new $CompressClass *FH;
6588             # print $OUT +( $count % 15 ? ", " : "\n\t" );
6589             elsif ($prev_type eq 'b'
6590             && $next_type ne 'b' )
6591             {
6592 9         15 $op_expected = TERM;
6593             }
6594              
6595             # Note that '?' and '<' have been moved above
6596             # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
6597             elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
6598              
6599             # Do not complain in 'use' statements, which have special syntax.
6600             # For example, from RT#130344:
6601             # use lib $FindBin::Bin . '/lib';
6602 9 50       34 if ( $statement_type ne 'use' ) {
6603 9         40 $self->complain(
6604             "operator in possible indirect object location not recommended\n"
6605             );
6606             }
6607 9         28 $op_expected = OPERATOR;
6608             }
6609             }
6610              
6611             # anything else...
6612             else {
6613 1         3 $op_expected = UNKNOWN;
6614             }
6615              
6616 12511         16242 DEBUG_OPERATOR_EXPECTED
6617             && print STDOUT
6618             "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6619              
6620 12511         22751 return $op_expected;
6621              
6622             } ## end sub operator_expected
6623              
6624             sub new_statement_ok {
6625              
6626             # return true if the current token can start a new statement
6627             # USES GLOBAL VARIABLES: $last_nonblank_type
6628              
6629 81   66 81 0 310 return label_ok() # a label would be ok here
6630              
6631             || $last_nonblank_type eq 'J'; # or we follow a label
6632              
6633             } ## end sub new_statement_ok
6634              
6635             sub label_ok {
6636              
6637             # Decide if a bare word followed by a colon here is a label
6638             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6639             # $brace_depth, $rbrace_type
6640              
6641             # if it follows an opening or closing code block curly brace..
6642 114 100 100 114 0 1090 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
      66        
6643             && $last_nonblank_type eq $last_nonblank_token )
6644             {
6645              
6646             # it is a label if and only if the curly encloses a code block
6647 47         294 return $rbrace_type->[$brace_depth];
6648             }
6649              
6650             # otherwise, it is a label if and only if it follows a ';' (real or fake)
6651             # or another label
6652             else {
6653 67   100     511 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
6654             }
6655             } ## end sub label_ok
6656              
6657             sub code_block_type {
6658              
6659             # Decide if this is a block of code, and its type.
6660             # Must be called only when $type = $token = '{'
6661             # The problem is to distinguish between the start of a block of code
6662             # and the start of an anonymous hash reference
6663             # Returns "" if not code block, otherwise returns 'last_nonblank_token'
6664             # to indicate the type of code block. (For example, 'last_nonblank_token'
6665             # might be 'if' for an if block, 'else' for an else block, etc).
6666             # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6667             # $last_nonblank_block_type, $brace_depth, $rbrace_type
6668              
6669             # handle case of multiple '{'s
6670              
6671             # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
6672              
6673 1296     1296 0 3300 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6674 1296 100 66     18038 if ( $last_nonblank_token eq '{'
    100 66        
    100 66        
    100 100        
    100 66        
    100 33        
    50 33        
    50          
    50          
    100          
    100          
    100          
6675             && $last_nonblank_type eq $last_nonblank_token )
6676             {
6677              
6678             # opening brace where a statement may appear is probably
6679             # a code block but might be and anonymous hash reference
6680 90 50       326 if ( $rbrace_type->[$brace_depth] ) {
6681 90         326 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6682             $max_token_index );
6683             }
6684              
6685             # cannot start a code block within an anonymous hash
6686             else {
6687 0         0 return EMPTY_STRING;
6688             }
6689             }
6690              
6691             elsif ( $last_nonblank_token eq ';' ) {
6692              
6693             # an opening brace where a statement may appear is probably
6694             # a code block but might be and anonymous hash reference
6695 48         297 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6696             $max_token_index );
6697             }
6698              
6699             # handle case of '}{'
6700             elsif ($last_nonblank_token eq '}'
6701             && $last_nonblank_type eq $last_nonblank_token )
6702             {
6703              
6704             # a } { situation ...
6705             # could be hash reference after code block..(blktype1.t)
6706 9 50       39 if ($last_nonblank_block_type) {
6707 9         33 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6708             $max_token_index );
6709             }
6710              
6711             # must be a block if it follows a closing hash reference
6712             else {
6713 0         0 return $last_nonblank_token;
6714             }
6715             }
6716              
6717             #--------------------------------------------------------------
6718             # NOTE: braces after type characters start code blocks, but for
6719             # simplicity these are not identified as such. See also
6720             # sub is_non_structural_brace.
6721             #--------------------------------------------------------------
6722              
6723             ## elsif ( $last_nonblank_type eq 't' ) {
6724             ## return $last_nonblank_token;
6725             ## }
6726              
6727             # brace after label:
6728             elsif ( $last_nonblank_type eq 'J' ) {
6729 34         117 return $last_nonblank_token;
6730             }
6731              
6732             # otherwise, look at previous token. This must be a code block if
6733             # it follows any of these:
6734             # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6735             elsif ($is_code_block_token{$last_nonblank_token}
6736             || $is_grep_alias{$last_nonblank_token} )
6737             {
6738              
6739             # Bug Patch: Note that the opening brace after the 'if' in the following
6740             # snippet is an anonymous hash ref and not a code block!
6741             # print 'hi' if { x => 1, }->{x};
6742             # We can identify this situation because the last nonblank type
6743             # will be a keyword (instead of a closing paren)
6744 476 50 33     2515 if (
      66        
6745             $last_nonblank_type eq 'k'
6746             && ( $last_nonblank_token eq 'if'
6747             || $last_nonblank_token eq 'unless' )
6748             )
6749             {
6750 0         0 return EMPTY_STRING;
6751             }
6752             else {
6753 476         1472 return $last_nonblank_token;
6754             }
6755             }
6756              
6757             # or a sub or package BLOCK
6758             elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6759             && $last_nonblank_token =~ /^(sub|package)\b/ )
6760             {
6761 293         1036 return $last_nonblank_token;
6762             }
6763              
6764             # or a sub alias
6765             elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6766             && ( $is_sub{$last_nonblank_token} ) )
6767             {
6768 0         0 return 'sub';
6769             }
6770              
6771             elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6772 0         0 return $statement_type;
6773             }
6774              
6775             # user-defined subs with block parameters (like grep/map/eval)
6776             elsif ( $last_nonblank_type eq 'G' ) {
6777 0         0 return $last_nonblank_token;
6778             }
6779              
6780             # check bareword
6781             elsif ( $last_nonblank_type eq 'w' ) {
6782              
6783             # check for syntax 'use MODULE LIST'
6784             # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6785 22 100       105 return EMPTY_STRING if ( $statement_type eq 'use' );
6786              
6787 21         110 return $self->decide_if_code_block( $i, $rtokens, $rtoken_type,
6788             $max_token_index );
6789             }
6790              
6791             # Patch for bug # RT #94338 reported by Daniel Trizen
6792             # for-loop in a parenthesized block-map triggering an error message:
6793             # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6794             # Check for a code block within a parenthesized function call
6795             elsif ( $last_nonblank_token eq '(' ) {
6796 81         213 my $paren_type = $rparen_type->[$paren_depth];
6797              
6798             # /^(map|grep|sort)$/
6799 81 100 66     419 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6800              
6801             # We will mark this as a code block but use type 't' instead
6802             # of the name of the containing function. This will allow for
6803             # correct parsing but will usually produce better formatting.
6804             # Braces with block type 't' are not broken open automatically
6805             # in the formatter as are other code block types, and this usually
6806             # works best.
6807 1         4 return 't'; # (Not $paren_type)
6808             }
6809             else {
6810 80         233 return EMPTY_STRING;
6811             }
6812             }
6813              
6814             # handle unknown syntax ') {'
6815             # we previously appended a '()' to mark this case
6816             elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6817 14         53 return $last_nonblank_token;
6818             }
6819              
6820             # anything else must be anonymous hash reference
6821             else {
6822 229         663 return EMPTY_STRING;
6823             }
6824             } ## end sub code_block_type
6825              
6826             sub decide_if_code_block {
6827              
6828             # USES GLOBAL VARIABLES: $last_nonblank_token
6829 168     168 0 450 my ( $self, $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6830              
6831 168         596 my ( $next_nonblank_token, $i_next ) =
6832             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
6833              
6834             # we are at a '{' where a statement may appear.
6835             # We must decide if this brace starts an anonymous hash or a code
6836             # block.
6837             # return "" if anonymous hash, and $last_nonblank_token otherwise
6838              
6839             # initialize to be code BLOCK
6840 168         448 my $code_block_type = $last_nonblank_token;
6841              
6842             # Check for the common case of an empty anonymous hash reference:
6843             # Maybe something like sub { { } }
6844 168 100       557 if ( $next_nonblank_token eq '}' ) {
6845 5         14 $code_block_type = EMPTY_STRING;
6846             }
6847              
6848             else {
6849              
6850             # To guess if this '{' is an anonymous hash reference, look ahead
6851             # and test as follows:
6852             #
6853             # it is a hash reference if next come:
6854             # - a string or digit followed by a comma or =>
6855             # - bareword followed by =>
6856             # otherwise it is a code block
6857             #
6858             # Examples of anonymous hash ref:
6859             # {'aa',};
6860             # {1,2}
6861             #
6862             # Examples of code blocks:
6863             # {1; print "hello\n", 1;}
6864             # {$a,1};
6865              
6866             # We are only going to look ahead one more (nonblank/comment) line.
6867             # Strange formatting could cause a bad guess, but that's unlikely.
6868 163         386 my @pre_types;
6869             my @pre_tokens;
6870              
6871             # Ignore the rest of this line if it is a side comment
6872 163 100       459 if ( $next_nonblank_token ne '#' ) {
6873 139         487 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
  139         725  
6874 139         448 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
  139         696  
6875             }
6876              
6877             # Here 20 is arbitrary but generous, and prevents wasting lots of time
6878             # in mangled files
6879 163         699 my ( $rpre_tokens, $rpre_types ) =
6880             $self->peek_ahead_for_n_nonblank_pre_tokens(20);
6881 163 100 66     601 if ( defined($rpre_types) && @{$rpre_types} ) {
  155         587  
6882 155         288 push @pre_types, @{$rpre_types};
  155         648  
6883 155         329 push @pre_tokens, @{$rpre_tokens};
  155         718  
6884             }
6885              
6886             # put a sentinel token to simplify stopping the search
6887 163         397 push @pre_types, '}';
6888 163         357 push @pre_types, '}';
6889              
6890 163         301 my $jbeg = 0;
6891 163 100       490 $jbeg = 1 if $pre_types[0] eq 'b';
6892              
6893             # first look for one of these
6894             # - bareword
6895             # - bareword with leading -
6896             # - digit
6897             # - quoted string
6898 163         288 my $j = $jbeg;
6899 163 100 33     1233 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
    100          
    100          
    50          
6900              
6901             # find the closing quote; don't worry about escapes
6902 1         3 my $quote_mark = $pre_types[$j];
6903 1         5 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6904 1 50       6 if ( $pre_types[$k] eq $quote_mark ) {
6905 1         3 $j = $k + 1;
6906             ##my $next = $pre_types[$j];
6907 1         2 last;
6908             }
6909             }
6910             }
6911             elsif ( $pre_types[$j] eq 'd' ) {
6912 8         15 $j++;
6913             }
6914             elsif ( $pre_types[$j] eq 'w' ) {
6915 71         168 $j++;
6916             }
6917             elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6918 0         0 $j++;
6919             }
6920 163 100       487 if ( $j > $jbeg ) {
6921              
6922 80 100       332 $j++ if $pre_types[$j] eq 'b';
6923              
6924             # Patched for RT #95708
6925 80 100 33     665 if (
      66        
      66        
6926              
6927             # it is a comma which is not a pattern delimiter except for qw
6928             (
6929             $pre_types[$j] eq ','
6930             ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6931             && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6932             )
6933              
6934             # or a =>
6935             || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6936             )
6937             {
6938 18         37 $code_block_type = EMPTY_STRING;
6939             }
6940             }
6941              
6942 163 100       524 if ($code_block_type) {
6943              
6944             # Patch for cases b1085 b1128: It is uncertain if this is a block.
6945             # If this brace follows a bareword, then append a space as a signal
6946             # to the formatter that this may not be a block brace. To find the
6947             # corresponding code in Formatter.pm search for 'b1085'.
6948 145 100       1181 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6949             }
6950             }
6951              
6952 168         595 return $code_block_type;
6953             } ## end sub decide_if_code_block
6954              
6955             sub report_unexpected {
6956              
6957             # report unexpected token type and show where it is
6958             # USES GLOBAL VARIABLES: (none)
6959 0     0 0 0 my ( $self, $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6960             $rpretoken_type, $input_line )
6961             = @_;
6962              
6963 0 0       0 if ( ++$self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6964 0         0 my $msg = "found $found where $expecting expected";
6965 0         0 my $pos = $rpretoken_map->[$i_tok];
6966 0         0 $self->interrupt_logfile();
6967 0         0 my $input_line_number = $self->[_last_line_number_];
6968 0         0 my ( $offset, $numbered_line, $underline ) =
6969             make_numbered_line( $input_line_number, $input_line, $pos );
6970 0         0 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6971              
6972 0         0 my $trailer = EMPTY_STRING;
6973 0 0 0     0 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6974 0         0 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6975 0         0 my $num;
6976 0 0       0 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6977 0         0 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6978             }
6979             else {
6980 0         0 $num = $pos - $pos_prev;
6981             }
6982 0 0       0 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
  0         0  
  0         0  
6983              
6984             $underline =
6985 0         0 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6986 0         0 $trailer = " (previous token underlined)";
6987             }
6988 0         0 $underline =~ s/\s+$//;
6989 0         0 $self->warning( $numbered_line . "\n" );
6990 0         0 $self->warning( $underline . "\n" );
6991 0         0 $self->warning( $msg . $trailer . "\n" );
6992 0         0 $self->resume_logfile();
6993             }
6994 0         0 return;
6995             } ## end sub report_unexpected
6996              
6997             my %is_sigil_or_paren;
6998             my %is_R_closing_sb;
6999              
7000             BEGIN {
7001              
7002 38     38   294 my @q = qw< $ & % * @ ) >;
7003 38         324 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
7004              
7005 38         173 @q = qw(R ]);
7006 38         84484 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
7007             } ## end BEGIN
7008              
7009             sub is_non_structural_brace {
7010              
7011             # Decide if a brace or bracket is structural or non-structural
7012             # by looking at the previous token and type
7013             # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
7014              
7015             # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
7016             # Tentatively deactivated because it caused the wrong operator expectation
7017             # for this code:
7018             # $user = @vars[1] / 100;
7019             # Must update sub operator_expected before re-implementing.
7020             # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
7021             # return 0;
7022             # }
7023              
7024             #--------------------------------------------------------------
7025             # NOTE: braces after type characters start code blocks, but for
7026             # simplicity these are not identified as such. See also
7027             # sub code_block_type
7028             #--------------------------------------------------------------
7029              
7030             ##if ($last_nonblank_type eq 't') {return 0}
7031              
7032             # otherwise, it is non-structural if it is decorated
7033             # by type information.
7034             # For example, the '{' here is non-structural: ${xxx}
7035             # Removed '::' to fix c074
7036             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
7037             return (
7038             ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
7039             $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
7040             || substr( $last_nonblank_token, 0, 2 ) eq '->'
7041              
7042             # or if we follow a hash or array closing curly brace or bracket
7043             # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
7044             # because the first '}' would have been given type 'R'
7045             ##|| $last_nonblank_type =~ /^([R\]])$/
7046 2253   66 2253 0 14696 || $is_R_closing_sb{$last_nonblank_type}
7047             );
7048             } ## end sub is_non_structural_brace
7049              
7050             #######################################################################
7051             # Tokenizer routines for tracking container nesting depths
7052             #######################################################################
7053              
7054             # The following routines keep track of nesting depths of the nesting
7055             # types, ( [ { and ?. This is necessary for determining the indentation
7056             # level, and also for debugging programs. Not only do they keep track of
7057             # nesting depths of the individual brace types, but they check that each
7058             # of the other brace types is balanced within matching pairs. For
7059             # example, if the program sees this sequence:
7060             #
7061             # { ( ( ) }
7062             #
7063             # then it can determine that there is an extra left paren somewhere
7064             # between the { and the }. And so on with every other possible
7065             # combination of outer and inner brace types. For another
7066             # example:
7067             #
7068             # ( [ ..... ] ] )
7069             #
7070             # which has an extra ] within the parens.
7071             #
7072             # The brace types have indexes 0 .. 3 which are indexes into
7073             # the matrices.
7074             #
7075             # The pair ? : are treated as just another nesting type, with ? acting
7076             # as the opening brace and : acting as the closing brace.
7077             #
7078             # The matrix
7079             #
7080             # $rdepth_array->[$a][$b][ $rcurrent_depth->[$a] ] = $rcurrent_depth->[$b];
7081             #
7082             # saves the nesting depth of brace type $b (where $b is either of the other
7083             # nesting types) when brace type $a enters a new depth. When this depth
7084             # decreases, a check is made that the current depth of brace types $b is
7085             # unchanged, or otherwise there must have been an error. This can
7086             # be very useful for localizing errors, particularly when perl runs to
7087             # the end of a large file (such as this one) and announces that there
7088             # is a problem somewhere.
7089             #
7090             # A numerical sequence number is maintained for every nesting type,
7091             # so that each matching pair can be uniquely identified in a simple
7092             # way.
7093              
7094             sub increase_nesting_depth {
7095 4561     4561 0 9081 my ( $self, $aa, $pos ) = @_;
7096              
7097             # USES GLOBAL VARIABLES: $rcurrent_depth,
7098             # $rcurrent_sequence_number, $rdepth_array,
7099             # $rstarting_line_of_current_depth, $statement_type
7100 4561         8055 my $cd_aa = ++$rcurrent_depth->[$aa];
7101 4561         6715 $total_depth++;
7102 4561         8791 $rtotal_depth->[$aa][$cd_aa] = $total_depth;
7103 4561         7465 my $input_line_number = $self->[_last_line_number_];
7104 4561         7598 my $input_line = $self->[_line_of_text_];
7105              
7106             # Sequence numbers increment by number of items. This keeps
7107             # a unique set of numbers but still allows the relative location
7108             # of any type to be determined.
7109              
7110             # make a new unique sequence number
7111 4561         7776 my $seqno = $next_sequence_number++;
7112              
7113 4561         8216 $rcurrent_sequence_number->[$aa][$cd_aa] = $seqno;
7114              
7115 4561         13446 $rstarting_line_of_current_depth->[$aa][$cd_aa] =
7116             [ $input_line_number, $input_line, $pos ];
7117              
7118 4561         13763 for my $bb ( 0 .. @closing_brace_names - 1 ) {
7119 18244 100       33322 next if ( $bb == $aa );
7120 13683         26753 $rdepth_array->[$aa][$bb][$cd_aa] = $rcurrent_depth->[$bb];
7121             }
7122              
7123             # set a flag for indenting a nested ternary statement
7124 4561         8333 my $indent = 0;
7125 4561 100       10328 if ( $aa == QUESTION_COLON ) {
7126 187         614 $rnested_ternary_flag->[$cd_aa] = 0;
7127 187 100       629 if ( $cd_aa > 1 ) {
7128 17 100       98 if ( $rnested_ternary_flag->[ $cd_aa - 1 ] == 0 ) {
7129 16         66 my $pdepth = $rtotal_depth->[$aa][ $cd_aa - 1 ];
7130 16 100       68 if ( $pdepth == $total_depth - 1 ) {
7131 8         15 $indent = 1;
7132 8         26 $rnested_ternary_flag->[ $cd_aa - 1 ] = -1;
7133             }
7134             }
7135             }
7136             }
7137              
7138             # Fix part #1 for git82: save last token type for propagation of type 'Z'
7139 4561         15208 $rnested_statement_type->[$aa][$cd_aa] =
7140             [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
7141 4561         7928 $statement_type = EMPTY_STRING;
7142 4561         12693 return ( $seqno, $indent );
7143             } ## end sub increase_nesting_depth
7144              
7145             sub is_balanced_closing_container {
7146              
7147             # Return true if a closing container can go here without error
7148             # Return false if not
7149 47     47 0 125 my ($aa) = @_;
7150              
7151             # cannot close if there was no opening
7152 47         95 my $cd_aa = $rcurrent_depth->[$aa];
7153 47 100       187 return unless ( $cd_aa > 0 );
7154              
7155             # check that any other brace types $bb contained within would be balanced
7156 8         29 for my $bb ( 0 .. @closing_brace_names - 1 ) {
7157 8 50       24 next if ( $bb == $aa );
7158             return
7159             unless (
7160 8 50       41 $rdepth_array->[$aa][$bb][$cd_aa] == $rcurrent_depth->[$bb] );
7161             }
7162              
7163             # OK, everything will be balanced
7164 0         0 return 1;
7165             } ## end sub is_balanced_closing_container
7166              
7167             sub decrease_nesting_depth {
7168              
7169 4561     4561 0 9104 my ( $self, $aa, $pos ) = @_;
7170              
7171             # USES GLOBAL VARIABLES: $rcurrent_depth,
7172             # $rcurrent_sequence_number, $rdepth_array, $rstarting_line_of_current_depth
7173             # $statement_type
7174 4561         7024 my $seqno = 0;
7175 4561         7249 my $input_line_number = $self->[_last_line_number_];
7176 4561         7721 my $input_line = $self->[_line_of_text_];
7177              
7178 4561         6882 my $outdent = 0;
7179 4561         6766 $total_depth--;
7180 4561         7878 my $cd_aa = $rcurrent_depth->[$aa];
7181 4561 50       9255 if ( $cd_aa > 0 ) {
7182              
7183             # set a flag for un-indenting after seeing a nested ternary statement
7184 4561         8338 $seqno = $rcurrent_sequence_number->[$aa][$cd_aa];
7185 4561 100       9757 if ( $aa == QUESTION_COLON ) {
7186 187         556 $outdent = $rnested_ternary_flag->[$cd_aa];
7187             }
7188              
7189             # Fix part #2 for git82: use saved type for propagation of type 'Z'
7190             # through type L-R braces. Perl seems to allow ${bareword}
7191             # as an indirect object, but nothing much more complex than that.
7192             ( $statement_type, my $saved_type, my $saved_token ) =
7193 4561         6634 @{ $rnested_statement_type->[$aa][ $rcurrent_depth->[$aa] ] };
  4561         12980  
7194 4561 50 100     16127 if ( $aa == BRACE
      66        
      66        
7195             && $saved_type eq 'Z'
7196             && $last_nonblank_type eq 'w'
7197             && $rbrace_structural_type->[$brace_depth] eq 'L' )
7198             {
7199 1         4 $last_nonblank_type = $saved_type;
7200             }
7201              
7202             # check that any brace types $bb contained within are balanced
7203 4561         12842 for my $bb ( 0 .. @closing_brace_names - 1 ) {
7204 18244 100       32516 next if ( $bb == $aa );
7205              
7206 13683 50       31970 unless (
7207             $rdepth_array->[$aa][$bb][$cd_aa] == $rcurrent_depth->[$bb] )
7208             {
7209 0         0 my $diff =
7210             $rcurrent_depth->[$bb] - $rdepth_array->[$aa][$bb][$cd_aa];
7211              
7212             # don't whine too many times
7213 0         0 my $saw_brace_error = $self->get_saw_brace_error();
7214 0 0 0     0 if (
      0        
7215             $saw_brace_error <= MAX_NAG_MESSAGES
7216              
7217             # if too many closing types have occurred, we probably
7218             # already caught this error
7219             && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
7220             )
7221             {
7222 0         0 $self->interrupt_logfile();
7223 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
7224 0         0 my $sl = $rsl->[0];
7225 0         0 my $rel = [ $input_line_number, $input_line, $pos ];
7226 0         0 my $el = $rel->[0];
7227 0         0 my ($ess);
7228              
7229 0 0 0     0 if ( $diff == 1 || $diff == -1 ) {
7230 0         0 $ess = EMPTY_STRING;
7231             }
7232             else {
7233 0         0 $ess = 's';
7234             }
7235 0 0       0 my $bname =
7236             ( $diff > 0 )
7237             ? $opening_brace_names[$bb]
7238             : $closing_brace_names[$bb];
7239 0         0 $self->write_error_indicator_pair( @{$rsl}, '^' );
  0         0  
7240 0         0 my $msg = <<"EOM";
7241             Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
7242             EOM
7243              
7244 0 0       0 if ( $diff > 0 ) {
7245 0         0 my $rml =
7246             $rstarting_line_of_current_depth->[$bb]
7247             [ $rcurrent_depth->[$bb] ];
7248 0         0 my $ml = $rml->[0];
7249 0         0 $msg .=
7250             " The most recent un-matched $bname is on line $ml\n";
7251 0         0 $self->write_error_indicator_pair( @{$rml}, '^' );
  0         0  
7252             }
7253 0         0 $self->write_error_indicator_pair( @{$rel}, '^' );
  0         0  
7254 0         0 $self->warning($msg);
7255 0         0 $self->resume_logfile();
7256             }
7257 0         0 $self->increment_brace_error();
7258             }
7259             }
7260 4561         8410 $rcurrent_depth->[$aa]--;
7261             }
7262             else {
7263              
7264 0         0 my $saw_brace_error = $self->get_saw_brace_error();
7265 0 0       0 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
7266 0         0 my $msg = <<"EOM";
7267             There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
7268             EOM
7269 0         0 $self->indicate_error( $msg, $input_line_number, $input_line, $pos,
7270             '^' );
7271             }
7272 0         0 $self->increment_brace_error();
7273              
7274             # keep track of errors in braces alone (ignoring ternary nesting errors)
7275 0 0       0 $self->[_true_brace_error_count_]++
7276             if ( $closing_brace_names[$aa] ne "':'" );
7277             }
7278 4561         12189 return ( $seqno, $outdent );
7279             } ## end sub decrease_nesting_depth
7280              
7281             sub check_final_nesting_depths {
7282              
7283             # USES GLOBAL VARIABLES: $rcurrent_depth, $rstarting_line_of_current_depth
7284 556     556 0 1346 my $self = shift;
7285              
7286 556         2342 for my $aa ( 0 .. @closing_brace_names - 1 ) {
7287              
7288 2224         3752 my $cd_aa = $rcurrent_depth->[$aa];
7289 2224 50       5343 if ($cd_aa) {
7290 0         0 my $rsl = $rstarting_line_of_current_depth->[$aa][$cd_aa];
7291 0         0 my $sl = $rsl->[0];
7292 0         0 my $msg = <<"EOM";
7293             Final nesting depth of $opening_brace_names[$aa]s is $cd_aa
7294             The most recent un-matched $opening_brace_names[$aa] is on line $sl
7295             EOM
7296 0         0 $self->indicate_error( $msg, @{$rsl}, '^' );
  0         0  
7297 0         0 $self->increment_brace_error();
7298             }
7299             }
7300 556         1513 return;
7301             } ## end sub check_final_nesting_depths
7302              
7303             #######################################################################
7304             # Tokenizer routines for looking ahead in input stream
7305             #######################################################################
7306              
7307             sub peek_ahead_for_n_nonblank_pre_tokens {
7308              
7309             # returns next n pretokens if they exist
7310             # returns undef's if hits eof without seeing any pretokens
7311             # USES GLOBAL VARIABLES: (none)
7312 170     170 0 454 my ( $self, $max_pretokens ) = @_;
7313 170         307 my $line;
7314 170         289 my $i = 0;
7315 170         588 my ( $rpre_tokens, $rmap, $rpre_types );
7316              
7317 170         749 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
7318 182         975 $line =~ s/^\s*//; # trim leading blanks
7319 182 100       642 next if ( length($line) <= 0 ); # skip blank
7320 176 100       601 next if ( $line =~ /^#/ ); # skip comment
7321 162         433 ( $rpre_tokens, $rmap, $rpre_types ) =
7322             pre_tokenize( $line, $max_pretokens );
7323 162         464 last;
7324             }
7325 170         656 return ( $rpre_tokens, $rpre_types );
7326             } ## end sub peek_ahead_for_n_nonblank_pre_tokens
7327              
7328             # look ahead for next non-blank, non-comment line of code
7329             sub peek_ahead_for_nonblank_token {
7330              
7331             # USES GLOBAL VARIABLES: (none)
7332 115     115 0 374 my ( $self, $rtokens, $max_token_index ) = @_;
7333 115         216 my $line;
7334 115         235 my $i = 0;
7335              
7336 115         586 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
7337 159         863 $line =~ s/^\s*//; # trim leading blanks
7338 159 100       619 next if ( length($line) <= 0 ); # skip blank
7339 134 100       551 next if ( $line =~ /^#/ ); # skip comment
7340              
7341             # Updated from 2 to 3 to get trigraphs, added for case b1175
7342 113         415 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
7343 113         423 my $j = $max_token_index + 1;
7344              
7345 113         265 foreach my $tok ( @{$rtok} ) {
  113         368  
7346 327 100       901 last if ( $tok =~ "\n" );
7347 294         740 $rtokens->[ ++$j ] = $tok;
7348             }
7349 113         452 last;
7350             }
7351 115         364 return;
7352             } ## end sub peek_ahead_for_nonblank_token
7353              
7354             #######################################################################
7355             # Tokenizer guessing routines for ambiguous situations
7356             #######################################################################
7357              
7358             sub guess_if_pattern_or_conditional {
7359              
7360             # this routine is called when we have encountered a ? following an
7361             # unknown bareword, and we must decide if it starts a pattern or not
7362             # input parameters:
7363             # $i - token index of the ? starting possible pattern
7364             # output parameters:
7365             # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
7366             # msg = a warning or diagnostic message
7367             # USES GLOBAL VARIABLES: $last_nonblank_token
7368              
7369 11     11 0 43 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
7370 11         28 my $is_pattern = 0;
7371 11         48 my $msg = "guessing that ? after $last_nonblank_token starts a ";
7372              
7373 11 50       44 if ( $i >= $max_token_index ) {
7374 0         0 $msg .= "conditional (no end to pattern found on the line)\n";
7375             }
7376             else {
7377 11         30 my $ibeg = $i;
7378 11         27 $i = $ibeg + 1;
7379 11         34 my $next_token = $rtokens->[$i]; # first token after ?
7380              
7381             # look for a possible ending ? on this line..
7382 11         35 my $in_quote = 1;
7383 11         25 my $quote_depth = 0;
7384 11         28 my $quote_character = EMPTY_STRING;
7385 11         20 my $quote_pos = 0;
7386 11         31 my $quoted_string;
7387             (
7388              
7389 11         54 $i,
7390             $in_quote,
7391             $quote_character,
7392             $quote_pos,
7393             $quote_depth,
7394             $quoted_string,
7395              
7396             ) = $self->follow_quoted_string(
7397              
7398             $ibeg,
7399             $in_quote,
7400             $rtokens,
7401             $quote_character,
7402             $quote_pos,
7403             $quote_depth,
7404             $max_token_index,
7405              
7406             );
7407              
7408 11 50       70 if ($in_quote) {
7409              
7410             # we didn't find an ending ? on this line,
7411             # so we bias towards conditional
7412 11         33 $is_pattern = 0;
7413 11         44 $msg .= "conditional (no ending ? on this line)\n";
7414              
7415             # we found an ending ?, so we bias towards a pattern
7416             }
7417             else {
7418              
7419             # Watch out for an ending ? in quotes, like this
7420             # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
7421 0         0 my $s_quote = 0;
7422 0         0 my $d_quote = 0;
7423 0         0 my $colons = 0;
7424 0         0 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
7425 0         0 my $tok = $rtokens->[$ii];
7426 0 0       0 if ( $tok eq ":" ) { $colons++ }
  0         0  
7427 0 0       0 if ( $tok eq "'" ) { $s_quote++ }
  0         0  
7428 0 0       0 if ( $tok eq '"' ) { $d_quote++ }
  0         0  
7429             }
7430 0 0 0     0 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
    0 0        
7431 0         0 $is_pattern = 0;
7432 0         0 $msg .= "found ending ? but unbalanced quote chars\n";
7433             }
7434             elsif (
7435             $self->pattern_expected( $i, $rtokens, $max_token_index ) >= 0 )
7436             {
7437 0         0 $is_pattern = 1;
7438 0         0 $msg .= "pattern (found ending ? and pattern expected)\n";
7439             }
7440             else {
7441 0         0 $msg .= "pattern (uncertain, but found ending ?)\n";
7442             }
7443             }
7444             }
7445 11         43 return ( $is_pattern, $msg );
7446             } ## end sub guess_if_pattern_or_conditional
7447              
7448             my %is_known_constant;
7449             my %is_known_function;
7450              
7451             BEGIN {
7452              
7453             # Constants like 'pi' in Trig.pm are common
7454 38     38   283 my @q = qw(pi pi2 pi4 pip2 pip4);
7455 38         297 @{is_known_constant}{@q} = (1) x scalar(@q);
7456              
7457             # parenless calls of 'ok' are common
7458 38         139 @q = qw( ok );
7459 38         67026 @{is_known_function}{@q} = (1) x scalar(@q);
7460             } ## end BEGIN
7461              
7462             sub guess_if_pattern_or_division {
7463              
7464             # this routine is called when we have encountered a / following an
7465             # unknown bareword, and we must decide if it starts a pattern or is a
7466             # division
7467             # input parameters:
7468             # $i - token index of the / starting possible pattern
7469             # output parameters:
7470             # $is_pattern = 0 if probably division, =1 if probably a pattern
7471             # msg = a warning or diagnostic message
7472             # USES GLOBAL VARIABLES: $last_nonblank_token
7473 0     0 0 0 my ( $self, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
7474 0         0 my $is_pattern = 0;
7475 0         0 my $msg = "guessing that / after $last_nonblank_token starts a ";
7476              
7477 0 0       0 if ( $i >= $max_token_index ) {
7478 0         0 $msg .= "division (no end to pattern found on the line)\n";
7479             }
7480             else {
7481 0         0 my $ibeg = $i;
7482 0         0 my $divide_possible =
7483             $self->is_possible_numerator( $i, $rtokens, $max_token_index );
7484              
7485 0 0       0 if ( $divide_possible < 0 ) {
7486 0         0 $msg = "pattern (division not possible here)\n";
7487 0         0 $is_pattern = 1;
7488 0         0 return ( $is_pattern, $msg );
7489             }
7490              
7491 0         0 $i = $ibeg + 1;
7492 0         0 my $next_token = $rtokens->[$i]; # first token after slash
7493              
7494             # One of the things we can look at is the spacing around the slash.
7495             # There # are four possible spacings around the first slash:
7496             #
7497             # return pi/two;#/; -/-
7498             # return pi/ two;#/; -/+
7499             # return pi / two;#/; +/+
7500             # return pi /two;#/; +/- <-- possible pattern
7501             #
7502             # Spacing rule: a space before the slash but not after the slash
7503             # usually indicates a pattern. We can use this to break ties.
7504              
7505 0   0     0 my $is_pattern_by_spacing =
7506             ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
7507              
7508             # look for a possible ending / on this line..
7509 0         0 my $in_quote = 1;
7510 0         0 my $quote_depth = 0;
7511 0         0 my $quote_character = EMPTY_STRING;
7512 0         0 my $quote_pos = 0;
7513 0         0 my $quoted_string;
7514             (
7515 0         0 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
7516             $quoted_string
7517             )
7518             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
7519             $quote_character, $quote_pos, $quote_depth, $max_token_index );
7520              
7521 0 0       0 if ($in_quote) {
7522              
7523             # we didn't find an ending / on this line, so we bias towards
7524             # division
7525 0 0       0 if ( $divide_possible >= 0 ) {
7526 0         0 $is_pattern = 0;
7527 0         0 $msg .= "division (no ending / on this line)\n";
7528             }
7529             else {
7530              
7531             # assuming a multi-line pattern ... this is risky, but division
7532             # does not seem possible. If this fails, it would either be due
7533             # to a syntax error in the code, or the division_expected logic
7534             # needs to be fixed.
7535 0         0 $msg = "multi-line pattern (division not possible)\n";
7536 0         0 $is_pattern = 1;
7537             }
7538             }
7539              
7540             # we found an ending /, so we bias slightly towards a pattern
7541             else {
7542              
7543 0         0 my $pattern_expected =
7544             $self->pattern_expected( $i, $rtokens, $max_token_index );
7545              
7546 0 0       0 if ( $pattern_expected >= 0 ) {
7547              
7548             # pattern looks possible...
7549 0 0       0 if ( $divide_possible >= 0 ) {
7550              
7551             # Both pattern and divide can work here...
7552              
7553             # Increase weight of divide if a pure number follows
7554 0         0 $divide_possible += $next_token =~ /^\d+$/;
7555              
7556             # Check for known constants in the numerator, like 'pi'
7557 0 0       0 if ( $is_known_constant{$last_nonblank_token} ) {
    0          
    0          
    0          
7558 0         0 $msg .=
7559             "division (pattern works too but saw known constant '$last_nonblank_token')\n";
7560 0         0 $is_pattern = 0;
7561             }
7562              
7563             # A very common bare word in pattern expressions is 'ok'
7564             elsif ( $is_known_function{$last_nonblank_token} ) {
7565 0         0 $msg .=
7566             "pattern (division works too but saw '$last_nonblank_token')\n";
7567 0         0 $is_pattern = 1;
7568             }
7569              
7570             # If one rule is more definite, use it
7571             elsif ( $divide_possible > $pattern_expected ) {
7572 0         0 $msg .=
7573             "division (more likely based on following tokens)\n";
7574 0         0 $is_pattern = 0;
7575             }
7576              
7577             # otherwise, use the spacing rule
7578             elsif ($is_pattern_by_spacing) {
7579 0         0 $msg .=
7580             "pattern (guess on spacing, but division possible too)\n";
7581 0         0 $is_pattern = 1;
7582             }
7583             else {
7584 0         0 $msg .=
7585             "division (guess on spacing, but pattern is possible too)\n";
7586 0         0 $is_pattern = 0;
7587             }
7588             }
7589              
7590             # divide_possible < 0 means divide can not work here
7591             else {
7592 0         0 $is_pattern = 1;
7593 0         0 $msg .= "pattern (division not possible)\n";
7594             }
7595             }
7596              
7597             # pattern does not look possible...
7598             else {
7599              
7600 0 0       0 if ( $divide_possible >= 0 ) {
7601 0         0 $is_pattern = 0;
7602 0         0 $msg .= "division (pattern not possible)\n";
7603             }
7604              
7605             # Neither pattern nor divide look possible...go by spacing
7606             else {
7607 0 0       0 if ($is_pattern_by_spacing) {
7608 0         0 $msg .= "pattern (guess on spacing)\n";
7609 0         0 $is_pattern = 1;
7610             }
7611             else {
7612 0         0 $msg .= "division (guess on spacing)\n";
7613 0         0 $is_pattern = 0;
7614             }
7615             }
7616             }
7617             }
7618             }
7619 0         0 return ( $is_pattern, $msg );
7620             } ## end sub guess_if_pattern_or_division
7621              
7622             # try to resolve here-doc vs. shift by looking ahead for
7623             # non-code or the end token (currently only looks for end token)
7624             # returns 1 if it is probably a here doc, 0 if not
7625             sub guess_if_here_doc {
7626              
7627 0     0 0 0 my ( $self, $next_token ) = @_;
7628              
7629             # This is how many lines we will search for a target as part of the
7630             # guessing strategy. It is a constant because there is probably
7631             # little reason to change it.
7632             # USES GLOBAL VARIABLES: $current_package $ris_constant,
7633 0         0 my $HERE_DOC_WINDOW = 40;
7634              
7635 0         0 my $here_doc_expected = 0;
7636 0         0 my $line;
7637 0         0 my $k = 0;
7638 0         0 my $msg = "checking <<";
7639              
7640 0         0 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $k++ ) ) {
7641 0         0 chomp $line;
7642              
7643 0 0       0 if ( $line =~ /^$next_token$/ ) {
7644 0         0 $msg .= " -- found target $next_token ahead $k lines\n";
7645 0         0 $here_doc_expected = 1; # got it
7646 0         0 last;
7647             }
7648 0 0       0 last if ( $k >= $HERE_DOC_WINDOW );
7649             }
7650              
7651 0 0       0 unless ($here_doc_expected) {
7652              
7653 0 0       0 if ( !defined($line) ) {
7654 0         0 $here_doc_expected = -1; # hit eof without seeing target
7655 0         0 $msg .= " -- must be shift; target $next_token not in file\n";
7656              
7657             }
7658             else { # still unsure..taking a wild guess
7659              
7660 0 0       0 if ( !$ris_constant->{$current_package}{$next_token} ) {
7661 0         0 $here_doc_expected = 1;
7662 0         0 $msg .=
7663             " -- guessing it's a here-doc ($next_token not a constant)\n";
7664             }
7665             else {
7666 0         0 $msg .=
7667             " -- guessing it's a shift ($next_token is a constant)\n";
7668             }
7669             }
7670             }
7671 0         0 $self->write_logfile_entry($msg);
7672 0         0 return $here_doc_expected;
7673             } ## end sub guess_if_here_doc
7674              
7675             #######################################################################
7676             # Tokenizer Routines for scanning identifiers and related items
7677             #######################################################################
7678              
7679             sub scan_bare_identifier_do {
7680              
7681             # this routine is called to scan a token starting with an alphanumeric
7682             # variable or package separator, :: or '.
7683             # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
7684             # $last_nonblank_type, $rparen_type, $paren_depth
7685              
7686 1672     1672 0 6003 my ( $self, $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7687             $max_token_index )
7688             = @_;
7689 1672         2654 my $i_begin = $i;
7690 1672         2806 my $package = undef;
7691              
7692 1672         2487 my $i_beg = $i;
7693              
7694             # we have to back up one pretoken at a :: since each : is one pretoken
7695 1672 100       4142 if ( $tok eq '::' ) { $i_beg-- }
  9         19  
7696 1672 50       3817 if ( $tok eq '->' ) { $i_beg-- }
  0         0  
7697 1672         2995 my $pos_beg = $rtoken_map->[$i_beg];
7698 1672         4921 pos($input_line) = $pos_beg;
7699              
7700             # Examples:
7701             # A::B::C
7702             # A::
7703             # ::A
7704             # A'B
7705 1672 50       12484 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7706              
7707 1672         3345 my $pos = pos($input_line);
7708 1672         2842 my $numc = $pos - $pos_beg;
7709 1672         3597 $tok = substr( $input_line, $pos_beg, $numc );
7710              
7711             # type 'w' includes anything without leading type info
7712             # ($,%,@,*) including something like abc::def::ghi
7713 1672         2837 $type = 'w';
7714              
7715 1672         2784 my $sub_name = EMPTY_STRING;
7716 1672 100       4756 if ( defined($2) ) { $sub_name = $2; }
  1667         3264  
7717 1672 100       4212 if ( defined($1) ) {
7718 274         639 $package = $1;
7719              
7720             # patch: don't allow isolated package name which just ends
7721             # in the old style package separator (single quote). Example:
7722             # use CGI':all';
7723 274 50 66     1097 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7724 0         0 $pos--;
7725             }
7726              
7727 274         783 $package =~ s/\'/::/g;
7728 274 100       828 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  9         23  
7729 274         1224 $package =~ s/::$//;
7730             }
7731             else {
7732 1398         2695 $package = $current_package;
7733              
7734             # patched for c043, part 1: keyword does not follow '->'
7735 1398 50 66     5374 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7736 0         0 $type = 'k';
7737             }
7738             }
7739              
7740             # if it is a bareword.. patched for c043, part 2: not following '->'
7741 1672 100 66     8080 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7742              
7743             # check for v-string with leading 'v' type character
7744             # (This seems to have precedence over filehandle, type 'Y')
7745 1003 100 66     14264 if ( $tok =~ /^v\d[_\d]*$/ ) {
    100 66        
    100 66        
    50          
    50          
    100          
    100          
7746              
7747             # we only have the first part - something like 'v101' -
7748             # look for more
7749 2 50       16 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7750 2         8 $pos = pos($input_line);
7751 2         4 $numc = $pos - $pos_beg;
7752 2         5 $tok = substr( $input_line, $pos_beg, $numc );
7753             }
7754 2         8 $type = 'v';
7755              
7756             # warn if this version can't handle v-strings
7757 2         16 $self->report_v_string($tok);
7758             }
7759              
7760             elsif ( $ris_constant->{$package}{$sub_name} ) {
7761 12         39 $type = 'C';
7762             }
7763              
7764             # bareword after sort has implied empty prototype; for example:
7765             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7766             # This has priority over whatever the user has specified.
7767             elsif ($last_nonblank_token eq 'sort'
7768             && $last_nonblank_type eq 'k' )
7769             {
7770 1         3 $type = 'Z';
7771             }
7772              
7773             # Note: strangely, perl does not seem to really let you create
7774             # functions which act like eval and do, in the sense that eval
7775             # and do may have operators following the final }, but any operators
7776             # that you create with prototype (&) apparently do not allow
7777             # trailing operators, only terms. This seems strange.
7778             # If this ever changes, here is the update
7779             # to make perltidy behave accordingly:
7780              
7781             # elsif ( $ris_block_function->{$package}{$tok} ) {
7782             # $tok='eval'; # patch to do braces like eval - doesn't work
7783             # $type = 'k';
7784             #}
7785             # TODO: This could become a separate type to allow for different
7786             # future behavior:
7787             elsif ( $ris_block_function->{$package}{$sub_name} ) {
7788 0         0 $type = 'G';
7789             }
7790             elsif ( $ris_block_list_function->{$package}{$sub_name} ) {
7791 0         0 $type = 'G';
7792             }
7793             elsif ( $ris_user_function->{$package}{$sub_name} ) {
7794 6         14 $type = 'U';
7795 6         433 $prototype = $ruser_function_prototype->{$package}{$sub_name};
7796             }
7797              
7798             # check for indirect object
7799             elsif (
7800              
7801             # added 2001-03-27: must not be followed immediately by '('
7802             # see fhandle.t
7803             ( $input_line !~ m/\G\(/gc )
7804              
7805             # and
7806             && (
7807              
7808             # preceded by keyword like 'print', 'printf' and friends
7809             $is_indirect_object_taker{$last_nonblank_token}
7810              
7811             # or preceded by something like 'print(' or 'printf('
7812             || (
7813             ( $last_nonblank_token eq '(' )
7814             && $is_indirect_object_taker{
7815             $rparen_type->[$paren_depth]
7816             }
7817              
7818             )
7819             )
7820             )
7821             {
7822              
7823             # may not be indirect object unless followed by a space;
7824             # updated 2021-01-16 to consider newline to be a space.
7825             # updated for case b990 to look for either ';' or space
7826 4 50 33     63 if ( pos($input_line) == length($input_line)
7827             || $input_line =~ m/\G[;\s]/gc )
7828             {
7829 4         14 $type = 'Y';
7830              
7831             # Abandon Hope ...
7832             # Perl's indirect object notation is a very bad
7833             # thing and can cause subtle bugs, especially for
7834             # beginning programmers. And I haven't even been
7835             # able to figure out a sane warning scheme which
7836             # doesn't get in the way of good scripts.
7837              
7838             # Complain if a filehandle has any lower case
7839             # letters. This is suggested good practice.
7840             # Use 'sub_name' because something like
7841             # main::MYHANDLE is ok for filehandle
7842 4 100       23 if ( $sub_name =~ /[a-z]/ ) {
7843              
7844             # could be bug caused by older perltidy if
7845             # followed by '('
7846 1 50       12 if ( $input_line =~ m/\G\s*\(/gc ) {
7847 1         9 $self->complain(
7848             "Caution: unknown word '$tok' in indirect object slot\n"
7849             );
7850             }
7851             }
7852             }
7853              
7854             # bareword not followed by a space -- may not be filehandle
7855             # (may be function call defined in a 'use' statement)
7856             else {
7857 0         0 $type = 'Z';
7858             }
7859             }
7860             }
7861              
7862             # Now we must convert back from character position
7863             # to pre_token index.
7864             # I don't think an error flag can occur here ..but who knows
7865 1672         2854 my $error;
7866 1672         4811 ( $i, $error ) =
7867             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7868 1672 50       4629 if ($error) {
7869 0         0 $self->warning(
7870             "scan_bare_identifier: Possibly invalid tokenization\n");
7871             }
7872             }
7873              
7874             # no match but line not blank - could be syntax error
7875             # perl will take '::' alone without complaint
7876             else {
7877 0         0 $type = 'w';
7878              
7879             # change this warning to log message if it becomes annoying
7880 0         0 $self->warning("didn't find identifier after leading ::\n");
7881             }
7882 1672         7089 return ( $i, $tok, $type, $prototype );
7883             } ## end sub scan_bare_identifier_do
7884              
7885             sub scan_id_do {
7886              
7887             # This is the new scanner and will eventually replace scan_identifier.
7888             # Only type 'sub' and 'package' are implemented.
7889             # Token types $ * % @ & -> are not yet implemented.
7890             #
7891             # Scan identifier following a type token.
7892             # The type of call depends on $id_scan_state: $id_scan_state = ''
7893             # for starting call, in which case $tok must be the token defining
7894             # the type.
7895             #
7896             # If the type token is the last nonblank token on the line, a value
7897             # of $id_scan_state = $tok is returned, indicating that further
7898             # calls must be made to get the identifier. If the type token is
7899             # not the last nonblank token on the line, the identifier is
7900             # scanned and handled and a value of '' is returned.
7901              
7902 330     330 0 1189 my ( $self, $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7903             $max_token_index )
7904             = @_;
7905 38     38   410 use constant DEBUG_NSCAN => 0;
  38         133  
  38         46761  
7906 330         673 my $type = EMPTY_STRING;
7907 330         669 my ( $i_beg, $pos_beg );
7908              
7909             #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7910             #my ($a,$b,$c) = caller;
7911             #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7912              
7913             # on re-entry, start scanning at first token on the line
7914 330 100       825 if ($id_scan_state) {
7915 10         39 $i_beg = $i;
7916 10         40 $type = EMPTY_STRING;
7917             }
7918              
7919             # on initial entry, start scanning just after type token
7920             else {
7921 320         594 $i_beg = $i + 1;
7922 320         547 $id_scan_state = $tok;
7923 320         693 $type = 't';
7924             }
7925              
7926             # find $i_beg = index of next nonblank token,
7927             # and handle empty lines
7928 330         573 my $blank_line = 0;
7929 330         736 my $next_nonblank_token = $rtokens->[$i_beg];
7930 330 100       950 if ( $i_beg > $max_token_index ) {
7931 2         5 $blank_line = 1;
7932             }
7933             else {
7934              
7935             # only a '#' immediately after a '$' is not a comment
7936 328 50       1004 if ( $next_nonblank_token eq '#' ) {
7937 0 0       0 unless ( $tok eq '$' ) {
7938 0         0 $blank_line = 1;
7939             }
7940             }
7941              
7942 328 100       1545 if ( $next_nonblank_token =~ /^\s/ ) {
7943 308         1152 ( $next_nonblank_token, $i_beg ) =
7944             find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7945             $max_token_index );
7946 308 100       1714 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7947 4         16 $blank_line = 1;
7948             }
7949             }
7950             }
7951              
7952             # handle non-blank line; identifier, if any, must follow
7953 330 100       1006 unless ($blank_line) {
7954              
7955 324 100       906 if ( $is_sub{$id_scan_state} ) {
    50          
7956 298         3324 ( $i, $tok, $type, $id_scan_state ) = $self->do_scan_sub(
7957             {
7958             input_line => $input_line,
7959             i => $i,
7960             i_beg => $i_beg,
7961             tok => $tok,
7962             type => $type,
7963             rtokens => $rtokens,
7964             rtoken_map => $rtoken_map,
7965             id_scan_state => $id_scan_state,
7966             max_token_index => $max_token_index,
7967             }
7968             );
7969             }
7970              
7971             elsif ( $is_package{$id_scan_state} ) {
7972 26         104 ( $i, $tok, $type ) =
7973             $self->do_scan_package( $input_line, $i, $i_beg, $tok, $type,
7974             $rtokens, $rtoken_map, $max_token_index );
7975 26         61 $id_scan_state = EMPTY_STRING;
7976             }
7977              
7978             else {
7979 0         0 $self->warning("invalid token in scan_id: $tok\n");
7980 0         0 $id_scan_state = EMPTY_STRING;
7981             }
7982             }
7983              
7984 330 50 33     1964 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
      66        
7985              
7986             # shouldn't happen:
7987 0         0 if (DEVEL_MODE) {
7988             $self->Fault(<<EOM);
7989             Program bug in scan_id: undefined type but scan_state=$id_scan_state
7990             EOM
7991             }
7992             $self->warning(
7993 0         0 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7994             );
7995 0         0 $self->report_definite_bug();
7996             }
7997              
7998 330         545 DEBUG_NSCAN && do {
7999             print STDOUT
8000             "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
8001             };
8002 330         1414 return ( $i, $tok, $type, $id_scan_state );
8003             } ## end sub scan_id_do
8004              
8005             sub check_prototype {
8006 136     136 0 434 my ( $proto, $package, $subname ) = @_;
8007 136 50 33     794 return unless ( defined($package) && defined($subname) );
8008 136 100       423 if ( defined($proto) ) {
8009 34         202 $proto =~ s/^\s*\(\s*//;
8010 34         157 $proto =~ s/\s*\)$//;
8011 34 100       85 if ($proto) {
8012 5         24 $ris_user_function->{$package}{$subname} = 1;
8013 5         26 $ruser_function_prototype->{$package}{$subname} = "($proto)";
8014              
8015             # prototypes containing '&' must be treated specially..
8016 5 100       23 if ( $proto =~ /\&/ ) {
8017              
8018             # right curly braces of prototypes ending in
8019             # '&' may be followed by an operator
8020 1 50       10 if ( $proto =~ /\&$/ ) {
    50          
8021 0         0 $ris_block_function->{$package}{$subname} = 1;
8022             }
8023              
8024             # right curly braces of prototypes NOT ending in
8025             # '&' may NOT be followed by an operator
8026             elsif ( $proto !~ /\&$/ ) {
8027 1         4 $ris_block_list_function->{$package}{$subname} = 1;
8028             }
8029             }
8030             }
8031             else {
8032 29         113 $ris_constant->{$package}{$subname} = 1;
8033             }
8034             }
8035             else {
8036 102         325 $ris_user_function->{$package}{$subname} = 1;
8037             }
8038 136         304 return;
8039             } ## end sub check_prototype
8040              
8041             sub do_scan_package {
8042              
8043             # do_scan_package parses a package name
8044             # it is called with $i_beg equal to the index of the first nonblank
8045             # token following a 'package' token.
8046             # USES GLOBAL VARIABLES: $current_package,
8047              
8048             # package NAMESPACE
8049             # package NAMESPACE VERSION
8050             # package NAMESPACE BLOCK
8051             # package NAMESPACE VERSION BLOCK
8052             #
8053             # If VERSION is provided, package sets the $VERSION variable in the given
8054             # namespace to a version object with the VERSION provided. VERSION must be
8055             # a "strict" style version number as defined by the version module: a
8056             # positive decimal number (integer or decimal-fraction) without
8057             # exponentiation or else a dotted-decimal v-string with a leading 'v'
8058             # character and at least three components.
8059             # reference http://perldoc.perl.org/functions/package.html
8060              
8061             my (
8062 26     26 0 90 $self, $input_line, $i,
8063             $i_beg, $tok, $type,
8064             $rtokens, $rtoken_map, $max_token_index
8065             ) = @_;
8066 26         52 my $package = undef;
8067 26         52 my $pos_beg = $rtoken_map->[$i_beg];
8068 26         87 pos($input_line) = $pos_beg;
8069              
8070             # handle non-blank line; package name, if any, must follow
8071 26 50       221 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
8072 26         69 $package = $1;
8073 26 50 33     157 $package = ( defined($1) && $1 ) ? $1 : 'main';
8074 26         75 $package =~ s/\'/::/g;
8075 26 50       82 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
8076 26         58 $package =~ s/::$//;
8077 26         56 my $pos = pos($input_line);
8078 26         57 my $numc = $pos - $pos_beg;
8079 26         82 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
8080 26         55 $type = 'i';
8081              
8082             # Now we must convert back from character position
8083             # to pre_token index.
8084             # I don't think an error flag can occur here ..but ?
8085 26         44 my $error;
8086 26         97 ( $i, $error ) =
8087             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8088 26 50       87 if ($error) { $self->warning("Possibly invalid package\n") }
  0         0  
8089 26         70 $current_package = $package;
8090              
8091             # we should now have package NAMESPACE
8092             # now expecting VERSION, BLOCK, or ; to follow ...
8093             # package NAMESPACE VERSION
8094             # package NAMESPACE BLOCK
8095             # package NAMESPACE VERSION BLOCK
8096 26         81 my ( $next_nonblank_token, $i_next ) =
8097             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
8098              
8099             # check that something recognizable follows, but do not parse.
8100             # A VERSION number will be parsed later as a number or v-string in the
8101             # normal way. What is important is to set the statement type if
8102             # everything looks okay so that the operator_expected() routine
8103             # knows that the number is in a package statement.
8104             # Examples of valid primitive tokens that might follow are:
8105             # 1235 . ; { } v3 v
8106             # FIX: added a '#' since a side comment may also follow
8107             # Added ':' for class attributes (for --use-feature=class, rt145706)
8108 26 50       120 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#\:])|v\d|\d+$/ ) {
8109 26         72 $statement_type = $tok;
8110             }
8111             else {
8112 0         0 $self->warning(
8113             "Unexpected '$next_nonblank_token' after package name '$tok'\n"
8114             );
8115             }
8116             }
8117              
8118             # no match but line not blank --
8119             # could be a label with name package, like package: , for example.
8120             else {
8121 0         0 $type = 'k';
8122             }
8123              
8124 26         103 return ( $i, $tok, $type );
8125             } ## end sub do_scan_package
8126              
8127             { ## begin closure for sub scan_complex_identifier
8128              
8129 38     38   371 use constant DEBUG_SCAN_ID => 0;
  38         141  
  38         5293  
8130              
8131             # Constant hash:
8132             my %is_special_variable_char;
8133              
8134             BEGIN {
8135              
8136             # These are the only characters which can (currently) form special
8137             # variables, like $^W: (issue c066).
8138 38     38   369 my @q =
8139             qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
8140 38         137681 @{is_special_variable_char}{@q} = (1) x scalar(@q);
8141             } ## end BEGIN
8142              
8143             # These are the possible states for this scanner:
8144             my $scan_state_SIGIL = '$';
8145             my $scan_state_ALPHA = 'A';
8146             my $scan_state_COLON = ':';
8147             my $scan_state_LPAREN = '(';
8148             my $scan_state_RPAREN = ')';
8149             my $scan_state_AMPERSAND = '&';
8150             my $scan_state_SPLIT = '^';
8151              
8152             # Only these non-blank states may be returned to caller:
8153             my %is_returnable_scan_state = (
8154             $scan_state_SIGIL => 1,
8155             $scan_state_AMPERSAND => 1,
8156             );
8157              
8158             # USES GLOBAL VARIABLES:
8159             # $context, $last_nonblank_token, $last_nonblank_type
8160              
8161             #-----------
8162             # call args:
8163             #-----------
8164             my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
8165             $expecting, $container_type );
8166              
8167             #-------------------------------------------
8168             # my variables, re-initialized on each call:
8169             #-------------------------------------------
8170             my $i_begin; # starting index $i
8171             my $type; # returned identifier type
8172             my $tok_begin; # starting token
8173             my $tok; # returned token
8174             my $id_scan_state_begin; # starting scan state
8175             my $identifier_begin; # starting identifier
8176             my $i_save; # a last good index, in case of error
8177             my $message; # hold error message for log file
8178             my $tok_is_blank;
8179             my $last_tok_is_blank;
8180             my $in_prototype_or_signature;
8181             my $saw_alpha;
8182             my $saw_type;
8183             my $allow_tick;
8184              
8185             sub initialize_my_scan_id_vars {
8186              
8187             # Initialize all 'my' vars on entry
8188 486     486 0 883 $i_begin = $i;
8189 486         871 $type = EMPTY_STRING;
8190 486         995 $tok_begin = $rtokens->[$i_begin];
8191 486         854 $tok = $tok_begin;
8192 486 50       1369 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
  0         0  
8193 486         890 $id_scan_state_begin = $id_scan_state;
8194 486         812 $identifier_begin = $identifier;
8195 486         816 $i_save = undef;
8196              
8197 486         888 $message = EMPTY_STRING;
8198 486         864 $tok_is_blank = undef; # a flag to speed things up
8199 486         802 $last_tok_is_blank = undef;
8200              
8201 486   100     1702 $in_prototype_or_signature =
8202             $container_type && $container_type =~ /^sub\b/;
8203              
8204             # these flags will be used to help figure out the type:
8205 486         794 $saw_alpha = undef;
8206 486         762 $saw_type = undef;
8207              
8208             # allow old package separator (') except in 'use' statement
8209 486         904 $allow_tick = ( $last_nonblank_token ne 'use' );
8210 486         842 return;
8211             } ## end sub initialize_my_scan_id_vars
8212              
8213             #----------------------------------
8214             # Routines for handling scan states
8215             #----------------------------------
8216             sub do_id_scan_state_dollar {
8217              
8218 514     514 0 889 my $self = shift;
8219              
8220             # We saw a sigil, now looking to start a variable name
8221 514 100 66     4149 if ( $tok eq '$' ) {
    100 33        
    100          
    50          
    50          
    100          
    100          
    100          
    100          
8222              
8223 31         144 $identifier .= $tok;
8224              
8225             # we've got a punctuation variable if end of line (punct.t)
8226 31 50       161 if ( $i == $max_token_index ) {
8227 0         0 $type = 'i';
8228 0         0 $id_scan_state = EMPTY_STRING;
8229             }
8230             }
8231             elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
8232 253         489 $saw_alpha = 1;
8233 253         495 $identifier .= $tok;
8234              
8235             # now need :: except for special digit vars like '$1' (c208)
8236 253 100       862 $id_scan_state = $tok =~ /^\d/ ? EMPTY_STRING : $scan_state_COLON;
8237             }
8238             elsif ( $tok eq '::' ) {
8239 16         92 $id_scan_state = $scan_state_ALPHA;
8240 16         61 $identifier .= $tok;
8241             }
8242              
8243             # POSTDEFREF ->@ ->% ->& ->*
8244             elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
8245 0         0 $identifier .= $tok;
8246             }
8247             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
8248 0         0 $saw_alpha = 1;
8249 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8250 0         0 $identifier .= $tok;
8251              
8252             # Perl will accept leading digits in identifiers,
8253             # although they may not always produce useful results.
8254             # Something like $main::0 is ok. But this also works:
8255             #
8256             # sub howdy::123::bubba{ print "bubba $54321!\n" }
8257             # howdy::123::bubba();
8258             #
8259             }
8260             elsif ( $tok eq '#' ) {
8261              
8262 99         256 my $is_punct_var = $identifier eq '$$';
8263              
8264             # side comment or identifier?
8265 99 100 66     2034 if (
      66        
      66        
      33        
8266              
8267             # A '#' starts a comment if it follows a space. For example,
8268             # the following is equivalent to $ans=40.
8269             # my $ #
8270             # ans = 40;
8271             !$last_tok_is_blank
8272              
8273             # a # inside a prototype or signature can only start a
8274             # comment
8275             && !$in_prototype_or_signature
8276              
8277             # these are valid punctuation vars: *# %# @# $#
8278             # May also be '$#array' or POSTDEFREF ->$#
8279             && ( $identifier =~ /^[\%\@\$\*]$/
8280             || $identifier =~ /\$$/ )
8281              
8282             # but a '#' after '$$' is a side comment; see c147
8283             && !$is_punct_var
8284              
8285             )
8286             {
8287 95         275 $identifier .= $tok; # keep same state, a $ could follow
8288             }
8289             else {
8290              
8291             # otherwise it is a side comment
8292 4 50       21 if ( $identifier eq '->' ) { }
    50          
    50          
8293 0         0 elsif ($is_punct_var) { $type = 'i' }
8294 4         6 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
8295 0         0 else { $type = 'i' }
8296 4         7 $i = $i_save;
8297 4         10 $id_scan_state = EMPTY_STRING;
8298             }
8299             }
8300              
8301             elsif ( $tok eq '{' ) {
8302              
8303             # check for something like ${#} or ${?}, where ? is a special char
8304 38 100 100     539 if (
      66        
      100        
      100        
8305             (
8306             $identifier eq '$'
8307             || $identifier eq '@'
8308             || $identifier eq '$#'
8309             )
8310             && $i + 2 <= $max_token_index
8311             && $rtokens->[ $i + 2 ] eq '}'
8312             && $rtokens->[ $i + 1 ] !~ /[\s\w]/
8313             )
8314             {
8315 1         4 my $next2 = $rtokens->[ $i + 2 ];
8316 1         3 my $next1 = $rtokens->[ $i + 1 ];
8317 1         3 $identifier .= $tok . $next1 . $next2;
8318 1         3 $i += 2;
8319 1         2 $id_scan_state = EMPTY_STRING;
8320             }
8321             else {
8322              
8323             # skip something like ${xxx} or ->{
8324 37         92 $id_scan_state = EMPTY_STRING;
8325              
8326             # if this is the first token of a line, any tokens for this
8327             # identifier have already been accumulated
8328 37 100 66     180 if ( $identifier eq '$' || $i == 0 ) {
8329 26         52 $identifier = EMPTY_STRING;
8330             }
8331 37         80 $i = $i_save;
8332             }
8333             }
8334              
8335             # space ok after leading $ % * & @
8336             elsif ( $tok =~ /^\s*$/ ) {
8337              
8338 20         65 $tok_is_blank = 1;
8339              
8340             # note: an id with a leading '&' does not actually come this way
8341 20 50       106 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
    0          
8342              
8343 20 100       70 if ( length($identifier) > 1 ) {
8344 8         17 $id_scan_state = EMPTY_STRING;
8345 8         18 $i = $i_save;
8346 8         18 $type = 'i'; # probably punctuation variable
8347             }
8348             else {
8349              
8350             # fix c139: trim line-ending type 't'
8351 12 100       59 if ( $i == $max_token_index ) {
    100          
8352 1         3 $i = $i_save;
8353 1         3 $type = 't';
8354             }
8355              
8356             # spaces after $'s are common, and space after @
8357             # is harmless, so only complain about space
8358             # after other type characters. Space after $ and
8359             # @ will be removed in formatting. Report space
8360             # after % and * because they might indicate a
8361             # parsing error. In other words '% ' might be a
8362             # modulo operator. Delete this warning if it
8363             # gets annoying.
8364             elsif ( $identifier !~ /^[\@\$]$/ ) {
8365 1         5 $message =
8366             "Space in identifier, following $identifier\n";
8367             }
8368             else {
8369             ## ok: silently accept space after '$' and '@' sigils
8370             }
8371             }
8372             }
8373              
8374             elsif ( $identifier eq '->' ) {
8375              
8376             # space after '->' is ok except at line end ..
8377             # so trim line-ending in type '->' (fixes c139)
8378 0 0       0 if ( $i == $max_token_index ) {
8379 0         0 $i = $i_save;
8380 0         0 $type = '->';
8381             }
8382             }
8383              
8384             # stop at space after something other than -> or sigil
8385             # Example of what can arrive here:
8386             # eval { $MyClass->$$ };
8387             else {
8388 0         0 $id_scan_state = EMPTY_STRING;
8389 0         0 $i = $i_save;
8390 0         0 $type = 'i';
8391             }
8392             }
8393             elsif ( $tok eq '^' ) {
8394              
8395             # check for some special variables like $^ $^W
8396 11 50       47 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
8397 11         34 $identifier .= $tok;
8398 11         25 $type = 'i';
8399              
8400             # There may be one more character, not a space, after the ^
8401 11         31 my $next1 = $rtokens->[ $i + 1 ];
8402 11         28 my $chr = substr( $next1, 0, 1 );
8403 11 100       47 if ( $is_special_variable_char{$chr} ) {
8404              
8405             # It is something like $^W
8406             # Test case (c066) : $^Oeq'linux'
8407 9         20 $i++;
8408 9         19 $identifier .= $next1;
8409              
8410             # If pretoken $next1 is more than one character long,
8411             # set a flag indicating that it needs to be split.
8412 9 100       39 $id_scan_state =
8413             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8414             }
8415             else {
8416              
8417             # it is just $^
8418             # Simple test case (c065): '$aa=$^if($bb)';
8419 2         4 $id_scan_state = EMPTY_STRING;
8420             }
8421             }
8422             else {
8423 0         0 $id_scan_state = EMPTY_STRING;
8424 0         0 $i = $i_save;
8425             }
8426             }
8427             else { # something else
8428              
8429 46 100 66     428 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
    100 66        
    100          
    50          
    0          
    0          
8430              
8431             # We might be in an extrusion of
8432             # sub foo2 ( $first, $, $third ) {
8433             # looking at a line starting with a comma, like
8434             # $
8435             # ,
8436             # in this case the comma ends the signature variable
8437             # '$' which will have been previously marked type 't'
8438             # rather than 'i'.
8439 3 100       10 if ( $i == $i_begin ) {
8440 1         10 $identifier = EMPTY_STRING;
8441 1         4 $type = EMPTY_STRING;
8442             }
8443              
8444             # at a # we have to mark as type 't' because more may
8445             # follow, otherwise, in a signature we can let '$' be an
8446             # identifier here for better formatting.
8447             # See 'mangle4.in' for a test case.
8448             else {
8449 2         4 $type = 'i';
8450 2 50 33     12 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
8451 0         0 $type = 't';
8452             }
8453 2         6 $i = $i_save;
8454             }
8455 3         5 $id_scan_state = EMPTY_STRING;
8456             }
8457              
8458             # check for various punctuation variables
8459             elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
8460 35         117 $identifier .= $tok;
8461             }
8462              
8463             # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
8464             elsif ($tok eq '*'
8465             && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
8466             {
8467 6         13 $identifier .= $tok;
8468             }
8469              
8470             elsif ( $identifier eq '$#' ) {
8471              
8472 2 50       12 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
  0 50       0  
  0         0  
8473              
8474             # perl seems to allow just these: $#: $#- $#+
8475             elsif ( $tok =~ /^[\:\-\+]$/ ) {
8476 0         0 $type = 'i';
8477 0         0 $identifier .= $tok;
8478             }
8479             else {
8480 2         6 $i = $i_save;
8481 2         28 $self->write_logfile_entry(
8482             'Use of $# is deprecated' . "\n" );
8483             }
8484             }
8485             elsif ( $identifier eq '$$' ) {
8486              
8487             # perl does not allow references to punctuation
8488             # variables without braces. For example, this
8489             # won't work:
8490             # $:=\4;
8491             # $a = $$:;
8492             # You would have to use
8493             # $a = ${$:};
8494              
8495             # '$$' alone is punctuation variable for PID
8496 0         0 $i = $i_save;
8497 0 0       0 if ( $tok eq '{' ) { $type = 't' }
  0         0  
8498 0         0 else { $type = 'i' }
8499             }
8500             elsif ( $identifier eq '->' ) {
8501 0         0 $i = $i_save;
8502             }
8503             else {
8504 0         0 $i = $i_save;
8505 0 0       0 if ( length($identifier) == 1 ) {
8506 0         0 $identifier = EMPTY_STRING;
8507             }
8508             }
8509 46         109 $id_scan_state = EMPTY_STRING;
8510             }
8511 514         973 return;
8512             } ## end sub do_id_scan_state_dollar
8513              
8514             sub do_id_scan_state_alpha {
8515              
8516 113     113 0 260 my $self = shift;
8517              
8518             # looking for alphanumeric after ::
8519 113         418 $tok_is_blank = $tok =~ /^\s*$/;
8520              
8521 113 100 33     520 if ( $tok =~ /^\w/ ) { # found it
    50 66        
    50 33        
    50          
8522 100         201 $identifier .= $tok;
8523 100         174 $id_scan_state = $scan_state_COLON; # now need ::
8524 100         173 $saw_alpha = 1;
8525             }
8526             elsif ( $tok eq "'" && $allow_tick ) {
8527 0         0 $identifier .= $tok;
8528 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8529 0         0 $saw_alpha = 1;
8530             }
8531             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8532 0         0 $id_scan_state = $scan_state_LPAREN;
8533 0         0 $identifier .= $tok;
8534             }
8535             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8536 0         0 $id_scan_state = $scan_state_RPAREN;
8537 0         0 $identifier .= $tok;
8538             }
8539             else {
8540 13         26 $id_scan_state = EMPTY_STRING;
8541 13         21 $i = $i_save;
8542             }
8543 113         191 return;
8544             } ## end sub do_id_scan_state_alpha
8545              
8546             sub do_id_scan_state_colon {
8547              
8548 434     434 0 816 my $self = shift;
8549              
8550             # looking for possible :: after alphanumeric
8551              
8552 434         1517 $tok_is_blank = $tok =~ /^\s*$/;
8553              
8554 434 100 66     3722 if ( $tok eq '::' ) { # got it
    100 66        
    100 66        
    50          
    50          
8555 97         184 $identifier .= $tok;
8556 97         164 $id_scan_state = $scan_state_ALPHA; # now require alpha
8557             }
8558             elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
8559 20         66 $identifier .= $tok;
8560 20         46 $id_scan_state = $scan_state_COLON; # now need ::
8561 20         36 $saw_alpha = 1;
8562             }
8563             elsif ( $tok eq "'" && $allow_tick ) { # tick
8564              
8565 12 50       30 if ( $is_keyword{$identifier} ) {
8566 0         0 $id_scan_state = EMPTY_STRING; # that's all
8567 0         0 $i = $i_save;
8568             }
8569             else {
8570 12         23 $identifier .= $tok;
8571             }
8572             }
8573             elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8574 0         0 $id_scan_state = $scan_state_LPAREN;
8575 0         0 $identifier .= $tok;
8576             }
8577             elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8578 0         0 $id_scan_state = $scan_state_RPAREN;
8579 0         0 $identifier .= $tok;
8580             }
8581             else {
8582 305         609 $id_scan_state = EMPTY_STRING; # that's all
8583 305         511 $i = $i_save;
8584             }
8585 434         747 return;
8586             } ## end sub do_id_scan_state_colon
8587              
8588             sub do_id_scan_state_left_paren {
8589              
8590 0     0 0 0 my $self = shift;
8591              
8592             # looking for possible '(' of a prototype
8593              
8594 0 0       0 if ( $tok eq '(' ) { # got it
    0          
8595 0         0 $identifier .= $tok;
8596 0         0 $id_scan_state = $scan_state_RPAREN; # now find the end of it
8597             }
8598             elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
8599 0         0 $identifier .= $tok;
8600 0         0 $tok_is_blank = 1;
8601             }
8602             else {
8603 0         0 $id_scan_state = EMPTY_STRING; # that's all - no prototype
8604 0         0 $i = $i_save;
8605             }
8606 0         0 return;
8607             } ## end sub do_id_scan_state_left_paren
8608              
8609             sub do_id_scan_state_right_paren {
8610              
8611 0     0 0 0 my $self = shift;
8612              
8613             # looking for a ')' of prototype to close a '('
8614              
8615 0         0 $tok_is_blank = $tok =~ /^\s*$/;
8616              
8617 0 0       0 if ( $tok eq ')' ) { # got it
    0          
8618 0         0 $identifier .= $tok;
8619 0         0 $id_scan_state = EMPTY_STRING; # all done
8620             }
8621             elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
8622 0         0 $identifier .= $tok;
8623             }
8624             else { # probable error in script, but keep going
8625 0         0 warning("Unexpected '$tok' while seeking end of prototype\n");
8626 0         0 $identifier .= $tok;
8627             }
8628 0         0 return;
8629             } ## end sub do_id_scan_state_right_paren
8630              
8631             sub do_id_scan_state_ampersand {
8632              
8633 105     105 0 277 my $self = shift;
8634              
8635             # Starting sub call after seeing an '&'
8636              
8637 105 100 33     673 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
    50          
    100          
    50          
    50          
    0          
8638 88         200 $id_scan_state = $scan_state_COLON; # now need ::
8639 88         169 $saw_alpha = 1;
8640 88         193 $identifier .= $tok;
8641             }
8642             elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
8643 0         0 $id_scan_state = $scan_state_COLON; # now need ::
8644 0         0 $saw_alpha = 1;
8645 0         0 $identifier .= $tok;
8646             }
8647             elsif ( $tok =~ /^\s*$/ ) { # allow space
8648 2         5 $tok_is_blank = 1;
8649              
8650             # fix c139: trim line-ending type 't'
8651 2 50 33     12 if ( length($identifier) == 1 && $i == $max_token_index ) {
8652 2         3 $i = $i_save;
8653 2         5 $type = 't';
8654             }
8655             }
8656             elsif ( $tok eq '::' ) { # leading ::
8657 0         0 $id_scan_state = $scan_state_ALPHA; # accept alpha next
8658 0         0 $identifier .= $tok;
8659             }
8660             elsif ( $tok eq '{' ) {
8661 15 50 33     66 if ( $identifier eq '&' || $i == 0 ) {
8662 15         42 $identifier = EMPTY_STRING;
8663             }
8664 15         34 $i = $i_save;
8665 15         31 $id_scan_state = EMPTY_STRING;
8666             }
8667             elsif ( $tok eq '^' ) {
8668 0 0       0 if ( $identifier eq '&' ) {
8669              
8670             # Special variable (c066)
8671 0         0 $identifier .= $tok;
8672 0         0 $type = '&';
8673              
8674             # There may be one more character, not a space, after the ^
8675 0         0 my $next1 = $rtokens->[ $i + 1 ];
8676 0         0 my $chr = substr( $next1, 0, 1 );
8677 0 0       0 if ( $is_special_variable_char{$chr} ) {
8678              
8679             # It is something like &^O
8680 0         0 $i++;
8681 0         0 $identifier .= $next1;
8682              
8683             # If pretoken $next1 is more than one character long,
8684             # set a flag indicating that it needs to be split.
8685 0 0       0 $id_scan_state =
8686             ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8687             }
8688             else {
8689              
8690             # it is &^
8691 0         0 $id_scan_state = EMPTY_STRING;
8692             }
8693             }
8694             else {
8695 0         0 $identifier = EMPTY_STRING;
8696 0         0 $i = $i_save;
8697             }
8698             }
8699             else {
8700              
8701             # punctuation variable?
8702             # testfile: cunningham4.pl
8703             #
8704             # We have to be careful here. If we are in an unknown state,
8705             # we will reject the punctuation variable. In the following
8706             # example the '&' is a binary operator but we are in an unknown
8707             # state because there is no sigil on 'Prima', so we don't
8708             # know what it is. But it is a bad guess that
8709             # '&~' is a function variable.
8710             # $self->{text}->{colorMap}->[
8711             # Prima::PodView::COLOR_CODE_FOREGROUND
8712             # & ~tb::COLOR_INDEX ] =
8713             # $sec->{ColorCode}
8714              
8715             # Fix for case c033: a '#' here starts a side comment
8716 0 0 0     0 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
      0        
8717 0         0 $identifier .= $tok;
8718             }
8719             else {
8720 0         0 $identifier = EMPTY_STRING;
8721 0         0 $i = $i_save;
8722 0         0 $type = '&';
8723             }
8724 0         0 $id_scan_state = EMPTY_STRING;
8725             }
8726 105         203 return;
8727             } ## end sub do_id_scan_state_ampersand
8728              
8729             #-------------------
8730             # hash of scanner subs
8731             #-------------------
8732             my $scan_identifier_code = {
8733             $scan_state_SIGIL => \&do_id_scan_state_dollar,
8734             $scan_state_ALPHA => \&do_id_scan_state_alpha,
8735             $scan_state_COLON => \&do_id_scan_state_colon,
8736             $scan_state_LPAREN => \&do_id_scan_state_left_paren,
8737             $scan_state_RPAREN => \&do_id_scan_state_right_paren,
8738             $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8739             };
8740              
8741             sub scan_complex_identifier {
8742              
8743             # This routine assembles tokens into identifiers. It maintains a
8744             # scan state, id_scan_state. It updates id_scan_state based upon
8745             # current id_scan_state and token, and returns an updated
8746             # id_scan_state and the next index after the identifier.
8747              
8748             # This routine now serves a a backup for sub scan_simple_identifier
8749             # which handles most identifiers.
8750              
8751             # Note that $self must be a 'my' variable and not be a closure
8752             # variables like the other args. Otherwise it will not get
8753             # deleted by a DESTROY call at the end of a file. Then an
8754             # attempt to create multiple tokenizers can occur when multiple
8755             # files are processed, causing an error.
8756              
8757             (
8758 486     486 0 2689 my $self, $i, $id_scan_state, $identifier, $rtokens,
8759             $max_token_index, $expecting, $container_type
8760             ) = @_;
8761              
8762             # return flag telling caller to split the pretoken
8763 486         2485 my $split_pretoken_flag;
8764              
8765             #-------------------
8766             # Initialize my vars
8767             #-------------------
8768              
8769 486         1628 initialize_my_scan_id_vars();
8770              
8771             #--------------------------------------------------------
8772             # get started by defining a type and a state if necessary
8773             #--------------------------------------------------------
8774              
8775 486 100       1286 if ( !$id_scan_state ) {
8776 479         843 $context = UNKNOWN_CONTEXT;
8777              
8778             # fixup for digraph
8779 479 50       1273 if ( $tok eq '>' ) {
8780 0         0 $tok = '->';
8781 0         0 $tok_begin = $tok;
8782             }
8783 479         858 $identifier = $tok;
8784              
8785 479 100 100     3030 if ( $last_nonblank_token eq '->' ) {
    100 100        
    100 0        
    50          
    0          
    0          
    0          
    0          
8786 6         14 $identifier = '->' . $identifier;
8787 6         12 $id_scan_state = $scan_state_SIGIL;
8788             }
8789             elsif ( $tok eq '$' || $tok eq '*' ) {
8790 293         564 $id_scan_state = $scan_state_SIGIL;
8791 293         529 $context = SCALAR_CONTEXT;
8792             }
8793             elsif ( $tok eq '%' || $tok eq '@' ) {
8794 77         159 $id_scan_state = $scan_state_SIGIL;
8795 77         147 $context = LIST_CONTEXT;
8796             }
8797             elsif ( $tok eq '&' ) {
8798 103         220 $id_scan_state = $scan_state_AMPERSAND;
8799             }
8800             elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8801 0         0 $saw_alpha = 0; # 'sub' is considered type info here
8802 0         0 $id_scan_state = $scan_state_SIGIL;
8803 0         0 $identifier .=
8804             SPACE; # need a space to separate sub from sub name
8805             }
8806             elsif ( $tok eq '::' ) {
8807 0         0 $id_scan_state = $scan_state_ALPHA;
8808             }
8809             elsif ( $tok =~ /^\w/ ) {
8810 0         0 $id_scan_state = $scan_state_COLON;
8811 0         0 $saw_alpha = 1;
8812             }
8813             elsif ( $tok eq '->' ) {
8814 0         0 $id_scan_state = $scan_state_SIGIL;
8815             }
8816             else {
8817              
8818             # shouldn't happen: bad call parameter
8819 0         0 my $msg =
8820             "Program bug detected: scan_complex_identifier received bad starting token = '$tok'\n";
8821 0         0 if (DEVEL_MODE) { $self->Fault($msg) }
8822 0 0       0 if ( !$self->[_in_error_] ) {
8823 0         0 warning($msg);
8824 0         0 $self->[_in_error_] = 1;
8825             }
8826 0         0 $id_scan_state = EMPTY_STRING;
8827              
8828             # emergency return
8829 0         0 goto RETURN;
8830             }
8831 479         912 $saw_type = !$saw_alpha;
8832             }
8833             else {
8834 7         22 $i--;
8835 7         32 $saw_alpha = ( $tok =~ /^\w/ );
8836 7         18 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8837              
8838             # check for a valid starting state
8839 7         16 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8840             $self->Fault(<<EOM);
8841             Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8842             EOM
8843             }
8844             }
8845              
8846             #------------------------------
8847             # loop to gather the identifier
8848             #------------------------------
8849              
8850 486         861 $i_save = $i;
8851              
8852 486   100     2188 while ( $i < $max_token_index && $id_scan_state ) {
8853              
8854             # Be sure we have code to handle this state before we proceed
8855 1169         2648 my $code = $scan_identifier_code->{$id_scan_state};
8856 1169 100       2381 if ( !$code ) {
8857              
8858 3 50       14 if ( $id_scan_state eq $scan_state_SPLIT ) {
8859             ## OK: this is the signal to exit and split the pretoken
8860             }
8861              
8862             # unknown state - should not happen
8863             else {
8864 0         0 if (DEVEL_MODE) {
8865             $self->Fault(<<EOM);
8866             Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8867             Scan state at sub entry was '$id_scan_state_begin'
8868             EOM
8869             }
8870 0         0 $id_scan_state = EMPTY_STRING;
8871 0         0 $i = $i_save;
8872             }
8873 3         6 last;
8874             }
8875              
8876             # Remember the starting index for progress check below
8877 1166         1804 my $i_start_loop = $i;
8878              
8879 1166         1941 $last_tok_is_blank = $tok_is_blank;
8880 1166 100       2092 if ($tok_is_blank) { $tok_is_blank = undef }
  11         29  
8881 1155         1674 else { $i_save = $i }
8882              
8883 1166         2117 $tok = $rtokens->[ ++$i ];
8884              
8885             # patch to make digraph :: if necessary
8886 1166 100 100     3119 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8887 113         261 $tok = '::';
8888 113         172 $i++;
8889             }
8890              
8891 1166         3438 $code->($self);
8892              
8893             # check for forward progress: a decrease in the index $i
8894             # implies that scanning has finished
8895 1166 100       3979 last if ( $i <= $i_start_loop );
8896              
8897             } ## end of main loop
8898              
8899             #-------------
8900             # Check result
8901             #-------------
8902              
8903             # Be sure a valid state is returned
8904 486 100       1293 if ($id_scan_state) {
8905              
8906 20 100       86 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8907              
8908 13 100       57 if ( $id_scan_state eq $scan_state_SPLIT ) {
8909 3         5 $split_pretoken_flag = 1;
8910             }
8911              
8912 13 50       53 if ( $id_scan_state eq $scan_state_RPAREN ) {
8913 0         0 warning(
8914             "Hit end of line while seeking ) to end prototype\n");
8915             }
8916              
8917 13         28 $id_scan_state = EMPTY_STRING;
8918             }
8919              
8920             # Patch: the deprecated variable $# does not combine with anything
8921             # on the next line.
8922 20 50       65 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
  0         0  
8923             }
8924              
8925             # Be sure the token index is valid
8926 486 50       1303 if ( $i < 0 ) { $i = 0 }
  0         0  
8927              
8928             # Be sure a token type is defined
8929 486 100       1229 if ( !$type ) {
8930              
8931 458 100       1049 if ($saw_type) {
    100          
8932              
8933 452 100 33     3562 if ($saw_alpha) {
    50 66        
    100 66        
      33        
8934              
8935             # The type without the -> should be the same as with the -> so
8936             # that if they get separated we get the same bond strengths,
8937             # etc. See b1234
8938 348 50 33     1387 if ( $identifier =~ /^->/
      33        
8939             && $last_nonblank_type eq 'w'
8940             && substr( $identifier, 2, 1 ) =~ /^\w/ )
8941             {
8942 0         0 $type = 'w';
8943             }
8944 348         732 else { $type = 'i' }
8945             }
8946             elsif ( $identifier eq '->' ) {
8947 0         0 $type = '->';
8948             }
8949             elsif (
8950             ( length($identifier) > 1 )
8951              
8952             # In something like '@$=' we have an identifier '@$'
8953             # In something like '$${' we have type '$$' (and only
8954             # part of an identifier)
8955             && !( $identifier =~ /\$$/ && $tok eq '{' )
8956              
8957             ## && ( $identifier !~ /^(sub |package )$/ )
8958             && $identifier ne 'sub '
8959             && $identifier ne 'package '
8960             )
8961             {
8962 53         1730 $type = 'i';
8963             }
8964 51         119 else { $type = 't' }
8965             }
8966             elsif ($saw_alpha) {
8967              
8968             # type 'w' includes anything without leading type info
8969             # ($,%,@,*) including something like abc::def::ghi
8970 5         10 $type = 'w';
8971              
8972             # Fix for b1337, if restarting scan after line break between
8973             # '->' or sigil and identifier name, use type 'i'
8974 5 50 33     32 if ( $id_scan_state_begin
8975             && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8976             {
8977 5         12 $type = 'i';
8978             }
8979             }
8980             else {
8981 1         13 $type = EMPTY_STRING;
8982             } # this can happen on a restart
8983             }
8984              
8985             # See if we formed an identifier...
8986 486 100       2053 if ($identifier) {
8987 444         827 $tok = $identifier;
8988 444 100       1057 if ($message) { $self->write_logfile_entry($message) }
  1         7  
8989             }
8990              
8991             # did not find an identifier, back up
8992             else {
8993 42         85 $tok = $tok_begin;
8994 42         75 $i = $i_begin;
8995             }
8996              
8997             RETURN:
8998              
8999 486         1124 DEBUG_SCAN_ID && do {
9000             my ( $a, $b, $c ) = caller;
9001             print STDOUT
9002             "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
9003             print STDOUT
9004             "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
9005             };
9006 486         2326 return ( $i, $tok, $type, $id_scan_state, $identifier,
9007             $split_pretoken_flag );
9008             } ## end sub scan_complex_identifier
9009             } ## end closure for sub scan_complex_identifier
9010              
9011             { ## closure for sub do_scan_sub
9012              
9013             my %warn_if_lexical;
9014              
9015             BEGIN {
9016              
9017             # lexical subs with these names can cause parsing errors in this version
9018 38     38   279 my @q = qw( m q qq qr qw qx s tr y );
9019 38         3277 @{warn_if_lexical}{@q} = (1) x scalar(@q);
9020             } ## end BEGIN
9021              
9022             # saved package and subnames in case prototype is on separate line
9023             my ( $package_saved, $subname_saved );
9024              
9025             # initialize subname each time a new 'sub' keyword is encountered
9026             sub initialize_subname {
9027 294     294 0 615 $package_saved = EMPTY_STRING;
9028 294         576 $subname_saved = EMPTY_STRING;
9029 294         511 return;
9030             }
9031              
9032             use constant {
9033 38         87204 SUB_CALL => 1,
9034             PAREN_CALL => 2,
9035             PROTOTYPE_CALL => 3,
9036 38     38   321 };
  38         116  
9037              
9038             sub do_scan_sub {
9039              
9040             # do_scan_sub parses a sub name and prototype.
9041              
9042             # At present there are three basic CALL TYPES which are
9043             # distinguished by the starting value of '$tok':
9044             # 1. $tok='sub', id_scan_state='sub'
9045             # it is called with $i_beg equal to the index of the first nonblank
9046             # token following a 'sub' token.
9047             # 2. $tok='(', id_scan_state='sub',
9048             # it is called with $i_beg equal to the index of a '(' which may
9049             # start a prototype.
9050             # 3. $tok='prototype', id_scan_state='prototype'
9051             # it is called with $i_beg equal to the index of a '(' which is
9052             # preceded by ': prototype' and has $id_scan_state eq 'prototype'
9053              
9054             # Examples:
9055              
9056             # A single type 1 call will get both the sub and prototype
9057             # sub foo1 ( $$ ) { }
9058             # ^
9059              
9060             # The subname will be obtained with a 'sub' call
9061             # The prototype on line 2 will be obtained with a '(' call
9062             # sub foo1
9063             # ^ <---call type 1
9064             # ( $$ ) { }
9065             # ^ <---call type 2
9066              
9067             # The subname will be obtained with a 'sub' call
9068             # The prototype will be obtained with a 'prototype' call
9069             # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
9070             # ^ <---type 1 ^ <---type 3
9071              
9072             # TODO: add future error checks to be sure we have a valid
9073             # sub name. For example, 'sub &doit' is wrong. Also, be sure
9074             # a name is given if and only if a non-anonymous sub is
9075             # appropriate.
9076             # USES GLOBAL VARS: $current_package, $last_nonblank_token,
9077             # $rsaw_function_definition,
9078             # $statement_type
9079              
9080 300     300 0 760 my ( $self, $rinput_hash ) = @_;
9081              
9082 300         705 my $input_line = $rinput_hash->{input_line};
9083 300         600 my $i = $rinput_hash->{i};
9084 300         824 my $i_beg = $rinput_hash->{i_beg};
9085 300         652 my $tok = $rinput_hash->{tok};
9086 300         571 my $type = $rinput_hash->{type};
9087 300         600 my $rtokens = $rinput_hash->{rtokens};
9088 300         564 my $rtoken_map = $rinput_hash->{rtoken_map};
9089 300         568 my $id_scan_state = $rinput_hash->{id_scan_state};
9090 300         547 my $max_token_index = $rinput_hash->{max_token_index};
9091              
9092 300         540 my $i_entry = $i;
9093              
9094             # Determine the CALL TYPE
9095             # 1=sub
9096             # 2=(
9097             # 3=prototype
9098 300 100       1019 my $call_type =
    100          
9099             $tok eq 'prototype' ? PROTOTYPE_CALL
9100             : $tok eq '(' ? PAREN_CALL
9101             : SUB_CALL;
9102              
9103 300         509 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
9104 300         559 my $subname = $subname_saved;
9105 300         558 my $package = $package_saved;
9106 300         623 my $proto = undef;
9107 300         547 my $attrs = undef;
9108 300         493 my $match;
9109              
9110 300         568 my $pos_beg = $rtoken_map->[$i_beg];
9111 300         1034 pos($input_line) = $pos_beg;
9112              
9113             # Look for the sub NAME if this is a SUB call
9114 300 100 100     2774 if (
9115             $call_type == SUB_CALL
9116             && $input_line =~ m/\G\s*
9117             ((?:\w*(?:'|::))*) # package - something that ends in :: or '
9118             (\w+) # NAME - required
9119             /gcx
9120             )
9121             {
9122 121         313 $match = 1;
9123 121         337 $subname = $2;
9124              
9125 121   33     452 my $is_lexical_sub =
9126             $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
9127 121 0 33     383 if ( $is_lexical_sub && $1 ) {
9128 0         0 $self->warning("'my' sub $subname cannot be in package '$1'\n");
9129 0         0 $is_lexical_sub = 0;
9130             }
9131              
9132 121 50       360 if ($is_lexical_sub) {
9133              
9134             # lexical subs use the block sequence number as a package name
9135 0         0 my $seqno =
9136             $rcurrent_sequence_number->[BRACE]
9137             [ $rcurrent_depth->[BRACE] ];
9138 0 0       0 $seqno = 1 unless ( defined($seqno) );
9139 0         0 $package = $seqno;
9140 0 0       0 if ( $warn_if_lexical{$subname} ) {
9141 0         0 $self->warning(
9142             "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
9143             );
9144              
9145             # This may end badly, it is safest to block formatting
9146             # For an example, see perl527/lexsub.t (issue c203)
9147 0         0 $self->[_in_trouble_] = 1;
9148             }
9149             }
9150             else {
9151 121 100 66     813 $package = ( defined($1) && $1 ) ? $1 : $current_package;
9152 121         438 $package =~ s/\'/::/g;
9153 121 50       457 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
  0         0  
9154 121         308 $package =~ s/::$//;
9155             }
9156              
9157 121         261 my $pos = pos($input_line);
9158 121         260 my $numc = $pos - $pos_beg;
9159 121         376 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
9160 121         253 $type = 'i';
9161              
9162             # remember the sub name in case another call is needed to
9163             # get the prototype
9164 121         267 $package_saved = $package;
9165 121         286 $subname_saved = $subname;
9166             }
9167              
9168             # Now look for PROTO ATTRS for all call types
9169             # Look for prototype/attributes which are usually on the same
9170             # line as the sub name but which might be on a separate line.
9171             # For example, we might have an anonymous sub with attributes,
9172             # or a prototype on a separate line from its sub name
9173              
9174             # NOTE: We only want to parse PROTOTYPES here. If we see anything that
9175             # does not look like a prototype, we assume it is a SIGNATURE and we
9176             # will stop and let the the standard tokenizer handle it. In
9177             # particular, we stop if we see any nested parens, braces, or commas.
9178             # Also note, a valid prototype cannot contain any alphabetic character
9179             # -- see https://perldoc.perl.org/perlsub
9180             # But it appears that an underscore is valid in a prototype, so the
9181             # regex below uses [A-Za-z] rather than \w
9182             # This is the old regex which has been replaced:
9183             # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
9184 300         1085 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
9185 300 100 100     2896 if (
      66        
9186             $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
9187             (\s*:)? # ATTRS leading ':'
9188             /gcx
9189             && ( $1 || $2 )
9190             )
9191             {
9192 45         119 $proto = $1;
9193 45         100 $attrs = $2;
9194              
9195             # Append the prototype to the starting token if it is 'sub' or
9196             # 'prototype'. This is not necessary but for compatibility with
9197             # previous versions when the -csc flag is used:
9198 45 100 100     298 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
    100 100        
9199 24         64 $tok .= $proto;
9200             }
9201              
9202             # If we just entered the sub at an opening paren on this call, not
9203             # a following :prototype, label it with the previous token. This is
9204             # necessary to propagate the sub name to its opening block.
9205             elsif ( $call_type == PAREN_CALL ) {
9206 2         8 $tok = $last_nonblank_token;
9207             }
9208              
9209 45   100     161 $match ||= 1;
9210              
9211             # Patch part #1 to fixes cases b994 and b1053:
9212             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
9213             # 'sub : lvalue { ...'
9214 45         95 $type = 'i';
9215 45 100 100     202 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
  2         5  
9216             }
9217              
9218 300 100       902 if ($match) {
9219              
9220             # ATTRS: if there are attributes, back up and let the ':' be
9221             # found later by the scanner.
9222 136         309 my $pos = pos($input_line);
9223 136 100       406 if ($attrs) {
9224 15         33 $pos -= length($attrs);
9225             }
9226              
9227 136         312 my $next_nonblank_token = $tok;
9228              
9229             # catch case of line with leading ATTR ':' after anonymous sub
9230 136 100 100     557 if ( $pos == $pos_beg && $tok eq ':' ) {
9231 1         2 $type = 'A';
9232 1         5 $self->[_in_attribute_list_] = 1;
9233             }
9234              
9235             # Otherwise, if we found a match we must convert back from
9236             # string position to the pre_token index for continued parsing.
9237             else {
9238              
9239             # I don't think an error flag can occur here ..but ?
9240 135         247 my $error;
9241 135         544 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
9242             $max_token_index );
9243 135 50       448 if ($error) { $self->warning("Possibly invalid sub\n") }
  0         0  
9244              
9245             # Patch part #2 to fixes cases b994 and b1053:
9246             # Do not let spaces be part of the token of an anonymous sub
9247             # keyword which we marked as type 'k' above...i.e. for
9248             # something like:
9249             # 'sub : lvalue { ...'
9250             # Back up and let it be parsed as a blank
9251 135 50 66     621 if ( $type eq 'k'
      66        
      33        
9252             && $attrs
9253             && $i > $i_entry
9254             && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
9255             {
9256 2         5 $i--;
9257             }
9258              
9259             # check for multiple definitions of a sub
9260 135         388 ( $next_nonblank_token, my $i_next ) =
9261             find_next_nonblank_token_on_this_line( $i, $rtokens,
9262             $max_token_index );
9263             }
9264              
9265 136 100       759 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
9266             { # skip blank or side comment
9267 7         49 my ( $rpre_tokens, $rpre_types ) =
9268             $self->peek_ahead_for_n_nonblank_pre_tokens(1);
9269 7 50 33     39 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
  7         31  
9270 7         26 $next_nonblank_token = $rpre_tokens->[0];
9271             }
9272             else {
9273 0         0 $next_nonblank_token = '}';
9274             }
9275             }
9276              
9277             # See what's next...
9278 136 100       570 if ( $next_nonblank_token eq '{' ) {
    100          
    50          
    100          
    50          
    0          
9279 104 100       364 if ($subname) {
9280              
9281             # Check for multiple definitions of a sub, but
9282             # it is ok to have multiple sub BEGIN, etc,
9283             # so we do not complain if name is all caps
9284 94 50 33     616 if ( $rsaw_function_definition->{$subname}{$package}
9285             && $subname !~ /^[A-Z]+$/ )
9286             {
9287             my $lno =
9288 0         0 $rsaw_function_definition->{$subname}{$package};
9289 0 0       0 if ( $package =~ /^\d/ ) {
9290 0         0 $self->warning(
9291             "already saw definition of lexical 'sub $subname' at line $lno\n"
9292             );
9293              
9294             }
9295             else {
9296 0         0 $self->warning(
9297             "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
9298             ) unless (DEVEL_MODE);
9299             }
9300             }
9301 94         368 $rsaw_function_definition->{$subname}{$package} =
9302             $self->[_last_line_number_];
9303             }
9304             }
9305             elsif ( $next_nonblank_token eq ';' ) {
9306             }
9307             elsif ( $next_nonblank_token eq '}' ) {
9308             }
9309              
9310             # ATTRS - if an attribute list follows, remember the name
9311             # of the sub so the next opening brace can be labeled.
9312             # Setting 'statement_type' causes any ':'s to introduce
9313             # attributes.
9314             elsif ( $next_nonblank_token eq ':' ) {
9315 16 100       62 if ( $call_type == SUB_CALL ) {
9316 14 100       53 $statement_type =
9317             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
9318             }
9319             }
9320              
9321             # if we stopped before an open paren ...
9322             elsif ( $next_nonblank_token eq '(' ) {
9323              
9324             # If we DID NOT see this paren above then it must be on the
9325             # next line so we will set a flag to come back here and see if
9326             # it is a PROTOTYPE
9327              
9328             # Otherwise, we assume it is a SIGNATURE rather than a
9329             # PROTOTYPE and let the normal tokenizer handle it as a list
9330 15 100       49 if ( !$saw_opening_paren ) {
9331 4         11 $id_scan_state = 'sub'; # we must come back to get proto
9332             }
9333 15 50       73 if ( $call_type == SUB_CALL ) {
9334 15 50       71 $statement_type =
9335             substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
9336             }
9337             }
9338             elsif ($next_nonblank_token) { # EOF technically ok
9339              
9340 0 0 0     0 if ( $rinput_hash->{tok} eq 'method' && $call_type == SUB_CALL )
9341             {
9342             # For a method call, silently ignore this error (rt145706)
9343             # to avoid needless warnings. Example which can produce it:
9344             # test(method Pack (), "method");
9345              
9346             # TODO: scan for use feature 'class' and:
9347             # - if we saw 'use feature 'class' then issue the warning.
9348             # - if we did not see use feature 'class' then issue the
9349             # warning and suggest turning off --use-feature=class
9350             }
9351             else {
9352 0 0       0 $subname = EMPTY_STRING unless defined($subname);
9353 0         0 $self->warning(
9354             "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
9355             );
9356             }
9357             }
9358 136         678 check_prototype( $proto, $package, $subname );
9359             }
9360              
9361             # no match to either sub name or prototype, but line not blank
9362             else {
9363              
9364             }
9365 300         1585 return ( $i, $tok, $type, $id_scan_state );
9366             } ## end sub do_scan_sub
9367             }
9368              
9369             #########################################################################
9370             # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
9371             #########################################################################
9372              
9373             sub find_next_nonblank_token {
9374 6154     6154 0 12052 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9375              
9376             # Returns the next nonblank token after the token at index $i
9377             # To skip past a side comment, and any subsequent block comments
9378             # and blank lines, call with i=$max_token_index
9379              
9380 6154 100       13259 if ( $i >= $max_token_index ) {
9381 117 100       643 if ( !peeked_ahead() ) {
9382 115         382 peeked_ahead(1);
9383 115         600 $self->peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
9384             }
9385             }
9386              
9387 6154         10894 my $next_nonblank_token = $rtokens->[ ++$i ];
9388 6154 50 33     22890 return ( SPACE, $i )
9389             unless ( defined($next_nonblank_token) && length($next_nonblank_token) );
9390              
9391             # Quick test for nonblank ascii char. Note that we just have to
9392             # examine the first character here.
9393 6154         12510 my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
9394 6154 100 66     23682 if ( $ord >= ORD_PRINTABLE_MIN
    50 33        
    0          
9395             && $ord <= ORD_PRINTABLE_MAX )
9396             {
9397 2353         7058 return ( $next_nonblank_token, $i );
9398             }
9399              
9400             # Quick test to skip over an ascii space or tab
9401             elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
9402 3801         6783 $next_nonblank_token = $rtokens->[ ++$i ];
9403 3801 50       8444 return ( SPACE, $i ) unless defined($next_nonblank_token);
9404             }
9405              
9406             # Slow test to skip over something else identified as whitespace
9407             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9408 0         0 $next_nonblank_token = $rtokens->[ ++$i ];
9409 0 0       0 return ( SPACE, $i ) unless defined($next_nonblank_token);
9410             }
9411              
9412             # We should be at a nonblank now
9413 3801         10821 return ( $next_nonblank_token, $i );
9414             } ## end sub find_next_nonblank_token
9415              
9416             sub find_next_noncomment_type {
9417 98     98 0 358 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9418              
9419             # Given the current character position, look ahead past any comments
9420             # and blank lines and return the next token, including digraphs and
9421             # trigraphs.
9422              
9423 98         515 my ( $next_nonblank_token, $i_next ) =
9424             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9425              
9426             # skip past any side comment
9427 98 50       551 if ( $next_nonblank_token eq '#' ) {
9428 0         0 ( $next_nonblank_token, $i_next ) =
9429             $self->find_next_nonblank_token( $i_next, $rtokens,
9430             $max_token_index );
9431             }
9432              
9433             # check for a digraph
9434 98 50 33     927 if ( $next_nonblank_token
      33        
9435             && $next_nonblank_token ne SPACE
9436             && defined( $rtokens->[ $i_next + 1 ] ) )
9437             {
9438 98         305 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9439 98 100       408 if ( $is_digraph{$test2} ) {
9440 15         48 $next_nonblank_token = $test2;
9441 15         67 $i_next = $i_next + 1;
9442              
9443             # check for a trigraph
9444 15 50       67 if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
9445 15         54 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
9446 15 50       83 if ( $is_trigraph{$test3} ) {
9447 0         0 $next_nonblank_token = $test3;
9448 0         0 $i_next = $i_next + 1;
9449             }
9450             }
9451             }
9452             }
9453              
9454 98         330 return ( $next_nonblank_token, $i_next );
9455             } ## end sub find_next_noncomment_type
9456              
9457             sub is_possible_numerator {
9458              
9459             # Look at the next non-comment character and decide if it could be a
9460             # numerator. Return
9461             # 1 - yes
9462             # 0 - can't tell
9463             # -1 - no
9464              
9465 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9466 0         0 my $is_possible_numerator = 0;
9467              
9468 0         0 my $next_token = $rtokens->[ $i + 1 ];
9469 0 0       0 if ( $next_token eq '=' ) { $i++; } # handle /=
  0         0  
9470 0         0 my ( $next_nonblank_token, $i_next ) =
9471             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9472              
9473 0 0       0 if ( $next_nonblank_token eq '#' ) {
9474 0         0 ( $next_nonblank_token, $i_next ) =
9475             $self->find_next_nonblank_token( $max_token_index, $rtokens,
9476             $max_token_index );
9477             }
9478              
9479 0 0       0 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
    0          
9480 0         0 $is_possible_numerator = 1;
9481             }
9482             elsif ( $next_nonblank_token =~ /^\s*$/ ) {
9483 0         0 $is_possible_numerator = 0;
9484             }
9485             else {
9486 0         0 $is_possible_numerator = -1;
9487             }
9488              
9489 0         0 return $is_possible_numerator;
9490             } ## end sub is_possible_numerator
9491              
9492             { ## closure for sub pattern_expected
9493             my %pattern_test;
9494              
9495             BEGIN {
9496              
9497             # List of tokens which may follow a pattern. Note that we will not
9498             # have formed digraphs at this point, so we will see '&' instead of
9499             # '&&' and '|' instead of '||'
9500              
9501             # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
9502 38     38   303 my @q = qw( & && | || ? : + - * and or while if unless);
9503 38         155 push @q, ')', '}', ']', '>', ',', ';';
9504 38         175624 @{pattern_test}{@q} = (1) x scalar(@q);
9505             } ## end BEGIN
9506              
9507             sub pattern_expected {
9508              
9509             # This a filter for a possible pattern.
9510             # It looks at the token after a possible pattern and tries to
9511             # determine if that token could end a pattern.
9512             # returns -
9513             # 1 - yes
9514             # 0 - can't tell
9515             # -1 - no
9516 0     0 0 0 my ( $self, $i, $rtokens, $max_token_index ) = @_;
9517 0         0 my $is_pattern = 0;
9518              
9519 0         0 my $next_token = $rtokens->[ $i + 1 ];
9520 0 0       0 if ( $next_token =~ /^[msixpodualgc]/ ) {
9521 0         0 $i++;
9522             } # skip possible modifier
9523 0         0 my ( $next_nonblank_token, $i_next ) =
9524             $self->find_next_nonblank_token( $i, $rtokens, $max_token_index );
9525              
9526 0 0       0 if ( $pattern_test{$next_nonblank_token} ) {
9527 0         0 $is_pattern = 1;
9528             }
9529             else {
9530              
9531             # Added '#' to fix issue c044
9532 0 0 0     0 if ( $next_nonblank_token =~ /^\s*$/
9533             || $next_nonblank_token eq '#' )
9534             {
9535 0         0 $is_pattern = 0;
9536             }
9537             else {
9538 0         0 $is_pattern = -1;
9539             }
9540             }
9541 0         0 return $is_pattern;
9542             } ## end sub pattern_expected
9543             }
9544              
9545             sub find_next_nonblank_token_on_this_line {
9546 453     453 0 1049 my ( $i, $rtokens, $max_token_index ) = @_;
9547 453         754 my $next_nonblank_token;
9548              
9549 453 100       1111 if ( $i < $max_token_index ) {
9550 445         953 $next_nonblank_token = $rtokens->[ ++$i ];
9551              
9552 445 100       1913 if ( $next_nonblank_token =~ /^\s*$/ ) {
9553              
9554 120 100       480 if ( $i < $max_token_index ) {
9555 118         369 $next_nonblank_token = $rtokens->[ ++$i ];
9556             }
9557             }
9558             }
9559             else {
9560 8         26 $next_nonblank_token = EMPTY_STRING;
9561             }
9562 453         1435 return ( $next_nonblank_token, $i );
9563             } ## end sub find_next_nonblank_token_on_this_line
9564              
9565             sub find_angle_operator_termination {
9566              
9567             # We are looking at a '<' and want to know if it is an angle operator.
9568             # We are to return:
9569             # $i = pretoken index of ending '>' if found, current $i otherwise
9570             # $type = 'Q' if found, '>' otherwise
9571 8     8 0 42 my ( $self, $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index )
9572             = @_;
9573 8         17 my $i = $i_beg;
9574 8         21 my $type = '<';
9575 8         33 pos($input_line) = 1 + $rtoken_map->[$i];
9576              
9577 8         19 my $filter;
9578              
9579             # we just have to find the next '>' if a term is expected
9580 8 100       48 if ( $expecting == TERM ) { $filter = '[\>]' }
  6 50       19  
9581              
9582             # we have to guess if we don't know what is expected
9583 2         4 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
9584              
9585             # shouldn't happen - we shouldn't be here if operator is expected
9586             else {
9587 0         0 if (DEVEL_MODE) {
9588             $self->Fault(<<EOM);
9589             Bad call to find_angle_operator_termination
9590             EOM
9591             }
9592 0         0 return ( $i, $type );
9593             }
9594              
9595             # To illustrate what we might be looking at, in case we are
9596             # guessing, here are some examples of valid angle operators
9597             # (or file globs):
9598             # <tmp_imp/*>
9599             # <FH>
9600             # <$fh>
9601             # <*.c *.h>
9602             # <_>
9603             # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
9604             # <${PREFIX}*img*.$IMAGE_TYPE>
9605             # <img*.$IMAGE_TYPE>
9606             # <Timg*.$IMAGE_TYPE>
9607             # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
9608             #
9609             # Here are some examples of lines which do not have angle operators:
9610             # return unless $self->[2]++ < $#{$self->[1]};
9611             # < 2 || @$t >
9612             #
9613             # the following line from dlister.pl caused trouble:
9614             # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
9615             #
9616             # If the '<' starts an angle operator, it must end on this line and
9617             # it must not have certain characters like ';' and '=' in it. I use
9618             # this to limit the testing. This filter should be improved if
9619             # possible.
9620              
9621 8 50       171 if ( $input_line =~ /($filter)/g ) {
9622              
9623 8 50       48 if ( $1 eq '>' ) {
9624              
9625             # We MAY have found an angle operator termination if we get
9626             # here, but we need to do more to be sure we haven't been
9627             # fooled.
9628 8         34 my $pos = pos($input_line);
9629              
9630 8         20 my $pos_beg = $rtoken_map->[$i];
9631 8         28 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
9632              
9633             # Test for '<' after possible filehandle, issue c103
9634             # print $fh <>; # syntax error
9635             # print $fh <DATA>; # ok
9636             # print $fh < DATA>; # syntax error at '>'
9637             # print STDERR < DATA>; # ok, prints word 'DATA'
9638             # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
9639 8 100       32 if ( $last_nonblank_type eq 'Z' ) {
9640              
9641             # $str includes brackets; something like '<DATA>'
9642 1 0 33     22 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
9643             && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
9644             {
9645 0         0 return ( $i, $type );
9646             }
9647             }
9648              
9649             # Reject if the closing '>' follows a '-' as in:
9650             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
9651 8 100       36 if ( $expecting eq UNKNOWN ) {
9652 2         4 my $check = substr( $input_line, $pos - 2, 1 );
9653 2 100       12 if ( $check eq '-' ) {
9654 1         5 return ( $i, $type );
9655             }
9656             }
9657              
9658             ######################################debug#####
9659             #$self->write_diagnostics( "ANGLE? :$str\n");
9660             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
9661             ######################################debug#####
9662 7         16 $type = 'Q';
9663 7         13 my $error;
9664 7         33 ( $i, $error ) =
9665             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9666              
9667             # It may be possible that a quote ends midway in a pretoken.
9668             # If this happens, it may be necessary to split the pretoken.
9669 7 50       34 if ($error) {
9670 0         0 if (DEVEL_MODE) {
9671             $self->Fault(<<EOM);
9672             unexpected error condition returned by inverse_pretoken_map
9673             EOM
9674             }
9675             $self->warning(
9676 0         0 "Possible tokinization error..please check this line\n");
9677             }
9678              
9679             # Check for accidental formatting of a markup language doc...
9680             # Formatting will be skipped if we set _html_tag_count_ and
9681             # also set a warning of any kind.
9682 7         26 my $is_html_tag;
9683 7   33     33 my $is_first_string =
9684             $i_beg == 0 && $self->[_last_line_number_] == 1;
9685              
9686             # html comment '<!...' of any type
9687 7 50 33     85 if ( $str =~ /^<\s*!/ ) {
    50          
    50          
9688 0         0 $is_html_tag = 1;
9689 0 0       0 if ($is_first_string) {
9690 0         0 $self->warning(
9691             "looks like a markup language, continuing error checks\n"
9692             );
9693             }
9694             }
9695              
9696             # html end tag, something like </h1>
9697             elsif ( $str =~ /^<\s*\/\w+\s*>$/ ) {
9698 0         0 $is_html_tag = 1;
9699             }
9700              
9701             # xml prolog?
9702             elsif ( $str =~ /^<\?xml\s.*\?>$/i && $is_first_string ) {
9703 0         0 $is_html_tag = 1;
9704 0         0 $self->warning(
9705             "looks like a markup language, continuing error checks\n");
9706             }
9707              
9708 7 50       32 if ($is_html_tag) {
9709 0         0 $self->[_html_tag_count_]++;
9710             }
9711              
9712             # count blanks on inside of brackets
9713 7         35 my $blank_count = 0;
9714 7 100       47 $blank_count++ if ( $str =~ /<\s+/ );
9715 7 100       37 $blank_count++ if ( $str =~ /\s+>/ );
9716              
9717             # Now let's see where we stand....
9718             # OK if math op not possible
9719 7 100       38 if ( $expecting == TERM ) {
    50          
    50          
    0          
9720             }
9721              
9722             elsif ($is_html_tag) {
9723             }
9724              
9725             # OK if there are no more than 2 non-blank pre-tokens inside
9726             # (not possible to write 2 token math between < and >)
9727             # This catches most common cases
9728             elsif ( $i <= $i_beg + 3 + $blank_count ) {
9729              
9730             # No longer any need to document this common case
9731             ## $self->write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
9732             }
9733              
9734             # OK if there is some kind of identifier inside
9735             # print $fh <tvg::INPUT>;
9736             elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
9737 0         0 $self->write_diagnostics("ANGLE (contains identifier): $str\n");
9738             }
9739              
9740             # Not sure..
9741             else {
9742              
9743             # Let's try a Brace Test: any braces inside must balance
9744 0         0 my $br = 0;
9745 0         0 while ( $str =~ /\{/g ) { $br++ }
  0         0  
9746 0         0 while ( $str =~ /\}/g ) { $br-- }
  0         0  
9747 0         0 my $sb = 0;
9748 0         0 while ( $str =~ /\[/g ) { $sb++ }
  0         0  
9749 0         0 while ( $str =~ /\]/g ) { $sb-- }
  0         0  
9750 0         0 my $pr = 0;
9751 0         0 while ( $str =~ /\(/g ) { $pr++ }
  0         0  
9752 0         0 while ( $str =~ /\)/g ) { $pr-- }
  0         0  
9753              
9754             # if braces do not balance - not angle operator
9755 0 0 0     0 if ( $br || $sb || $pr ) {
      0        
9756 0         0 $i = $i_beg;
9757 0         0 $type = '<';
9758 0         0 $self->write_diagnostics(
9759             "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
9760             }
9761              
9762             # we should keep doing more checks here...to be continued
9763             # Tentatively accepting this as a valid angle operator.
9764             # There are lots more things that can be checked.
9765             else {
9766 0         0 $self->write_diagnostics(
9767             "ANGLE-Guessing yes: $str expecting=$expecting\n");
9768 0         0 $self->write_logfile_entry(
9769             "Guessing angle operator here: $str\n");
9770             }
9771             }
9772             }
9773              
9774             # didn't find ending >
9775             else {
9776 0 0       0 if ( $expecting == TERM ) {
9777 0         0 $self->warning("No ending > for angle operator\n");
9778             }
9779             }
9780             }
9781 7         48 return ( $i, $type );
9782             } ## end sub find_angle_operator_termination
9783              
9784             sub scan_number_do {
9785              
9786             # scan a number in any of the formats that Perl accepts
9787             # Underbars (_) are allowed in decimal numbers.
9788             # input parameters -
9789             # $input_line - the string to scan
9790             # $i - pre_token index to start scanning
9791             # $rtoken_map - reference to the pre_token map giving starting
9792             # character position in $input_line of token $i
9793             # output parameters -
9794             # $i - last pre_token index of the number just scanned
9795             # number - the number (characters); or undef if not a number
9796              
9797 629     629 0 1620 my ( $self, $input_line, $i, $rtoken_map, $input_type, $max_token_index ) =
9798             @_;
9799 629         1133 my $pos_beg = $rtoken_map->[$i];
9800 629         919 my $pos;
9801 629         987 my $i_begin = $i;
9802 629         1001 my $number = undef;
9803 629         1338 my $type = $input_type;
9804              
9805 629         1382 my $first_char = substr( $input_line, $pos_beg, 1 );
9806              
9807             # Look for bad starting characters; Shouldn't happen..
9808 629 50       2814 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9809 0         0 if (DEVEL_MODE) {
9810             $self->Fault(<<EOM);
9811             Program bug - scan_number given bad first character = '$first_char'
9812             EOM
9813             }
9814 0         0 return ( $i, $type, $number );
9815             }
9816              
9817             # handle v-string without leading 'v' character ('Two Dot' rule)
9818             # (vstring.t)
9819             # Here is the format prior to including underscores:
9820             ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9821 629         1866 pos($input_line) = $pos_beg;
9822 629 50       3017 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9823 0         0 $pos = pos($input_line);
9824 0         0 my $numc = $pos - $pos_beg;
9825 0         0 $number = substr( $input_line, $pos_beg, $numc );
9826 0         0 $type = 'v';
9827 0         0 $self->report_v_string($number);
9828             }
9829              
9830             # handle octal, hex, binary
9831 629 50       1631 if ( !defined($number) ) {
9832 629         1380 pos($input_line) = $pos_beg;
9833              
9834             # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9835             # For reference, the format prior to hex floating point is:
9836             # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9837             # (hex) (octal) (binary)
9838 629 100       2467 if (
9839             $input_line =~
9840              
9841             /\G[+-]?0( # leading [signed] 0
9842              
9843             # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9844             ([xX][0-9a-fA-F_]* # X and optional leading digits
9845             (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9846             [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9847             [0-9a-fA-F_]*) # optional Additional exponent digits
9848              
9849             # or hex integer
9850             |([xX][0-9a-fA-F_]+)
9851              
9852             # or octal fraction
9853             |([oO]?[0-7_]+ # string of octal digits
9854             (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9855             [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9856             [0-7_]*) # Additional exponent digits with underscores
9857              
9858             # or octal integer
9859             |([oO]?[0-7_]+) # string of octal digits
9860              
9861             # or a binary float
9862             |([bB][01_]* # 'b' with string of binary digits
9863             (\.([01][01_]*)?)? # optional decimal and fraction
9864             [Pp][+-]?[01] # Required exponent indicator, no underscore
9865             [01_]*) # additional exponent bits
9866              
9867             # or binary integer
9868             |([bB][01_]+) # 'b' with string of binary digits
9869              
9870             )/gx
9871             )
9872             {
9873 72         149 $pos = pos($input_line);
9874 72         129 my $numc = $pos - $pos_beg;
9875 72         157 $number = substr( $input_line, $pos_beg, $numc );
9876 72         132 $type = 'n';
9877             }
9878             }
9879              
9880             # handle decimal
9881 629 100       1481 if ( !defined($number) ) {
9882 557         1137 pos($input_line) = $pos_beg;
9883              
9884 557 50       2728 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9885 557         1067 $pos = pos($input_line);
9886              
9887             # watch out for things like 0..40 which would give 0. by this;
9888 557 100 100     2067 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9889             && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9890             {
9891 37         58 $pos--;
9892             }
9893 557         929 my $numc = $pos - $pos_beg;
9894 557         1089 $number = substr( $input_line, $pos_beg, $numc );
9895 557         1052 $type = 'n';
9896             }
9897             }
9898              
9899             # filter out non-numbers like e + - . e2 .e3 +e6
9900             # the rule: at least one digit, and any 'e' must be preceded by a digit
9901 629 100 66     3205 if (
      66        
9902             $number !~ /\d/ # no digits
9903             || ( $number =~ /^(.*)[eE]/
9904             && $1 !~ /\d/ ) # or no digits before the 'e'
9905             )
9906             {
9907 303         479 $number = undef;
9908 303         530 $type = $input_type;
9909 303         1195 return ( $i, $type, $number );
9910             }
9911              
9912             # Found a number; now we must convert back from character position
9913             # to pre_token index. An error here implies user syntax error.
9914             # An example would be an invalid octal number like '009'.
9915 326         581 my $error;
9916 326         905 ( $i, $error ) =
9917             inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9918 326 50       918 if ($error) { $self->warning("Possibly invalid number\n") }
  0         0  
9919              
9920 326         1239 return ( $i, $type, $number );
9921             } ## end sub scan_number_do
9922              
9923             sub inverse_pretoken_map {
9924              
9925             # Starting with the current pre_token index $i, scan forward until
9926             # finding the index of the next pre_token whose position is $pos.
9927 2166     2166 0 5131 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9928 2166         3656 my $error = 0;
9929              
9930 2166         5526 while ( ++$i <= $max_token_index ) {
9931              
9932 4032 100       9339 if ( $pos <= $rtoken_map->[$i] ) {
9933              
9934             # Let the calling routine handle errors in which we do not
9935             # land on a pre-token boundary. It can happen by running
9936             # perltidy on some non-perl scripts, for example.
9937 2131 50       5285 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
  0         0  
9938 2131         3442 $i--;
9939 2131         3690 last;
9940             }
9941             }
9942 2166         5461 return ( $i, $error );
9943             } ## end sub inverse_pretoken_map
9944              
9945             sub find_here_doc {
9946              
9947             # find the target of a here document, if any
9948             # input parameters:
9949             # $i - token index of the second < of <<
9950             # ($i must be less than the last token index if this is called)
9951             # output parameters:
9952             # $found_target = 0 didn't find target; =1 found target
9953             # HERE_TARGET - the target string (may be empty string)
9954             # $i - unchanged if not here doc,
9955             # or index of the last token of the here target
9956             # $saw_error - flag noting unbalanced quote on here target
9957 9     9 0 40 my ( $self, $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9958              
9959 9         23 my $ibeg = $i;
9960 9         25 my $found_target = 0;
9961 9         23 my $here_doc_target = EMPTY_STRING;
9962 9         30 my $here_quote_character = EMPTY_STRING;
9963 9         26 my $saw_error = 0;
9964 9         30 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9965 9         55 $next_token = $rtokens->[ $i + 1 ];
9966              
9967             # perl allows a backslash before the target string (heredoc.t)
9968 9         26 my $backslash = 0;
9969 9 50       51 if ( $next_token eq '\\' ) {
9970 0         0 $backslash = 1;
9971 0         0 $next_token = $rtokens->[ $i + 2 ];
9972             }
9973              
9974 9         55 ( $next_nonblank_token, $i_next_nonblank ) =
9975             find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9976              
9977 9 100 33     73 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
    50          
    50          
9978              
9979 6         21 my $in_quote = 1;
9980 6         15 my $quote_depth = 0;
9981 6         19 my $quote_pos = 0;
9982 6         15 my $quoted_string;
9983              
9984             (
9985 6         41 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9986             $quoted_string
9987             )
9988             = $self->follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9989             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9990              
9991 6 50       44 if ($in_quote) { # didn't find end of quote, so no target found
9992 0         0 $i = $ibeg;
9993 0 0       0 if ( $expecting == TERM ) {
9994 0         0 $self->warning(
9995             "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9996             );
9997 0         0 $saw_error = 1;
9998             }
9999             }
10000             else { # found ending quote
10001 6         19 $found_target = 1;
10002              
10003 6         14 my $tokj;
10004 6         958 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
10005 6         24 $tokj = $rtokens->[$j];
10006              
10007             # we have to remove any backslash before the quote character
10008             # so that the here-doc-target exactly matches this string
10009             next
10010 6 0 33     39 if ( $tokj eq "\\"
      33        
10011             && $j < $i - 1
10012             && $rtokens->[ $j + 1 ] eq $here_quote_character );
10013 6         27 $here_doc_target .= $tokj;
10014             }
10015             }
10016             }
10017              
10018             elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
10019 0         0 $found_target = 1;
10020 0         0 $self->write_logfile_entry(
10021             "found blank here-target after <<; suggest using \"\"\n");
10022 0         0 $i = $ibeg;
10023             }
10024             elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
10025              
10026 3         12 my $here_doc_expected;
10027 3 50       12 if ( $expecting == UNKNOWN ) {
10028 0         0 $here_doc_expected = $self->guess_if_here_doc($next_token);
10029             }
10030             else {
10031 3         11 $here_doc_expected = 1;
10032             }
10033              
10034 3 50       11 if ($here_doc_expected) {
10035 3         8 $found_target = 1;
10036 3         7 $here_doc_target = $next_token;
10037 3         9 $i = $ibeg + 1;
10038             }
10039              
10040             }
10041             else {
10042              
10043 0 0       0 if ( $expecting == TERM ) {
10044 0         0 $found_target = 1;
10045 0         0 $self->write_logfile_entry("Note: bare here-doc operator <<\n");
10046             }
10047             else {
10048 0         0 $i = $ibeg;
10049             }
10050             }
10051              
10052             # patch to neglect any prepended backslash
10053 9 50 33     80 if ( $found_target && $backslash ) { $i++ }
  0         0  
10054              
10055 9         54 return ( $found_target, $here_doc_target, $here_quote_character, $i,
10056             $saw_error );
10057             } ## end sub find_here_doc
10058              
10059             sub do_quote {
10060              
10061             # follow (or continue following) quoted string(s)
10062             # $in_quote return code:
10063             # 0 - ok, found end
10064             # 1 - still must find end of quote whose target is $quote_character
10065             # 2 - still looking for end of first of two quotes
10066             #
10067             # Returns updated strings:
10068             # $quoted_string_1 = quoted string seen while in_quote=1
10069             # $quoted_string_2 = quoted string seen while in_quote=2
10070             my (
10071              
10072 2763     2763 0 8255 $self,
10073             $i,
10074             $in_quote,
10075             $quote_character,
10076             $quote_pos,
10077             $quote_depth,
10078             $quoted_string_1,
10079             $quoted_string_2,
10080             $rtokens,
10081             $rtoken_map,
10082             $max_token_index,
10083              
10084             ) = @_;
10085              
10086 2763         4362 my $quoted_string;
10087 2763 100       6388 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
10088 29         67 my $ibeg = $i;
10089             (
10090 29         126 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
10091             $quoted_string
10092             )
10093             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
10094             $quote_character, $quote_pos, $quote_depth, $max_token_index );
10095 29         88 $quoted_string_2 .= $quoted_string;
10096 29 50       225 if ( $in_quote == 1 ) {
10097 29 50       114 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
  0         0  
10098 29         76 $quote_character = EMPTY_STRING;
10099             }
10100             else {
10101 0         0 $quoted_string_2 .= "\n";
10102             }
10103             }
10104              
10105 2763 50       6007 if ( $in_quote == 1 ) { # one (more) quote to follow
10106 2763         4181 my $ibeg = $i;
10107             (
10108 2763         7635 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
10109             $quoted_string
10110             )
10111             = $self->follow_quoted_string( $ibeg, $in_quote, $rtokens,
10112             $quote_character, $quote_pos, $quote_depth, $max_token_index );
10113 2763         5860 $quoted_string_1 .= $quoted_string;
10114 2763 100       6719 if ( $in_quote == 1 ) {
10115 183         404 $quoted_string_1 .= "\n";
10116             }
10117             }
10118             return (
10119              
10120 2763         9482 $i,
10121             $in_quote,
10122             $quote_character,
10123             $quote_pos,
10124             $quote_depth,
10125             $quoted_string_1,
10126             $quoted_string_2,
10127              
10128             );
10129             } ## end sub do_quote
10130              
10131             sub follow_quoted_string {
10132              
10133             # scan for a specific token, skipping escaped characters
10134             # if the quote character is blank, use the first non-blank character
10135             # input parameters:
10136             # $rtokens = reference to the array of tokens
10137             # $i = the token index of the first character to search
10138             # $in_quote = number of quoted strings being followed
10139             # $beginning_tok = the starting quote character
10140             # $quote_pos = index to check next for alphanumeric delimiter
10141             # output parameters:
10142             # $i = the token index of the ending quote character
10143             # $in_quote = decremented if found end, unchanged if not
10144             # $beginning_tok = the starting quote character
10145             # $quote_pos = index to check next for alphanumeric delimiter
10146             # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
10147             # $quoted_string = the text of the quote (without quotation tokens)
10148             my (
10149              
10150 2809     2809 0 6366 $self,
10151             $i_beg,
10152             $in_quote,
10153             $rtokens,
10154             $beginning_tok,
10155             $quote_pos,
10156             $quote_depth,
10157             $max_token_index,
10158              
10159             ) = @_;
10160              
10161 2809         4314 my ( $tok, $end_tok );
10162 2809         4629 my $i = $i_beg - 1;
10163 2809         4779 my $quoted_string = EMPTY_STRING;
10164              
10165 2809         3947 0 && do {
10166             print STDOUT
10167             "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
10168             };
10169              
10170             # get the corresponding end token
10171 2809 100       13063 if ( $beginning_tok !~ /^\s*$/ ) {
10172 183         908 $end_tok = matching_end_token($beginning_tok);
10173             }
10174              
10175             # a blank token means we must find and use the first non-blank one
10176             else {
10177 2626 100       6550 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
10178              
10179 2626         6261 while ( $i < $max_token_index ) {
10180 2626         5089 $tok = $rtokens->[ ++$i ];
10181              
10182 2626 50       7867 if ( $tok !~ /^\s*$/ ) {
10183              
10184 2626 50 66     7596 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
10185 0         0 $i = $max_token_index;
10186             }
10187             else {
10188              
10189 2626 100       5522 if ( length($tok) > 1 ) {
10190 1 50       5 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
  1         3  
10191 1         10 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
10192             }
10193             else {
10194 2625         4624 $beginning_tok = $tok;
10195 2625         4394 $quote_pos = 0;
10196             }
10197 2626         6392 $end_tok = matching_end_token($beginning_tok);
10198 2626         4481 $quote_depth = 1;
10199 2626         4626 last;
10200             }
10201             }
10202             else {
10203 0         0 $allow_quote_comments = 1;
10204             }
10205             }
10206             }
10207              
10208             # There are two different loops which search for the ending quote
10209             # character. In the rare case of an alphanumeric quote delimiter, we
10210             # have to look through alphanumeric tokens character-by-character, since
10211             # the pre-tokenization process combines multiple alphanumeric
10212             # characters, whereas for a non-alphanumeric delimiter, only tokens of
10213             # length 1 can match.
10214              
10215             #----------------------------------------------------------------
10216             # Case 1 (rare): loop for case of alphanumeric quote delimiter..
10217             # "quote_pos" is the position the current word to begin searching
10218             #----------------------------------------------------------------
10219 2809 100       7408 if ( $beginning_tok =~ /\w/ ) {
10220              
10221             # Note this because it is not recommended practice except
10222             # for obfuscated perl contests
10223 1 50       4 if ( $in_quote == 1 ) {
10224 1         6 $self->write_logfile_entry(
10225             "Note: alphanumeric quote delimiter ($beginning_tok) \n");
10226             }
10227              
10228             # Note: changed < to <= here to fix c109. Relying on extra end blanks.
10229 1         7 while ( $i <= $max_token_index ) {
10230              
10231 4 100 66     14 if ( $quote_pos == 0 || ( $i < 0 ) ) {
10232 3         9 $tok = $rtokens->[ ++$i ];
10233              
10234 3 100       9 if ( $tok eq '\\' ) {
10235              
10236             # retain backslash unless it hides the end token
10237 1 50       6 $quoted_string .= $tok
10238             unless $rtokens->[ $i + 1 ] eq $end_tok;
10239 1         4 $quote_pos++;
10240 1 50       7 last if ( $i >= $max_token_index );
10241 1         6 $tok = $rtokens->[ ++$i ];
10242             }
10243             }
10244 4         8 my $old_pos = $quote_pos;
10245              
10246 4 50 33     20 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
      33        
10247             {
10248              
10249             }
10250 4         8 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
10251              
10252 4 100       11 if ( $quote_pos > 0 ) {
10253              
10254 1         3 $quoted_string .=
10255             substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
10256              
10257             # NOTE: any quote modifiers will be at the end of '$tok'. If we
10258             # wanted to check them, this is the place to get them. But
10259             # this quote form is rarely used in practice, so it isn't
10260             # worthwhile.
10261              
10262 1         3 $quote_depth--;
10263              
10264 1 50       5 if ( $quote_depth == 0 ) {
10265 1         2 $in_quote--;
10266 1         4 last;
10267             }
10268             }
10269             else {
10270 3 50       8 if ( $old_pos <= length($tok) ) {
10271 3         8 $quoted_string .= substr( $tok, $old_pos );
10272             }
10273             }
10274             }
10275             }
10276              
10277             #-----------------------------------------------------------------------
10278             # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
10279             #-----------------------------------------------------------------------
10280             else {
10281              
10282 2808         6515 while ( $i < $max_token_index ) {
10283 10785         17025 $tok = $rtokens->[ ++$i ];
10284              
10285 10785 100       25425 if ( $tok eq $end_tok ) {
    100          
    100          
10286 2615         4015 $quote_depth--;
10287              
10288 2615 100       5979 if ( $quote_depth == 0 ) {
10289 2614         3734 $in_quote--;
10290 2614         4189 last;
10291             }
10292             }
10293             elsif ( $tok eq $beginning_tok ) {
10294 1         3 $quote_depth++;
10295             }
10296             elsif ( $tok eq '\\' ) {
10297              
10298             # retain backslash unless it hides the beginning or end token
10299 376         901 $tok = $rtokens->[ ++$i ];
10300 376 100 100     2224 $quoted_string .= '\\'
10301             unless ( $tok eq $end_tok || $tok eq $beginning_tok );
10302             }
10303 8171         15304 $quoted_string .= $tok;
10304             }
10305             }
10306 2809 50       6316 if ( $i > $max_token_index ) { $i = $max_token_index }
  0         0  
10307             return (
10308              
10309 2809         10143 $i,
10310             $in_quote,
10311             $beginning_tok,
10312             $quote_pos,
10313             $quote_depth,
10314             $quoted_string,
10315              
10316             );
10317             } ## end sub follow_quoted_string
10318              
10319             sub indicate_error {
10320 0     0 0 0 my ( $self, $msg, $line_number, $input_line, $pos, $carrat ) = @_;
10321 0         0 $self->interrupt_logfile();
10322 0         0 $self->warning($msg);
10323 0         0 $self->write_error_indicator_pair( $line_number, $input_line, $pos,
10324             $carrat );
10325 0         0 $self->resume_logfile();
10326 0         0 return;
10327             } ## end sub indicate_error
10328              
10329             sub write_error_indicator_pair {
10330 0     0 0 0 my ( $self, $line_number, $input_line, $pos, $carrat ) = @_;
10331 0         0 my ( $offset, $numbered_line, $underline ) =
10332             make_numbered_line( $line_number, $input_line, $pos );
10333 0         0 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
10334 0         0 $self->warning( $numbered_line . "\n" );
10335 0         0 $underline =~ s/\s*$//;
10336 0         0 $self->warning( $underline . "\n" );
10337 0         0 return;
10338             } ## end sub write_error_indicator_pair
10339              
10340             sub make_numbered_line {
10341              
10342             # Given an input line, its line number, and a character position of
10343             # interest, create a string not longer than 80 characters of the form
10344             # $lineno: sub_string
10345             # such that the sub_string of $str contains the position of interest
10346             #
10347             # Here is an example of what we want, in this case we add trailing
10348             # '...' because the line is long.
10349             #
10350             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10351             #
10352             # Here is another example, this time in which we used leading '...'
10353             # because of excessive length:
10354             #
10355             # 2: ... er of the World Wide Web Consortium's
10356             #
10357             # input parameters are:
10358             # $lineno = line number
10359             # $str = the text of the line
10360             # $pos = position of interest (the error) : 0 = first character
10361             #
10362             # We return :
10363             # - $offset = an offset which corrects the position in case we only
10364             # display part of a line, such that $pos-$offset is the effective
10365             # position from the start of the displayed line.
10366             # - $numbered_line = the numbered line as above,
10367             # - $underline = a blank 'underline' which is all spaces with the same
10368             # number of characters as the numbered line.
10369              
10370 0     0 0 0 my ( $lineno, $str, $pos ) = @_;
10371 0 0       0 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
10372 0         0 my $excess = length($str) - $offset - 68;
10373 0 0       0 my $numc = ( $excess > 0 ) ? 68 : undef;
10374              
10375 0 0       0 if ( defined($numc) ) {
10376 0 0       0 if ( $offset == 0 ) {
10377 0         0 $str = substr( $str, $offset, $numc - 4 ) . " ...";
10378             }
10379             else {
10380 0         0 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
10381             }
10382             }
10383             else {
10384              
10385 0 0       0 if ( $offset == 0 ) {
10386             }
10387             else {
10388 0         0 $str = "... " . substr( $str, $offset + 4 );
10389             }
10390             }
10391              
10392 0         0 my $numbered_line = sprintf( "%d: ", $lineno );
10393 0         0 $offset -= length($numbered_line);
10394 0         0 $numbered_line .= $str;
10395 0         0 my $underline = SPACE x length($numbered_line);
10396 0         0 return ( $offset, $numbered_line, $underline );
10397             } ## end sub make_numbered_line
10398              
10399             sub write_on_underline {
10400              
10401             # The "underline" is a string that shows where an error is; it starts
10402             # out as a string of blanks with the same length as the numbered line of
10403             # code above it, and we have to add marking to show where an error is.
10404             # In the example below, we want to write the string '--^' just below
10405             # the line of bad code:
10406             #
10407             # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
10408             # ---^
10409             # We are given the current underline string, plus a position and a
10410             # string to write on it.
10411             #
10412             # In the above example, there will be 2 calls to do this:
10413             # First call: $pos=19, pos_chr=^
10414             # Second call: $pos=16, pos_chr=---
10415             #
10416             # This is a trivial thing to do with substr, but there is some
10417             # checking to do.
10418              
10419 0     0 0 0 my ( $underline, $pos, $pos_chr ) = @_;
10420              
10421             # check for error..shouldn't happen
10422 0 0 0     0 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
10423 0         0 return $underline;
10424             }
10425 0         0 my $excess = length($pos_chr) + $pos - length($underline);
10426 0 0       0 if ( $excess > 0 ) {
10427 0         0 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
10428             }
10429 0         0 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
10430 0         0 return ($underline);
10431             } ## end sub write_on_underline
10432              
10433             sub pre_tokenize {
10434              
10435 6170     6170 0 12148 my ( $str, $max_tokens_wanted ) = @_;
10436              
10437             # Input parameter:
10438             # $max_tokens_wanted > 0 to stop on reaching this many tokens.
10439             # = 0 means get all tokens
10440              
10441             # Break a string, $str, into a sequence of preliminary tokens. We
10442             # are interested in these types of tokens:
10443             # words (type='w'), example: 'max_tokens_wanted'
10444             # digits (type = 'd'), example: '0755'
10445             # whitespace (type = 'b'), example: ' '
10446             # any other single character (i.e. punct; type = the character itself).
10447             # We cannot do better than this yet because we might be in a quoted
10448             # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
10449             # tokens.
10450              
10451             # An advantage of doing this pre-tokenization step is that it keeps almost
10452             # all of the regex work highly localized. A disadvantage is that in some
10453             # very rare instances we will have to go back and split a pre-token.
10454              
10455             # Return parameters:
10456 6170         10414 my @tokens = (); # array of the tokens themselves
10457 6170         12611 my @token_map = (0); # string position of start of each token
10458 6170         9807 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
10459              
10460 6170         9638 do {
10461              
10462             # whitespace
10463 82014 100       267997 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
  18684 100       32629  
    100          
    100          
10464              
10465             # numbers
10466             # note that this must come before words!
10467 2593         5527 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
10468              
10469             # words
10470 16628         30404 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
10471              
10472             # single-character punctuation
10473 38075         80674 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
10474              
10475             # that's all..
10476             else {
10477 6034         37446 return ( \@tokens, \@token_map, \@type );
10478             }
10479              
10480 75980         135096 push @tokens, $1;
10481 75980         157063 push @token_map, pos($str);
10482              
10483             } while ( --$max_tokens_wanted != 0 );
10484              
10485 136         814 return ( \@tokens, \@token_map, \@type );
10486             } ## end sub pre_tokenize
10487              
10488             sub show_tokens {
10489              
10490             # this is an old debug routine
10491             # not called, but saved for reference
10492 0     0 0 0 my ( $rtokens, $rtoken_map ) = @_;
10493 0         0 my $num = scalar( @{$rtokens} );
  0         0  
10494              
10495 0         0 foreach my $i ( 0 .. $num - 1 ) {
10496 0         0 my $len = length( $rtokens->[$i] );
10497 0         0 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
10498             }
10499 0         0 return;
10500             } ## end sub show_tokens
10501              
10502             { ## closure for sub matching end token
10503             my %matching_end_token;
10504              
10505             BEGIN {
10506 38     38   60305 %matching_end_token = (
10507             '{' => '}',
10508             '(' => ')',
10509             '[' => ']',
10510             '<' => '>',
10511             );
10512             } ## end BEGIN
10513              
10514             sub matching_end_token {
10515              
10516             # return closing character for a pattern
10517 2993     2993 0 4834 my $beginning_token = shift;
10518 2993 100       7936 if ( $matching_end_token{$beginning_token} ) {
10519 373         961 return $matching_end_token{$beginning_token};
10520             }
10521 2620         5860 return ($beginning_token);
10522             } ## end sub matching_end_token
10523             }
10524              
10525             sub dump_token_types {
10526 0     0 0   my ( $class, $fh ) = @_;
10527              
10528             # This should be the latest list of token types in use
10529             # adding NEW_TOKENS: add a comment here
10530 0           $fh->print(<<'END_OF_LIST');
10531              
10532             Here is a list of the token types currently used for lines of type 'CODE'.
10533             For the following tokens, the "type" of a token is just the token itself.
10534              
10535             .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
10536             ( ) <= >= == =~ !~ != ++ -- /= x=
10537             ... **= <<= >>= &&= ||= //= <=>
10538             , + - / * | % ! x ~ = \ ? : . < > ^ &
10539              
10540             The following additional token types are defined:
10541              
10542             type meaning
10543             b blank (white space)
10544             { indent: opening structural curly brace or square bracket or paren
10545             (code block, anonymous hash reference, or anonymous array reference)
10546             } outdent: right structural curly brace or square bracket or paren
10547             [ left non-structural square bracket (enclosing an array index)
10548             ] right non-structural square bracket
10549             ( left non-structural paren (all but a list right of an =)
10550             ) right non-structural paren
10551             L left non-structural curly brace (enclosing a key)
10552             R right non-structural curly brace
10553             ; terminal semicolon
10554             f indicates a semicolon in a "for" statement
10555             h here_doc operator <<
10556             # a comment
10557             Q indicates a quote or pattern
10558             q indicates a qw quote block
10559             k a perl keyword
10560             C user-defined constant or constant function (with void prototype = ())
10561             U user-defined function taking parameters
10562             G user-defined function taking block parameter (like grep/map/eval)
10563             M (unused, but reserved for subroutine definition name)
10564             P (unused, but -html uses it to label pod text)
10565             t type indicater such as %,$,@,*,&,sub
10566             w bare word (perhaps a subroutine call)
10567             i identifier of some type (with leading %, $, @, *, &, sub, -> )
10568             n a number
10569             v a v-string
10570             F a file test operator (like -e)
10571             Y File handle
10572             Z identifier in indirect object slot: may be file handle, object
10573             J LABEL: code block label
10574             j LABEL after next, last, redo, goto
10575             p unary +
10576             m unary -
10577             pp pre-increment operator ++
10578             mm pre-decrement operator --
10579             A : used as attribute separator
10580            
10581             Here are the '_line_type' codes used internally:
10582             SYSTEM - system-specific code before hash-bang line
10583             CODE - line of perl code (including comments)
10584             POD_START - line starting pod, such as '=head'
10585             POD - pod documentation text
10586             POD_END - last line of pod section, '=cut'
10587             HERE - text of here-document
10588             HERE_END - last line of here-doc (target word)
10589             FORMAT - format section
10590             FORMAT_END - last line of format section, '.'
10591             SKIP - code skipping section
10592             SKIP_END - last line of code skipping section, '#>>V'
10593             DATA_START - __DATA__ line
10594             DATA - unidentified text following __DATA__
10595             END_START - __END__ line
10596             END - unidentified text following __END__
10597             ERROR - we are in big trouble, probably not a perl script
10598             END_OF_LIST
10599              
10600 0           return;
10601             } ## end sub dump_token_types
10602              
10603             BEGIN {
10604              
10605             # These names are used in error messages
10606 38     38   371 @opening_brace_names = qw# '{' '[' '(' '?' #;
10607 38         163 @closing_brace_names = qw# '}' ']' ')' ':' #;
10608              
10609 38         89 my @q;
10610              
10611 38         227 my @digraphs = qw(
10612             .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
10613             <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
10614             );
10615 38         807 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
10616              
10617 38         212 @q = qw(
10618             . : < > * & | / - = + - % ^ ! x ~
10619             );
10620 38         462 @can_start_digraph{@q} = (1) x scalar(@q);
10621              
10622 38         218 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
10623 38         317 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
10624              
10625 38         158 my @tetragraphs = qw( <<>> );
10626 38         170 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
10627              
10628             # make a hash of all valid token types for self-checking the tokenizer
10629             # (adding NEW_TOKENS : select a new character and add to this list)
10630 38         641 my @valid_token_types = qw#
10631             A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
10632             { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
10633             #;
10634 38         355 push( @valid_token_types, @digraphs );
10635 38         183 push( @valid_token_types, @trigraphs );
10636 38         98 push( @valid_token_types, @tetragraphs );
10637 38         92 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
10638 38         1474 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
10639              
10640             # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
10641 38         264 my @file_test_operators =
10642             qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
10643 38         808 @is_file_test_operator{@file_test_operators} =
10644             (1) x scalar(@file_test_operators);
10645              
10646             # these functions have prototypes of the form (&), so when they are
10647             # followed by a block, that block MAY BE followed by an operator.
10648             # Smartmatch operator ~~ may be followed by anonymous hash or array ref
10649 38         211 @q = qw( do eval );
10650 38         148 @is_block_operator{@q} = (1) x scalar(@q);
10651              
10652             # these functions allow an identifier in the indirect object slot
10653 38         117 @q = qw( print printf sort exec system say);
10654 38         237 @is_indirect_object_taker{@q} = (1) x scalar(@q);
10655              
10656             # Note: 'field' will be added by sub check_options if --use-feature=class
10657 38         115 @q = qw(my our state);
10658 38         162 @is_my_our_state{@q} = (1) x scalar(@q);
10659              
10660             # These tokens may precede a code block
10661             # patched for SWITCH/CASE/CATCH. Actually these could be removed
10662             # now and we could let the extended-syntax coding handle them.
10663             # Added 'default' for Switch::Plain.
10664             # Note: 'ADJUST' will be added by sub check_options if --use-feature=class
10665 38         211 @q =
10666             qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
10667             unless do while until eval for foreach map grep sort
10668             switch case given when default catch try finally);
10669 38         863 @is_code_block_token{@q} = (1) x scalar(@q);
10670              
10671             # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
10672             # to contrast it with the block types in '%is_zero_continuation_block_type'
10673 38         210 @q = qw( sort map grep eval do );
10674 38         155 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
10675              
10676 38         119 @q = qw( sort map grep );
10677 38         124 @is_sort_map_grep{@q} = (1) x scalar(@q);
10678              
10679 38         97 %is_grep_alias = ();
10680              
10681             # I'll build the list of keywords incrementally
10682 38         99 my @Keywords = ();
10683              
10684             # keywords and tokens after which a value or pattern is expected,
10685             # but not an operator. In other words, these should consume terms
10686             # to their right, or at least they are not expected to be followed
10687             # immediately by operators.
10688 38         1492 my @value_requestor = qw(
10689             AUTOLOAD
10690             BEGIN
10691             CHECK
10692             DESTROY
10693             END
10694             EQ
10695             GE
10696             GT
10697             INIT
10698             LE
10699             LT
10700             NE
10701             UNITCHECK
10702             abs
10703             accept
10704             alarm
10705             and
10706             atan2
10707             bind
10708             binmode
10709             bless
10710             break
10711             caller
10712             chdir
10713             chmod
10714             chomp
10715             chop
10716             chown
10717             chr
10718             chroot
10719             close
10720             closedir
10721             cmp
10722             connect
10723             continue
10724             cos
10725             crypt
10726             dbmclose
10727             dbmopen
10728             defined
10729             delete
10730             die
10731             dump
10732             each
10733             else
10734             elsif
10735             eof
10736             eq
10737             evalbytes
10738             exec
10739             exists
10740             exit
10741             exp
10742             fc
10743             fcntl
10744             fileno
10745             flock
10746             for
10747             foreach
10748             formline
10749             ge
10750             getc
10751             getgrgid
10752             getgrnam
10753             gethostbyaddr
10754             gethostbyname
10755             getnetbyaddr
10756             getnetbyname
10757             getpeername
10758             getpgrp
10759             getpriority
10760             getprotobyname
10761             getprotobynumber
10762             getpwnam
10763             getpwuid
10764             getservbyname
10765             getservbyport
10766             getsockname
10767             getsockopt
10768             glob
10769             gmtime
10770             goto
10771             grep
10772             gt
10773             hex
10774             if
10775             index
10776             int
10777             ioctl
10778             join
10779             keys
10780             kill
10781             last
10782             lc
10783             lcfirst
10784             le
10785             length
10786             link
10787             listen
10788             local
10789             localtime
10790             lock
10791             log
10792             lstat
10793             lt
10794             map
10795             mkdir
10796             msgctl
10797             msgget
10798             msgrcv
10799             msgsnd
10800             my
10801             ne
10802             next
10803             no
10804             not
10805             oct
10806             open
10807             opendir
10808             or
10809             ord
10810             our
10811             pack
10812             pipe
10813             pop
10814             pos
10815             print
10816             printf
10817             prototype
10818             push
10819             quotemeta
10820             rand
10821             read
10822             readdir
10823             readlink
10824             readline
10825             readpipe
10826             recv
10827             redo
10828             ref
10829             rename
10830             require
10831             reset
10832             return
10833             reverse
10834             rewinddir
10835             rindex
10836             rmdir
10837             scalar
10838             seek
10839             seekdir
10840             select
10841             semctl
10842             semget
10843             semop
10844             send
10845             sethostent
10846             setnetent
10847             setpgrp
10848             setpriority
10849             setprotoent
10850             setservent
10851             setsockopt
10852             shift
10853             shmctl
10854             shmget
10855             shmread
10856             shmwrite
10857             shutdown
10858             sin
10859             sleep
10860             socket
10861             socketpair
10862             sort
10863             splice
10864             split
10865             sprintf
10866             sqrt
10867             srand
10868             stat
10869             state
10870             study
10871             substr
10872             symlink
10873             syscall
10874             sysopen
10875             sysread
10876             sysseek
10877             system
10878             syswrite
10879             tell
10880             telldir
10881             tie
10882             tied
10883             truncate
10884             uc
10885             ucfirst
10886             umask
10887             undef
10888             unless
10889             unlink
10890             unpack
10891             unshift
10892             untie
10893             until
10894             use
10895             utime
10896             values
10897             vec
10898             waitpid
10899             warn
10900             while
10901             write
10902             xor
10903              
10904             switch
10905             case
10906             default
10907             given
10908             when
10909             err
10910             say
10911             isa
10912              
10913             catch
10914              
10915             );
10916              
10917             # Note: 'ADJUST', 'field' are added by sub check_options
10918             # if --use-feature=class
10919              
10920             # patched above for SWITCH/CASE given/when err say
10921             # 'err' is a fairly safe addition.
10922             # Added 'default' for Switch::Plain. Note that we could also have
10923             # a separate set of keywords to include if we see 'use Switch::Plain'
10924 38         1551 push( @Keywords, @value_requestor );
10925              
10926             # These are treated the same but are not keywords:
10927 38         174 my @extra_vr = qw(
10928             constant
10929             vars
10930             );
10931 38         290 push( @value_requestor, @extra_vr );
10932              
10933 38         4778 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10934              
10935             # this list contains keywords which do not look for arguments,
10936             # so that they might be followed by an operator, or at least
10937             # not a term.
10938 38         344 my @operator_requestor = qw(
10939             endgrent
10940             endhostent
10941             endnetent
10942             endprotoent
10943             endpwent
10944             endservent
10945             fork
10946             getgrent
10947             gethostent
10948             getlogin
10949             getnetent
10950             getppid
10951             getprotoent
10952             getpwent
10953             getservent
10954             setgrent
10955             setpwent
10956             time
10957             times
10958             wait
10959             wantarray
10960             );
10961              
10962 38         175 push( @Keywords, @operator_requestor );
10963              
10964             # These are treated the same but are not considered keywords:
10965 38         129 my @extra_or = qw(
10966             STDERR
10967             STDIN
10968             STDOUT
10969             );
10970              
10971 38         169 push( @operator_requestor, @extra_or );
10972              
10973 38         887 @expecting_operator_token{@operator_requestor} =
10974             (1) x scalar(@operator_requestor);
10975              
10976             # these token TYPES expect trailing operator but not a term
10977             # note: ++ and -- are post-increment and decrement, 'C' = constant
10978 38         248 my @operator_requestor_types = qw( ++ -- C <> q );
10979              
10980             # NOTE: This hash is available but not currently used
10981 38         152 @expecting_operator_types{@operator_requestor_types} =
10982             (1) x scalar(@operator_requestor_types);
10983              
10984             # these token TYPES consume values (terms)
10985             # note: pp and mm are pre-increment and decrement
10986             # f=semicolon in for, F=file test operator
10987 38         920 my @value_requestor_type = qw#
10988             L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10989             **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10990             <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10991             f F pp mm Y p m U J G j >> << ^ t
10992             ~. ^. |. &. ^.= |.= &.=
10993             #;
10994 38         216 push( @value_requestor_type, ',' )
10995             ; # (perl doesn't like a ',' in a qw block)
10996              
10997             # NOTE: This hash is available but not currently used
10998 38         1008 @expecting_term_types{@value_requestor_type} =
10999             (1) x scalar(@value_requestor_type);
11000              
11001             # Note: the following valid token types are not assigned here to
11002             # hashes requesting to be followed by values or terms, but are
11003             # instead currently hard-coded into sub operator_expected:
11004             # ) -> :: Q R Z ] b h i k n v w } #
11005              
11006             # For simple syntax checking, it is nice to have a list of operators which
11007             # will really be unhappy if not followed by a term. This includes most
11008             # of the above...
11009 38         1304 @really_want_term{@value_requestor_type} =
11010             (1) x scalar(@value_requestor_type);
11011              
11012             # with these exceptions...
11013 38         194 delete $really_want_term{'U'}; # user sub, depends on prototype
11014 38         86 delete $really_want_term{'F'}; # file test works on $_ if no following term
11015 38         91 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
11016             # let perl do it
11017 38         177 @q = qw(q qq qx qr s y tr m);
11018 38         227 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
11019              
11020             # Note added 'qw' here
11021 38         165 @q = qw(q qq qw qx qr s y tr m);
11022 38         3963 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
11023              
11024             # Note: 'class' will be added by sub check_options if -use-feature=class
11025 38         265 @q = qw(package);
11026 38         139 @is_package{@q} = (1) x scalar(@q);
11027              
11028 38         86 @q = qw( ? : );
11029 38         98 push @q, ',';
11030 38         107 @is_comma_question_colon{@q} = (1) x scalar(@q);
11031              
11032 38         157 @q = qw( if elsif unless );
11033 38         105 @is_if_elsif_unless{@q} = (1) x scalar(@q);
11034              
11035 38         83 @q = qw( ; t );
11036 38         95 @is_semicolon_or_t{@q} = (1) x scalar(@q);
11037              
11038 38         104 @q = qw( if elsif unless case when );
11039 38         111 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
11040              
11041             # Hash of other possible line endings which may occur.
11042             # Keep these coordinated with the regex where this is used.
11043             # Note: chr(13) = chr(015)="\r".
11044 38         119 @q = ( chr(13), chr(29), chr(26) );
11045 38         209 @other_line_endings{@q} = (1) x scalar(@q);
11046              
11047             # These keywords are handled specially in the tokenizer code:
11048 38         226 my @special_keywords = qw(
11049             do
11050             eval
11051             format
11052             m
11053             package
11054             q
11055             qq
11056             qr
11057             qw
11058             qx
11059             s
11060             sub
11061             tr
11062             y
11063             );
11064 38         185 push( @Keywords, @special_keywords );
11065              
11066             # Keywords after which list formatting may be used
11067             # WARNING: do not include |map|grep|eval or perl may die on
11068             # syntax errors (map1.t).
11069 38         685 my @keyword_taking_list = qw(
11070             and
11071             chmod
11072             chomp
11073             chop
11074             chown
11075             dbmopen
11076             die
11077             elsif
11078             exec
11079             fcntl
11080             for
11081             foreach
11082             formline
11083             getsockopt
11084             if
11085             index
11086             ioctl
11087             join
11088             kill
11089             local
11090             msgctl
11091             msgrcv
11092             msgsnd
11093             my
11094             open
11095             or
11096             our
11097             pack
11098             print
11099             printf
11100             push
11101             read
11102             readpipe
11103             recv
11104             return
11105             reverse
11106             rindex
11107             seek
11108             select
11109             semctl
11110             semget
11111             send
11112             setpriority
11113             setsockopt
11114             shmctl
11115             shmget
11116             shmread
11117             shmwrite
11118             socket
11119             socketpair
11120             sort
11121             splice
11122             split
11123             sprintf
11124             state
11125             substr
11126             syscall
11127             sysopen
11128             sysread
11129             sysseek
11130             system
11131             syswrite
11132             tie
11133             unless
11134             unlink
11135             unpack
11136             unshift
11137             until
11138             vec
11139             warn
11140             while
11141             given
11142             when
11143             );
11144              
11145             # NOTE: This hash is available but not currently used
11146 38         1214 @is_keyword_taking_list{@keyword_taking_list} =
11147             (1) x scalar(@keyword_taking_list);
11148              
11149             # perl functions which may be unary operators.
11150              
11151             # This list is used to decide if a pattern delimited by slashes, /pattern/,
11152             # can follow one of these keywords.
11153 38         191 @q = qw(
11154             chomp eof eval fc lc pop shift uc undef
11155             );
11156              
11157 38         238 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
11158             (1) x scalar(@q);
11159              
11160             # These are keywords for which an arg may optionally be omitted. They are
11161             # currently only used to disambiguate a ? used as a ternary from one used
11162             # as a (deprecated) pattern delimiter. In the future, they might be used
11163             # to give a warning about ambiguous syntax before a /.
11164             # Note: split has been omitted (see note below).
11165 38         592 my @keywords_taking_optional_arg = qw(
11166             abs
11167             alarm
11168             caller
11169             chdir
11170             chomp
11171             chop
11172             chr
11173             chroot
11174             close
11175             cos
11176             defined
11177             die
11178             eof
11179             eval
11180             evalbytes
11181             exit
11182             exp
11183             fc
11184             getc
11185             glob
11186             gmtime
11187             hex
11188             int
11189             last
11190             lc
11191             lcfirst
11192             length
11193             localtime
11194             log
11195             lstat
11196             mkdir
11197             next
11198             oct
11199             ord
11200             pop
11201             pos
11202             print
11203             printf
11204             prototype
11205             quotemeta
11206             rand
11207             readline
11208             readlink
11209             readpipe
11210             redo
11211             ref
11212             require
11213             reset
11214             reverse
11215             rmdir
11216             say
11217             select
11218             shift
11219             sin
11220             sleep
11221             sqrt
11222             srand
11223             stat
11224             study
11225             tell
11226             uc
11227             ucfirst
11228             umask
11229             undef
11230             unlink
11231             warn
11232             write
11233             );
11234 38         887 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
11235             (1) x scalar(@keywords_taking_optional_arg);
11236              
11237             # This list is used to decide if a pattern delimited by question marks,
11238             # ?pattern?, can follow one of these keywords. Note that from perl 5.22
11239             # on, a ?pattern? is not recognized, so we can be much more strict than
11240             # with a /pattern/. Note that 'split' is not in this list. In current
11241             # versions of perl a question following split must be a ternary, but
11242             # in older versions it could be a pattern. The guessing algorithm will
11243             # decide. We are combining two lists here to simplify the test.
11244 38         810 @q = ( @keywords_taking_optional_arg, @operator_requestor );
11245 38         1437 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
11246             (1) x scalar(@q);
11247              
11248             # These are not used in any way yet
11249             # my @unused_keywords = qw(
11250             # __FILE__
11251             # __LINE__
11252             # __PACKAGE__
11253             # );
11254              
11255             # The list of keywords was originally extracted from function 'keyword' in
11256             # perl file toke.c version 5.005.03, using this utility, plus a
11257             # little editing: (file getkwd.pl):
11258             # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
11259             # Add 'get' prefix where necessary, then split into the above lists.
11260             # This list should be updated as necessary.
11261             # The list should not contain these special variables:
11262             # ARGV DATA ENV SIG STDERR STDIN STDOUT
11263             # __DATA__ __END__
11264              
11265 38         8823 @is_keyword{@Keywords} = (1) x scalar(@Keywords);
11266             } ## end BEGIN
11267             1;