| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package PPI::Tokenizer; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=pod |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
PPI::Tokenizer - The Perl Document Tokenizer |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Create a tokenizer for a file, array or string |
|
12
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( 'filename.pl' ); |
|
13
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( \@lines ); |
|
14
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( \$source ); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# Return all the tokens for the document |
|
17
|
|
|
|
|
|
|
my $tokens = $Tokenizer->all_tokens; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Or we can use it as an iterator |
|
20
|
|
|
|
|
|
|
while ( my $Token = $Tokenizer->get_token ) { |
|
21
|
|
|
|
|
|
|
print "Found token '$Token'\n"; |
|
22
|
|
|
|
|
|
|
} |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# If we REALLY need to manually nudge the cursor, you |
|
25
|
|
|
|
|
|
|
# can do that to (The lexer needs this ability to do rollbacks) |
|
26
|
|
|
|
|
|
|
$is_incremented = $Tokenizer->increment_cursor; |
|
27
|
|
|
|
|
|
|
$is_decremented = $Tokenizer->decrement_cursor; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
PPI::Tokenizer is the class that provides Tokenizer objects for use in |
|
32
|
|
|
|
|
|
|
breaking strings of Perl source code into Tokens. |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
By the time you are reading this, you probably need to know a little |
|
35
|
|
|
|
|
|
|
about the difference between how perl parses Perl "code" and how PPI |
|
36
|
|
|
|
|
|
|
parses Perl "documents". |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
"perl" itself (the interpreter) uses a heavily modified lex specification |
|
39
|
|
|
|
|
|
|
to specify its parsing logic, maintains several types of state as it |
|
40
|
|
|
|
|
|
|
goes, and incrementally tokenizes, lexes AND EXECUTES at the same time. |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
In fact, it is provably impossible to use perl's parsing method without |
|
43
|
|
|
|
|
|
|
simultaneously executing code. A formal mathematical proof has been |
|
44
|
|
|
|
|
|
|
published demonstrating the method. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
This is where the truism "Only perl can parse Perl" comes from. |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
PPI uses a completely different approach by abandoning the (impossible) |
|
49
|
|
|
|
|
|
|
ability to parse Perl the same way that the interpreter does, and instead |
|
50
|
|
|
|
|
|
|
parsing the source as a document, using a document structure independently |
|
51
|
|
|
|
|
|
|
derived from the Perl documentation and approximating the perl interpreter |
|
52
|
|
|
|
|
|
|
interpretation as closely as possible. |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
It was touch and go for a long time whether we could get it close enough, |
|
55
|
|
|
|
|
|
|
but in the end it turned out that it could be done. |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
In this approach, the tokenizer C<PPI::Tokenizer> is implemented separately |
|
58
|
|
|
|
|
|
|
from the lexer L<PPI::Lexer>. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
The job of C is to take pure source as a string and break it |
|
61
|
|
|
|
|
|
|
up into a stream/set of tokens, and contains most of the "black magic" used |
|
62
|
|
|
|
|
|
|
in PPI. By comparison, the lexer implements a relatively straight forward |
|
63
|
|
|
|
|
|
|
tree structure, and has an implementation that is uncomplicated (compared |
|
64
|
|
|
|
|
|
|
to the insanity in the tokenizer at least). |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
The Tokenizer uses an immense amount of heuristics, guessing and cruft, |
|
67
|
|
|
|
|
|
|
supported by a very B flexible internal API, but fortunately it was |
|
68
|
|
|
|
|
|
|
possible to largely encapsulate the black magic, so there is not a lot that |
|
69
|
|
|
|
|
|
|
gets exposed to people using the C itself. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head1 METHODS |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Despite the incredible complexity, the Tokenizer itself only exposes a |
|
74
|
|
|
|
|
|
|
relatively small number of methods, with most of the complexity implemented |
|
75
|
|
|
|
|
|
|
in private methods. |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=cut |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Make sure everything we need is loaded so |
|
80
|
|
|
|
|
|
|
# we don't have to go and load all of PPI. |
|
81
|
64
|
|
|
64
|
|
375
|
use strict; |
|
|
64
|
|
|
|
|
119
|
|
|
|
64
|
|
|
|
|
1820
|
|
|
82
|
64
|
|
|
64
|
|
290
|
use Params::Util qw{_INSTANCE _SCALAR0 _ARRAY0}; |
|
|
64
|
|
|
|
|
128
|
|
|
|
64
|
|
|
|
|
2852
|
|
|
83
|
64
|
|
|
64
|
|
328
|
use List::Util 1.33 (); |
|
|
64
|
|
|
|
|
958
|
|
|
|
64
|
|
|
|
|
1013
|
|
|
84
|
64
|
|
|
64
|
|
283
|
use PPI::Util (); |
|
|
64
|
|
|
|
|
112
|
|
|
|
64
|
|
|
|
|
997
|
|
|
85
|
64
|
|
|
64
|
|
298
|
use PPI::Element (); |
|
|
64
|
|
|
|
|
135
|
|
|
|
64
|
|
|
|
|
1033
|
|
|
86
|
64
|
|
|
64
|
|
300
|
use PPI::Token (); |
|
|
64
|
|
|
|
|
142
|
|
|
|
64
|
|
|
|
|
1214
|
|
|
87
|
64
|
|
|
64
|
|
310
|
use PPI::Exception (); |
|
|
64
|
|
|
|
|
125
|
|
|
|
64
|
|
|
|
|
1042
|
|
|
88
|
64
|
|
|
64
|
|
21973
|
use PPI::Exception::ParserRejection (); |
|
|
64
|
|
|
|
|
162
|
|
|
|
64
|
|
|
|
|
154239
|
|
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
our $VERSION = '1.276'; |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# The x operator cannot follow most Perl operators, implying that |
|
93
|
|
|
|
|
|
|
# anything beginning with x following an operator is a word. |
|
94
|
|
|
|
|
|
|
# These are the exceptions. |
|
95
|
|
|
|
|
|
|
my %X_CAN_FOLLOW_OPERATOR = map { $_ => 1 } qw( -- ++ ); |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# The x operator cannot follow most structure elements, implying that |
|
98
|
|
|
|
|
|
|
# anything beginning with x following a structure element is a word. |
|
99
|
|
|
|
|
|
|
# These are the exceptions. |
|
100
|
|
|
|
|
|
|
my %X_CAN_FOLLOW_STRUCTURE = map { $_ => 1 } qw( } ] \) ); |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Something that looks like the x operator but follows a word |
|
103
|
|
|
|
|
|
|
# is usually that word's argument. |
|
104
|
|
|
|
|
|
|
# These are the exceptions. |
|
105
|
|
|
|
|
|
|
# chop, chomp, dump are ambiguous because they can have either parms |
|
106
|
|
|
|
|
|
|
# or no parms. |
|
107
|
|
|
|
|
|
|
my %X_CAN_FOLLOW_WORD = map { $_ => 1 } qw( |
|
108
|
|
|
|
|
|
|
endgrent |
|
109
|
|
|
|
|
|
|
endhostent |
|
110
|
|
|
|
|
|
|
endnetent |
|
111
|
|
|
|
|
|
|
endprotoent |
|
112
|
|
|
|
|
|
|
endpwent |
|
113
|
|
|
|
|
|
|
endservent |
|
114
|
|
|
|
|
|
|
fork |
|
115
|
|
|
|
|
|
|
getgrent |
|
116
|
|
|
|
|
|
|
gethostent |
|
117
|
|
|
|
|
|
|
getlogin |
|
118
|
|
|
|
|
|
|
getnetent |
|
119
|
|
|
|
|
|
|
getppid |
|
120
|
|
|
|
|
|
|
getprotoent |
|
121
|
|
|
|
|
|
|
getpwent |
|
122
|
|
|
|
|
|
|
getservent |
|
123
|
|
|
|
|
|
|
setgrent |
|
124
|
|
|
|
|
|
|
setpwent |
|
125
|
|
|
|
|
|
|
time |
|
126
|
|
|
|
|
|
|
times |
|
127
|
|
|
|
|
|
|
wait |
|
128
|
|
|
|
|
|
|
wantarray |
|
129
|
|
|
|
|
|
|
__SUB__ |
|
130
|
|
|
|
|
|
|
); |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
##################################################################### |
|
135
|
|
|
|
|
|
|
# Creation and Initialization |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=pod |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head2 new $file | \@lines | \$source |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The main C<new> constructor creates a new Tokenizer object. These |
|
142
|
|
|
|
|
|
|
objects have no configuration parameters, and can only be used once, |
|
143
|
|
|
|
|
|
|
to tokenize a single perl source file. |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
It takes as argument either a normal scalar containing source code, |
|
146
|
|
|
|
|
|
|
a reference to a scalar containing source code, or a reference to an |
|
147
|
|
|
|
|
|
|
ARRAY containing newline-terminated lines of source code. |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Returns a new C object on success, or throws a |
|
150
|
|
|
|
|
|
|
L exception on error. |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=cut |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub new {
    # Constructor. Accepts a filename, a SCALAR ref of source text, or an
    # ARRAY ref of lines. Returns a single-use tokenizer object, or throws
    # a PPI::Exception on bad input.
    my $class = ref($_[0]) || $_[0];

    # Create the empty tokenizer struct
    my $self = bless {
        # Source code
        source       => undef,
        source_bytes => undef,

        # Line buffer
        line         => undef,
        line_length  => undef,
        line_cursor  => undef,
        line_count   => 0,

        # Parse state
        token        => undef,
        class        => 'PPI::Token::BOM',
        zone         => 'PPI::Token::Whitespace',

        # Output token buffer
        tokens       => [],
        token_cursor => 0,
        token_eof    => 0,

        # Perl 6 blocks
        perl6        => [],
    }, $class;

    if ( ! defined $_[1] ) {
        # We weren't given anything
        PPI::Exception->throw("No source provided to Tokenizer");

    } elsif ( ! ref $_[1] ) {
        # Plain scalar: treated as a filename and slurped from disk.
        my $source = PPI::Util::_slurp($_[1]);
        if ( ref $source ) {
            # Content returned by reference
            $self->{source} = $$source;
        } else {
            # Errors returned as a string
            return( $source );
        }

    } elsif ( _SCALAR0($_[1]) ) {
        # Reference to a scalar containing the source text.
        $self->{source} = ${$_[1]};

    } elsif ( _ARRAY0($_[1]) ) {
        # Reference to an ARRAY of newline-terminated lines.
        # NOTE(review): this maps every element to a bare "\n", which
        # discards the line content entirely — looks like it should be
        # map { "$_\n" }. Confirm against the upstream source before
        # relying on the ARRAY-ref input path.
        $self->{source} = join '', map { "\n" } @{$_[1]};

    } else {
        # We don't support whatever this is
        PPI::Exception->throw(ref($_[1]) . " is not supported as a source provider");
    }

    # We can't handle a null string
    $self->{source_bytes} = length $self->{source};
    if ( $self->{source_bytes} ) {
        # Normalize CRLF/CR/LF to "\n", then split into lines while
        # keeping the trailing newline on each line.
        $self->{source} =~ s/(?:\015{1,2}\012|\015|\012)/\n/g;
        $self->{source} = [ split /(?<=\n)/, $self->{source} ];

    } else {
        $self->{source} = [ ];
    }

    ### EVIL
    # Tokens tend to change classes as each character is added, and only
    # get their final class at the "end" of the token, which is detected
    # in many scattered places. For token types in which whitespace is
    # valid (comments, whitespace, big strings) that detection never
    # fires at EOF. So we append a single space to the source to trigger
    # normal end-of-token handling everywhere, and remember (via
    # source_eof_chop) whether that space must be removed or chopped off
    # the last token once tokenizing finishes.
    if ( List::Util::any { /^__(?:DATA|END)__\s*$/ } @{$self->{source}} ) {
        # A data section swallows the rest of the file; no chop needed.
        $self->{source_eof_chop} = '';
    } elsif ( ! defined $self->{source}->[0] ) {
        # Empty document; nothing to chop.
        $self->{source_eof_chop} = '';
    } elsif ( $self->{source}->[-1] =~ /\s$/ ) {
        # Already ends in whitespace; the trick is unnecessary.
        $self->{source_eof_chop} = '';
    } else {
        $self->{source_eof_chop} = 1;
        $self->{source}->[-1] .= ' ';
    }

    $self;
}
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
##################################################################### |
|
256
|
|
|
|
|
|
|
# Main Public Methods |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=pod |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head2 get_token |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
When using the PPI::Tokenizer object as an iterator, the C |
|
263
|
|
|
|
|
|
|
method is the primary method that is used. It increments the cursor |
|
264
|
|
|
|
|
|
|
and returns the next Token in the output array. |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
The actual parsing of the file is done only as-needed, and a line at |
|
267
|
|
|
|
|
|
|
a time. When C hits the end of the token array, it will |
|
268
|
|
|
|
|
|
|
cause the parser to pull in the next line and parse it, continuing |
|
269
|
|
|
|
|
|
|
as needed until there are more tokens on the output array that |
|
270
|
|
|
|
|
|
|
get_token can then return. |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
This means that a number of Tokenizer objects can be created, and |
|
273
|
|
|
|
|
|
|
won't consume significant CPU until you actually begin to pull tokens |
|
274
|
|
|
|
|
|
|
from it. |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Return a L<PPI::Token> object on success, C<0> if the Tokenizer had |
|
277
|
|
|
|
|
|
|
reached the end of the file, or C on error. |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub get_token {
    # Iterator interface: return the next PPI::Token, pulling and
    # tokenizing more source lines on demand. Returns a token object,
    # 0 at end-of-file, or undef on error.
    my $self = shift;

    # Shortcut for EOF
    if ( $self->{token_eof}
     and $self->{token_cursor} > scalar @{$self->{tokens}}
    ) {
        return 0;
    }

    # Return the next token if we can
    if ( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) {
        $self->{token_cursor}++;
        return $token;
    }

    my $line_rv;

    # Catch exceptions and return undef, so that we
    # can start to convert code to exception-based code.
    my $rv = eval {
        # No token, we need to get some more
        while ( $line_rv = $self->_process_next_line ) {
            # If there is something in the buffer, return it
            # The defined() prevents a ton of calls to PPI::Util::TRUE
            if ( defined( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) ) {
                $self->{token_cursor}++;
                return $token;
            }
        }
        return undef;
    };
    if ( $@ ) {
        # Re-throw structured exceptions as-is; wrap plain die strings
        # in a PPI::Exception after stripping the " at line" suffix.
        if ( _INSTANCE($@, 'PPI::Exception') ) {
            $@->throw;
        } else {
            my $errstr = $@;
            $errstr =~ s/^(.*) at line .+$/$1/;
            PPI::Exception->throw( $errstr );
        }
    } elsif ( $rv ) {
        return $rv;
    }

    if ( defined $line_rv ) {
        # End of file, but we can still return things from the buffer
        if ( my $token = $self->{tokens}->[ $self->{token_cursor} ] ) {
            $self->{token_cursor}++;
            return $token;
        }

        # Set our token end of file flag
        $self->{token_eof} = 1;
        return 0;
    }

    # Error, pass it up to our caller
    undef;
}
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=pod |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 all_tokens |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
When not being used as an iterator, the C method tells |
|
346
|
|
|
|
|
|
|
the Tokenizer to parse the entire file and return all of the tokens |
|
347
|
|
|
|
|
|
|
in a single ARRAY reference. |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
It should be noted that C<all_tokens> does B<not> interfere with the |
|
350
|
|
|
|
|
|
|
use of the Tokenizer object as an iterator (does not modify the token |
|
351
|
|
|
|
|
|
|
cursor) and use of the two different mechanisms can be mixed safely. |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Returns a reference to an ARRAY of L objects on success |
|
354
|
|
|
|
|
|
|
or throws an exception on error. |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=cut |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
sub all_tokens {
    # Batch interface: tokenize the remainder of the document and return
    # a reference to a copy of the complete token array. Does not move
    # the iterator cursor, so it can be mixed freely with get_token.
    my $self = shift;

    # Wrap the processing in an eval so that legacy (non-exception)
    # failure modes can be converted into thrown PPI::Exception objects.
    my $completed = eval {
        unless ( $self->{token_eof} ) {
            # Consume source lines until EOF (0) or error (undef).
            my $line_status;
            while ( $line_status = $self->_process_next_line ) { }
            PPI::Exception->throw("Error while processing source")
                unless defined $line_status;

            # Strip the artificial trailing space added at creation time.
            $self->_clean_eof;
        }
        1;
    };

    unless ( $completed ) {
        # Strip perl's " at line ..." suffix before re-throwing.
        ( my $message = $@ ) =~ s/^(.*) at line .+$/$1/;
        PPI::Exception->throw($message);
    }

    # Hand back a shallow copy so callers cannot disturb our buffer.
    return [ @{ $self->{tokens} } ];
}
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=pod |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head2 increment_cursor |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Although exposed as a public method, C is implemented |
|
392
|
|
|
|
|
|
|
for expert use only, when writing lexers or other components that work |
|
393
|
|
|
|
|
|
|
directly on token streams. |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
It manually increments the token cursor forward through the file, in effect |
|
396
|
|
|
|
|
|
|
"skipping" the next token. |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
Return true if the cursor is incremented, C<0> if already at the end of |
|
399
|
|
|
|
|
|
|
the file, or C on error. |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=cut |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
sub increment_cursor {
    # Advance by fetching (and discarding) the next token via get_token,
    # which guarantees there is actually a token there to move past.
    #   token available -> 1
    #   end of file     -> 0      (get_token's own EOF result)
    #   error           -> undef  (propagated from get_token)
    my ($self) = @_;
    my $next = $self->get_token;
    return $next ? 1 : $next;
}
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=pod |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=head2 decrement_cursor |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Although exposed as a public method, C is implemented |
|
414
|
|
|
|
|
|
|
for expert use only, when writing lexers or other components that work |
|
415
|
|
|
|
|
|
|
directly on token streams. |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
It manually decrements the token cursor backwards through the file, in |
|
418
|
|
|
|
|
|
|
effect "rolling back" the token stream. And indeed that is what it is |
|
419
|
|
|
|
|
|
|
primarily intended for, when the component that is consuming the token |
|
420
|
|
|
|
|
|
|
stream needs to implement some sort of "roll back" feature in its use |
|
421
|
|
|
|
|
|
|
of the token stream. |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Return true if the cursor is decremented, C<0> if already at the |
|
424
|
|
|
|
|
|
|
beginning of the file, or C on error. |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=cut |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
sub decrement_cursor {
    # Roll the token cursor back one position (used by the lexer to
    # implement rollback). Returns 0 if already at the start.
    my ($self) = @_;

    # Already at the beginning of the token stream; nothing to do.
    return 0 unless $self->{token_cursor};

    # Moving backwards means we can no longer be at end-of-file.
    $self->{token_eof} = 0;

    # NOTE(review): when the cursor moves from 1 to 0 this returns 0
    # (the new cursor value) even though the POD promises a true value
    # on success — confirm callers only test truthiness at positions > 1
    # before changing this.
    return --$self->{token_cursor};
}
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
##################################################################### |
|
444
|
|
|
|
|
|
|
# Working With Source |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Fetches the next line from the input line buffer |
|
447
|
|
|
|
|
|
|
# Returns undef at EOF. |
|
448
|
|
|
|
|
|
|
sub _get_line {
    # Fetch the next raw line from the input line buffer.
    # Returns the line string, or undef once EOF has been reached.
    my ($self) = @_;

    # A previous call already exhausted the source and flagged EOF.
    return undef unless $self->{source};

    # Take the next line off the front of the buffer.
    my $next_line = shift @{ $self->{source} };

    # An undef shift means the buffer just ran dry: flag EOF for
    # subsequent calls by dropping the buffer entirely.
    $self->{source} = undef if !defined $next_line;

    return $next_line;    # string, or undef at EOF
}
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Fetches the next line, ready to process |
|
463
|
|
|
|
|
|
|
# Returns 1 on success |
|
464
|
|
|
|
|
|
|
# Returns 0 on EOF |
|
465
|
|
|
|
|
|
|
sub _fill_line {
    # Load the next line into the per-line parse state.
    # Returns 1 on success, 0 at EOF.
    my ( $self, $inscan ) = @_;

    my $raw = $self->_get_line;

    if ( defined $raw ) {
        # Populate the line state. The cursor starts at -1 because
        # _process_next_char pre-increments before reading.
        $self->{line}        = $raw;
        $self->{line_cursor} = -1;
        $self->{line_length} = length $raw;
        $self->{line_count}++;
        return 1;
    }

    # EOF. During a look-ahead scan, keep the line state and just park
    # the cursor at end-of-line so the rest of the scan cascades out.
    if ( $inscan ) {
        $self->{line_cursor} = $self->{line_length};
        return 0;
    }

    # Normal EOF: tear down the line state completely.
    delete $self->{$_} for qw( line line_cursor line_length );
    return 0;
}
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
# Get the current character |
|
496
|
|
|
|
|
|
|
sub _char {
    # Return the single character currently under the line cursor.
    my ($self) = @_;
    return substr $self->{line}, $self->{line_cursor}, 1;
}
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
#################################################################### |
|
506
|
|
|
|
|
|
|
# Per line processing methods |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
# Processes the next line |
|
509
|
|
|
|
|
|
|
# Returns 1 on success completion |
|
510
|
|
|
|
|
|
|
# Returns 0 if EOF |
|
511
|
|
|
|
|
|
|
# Returns undef on error |
|
512
|
|
|
|
|
|
|
sub _process_next_line {
    # Process one source line through the tokenizer state machine.
    # Returns 1 on successful completion, 0 at EOF, and throws a
    # PPI::Exception (the "undef on error" contract is being migrated
    # to exceptions) on failure.
    my $self = shift;

    # Fill the line buffer
    my $rv;
    unless ( $rv = $self->_fill_line ) {
        return undef unless defined $rv;

        # End of file, finalize last token
        $self->_finalize_token;
        return 0;
    }

    # Give the current token class first crack at the whole line; many
    # classes (pod, heredoc, comments) can consume it in one pass.
    $rv = $self->{class}->__TOKENIZER__on_line_start( $self );
    unless ( $rv ) {
        # If there are no more source lines, then clean up
        if ( ref $self->{source} eq 'ARRAY' and ! @{$self->{source}} ) {
            $self->_clean_eof;
        }

        # Defined but false means next line
        return 1 if defined $rv;
        PPI::Exception->throw("Error at line $self->{line_count}");
    }

    # If we can't deal with the entire line, process char by char
    while ( $rv = $self->_process_next_char ) {}
    unless ( defined $rv ) {
        PPI::Exception->throw("Error at line $self->{line_count}, character $self->{line_cursor}");
    }

    # Trigger any action that needs to happen at the end of a line
    $self->{class}->__TOKENIZER__on_line_end( $self );

    # If there are no more source lines, then clean up
    unless ( ref($self->{source}) eq 'ARRAY' and @{$self->{source}} ) {
        return $self->_clean_eof;
    }

    return 1;
}
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
##################################################################### |
|
560
|
|
|
|
|
|
|
# Per-character processing methods |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Process on a per-character basis. |
|
563
|
|
|
|
|
|
|
# Note that due the high number of times this gets |
|
564
|
|
|
|
|
|
|
# called, it has been fairly heavily in-lined, so the code |
|
565
|
|
|
|
|
|
|
# might look a bit ugly and duplicated. |
|
566
|
|
|
|
|
|
|
sub _process_next_char {
    # Advance one character and dispatch it to the current token class.
    # Returns 1 to continue the per-char loop, 0 at end of line, undef
    # on error. Heavily in-lined because of how hot this path is.
    my $self = shift;

    ### FIXME - This checks for a screwed up condition that triggers
    ### several warnings, amongst other things.
    if ( ! defined $self->{line_cursor} or ! defined $self->{line_length} ) {
        # $DB::single = 1;
        return undef;
    }

    # Increment the counter and check for end of line
    return 0 if ++$self->{line_cursor} >= $self->{line_length};

    # Pass control to the token class. Its return value is overloaded:
    #   undef  -> error
    #   0      -> "did everything myself, nothing more to do"
    #   1      -> "this char belongs to the current token"
    #   string -> name of the token class the char starts
    my $result;
    unless ( $result = $self->{class}->__TOKENIZER__on_char( $self ) ) {
        # undef is error. 0 is "Did stuff ourself, you don't have to do anything"
        return defined $result ? 1 : undef;
    }

    # We will need the value of the current character
    my $char = substr( $self->{line}, $self->{line_cursor}, 1 );
    if ( $result eq '1' ) {
        # If __TOKENIZER__on_char returns 1, it is signaling that it thinks that
        # the character is part of it.

        # Add the character
        if ( defined $self->{token} ) {
            $self->{token}->{content} .= $char;
        } else {
            # The defined() prevents a ton of calls to PPI::Util::TRUE
            defined($self->{token} = $self->{class}->new($char)) or return undef;
        }

        return 1;
    }

    # We have been provided with the name of a class
    if ( $self->{class} ne "PPI::Token::$result" ) {
        # New class
        $self->_new_token( $result, $char );
    } elsif ( defined $self->{token} ) {
        # Same class as current
        $self->{token}->{content} .= $char;
    } else {
        # Same class, but no current
        defined($self->{token} = $self->{class}->new($char)) or return undef;
    }

    1;
}
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
##################################################################### |
|
622
|
|
|
|
|
|
|
# Altering Tokens in Tokenizer |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# Finish the end of a token. |
|
625
|
|
|
|
|
|
|
# Returns the resulting parse class as a convenience. |
|
626
|
|
|
|
|
|
|
sub _finalize_token {
    # Close out the in-progress token, if any, and return the parse
    # class now in effect (a convenience for callers).
    my ($self) = @_;

    # Nothing in progress: just report the current parse class.
    return $self->{class} if !defined $self->{token};

    # Move the completed token into the output buffer and clear the
    # work-in-progress slot.
    push @{ $self->{tokens} }, $self->{token};
    $self->{token} = undef;

    # Drop back to the parse class of the zone we are inside; the
    # assignment's value doubles as this sub's return value.
    return $self->{class} = $self->{zone};
}
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
# Creates a new token and sets it in the tokenizer.
# Accepts either a fully-qualified token class name, or a suffix that
# will be prepended with 'PPI::Token::'.
# The defined() checks prevent a ton of calls to PPI::Util::TRUE.
sub _new_token {
	my $self = shift;
	my $type = shift;

	# Normalise the class name
	my $class = substr( $type, 0, 12 ) eq 'PPI::Token::'
	          ? $type
	          : 'PPI::Token::' . $type;

	# Close off any token currently being assembled
	$self->_finalize_token if defined $self->{token};

	# Start the replacement token and switch the parse class to match
	$self->{token} = $class->new( $_[0] );
	defined $self->{token} or PPI::Exception->throw;
	$self->{class} = $class;
	1;
}
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# At the end of the file, we need to clean up the results of the erroneous
# space that we inserted at the beginning of the process.
sub _clean_eof {
	my $self = shift;

	# Finish any partially completed token
	$self->_finalize_token if $self->{token};

	# Find the last token, and if it has no content, kill it.
	# There appears to be some evidence that such "null tokens" are
	# somehow getting created accidentally.
	# NOTE(review): assumes the token buffer is non-empty at EOF; if it
	# were empty, $last_token would be undef and the length() would die.
	# Confirm against the callers before relying on this.
	my $last_token = $self->{tokens}->[ -1 ];
	unless ( length $last_token->{content} ) {
		pop @{$self->{tokens}};
	}

	# Now, if the last character of the last token is a space we added,
	# chop it off, deleting the token if there's nothing else left.
	if ( $self->{source_eof_chop} ) {
		# Re-read the last token, because the null-token pop above may
		# have changed which token is last.
		$last_token = $self->{tokens}->[ -1 ];
		$last_token->{content} =~ s/ $//;
		unless ( length $last_token->{content} ) {
			# Popping token
			pop @{$self->{tokens}};
		}

		# The hack involving adding an extra space is now reversed, and
		# now nobody will ever know. The perfect crime!
		$self->{source_eof_chop} = '';
	}

	1;
}
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
##################################################################### |
|
695
|
|
|
|
|
|
|
# Utility Methods |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
# Context |
|
698
|
|
|
|
|
|
|
# Fetch the most recently completed token, if any.
sub _last_token {
	my $self = shift;
	return $self->{tokens}->[-1];
}
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# Walk backwards through the token buffer and return the most recent
# token whose significant() method is true, or nothing if none exists.
sub _last_significant_token {
	my $self = shift;

	foreach my $token ( reverse @{ $self->{tokens} } ) {
		return $token if $token->significant;
	}

	# No significant token found
	return;
}
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# Get the previous significant tokens.
# Like _last_significant_token, except it can collect more than one token.
# Returns a list of 0 to $count tokens, most recent first.
sub _previous_significant_tokens {
	my $self  = shift;
	my $count = shift || 1;

	my @found;
	foreach my $token ( reverse @{ $self->{tokens} } ) {
		next unless $token->significant;
		push @found, $token;
		last if @found >= $count;
	}

	return @found;
}
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# Token classes that, when they immediately precede the current position,
# put the tokenizer into "operator expected" context.  Every entry maps
# to the same value, so build the hash from a plain list.
my %OBVIOUS_CLASS = map { $_ => 'operator' } qw(
	PPI::Token::Symbol
	PPI::Token::Magic
	PPI::Token::Number
	PPI::Token::ArrayIndex
	PPI::Token::Quote::Double
	PPI::Token::Quote::Interpolate
	PPI::Token::Quote::Literal
	PPI::Token::Quote::Single
	PPI::Token::QuoteLike::Backtick
	PPI::Token::QuoteLike::Command
	PPI::Token::QuoteLike::Readline
	PPI::Token::QuoteLike::Regexp
	PPI::Token::QuoteLike::Words
);

# Literal token content that unambiguously decides operator/operand context.
my %OBVIOUS_CONTENT = (
	'(' => 'operand',
	'{' => 'operand',
	'[' => 'operand',
	';' => 'operand',
	'}' => 'operator',
);

# Keywords that usually force the word following them to be a bareword.
my %USUALLY_FORCES = map { $_ => 1 } qw( sub package use no );
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# Try to determine operator/operand context, if possible.
# Returns "operator", "operand", or "" if unknown.
sub _opcontext {
	my $self = shift;

	# Only the nearest significant token matters
	my ($prev) = $self->_previous_significant_tokens(1);
	return '' if not $prev;

	# Some token classes, and some literal content, decide it outright.
	# Note that $prev is used as a hash key via its string overload.
	my $prev_class = ref $prev;
	return $OBVIOUS_CLASS{$prev_class} if defined $OBVIOUS_CLASS{$prev_class};
	return $OBVIOUS_CONTENT{$prev}     if defined $OBVIOUS_CONTENT{$prev};

	# Directly after an operator we almost always expect an operand
	return 'operand' if $prev->isa('PPI::Token::Operator');

	# An empty preceding token also implies operand context
	return 'operand' if $prev->content eq '';

	# Otherwise, we don't know
	return '';
}
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# Assuming we are currently parsing the word 'x', return true
# if previous tokens imply the x is an operator, false otherwise.
sub _current_x_is_operator {
	my ($self) = @_;

	# An 'x' at the very start of the document cannot be an operator
	return unless @{ $self->{tokens} };

	# Only the nearest significant token is needed here
	my ($prev) = $self->_previous_significant_tokens(2);
	return unless $prev;

	# After a word, x is an operator unless the word was itself forced
	# to be a bareword by its own context.
	return !$self->__current_token_is_forced_word
		if $prev->isa('PPI::Token::Word');

	# Otherwise x is an operator unless the preceding token rules it
	# out.  The %X_CAN_FOLLOW_* lookups key on the token's string
	# overload.
	my $operator_ok  = !$prev->isa('PPI::Token::Operator')
		|| $X_CAN_FOLLOW_OPERATOR{$prev};
	my $structure_ok = !$prev->isa('PPI::Token::Structure')
		|| $X_CAN_FOLLOW_STRUCTURE{$prev};
	return $operator_ok
		&& $structure_ok
		&& !$prev->isa('PPI::Token::Label');
}
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
# Assuming we are at the end of parsing the current token that could be a word,
# a wordlike operator, or a version string, try to determine whether context
# before or after it forces it to be a bareword. This method is only useful
# during tokenization.
# Returns 1 (forced), '' (not forced), or nothing (version-string cases).
sub __current_token_is_forced_word {
	my ( $t, $word ) = @_;

	# Check if forced by preceding tokens.

	my ( $prev, $prevprev ) = $t->_previous_significant_tokens(2);
	if ( !$prev ) {
		# No preceding token: just anchor pos() so the \G...\gc
		# look-ahead matches at the bottom start from the cursor.
		pos $t->{line} = $t->{line_cursor};
	}
	else {
		my $content = $prev->{content};

		# We are forced if we are a method name.
		# '->' will always be an operator, so we don't check its type.
		return 1 if $content eq '->';

		# If we are contained in a pair of curly braces, we are probably a
		# forced bareword hash key. '{' is never a word or operator, so we
		# don't check its type.
		# pos() must be set before the \G match below.
		pos $t->{line} = $t->{line_cursor};
		return 1 if $content eq '{' and $t->{line} =~ /\G\s*\}/gc;

		# sub, package, use, and no all indicate that what immediately follows
		# is a word not an operator or (in the case of sub and package) a
		# version string. However, we don't want to be fooled by 'package
		# package v10' or 'use no v10'. We're a forced package unless we're
		# preceded by 'package sub', in which case we're a version string.
		# We also have to make sure that the sub/package/etc doing the forcing
		# is not a method call.
		if( $USUALLY_FORCES{$content}) {
			# A v-string after use/no is a version, not a forced word
			return if defined $word and $word =~ /^v[0-9]+$/ and ( $content eq "use" or $content eq "no" );
			return 1 if not $prevprev;
			return 1 if not $USUALLY_FORCES{$prevprev->content} and $prevprev->content ne '->';
			return;
		}
	}
	# pos on $t->{line} is guaranteed to be set at this point.

	# Check if forced by following tokens.

	# If the word is followed by => it is probably a word, not a regex.
	return 1 if $t->{line} =~ /\G\s*=>/gc;

	# Otherwise we probably aren't forced
	return '';
}
|
849
|
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
1; |
|
851
|
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=pod |
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=head1 NOTES |
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=head2 How the Tokenizer Works |
|
857
|
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
Understanding the Tokenizer is not for the faint-hearted. It is by far |
|
859
|
|
|
|
|
|
|
the most complex and twisty piece of perl I've ever written that is actually |
|
860
|
|
|
|
|
|
|
still built properly and isn't a terrible spaghetti-like mess. In fact, you |
|
861
|
|
|
|
|
|
|
probably want to skip this section. |
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
But if you really want to understand, well then here goes. |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head2 Source Input and Clean Up |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
The Tokenizer starts by taking source in a variety of forms, sucking it |
|
868
|
|
|
|
|
|
|
all in and merging into one big string, and doing our own internal line |
|
869
|
|
|
|
|
|
|
split, using a "universal line separator" which allows the Tokenizer to |
|
870
|
|
|
|
|
|
|
take source for any platform (and even supports a few known types of |
|
871
|
|
|
|
|
|
|
broken newlines caused by mixed mac/pc/*nix editor screw ups). |
|
872
|
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
The resulting array of lines is used to feed the tokenizer, and is also |
|
874
|
|
|
|
|
|
|
accessed directly by the heredoc-logic to do the line-oriented part of |
|
875
|
|
|
|
|
|
|
here-doc support. |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=head2 Doing Things the Old Fashioned Way |
|
878
|
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
Due to the complexity of perl, and after 2 previously aborted parser |
|
880
|
|
|
|
|
|
|
attempts, in the end the tokenizer was fashioned around a line-buffered |
|
881
|
|
|
|
|
|
|
character-by-character method. |
|
882
|
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
That is, the Tokenizer pulls and holds a line at a time into a line buffer, |
|
884
|
|
|
|
|
|
|
and then iterates a cursor along it. At each cursor position, a method is |
|
885
|
|
|
|
|
|
|
called in whatever token class we are currently in, which will examine the |
|
886
|
|
|
|
|
|
|
character at the current position, and handle it. |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
As the handler methods in the various token classes are called, they |
|
889
|
|
|
|
|
|
|
build up an output token array for the source code. |
|
890
|
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
Various parts of the Tokenizer use look-ahead, arbitrary-distance |
|
892
|
|
|
|
|
|
|
look-behind (although currently the maximum is three significant tokens), |
|
893
|
|
|
|
|
|
|
or both, and various other heuristic guesses. |
|
894
|
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
I've been told it is officially termed a I<"backtracking parser |
|
896
|
|
|
|
|
|
|
with infinite lookaheads">. |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
=head2 State Variables |
|
899
|
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
Aside from the current line and the character cursor, the Tokenizer |
|
901
|
|
|
|
|
|
|
maintains a number of different state variables. |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
=over |
|
904
|
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=item Current Class |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
The Tokenizer maintains the current token class at all times. Much of the |
|
908
|
|
|
|
|
|
|
time is just going to be the "Whitespace" class, which is what the base of |
|
909
|
|
|
|
|
|
|
a document is. As the tokenizer executes the various character handlers, |
|
910
|
|
|
|
|
|
|
the class changes a lot as it moves along. In fact, in some instances, |
|
911
|
|
|
|
|
|
|
the character handler may not handle the character directly itself, but |
|
912
|
|
|
|
|
|
|
rather change the "current class" and then hand off to the character |
|
913
|
|
|
|
|
|
|
handler for the new class. |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
Because of this, and some other things I'll deal with later, the number of |
|
916
|
|
|
|
|
|
|
times the character handlers are called does not in fact have a direct |
|
917
|
|
|
|
|
|
|
relationship to the number of actual characters in the document. |
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=item Current Zone |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
Rather than create a class stack to allow for infinitely nested layers of |
|
922
|
|
|
|
|
|
|
classes, the Tokenizer recognises just a single layer. |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
To put it a different way, in various parts of the file, the Tokenizer will |
|
925
|
|
|
|
|
|
|
recognise different "base" or "substrate" classes. When a Token such as a |
|
926
|
|
|
|
|
|
|
comment or a number is finalised by the tokenizer, it "falls back" to the |
|
927
|
|
|
|
|
|
|
base state. |
|
928
|
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
This allows proper tokenization of special areas such as __DATA__ |
|
930
|
|
|
|
|
|
|
and __END__ blocks, which also contain things like comments and POD, |
|
931
|
|
|
|
|
|
|
without allowing the creation of any significant Tokens inside these areas. |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
For the main part of a document we use L<PPI::Token::Whitespace> for this, |
|
934
|
|
|
|
|
|
|
with the idea being that code is "floating in a sea of whitespace". |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=item Current Token |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
The final main state variable is the "current token". This is the Token |
|
939
|
|
|
|
|
|
|
that is currently being built by the Tokenizer. For certain types, it |
|
940
|
|
|
|
|
|
|
can be manipulated and morphed and change class quite a bit while being |
|
941
|
|
|
|
|
|
|
assembled, as the Tokenizer's understanding of the token content changes. |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
When the Tokenizer is confident that it has seen the end of the Token, it |
|
944
|
|
|
|
|
|
|
will be "finalized", which adds it to the output token array and resets |
|
945
|
|
|
|
|
|
|
the current class to that of the zone that we are currently in. |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
I should also note at this point that the "current token" variable is |
|
948
|
|
|
|
|
|
|
optional. The Tokenizer is capable of knowing what class it is currently |
|
949
|
|
|
|
|
|
|
set to, without actually having accumulated any characters in the Token. |
|
950
|
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=back |
|
952
|
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=head2 Making It Faster |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
As I'm sure you can imagine, calling several different methods for each |
|
956
|
|
|
|
|
|
|
character and running regexes and other complex heuristics made the first |
|
957
|
|
|
|
|
|
|
fully working version of the tokenizer extremely slow. |
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
During testing, I created a metric to measure parsing speed called |
|
960
|
|
|
|
|
|
|
LPGC, or "lines per gigacycle". A gigacycle is simply a billion CPU |
|
961
|
|
|
|
|
|
|
cycles on a typical single-core CPU, and so a Tokenizer running at |
|
962
|
|
|
|
|
|
|
"1000 lines per gigacycle" should generate around 1200 lines of tokenized |
|
963
|
|
|
|
|
|
|
code per second when running on a 1200 MHz processor. |
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
The first working version of the tokenizer ran at only 350 LPGC, so to |
|
966
|
|
|
|
|
|
|
tokenize a typical large module such as L<Date::Manip> took |
|
967
|
|
|
|
|
|
|
10-15 seconds. This sluggishness made it impractical for many uses. |
|
968
|
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
So in the current parser, there are multiple layers of optimisation |
|
970
|
|
|
|
|
|
|
very carefully built into the basic engine. This has brought the tokenizer |
|
971
|
|
|
|
|
|
|
up to a more reasonable 1000 LPGC, at the expense of making the code |
|
972
|
|
|
|
|
|
|
quite a bit twistier. |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=head2 Making It Faster - Whole Line Classification |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
The first step in the optimisation process was to add a new handler to |
|
977
|
|
|
|
|
|
|
enable several of the more basic classes (whitespace, comments) to be |
|
978
|
|
|
|
|
|
|
able to be parsed a line at a time. At the start of each line, a |
|
979
|
|
|
|
|
|
|
special optional handler (only supported by a few classes) is called to |
|
980
|
|
|
|
|
|
|
check and see if the entire line can be parsed in one go. |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
This is used mainly to handle things like POD, comments, empty lines, |
|
983
|
|
|
|
|
|
|
and a few other minor special cases. |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=head2 Making It Faster - Inlining |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
The second stage of the optimisation involved inlining a small |
|
988
|
|
|
|
|
|
|
number of critical methods that were repeated an extremely high number |
|
989
|
|
|
|
|
|
|
of times. Profiling suggested that there were about 1,000,000 individual |
|
990
|
|
|
|
|
|
|
method calls per gigacycle, and by cutting these by two thirds a significant |
|
991
|
|
|
|
|
|
|
speed improvement was gained, in the order of about 50%. |
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
You may notice that many methods in the C<PPI::Tokenizer> code look |
|
994
|
|
|
|
|
|
|
very nested and long hand. This is primarily due to this inlining. |
|
995
|
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
At around this time, some statistics code that existed in the early |
|
997
|
|
|
|
|
|
|
versions of the parser was also removed, as it was determined that |
|
998
|
|
|
|
|
|
|
it was consuming around 15% of the CPU for the entire parser, while |
|
999
|
|
|
|
|
|
|
making the core more complicated. |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
A judgment call was made that with the difficulties likely to be |
|
1002
|
|
|
|
|
|
|
encountered with future planned enhancements, and given the relatively |
|
1003
|
|
|
|
|
|
|
high cost involved, the statistics features would be removed from the |
|
1004
|
|
|
|
|
|
|
Tokenizer. |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
=head2 Making It Faster - Quote Engine |
|
1007
|
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
Once inlining had reached diminishing returns, it became obvious from |
|
1009
|
|
|
|
|
|
|
the profiling results that a huge amount of time was being spent |
|
1010
|
|
|
|
|
|
|
stepping a char at a time though long, simple and "syntactically boring" |
|
1011
|
|
|
|
|
|
|
code such as comments and strings. |
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
The existing regex engine was expanded to also encompass quotes and |
|
1014
|
|
|
|
|
|
|
other quote-like things, and a special abstract base class was added |
|
1015
|
|
|
|
|
|
|
that provided a number of specialised parsing methods that would "scan |
|
1016
|
|
|
|
|
|
|
ahead", looking out ahead to find the end of a string, and updating |
|
1017
|
|
|
|
|
|
|
the cursor to leave it in a valid position for the next call. |
|
1018
|
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
This is also the point at which the number of character handler calls began |
|
1020
|
|
|
|
|
|
|
to greatly differ from the number of characters. But it has been done |
|
1021
|
|
|
|
|
|
|
in a way that allows the parser to retain the power of the original |
|
1022
|
|
|
|
|
|
|
version at the critical points, while skipping through the "boring bits" |
|
1023
|
|
|
|
|
|
|
as needed for additional speed. |
|
1024
|
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
The addition of this feature allowed the tokenizer to exceed 1000 LPGC |
|
1026
|
|
|
|
|
|
|
for the first time. |
|
1027
|
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
=head2 Making It Faster - The "Complete" Mechanism |
|
1029
|
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
As it became evident that great speed increases were available by using |
|
1031
|
|
|
|
|
|
|
this "skipping ahead" mechanism, a new handler method was added that |
|
1032
|
|
|
|
|
|
|
explicitly handles the parsing of an entire token, where the structure |
|
1033
|
|
|
|
|
|
|
of the token is relatively simple. Tokens such as symbols fit this case, |
|
1034
|
|
|
|
|
|
|
as once we are past the initial sigil and word char, we know that we |
|
1035
|
|
|
|
|
|
|
can skip ahead and "complete" the rest of the token much more easily. |
|
1036
|
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
A number of these have been added for most or possibly all of the common |
|
1038
|
|
|
|
|
|
|
cases, with most of these "complete" handlers implemented using regular |
|
1039
|
|
|
|
|
|
|
expressions. |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
In fact, so many have been added that at this point, you could arguably |
|
1042
|
|
|
|
|
|
|
reclassify the tokenizer as a "hybrid regex, char-by-char heuristic |
|
1043
|
|
|
|
|
|
|
tokenizer". More tokens are now consumed in "complete" methods in a |
|
1044
|
|
|
|
|
|
|
typical program than are handled by the normal char-by-char methods. |
|
1045
|
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
Many of the these complete-handlers were implemented during the writing |
|
1047
|
|
|
|
|
|
|
of the Lexer, and this has allowed the full parser to maintain around |
|
1048
|
|
|
|
|
|
|
1000 LPGC despite the increasing weight of the Lexer. |
|
1049
|
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
=head2 Making It Faster - Porting To C (In Progress) |
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
While it would be extraordinarily difficult to port all of the Tokenizer |
|
1053
|
|
|
|
|
|
|
to C, work has started on a L<PPI::XS> "accelerator" package which acts as |
|
1054
|
|
|
|
|
|
|
a separate and automatically-detected add-on to the main PPI package. |
|
1055
|
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
L<PPI::XS> implements faster versions of a variety of functions scattered |
|
1057
|
|
|
|
|
|
|
over the entire PPI codebase, from the Tokenizer Core, Quote Engine, and |
|
1058
|
|
|
|
|
|
|
various other places, and implements them identically in XS/C. |
|
1059
|
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
In particular, the skip-ahead methods from the Quote Engine would appear |
|
1061
|
|
|
|
|
|
|
to be extremely amenable to being done in C, and a number of other |
|
1062
|
|
|
|
|
|
|
functions could be cherry-picked one at a time and implemented in C. |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
Each method is heavily tested to ensure that the functionality is |
|
1065
|
|
|
|
|
|
|
identical, and a versioning mechanism is included to ensure that if a |
|
1066
|
|
|
|
|
|
|
function gets out of sync, L<PPI::XS> will degrade gracefully and just |
|
1067
|
|
|
|
|
|
|
not replace that single method. |
|
1068
|
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
=head1 TO DO |
|
1070
|
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
- Add an option to reset or seek the token stream... |
|
1072
|
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
- Implement more Tokenizer functions in L<PPI::XS> |
|
1074
|
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=head1 SUPPORT |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
See the L<support section|PPI/SUPPORT> in the main module. |
|
1078
|
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1080
|
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
Adam Kennedy E<lt>adamk@cpan.orgE<gt> |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
Copyright 2001 - 2011 Adam Kennedy. |
|
1086
|
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
This program is free software; you can redistribute |
|
1088
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
The full text of the license can be found in the |
|
1091
|
|
|
|
|
|
|
LICENSE file included with this module. |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
=cut |