| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package String::Tokenizer; |
|
3
|
|
|
|
|
|
|
|
|
4
|
2
|
|
|
2
|
|
27758
|
use 5.006; |
|
|
2
|
|
|
|
|
5
|
|
|
5
|
2
|
|
|
2
|
|
8
|
use strict; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
37
|
|
|
6
|
2
|
|
|
2
|
|
5
|
use warnings; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
70
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
|
9
|
|
|
|
|
|
|
|
|
10
|
2
|
|
|
2
|
|
7
|
use constant RETAIN_WHITESPACE => 1; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
129
|
|
|
11
|
2
|
|
|
2
|
|
6
|
use constant IGNORE_WHITESPACE => 0; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
950
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
### constructor |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub new {
    my ($_class, @args) = @_;
    # Accept either a class name or an existing instance as invocant.
    my $class = ref($_class) || $_class;
    my $string_tokenizer = bless {
        tokens            => [],
        delimiter         => undef,
        handle_whitespace => IGNORE_WHITESPACE,
    } => $class;
    # Any constructor arguments are forwarded straight to tokenize().
    $string_tokenizer->tokenize(@args) if @args;
    return $string_tokenizer;
}
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
### methods |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub setDelimiter {
    my ($self, $delimiter) = @_;
    # Build an alternation matching any single delimiter character.
    # quotemeta backslash-escapes every non-word character, which is
    # exactly what the old in-place s/(\W)/\\$1/g did, but without the
    # anti-pattern of mutating $_ inside map.
    my $delimiter_reg_exp = join '|' => map { quotemeta } split // => $delimiter;
    # Store a precompiled regex; tokenize() interpolates it later.
    $self->{delimiter} = qr/$delimiter_reg_exp/;
}
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub handleWhitespace {
    # Record the whitespace policy; expects one of the package constants
    # RETAIN_WHITESPACE (1) or IGNORE_WHITESPACE (0).
    my $self = shift;
    $self->{handle_whitespace} = shift;
}
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub tokenize {
    my ($self, $string, $delimiter, $handle_whitespace) = @_;
    # Optional per-call overrides; otherwise the values already stored
    # on the object are used as-is.
    $self->setDelimiter($delimiter)             if defined $delimiter;
    $self->handleWhitespace($handle_whitespace) if defined $handle_whitespace;
    # First pass: split on whitespace no matter what. When whitespace is
    # significant we capture the whitespace runs so they survive as tokens.
    my @tokens;
    if ($self->{handle_whitespace}) {
        @tokens = split /(\s+)/ => $string;
    }
    else {
        @tokens = split /\s+/ => $string;
    }
    # Second pass: if a delimiter regex is set, break any token that
    # contains a delimiter character into its component parts.
    if ($self->{delimiter}) {
        my $delimiter = $self->{delimiter};
        @tokens = map {
            if (/$delimiter/) {
                my ($token, @_tokens);
                # walk the token character by character
                foreach my $char (split //) {
                    if ($char =~ /^$delimiter$/) {
                        # flush the token accumulated so far, if any
                        # (the old `$token =~ /^.*$/` guard was always
                        # true for a defined $token, so defined() alone
                        # is sufficient)
                        push @_tokens => $token if defined $token;
                        # the delimiter character is itself a token
                        push @_tokens => $char;
                        # start accumulating afresh
                        $token = undef;
                    }
                    else {
                        # accumulate non-delimiter characters
                        $token = "" unless defined $token;
                        $token .= $char;
                    }
                }
                # flush any trailing token
                push @_tokens => $token if defined $token;
                # and return the expanded token list
                @_tokens;
            }
            else {
                # no delimiter present; pass the token through unchanged
                $_;
            }
        } @tokens;
    }
    $self->{tokens} = \@tokens;
}
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub getTokens {
    my ($self) = @_;
    # List context gets the tokens themselves; scalar context gets the
    # underlying array reference (not a copy).
    return @{ $self->{tokens} } if wantarray;
    return $self->{tokens};
}
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub iterator {
    # Wrap the current token list in a String::Tokenizer::Iterator.
    # NOTE(review): despite the old comment, this shares the underlying
    # array reference with the tokenizer rather than copying it.
    my $self = shift;
    return String::Tokenizer::Iterator->new($self->{tokens});
}
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
package String::Tokenizer::Iterator; |
|
125
|
|
|
|
|
|
|
|
|
126
|
2
|
|
|
2
|
|
8
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
31
|
|
|
127
|
2
|
|
|
2
|
|
5
|
use warnings; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
941
|
|
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub new {
    # Inner-class guard: only String::Tokenizer itself is allowed to
    # construct iterator instances.
    ((caller())[0] eq "String::Tokenizer")
        || die "Insufficient Access Priviledges : Only String::Tokenizer can create String::Tokenizer::Iterator instances";
    my ($_class, $tokens) = @_;
    # Accept either a class name or an existing instance as invocant.
    my $class = ref($_class) || $_class;
    # The token list is shared with the tokenizer; `index` tracks the
    # position of the next token to dispense.
    my $iterator = bless {
        tokens => $tokens,
        index  => 0,
    } => $class;
    return $iterator;
}
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
sub reset {
    # Rewind the iterator to the first token.
    my $self = shift;
    $self->{index} = 0;
}
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub hasNextToken {
    my ($self) = @_;
    # True (1) while the index has not run past the end of the token list.
    return $self->{index} < @{ $self->{tokens} } ? 1 : 0;
}
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub hasPrevToken {
    my $self = shift;
    # True once at least one token has been dispensed (index advanced
    # past the start of the list).
    return $self->{index} > 0;
}
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub nextToken {
    my ($self) = @_;
    # Exhausted: keep returning undef (callers test definedness).
    return undef if $self->{index} >= @{ $self->{tokens} };
    # Dispense the token at the current position, then advance.
    return $self->{tokens}[ $self->{index}++ ];
}
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub prevToken {
    my ($self) = @_;
    # At the start there is nothing to back up to.
    return undef if $self->{index} <= 0;
    # Step the index back first, then return that token.
    return $self->{tokens}[ --$self->{index} ];
}
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub currentToken {
    my $self = shift;
    # The "current" token is the one most recently dispensed by
    # nextToken(), i.e. the element just before the internal index.
    return $self->{tokens}[ $self->{index} - 1 ];
}
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub lookAheadToken {
    my ($self) = @_;
    # Non-destructive peek at the token nextToken() would dispense next.
    # Returns undef at either end of the list; note that index 0 (before
    # any token has been dispensed) also yields undef, matching the
    # original behavior.
    return undef if $self->{index} <= 0
                 || $self->{index} >= @{ $self->{tokens} };
    return $self->{tokens}[ $self->{index} ];
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub collectTokensUntil {
    my ($self, $token_to_match) = @_;
    # Fast path: the very next token already matches, so there is
    # nothing to collect -- consume it and return an empty list.
    # lookAheadToken() returns undef at either end of the list, so we
    # must check definedness before comparing; `undef eq $x` is false,
    # but it used to emit an uninitialized-value warning.
    my $ahead = $self->lookAheadToken();
    if (defined $ahead && $ahead eq $token_to_match) {
        # advance past the matched token
        $self->nextToken();
        # and return nothing
        return;
    }
    # Otherwise accumulate tokens until we hit the match.
    my @collection;
    # remember where we started so we can roll back on failure
    my $old_index = $self->{index};
    my $matched;
    while ($self->hasNextToken()) {
        my $token = $self->nextToken();
        if ($token ne $token_to_match) {
            push @collection => $token;
        }
        else {
            $matched++;
            last;
        }
    }
    unless ($matched) {
        # never matched: restore the starting position ...
        $self->{index} = $old_index;
        # ... and return nothing
        return;
    }
    # return the accumulated tokens (the match itself is consumed
    # but not included)
    return @collection;
}
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub skipTokensUntil {
    my ($self, $token_to_match) = @_;
    # Fast path: the next token already matches -- consume it and
    # report success. lookAheadToken() returns undef at either end of
    # the list, so guard with defined() to avoid an uninitialized-value
    # warning (`undef eq $x` is false either way).
    my $ahead = $self->lookAheadToken();
    if (defined $ahead && $ahead eq $token_to_match) {
        $self->nextToken();
        return 1;
    }
    # Remember where we started so we can roll back on failure.
    my $old_index = $self->{index};
    while ($self->hasNextToken()) {
        # success: the matched token is consumed as well
        return 1 if ($self->nextToken() eq $token_to_match);
    }
    # No match found: restore the starting position and report failure.
    $self->{index} = $old_index;
    return 0;
}
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub skipTokenIfWhitespace {
    my ($self) = @_;
    # Advance past the next token only when it is pure whitespace.
    # lookAheadToken() can return undef (at either end of the list), so
    # guard with defined() -- the old code matched a regex against undef
    # and emitted an uninitialized-value warning under `use warnings`.
    my $next = $self->lookAheadToken();
    $self->{index}++ if defined $next && $next =~ /^\s+$/;
}
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub skipTokens {
    my ($self, $num_token_to_skip) = @_;
    # Default to skipping a single token (a false argument -- undef or
    # zero -- also falls back to one, as before).
    $num_token_to_skip = 1 unless $num_token_to_skip;
    $self->{index} += $num_token_to_skip;
}
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# skipToken() is an alias for skipTokens(); called with no argument it skips one token.
*skipToken = \&skipTokens;
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
1; |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
__END__ |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head1 NAME |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
String::Tokenizer - A simple string tokenizer. |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
use String::Tokenizer; |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# create the tokenizer and tokenize input |
|
270
|
|
|
|
|
|
|
my $tokenizer = String::Tokenizer->new("((5+5) * 10)", '+*()'); |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# create tokenizer |
|
273
|
|
|
|
|
|
|
my $tokenizer = String::Tokenizer->new(); |
|
274
|
|
|
|
|
|
|
# ... then tokenize the string |
|
275
|
|
|
|
|
|
|
$tokenizer->tokenize("((5 + 5) - 10)", '()'); |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# will print '(, (, 5, +, 5, ), -, 10, )' |
|
278
|
|
|
|
|
|
|
print join ", " => $tokenizer->getTokens(); |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# create tokenizer which retains whitespace |
|
281
|
|
|
|
|
|
|
my $st = String::Tokenizer->new( |
|
282
|
|
|
|
|
|
|
'this is a test with, (significant) whitespace', |
|
283
|
|
|
|
|
|
|
',()', |
|
284
|
|
|
|
|
|
|
String::Tokenizer->RETAIN_WHITESPACE |
|
285
|
|
|
|
|
|
|
); |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# this will print: |
|
288
|
|
|
|
|
|
|
# 'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace' |
|
289
|
|
|
|
|
|
|
print "'" . (join "', '" => $st->getTokens()) . "'";
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# get a token iterator |
|
292
|
|
|
|
|
|
|
my $i = $tokenizer->iterator(); |
|
293
|
|
|
|
|
|
|
while ($i->hasNextToken()) { |
|
294
|
|
|
|
|
|
|
my $next = $i->nextToken(); |
|
295
|
|
|
|
|
|
|
# peek ahead at the next token |
|
296
|
|
|
|
|
|
|
my $look_ahead = $i->lookAheadToken(); |
|
297
|
|
|
|
|
|
|
# ... |
|
298
|
|
|
|
|
|
|
# skip the next 2 tokens |
|
299
|
|
|
|
|
|
|
$i->skipTokens(2); |
|
300
|
|
|
|
|
|
|
# ... |
|
301
|
|
|
|
|
|
|
# then backtrack 1 token |
|
302
|
|
|
|
|
|
|
my $previous = $i->prevToken(); |
|
303
|
|
|
|
|
|
|
# ... |
|
304
|
|
|
|
|
|
|
# get the current token |
|
305
|
|
|
|
|
|
|
my $current = $i->currentToken(); |
|
306
|
|
|
|
|
|
|
# ... |
|
307
|
|
|
|
|
|
|
} |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
A simple string tokenizer which takes a string and splits it on whitespace. It also optionally takes a string of characters to use as delimiters, and returns them with the token set as well. This allows for splitting the string in many different ways. |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
This is a very basic tokenizer, so more complex needs should be either addressed with a custom written tokenizer or post-processing of the output generated by this module. Basically, this will not fill everyone's needs, but it spans a gap between simple C<split / /, $string> and the other options that involve much larger and complex modules. |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Also note that this is not a lexical analyser. Many people confuse tokenization with lexical analysis. A tokenizer merely splits its input into specific chunks, a lexical analyzer classifies those chunks. Sometimes these two steps are combined, but not here. |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 METHODS |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=over 4 |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item B<new ($string, $delimiters, $handle_whitespace)> |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
If you do not supply any parameters, nothing happens, the instance is just created. But if you do supply parameters, they are passed on to the C<tokenize> method and that method is run. For information about those arguments, see C<tokenize> below. |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=item B<setDelimiter ($delimiter)> |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
This can be used to set the delimiter string, this is used by C<tokenize>. |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=item B<handleWhitespace ($value)> |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
This can be used to set the whitespace handling. It accepts one of the two constant values C<RETAIN_WHITESPACE> or C<IGNORE_WHITESPACE>. |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=item B<tokenize ($string, $delimiters, $handle_whitespace)> |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Takes a C<$string> to tokenize, and optionally a set of C<$delimiter> characters to facilitate the tokenization and the type of whitespace handling with C<$handle_whitespace>. The C<$string> parameter and the C<$handle_whitespace> parameter are pretty obvious, the C<$delimiter> parameter is not as transparent. C<$delimiter> is a string of characters, these characters are then separated into individual characters and are used to split the C<$string> with. So given this string: |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
(5 + (100 * (20 - 35)) + 4) |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
The C<tokenize> method without a C<$delimiter> parameter would return the following comma separated list of tokens: |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
'(5', '+', '(100', '*', '(20', '-', '35))', '+', '4)' |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
However, if you were to pass the following set of delimiters C<(, )> to C<tokenize>, you would get the following comma separated list of tokens: |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
'(', '5', '+', '(', '100', '*', '(', '20', '-', '35', ')', ')', '+', '4', ')' |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
We now can differentiate the parens from the numbers, and no globbing occurs. If you wanted to allow for optionally leaving out the whitespace in the expression, like this: |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
(5+(100*(20-35))+4) |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
as some languages do. Then you would give this delimiter C<+*-()> to arrive at the same result. |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
If you decide that whitespace is significant in your string, then you need to specify that like this: |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
my $st = String::Tokenizer->new( |
|
356
|
|
|
|
|
|
|
'this is a test with, (significant) whitespace', |
|
357
|
|
|
|
|
|
|
',()', |
|
358
|
|
|
|
|
|
|
String::Tokenizer->RETAIN_WHITESPACE |
|
359
|
|
|
|
|
|
|
); |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
A call to C<getTokens> on this instance would result in the following token set. |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace' |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
All running whitespace is grouped together into a single token, we make no attempt to split it into its individual parts. |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=item B<getTokens> |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Simply returns the array of tokens. It returns an array-ref in scalar context. |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item B<iterator> |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Returns a B<String::Tokenizer::Iterator> instance, see below for more details. |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=back |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=head1 INNER CLASS |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
A B<String::Tokenizer::Iterator> instance is returned from the B<String::Tokenizer>'s C<iterator> method and serves as yet another means of iterating through an array of tokens. The simplest way would be to call C<getTokens> and just manipulate the array yourself, or push the array into another object. However, iterating through a set of tokens tends to get messy when done manually. So here I have provided the B<String::Tokenizer::Iterator> to address those common token processing idioms. It is basically a bi-directional iterator which can look ahead, skip and be reset to the beginning. |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
B<NOTE:> |
|
382
|
|
|
|
|
|
|
B<String::Tokenizer::Iterator> is an inner class, which means that only B<String::Tokenizer> objects can create an instance of it. That said, if B<String::Tokenizer::Iterator>'s C<new> method is called from outside of the B<String::Tokenizer> package, an exception is thrown. |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=over 4 |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=item B<new ($tokens_array_ref)> |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
This accepts an array reference of tokens and sets up the iterator. This method can only be called from within the B<String::Tokenizer> package, otherwise an exception will be thrown. |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=item B<reset> |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
This will reset the internal counter, |
|
393
|
|
|
|
|
|
|
bringing it back to the beginning of the token list. |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=item B<hasNextToken> |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
This will return true (1) if there are more tokens to be iterated over, |
|
398
|
|
|
|
|
|
|
and false (0) otherwise. |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item B<hasPrevToken> |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
This will return true (1) if the internal counter has advanced past the beginning of the token list (i.e. there are previous tokens to move back to), and false (0) when it is still at the beginning.
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item B<nextToken> |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
This dispenses the next available token, and move the internal counter ahead by one. |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=item B<prevToken> |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
This dispenses the previous token, and moves the internal counter back by one. |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=item B<currentToken> |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
This returns the current token, which will match the last token retrieved by C<nextToken>. |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=item B<lookAheadToken> |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This peeks ahead one token to the next one in the list. This item will match the next item dispensed with C<nextToken>. This is a non-destructive look ahead, meaning it does not alter the position of the internal counter. |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=item B<skipToken> |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
This will jump the internal counter ahead by 1. |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item B<skipTokens ($number_to_skip)> |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
This will jump the internal counter ahead by C<$number_to_skip>. |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item B<skipTokenIfWhitespace> |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
This will skip the next token if it is whitespace. |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item B<skipTokensUntil ($token_to_match)> |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Given a string as a C<$token_to_match>, this will skip all tokens until it matches that string. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state. |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item B<collectTokensUntil ($token_to_match)> |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
Given a string as a C<$token_to_match>, this will collect all tokens until it matches that string, at which point the collected tokens will be returned. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state and no tokens will be returned. |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=back |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head1 TO DO |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=over 4 |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=item I<Inline token expansion> |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
The Java StringTokenizer class allows for a token to be tokenized further, therefore breaking it up more and including the results into the current token stream. I have never used this feature in this class, but I can see where it might be a useful one. This may be in the next release if it works out. |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Possibly compliment this expansion with compression as well, so for instance double quoted strings could be compressed into a single token. |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=item I<Token Bookmarks> |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Allow for the creation of "token bookmarks". Meaning we could tag a specific token with a label, that index could be returned to from any point in the token stream. We could mix this with a memory stack as well, so that we would have an ordering to the bookmarks as well. |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=back |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=head1 BUGS |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
None that I am aware of. Of course, if you find a bug, let me know, and I will be sure to fix it. |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
=head1 CODE COVERAGE |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
I use B<Devel::Cover> to test the code coverage of my tests, below is the B<Devel::Cover> report on this module's test suite. |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
|
467
|
|
|
|
|
|
|
File stmt branch cond sub pod time total |
|
468
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
|
469
|
|
|
|
|
|
|
String/Tokenizer.pm 100.0 100.0 64.3 100.0 100.0 100.0 97.6 |
|
470
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
|
471
|
|
|
|
|
|
|
Total 100.0 100.0 64.3 100.0 100.0 100.0 97.6 |
|
472
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
The interface and workings of this module are based largely on the StringTokenizer class from the Java standard library. |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
Below is a short list of other modules that might be considered similar to this one. If this module does not suit your needs, you might look at one of these. |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=over 4 |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=item L<String::Tokeniser> |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
Along with being a tokenizer, |
|
485
|
|
|
|
|
|
|
it also provides a means of moving through the resulting tokens, |
|
486
|
|
|
|
|
|
|
allowing for skipping of tokens and such. |
|
487
|
|
|
|
|
|
|
It was last updated in 2011. |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=item L<Parse::Tokens> |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
This one hasn't been touched since 2001, |
|
492
|
|
|
|
|
|
|
although it did get up to version 0.27. |
|
493
|
|
|
|
|
|
|
It looks to lean over more towards the parser side than a basic tokenizer. |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=item L<Text::Tokenizer> |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
This is both a lexical analyzer and a tokenizer. |
|
498
|
|
|
|
|
|
|
It also uses XS, where String::Tokenizer is pure perl. |
|
499
|
|
|
|
|
|
|
This is something maybe to look into if you were to need a more beefy solution |
|
500
|
|
|
|
|
|
|
than String::Tokenizer provides. |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=back |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head1 THANKS |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=over |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=item Thanks to Stephan Tobias for finding bugs and suggestions on whitespace handling. |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=back |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head1 AUTHOR |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
stevan little, E<lt>stevan@cpan.orgE<gt> |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Copyright 2004-2016 by Infinity Interactive, Inc. |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
L<http://www.iinteractive.com> |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
523
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=cut |