File Coverage

blib/lib/String/Tokenizer.pm
Criterion Covered Total %
statement 119 119 100.0
branch 44 44 100.0
condition 9 14 64.2
subroutine 25 25 100.0
pod 6 6 100.0
total 203 208 97.6


line stmt bran cond sub pod time code
1              
2             package String::Tokenizer;
3              
4 2     2   27758 use 5.006;
  2         5  
5 2     2   8 use strict;
  2         2  
  2         37  
6 2     2   5 use warnings;
  2         5  
  2         70  
7              
8             our $VERSION = '0.06';
9              
10 2     2   7 use constant RETAIN_WHITESPACE => 1;
  2         2  
  2         129  
11 2     2   6 use constant IGNORE_WHITESPACE => 0;
  2         2  
  2         950  
12              
13             ### constructor
14              
15             sub new {
16 10     10 1 3834 my ($_class, @args) = @_;
17 10   33     41 my $class = ref($_class) || $_class;
18 10         26 my $string_tokenizer = {
19             tokens => [],
20             delimiter => undef,
21             handle_whitespace => IGNORE_WHITESPACE
22             };
23 10         10 bless($string_tokenizer, $class);
24 10 100       28 $string_tokenizer->tokenize(@args) if @args;
25 10         33 return $string_tokenizer;
26             }
27              
28             ### methods
29              
30             sub setDelimiter {
31 8     8 1 7 my ($self, $delimiter) = @_;
32 8         24 my $delimiter_reg_exp = join "\|" => map { s/(\W)/\\$1/g; $_ } split // => $delimiter;
  26         112  
  26         44  
33 8         111 $self->{delimiter} = qr/$delimiter_reg_exp/;
34             }
35              
36             sub handleWhitespace {
37 3     3 1 4 my ($self, $value) = @_;
38 3         4 $self->{handle_whitespace} = $value;
39             }
40              
41             sub tokenize {
42 10     10 1 392 my ($self, $string, $delimiter, $handle_whitespace) = @_;
43             # if we have a delimiter passed in then use it
44 10 100       27 $self->setDelimiter($delimiter) if defined $delimiter;
45             # if we are asking about whitespace then handle it
46 10 100       22 $self->handleWhitespace($handle_whitespace) if defined $handle_whitespace;
47             # if the two above are not handled, then the object will use
48             # the values set already.
49             # split everything by whitespace no matter what
50             # (possible multiple occurances of white space too)
51 10         8 my @tokens;
52 10 100       20 if ($self->{handle_whitespace}) {
53 2         21 @tokens = split /(\s+)/ => $string;
54             }
55             else {
56 8         66 @tokens = split /\s+/ => $string;
57             }
58 10 100       21 if ($self->{delimiter}) {
59             # create the delimiter reg-ex
60             # escape all non-alpha-numeric
61             # characters, just to be safe
62 9         11 my $delimiter = $self->{delimiter};
63             # loop through the tokens
64             @tokens = map {
65             # if the token contains a delimiter then ...
66 9 100       11 if (/$delimiter/) {
  148         339  
67 60         43 my ($token, @_tokens);
68             # split the token up into characters
69             # and the loop through all the characters
70 60         83 foreach my $char (split //) {
71             # if the character is a delimiter
72 196 100       422 if ($char =~ /^$delimiter$/) {
73             # and we already have a token in the works
74 75 100 66     190 if (defined($token) && $token =~ /^.*$/) {
75             # add the token to the
76             # temp tokens list
77 31         33 push @_tokens => $token;
78             }
79             # and then push our delimiter character
80             # onto the temp tokens list
81 75         57 push @_tokens => $char;
82             # now we need to undefine our token
83 75         82 $token = undef;
84             }
85             # if the character is not a delimiter then
86             else {
87             # check to make sure the token is defined
88 121 100       143 $token = "" unless defined $token;
89             # and then add the chracter to it
90 121         130 $token .= $char;
91             }
92             }
93             # now push any remaining token onto
94             # the temp tokens list
95 60 100       81 push @_tokens => $token if defined $token;
96             # and return tokens
97 60         102 @_tokens;
98             }
99             # if our token does not have
100             # the delimiter in it
101             else {
102             # just return it
103 88         107 $_
104             }
105             } @tokens;
106             }
107 10         31 $self->{tokens} = \@tokens;
108             }
109              
110             sub getTokens {
111 6     6 1 680 my ($self) = @_;
112             return wantarray ?
113 1         6 @{$self->{tokens}}
114             :
115 6 100       21 $self->{tokens};
116             }
117              
118             sub iterator {
119 4     4 1 1589 my ($self) = @_;
120             # returns a copy of the array
121 4         10 return String::Tokenizer::Iterator->new($self->{tokens});
122             }
123              
124             package String::Tokenizer::Iterator;
125              
126 2     2   8 use strict;
  2         3  
  2         31  
127 2     2   5 use warnings;
  2         2  
  2         941  
128              
129             sub new {
130 5 100   5   34 ((caller())[0] eq "String::Tokenizer")
131             || die "Insufficient Access Priviledges : Only String::Tokenizer can create String::Tokenizer::Iterator instances";
132 4         5 my ($_class, $tokens) = @_;
133 4   33     15 my $class = ref($_class) || $_class;
134 4         16 my $iterator = {
135             tokens => $tokens,
136             index => 0
137             };
138 4         6 bless($iterator, $class);
139 4         8 return $iterator;
140             }
141              
142             sub reset {
143 1     1   2 my ($self) = @_;
144 1         2 $self->{index} = 0;
145             }
146              
147             sub hasNextToken {
148 108     108   3750 my ($self) = @_;
149 108 100       69 return ($self->{index} < scalar @{$self->{tokens}}) ? 1 : 0;
  108         189  
150             }
151              
152             sub hasPrevToken {
153 26     26   20 my ($self) = @_;
154 26         28 return ($self->{index} > 0);
155             }
156              
157             sub nextToken {
158 118     118   978 my ($self) = @_;
159 118 100       78 return undef if ($self->{index} >= scalar @{$self->{tokens}});
  118         187  
160 117         209 return $self->{tokens}->[$self->{index}++];
161             }
162              
163             sub prevToken {
164 26     26   52 my ($self) = @_;
165 26 100       32 return undef if ($self->{index} <= 0);
166 25         33 return $self->{tokens}->[--$self->{index}];
167             }
168              
169             sub currentToken {
170 25     25   49 my ($self) = @_;
171 25         37 return $self->{tokens}->[$self->{index} - 1];
172             }
173              
174             sub lookAheadToken {
175 41     41   16823 my ($self) = @_;
176             return undef if ( $self->{index} <= 0
177 41 100 100     108 || $self->{index} >= scalar @{$self->{tokens}});
  40         131  
178 38         111 return $self->{tokens}->[$self->{index}];
179             }
180              
181             sub collectTokensUntil {
182 5     5   8 my ($self, $token_to_match) = @_;
183             # if this matches our current token ...
184             # then we just return nothing as there
185             # is nothing to accumulate
186 5 100       8 if ($self->lookAheadToken() eq $token_to_match) {
187             # then just advance it one
188 1         2 $self->nextToken();
189             # and return nothing
190 1         3 return;
191             }
192              
193             # if it doesnt match our current token then, ...
194 4         5 my @collection;
195             # store the index we start at
196 4         4 my $old_index = $self->{index};
197 4         4 my $matched;
198             # loop through the tokens
199 4         6 while ($self->hasNextToken()) {
200 23         22 my $token = $self->nextToken();
201 23 100       25 if ($token ne $token_to_match) {
202 20         28 push @collection => $token;
203             }
204             else {
205 3         3 $matched++;
206 3         4 last;
207             }
208             }
209 4 100       7 unless ($matched) {
210             # reset back to where we started, and ...
211 1         2 $self->{index} = $old_index;
212             # and return nothing
213 1         4 return;
214             }
215             # and return our collection
216 3         20 return @collection;
217             }
218              
219              
220             sub skipTokensUntil {
221 3     3   5 my ($self, $token_to_match) = @_;
222             # if this matches our current token ...
223 3 100       6 if ($self->lookAheadToken() eq $token_to_match) {
224             # then just advance it one
225 1         3 $self->nextToken();
226             # and return success
227 1         4 return 1;
228             }
229             # if it doesnt match our current token then, ...
230             # store the index we start at
231 2         4 my $old_index = $self->{index};
232             # and loop through the tokens
233 2         4 while ($self->hasNextToken()) {
234             # return success if we match our token
235 18 100       17 return 1 if ($self->nextToken() eq $token_to_match);
236             }
237             # otherwise we didnt match, and should
238             # reset back to where we started, and ...
239 1         2 $self->{index} = $old_index;
240             # return failure
241 1         3 return 0;
242             }
243              
244             sub skipTokenIfWhitespace {
245 4     4   6 my ($self) = @_;
246 4 100       6 $self->{index}++ if $self->lookAheadToken() =~ /^\s+$/;
247             }
248              
249             sub skipTokens {
250 13     13   21 my ($self, $num_token_to_skip) = @_;
251 13   100     29 $num_token_to_skip ||= 1;
252 13         16 $self->{index} += $num_token_to_skip;
253             }
254              
255             *skipToken = \&skipTokens;
256              
257             1;
258              
259             __END__
260              
261             =head1 NAME
262              
263             String::Tokenizer - A simple string tokenizer.
264              
265             =head1 SYNOPSIS
266              
267             use String::Tokenizer;
268              
269             # create the tokenizer and tokenize input
270             my $tokenizer = String::Tokenizer->new("((5+5) * 10)", '+*()');
271              
272             # create tokenizer
273             my $tokenizer = String::Tokenizer->new();
274             # ... then tokenize the string
275             $tokenizer->tokenize("((5 + 5) - 10)", '()');
276              
277             # will print '(, (, 5, +, 5, ), -, 10, )'
278             print join ", " => $tokenizer->getTokens();
279              
280             # create tokenizer which retains whitespace
281             my $st = String::Tokenizer->new(
282             'this is a test with, (significant) whitespace',
283             ',()',
284             String::Tokenizer->RETAIN_WHITESPACE
285             );
286              
287             # this will print:
288             # 'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace'
289             print "'" . (join "', '" => $tokenizer->getTokens()) . "'";
290              
291             # get a token iterator
292             my $i = $tokenizer->iterator();
293             while ($i->hasNextToken()) {
294             my $next = $i->nextToken();
295             # peek ahead at the next token
296             my $look_ahead = $i->lookAheadToken();
297             # ...
298             # skip the next 2 tokens
299             $i->skipTokens(2);
300             # ...
301             # then backtrack 1 token
302             my $previous = $i->prevToken();
303             # ...
304             # get the current token
305             my $current = $i->currentToken();
306             # ...
307             }
308              
309             =head1 DESCRIPTION
310              
311             A simple string tokenizer which takes a string and splits it on whitespace. It also optionally takes a string of characters to use as delimiters, and returns them with the token set as well. This allows for splitting the string in many different ways.
312              
313             This is a very basic tokenizer, so more complex needs should be either addressed with a custom written tokenizer or post-processing of the output generated by this module. Basically, this will not fill everyone's needs, but it spans a gap between simple C<split / /, $string> and the other options that involve much larger and complex modules.
314              
315             Also note that this is not a lexical analyser. Many people confuse tokenization with lexical analysis. A tokenizer merely splits its input into specific chunks, a lexical analyzer classifies those chunks. Sometimes these two steps are combined, but not here.
316              
317             =head1 METHODS
318              
319             =over 4
320              
321             =item B<new ($string, $delimiters, $handle_whitespace)>
322              
323             If you do not supply any parameters, nothing happens, the instance is just created. But if you do supply parameters, they are passed on to the C<tokenize> method and that method is run. For information about those arguments, see C<tokenize> below.
324              
325             =item B<setDelimiter ($delimiter)>
326              
327             This can be used to set the delimiter string, this is used by C<tokenize>.
328              
329             =item B<handleWhitespace ($value)>
330              
331             This can be used to set the whitespace handling. It accepts one of the two constant values C<RETAIN_WHITESPACE> or C<IGNORE_WHITESPACE>.
332              
333             =item B<tokenize ($string, $delimiters, $handle_whitespace)>
334              
335             Takes a C<$string> to tokenize, and optionally a set of C<$delimiter> characters to facilitate the tokenization and the type of whitespace handling with C<$handle_whitespace>. The C<$string> parameter and the C<$handle_whitespace> parameter are pretty obvious, the C<$delimiter> parameter is not as transparent. C<$delimiter> is a string of characters, these characters are then separated into individual characters and are used to split the C<$string> with. So given this string:
336              
337             (5 + (100 * (20 - 35)) + 4)
338              
339             The C<tokenize> method without a C<$delimiter> parameter would return the following comma separated list of tokens:
340              
341             '(5', '+', '(100', '*', '(20', '-', '35))', '+', '4)'
342              
343             However, if you were to pass the following set of delimiters C<(, )> to C<tokenize>, you would get the following comma separated list of tokens:
344              
345             '(', '5', '+', '(', '100', '*', '(', '20', '-', '35', ')', ')', '+', '4', ')'
346              
347             We now can differentiate the parens from the numbers, and no globbing occurs. If you wanted to allow for optionally leaving out the whitespace in the expression, like this:
348              
349             (5+(100*(20-35))+4)
350              
351             as some languages do. Then you would give this delimiter C<+*-()> to arrive at the same result.
352              
353             If you decide that whitespace is significant in your string, then you need to specify that like this:
354              
355             my $st = String::Tokenizer->new(
356             'this is a test with, (significant) whitespace',
357             ',()',
358             String::Tokenizer->RETAIN_WHITESPACE
359             );
360              
361             A call to C<getTokens> on this instance would result in the following token set.
362              
363             'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace'
364              
365             All running whitespace is grouped together into a single token, we make no attempt to split it into its individual parts.
366              
367             =item B<getTokens>
368              
369             Simply returns the array of tokens. It returns an array-ref in scalar context.
370              
371             =item B<iterator>
372              
373             Returns a B<String::Tokenizer::Iterator> instance, see below for more details.
374              
375             =back
376              
377             =head1 INNER CLASS
378              
379             A B<String::Tokenizer::Iterator> instance is returned from the B<String::Tokenizer>'s C<iterator> method and serves as yet another means of iterating through an array of tokens. The simplest way would be to call C<getTokens> and just manipulate the array yourself, or push the array into another object. However, iterating through a set of tokens tends to get messy when done manually. So here I have provided the B<String::Tokenizer::Iterator> to address those common token processing idioms. It is basically a bi-directional iterator which can look ahead, skip and be reset to the beginning.
380              
381             B<NOTE:>
382             B<String::Tokenizer::Iterator> is an inner class, which means that only B<String::Tokenizer> objects can create an instance of it. That said, if B<String::Tokenizer::Iterator>'s C<new> method is called from outside of the B<String::Tokenizer> package, an exception is thrown.
383              
384             =over 4
385              
386             =item B<new ($tokens_array_ref)>
387              
388             This accepts an array reference of tokens and sets up the iterator. This method can only be called from within the B<String::Tokenizer> package, otherwise an exception will be thrown.
389              
390             =item B<reset>
391              
392             This will reset the internal counter,
393             bringing it back to the beginning of the token list.
394              
395             =item B<hasNextToken>
396              
397             This will return true (1) if there are more tokens to be iterated over,
398             and false (0) otherwise.
399              
400             =item B<hasPrevToken>
401              
402             This will return true (1) if there are tokens before the current position (that is, the iterator has advanced past the beginning of the token list), and false (0) otherwise.
403              
404             =item B<nextToken>
405              
406             This dispenses the next available token, and moves the internal counter ahead by one.
407              
408             =item B<prevToken>
409              
410             This dispenses the previous token, and moves the internal counter back by one.
411              
412             =item B<currentToken>
413              
414             This returns the current token, which will match the last token retrieved by C<nextToken>.
415              
416             =item B<lookAheadToken>
417              
418             This peeks ahead one token to the next one in the list. This item will match the next item dispensed with C<nextToken>. This is a non-destructive look ahead, meaning it does not alter the position of the internal counter.
419              
420             =item B<skipToken>
421              
422             This will jump the internal counter ahead by 1.
423              
424             =item B<skipTokens ($number_to_skip)>
425              
426             This will jump the internal counter ahead by C<$number_to_skip>.
427              
428             =item B<skipTokenIfWhitespace>
429              
430             This will skip the next token if it is whitespace.
431              
432             =item B<skipTokensUntil ($token_to_match)>
433              
434             Given a string as a C<$token_to_match>, this will skip all tokens until it matches that string. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state.
435              
436             =item B<collectTokensUntil ($token_to_match)>
437              
438             Given a string as a C<$token_to_match>, this will collect all tokens until it matches that string, at which point the collected tokens will be returned. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state and no tokens will be returned.
439              
440             =back
441              
442             =head1 TO DO
443              
444             =over 4
445              
446             =item I<Inline token expansion>
447              
448             The Java StringTokenizer class allows for a token to be tokenized further, therefore breaking it up more and including the results into the current token stream. I have never used this feature in this class, but I can see where it might be a useful one. This may be in the next release if it works out.
449              
450             Possibly complement this expansion with compression as well, so for instance double quoted strings could be compressed into a single token.
451              
452             =item I<Token Bookmarks>
453              
454             Allow for the creation of "token bookmarks". Meaning we could tag a specific token with a label, that index could be returned to from any point in the token stream. We could mix this with a memory stack as well, so that we would have an ordering to the bookmarks as well.
455              
456             =back
457              
458             =head1 BUGS
459              
460             None that I am aware of. Of course, if you find a bug, let me know, and I will be sure to fix it.
461              
462             =head1 CODE COVERAGE
463              
464             I use B<Devel::Cover> to test the code coverage of my tests, below is the B<Devel::Cover> report on this module's test suite.
465              
466             ------------------------ ------ ------ ------ ------ ------ ------ ------
467             File stmt branch cond sub pod time total
468             ------------------------ ------ ------ ------ ------ ------ ------ ------
469             String/Tokenizer.pm 100.0 100.0 64.3 100.0 100.0 100.0 97.6
470             ------------------------ ------ ------ ------ ------ ------ ------ ------
471             Total 100.0 100.0 64.3 100.0 100.0 100.0 97.6
472             ------------------------ ------ ------ ------ ------ ------ ------ ------
473              
474             =head1 SEE ALSO
475              
476             The interface and workings of this module are based largely on the StringTokenizer class from the Java standard library.
477              
478             Below is a short list of other modules that might be considered similar to this one. If this module does not suit your needs, you might look at one of these.
479              
480             =over 4
481              
482             =item L<String::Tokeniser>
483              
484             Along with being a tokenizer,
485             it also provides a means of moving through the resulting tokens,
486             allowing for skipping of tokens and such.
487             It was last updated in 2011.
488              
489             =item L<Parse::Tokens>
490              
491             This one hasn't been touched since 2001,
492             although it did get up to version 0.27.
493             It looks to lean over more towards the parser side than a basic tokenizer.
494              
495             =item L<Text::Tokenizer>
496              
497             This is both a lexical analyzer and a tokenizer.
498             It also uses XS, where String::Tokenizer is pure perl.
499             This is something maybe to look into if you were to need a more beefy solution
500             than String::Tokenizer provides.
501              
502             =back
503              
504             =head1 THANKS
505              
506             =over
507              
508             =item Thanks to Stephan Tobias for finding bugs and suggestions on whitespace handling.
509              
510             =back
511              
512             =head1 AUTHOR
513              
514             stevan little, E<lt>stevan@cpan.orgE<gt>
515              
516             =head1 COPYRIGHT AND LICENSE
517              
518             Copyright 2004-2016 by Infinity Interactive, Inc.
519              
520             L<http://www.iinteractive.com>
521              
522             This library is free software; you can redistribute it and/or modify
523             it under the same terms as Perl itself.
524              
525             =cut