File Coverage

blib/lib/String/Tokenizer.pm
Criterion Covered Total %
statement 119 119 100.0
branch 44 44 100.0
condition 9 14 64.2
subroutine 25 25 100.0
pod 6 6 100.0
total 203 208 97.6


line stmt bran cond sub pod time code
1              
2             package String::Tokenizer;
3              
4 2     2   27758 use 5.006;
  2         5  
5 2     2   8 use strict;
  2         2  
  2         37  
6 2     2   5 use warnings;
  2         5  
  2         70  
7              
8             our $VERSION = '0.06';
9              
10 2     2   7 use constant RETAIN_WHITESPACE => 1;
  2         2  
  2         129  
11 2     2   6 use constant IGNORE_WHITESPACE => 0;
  2         2  
  2         950  
12              
13             ### constructor
14              
15             sub new {
16 10     10 1 3834 my ($_class, @args) = @_;
17 10   33     41 my $class = ref($_class) || $_class;
18 10         26 my $string_tokenizer = {
19             tokens => [],
20             delimiter => undef,
21             handle_whitespace => IGNORE_WHITESPACE
22             };
23 10         10 bless($string_tokenizer, $class);
24 10 100       28 $string_tokenizer->tokenize(@args) if @args;
25 10         33 return $string_tokenizer;
26             }
27              
28             ### methods
29              
30             sub setDelimiter {
31 8     8 1 7 my ($self, $delimiter) = @_;
32 8         24 my $delimiter_reg_exp = join "\|" => map { s/(\W)/\\$1/g; $_ } split // => $delimiter;
  26         112  
  26         44  
33 8         111 $self->{delimiter} = qr/$delimiter_reg_exp/;
34             }
35              
36             sub handleWhitespace {
37 3     3 1 4 my ($self, $value) = @_;
38 3         4 $self->{handle_whitespace} = $value;
39             }
40              
41             sub tokenize {
42 10     10 1 392 my ($self, $string, $delimiter, $handle_whitespace) = @_;
43             # if we have a delimiter passed in then use it
44 10 100       27 $self->setDelimiter($delimiter) if defined $delimiter;
45             # if we are asking about whitespace then handle it
46 10 100       22 $self->handleWhitespace($handle_whitespace) if defined $handle_whitespace;
47             # if the two above are not handled, then the object will use
48             # the values set already.
49             # split everything by whitespace no matter what
50             # (possible multiple occurances of white space too)
51 10         8 my @tokens;
52 10 100       20 if ($self->{handle_whitespace}) {
53 2         21 @tokens = split /(\s+)/ => $string;
54             }
55             else {
56 8         66 @tokens = split /\s+/ => $string;
57             }
58 10 100       21 if ($self->{delimiter}) {
59             # create the delimiter reg-ex
60             # escape all non-alpha-numeric
61             # characters, just to be safe
62 9         11 my $delimiter = $self->{delimiter};
63             # loop through the tokens
64             @tokens = map {
65             # if the token contains a delimiter then ...
66 9 100       11 if (/$delimiter/) {
  148         339  
67 60         43 my ($token, @_tokens);
68             # split the token up into characters
69             # and the loop through all the characters
70 60         83 foreach my $char (split //) {
71             # if the character is a delimiter
72 196 100       422 if ($char =~ /^$delimiter$/) {
73             # and we already have a token in the works
74 75 100 66     190 if (defined($token) && $token =~ /^.*$/) {
75             # add the token to the
76             # temp tokens list
77 31         33 push @_tokens => $token;
78             }
79             # and then push our delimiter character
80             # onto the temp tokens list
81 75         57 push @_tokens => $char;
82             # now we need to undefine our token
83 75         82 $token = undef;
84             }
85             # if the character is not a delimiter then
86             else {
87             # check to make sure the token is defined
88 121 100       143 $token = "" unless defined $token;
89             # and then add the chracter to it
90 121         130 $token .= $char;
91             }
92             }
93             # now push any remaining token onto
94             # the temp tokens list
95 60 100       81 push @_tokens => $token if defined $token;
96             # and return tokens
97 60         102 @_tokens;
98             }
99             # if our token does not have
100             # the delimiter in it
101             else {
102             # just return it
103 88         107 $_
104             }
105             } @tokens;
106             }
107 10         31 $self->{tokens} = \@tokens;
108             }
109              
110             sub getTokens {
111 6     6 1 680 my ($self) = @_;
112             return wantarray ?
113 1         6 @{$self->{tokens}}
114             :
115 6 100       21 $self->{tokens};
116             }
117              
118             sub iterator {
119 4     4 1 1589 my ($self) = @_;
120             # returns a copy of the array
121 4         10 return String::Tokenizer::Iterator->new($self->{tokens});
122             }
123              
124             package String::Tokenizer::Iterator;
125              
126 2     2   8 use strict;
  2         3  
  2         31  
127 2     2   5 use warnings;
  2         2  
  2         941  
128              
129             sub new {
130 5 100   5   34 ((caller())[0] eq "String::Tokenizer")
131             || die "Insufficient Access Priviledges : Only String::Tokenizer can create String::Tokenizer::Iterator instances";
132 4         5 my ($_class, $tokens) = @_;
133 4   33     15 my $class = ref($_class) || $_class;
134 4         16 my $iterator = {
135             tokens => $tokens,
136             index => 0
137             };
138 4         6 bless($iterator, $class);
139 4         8 return $iterator;
140             }
141              
142             sub reset {
143 1     1   2 my ($self) = @_;
144 1         2 $self->{index} = 0;
145             }
146              
147             sub hasNextToken {
148 108     108   3750 my ($self) = @_;
149 108 100       69 return ($self->{index} < scalar @{$self->{tokens}}) ? 1 : 0;
  108         189  
150             }
151              
152             sub hasPrevToken {
153 26     26   20 my ($self) = @_;
154 26         28 return ($self->{index} > 0);
155             }
156              
157             sub nextToken {
158 118     118   978 my ($self) = @_;
159 118 100       78 return undef if ($self->{index} >= scalar @{$self->{tokens}});
  118         187  
160 117         209 return $self->{tokens}->[$self->{index}++];
161             }
162              
163             sub prevToken {
164 26     26   52 my ($self) = @_;
165 26 100       32 return undef if ($self->{index} <= 0);
166 25         33 return $self->{tokens}->[--$self->{index}];
167             }
168              
169             sub currentToken {
170 25     25   49 my ($self) = @_;
171 25         37 return $self->{tokens}->[$self->{index} - 1];
172             }
173              
174             sub lookAheadToken {
175 41     41   16823 my ($self) = @_;
176             return undef if ( $self->{index} <= 0
177 41 100 100     108 || $self->{index} >= scalar @{$self->{tokens}});
  40         131  
178 38         111 return $self->{tokens}->[$self->{index}];
179             }
180              
181             sub collectTokensUntil {
182 5     5   8 my ($self, $token_to_match) = @_;
183             # if this matches our current token ...
184             # then we just return nothing as there
185             # is nothing to accumulate
186 5 100       8 if ($self->lookAheadToken() eq $token_to_match) {
187             # then just advance it one
188 1         2 $self->nextToken();
189             # and return nothing
190 1         3 return;
191             }
192              
193             # if it doesnt match our current token then, ...
194 4         5 my @collection;
195             # store the index we start at
196 4         4 my $old_index = $self->{index};
197 4         4 my $matched;
198             # loop through the tokens
199 4         6 while ($self->hasNextToken()) {
200 23         22 my $token = $self->nextToken();
201 23 100       25 if ($token ne $token_to_match) {
202 20         28 push @collection => $token;
203             }
204             else {
205 3         3 $matched++;
206 3         4 last;
207             }
208             }
209 4 100       7 unless ($matched) {
210             # reset back to where we started, and ...
211 1         2 $self->{index} = $old_index;
212             # and return nothing
213 1         4 return;
214             }
215             # and return our collection
216 3         20 return @collection;
217             }
218              
219              
220             sub skipTokensUntil {
221 3     3   5 my ($self, $token_to_match) = @_;
222             # if this matches our current token ...
223 3 100       6 if ($self->lookAheadToken() eq $token_to_match) {
224             # then just advance it one
225 1         3 $self->nextToken();
226             # and return success
227 1         4 return 1;
228             }
229             # if it doesnt match our current token then, ...
230             # store the index we start at
231 2         4 my $old_index = $self->{index};
232             # and loop through the tokens
233 2         4 while ($self->hasNextToken()) {
234             # return success if we match our token
235 18 100       17 return 1 if ($self->nextToken() eq $token_to_match);
236             }
237             # otherwise we didnt match, and should
238             # reset back to where we started, and ...
239 1         2 $self->{index} = $old_index;
240             # return failure
241 1         3 return 0;
242             }
243              
244             sub skipTokenIfWhitespace {
245 4     4   6 my ($self) = @_;
246 4 100       6 $self->{index}++ if $self->lookAheadToken() =~ /^\s+$/;
247             }
248              
249             sub skipTokens {
250 13     13   21 my ($self, $num_token_to_skip) = @_;
251 13   100     29 $num_token_to_skip ||= 1;
252 13         16 $self->{index} += $num_token_to_skip;
253             }
254              
255             *skipToken = \&skipTokens;
256              
257             1;
258              
259             __END__
260              
261             =head1 NAME
262              
263             String::Tokenizer - A simple string tokenizer.
264              
265             =head1 SYNOPSIS
266              
267             use String::Tokenizer;
268              
269             # create the tokenizer and tokenize input
270             my $tokenizer = String::Tokenizer->new("((5+5) * 10)", '+*()');
271              
272             # create tokenizer
273             my $tokenizer = String::Tokenizer->new();
274             # ... then tokenize the string
275             $tokenizer->tokenize("((5 + 5) - 10)", '()');
276              
277             # will print '(, (, 5, +, 5, ), -, 10, )'
278             print join ", " => $tokenizer->getTokens();
279              
280             # create tokenizer which retains whitespace
281             my $st = String::Tokenizer->new(
282             'this is a test with, (significant) whitespace',
283             ',()',
284             String::Tokenizer->RETAIN_WHITESPACE
285             );
286              
287             # this will print:
288             # 'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace'
289             print "'" . (join "', '" => $tokenizer->getTokens()) . "'";
290              
291             # get a token iterator
292             my $i = $tokenizer->iterator();
293             while ($i->hasNextToken()) {
294             my $next = $i->nextToken();
295             # peek ahead at the next token
296             my $look_ahead = $i->lookAheadToken();
297             # ...
298             # skip the next 2 tokens
299             $i->skipTokens(2);
300             # ...
301             # then backtrack 1 token
302             my $previous = $i->prevToken();
303             # ...
304             # get the current token
305             my $current = $i->currentToken();
306             # ...
307             }
308              
309             =head1 DESCRIPTION
310              
311             A simple string tokenizer which takes a string and splits it on whitespace. It also optionally takes a string of characters to use as delimiters, and returns them with the token set as well. This allows for splitting the string in many different ways.
312              
313             This is a very basic tokenizer, so more complex needs should be either addressed with a custom written tokenizer or post-processing of the output generated by this module. Basically, this will not fill everyone's needs, but it spans a gap between simple C<split / /, $string> and the other options that involve much larger and complex modules.
314              
315             Also note that this is not a lexical analyser. Many people confuse tokenization with lexical analysis. A tokenizer merely splits its input into specific chunks, a lexical analyzer classifies those chunks. Sometimes these two steps are combined, but not here.
316              
317             =head1 METHODS
318              
319             =over 4
320              
321             =item B<new ($string, $delimiters, $handle_whitespace)>
322              
323             If you do not supply any parameters, nothing happens, the instance is just created. But if you do supply parameters, they are passed on to the C<tokenize> method and that method is run. For information about those arguments, see C<tokenize> below.
324              
325             =item B<setDelimiter ($delimiter)>
326              
327             This can be used to set the delimiter string, this is used by C<tokenize>.
328              
329             =item B<handleWhitespace ($value)>
330              
331             This can be used to set the whitespace handling. It accepts one of the two constant values C<RETAIN_WHITESPACE> or C<IGNORE_WHITESPACE>.
332              
333             =item B<tokenize ($string, $delimiters, $handle_whitespace)>
334              
335             Takes a C<$string> to tokenize, and optionally a set of C<$delimiter> characters to facilitate the tokenization and the type of whitespace handling with C<$handle_whitespace>. The C<$string> parameter and the C<$handle_whitespace> parameter are pretty obvious, the C<$delimiter> parameter is not as transparent. C<$delimiter> is a string of characters, these characters are then separated into individual characters and are used to split the C<$string> with. So given this string:
336              
337             (5 + (100 * (20 - 35)) + 4)
338              
339             The C<tokenize> method without a C<$delimiter> parameter would return the following comma separated list of tokens:
340              
341             '(5', '+', '(100', '*', '(20', '-', '35))', '+', '4)'
342              
343             However, if you were to pass the following set of delimiters C<(, )> to C<tokenize>, you would get the following comma separated list of tokens:
344              
345             '(', '5', '+', '(', '100', '*', '(', '20', '-', '35', ')', ')', '+', '4', ')'
346              
347             We now can differentiate the parens from the numbers, and no globbing occurs. If you wanted to allow for optionally leaving out the whitespace in the expression, like this:
348              
349             (5+(100*(20-35))+4)
350              
351             as some languages do. Then you would give this delimiter C<+*-()> to arrive at the same result.
352              
353             If you decide that whitespace is significant in your string, then you need to specify that like this:
354              
355             my $st = String::Tokenizer->new(
356             'this is a test with, (significant) whitespace',
357             ',()',
358             String::Tokenizer->RETAIN_WHITESPACE
359             );
360              
361             A call to C<getTokens> on this instance would result in the following token set.
362              
363             'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace'
364              
365             All running whitespace is grouped together into a single token, we make no attempt to split it into its individual parts.
366              
367             =item B<getTokens>
368              
369             Simply returns the array of tokens. It returns an array-ref in scalar context.
370              
371             =item B<iterator>
372              
373             Returns a B<String::Tokenizer::Iterator> instance, see below for more details.
374              
375             =back
376              
377             =head1 INNER CLASS
378              
379             A B<String::Tokenizer::Iterator> instance is returned from the B<String::Tokenizer>'s C<iterator> method and serves as yet another means of iterating through an array of tokens. The simplest way would be to call C<getTokens> and just manipulate the array yourself, or push the array into another object. However, iterating through a set of tokens tends to get messy when done manually. So here I have provided the B<String::Tokenizer::Iterator> to address those common token processing idioms. It is basically a bi-directional iterator which can look ahead, skip and be reset to the beginning.
380              
381             B<NOTE:>
382             B<String::Tokenizer::Iterator> is an inner class, which means that only B<String::Tokenizer> objects can create an instance of it. That said, if B<String::Tokenizer::Iterator>'s C<new> method is called from outside of the B<String::Tokenizer> package, an exception is thrown.
383              
384             =over 4
385              
386             =item B<new ($tokens_array_ref)>
387              
388             This accepts an array reference of tokens and sets up the iterator. This method can only be called from within the B<String::Tokenizer> package, otherwise an exception will be thrown.
389              
390             =item B<reset>
391              
392             This will reset the internal counter,
393             bringing it back to the beginning of the token list.
394              
395             =item B<hasNextToken>
396              
397             This will return true (1) if there are more tokens to be iterated over,
398             and false (0) otherwise.
399              
400             =item B<hasPrevToken>
401              
402             This will return true (1) if there are tokens before the current position (that is, the iterator has advanced past the beginning of the token list), and false (0) otherwise.
403              
404             =item B<nextToken>
405              
406             This dispenses the next available token, and moves the internal counter ahead by one.
407              
408             =item B<prevToken>
409              
410             This dispenses the previous token, and moves the internal counter back by one.
411              
412             =item B<currentToken>
413              
414             This returns the current token, which will match the last token retrieved by C<nextToken>.
415              
416             =item B<lookAheadToken>
417              
418             This peeks ahead one token to the next one in the list. This item will match the next item dispensed with C<nextToken>. This is a non-destructive look ahead, meaning it does not alter the position of the internal counter.
419              
420             =item B<skipToken>
421              
422             This will jump the internal counter ahead by 1.
423              
424             =item B<skipTokens ($number_to_skip)>
425              
426             This will jump the internal counter ahead by C<$number_to_skip>.
427              
428             =item B<skipTokenIfWhitespace>
429              
430             This will skip the next token if it is whitespace.
431              
432             =item B<skipTokensUntil ($token_to_match)>
433              
434             Given a string as a C<$token_to_match>, this will skip all tokens until it matches that string. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state.
435              
436             =item B<collectTokensUntil ($token_to_match)>
437              
438             Given a string as a C<$token_to_match>, this will collect all tokens until it matches that string, at which point the collected tokens will be returned. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state and no tokens will be returned.
439              
440             =back
441              
442             =head1 TO DO
443              
444             =over 4
445              
446             =item I<Inline token expansion>
447              
448             The Java StringTokenizer class allows for a token to be tokenized further, therefore breaking it up more and including the results into the current token stream. I have never used this feature in this class, but I can see where it might be a useful one. This may be in the next release if it works out.
449              
450             Possibly complement this expansion with compression as well, so for instance double quoted strings could be compressed into a single token.
451              
452             =item I<Token Bookmarks>
453              
454             Allow for the creation of "token bookmarks". Meaning we could tag a specific token with a label, that index could be returned to from any point in the token stream. We could mix this with a memory stack as well, so that we would have an ordering to the bookmarks as well.
455              
456             =back
457              
458             =head1 BUGS
459              
460             None that I am aware of. Of course, if you find a bug, let me know, and I will be sure to fix it.
461              
462             =head1 CODE COVERAGE
463              
464             I use B<Devel::Cover> to test the code coverage of my tests, below is the B<Devel::Cover> report on this module's test suite.
465              
466             ------------------------ ------ ------ ------ ------ ------ ------ ------
467             File stmt branch cond sub pod time total
468             ------------------------ ------ ------ ------ ------ ------ ------ ------
469             String/Tokenizer.pm 100.0 100.0 64.3 100.0 100.0 100.0 97.6
470             ------------------------ ------ ------ ------ ------ ------ ------ ------
471             Total 100.0 100.0 64.3 100.0 100.0 100.0 97.6
472             ------------------------ ------ ------ ------ ------ ------ ------ ------
473              
474             =head1 SEE ALSO
475              
476             The interface and workings of this module are based largely on the StringTokenizer class from the Java standard library.
477              
478             Below is a short list of other modules that might be considered similar to this one. If this module does not suit your needs, you might look at one of these.
479              
480             =over 4
481              
482             =item L<String::Tokeniser>
483              
484             Along with being a tokenizer,
485             it also provides a means of moving through the resulting tokens,
486             allowing for skipping of tokens and such.
487             It was last updated in 2011.
488              
489             =item L<Parse::Tokens>
490              
491             This one hasn't been touched since 2001,
492             although it did get up to version 0.27.
493             It looks to lean over more towards the parser side than a basic tokenizer.
494              
495             =item L<Text::Tokenizer>
496              
497             This is both a lexical analyzer and a tokenizer.
498             It also uses XS, where String::Tokenizer is pure perl.
499             This is something maybe to look into if you were to need a more beefy solution
500             than String::Tokenizer provides.
501              
502             =back
503              
504             =head1 THANKS
505              
506             =over
507              
508             =item Thanks to Stephan Tobias for finding bugs and suggestions on whitespace handling.
509              
510             =back
511              
512             =head1 AUTHOR
513              
514             stevan little, E<lt>stevan@cpan.orgE<gt>
515              
516             =head1 COPYRIGHT AND LICENSE
517              
518             Copyright 2004-2016 by Infinity Interactive, Inc.
519              
520             L<http://www.iinteractive.com>
521              
522             This library is free software; you can redistribute it and/or modify
523             it under the same terms as Perl itself.
524              
525             =cut