File Coverage

blib/lib/String/Tokeniser.pm
Criterion Covered Total %
statement 41 89 46.0
branch 6 20 30.0
condition 4 4 100.0
subroutine 5 15 33.3
pod 11 11 100.0
total 67 139 48.2


line stmt bran cond sub pod time code
1             package String::Tokeniser;
2              
3 2     2   10186 use strict;
  2         6  
  2         77  
4 2     2   11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  2         4  
  2         137  
5 2     2   21 use Carp;
  2         3  
  2         2039  
6              
7             require Exporter;
8              
9             @ISA = qw(Exporter);
10             @EXPORT_OK = qw();
11              
12             $VERSION = '0.05';
13              
14             1;
15              
16             =pod
17              
18             =head1 NAME
19              
20             String::Tokeniser - Perl extension for, uhm, tokenising strings.
21              
22             =head1 SYNOPSIS
23              
24             use String::Tokeniser;
25              
26             =head1 DESCRIPTION
27              
28             C provides an interface to a tokeniser class, allowing
29             one to manipulate strings on a token-by-token basis without having to
30             keep track of list element numbers and so on.
31              
32             =head1 CONSTRUCTOR
33              
34             =over 4
35              
36             =item new ( $sentence, [0|-1|$regexp], [$exception...] )
37              
38             Create a C, tokenises $sentence and resets the token
39             counter.
40              
41             The next argument determines how a ``token'' is defined: a value of 0 or
42             C determines that underscores B included in a token; -1 states
43             that they are not. Alternatively, you can supply your own regular
44             expression which will be fed to a C to determine the tokens.
45              
46             Then may optionally follow a list of exceptions: tokens that would be
47             split in two, but should be treated as one.
48              
49             =back
50              
51             =cut
52              
53             sub new {
54 2     2 1 867 my $classname = shift;
55 2         4 my $self = {};
56 2         5 bless($self, $classname);
57 2         5 my $sentence = shift;
58 2 50       7 carp "! Nothing to tokenise" unless defined $sentence;
59 2   100     11 my $style = shift || 0;
60 2         3 my @list;
61 2 100       8 if ($style==-1) {
    50          
62 1         2 $style= "(?<=[^a-zA-Z0-9])|(?=[^a-zA-Z0-9])";
63             } elsif ($style) {
64             } else {
65 1         2 $style="(?<=[^a-zA-Z0-9_])|(?=[^a-zA-Z0-9_])";
66             }
67 2         11 $self->{STYLE} = $style;
68 2         105 @list = split /$style/, $sentence;
69 2         9 $self->{LIST} = \@list;
70 2         4 $self->{COUNT} = 0;
71 2         5 $self->{STACK} = [];
72 2         7 $self->_except(@_); # Exception handler. Is not fun.
73              
74 2         16 return($self);
75             }
76              
77             =pod
78              
79             =head1 METHODS
80              
81             =over 4
82              
83             =item moretokens
84              
85             Tells you if you have any more tokens left to deal with.
86              
87             =cut
88              
89 0     0 1 0 sub moretokens { my $self = shift;
90 0         0 return ($self->{COUNT} <= $#{$self->{LIST}})
  0         0  
91             }
92              
93             =pod
94              
95             =item skiptoken([n])
96              
97             Move the `pointer' forward one (or C) tokens.
98              
99             =cut
100 0     0 1 0 sub skiptoken { my $self=shift; my $howmany=shift;
  0         0  
101 0 0       0 $howmany=1 unless defined $howmany;
102 0         0 $self->{COUNT}+=$howmany;
103             }
104              
105             =pod
106              
107             =item thistoken
108              
109             Return the current token; that is, the token under the `pointer'.
110              
111             =cut
112              
113 0     0 1 0 sub thistoken { my $self=shift;
114 0         0 return $self->{LIST}->[$self->{COUNT}];
115             }
116              
117             =pod
118              
119             =item lasttoken
120              
121             Return the previous token; that is, the one just past the `pointer'.
122              
123             =cut
124              
125 0     0 1 0 sub lasttoken { my $self=shift;
126 0         0 return $self->{LIST}->[$self->{COUNT}-1];
127             }
128              
129             =pod
130              
131             =item gettoken
132              
133             Equivalent to C - the usual way of grabbing the
134             next token in the list in turn.
135              
136             =cut
137              
138 0     0 1 0 sub gettoken { my $self=shift;
139 0         0 $self->skiptoken(); return $self->lasttoken();}
  0         0  
140              
141             =pod
142              
143             =item nexttoken
144              
145             Looks ahead one token, but does not change the `pointer' position.
146              
147             =cut
148              
149 0     0 1 0 sub nexttoken { my $self=shift;
150 0         0 return $self->{LIST}->[$self->{COUNT}+1];
151             }
152              
153             =pod
154              
155             =item lookahead([n])
156              
157             Returns a string composed of the next C tokens, but does not change
158             the `pointer' position.
159              
160             =cut
161              
162 0     0 1 0 sub lookahead { my $self=shift;
163 0         0 my $howmany=shift;
164 0 0       0 croak "Silly value in lookahead" if $howmany <=1;
165 0         0 my $ret="";
166 0         0 for (my $i=$self->{COUNT}; $i<$self->{COUNT}+$howmany; $i++)
167 0         0 { $ret.= $self->{LIST}->[$i] }
168 0         0 return $ret;
169             }
170              
171             =pod
172              
173             =item gimme($string)
174              
175             Assuming a string of tokens will end in C<$string>, returns everything
176             from the current `pointer' position until the string is found. Returns
177             a two-element list: firsly, why the search terminated, (either C
178             meaning we hit the end of the token list without success, or C
179             meaning C<$string> was found.) and the rest of the tokens upto and
180             including C<$string> (or the end of the list, whichever was soonest).
181              
182             =cut
183              
184 0     0 1 0 sub gimme { my ($self,$expectation)=(shift,shift);
185 0         0 my $why="EOF"; my $retval="";
  0         0  
186 0         0 while ($self->moretokens()) {
187 0         0 $retval.=$self->gettoken();
188 0 0       0 if (substr($retval,-length($expectation)) eq $expectation) {
189 0         0 $why="FOUND";
190             last
191 0         0 }
192             }
193 0         0 return ($why, $retval);
194             }
195              
196             =pod
197              
198             =item save
199              
200             Saves one's pointer position. Can be used multiply as a save stack.
201              
202             =cut
203              
204 0     0 1 0 sub save { my $self=shift;
205 0         0 push @{$self->{STACK}}, $self->{COUNT};
  0         0  
206             }
207              
208             =pod
209              
210             =item restore
211              
212             Restores a previously saved position.
213              
214             =cut
215              
216 0     0 1 0 sub restore { my $self=shift; my $temp;
  0         0  
217 0 0       0 $self->{COUNT}=$temp if $temp = pop @{$self->{STACK}};
  0         0  
218             }
219              
220             =pod
221              
222             =back
223              
224             =head1 FEATURES
225              
226             At present, there is no support for exceptions which spread over three
227             or more tokens, although this is planned.
228              
229             =head1 AUTHOR
230              
231             Originaly written by Simon Cozens;
232             Maintained by Alberto Simoes C<>
233              
234             =head1 SEE ALSO
235              
236             L
237              
238             =cut
239              
240             # I have no idea how this works any more. And I've *only just* written
241             # it.
242             # -- Simon Cozens
243             #
244             # But it is correct, and simple! You just need to indent it correctly.
245             #
246             # -- Alberto Simoes
247              
248             sub _except {
249 2     2   5 my $self = shift;
250 2         3 my $style = $self->{STYLE};
251 2         4 my %decide;
252 2         3 my $listref=$self->{LIST};
253 2         2 my @res;
254              
255 2         8 while($_ = shift) { # was foreach(shift) {
256 0         0 my($left, $right) = split /$style/;
257 0         0 push @{$decide{$left}}, $right;
  0         0  
258             }
259              
260 2         10 @_ = @$listref;
261 2         7 while (@_) {
262 9   100     31 my($first,$second) = (shift, shift || "");
263 9 0       20 if (grep { $first eq $_ and scalar(grep { $second eq $_ } @{$decide{$_}}) } keys %decide ) {
  0 50       0  
  0         0  
  0         0  
264             # I think
265 0         0 push(@res, $first.$second);
266             } else {
267 9         25 push @res,$first;
268 9 50       22 if (grep { $second eq $_ } keys %decide) {
  0         0  
269 0         0 unshift(@_, $second);
270             } else {
271 9         30 push @res, $second;
272             }
273             }
274             }
275 2         7 $self->{LIST}=\@res;
276 2         8 return $self;
277             }
278              
279             # sub ishere { return 1 }