File Coverage

blib/lib/String/Tokeniser.pm

Criterion	Covered	Total	%
statement	41	89	46.0
branch	6	20	30.0
condition	4	4	100.0
subroutine	5	15	33.3
pod	11	11	100.0
total	67	139	48.2

line	stmt	bran	cond	sub	pod	time	code
1							package String::Tokeniser;
2
3	2			2		10186	use strict;
	2					6
	2					77
4	2			2		11	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	2					4
	2					137
5	2			2		21	use Carp;
	2					3
	2					2039
6
7							require Exporter;
8
9							@ISA = qw(Exporter);
10							@EXPORT_OK = qw();
11
12							$VERSION = '0.05';
13
14							1;
15
16							=pod
17
18							=head1 NAME
19
20							String::Tokeniser - Perl extension for, uhm, tokenising strings.
21
22							=head1 SYNOPSIS
23
24							use String::Tokeniser;
25
26							=head1 DESCRIPTION
27
28							C provides an interface to a tokeniser class, allowing
29							one to manipulate strings on a token-by-token basis without having to
30							keep track of list element numbers and so on.
31
32							=head1 CONSTRUCTOR
33
34							=over 4
35
36							=item new ( $sentence, [0\|-1\|$regexp], [$exception...] )
37
38							Create a C, tokenises $sentence and resets the token
39							counter.
40
41							The next argument determines how a ``token'' is defined: a value of 0 or
42							C determines that underscores B included in a token; -1 states
43							that they are not. Alternatively, you can supply your own regular
44							expression which will be fed to a C to determine the tokens.
45
46							Then may optionally follow a list of exceptions: tokens that would be
47							split in two, but should be treated as one.
48
49							=back
50
51							=cut
52
53							sub new {
54	2			2	1	867	my $classname = shift;
55	2					4	my $self = {};
56	2					5	bless($self, $classname);
57	2					5	my $sentence = shift;
58	2	50				7	carp "! Nothing to tokenise" unless defined $sentence;
59	2		100			11	my $style = shift \|\| 0;
60	2					3	my @list;
61	2	100				8	if ($style==-1) {
		50
62	1					2	$style= "(?<=[^a-zA-Z0-9])\|(?=[^a-zA-Z0-9])";
63							} elsif ($style) {
64							} else {
65	1					2	$style="(?<=[^a-zA-Z0-9_])\|(?=[^a-zA-Z0-9_])";
66							}
67	2					11	$self->{STYLE} = $style;
68	2					105	@list = split /$style/, $sentence;
69	2					9	$self->{LIST} = \@list;
70	2					4	$self->{COUNT} = 0;
71	2					5	$self->{STACK} = [];
72	2					7	$self->_except(@_); # Exception handler. Is not fun.
73
74	2					16	return($self);
75							}
76
77							=pod
78
79							=head1 METHODS
80
81							=over 4
82
83							=item moretokens
84
85							Tells you if you have any more tokens left to deal with.
86
87							=cut
88
89	0			0	1	0	sub moretokens { my $self = shift;
90	0					0	return ($self->{COUNT} <= $#{$self->{LIST}})
	0					0
91							}
92
93							=pod
94
95							=item skiptoken([n])
96
97							Move the `pointer' forward one (or C) tokens.
98
99							=cut
100	0			0	1	0	sub skiptoken { my $self=shift; my $howmany=shift;
	0					0
101	0	0				0	$howmany=1 unless defined $howmany;
102	0					0	$self->{COUNT}+=$howmany;
103							}
104
105							=pod
106
107							=item thistoken
108
109							Return the current token; that is, the token under the `pointer'.
110
111							=cut
112
113	0			0	1	0	sub thistoken { my $self=shift;
114	0					0	return $self->{LIST}->[$self->{COUNT}];
115							}
116
117							=pod
118
119							=item lasttoken
120
121							Return the previous token; that is, the one just past the `pointer'.
122
123							=cut
124
125	0			0	1	0	sub lasttoken { my $self=shift;
126	0					0	return $self->{LIST}->[$self->{COUNT}-1];
127							}
128
129							=pod
130
131							=item gettoken
132
133							Equivalent to C - the usual way of grabbing the
134							next token in the list in turn.
135
136							=cut
137
138	0			0	1	0	sub gettoken { my $self=shift;
139	0					0	$self->skiptoken(); return $self->lasttoken();}
	0					0
140
141							=pod
142
143							=item nexttoken
144
145							Looks ahead one token, but does not change the `pointer' position.
146
147							=cut
148
149	0			0	1	0	sub nexttoken { my $self=shift;
150	0					0	return $self->{LIST}->[$self->{COUNT}+1];
151							}
152
153							=pod
154
155							=item lookahead([n])
156
157							Returns a string composed of the next C tokens, but does not change
158							the `pointer' position.
159
160							=cut
161
162	0			0	1	0	sub lookahead { my $self=shift;
163	0					0	my $howmany=shift;
164	0	0				0	croak "Silly value in lookahead" if $howmany <=1;
165	0					0	my $ret="";
166	0					0	for (my $i=$self->{COUNT}; $i<$self->{COUNT}+$howmany; $i++)
167	0					0	{ $ret.= $self->{LIST}->[$i] }
168	0					0	return $ret;
169							}
170
171							=pod
172
173							=item gimme($string)
174
175							Assuming a string of tokens will end in C<$string>, returns everything
176							from the current `pointer' position until the string is found. Returns
177							a two-element list: firsly, why the search terminated, (either C
178							meaning we hit the end of the token list without success, or C
179							meaning C<$string> was found.) and the rest of the tokens upto and
180							including C<$string> (or the end of the list, whichever was soonest).
181
182							=cut
183
184	0			0	1	0	sub gimme { my ($self,$expectation)=(shift,shift);
185	0					0	my $why="EOF"; my $retval="";
	0					0
186	0					0	while ($self->moretokens()) {
187	0					0	$retval.=$self->gettoken();
188	0	0				0	if (substr($retval,-length($expectation)) eq $expectation) {
189	0					0	$why="FOUND";
190							last
191	0					0	}
192							}
193	0					0	return ($why, $retval);
194							}
195
196							=pod
197
198							=item save
199
200							Saves one's pointer position. Can be used multiply as a save stack.
201
202							=cut
203
204	0			0	1	0	sub save { my $self=shift;
205	0					0	push @{$self->{STACK}}, $self->{COUNT};
	0					0
206							}
207
208							=pod
209
210							=item restore
211
212							Restores a previously saved position.
213
214							=cut
215
216	0			0	1	0	sub restore { my $self=shift; my $temp;
	0					0
217	0	0				0	$self->{COUNT}=$temp if $temp = pop @{$self->{STACK}};
	0					0
218							}
219
220							=pod
221
222							=back
223
224							=head1 FEATURES
225
226							At present, there is no support for exceptions which spread over three
227							or more tokens, although this is planned.
228
229							=head1 AUTHOR
230
231							Originaly written by Simon Cozens;
232							Maintained by Alberto Simoes C<>
233
234							=head1 SEE ALSO
235
236							L
237
238							=cut
239
240							# I have no idea how this works any more. And I've only just written
241							# it.
242							# -- Simon Cozens
243							#
244							# But it is correct, and simple! You just need to indent it correctly.
245							#
246							# -- Alberto Simoes
247
248							sub _except {
249	2			2		5	my $self = shift;
250	2					3	my $style = $self->{STYLE};
251	2					4	my %decide;
252	2					3	my $listref=$self->{LIST};
253	2					2	my @res;
254
255	2					8	while($_ = shift) { # was foreach(shift) {
256	0					0	my($left, $right) = split /$style/;
257	0					0	push @{$decide{$left}}, $right;
	0					0
258							}
259
260	2					10	@_ = @$listref;
261	2					7	while (@_) {
262	9		100			31	my($first,$second) = (shift, shift \|\| "");
263	9	0				20	if (grep { $first eq $_ and scalar(grep { $second eq $_ } @{$decide{$_}}) } keys %decide ) {
	0	50				0
	0					0
	0					0
264							# I think
265	0					0	push(@res, $first.$second);
266							} else {
267	9					25	push @res,$first;
268	9	50				22	if (grep { $second eq $_ } keys %decide) {
	0					0
269	0					0	unshift(@_, $second);
270							} else {
271	9					30	push @res, $second;
272							}
273							}
274							}
275	2					7	$self->{LIST}=\@res;
276	2					8	return $self;
277							}
278
279							# sub ishere { return 1 }