File Coverage

blib/lib/Text/Shellwords/Cursor.pm

Criterion	Covered	Total	%
statement	111	171	64.9
branch	57	114	50.0
condition	32	54	59.2
subroutine	6	8	75.0
pod	6	6	100.0
total	212	353	60.0

line	stmt	bran	cond	sub	pod	time	code
1							# Text::Shellwords::Cursor.pm
2							# Scott Bronson
3							# 27 Jan 2003
4
5							package Text::Shellwords::Cursor;
6
7	2			2		34157	use strict;
	2					5
	2					82
8
9	2			2		12	use vars qw($VERSION);
	2					4
	2					7232
10							$VERSION = '0.81';
11
12							=head1 NAME
13
14							Text::Shellwords::Cursor - Parse a string into tokens
15
16							=head1 SYNOPSIS
17
18							use Text::Shellwords::Cursor;
19							my $parser = Text::Shellwords::Cursor->new();
20							my $str = 'ab cdef "ghi" j"k\"l "';
21							my ($tok1) = $parser->parse_line($str);
22							$tok1 = ['ab', 'cdef', 'ghi', 'j', 'k"l ']
23							my ($tok2, $tokno, $tokoff) = $parser->parse_line($str, cursorpos => 6);
24							as above, but $tokno=1, $tokoff=3 (under the 'f')
25
26							DESCRIPTION
27
28							This module is very similar to Text::Shellwords and Text::ParseWords.
29							However, it has one very significant difference: it keeps track of
30							a character position in the line it's parsing. For instance, if you
31							pass it ("zq fmgb", cursorpos=>6), it would return
32							(['zq', 'fmgb'], 1, 3). The cursorpos parameter
33							tells where in the input string the cursor resides
34							(just before the 'b'), and the result tells you that
35							the cursor was on token 1 ('fmgb'), character 3 ('b').
36							This is very useful when computing command-line completions
37							involving quoting, escaping, and tokenizing characters (like '(' or '=').
38
39							A few helper utilities are included as well. You can escape a string to
40							ensure that parsing it will produce the original string (L).
41							You can also reassemble the tokens with a visually pleasing amount of
42							whitespace between them (L).
43
44							This module started out as an integral part of Term::GDBUI using
45							code loosely based on Text::ParseWords. However,
46							it is now basically a ground-up reimplementation. It was
47							split out of Term::GDBUI for version 0.8.
48
49							=head1 METHODS
50
51							=over 3
52
53							=item new
54
55							Creates a new parser. Takes named arguments on the command line.
56
57							=over 4
58
59							=item keep_quotes
60
61							Normally all unescaped, unnecessary quote marks are stripped.
62							If you specify C1>, however, they are preserved.
63							This is useful if you need to know whether the string was quoted
64							or not (string constants) or what type of quotes was around it
65							(affecting variable interpolation, for instance).
66
67							=item token_chars
68
69							This argument specifies the characters that should be considered
70							tokens all by themselves. For instance, if I pass
71							token_chars=>'=', then 'ab=123' would be parsed to ('ab', '=', '123').
72							Without token_chars, 'ab=123' remains a single string.
73
74							NOTE: you cannot change token_chars after the constructor has been
75							called! The regexps that use it are compiled once (m//o).
76							Also, until the Gnu Readline library can accept "=[]," without
77							diving into an endless loop, we will not tell history expansion
78							to use token_chars (it uses " \t\n()<>;&\|" by default).
79
80							=item debug
81
82							Turns on rather copious debugging to try to show what the parser is
83							thinking at every step.
84
85							=item space_none
86
87							=item space_before
88
89							=item space_after
90
91							These variables affect how whitespace in the line is normalized and
92							it is reassembled into a string. See the L routine.
93
94							=item error
95
96							This is a reference to a routine that should be called to display
97							a parse error. The routine takes two arguments: a reference to the
98							parser, and the error message to display as a string.
99
100							=cut
101
102							sub new
103							{
104	2			2	1	19	my $type = shift;
105	2					62	bless {
106							keep_quotes => 0,
107							token_chars => '',
108							debug => 0,
109							space_none => '(',
110							space_before => '[{',
111							space_after => ',)]}',
112							error => undef,
113							@_
114							}, $type
115							}
116
117							=item parsebail(msg)
118
119							If the parsel routine or any of its subroutines runs into a fatal
120							error, they call parsebail to present a very descriptive diagnostic.
121
122							=cut
123
124							sub parsebail
125							{
126	2			2	1	5	my $self = shift;
127	2					8	my $msg = shift;
128
129	2					33	die "$msg at char " . pos() . ":\n",
130							" $_\n " . (' ' x pos()) . '^' . "\n";
131
132							}
133
134
135							=item parsel
136
137							This is the heinous routine that actually does the parsing.
138							You should never need to call it directly. Call
139							L
140							instead.
141
142							=cut
143
144							sub parsel
145							{
146	65			65	1	391	my $self = shift;
147	65					137	$_ = shift;
148	65					101	my $cursorpos = shift;
149	65					172	my $fixclosequote = shift;
150
151	65					126	my $deb = $self->{debug};
152	65					112	my $tchrs = $self->{token_chars};
153
154	65		66			409	my $usingcp = (defined($cursorpos) && $cursorpos ne '');
155	65					293	my $tokno = undef;
156	65					82	my $tokoff = undef;
157	65					73	my $oldpos;
158
159	65					114	my @pieces = ();
160
161							# Need to special case the empty string. None of the patterns below
162							# will match it yet we need to return an empty token for the cursor.
163	65	100	100			715	return ([''], 0, 0) if $usingcp && $_ eq '';
164
165	64					183	/^/gc; # force scanning to the beginning of the line
166
167	64					88	do {
168	100	0				793	$deb && print "-- top, pos=" . pos() .
		50
169							($usingcp ? " cursorpos=$cursorpos" : "") . "\n";
170
171							# trim whitespace from the beginning
172	100	100				420	if(/\G(\s+)/gc) {
173	54	0				129	$deb && print "trimmed " . length($1) . " whitespace chars, " .
		50
174							($usingcp ? "cursorpos=$cursorpos" : "") . "\n";
175							# if pos passed cursorpos, then we know that the cursor was
176							# surrounded by ws and we need to create an empty token for it.
177	54	100	100			218	if($usingcp && (pos() >= $cursorpos)) {
178							# if pos == cursorpos and we're not yet at EOL, let next token accept cursor
179	12	100	100			61	unless(pos() == $cursorpos && pos() < length($_)) {
180							# need to special-case at end-of-line as there are no more tokens
181							# to take care of the cursor so we must create an empty one.
182	8	50				98	$deb && print "adding bogus token to handle cursor.\n";
183	8					16	push @pieces, '';
184	8					16	$tokno = $#pieces;
185	8					12	$tokoff = 0;
186	8					17	$usingcp = 0;
187							}
188							}
189							}
190
191							# if there's a quote, then suck to the close quote
192	100					276	$oldpos = pos();
193	100	100				757	if(/\G(['"])/gc) {
194	40					101	my $quote = $1;
195	40					58	my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
196	40					49	my $s;
197
198	40	50				102	$deb && print "Found open quote [$quote] oldpos=$oldpos\n";
199
200							# adjust tokoff unless the cursor sits directly on the open quote
201	40	100	100			203	if($usingcp && pos()-1 < $cursorpos) {
202	31	50				81	$deb && print " lead quote increment pos=".pos()." cursorpos=$cursorpos\n";
203	31					61	$adjust += 1;
204							}
205
206	40	100				98	if($quote eq '"') {
207	20	100				106	if(/\G((?:\\.\|(?!["])[^\\])*)["]/gc) {
208	19					40	$s = $1; # string without quotes
209							} else {
210	1	50				7	unless($fixclosequote) {
211	1					5	pos() -= 1;
212	1					5	$self->parsebail("need closing quote [\"]");
213							}
214	0					0	/\G(.*)$/gc; # if no close quote, just suck to the end of the string
215	0					0	$s = $1; # string without quotes
216	0	0	0			0	if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
	0					0
217							}
218	19	50				45	$deb && print " quoted string is \"$s\"\n";
219	19					74	while($s =~ /\\./g) {
220	14					28	my $ps = pos($s) - 2; # points to the start of the sub
221	14	50				30	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
222	14	100	66			69	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
223	14					28	substr($s, $ps, 1) = '';
224	14					32	pos($s) = $ps + 1;
225	14	50				77	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
226							}
227							} else {
228	20	100				384	if(/\G((?:\\.\|(?!['])[^\\])*)[']/gc) {
229	19					41	$s = $1; # string without quotes
230							} else {
231	1	50				5	unless($fixclosequote) {
232	1					4	pos() -= 1;
233	1					7	$self->parsebail("need closing quote [']");
234							}
235	0					0	/\G(.*)$/gc; # if no close quote, just suck to the end of the string
236	0					0	$s = $1;
237	0	0	0			0	if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
	0					0
238							}
239	19	50				574	$deb && print " quoted string is '$s'\n";
240	19					75	while($s =~ /\\[\\']/g) {
241	12					24	my $ps = pos($s) - 2; # points to the start of the sub
242	12	50				295	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
243	12	100	66			360	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
244	12					44	substr($s, $ps, 1) = '';
245	12					31	pos($s) = $ps + 1;
246	12	50				70	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
247							}
248							}
249
250							# adjust tokoff if the cursor if it sits directly on the close quote
251	38	100	100			196	if($usingcp && pos() == $cursorpos) {
252	8	50				26	$deb && print " trail quote increment pos=".pos()." cursorpos=$cursorpos\n";
253	8					15	$adjust += 1;
254							}
255
256	38	50				91	$deb && print " Found close, pushing '$s' oldpos=$oldpos\n";
257	38	50				101	if($self->{keep_quotes}) {
258	0					0	$adjust -= 1; # need to move right 1 for opening quote
259	0					0	$s = $quote.$s.$quote;
260							}
261	38					78	push @pieces, $s;
262
263							# Set tokno and tokoff if this token contained the cursor
264	38	100	100			174	if($usingcp && pos() >= $cursorpos) {
265							# Previous block contains the cursor
266	34					55	$tokno = $#pieces;
267	34					59	$tokoff = $cursorpos - $oldpos - $adjust;
268	34					76	$usingcp = 0;
269							}
270							}
271
272							# suck up as much unquoted text as we can
273	98					149	$oldpos = pos();
274	98	100				787	if(/\G((?:\\.\|[^\s\\"'\Q$tchrs\E])+)/gco) {
275	40					487	my $s = $1; # the unquoted string
276	40					56	my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
277
278	40	50				153	$deb && print "Found unquoted string '$s'\n";
279	40					503	while($s =~ /\\./g) {
280	9					17	my $ps = pos($s) - 2; # points to the start of substitution
281	9	50				21	$deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
282	9	100	100			61	$adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
283	9					23	substr($s, $ps, 1) = '';
284	9					21	pos($s) = $ps + 1;
285	9	50				47	$deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
286							}
287	40	50				96	$deb && print " pushing '$s'\n";
288	40					83	push @pieces, $s;
289
290							# Set tokno and tokoff if this token contained the cursor
291	40	100	100			315	if($usingcp && pos() >= $cursorpos) {
292							# Previous block contains the cursor
293	12					19	$tokno = $#pieces;
294	12					202	$tokoff = $cursorpos - $oldpos - $adjust;
295	12					163	$usingcp = 0;
296							}
297							}
298
299	98	50	33			687	if(length($tchrs) && /\G([\Q$tchrs\E])/gco) {
300	0					0	my $s = $1; # the token char
301	0	0				0	$deb && print " pushing '$s'\n";
302	0					0	push @pieces, $s;
303
304	0	0	0			0	if($usingcp && pos() == $cursorpos) {
305							# Previous block contains the cursor
306	0					0	$tokno = $#pieces;
307	0					0	$tokoff = 0;
308	0					0	$usingcp = 0;
309							}
310							}
311							} until(pos() >= length($_));
312
313	62	0				747	$deb && print "Result: (", join(", ", @pieces), ") " .
		0
		50
314							(defined($tokno) ? $tokno : 'undef') . " " .
315							(defined($tokoff) ? $tokoff : 'undef') . "\n";
316
317	62					684	return ([@pieces], $tokno, $tokoff);
318							}
319
320
321							=item parse_line(line, I)
322
323							This is the entrypoint to this module's parsing functionality. It converts
324							a line into tokens, respecting quoted text, escaped characters,
325							etc. It also keeps track of a cursor position on the input text,
326							returning the token number and offset within the token where that position
327							can be found in the output.
328
329							This routine originally bore some resemblance to Text::ParseWords.
330							It has changed almost completely, however, to support keeping track
331							of the cursor position. It also has nicer failure modes, modular
332							quoting, token characters (see token_chars in L), etc. This
333							routine now does much more.
334
335							Arguments:
336
337							=over 3
338
339							=item line
340
341							This is a string containing the command-line to parse.
342
343							=back
344
345							This routine also accepts the following named parameters:
346
347							=over 3
348
349							=item cursorpos
350
351							This is the character position in the line to keep track of.
352							Pass undef (by not specifying it) or the empty string to have
353							the line processed with cursorpos ignored.
354
355							Note that passing undef is I the same as passing
356							some random number and ignoring the result! For instance, if you
357							pass 0 and the line begins with whitespace, you'll get a 0-length token at
358							the beginning of the line to represent the cursor in
359							the middle of the whitespace. This allows command completion
360							to work even when the cursor is not near any tokens.
361							If you pass undef, all whitespace at the beginning and end of
362							the line will be trimmed as you would expect.
363
364							If it is ambiguous whether the cursor should belong to the previous
365							token or to the following one (i.e. if it's between two quoted
366							strings, say "a""b" or a token_char), it always gravitates to
367							the previous token. This makes more sense when completing.
368
369							=item fixclosequote
370
371							Sometimes you want to try to recover from a missing close quote
372							(for instance, when calculating completions), but usually you
373							want a missing close quote to be a fatal error. fixclosequote=>1
374							will implicitly insert the correct quote if it's missing.
375							fixclosequote=>0 is the default.
376
377							=item messages
378
379							parse_line is capable of printing very informative error messages.
380							However, sometimes you don't care enough to print a message (like
381							when calculating completions). Messages are printed by default,
382							so pass messages=>0 to turn them off.
383
384							=back
385
386							This function returns a reference to an array containing three
387							items:
388
389							=over 3
390
391							=item tokens
392
393							A the tokens that the line was separated into (ref to an array of strings).
394
395							=item tokno
396
397							The number of the token (index into the previous array) that contains
398							cursorpos.
399
400							=item tokoff
401
402							The character offet into tokno of cursorpos.
403
404							=back
405
406							If the cursor is at the end of the token, tokoff will point to 1
407							character past the last character in tokno, a non-existant character.
408							If the cursor is between tokens (surrounded by whitespace), a zero-length
409							token will be created for it.
410
411							=cut
412
413							sub parse_line
414							{
415	65			65	1	54862	my $self = shift;
416	65					198	my $line = shift;
417	65					551	my %args = (
418							messages => 1, # true if we should print errors, etc.
419							cursorpos => undef, # cursor to keep track of, undef to ignore.
420							fixclosequote => 0,
421							@_
422							);
423
424	65					117	my @result = eval { $self->parsel($line,
	65					387
425							$args{'cursorpos'}, $args{'fixclosequote'}) };
426	65	100				306	if($@) {
427	2	50	33			13	$self->{error}->($self, $@) if $args{'messages'} && $self->{error};
428	2					6	@result = (undef, undef, undef);
429							}
430
431	65					318	return @result;
432							}
433
434
435							=item parse_escape(lines)
436
437							Escapes characters that would be otherwise interpreted by the parser.
438							Will accept either a single string or an arrayref of strings (which
439							will be modified in-place).
440
441							=cut
442
443							sub parse_escape
444							{
445	0			0	1		my $self = shift;
446	0						my $arr = shift; # either a string or an arrayref of strings
447
448	0						my $wantstr = 0;
449	0	0					if(ref($arr) ne 'ARRAY') {
450	0						$arr = [$arr];
451	0						$wantstr = 1;
452							}
453
454	0						foreach(@$arr) {
455	0						my $quote;
456	0	0	0				if($self->{keep_quotes} && /^(['"])(.*)\1$/) {
457	0						($quote, $_) = ($1, $2);
458							}
459	0						s/([ \\"'])/\\$1/g;
460	0	0					$_ = $quote.$_.$quote if $quote;
461							}
462
463	0	0					return $wantstr ? $arr->[0] : $arr;
464							}
465
466
467							=item join_line(tokens)
468
469							This routine does a somewhat intelligent job of joining tokens
470							back into a command line. If token_chars (see L) is empty
471							(the default), then it just escapes backslashes and quotes, and
472							joins the tokens with spaces.
473
474							However, if token_chars is nonempty, it tries to insert a visually
475							pleasing amount of space between the tokens. For instance, rather
476							than 'a ( b , c )', it tries to produce 'a (b, c)'. It won't reformat
477							any tokens that aren't found in $self->{token_chars}, of course.
478
479							To change the formatting, you can redefine the variables
480							$self->{space_none}, $self->{space_before}, and $self->{space_after}.
481							Each variable is a string containing all characters that should
482							not be surrounded by whitespace, should have whitespace before,
483							and should have whitespace after, respectively. Any character
484							found in token_chars, but non in any of these space_ variables,
485							will have space placed both before and after.
486
487							=cut
488
489							sub join_line
490							{
491	0			0	1		my $self = shift;
492	0						my $intoks = shift;
493
494	0						my $tchrs = $self->{token_chars};
495	0						my $s_none = $self->{space_none};
496	0						my $s_before = $self->{space_before};
497	0						my $s_after = $self->{space_after};
498
499							# copy the input array so we don't modify it
500	0						my $tokens = $self->parse_escape([@$intoks]);
501
502	0						my $str = '';
503	0						my $sw = 0; # space if space wanted after token.
504	0						my $sf = 0; # space if space should be forced after token.
505	0						for(@$tokens) {
506	0	0	0				if(length == 1 && index($tchrs,$_) >= 0) {
507	0	0					if(index($s_none,$_) >= 0) { $str .= $_; $sw=0; next; }
	0
	0
	0
508	0	0					if(index($s_before,$_) >= 0) { $str .= $sw.$_; $sw=0; next; }
	0
	0
	0
509	0	0					if(index($s_after,$_) >= 0) { $str .= $_; $sw=1; next; }
	0
	0
	0
510							# default: force space on both sides of operator.
511	0						$str .= " $_ "; $sw = 0; next;
	0
	0
512							}
513	0	0					$str .= ($sw ? ' ' : '') . $_;
514	0						$sw = 1;
515							}
516
517	0						return $str;
518							}
519
520
521							=back
522
523							=head1 BUGS
524
525							None known.
526
527							=head1 LICENSE
528
529							Copyright (c) 2003 Scott Bronson, all rights reserved.
530							This program is free software; you can redistribute it and/or modify
531							it under the same terms as Perl itself.
532
533							=head1 AUTHOR
534
535							Scott Bronson Ebronson@rinspin.comE
536
537							=cut
538
539							1;