File Coverage

blib/lib/Text/Shellwords/Cursor.pm
Criterion Covered Total %
statement 111 171 64.9
branch 57 114 50.0
condition 32 54 59.2
subroutine 6 8 75.0
pod 6 6 100.0
total 212 353 60.0


line stmt bran cond sub pod time code
1             # Text::Shellwords::Cursor.pm
2             # Scott Bronson
3             # 27 Jan 2003
4              
5             package Text::Shellwords::Cursor;
6              
7 2     2   34157 use strict;
  2         5  
  2         82  
8              
9 2     2   12 use vars qw($VERSION);
  2         4  
  2         7232  
10             $VERSION = '0.81';
11              
12             =head1 NAME
13              
14             Text::Shellwords::Cursor - Parse a string into tokens
15              
16             =head1 SYNOPSIS
17              
18             use Text::Shellwords::Cursor;
19             my $parser = Text::Shellwords::Cursor->new();
20             my $str = 'ab cdef "ghi" j"k\"l "';
21             my ($tok1) = $parser->parse_line($str);
22             $tok1 = ['ab', 'cdef', 'ghi', 'j', 'k"l ']
23             my ($tok2, $tokno, $tokoff) = $parser->parse_line($str, cursorpos => 6);
24             as above, but $tokno=1, $tokoff=3 (under the 'f')
25              
26             DESCRIPTION
27              
28             This module is very similar to Text::Shellwords and Text::ParseWords.
29             However, it has one very significant difference: it keeps track of
30             a character position in the line it's parsing. For instance, if you
31             pass it ("zq fmgb", cursorpos=>6), it would return
32             (['zq', 'fmgb'], 1, 3). The cursorpos parameter
33             tells where in the input string the cursor resides
34             (just before the 'b'), and the result tells you that
35             the cursor was on token 1 ('fmgb'), character 3 ('b').
36             This is very useful when computing command-line completions
37             involving quoting, escaping, and tokenizing characters (like '(' or '=').
38              
39             A few helper utilities are included as well. You can escape a string to
40             ensure that parsing it will produce the original string (L).
41             You can also reassemble the tokens with a visually pleasing amount of
42             whitespace between them (L).
43              
44             This module started out as an integral part of Term::GDBUI using
45             code loosely based on Text::ParseWords. However,
46             it is now basically a ground-up reimplementation. It was
47             split out of Term::GDBUI for version 0.8.
48              
49             =head1 METHODS
50              
51             =over 3
52              
53             =item new
54              
55             Creates a new parser. Takes named arguments on the command line.
56              
57             =over 4
58              
59             =item keep_quotes
60              
61             Normally all unescaped, unnecessary quote marks are stripped.
62             If you specify C1>, however, they are preserved.
63             This is useful if you need to know whether the string was quoted
64             or not (string constants) or what type of quotes was around it
65             (affecting variable interpolation, for instance).
66              
67             =item token_chars
68              
69             This argument specifies the characters that should be considered
70             tokens all by themselves. For instance, if I pass
71             token_chars=>'=', then 'ab=123' would be parsed to ('ab', '=', '123').
72             Without token_chars, 'ab=123' remains a single string.
73              
74             NOTE: you cannot change token_chars after the constructor has been
75             called! The regexps that use it are compiled once (m//o).
76             Also, until the Gnu Readline library can accept "=[]," without
77             diving into an endless loop, we will not tell history expansion
78             to use token_chars (it uses " \t\n()<>;&|" by default).
79              
80             =item debug
81              
82             Turns on rather copious debugging to try to show what the parser is
83             thinking at every step.
84              
85             =item space_none
86              
87             =item space_before
88              
89             =item space_after
90              
91             These variables affect how whitespace in the line is normalized and
92             it is reassembled into a string. See the L routine.
93              
94             =item error
95              
96             This is a reference to a routine that should be called to display
97             a parse error. The routine takes two arguments: a reference to the
98             parser, and the error message to display as a string.
99              
100             =cut
101              
102             sub new
103             {
104 2     2 1 19 my $type = shift;
105 2         62 bless {
106             keep_quotes => 0,
107             token_chars => '',
108             debug => 0,
109             space_none => '(',
110             space_before => '[{',
111             space_after => ',)]}',
112             error => undef,
113             @_
114             }, $type
115             }
116              
117             =item parsebail(msg)
118              
119             If the parsel routine or any of its subroutines runs into a fatal
120             error, they call parsebail to present a very descriptive diagnostic.
121              
122             =cut
123              
124             sub parsebail
125             {
126 2     2 1 5 my $self = shift;
127 2         8 my $msg = shift;
128              
129 2         33 die "$msg at char " . pos() . ":\n",
130             " $_\n " . (' ' x pos()) . '^' . "\n";
131              
132             }
133              
134              
135             =item parsel
136              
137             This is the heinous routine that actually does the parsing.
138             You should never need to call it directly. Call
139             L
140             instead.
141              
142             =cut
143              
144             sub parsel
145             {
146 65     65 1 391 my $self = shift;
147 65         137 $_ = shift;
148 65         101 my $cursorpos = shift;
149 65         172 my $fixclosequote = shift;
150              
151 65         126 my $deb = $self->{debug};
152 65         112 my $tchrs = $self->{token_chars};
153              
154 65   66     409 my $usingcp = (defined($cursorpos) && $cursorpos ne '');
155 65         293 my $tokno = undef;
156 65         82 my $tokoff = undef;
157 65         73 my $oldpos;
158              
159 65         114 my @pieces = ();
160              
161             # Need to special case the empty string. None of the patterns below
162             # will match it yet we need to return an empty token for the cursor.
163 65 100 100     715 return ([''], 0, 0) if $usingcp && $_ eq '';
164              
165 64         183 /^/gc; # force scanning to the beginning of the line
166              
167 64         88 do {
168 100 0       793 $deb && print "-- top, pos=" . pos() .
    50          
169             ($usingcp ? " cursorpos=$cursorpos" : "") . "\n";
170              
171             # trim whitespace from the beginning
172 100 100       420 if(/\G(\s+)/gc) {
173 54 0       129 $deb && print "trimmed " . length($1) . " whitespace chars, " .
    50          
174             ($usingcp ? "cursorpos=$cursorpos" : "") . "\n";
175             # if pos passed cursorpos, then we know that the cursor was
176             # surrounded by ws and we need to create an empty token for it.
177 54 100 100     218 if($usingcp && (pos() >= $cursorpos)) {
178             # if pos == cursorpos and we're not yet at EOL, let next token accept cursor
179 12 100 100     61 unless(pos() == $cursorpos && pos() < length($_)) {
180             # need to special-case at end-of-line as there are no more tokens
181             # to take care of the cursor so we must create an empty one.
182 8 50       98 $deb && print "adding bogus token to handle cursor.\n";
183 8         16 push @pieces, '';
184 8         16 $tokno = $#pieces;
185 8         12 $tokoff = 0;
186 8         17 $usingcp = 0;
187             }
188             }
189             }
190              
191             # if there's a quote, then suck to the close quote
192 100         276 $oldpos = pos();
193 100 100       757 if(/\G(['"])/gc) {
194 40         101 my $quote = $1;
195 40         58 my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
196 40         49 my $s;
197              
198 40 50       102 $deb && print "Found open quote [$quote] oldpos=$oldpos\n";
199              
200             # adjust tokoff unless the cursor sits directly on the open quote
201 40 100 100     203 if($usingcp && pos()-1 < $cursorpos) {
202 31 50       81 $deb && print " lead quote increment pos=".pos()." cursorpos=$cursorpos\n";
203 31         61 $adjust += 1;
204             }
205              
206 40 100       98 if($quote eq '"') {
207 20 100       106 if(/\G((?:\\.|(?!["])[^\\])*)["]/gc) {
208 19         40 $s = $1; # string without quotes
209             } else {
210 1 50       7 unless($fixclosequote) {
211 1         5 pos() -= 1;
212 1         5 $self->parsebail("need closing quote [\"]");
213             }
214 0         0 /\G(.*)$/gc; # if no close quote, just suck to the end of the string
215 0         0 $s = $1; # string without quotes
216 0 0 0     0 if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
  0         0  
217             }
218 19 50       45 $deb && print " quoted string is \"$s\"\n";
219 19         74 while($s =~ /\\./g) {
220 14         28 my $ps = pos($s) - 2; # points to the start of the sub
221 14 50       30 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
222 14 100 66     69 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
223 14         28 substr($s, $ps, 1) = '';
224 14         32 pos($s) = $ps + 1;
225 14 50       77 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
226             }
227             } else {
228 20 100       384 if(/\G((?:\\.|(?!['])[^\\])*)[']/gc) {
229 19         41 $s = $1; # string without quotes
230             } else {
231 1 50       5 unless($fixclosequote) {
232 1         4 pos() -= 1;
233 1         7 $self->parsebail("need closing quote [']");
234             }
235 0         0 /\G(.*)$/gc; # if no close quote, just suck to the end of the string
236 0         0 $s = $1;
237 0 0 0     0 if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
  0         0  
238             }
239 19 50       574 $deb && print " quoted string is '$s'\n";
240 19         75 while($s =~ /\\[\\']/g) {
241 12         24 my $ps = pos($s) - 2; # points to the start of the sub
242 12 50       295 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
243 12 100 66     360 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
244 12         44 substr($s, $ps, 1) = '';
245 12         31 pos($s) = $ps + 1;
246 12 50       70 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
247             }
248             }
249              
250             # adjust tokoff if the cursor if it sits directly on the close quote
251 38 100 100     196 if($usingcp && pos() == $cursorpos) {
252 8 50       26 $deb && print " trail quote increment pos=".pos()." cursorpos=$cursorpos\n";
253 8         15 $adjust += 1;
254             }
255              
256 38 50       91 $deb && print " Found close, pushing '$s' oldpos=$oldpos\n";
257 38 50       101 if($self->{keep_quotes}) {
258 0         0 $adjust -= 1; # need to move right 1 for opening quote
259 0         0 $s = $quote.$s.$quote;
260             }
261 38         78 push @pieces, $s;
262              
263             # Set tokno and tokoff if this token contained the cursor
264 38 100 100     174 if($usingcp && pos() >= $cursorpos) {
265             # Previous block contains the cursor
266 34         55 $tokno = $#pieces;
267 34         59 $tokoff = $cursorpos - $oldpos - $adjust;
268 34         76 $usingcp = 0;
269             }
270             }
271              
272             # suck up as much unquoted text as we can
273 98         149 $oldpos = pos();
274 98 100       787 if(/\G((?:\\.|[^\s\\"'\Q$tchrs\E])+)/gco) {
275 40         487 my $s = $1; # the unquoted string
276 40         56 my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
277              
278 40 50       153 $deb && print "Found unquoted string '$s'\n";
279 40         503 while($s =~ /\\./g) {
280 9         17 my $ps = pos($s) - 2; # points to the start of substitution
281 9 50       21 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
282 9 100 100     61 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
283 9         23 substr($s, $ps, 1) = '';
284 9         21 pos($s) = $ps + 1;
285 9 50       47 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
286             }
287 40 50       96 $deb && print " pushing '$s'\n";
288 40         83 push @pieces, $s;
289              
290             # Set tokno and tokoff if this token contained the cursor
291 40 100 100     315 if($usingcp && pos() >= $cursorpos) {
292             # Previous block contains the cursor
293 12         19 $tokno = $#pieces;
294 12         202 $tokoff = $cursorpos - $oldpos - $adjust;
295 12         163 $usingcp = 0;
296             }
297             }
298              
299 98 50 33     687 if(length($tchrs) && /\G([\Q$tchrs\E])/gco) {
300 0         0 my $s = $1; # the token char
301 0 0       0 $deb && print " pushing '$s'\n";
302 0         0 push @pieces, $s;
303              
304 0 0 0     0 if($usingcp && pos() == $cursorpos) {
305             # Previous block contains the cursor
306 0         0 $tokno = $#pieces;
307 0         0 $tokoff = 0;
308 0         0 $usingcp = 0;
309             }
310             }
311             } until(pos() >= length($_));
312              
313 62 0       747 $deb && print "Result: (", join(", ", @pieces), ") " .
    0          
    50          
314             (defined($tokno) ? $tokno : 'undef') . " " .
315             (defined($tokoff) ? $tokoff : 'undef') . "\n";
316              
317 62         684 return ([@pieces], $tokno, $tokoff);
318             }
319              
320              
321             =item parse_line(line, I)
322              
323             This is the entrypoint to this module's parsing functionality. It converts
324             a line into tokens, respecting quoted text, escaped characters,
325             etc. It also keeps track of a cursor position on the input text,
326             returning the token number and offset within the token where that position
327             can be found in the output.
328              
329             This routine originally bore some resemblance to Text::ParseWords.
330             It has changed almost completely, however, to support keeping track
331             of the cursor position. It also has nicer failure modes, modular
332             quoting, token characters (see token_chars in L), etc. This
333             routine now does much more.
334              
335             Arguments:
336              
337             =over 3
338              
339             =item line
340              
341             This is a string containing the command-line to parse.
342              
343             =back
344              
345             This routine also accepts the following named parameters:
346              
347             =over 3
348              
349             =item cursorpos
350              
351             This is the character position in the line to keep track of.
352             Pass undef (by not specifying it) or the empty string to have
353             the line processed with cursorpos ignored.
354              
355             Note that passing undef is I the same as passing
356             some random number and ignoring the result! For instance, if you
357             pass 0 and the line begins with whitespace, you'll get a 0-length token at
358             the beginning of the line to represent the cursor in
359             the middle of the whitespace. This allows command completion
360             to work even when the cursor is not near any tokens.
361             If you pass undef, all whitespace at the beginning and end of
362             the line will be trimmed as you would expect.
363              
364             If it is ambiguous whether the cursor should belong to the previous
365             token or to the following one (i.e. if it's between two quoted
366             strings, say "a""b" or a token_char), it always gravitates to
367             the previous token. This makes more sense when completing.
368              
369             =item fixclosequote
370              
371             Sometimes you want to try to recover from a missing close quote
372             (for instance, when calculating completions), but usually you
373             want a missing close quote to be a fatal error. fixclosequote=>1
374             will implicitly insert the correct quote if it's missing.
375             fixclosequote=>0 is the default.
376              
377             =item messages
378              
379             parse_line is capable of printing very informative error messages.
380             However, sometimes you don't care enough to print a message (like
381             when calculating completions). Messages are printed by default,
382             so pass messages=>0 to turn them off.
383              
384             =back
385              
386             This function returns a reference to an array containing three
387             items:
388              
389             =over 3
390              
391             =item tokens
392              
393             A the tokens that the line was separated into (ref to an array of strings).
394              
395             =item tokno
396              
397             The number of the token (index into the previous array) that contains
398             cursorpos.
399              
400             =item tokoff
401              
402             The character offet into tokno of cursorpos.
403              
404             =back
405              
406             If the cursor is at the end of the token, tokoff will point to 1
407             character past the last character in tokno, a non-existant character.
408             If the cursor is between tokens (surrounded by whitespace), a zero-length
409             token will be created for it.
410              
411             =cut
412              
413             sub parse_line
414             {
415 65     65 1 54862 my $self = shift;
416 65         198 my $line = shift;
417 65         551 my %args = (
418             messages => 1, # true if we should print errors, etc.
419             cursorpos => undef, # cursor to keep track of, undef to ignore.
420             fixclosequote => 0,
421             @_
422             );
423              
424 65         117 my @result = eval { $self->parsel($line,
  65         387  
425             $args{'cursorpos'}, $args{'fixclosequote'}) };
426 65 100       306 if($@) {
427 2 50 33     13 $self->{error}->($self, $@) if $args{'messages'} && $self->{error};
428 2         6 @result = (undef, undef, undef);
429             }
430              
431 65         318 return @result;
432             }
433              
434              
435             =item parse_escape(lines)
436              
437             Escapes characters that would be otherwise interpreted by the parser.
438             Will accept either a single string or an arrayref of strings (which
439             will be modified in-place).
440              
441             =cut
442              
443             sub parse_escape
444             {
445 0     0 1   my $self = shift;
446 0           my $arr = shift; # either a string or an arrayref of strings
447              
448 0           my $wantstr = 0;
449 0 0         if(ref($arr) ne 'ARRAY') {
450 0           $arr = [$arr];
451 0           $wantstr = 1;
452             }
453              
454 0           foreach(@$arr) {
455 0           my $quote;
456 0 0 0       if($self->{keep_quotes} && /^(['"])(.*)\1$/) {
457 0           ($quote, $_) = ($1, $2);
458             }
459 0           s/([ \\"'])/\\$1/g;
460 0 0         $_ = $quote.$_.$quote if $quote;
461             }
462              
463 0 0         return $wantstr ? $arr->[0] : $arr;
464             }
465              
466              
467             =item join_line(tokens)
468              
469             This routine does a somewhat intelligent job of joining tokens
470             back into a command line. If token_chars (see L) is empty
471             (the default), then it just escapes backslashes and quotes, and
472             joins the tokens with spaces.
473              
474             However, if token_chars is nonempty, it tries to insert a visually
475             pleasing amount of space between the tokens. For instance, rather
476             than 'a ( b , c )', it tries to produce 'a (b, c)'. It won't reformat
477             any tokens that aren't found in $self->{token_chars}, of course.
478              
479             To change the formatting, you can redefine the variables
480             $self->{space_none}, $self->{space_before}, and $self->{space_after}.
481             Each variable is a string containing all characters that should
482             not be surrounded by whitespace, should have whitespace before,
483             and should have whitespace after, respectively. Any character
484             found in token_chars, but non in any of these space_ variables,
485             will have space placed both before and after.
486              
487             =cut
488              
489             sub join_line
490             {
491 0     0 1   my $self = shift;
492 0           my $intoks = shift;
493              
494 0           my $tchrs = $self->{token_chars};
495 0           my $s_none = $self->{space_none};
496 0           my $s_before = $self->{space_before};
497 0           my $s_after = $self->{space_after};
498              
499             # copy the input array so we don't modify it
500 0           my $tokens = $self->parse_escape([@$intoks]);
501              
502 0           my $str = '';
503 0           my $sw = 0; # space if space wanted after token.
504 0           my $sf = 0; # space if space should be forced after token.
505 0           for(@$tokens) {
506 0 0 0       if(length == 1 && index($tchrs,$_) >= 0) {
507 0 0         if(index($s_none,$_) >= 0) { $str .= $_; $sw=0; next; }
  0            
  0            
  0            
508 0 0         if(index($s_before,$_) >= 0) { $str .= $sw.$_; $sw=0; next; }
  0            
  0            
  0            
509 0 0         if(index($s_after,$_) >= 0) { $str .= $_; $sw=1; next; }
  0            
  0            
  0            
510             # default: force space on both sides of operator.
511 0           $str .= " $_ "; $sw = 0; next;
  0            
  0            
512             }
513 0 0         $str .= ($sw ? ' ' : '') . $_;
514 0           $sw = 1;
515             }
516              
517 0           return $str;
518             }
519              
520              
521             =back
522              
523             =head1 BUGS
524              
525             None known.
526              
527             =head1 LICENSE
528              
529             Copyright (c) 2003 Scott Bronson, all rights reserved.
530             This program is free software; you can redistribute it and/or modify
531             it under the same terms as Perl itself.
532              
533             =head1 AUTHOR
534              
535             Scott Bronson Ebronson@rinspin.comE
536              
537             =cut
538              
539             1;