File Coverage

blib/lib/Text/Shellwords/Cursor.pm
Criterion Covered Total %
statement 111 171 64.9
branch 57 114 50.0
condition 32 54 59.2
subroutine 6 8 75.0
pod 6 6 100.0
total 212 353 60.0


line stmt bran cond sub pod time code
1             # Text::Shellwords::Cursor.pm
2             # Scott Bronson
3             # 27 Jan 2003
4             # Covered by the MIT license.
5              
6             package Text::Shellwords::Cursor;
7              
8 1     1   27909 use strict;
  1         2  
  1         47  
9              
10 1     1   6 use vars qw($VERSION);
  1         2  
  1         2666  
11             $VERSION = '0.81';
12              
13             =head1 NAME
14              
15             Text::Shellwords::Cursor - Parse a string into tokens
16              
17             =head1 SYNOPSIS
18              
19             use Text::Shellwords::Cursor;
20             my $parser = Text::Shellwords::Cursor->new();
21             my $str = 'ab cdef "ghi" j"k\"l "';
22             my ($tok1) = $parser->parse_line($str);
23             $tok1 = ['ab', 'cdef', 'ghi', 'j', 'k"l ']
24             my ($tok2, $tokno, $tokoff) = $parser->parse_line($str, cursorpos => 6);
25             as above, but $tokno=1, $tokoff=3 (under the 'f')
26              
27             DESCRIPTION
28              
29             This module is very similar to Text::Shellwords and Text::ParseWords.
30             However, it has one very significant difference: it keeps track of
31             a character position in the line it's parsing. For instance, if you
32             pass it ("zq fmgb", cursorpos=>6), it would return
33             (['zq', 'fmgb'], 1, 3). The cursorpos parameter
34             tells where in the input string the cursor resides
35             (just before the 'b'), and the result tells you that
36             the cursor was on token 1 ('fmgb'), character 3 ('b').
37             This is very useful when computing command-line completions
38             involving quoting, escaping, and tokenizing characters (like '(' or '=').
39              
40             A few helper utilities are included as well. You can escape a string to
41             ensure that parsing it will produce the original string (L).
42             You can also reassemble the tokens with a visually pleasing amount of
43             whitespace between them (L).
44              
45             This module started out as an integral part of Term::GDBUI using
46             code loosely based on Text::ParseWords. However,
47             it is now basically a ground-up reimplementation. It was
48             split out of Term::GDBUI for version 0.8.
49              
50             =head1 METHODS
51              
52             =over 3
53              
54             =item new
55              
56             Creates a new parser. Takes named arguments on the command line.
57              
58             =over 4
59              
60             =item keep_quotes
61              
62             Normally all unescaped, unnecessary quote marks are stripped.
63             If you specify C1>, however, they are preserved.
64             This is useful if you need to know whether the string was quoted
65             or not (string constants) or what type of quotes was around it
66             (affecting variable interpolation, for instance).
67              
68             =item token_chars
69              
70             This argument specifies the characters that should be considered
71             tokens all by themselves. For instance, if I pass
72             token_chars=>'=', then 'ab=123' would be parsed to ('ab', '=', '123').
73             Without token_chars, 'ab=123' remains a single string.
74              
75             NOTE: you cannot change token_chars after the constructor has been
76             called! The regexps that use it are compiled once (m//o).
77             Also, until the Gnu Readline library can accept "=[]," without
78             diving into an endless loop, we will not tell history expansion
79             to use token_chars (it uses " \t\n()<>;&|" by default).
80              
81             =item debug
82              
83             Turns on rather copious debugging to try to show what the parser is
84             thinking at every step.
85              
86             =item space_none
87              
88             =item space_before
89              
90             =item space_after
91              
92             These variables affect how whitespace in the line is normalized and
93             it is reassembled into a string. See the L routine.
94              
95             =item error
96              
97             This is a reference to a routine that should be called to display
98             a parse error. The routine takes two arguments: a reference to the
99             parser, and the error message to display as a string.
100              
101             =cut
102              
103             sub new
104             {
105 1     1 1 12 my $type = shift;
106 1         16 bless {
107             keep_quotes => 0,
108             token_chars => '',
109             debug => 0,
110             space_none => '(',
111             space_before => '[{',
112             space_after => ',)]}',
113             error => undef,
114             @_
115             }, $type
116             }
117              
118             =item parsebail(msg)
119              
120             If the parsel routine or any of its subroutines runs into a fatal
121             error, they call parsebail to present a very descriptive diagnostic.
122              
123             =cut
124              
125             sub parsebail
126             {
127 2     2 1 5 my $self = shift;
128 2         5 my $msg = shift;
129              
130 2         38 die "$msg at char " . pos() . ":\n",
131             " $_\n " . (' ' x pos()) . '^' . "\n";
132              
133             }
134              
135              
136             =item parsel
137              
138             This is the heinous routine that actually does the parsing.
139             You should never need to call it directly. Call
140             L
141             instead.
142              
143             =cut
144              
145             sub parsel
146             {
147 65     65 1 83 my $self = shift;
148 65         142 $_ = shift;
149 65         94 my $cursorpos = shift;
150 65         82 my $fixclosequote = shift;
151              
152 65         112 my $deb = $self->{debug};
153 65         98 my $tchrs = $self->{token_chars};
154              
155 65   66     305 my $usingcp = (defined($cursorpos) && $cursorpos ne '');
156 65         77 my $tokno = undef;
157 65         63 my $tokoff = undef;
158 65         64 my $oldpos;
159              
160 65         95 my @pieces = ();
161              
162             # Need to special case the empty string. None of the patterns below
163             # will match it yet we need to return an empty token for the cursor.
164 65 100 100     299 return ([''], 0, 0) if $usingcp && $_ eq '';
165              
166 64         155 /^/gc; # force scanning to the beginning of the line
167              
168 64         79 do {
169 100 0       5023 $deb && print "-- top, pos=" . pos() .
    50          
170             ($usingcp ? " cursorpos=$cursorpos" : "") . "\n";
171              
172             # trim whitespace from the beginning
173 100 100       344 if(/\G(\s+)/gc) {
174 54 0       119 $deb && print "trimmed " . length($1) . " whitespace chars, " .
    50          
175             ($usingcp ? "cursorpos=$cursorpos" : "") . "\n";
176             # if pos passed cursorpos, then we know that the cursor was
177             # surrounded by ws and we need to create an empty token for it.
178 54 100 100     210 if($usingcp && (pos() >= $cursorpos)) {
179             # if pos == cursorpos and we're not yet at EOL, let next token accept cursor
180 12 100 100     57 unless(pos() == $cursorpos && pos() < length($_)) {
181             # need to special-case at end-of-line as there are no more tokens
182             # to take care of the cursor so we must create an empty one.
183 8 50       20 $deb && print "adding bogus token to handle cursor.\n";
184 8         15 push @pieces, '';
185 8         15 $tokno = $#pieces;
186 8         8 $tokoff = 0;
187 8         15 $usingcp = 0;
188             }
189             }
190             }
191              
192             # if there's a quote, then suck to the close quote
193 100         137 $oldpos = pos();
194 100 100       300 if(/\G(['"])/gc) {
195 40         91 my $quote = $1;
196 40         49 my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
197 40         106 my $s;
198              
199 40 50       91 $deb && print "Found open quote [$quote] oldpos=$oldpos\n";
200              
201             # adjust tokoff unless the cursor sits directly on the open quote
202 40 100 100     196 if($usingcp && pos()-1 < $cursorpos) {
203 31 50       63 $deb && print " lead quote increment pos=".pos()." cursorpos=$cursorpos\n";
204 31         51 $adjust += 1;
205             }
206              
207 40 100       87 if($quote eq '"') {
208 20 100       105 if(/\G((?:\\.|(?!["])[^\\])*)["]/gc) {
209 19         38 $s = $1; # string without quotes
210             } else {
211 1 50       5 unless($fixclosequote) {
212 1         4 pos() -= 1;
213 1         9 $self->parsebail("need closing quote [\"]");
214             }
215 0         0 /\G(.*)$/gc; # if no close quote, just suck to the end of the string
216 0         0 $s = $1; # string without quotes
217 0 0 0     0 if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
  0         0  
218             }
219 19 50       71 $deb && print " quoted string is \"$s\"\n";
220 19         68 while($s =~ /\\./g) {
221 14         26 my $ps = pos($s) - 2; # points to the start of the sub
222 14 50       31 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
223 14 100 66     64 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
224 14         26 substr($s, $ps, 1) = '';
225 14         32 pos($s) = $ps + 1;
226 14 50       66 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
227             }
228             } else {
229 20 100       111 if(/\G((?:\\.|(?!['])[^\\])*)[']/gc) {
230 19         40 $s = $1; # string without quotes
231             } else {
232 1 50       5 unless($fixclosequote) {
233 1         4 pos() -= 1;
234 1         10 $self->parsebail("need closing quote [']");
235             }
236 0         0 /\G(.*)$/gc; # if no close quote, just suck to the end of the string
237 0         0 $s = $1;
238 0 0 0     0 if($usingcp && pos() == $cursorpos) { $adjust -= 1; } # make cursor think cq was there
  0         0  
239             }
240 19 50       40 $deb && print " quoted string is '$s'\n";
241 19         75 while($s =~ /\\[\\']/g) {
242 12         20 my $ps = pos($s) - 2; # points to the start of the sub
243 12 50       26 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
244 12 100 66     63 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
245 12         25 substr($s, $ps, 1) = '';
246 12         483 pos($s) = $ps + 1;
247 12 50       176 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
248             }
249             }
250              
251             # adjust tokoff if the cursor if it sits directly on the close quote
252 38 100 100     185 if($usingcp && pos() == $cursorpos) {
253 8 50       23 $deb && print " trail quote increment pos=".pos()." cursorpos=$cursorpos\n";
254 8         12 $adjust += 1;
255             }
256              
257 38 50       92 $deb && print " Found close, pushing '$s' oldpos=$oldpos\n";
258 38 50       94 if($self->{keep_quotes}) {
259 0         0 $adjust -= 1; # need to move right 1 for opening quote
260 0         0 $s = $quote.$s.$quote;
261             }
262 38         72 push @pieces, $s;
263              
264             # Set tokno and tokoff if this token contained the cursor
265 38 100 100     165 if($usingcp && pos() >= $cursorpos) {
266             # Previous block contains the cursor
267 34         51 $tokno = $#pieces;
268 34         48 $tokoff = $cursorpos - $oldpos - $adjust;
269 34         66 $usingcp = 0;
270             }
271             }
272              
273             # suck up as much unquoted text as we can
274 98         126 $oldpos = pos();
275 98 100       368 if(/\G((?:\\.|[^\s\\"'\Q$tchrs\E])+)/gco) {
276 40         79 my $s = $1; # the unquoted string
277 40         45 my $adjust = 0; # keeps track of tokoff bumps due to subs, etc.
278              
279 40 50       80 $deb && print "Found unquoted string '$s'\n";
280 40         119 while($s =~ /\\./g) {
281 9         15 my $ps = pos($s) - 2; # points to the start of substitution
282 9 50       18 $deb && print " doing substr at $ps on '$s' oldpos=$oldpos adjust=$adjust\n";
283 9 100 100     43 $adjust += 1 if $usingcp && $ps < $cursorpos - $oldpos - $adjust;
284 9         17 substr($s, $ps, 1) = '';
285 9         18 pos($s) = $ps + 1;
286 9 50       41 $deb && print " s='$s' usingcp=$usingcp pos(s)=" . pos($s) . " cursorpos=$cursorpos oldpos=$oldpos adjust=$adjust\n";
287             }
288 40 50       82 $deb && print " pushing '$s'\n";
289 40         66 push @pieces, $s;
290              
291             # Set tokno and tokoff if this token contained the cursor
292 40 100 100     153 if($usingcp && pos() >= $cursorpos) {
293             # Previous block contains the cursor
294 12         23 $tokno = $#pieces;
295 12         18 $tokoff = $cursorpos - $oldpos - $adjust;
296 12         24 $usingcp = 0;
297             }
298             }
299              
300 98 50 33     443 if(length($tchrs) && /\G([\Q$tchrs\E])/gco) {
301 0         0 my $s = $1; # the token char
302 0 0       0 $deb && print " pushing '$s'\n";
303 0         0 push @pieces, $s;
304              
305 0 0 0     0 if($usingcp && pos() == $cursorpos) {
306             # Previous block contains the cursor
307 0         0 $tokno = $#pieces;
308 0         0 $tokoff = 0;
309 0         0 $usingcp = 0;
310             }
311             }
312             } until(pos() >= length($_));
313              
314 62 0       123 $deb && print "Result: (", join(", ", @pieces), ") " .
    0          
    50          
315             (defined($tokno) ? $tokno : 'undef') . " " .
316             (defined($tokoff) ? $tokoff : 'undef') . "\n";
317              
318 62         336 return ([@pieces], $tokno, $tokoff);
319             }
320              
321              
322             =item parse_line(line, I)
323              
324             This is the entrypoint to this module's parsing functionality. It converts
325             a line into tokens, respecting quoted text, escaped characters,
326             etc. It also keeps track of a cursor position on the input text,
327             returning the token number and offset within the token where that position
328             can be found in the output.
329              
330             This routine originally bore some resemblance to Text::ParseWords.
331             It has changed almost completely, however, to support keeping track
332             of the cursor position. It also has nicer failure modes, modular
333             quoting, token characters (see token_chars in L), etc. This
334             routine now does much more.
335              
336             Arguments:
337              
338             =over 3
339              
340             =item line
341              
342             This is a string containing the command-line to parse.
343              
344             =back
345              
346             This routine also accepts the following named parameters:
347              
348             =over 3
349              
350             =item cursorpos
351              
352             This is the character position in the line to keep track of.
353             Pass undef (by not specifying it) or the empty string to have
354             the line processed with cursorpos ignored.
355              
356             Note that passing undef is I the same as passing
357             some random number and ignoring the result! For instance, if you
358             pass 0 and the line begins with whitespace, you'll get a 0-length token at
359             the beginning of the line to represent the cursor in
360             the middle of the whitespace. This allows command completion
361             to work even when the cursor is not near any tokens.
362             If you pass undef, all whitespace at the beginning and end of
363             the line will be trimmed as you would expect.
364              
365             If it is ambiguous whether the cursor should belong to the previous
366             token or to the following one (i.e. if it's between two quoted
367             strings, say "a""b" or a token_char), it always gravitates to
368             the previous token. This makes more sense when completing.
369              
370             =item fixclosequote
371              
372             Sometimes you want to try to recover from a missing close quote
373             (for instance, when calculating completions), but usually you
374             want a missing close quote to be a fatal error. fixclosequote=>1
375             will implicitly insert the correct quote if it's missing.
376             fixclosequote=>0 is the default.
377              
378             =item messages
379              
380             parse_line is capable of printing very informative error messages.
381             However, sometimes you don't care enough to print a message (like
382             when calculating completions). Messages are printed by default,
383             so pass messages=>0 to turn them off.
384              
385             =back
386              
387             This function returns a reference to an array containing three
388             items:
389              
390             =over 3
391              
392             =item tokens
393              
394             A the tokens that the line was separated into (ref to an array of strings).
395              
396             =item tokno
397              
398             The number of the token (index into the previous array) that contains
399             cursorpos.
400              
401             =item tokoff
402              
403             The character offet into tokno of cursorpos.
404              
405             =back
406              
407             If the cursor is at the end of the token, tokoff will point to 1
408             character past the last character in tokno, a non-existant character.
409             If the cursor is between tokens (surrounded by whitespace), a zero-length
410             token will be created for it.
411              
412             =cut
413              
414             sub parse_line
415             {
416 65     65 1 36007 my $self = shift;
417 65         100 my $line = shift;
418 65         320 my %args = (
419             messages => 1, # true if we should print errors, etc.
420             cursorpos => undef, # cursor to keep track of, undef to ignore.
421             fixclosequote => 0,
422             @_
423             );
424              
425 65         95 my @result = eval { $self->parsel($line,
  65         194  
426             $args{'cursorpos'}, $args{'fixclosequote'}) };
427 65 100       357 if($@) {
428 2 50 33     11 $self->{error}->($self, $@) if $args{'messages'} && $self->{error};
429 2         6 @result = (undef, undef, undef);
430             }
431              
432 65         277 return @result;
433             }
434              
435              
436             =item parse_escape(lines)
437              
438             Escapes characters that would be otherwise interpreted by the parser.
439             Will accept either a single string or an arrayref of strings (which
440             will be modified in-place).
441              
442             =cut
443              
444             sub parse_escape
445             {
446 0     0 1   my $self = shift;
447 0           my $arr = shift; # either a string or an arrayref of strings
448              
449 0           my $wantstr = 0;
450 0 0         if(ref($arr) ne 'ARRAY') {
451 0           $arr = [$arr];
452 0           $wantstr = 1;
453             }
454              
455 0           foreach(@$arr) {
456 0           my $quote;
457 0 0 0       if($self->{keep_quotes} && /^(['"])(.*)\1$/) {
458 0           ($quote, $_) = ($1, $2);
459             }
460 0           s/([ \\"'])/\\$1/g;
461 0 0         $_ = $quote.$_.$quote if $quote;
462             }
463              
464 0 0         return $wantstr ? $arr->[0] : $arr;
465             }
466              
467              
468             =item join_line(tokens)
469              
470             This routine does a somewhat intelligent job of joining tokens
471             back into a command line. If token_chars (see L) is empty
472             (the default), then it just escapes backslashes and quotes, and
473             joins the tokens with spaces.
474              
475             However, if token_chars is nonempty, it tries to insert a visually
476             pleasing amount of space between the tokens. For instance, rather
477             than 'a ( b , c )', it tries to produce 'a (b, c)'. It won't reformat
478             any tokens that aren't found in $self->{token_chars}, of course.
479              
480             To change the formatting, you can redefine the variables
481             $self->{space_none}, $self->{space_before}, and $self->{space_after}.
482             Each variable is a string containing all characters that should
483             not be surrounded by whitespace, should have whitespace before,
484             and should have whitespace after, respectively. Any character
485             found in token_chars, but non in any of these space_ variables,
486             will have space placed both before and after.
487              
488             =cut
489              
490             sub join_line
491             {
492 0     0 1   my $self = shift;
493 0           my $intoks = shift;
494              
495 0           my $tchrs = $self->{token_chars};
496 0           my $s_none = $self->{space_none};
497 0           my $s_before = $self->{space_before};
498 0           my $s_after = $self->{space_after};
499              
500             # copy the input array so we don't modify it
501 0           my $tokens = $self->parse_escape([@$intoks]);
502              
503 0           my $str = '';
504 0           my $sw = 0; # space if space wanted after token.
505 0           my $sf = 0; # space if space should be forced after token.
506 0           for(@$tokens) {
507 0 0 0       if(length == 1 && index($tchrs,$_) >= 0) {
508 0 0         if(index($s_none,$_) >= 0) { $str .= $_; $sw=0; next; }
  0            
  0            
  0            
509 0 0         if(index($s_before,$_) >= 0) { $str .= $sw.$_; $sw=0; next; }
  0            
  0            
  0            
510 0 0         if(index($s_after,$_) >= 0) { $str .= $_; $sw=1; next; }
  0            
  0            
  0            
511             # default: force space on both sides of operator.
512 0           $str .= " $_ "; $sw = 0; next;
  0            
  0            
513             }
514 0 0         $str .= ($sw ? ' ' : '') . $_;
515 0           $sw = 1;
516             }
517              
518 0           return $str;
519             }
520              
521              
522             =back
523              
524             =back
525              
526             =head1 BUGS
527              
528             None known.
529              
530             =head1 LICENSE
531              
532             Copyright (c) 2003-2011 Scott Bronson, all rights reserved.
533             This program is covered by the MIT license.
534              
535             =head1 AUTHOR
536              
537             Scott Bronson Ebronson@rinspin.comE
538              
539             =cut
540              
541             1;