File Coverage

blib/lib/Regexp/Wildcards.pm
Criterion Covered Total %
statement 145 145 100.0
branch 42 42 100.0
condition 8 8 100.0
subroutine 26 26 100.0
pod 5 5 100.0
total 226 226 100.0


line stmt bran cond sub pod time code
1             package Regexp::Wildcards;
2              
3 9     9   199379 use strict;
  9         23  
  9         363  
4 9     9   81 use warnings;
  9         19  
  9         302  
5              
6 9     9   53 use Carp qw;
  9         21  
  9         622  
7 9     9   48 use Scalar::Util qw;
  9         17  
  9         794  
8 9     9   11059 use Text::Balanced qw;
  9         262454  
  9         1602  
9              
10             =head1 NAME
11              
12             Regexp::Wildcards - Converts wildcard expressions to Perl regular expressions.
13              
14             =head1 VERSION
15              
16             Version 1.05
17              
18             =cut
19              
20 9     9   88 use vars qw<$VERSION>;
  9         23  
  9         517  
21             BEGIN {
22 9     9   21433 $VERSION = '1.05';
23             }
24              
25             =head1 SYNOPSIS
26              
27             use Regexp::Wildcards;
28              
29             my $rw = Regexp::Wildcards->new(type => 'unix');
30              
31             my $re;
32             $re = $rw->convert('a{b?,c}*'); # Do it Unix shell style.
33             $re = $rw->convert('a?,b*', 'win32'); # Do it Windows shell style.
34             $re = $rw->convert('*{x,y}?', 'jokers'); # Process the jokers and
35             # escape the rest.
36             $re = $rw->convert('%a_c%', 'sql'); # Turn SQL wildcards into
37             # regexps.
38              
39             $rw = Regexp::Wildcards->new(
40             do => [ qw ], # Do jokers and brackets.
41             capture => [ qw ], # Capture *'s greedily.
42             );
43              
44             $rw->do(add => 'groups'); # Don't escape groups.
45             $rw->capture(rem => [ qw ]); # Actually we want non-greedy
46             # matches.
47             $re = $rw->convert('*a{,(b)?}?c*'); # '(.*?)a(?:|(b).).c(.*?)'
48             $rw->capture(); # No more captures.
49              
50             =head1 DESCRIPTION
51              
52             In many situations, users may want to specify patterns to match but don't need the full power of regexps.
53             Wildcards make one of those sets of simplified rules.
54             This module converts wildcard expressions to Perl regular expressions, so that you can use them for matching.
55              
56             It handles the C<*> and C jokers, as well as Unix bracketed alternatives C<{,}>, but also C<%> and C<_> SQL wildcards.
57             If required, it can also keep original C<(...)> groups or C<^> and C<$> anchors.
58             Backspace (C<\>) is used as an escape character.
59              
60             Typesets that mimic the behaviour of Windows and Unix shells are also provided.
61              
62             =head1 METHODS
63              
64             =cut
65              
66             sub _check_self {
67 468 100 100 468   5258 croak 'First argument isn\'t a valid ' . __PACKAGE__ . ' object'
68             unless blessed $_[0] and $_[0]->isa(__PACKAGE__);
69             }
70              
71             my %types = (
72             jokers => [ qw ],
73             sql => [ qw ],
74             commas => [ qw ],
75             brackets => [ qw ],
76             unix => [ qw ],
77             win32 => [ qw ],
78             );
79             $types{$_} = $types{win32} for qw;
80             $types{$_} = $types{unix} for qw
81             darwin machten next
82             aix irix hpux dgux dynixptx
83             bsdos freebsd openbsd
84             svr4 solaris sunos dec_osf
85             sco_sv unicos unicosmk>;
86              
87             my %escapes = (
88             jokers => '?*',
89             sql => '_%',
90             commas => ',',
91             brackets => '{},',
92             groups => '()',
93             anchors => '^$',
94             );
95              
96             my %captures = (
97             single => sub { $_[1] ? '(.)' : '.' },
98             any => sub { $_[1] ? ($_[0]->{greedy} ? '(.*)'
99             : '(.*?)')
100             : '.*' },
101             brackets => sub { $_[1] ? '(' : '(?:'; },
102             greedy => undef,
103             );
104              
105             sub _validate {
106 66     66   101 my $self = shift;
107 66         283 _check_self $self;
108 66         132 my $valid = shift;
109 66         110 my $old = shift;
110 66 100       1061 $old = { } unless defined $old;
111              
112 66         262 my %opts;
113 66 100       227 if (@_ <= 1) {
    100          
114 47 100       338 $opts{set} = defined $_[0] ? $_[0] : { };
115             } elsif (@_ % 2) {
116 2         250 croak 'Arguments must be passed as an unique scalar or as key => value pairs';
117             } else {
118 17         51 %opts = @_;
119             }
120              
121 64         90 my %checked;
122 64         138 for (qw) {
123 184         290 my $opt = $opts{$_};
124 184 100       545 next unless defined $opt;
125              
126             my $cb = {
127 15     15   153 '' => sub { +{ ($_[0] => 1) x (exists $valid->{$_[0]}) } },
128 25     25   107 'ARRAY' => sub { +{ map { ($_ => 1) x (exists $valid->{$_}) } @{$_[0]} } },
  47         288  
  25         96  
129 6         36 'HASH' => sub { +{ map { ($_ => $_[0]->{$_}) x (exists $valid->{$_}) }
  20         119  
130 20     20   27 keys %{$_[0]} } }
131 64         671 }->{ ref $opt };
132 64 100       1023 croak 'Wrong option set' unless $cb;
133 60         136 $checked{$_} = $cb->($opt);
134             }
135              
136 60 100       221 my $config = (exists $checked{set}) ? $checked{set} : $old;
137 60 100       99 $config->{$_} = $checked{add}->{$_} for grep $checked{add}->{$_},
  60         412  
138             keys %{$checked{add} || {}};
139 60 100       160 delete $config->{$_} for grep $checked{rem}->{$_},
  60         462  
140             keys %{$checked{rem} || {}};
141              
142 60         304 $config;
143             }
144              
145             sub _do {
146 32     32   223 my $self = shift;
147              
148 32         46 my $config;
149 32         347 $config->{do} = $self->_validate(\%escapes, $self->{do}, @_);
150 29         83 $config->{escape} = '';
151 29         42 $config->{escape} .= $escapes{$_} for keys %{$config->{do}};
  29         190  
152 29         96 $config->{escape} = quotemeta $config->{escape};
153              
154 29         64 $config;
155             }
156              
157             sub do {
158 18     18 1 2667 my $self = shift;
159 18         56 _check_self $self;
160              
161 16         59 my $config = $self->_do(@_);
162 13         85 $self->{$_} = $config->{$_} for keys %$config;
163              
164 13         50 $self;
165             }
166              
167             sub _capture {
168 34     34   49 my $self = shift;
169              
170 34         49 my $config;
171 34         128 $config->{capture} = $self->_validate(\%captures, $self->{capture}, @_);
172 31         108 $config->{greedy} = delete $config->{capture}->{greedy};
173 31         102 for (keys %captures) {
174 124 100       526 $config->{'c_' . $_} = $captures{$_}->($config, $config->{capture}->{$_})
175             if $captures{$_}; # Skip 'greedy'
176             }
177              
178 31         80 $config;
179             }
180              
181             sub capture {
182 36     36 1 2798 my $self = shift;
183 36         83 _check_self $self;
184              
185 34         123 my $config = $self->_capture(@_);
186 31         341 $self->{$_} = $config->{$_} for keys %$config;
187              
188 31         141 $self;
189             }
190              
191             sub _type {
192 18     18   49 my ($self, $type) = @_;
193 18 100       59 $type = 'unix' unless defined $type;
194 18 100       401 croak 'Wrong type' unless exists $types{$type};
195              
196 16         4039 my $config = $self->_do($types{$type});
197 16         34 $config->{type} = $type;
198              
199 16         35 $config;
200             }
201              
202             sub type {
203 16     16 1 2235 my $self = shift;
204 16         54 _check_self $self;
205              
206 14         62 my $config = $self->_type(@_);
207 13         134 $self->{$_} = $config->{$_} for keys %$config;
208              
209 13         46 $self;
210             }
211              
212             sub new {
213 14     14 1 2987 my $class = shift;
214 14   100     241 $class = blessed($class) || $class || __PACKAGE__;
215              
216 14 100       259 croak 'Optional arguments must be passed as key => value pairs' if @_ % 2;
217 13         44 my %args = @_;
218              
219 13         44 my $self = bless { }, $class;
220              
221 13 100       53 if (defined $args{do}) {
222 4         24 $self->do($args{do});
223             } else {
224 9         46 $self->type($args{type});
225             }
226              
227 13         68 $self->capture($args{capture});
228             }
229              
230             =head2 C
231              
232             my $rw = Regexp::Wildcards->new(do => $what, capture => $capture);
233             my $rw = Regexp::Wildcards->new(type => $type, capture => $capture);
234              
235             Constructs a new L object.
236              
237             C lists all features that should be enabled when converting wildcards to regexps.
238             Refer to L for details on what can be passed in C<$what>.
239              
240             The C specifies a predefined set of C features to use.
241             See L for details on which types are valid.
242             The C option overrides C.
243              
244             C lists which atoms should be capturing.
245             Refer to L for more details.
246              
247             =head2 C
248              
249             $rw->do($what);
250             $rw->do(set => $c1);
251             $rw->do(add => $c2);
252             $rw->do(rem => $c3);
253              
254             Specifies the list of metacharacters to convert or to prevent for escaping.
255             They fit into six classes :
256              
257             =over 4
258              
259             =item *
260              
261             C<'jokers'>
262              
263             Converts C to C<.> and C<*> to C<.*>.
264              
265             'a**\\*b??\\?c' ==> 'a.*\\*b..\\?c'
266              
267             =item *
268              
269             C<'sql'>
270              
271             Converts C<_> to C<.> and C<%> to C<.*>.
272              
273             'a%%\\%b__\\_c' ==> 'a.*\\%b..\\_c'
274              
275             =item *
276              
277             C<'commas'>
278              
279             Converts all C<,> to C<|> and puts the complete resulting regular expression inside C<(?: ... )>.
280              
281             'a,b{c,d},e' ==> '(?:a|b\\{c|d\\}|e)'
282              
283             =item *
284              
285             C<'brackets'>
286              
287             Converts all matching C<{ ... , ... }> brackets to C<(?: ... | ... )> alternations.
288             If some brackets are unbalanced, it tries to substitute as many of them as possible, and then escape the remaining unmatched C<{> and C<}>.
289             Commas outside of any bracket-delimited block are also escaped.
290              
291             'a,b{c,d},e' ==> 'a\\,b(?:c|d)\\,e'
292             '{a\\{b,c}d,e}' ==> '(?:a\\{b|c)d\\,e\\}'
293             '{a{b,c\\}d,e}' ==> '\\{a\\{b\\,c\\}d\\,e\\}'
294              
295             =item *
296              
297             C<'groups'>
298              
299             Keeps the parenthesis C<( ... )> of the original string without escaping them.
300             Currently, no check is done to ensure that the parenthesis are matching.
301              
302             'a(b(c))d\\(\\)' ==> (no change)
303              
304             =item *
305              
306             C<'anchors'>
307              
308             Prevents the I C<^> and I C<$> anchors to be escaped.
309             Since C<[...]> character class are currently escaped, a C<^> will always be interpreted as I.
310              
311             'a^b$c' ==> (no change)
312              
313             =back
314              
315             Each C<$c> can be any of :
316              
317             =over 4
318              
319             =item *
320              
321             A hash reference, with wanted metacharacter group names (described above) as keys and booleans as values ;
322              
323             =item *
324              
325             An array reference containing the list of wanted metacharacter classes ;
326              
327             =item *
328              
329             A plain scalar, when only one group is required.
330              
331             =back
332              
333             When C is present, the classes given as its value replace the current object options.
334             Then the C classes are added, and the C classes removed.
335              
336             Passing a sole scalar C<$what> is equivalent as passing C<< set => $what >>.
337             No argument means C<< set => [ ] >>.
338              
339             $rw->do(set => 'jokers'); # Only translate jokers.
340             $rw->do('jokers'); # Same.
341             $rw->do(add => [ qw ]); # Translate also SQL and commas.
342             $rw->do(rem => 'jokers'); # Specifying both 'sql' and
343             # 'jokers' is useless.
344             $rw->do(); # Translate nothing.
345              
346             The C method returns the L object.
347              
348             =head2 C
349              
350             $rw->type($type);
351              
352             Notifies to convert the metacharacters that corresponds to the predefined type C<$type>.
353             C<$type> can be any of :
354              
355             =over 4
356              
357             =item *
358              
359             C<'jokers'>, C<'sql'>, C<'commas'>, C<'brackets'>
360              
361             Singleton types that enable the corresponding C classes.
362              
363             =item *
364              
365             C<'unix'>
366              
367             Covers typical Unix shell globbing features (effectively C<'jokers'> and C<'brackets'>).
368              
369             =item *
370              
371             C<$^O> values for common Unix systems
372              
373             Wrap to C<'unix'> (see L for the list).
374              
375             =item *
376              
377             C
378              
379             Defaults to C<'unix'>.
380              
381             =item *
382              
383             C<'win32'>
384              
385             Covers typical Windows shell globbing features (effectively C<'jokers'> and C<'commas'>).
386              
387             =item *
388              
389             C<'dos'>, C<'os2'>, C<'MSWin32'>, C<'cygwin'>
390              
391             Wrap to C<'win32'>.
392              
393             =back
394              
395             In particular, you can usually pass C<$^O> as the C<$type> and get the corresponding shell behaviour.
396              
397             $rw->type('win32'); # Set type to win32.
398             $rw->type($^O); # Set type to unix on Unices and win32 on Windows
399             $rw->type(); # Set type to unix.
400              
401             The C method returns the L object.
402              
403             =head2 C
404              
405             $rw->capture($captures);
406             $rw->capture(set => $c1);
407             $rw->capture(add => $c2);
408             $rw->capture(rem => $c3);
409              
410             Specifies the list of atoms to capture.
411             This method works like L, except that the classes are different :
412              
413             =over 4
414              
415             =item *
416              
417             C<'single'>
418              
419             Captures all unescaped I<"exactly one"> metacharacters, i.e. C for wildcards or C<_> for SQL.
420              
421             'a???b\\??' ==> 'a(.)(.)(.)b\\?(.)'
422             'a___b\\__' ==> 'a(.)(.)(.)b\\_(.)'
423              
424             =item *
425              
426             C<'any'>
427              
428             Captures all unescaped I<"any"> metacharacters, i.e. C<*> for wildcards or C<%> for SQL.
429              
430             'a***b\\**' ==> 'a(.*)b\\*(.*)'
431             'a%%%b\\%%' ==> 'a(.*)b\\%(.*)'
432              
433             =item *
434              
435             C<'greedy'>
436              
437             When used in conjunction with C<'any'>, it makes the C<'any'> captures greedy (by default they are not).
438              
439             'a***b\\**' ==> 'a(.*?)b\\*(.*?)'
440             'a%%%b\\%%' ==> 'a(.*?)b\\%(.*?)'
441              
442             =item *
443              
444             C<'brackets'>
445              
446             Capture matching C<{ ... , ... }> alternations.
447              
448             'a{b\\},\\{c}' ==> 'a(b\\}|\\{c)'
449              
450             =back
451              
452             $rw->capture(set => 'single'); # Only capture "exactly one"
453             # metacharacters.
454             $rw->capture('single'); # Same.
455             $rw->capture(add => [ qw ]); # Also greedily capture
456             # "any" metacharacters.
457             $rw->capture(rem => 'greedy'); # No more greed please.
458             $rw->capture(); # Capture nothing.
459              
460             The C method returns the L object.
461              
462             =head2 C
463              
464             my $rx = $rw->convert($wc);
465             my $rx = $rw->convert($wc, $type);
466              
467             Converts the wildcard expression C<$wc> into a regular expression according to the options stored into the L object, or to C<$type> if it's supplied.
468             It successively escapes all unprotected regexp special characters that doesn't hold any meaning for wildcards, then replace C<'jokers'>, C<'sql'> and C<'commas'> or C<'brackets'> (depending on the L or L options), all of this by applying the C<'capture'> rules specified in the constructor or by L.
469              
470             =cut
471              
472             sub convert {
473 332     332 1 6096 my ($self, $wc, $type) = @_;
474 332         642 _check_self $self;
475              
476 330 100       865 my $config = (defined $type) ? $self->_type($type) : $self;
477 329 100       660 return unless defined $wc;
478              
479 328         540 my $e = $config->{escape};
480             # Escape :
481             # - an even number of \ that doesn't protect a regexp/wildcard metachar
482             # - an odd number of \ that doesn't protect a wildcard metachar
483 328         36852 $wc =~ s/
484             (?
485             (?:\\\\)*
486             (?:
487             [^\w\s\\$e]
488             |
489             \\
490             (?: [^\W$e] | \s | $ )
491             )
492             )
493             /\\$1/gx;
494              
495 328         5404 my $do = $config->{do};
496 328 100       1062 $wc = $self->_jokers($wc) if $do->{jokers};
497 328 100       998 $wc = $self->_sql($wc) if $do->{sql};
498 328 100 100     1455 if ($do->{brackets}) {
    100          
499 40         94 $wc = $self->_bracketed($wc);
500             } elsif ($do->{commas} and $wc =~ /(?
501 7         28 $wc = $self->{'c_brackets'} . $self->_commas($wc) . ')';
502             }
503              
504             $wc
505 328         2023 }
506              
507             =head1 EXPORT
508              
509             An object module shouldn't export any function, and so does this one.
510              
511             =head1 DEPENDENCIES
512              
513             L (core module since perl 5), L, L (since 5.7.3).
514              
515             =head1 CAVEATS
516              
517             This module does not implement the strange behaviours of Windows shell that result from the special handling of the three last characters (for the file extension).
518             For example, Windows XP shell matches C<*a> like C<.*a>, C<*a?> like C<.*a.?>, C<*a??> like C<.*a.{0,2}> and so on.
519              
520             =head1 SEE ALSO
521              
522             L.
523              
524             =head1 AUTHOR
525              
526             Vincent Pit, C<< >>, L.
527              
528             You can contact me by mail or on C (vincent).
529              
530             =head1 BUGS
531              
532             Please report any bugs or feature requests to C, or through the web interface at L.
533             I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
534              
535             =head1 SUPPORT
536              
537             You can find documentation for this module with the perldoc command.
538              
539             perldoc Regexp::Wildcards
540              
541             Tests code coverage report is available at L.
542              
543             =head1 COPYRIGHT & LICENSE
544              
545             Copyright 2007,2008,2009,2013 Vincent Pit, all rights reserved.
546              
547             This program is free software; you can redistribute it and/or modify it
548             under the same terms as Perl itself.
549              
550             =cut
551              
552 134     134   629 sub _extract ($) { extract_bracketed $_[0], '{', qr/.*?(?
553              
554             sub _jokers {
555 201     201   269 my $self = shift;
556 201         350 local $_ = $_[0];
557              
558             # substitute ? preceded by an even number of \
559 201         296 my $s = $self->{c_single};
560 201         748 s/(?
561             # substitute * preceded by an even number of \
562 201         318 $s = $self->{c_any};
563 201         980 s/(?
564              
565 201         583 $_
566             }
567              
568             sub _sql {
569 175     175   259 my $self = shift;
570 175         290 local $_ = $_[0];
571              
572             # substitute _ preceded by an even number of \
573 175         263 my $s = $self->{c_single};
574 175         781 s/(?
575             # substitute % preceded by an even number of \
576 175         290 $s = $self->{c_any};
577 175         508 s/(?
578              
579 175         482 $_
580             }
581              
582             sub _commas {
583 67     67   113 local $_ = $_[1];
584              
585             # substitute , preceded by an even number of \
586 67         393 s/(?
587              
588 67         191 $_
589             }
590              
591             sub _brackets {
592 47     47   68 my ($self, $rest) = @_;
593              
594 47         93 substr $rest, 0, 1, '';
595 47         69 chop $rest;
596              
597 47         66 my ($re, $bracket, $prefix) = ('');
598 47         61 while (do { ($bracket, $rest, $prefix) = _extract $rest; $bracket }) {
  60         107  
  60         3723  
599 13         32 $re .= $self->_commas($prefix) . $self->_brackets($bracket);
600             }
601 47         112 $re .= $self->_commas($rest);
602              
603 47         252 $self->{c_brackets} . $re . ')';
604             }
605              
606             sub _bracketed {
607 40     40   71 my ($self, $rest) = @_;
608              
609 40         102 my ($re, $bracket, $prefix) = ('');
610 40         49 while (do { ($bracket, $rest, $prefix) = _extract $rest; $bracket }) {
  74         157  
  74         7336  
611 34         110 $re .= $prefix . $self->_brackets($bracket);
612             }
613 40         69 $re .= $rest;
614              
615 40         167 $re =~ s/(?
616              
617 40         114 $re;
618             }
619              
620             1; # End of Regexp::Wildcards