File Coverage

blib/lib/RPerl/Operation/Expression/Operator/RegularExpression.pm
Criterion Covered Total %
statement 111 133 83.4
branch 26 40 65.0
condition n/a
subroutine 7 8 87.5
pod n/a
total 144 181 79.5


line stmt bran cond sub pod time code
1             # [[[ HEADER ]]]
2             package RPerl::Operation::Expression::Operator::RegularExpression;
3 4     4   22 use strict;
  4         10  
  4         98  
4 4     4   18 use warnings;
  4         8  
  4         78  
5 4     4   20 use RPerl::AfterSubclass;
  4         8  
  4         476  
6             our $VERSION = 0.013_000;
7              
8             # [[[ OO INHERITANCE ]]]
9 4     4   25 use parent qw(RPerl::Operation::Expression::Operator);
  4         9  
  4         20  
10 4     4   204 use RPerl::Operation::Expression::Operator;
  4         438  
  4         5012  
11              
12             # [[[ CRITICS ]]]
13             ## no critic qw(ProhibitUselessNoCritic ProhibitMagicNumbers RequireCheckedSyscalls) # USER DEFAULT 1: allow numeric values & print operator
14             ## no critic qw(RequireInterpolationOfMetachars) # USER DEFAULT 2: allow single-quoted control characters & sigils
15              
16             # [[[ OO PROPERTIES ]]]
17             our hashref $properties = {};
18              
19             # COPYRIGHT NOTICE: modifier descriptions copied from JPCRE2 docs under BSD license https://github.com/jpcre2/jpcre2
20              
21             # supported and compliant! :-)
22             our string_hashref $modifiers_compile = {
23             i => 'Case-insensitive. Equivalent to PCRE2_CASELESS option.',
24             m => 'Multi-line regex. Equivalent to PCRE2_MULTILINE option.',
25             s => 'If this modifier is set, a dot meta-character in the pattern matches all characters, including newlines. Equivalent to PCRE2_DOTALL option.',
26             u => 'Enable UTF support.Treat pattern and subjects as UTF strings. It is equivalent to PCRE2_UTF option.',
27             x => 'Whitespace data characters in the pattern are totally ignored except when escaped or inside a character class, enables commentary in pattern. Equivalent to PCRE2_EXTENDED option.',
28             };
29              
30             =begin DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
31              
32             # DEV NOTE: there are other unsupported modifiers as well, see the Perl docs https://perldoc.perl.org/perlre.html#Modifiers
33             our string_hashref $modifiers_compile_unsupported = {
34             xx => 'Whitespace data characters in the pattern are totally ignored except when escaped, EVEN WHEN INSIDE A CHARACTER CLASS. Requires Perl v5.26 or newer.',
35             };
36              
37             # DEV NOTE: some of these noncompliant modifiers may be related to Perl regex assertions, such as 'A' https://perldoc.perl.org/perlre.html
38             our string_hashref $modifiers_compile_noncompliant = {
39             e => 'Unset back-references in the pattern will match to empty strings. Equivalent to PCRE2_MATCH_UNSET_BACKREF.',
40             j => '\u \U \x and unset back-references will act as JavaScript standard. Equivalent to PCRE2_ALT_BSUX | PCRE2_MATCH_UNSET_BACKREF.
41             \U matches an upper case "U" character (by default it causes a compile error if this option is not set).
42             \u matches a lower case "u" character unless it is followed by four hexadecimal digits, in which case the hexadecimal number defines the code point to match (by default it causes a compile error if this option is not set).
43             \x matches a lower case "x" character unless it is followed by two hexadecimal digits, in which case the hexadecimal number defines the code point to match (By default, as in Perl, a hexadecimal number is always expected after \x, but it may have zero, one, or two digits (so, for example, \xz matches a binary zero character followed by z) ).
44             Unset back-references in the pattern will match to empty strings.',
45             n => 'Enable Unicode support for \w \d etc... in pattern. Equivalent to PCRE2_UTF | PCRE2_UCP.',
46             A => 'Match only at the first position. It is equivalent to PCRE2_ANCHORED option.',
47             D => 'A dollar meta-character in the pattern matches only at the end of the subject string. Without this modifier, a dollar also matches immediately before the final character if it is a newline (but not before any other newlines). This modifier is ignored if m modifier is set. Equivalent to PCRE2_DOLLAR_ENDONLY option.',
48             J => 'Allow duplicate names for sub-patterns. Equivalent to PCRE2_DUPNAMES option.',
49             S => 'When a pattern is going to be used several times, it is worth spending more time analyzing it in order to speed up the time taken for matching/replacing. It may also be beneficial for a very long subject string or pattern. Equivalent to an extra compilation with JIT_COMPILER with the option PCRE2_JIT_COMPLETE.',
50             U => 'This modifier inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by ?. Equivalent to PCRE2_UNGREEDY option.',
51             };
52              
53             =end DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
54              
55             =cut
56              
57             our string_hashref $modifiers_match = {
58             g => 'Global. Will perform global matching or replacement if passed. Equivalent to jpcre2::FIND_ALL for match and PCRE2_SUBSTITUTE_GLOBAL for replace.',
59             };
60              
61             =begin DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
62              
63             our string_hashref $modifiers_match_noncompliant = {
64             A => 'Match at start. Equivalent to PCRE2_ANCHORED. Can be used in match operation. Setting this option only at match time (i.e regex was not compiled with this option) will disable optimization during match time.',
65             };
66              
67             =end DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
68              
69             =cut
70              
71             our string_hashref $modifiers_substitute = {
72             g => 'Global. Will perform global matching or replacement if passed. Equivalent to jpcre2::FIND_ALL for match and PCRE2_SUBSTITUTE_GLOBAL for replace.',
73             };
74              
75             =begin DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
76              
77             our string_hashref $modifiers_substitute_noncompliant = {
78             e => 'Replaces unset group with empty string. Equivalent to PCRE2_SUBSTITUTE_UNSET_EMPTY.',
79             E => 'Extension of e modifier. Sets even unknown groups to empty string. Equivalent to PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_UNKNOWN_UNSET',
80             x => 'Extended replacement operation. Equivalent to PCRE2_SUBSTITUTE_EXTENDED. It enables some Bash like features:
81             ${<n>:-<string>}
82             ${<n>:+<string1>:<string2>}
83             <n> may be a group number or a name. The first form specifies a default value. If group <n> is set, its value is inserted; if not, <string> is expanded and the result is inserted. The second form specifies strings that are expanded and inserted when group <n> is set or unset, respectively. The first form is just a convenient shorthand for ${<n>:+${<n>}:<string>}.',
84             };
85              
86             =end DISABLED_UNSUPPORTED_OR_NONCOMPLIANT
87              
88             =cut
89              
90             # [[[ SUBROUTINES & OO METHODS ]]]
91              
92             sub ast_to_rperl__generate {
93 19     19   36 { my string_hashref::method $RETURN_TYPE };
  19         32  
94 19         51 ( my object $self, my string_hashref $modes) = @ARG;
95 19         55 my string_hashref $rperl_source_group = { PMC => q{} };
96              
97             # RPerl::diag( 'in Operator::RegularExpression->ast_to_rperl__generate(), received $self = ' . "\n" . RPerl::Parser::rperl_ast__dump($self) . "\n" );
98              
99 19         45 my string $self_class = ref $self;
100 19 50       56 if ( $self_class eq 'Operator_104' ) { # Operator -> SubExpression OP06_REGEX_BIND OP06_REGEX_PATTERN
101             my string_hashref $rperl_source_subgroup
102 19         396 = $self->{children}->[0]->ast_to_rperl__generate($modes);
103 19         369 RPerl::Generator::source_group_append( $rperl_source_group,
104             $rperl_source_subgroup );
105 19         84 $rperl_source_group->{PMC} .= q{ } . $self->{children}->[1] . q{ } . $self->{children}->[2];
106             }
107             else {
108 0         0 die RPerl::Parser::rperl_rule__replace(
109             'ERROR ECOGEASRP00, CODE GENERATOR, ABSTRACT SYNTAX TO RPERL: Grammar rule '
110             . $self_class
111             . ' found where Operator_104 expected, dying' )
112             . "\n";
113             }
114 19         90 return $rperl_source_group;
115             }
116              
117             sub ast_to_cpp__generate__CPPOPS_PERLTYPES {
118 0     0   0 { my string_hashref::method $RETURN_TYPE };
  0         0  
119 0         0 ( my object $self, my string_hashref $modes) = @ARG;
120 0         0 my string_hashref $cpp_source_group
121             = { CPP =>
122             q{// <<< RP::O::E::O::RE __DUMMY_SOURCE_CODE CPPOPS_PERLTYPES >>>}
123             . "\n" };
124              
125             #...
126 0         0 return $cpp_source_group;
127             }
128              
129             sub ast_to_cpp__generate__CPPOPS_CPPTYPES {
130 5     5   12 { my string_hashref::method $RETURN_TYPE };
  5         13  
131 5         17 ( my object $self, my string_hashref $modes) = @ARG;
132 5         21 my string_hashref $cpp_source_group = { CPP => q{} };
133              
134             # RPerl::diag( 'in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), received $self = ' . "\n" . RPerl::Parser::rperl_ast__dump($self) . "\n" );
135              
136 5         18 my string $self_class = ref $self;
137 5 50       22 if ( $self_class eq 'Operator_104' ) { # Operator -> SubExpression OP06_REGEX_BIND OP06_REGEX_PATTERN
138             # generate subexpression, to left of regex bind operator
139 5         145 my string_hashref $cpp_source_subgroup = $self->{children}->[0]->ast_to_cpp__generate__CPPOPS_CPPTYPES($modes);
140 5         113 RPerl::Generator::source_group_append( $cpp_source_group, $cpp_source_subgroup );
141              
142             # get bind and pattern strings
143 5         16 my string $bind = $self->{children}->[1];
144 5         20 my string $pattern = $self->{children}->[2];
145 5         46 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $bind = '} . $bind . q{'} . "\n" );
146 5         32 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $pattern = '} . $pattern . q{'} . "\n" );
147              
148             # NEED FIX: DIE ON !~ BINDING OPERATOR, should actually be logic & code generation to implement !~ binding operator
149 5 50       21 if ($bind ne '=~') {
150 0         0 die q{ERROR ECOGEASCPxx: Regular expression binding operator '} . $bind . q{' not yet supported, dying};
151             }
152              
153             # separate pattern into match/substitute flag, bare pattern, and modifiers
154 5         23 my character $match_or_substitute = substr $pattern, 0, 1;
155 5         16 my string $modifiers = q{};
156 5         23 for (my integer $pattern_index = ((length $pattern) - 1); $pattern_index >= 0; $pattern_index--) {
157 23         39 my character $modifier = substr $pattern, $pattern_index, 1;
158 23 100       49 if ($modifier eq '/') { last; }
  5         19  
159 18         45 $modifiers = $modifier . $modifiers;
160             }
161 5         24 my string $pattern_bare = substr $pattern, 1, ((length $pattern) - ((length $modifiers) + 1));
162 5         28 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $match_or_substitute = '} . $match_or_substitute . q{'} . "\n" );
163 5         23 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $pattern_bare = '} . $pattern_bare . q{'} . "\n" );
164 5         42 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $modifiers = '} . $modifiers . q{'} . "\n" );
165              
166             # test for and remove book-end forward slash characters
167 5 50       20 if ((substr $pattern_bare, 0, 1) ne q{/}) {
168 0         0 die q{ERROR ECOGEASCP81: Regular expression pattern '} . $pattern_bare . q{' does not begin with forward slash '/' character, dying};
169             }
170 5 50       23 if ((substr $pattern_bare, -1, 1) ne q{/}) {
171 0         0 die q{ERROR ECOGEASCP82: Regular expression pattern '} . $pattern_bare . q{' does not end with forward slash '/' character, dying};
172             }
173 5         22 $pattern_bare = substr $pattern_bare, 1, ((length $pattern_bare) - 2);
174              
175             # must backslash-escape backslashes within bare pattern, character class \w must appear as \\w etc, convert all single backslashes into double backslashes
176 5         28 $pattern_bare =~ s/\\/\\\\/gxms;
177 5         28 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have forward-slash-cleaned and backslash-escaped $pattern_bare = '} . $pattern_bare . q{'} . "\n" );
178              
179 5         13 my string $modifiers_compile_enabled = q{};
180 5         16 my string $modifiers_match_enabled = q{};
181 5         14 my string $modifiers_substitute_enabled = q{};
182 5         14 my string $modifiers_compile_extra = q{};
183              
184             # include S compile modifier for optimization on long (presumably more complex) patterns
185 5 50       16 if ((length $pattern_bare) > 20) {
186 0         0 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), setting S modifier to attempt PCRE2 optimization of pattern over length of 20 characters} . "\n" );
187 0         0 $modifiers_compile_extra = 'S';
188             }
189              
190             # match
191 5 100       22 if ($match_or_substitute eq 'm') {
    50          
192             # EXAMPLE C++ CODE
193             # // check if string matches the pattern, return true or false
194             # regex("(\\d)|(\\w)").match("I am the subject")
195             # // match all and get the match count using the action modifier 'g', return count
196             # regex("(\\d)|(\\w)","m").match("I am the subject","g")
197            
198 3         8 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have all valid match modifiers = '} . (join ', ', (sort keys %{{%{$modifiers_compile}, %{$modifiers_match}}})) . q{'} . "\n" );
  3         7  
  3         18  
  3         63  
199              
200             # validate & sort modifiers
201 3         20 foreach my character $modifier (split //, $modifiers) {
202 12 100       31 if (exists $modifiers_compile->{$modifier}) {
    50          
203 9         35 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have valid compile modifier '} . $modifier . q{' with description as follows:} . "\n\t" . $modifiers_compile->{$modifier} . "\n" );
204 9         20 $modifiers_compile_enabled .= $modifier;
205             }
206             elsif (exists $modifiers_match->{$modifier}) {
207 3         21 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have valid match modifier '} . $modifier . q{' with description as follows:} . "\n\t" . $modifiers_match->{$modifier} . "\n" );
208 3         7 $modifiers_match_enabled .= $modifier;
209             }
210             else {
211 0         0 die q{ERROR ECOGEASCP83: Non-compliant, unsupported, or unrecognized regular expression modifier '} . $modifier . q{' found, must be one of (} . (join ', ', (sort keys %{{%{$modifiers_compile}, %{$modifiers_match}}})) . q{), dying};
  0         0  
  0         0  
  0         0  
212             }
213             }
214 3         17 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $modifiers_compile_enabled = '} . $modifiers_compile_enabled . "\n" );
215 3         16 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $modifiers_match_enabled = '} . $modifiers_match_enabled . "\n" );
216              
217             # compose final C++ code for modifiers
218 3         13 my $modifiers_compile_CPP = q{};
219 3 50       14 if (($modifiers_compile_enabled . $modifiers_compile_extra) ne q{}) {
220 3         12 $modifiers_compile_CPP = q{, "} . ($modifiers_compile_enabled . $modifiers_compile_extra) . q{"};
221             }
222 3         7 my $modifiers_match_CPP = q{};
223 3 50       14 if ($modifiers_match_enabled ne q{}) {
224 3         11 $modifiers_match_CPP = q{, "} . $modifiers_match_enabled . q{"};
225             }
226 3         12 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $modifiers_compile_CPP = '} . $modifiers_compile_CPP . "\n" );
227 3         11 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have $modifiers_match_CPP = '} . $modifiers_match_CPP . "\n" );
228              
229             # DEV NOTE: $cpp_source_group->{CPP} already contains the generated subexpression to be used as the subject of the regex
230             # DEV NOTE: Perl vs JPCRE2 inconsistency, must explicitly cast the returned count to a boolean true/false value
231 3         22 $cpp_source_group->{CPP} = '(boolean) regex("' . $pattern_bare . '"' . $modifiers_compile_CPP . ').match(' . $cpp_source_group->{CPP} . $modifiers_match_CPP . ')';
232             }
233             # substitute
234             elsif ($match_or_substitute eq 's') {
235             # EXAMPLE C++ CODE
236             # // replace first occurrence of a digit with @
237             # string foo = (const string) "I am the subject string 44";
238             # regex("\\d").preplace(&foo, "@")
239             # // replace all occurrences of a digit with @
240             # string foo = (const string) "I am the subject string 44";
241             # regex("\\d").preplace(&foo, "@", "g")
242             # // swap two parts of a string
243             # string foo = (const string) "I am the subject\tTo be swapped according to tab";
244             # regex("^([^\t]+)\t([^\t]+)$").preplace(&foo, "$2 $1")
245              
246 2         9 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have all valid substitute modifiers = '} . (join ', ', (sort keys %{{%{$modifiers_compile}, %{$modifiers_substitute}}})) . q{'} . "\n" );
  2         7  
  2         14  
  2         40  
247              
248             # validate & sort modifiers
249 2         15 foreach my character $modifier (split //, $modifiers) {
250 6 100       18 if (exists $modifiers_compile->{$modifier}) {
    50          
251 4         21 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have valid compile modifier '} . $modifier . q{' with description as follows:} . "\n\t" . $modifiers_compile->{$modifier} . "\n" );
252 4         8 $modifiers_compile_enabled .= $modifier;
253             }
254             elsif (exists $modifiers_substitute->{$modifier}) {
255 2         13 RPerl::diag( q{in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), have valid substitute modifier '} . $modifier . q{' with description as follows:} . "\n\t" . $modifiers_substitute->{$modifier} . "\n" );
256 2         6 $modifiers_substitute_enabled .= $modifier;
257             }
258             else {
259 0         0 die q{ERROR ECOGEASCP84: Non-compliant, unsupported, or unrecognized regular expression modifier '} . $modifier . q{' found, must be one of (} . (join ', ', (sort keys %{{%{$modifiers_compile}, %{$modifiers_substitute}}})) . q{), dying};
  0         0  
  0         0  
  0         0  
260             }
261             }
262              
263             # compose final C++ code for modifiers
264 2         9 my $modifiers_compile_CPP = q{};
265 2 50       12 if (($modifiers_compile_enabled . $modifiers_compile_extra) ne q{}) {
266 2         8 $modifiers_compile_CPP = q{, "} . ($modifiers_compile_enabled . $modifiers_compile_extra) . q{"};
267             }
268 2         7 my $modifiers_substitute_CPP = q{};
269 2 50       10 if ($modifiers_substitute_enabled ne q{}) {
270 2         9 $modifiers_substitute_CPP = q{, "} . $modifiers_substitute_enabled . q{"};
271             }
272              
273             # validate substitute pattern
274 2         12 my $pattern_forward_slash_count = ( $pattern_bare =~ m/\//gxms );
275 2 50       12 if ($pattern_forward_slash_count != 1) {
276 0         0 die q{ERROR ECOGEASCP85: Substitution regular expression pattern '} . $pattern_bare . q{' does not contain exactly one forward slash '/' character, dying};
277             }
278              
279             # split find/replace portions of substitute pattern
280 2         6 my string $pattern_find = q{};
281 2         7 my string $pattern_replace = q{};
282 2         6 my boolean $found_slash = 0;
283 2         11 foreach my character $pattern_character (split //, $pattern_bare) {
284 10 100       21 if ($pattern_character eq '/') {
    100          
285 2         7 $found_slash = 1;
286 2         5 next;
287             }
288             elsif (not $found_slash) {
289 4         10 $pattern_find .= $pattern_character;
290             }
291             else {
292 4         11 $pattern_replace .= $pattern_character;
293             }
294             }
295              
296             # START HERE
297             # START HERE
298             # START HERE
299             # NEED ADD ERROR CHECK OR GRAMMAR CHANGE: regex substitution's LHS subexpression can only be a variable, because we must assign the return value back to the variable to emulate PERLOPS_PERLTYPES behavior
300             # NEED ADD SUPPORT: non-destructive regex substitution using Perl's /r modifier, and NOT setting the original variable to the return value in C++
301             # NEED ADD LOGIC: support the negated bind operator !~ instead of only the bind operator =~, then disable the die on !~ above !!!
302              
303             # DEV NOTE: $cpp_source_group->{CPP} already contains the generated subexpression to be used as the subject of the regex
304              
305             # EXAMPLE C++ CODE: regex("FIND", "MODS_COMP").preplace(&foo, "REPLACE_WITH", "MODS_SUBST")
306 2         19 $cpp_source_group->{CPP} = 'regex("' . $pattern_find . '"' . $modifiers_compile_CPP . ').preplace(&' . $cpp_source_group->{CPP} . ', "' . $pattern_replace . '"' . $modifiers_substitute_CPP . ')';
307             }
308             else {
309 0         0 die q{ERROR ECOGEASCP80: Unrecognized regular expression type '} . $match_or_substitute . q{' found, must be 'm' for match or 's' for substitute, dying};
310             }
311             }
312             else {
313 0         0 die RPerl::Parser::rperl_rule__replace( 'ERROR ECOGEASCP00, CODE GENERATOR, ABSTRACT SYNTAX TO RPERL: Grammar rule ' . $self_class . ' found where Operator_104 expected, dying' ) . "\n";
314             }
315              
316 5         134 RPerl::diag( 'in Operator::RegularExpression->ast_to_cpp__generate__CPPOPS_CPPTYPES(), about to return $cpp_source_group = ' . "\n" . RPerl::Parser::rperl_ast__dump($cpp_source_group) . "\n" );
317 5         31 return $cpp_source_group;
318             }
319              
320             1; # end of class