File Coverage

blib/lib/Data/Validate/Perl.pm
Criterion Covered Total %
statement 11 134 8.2
branch 0 66 0.0
condition 0 12 0.0
subroutine 4 6 66.6
pod 1 1 100.0
total 16 219 7.3


line stmt bran cond sub pod time code
1             package Data::Validate::Perl;
2              
3 1     1   57665 use 5.006;
  1         4  
4 1     1   5 use strict;
  1         2  
  1         20  
5 1     1   4 use warnings FATAL => 'all';
  1         1  
  1         36  
6 1     1   5 use Carp;
  1         1  
  1         1387  
7              
8             our $VERSION = '0.03';
9              
10             our @ISA = qw/Exporter/;
11             our @EXPORT = qw/gen_yp_rules/;
12             our @EXPORT_OK = qw/gen_yp_rules/;
13              
14             =head1 NAME
15              
16             Data::Validate::Perl - validates in-memory perl data using a specification
17              
18             =head1 SYNOPSIS
19              
20             Continue reading only when you want to generate L grammar
21             from the specification file and patch it, or understand how it works
22             internally, else please look into C command-line
23             utility documentation instead.
24              
25             use Data::Validate::Perl qw/gen_yp_rules/;
26              
27             my $yapp_grammar = gen_yp_rules($spec_file);
28              
29             =head1 EXPORTS
30              
31             =over
32              
33             =item gen_yp_rules
34              
35             =back
36              
37             =head1 SUBROUTINES
38              
39             =over
40              
41             =item gen_yp_rules
42              
43             This function contains the main logic to parse the data specification
44             and translate it to L grammar. Returns the grammar string
45             on success.
46              
47             =back
48              
49             =cut
50              
51              
52             sub gen_yp_rules {
53 0     0 1   my ( $spec_file, ) = @_;
54              
55             # contains all the rules defined
56 0           my %rule = ();
57             # contains all the rules being references in rule body
58 0           my %rule_required = ();
59 0           my $start;
60             {
61 0           my %lhs_type = ('%' => 'HASH', '@' => 'ARRAY', '$' => 'SCALAR');
  0            
62 0           my %rhs_type = (%lhs_type, '\'' => 'SYMBOL',);
63 0           my $lhs_type_regex = join('|', map { '\\'. $_ } sort keys %lhs_type);
  0            
64 0           my $rhs_type_regex = join('|', map { '\\'. $_ } sort keys %rhs_type);
  0            
65 0 0         open my $F, '<', $spec_file or croak "cannot open file to read: $!";
66 0           my %rule_map = ();
67 0           while (<$F>) {
68 0           chomp;
69 0 0         next if /^\s*#/io;
70 0           my @l = split /\s*\:\s*/io, $_;
71 0 0         croak "invalid rule line: $_" if @l != 2;
72 0           my ( $k, $v ) = @l;
73 0 0         croak "invalid rule name: $k" if $k !~ /^($lhs_type_regex)(\w+)$/io;
74             # key = name:type
75 0           my $type = $lhs_type{$1};
76 0           my $name = $2;
77 0           my $key = join(':', $name, $type);
78 0           my @v = ();
79 0           foreach my $i (split /\s+/io, $v) {
80 0 0         if ($i =~ /^($rhs_type_regex)(?:\(((?:\w+|\*))\))?(\w+)$/io) {
81 0           my $t = $rhs_type{$1};
82 0           my $k = $2;
83 0           my $n = $3;
84 0 0 0       croak "left-hand side must be a hash: $name" if $k and $type ne 'HASH';
85 0 0 0       $k = $n if $type eq 'HASH' and !$k;
86             # [ name, type ]
87 0 0         push @v, $type eq 'HASH' ? [ $n, $t, $k ] : [ $n, $t ];
88 0           $rule_required{join(':', $n, $t)} = $t;
89             }
90             else {
91 0           croak "invalid rule item: $i";
92             }
93             }
94 0 0         croak "duplicate rule declaration: $k" if exists $rule_map{$key};
95 0           $rule{$key} = [ @v ];
96             # first declared rule is start
97 0 0         $start = $key if !defined $start;
98 0           $rule_map{$key}++;
99             }
100 0           close $F;
101             }
102             # create the rules which have been referenced but not declared
103             # they are simple arrays or hashes which contains text key/value
104 0           foreach my $k (keys %rule_required) {
105 0 0         if (!exists $rule{$k}) {
106 0 0 0       if ($rule_required{$k} eq 'ARRAY' or $rule_required{$k} eq 'HASH') {
    0          
107             # simple array or hash
108 0           $rule{$k} = [];
109             }
110             elsif ($rule_required{$k} eq 'SCALAR') {
111 0           $rule{$k} = [];
112             }
113             }
114             }
115 0 0         croak "$start rule declaration not found" if !exists $rule{$start};
116             # if ($::opt{d}) {
117             # require Data::Dumper;
118             # no warnings 'once';
119             # local $Data::Dumper::Indent = 1;
120             # print STDERR Data::Dumper::Dumper(\%rule), "\n";
121             # }
122              
123 0           my $yapp = "%%\n";
124 0           my $count = 0;
125 0           my @stack = ( [ $start, $count++ ], );
126             my $cb_process_children = sub {
127 0     0     my ( $children, ) = @_;
128              
129 0           for (my $i = 0; $i < @$children; $i++) {
130 0           my $child= $children->[$i];
131 0           my $key = join(':', $child->[0], $child->[1]);
132 0           my $name = $child->[0];
133 0           my $type = $child->[1];
134 0           my $cnt = $count++;
135 0 0         if (exists $rule{$key}) {
136 0 0 0       if ($type eq 'ARRAY' or $type eq 'HASH') {
    0          
    0          
137 0           push @stack, [ $key, $cnt ];
138             }
139             elsif ($type eq 'SCALAR') {
140 0           push @stack, [ $key, $cnt ];
141             }
142             elsif ($type eq 'SYMBOL') {
143             # NOOP: skip
144             }
145             else {
146 0           croak "unknown rule type of $name: $type";
147             }
148             }
149             else {
150 0           croak "internal state error, no such rule: $key";
151             }
152             }
153 0           };
154 0           my $has_list_enum = 0;
155 0           my $has_scalar_enum = 0;
156 0           my $has_simple_hash = 0;
157 0           my $rule_format = 'rule%04d';
158 0           while (@stack) {
159 0           my $item = shift @stack;
160 0           my $k = $item->[0];
161 0           my $c = $item->[1];
162              
163 0           my ( $name, $type, ) = split /:/io, $k, 2;
164 0           my $children = $rule{$k};
165 0           my $rule = sprintf($rule_format, $c);
166 0 0         if ($type eq 'HASH') {
    0          
    0          
    0          
167             # there shouldn't be any enum (such as 'value) in hash declaration
168 0 0         croak "invalid hash declaration for $name: scalar item found" if grep { $_->[1] eq 'SYMBOL' } @$children;
  0            
169 0 0         if (@$children == 0) {
170             # simple hash
171 0           $has_simple_hash++;
172 0           $yapp .= "$rule: '{' my_begin_simple_hash ${rule}_elements my_end_simple_hash '}';\n";
173 0           $yapp .= "${rule}_elements: TEXT ${rule}_elements | TEXT;\n";
174             }
175             else {
176 0           $yapp .= "$rule: '{' ${rule}_elements '}';\n";
177 0           $yapp .= "${rule}_elements: ${rule}_element ${rule}_elements | ${rule}_element;\n";
178             $yapp .= "${rule}_element: ". join(
179 0           ' | ', map { "'". $children->[$_][2]. "' ". sprintf($rule_format, $count+$_) } 0 .. $#{$children}). ";\n";
  0            
  0            
180 0           $cb_process_children->($children);
181             }
182             # NOREACH
183             }
184             elsif ($type eq 'ARRAY') {
185 0 0         if (grep { $_->[1] eq 'SYMBOL' } @$children) {
  0            
186             # enum array, all the children must be enum in this case
187             croak "invalid array declaration for $name: non scalar item found" if
188 0 0         grep { $_->[1] ne 'SYMBOL' } @$children;
  0            
189 0           $has_list_enum++;
190 0           $yapp .= "$rule: '[' my_begin_list_enum ${rule}_items my_end_list_enum ']';\n";
191 0           $yapp .= "${rule}_items: ${rule}_item ${rule}_items | ${rule}_item;\n";
192 0           $yapp .= "${rule}_item: ". join(' | ', map { "'$_'" } map { $_->[0] } @$children). ";\n";
  0            
  0            
193             }
194             else {
195 0           $yapp .= "$rule: '[' ${rule}_items ']';\n";
196 0 0         if (@$children == 0) {
197             # simple array
198 0           $yapp .= "${rule}_items: TEXT ${rule}_items | TEXT;\n";
199             }
200             else {
201 0           $yapp .= "${rule}_items: ${rule}_item ${rule}_items | ${rule}_item;\n";
202             $yapp .= "${rule}_item: ". join(
203 0           ' | ', map { sprintf($rule_format, $count+$_) } 0 .. $#{$children}). ";\n";
  0            
  0            
204 0           $cb_process_children->($children);
205             }
206             }
207             }
208             elsif ($type eq 'SCALAR') {
209 0 0         if (@$children == 0) {
210 0           $yapp .= "${rule}: ;\n";
211             }
212             else {
213 0 0         croak "only constant values permitted for scalar rule" if grep { $_->[1] ne 'SYMBOL' } @$children;
  0            
214 0           $has_scalar_enum++;
215 0           $yapp .= "${rule}: my_begin_scalar_enum ${rule}_value my_end_scalar_enum;\n";
216 0           $yapp .= "${rule}_value: ". join(' | ', map { "'". $_->[0]. "'" } @$children). ";\n";
  0            
217             }
218             }
219             elsif ($type eq 'SYMBOL') {
220             # there shouldn't be any symbol item being pushed onto stack
221 0           croak "internal state error: $type item on stack";
222             }
223             else {
224 0           croak "unknown type of rule $name: $type";
225             }
226             }
227 0 0         $yapp .= <<'EOL' if $has_list_enum;
228             my_begin_list_enum: { $_[0]->YYData->{_flag}->{list_enum} = 1 };
229             my_end_list_enum: { $_[0]->YYData->{_flag}->{list_enum} = 0 };
230             EOL
231 0 0         $yapp .= <<'EOL' if $has_simple_hash;
232             my_begin_simple_hash: { $_[0]->YYData->{_flag}->{simple_hash} = 1 };
233             my_end_simple_hash : { $_[0]->YYData->{_flag}->{simple_hash} = 0 };
234             EOL
235 0 0         $yapp .= <<'EOL' if $has_scalar_enum;
236             my_begin_scalar_enum: { $_[0]->YYData->{_flag}->{scalar_enum} = 1 };
237             my_end_scalar_enum : { $_[0]->YYData->{_flag}->{scalar_enum} = 0 };
238             EOL
239 0           $yapp .= "%%\n";
240 0 0         print STDERR $yapp if $::opt{v};
241 0           $yapp .= do { local $/; };
  0            
  0            
242 0           return $yapp;
243             }
244              
245             =head1 DESCRIPTION
246              
247             In order to understand internal of this module, working knowledge of
248             parsing, especially Yacc is required. Stop and grab a book on topic if
249             you are unsure what this is.
250              
251             A common parsing mechanism applies state machine onto a string, such
252             as regular expression. This part is easy to follow. In this module a
253             Yacc state machine is used, the target is not plain text but a
254             in-memory data structure - a tree made up by several perl
255             scalar/array/hash items.
256              
257             The process to validate a data structure like that is a tree
258             traversal. The biggest challenge is how to put these 2 things
259             together.
260              
261             The best way to figure a solution is, imagine each step to perform a
262             depth-first iteration on a tree. Each move can be abstracted as a
263             'token'. This is the key idea behind.
264              
265             To elaborate, think how to validate a simple perl hash like below:
266              
267             my %hash = (key1 => value1, key2 => value2, key3 => value3);
268              
269             To iterate the hash key/value pairs, use a cursor to describe the
270             following states:
271              
272             1. initial state: place the cursor onto hash itself;
273             2. 1st state: move cursor to key1;
274             3. 2nd state: move cursor to value1;
275             4. 3rd state: move cursor to key2;
276             5. 4th state: move cursor to value2;
277             6. 5th state: move cursor to key3;
278             7. 6th state: move cursor to value3;
279              
280             A draft Yacc grammar written as:
281              
282             root_of_hash: key1 value1 | key2 value2 | key3 value3
283              
284             The state machine needs token to decide which sub-rule to walk
285             into. Looking onto the key1/2/3, the corresponding token can
286             simply be the value of themselves. That is:
287              
288             root_of_hash: 'key1' value1 | 'key2' value2 | 'key3' value3
289              
290             Note the quotes, they mark key1/2/3 as tokens. Next move to the hash
291             value. When the cursor points to a value, I do not care about the
292             actual value, instead I just want to hint the state machine that it is
293             a value. It requires another token to accept the state. How about a
294             plain text token - 'TEXT'. Finally the grammar to be:
295              
296             root_of_hash: 'key1' 'TEXT' | 'key2' 'TEXT' | 'key3' 'TEXT'
297              
298             How to apply the generated state machine to the hash validation then?
299             Each time the parser cannot determine which is next state, it asks the
300             lexer for a token. The simplest form of a lexer is just a function to
301             return the corresponding tokens for each state. At this point, you
302             might be able to guess how it works:
303              
304             1. state machine initialized, it wants to move to next state, so it asks lexer;
305             2. the lexer holds the hash itself, it calls keys function, returns the first key as token, put the key returned into its memory;
306             3. by the time state machine got key1, it moves the cursor onto 'key1', then asks lexer again;
307             4. the lexer checks its memory and figures it returned 'key1' just now, time to return its vlaue, as the state machine has no interest on the actual value, it returns 'TEXT';
308             5. state machine finished the iteration of key1/value1 pair, asks for another token;
309             6. lexer returns 'key2' and keeps it in its own memory;
310             7. state machine steps into the sub-rule 'key2' 'TEXT';
311             ...
312              
313             The state loop is fairly straightforward. Parsing isn't that
314             difficult, huh :-)
315              
316             To iterate a nested tree full of scalar/array/hash, other tokens are
317             introduced:
318              
319             1. '[' ']' indicates start/end state of array traversal;
320             2. '{' '}' indicates start/end state of hash traversal;
321             3. to meet special need, certain rule actions are defined to set some state flags, which influence the decision that the lexer returns the value as 'TEXT', or the actual value string itself;
322              
323             The state maintenance in lexer is made up by a stack, the stack
324             simulates a depth-first traversal:
325              
326             1. when meets array, iterates array items one by one, if any item is another array or hash, push current array onto the stack together with an index marking where we are in this array. Iterates that item recursively;
327             2. similar strategy is applied to hash;
328              
329             The left piece is a DSL to describe the tree structure. By the time
330             you read here, I am fairly confident you are able to figure it out
331             yourself by exercising various pieces of this module, below is a small
332             leaf-note:
333              
334             1. gen_yp_rules function handles translation from data structure spec to corresponding Yacc grammar;
335             2. bottom section of this module contains the Lexer function and other routines L requires to work (browse the module source to read);
336             3. the command-line utility C reads the spec file, calls gen_yp_rules to generate grammar, fits it into a file and calls C to create the parser module;
337              
338             Wish you like this little article and enjoy playing with this module.
339              
340             =head1 SEE ALSO
341              
342             * L
343              
344             =head1 AUTHOR
345              
346             Dongxu Ma, C<< >>
347              
348             =head1 BUGS
349              
350             Please report any bugs or feature requests to C
351             at rt.cpan.org>, or through the web interface at
352             L.
353             I will be notified, and then you'll automatically be notified of
354             progress on your bug as I make changes.
355              
356             =head1 SUPPORT
357              
358             You can find documentation for this module with the perldoc command.
359              
360             perldoc Data::Validate::Perl
361              
362             You can also look for information at:
363              
364             =over 4
365              
366             =item * RT: CPAN's request tracker (report bugs here)
367              
368             L
369              
370             =item * AnnoCPAN: Annotated CPAN documentation
371              
372             L
373              
374             =item * CPAN Ratings
375              
376             L
377              
378             =item * Search CPAN
379              
380             L
381              
382             =back
383              
384             =head1 LICENSE AND COPYRIGHT
385              
386             Copyright 2014 Dongxu Ma.
387              
388             This program is free software; you can redistribute it and/or modify it
389             under the terms of the the Artistic License (2.0). You may obtain a
390             copy of the full license at:
391              
392             L
393              
394             Any use, modification, and distribution of the Standard or Modified
395             Versions is governed by this Artistic License. By using, modifying or
396             distributing the Package, you accept this license. Do not use, modify,
397             or distribute the Package, if you do not accept this license.
398              
399             If your Modified Version has been derived from a Modified Version made
400             by someone other than you, you are nevertheless required to ensure that
401             your Modified Version complies with the requirements of this license.
402              
403             This license does not grant you the right to use any trademark, service
404             mark, tradename, or logo of the Copyright Holder.
405              
406             This license includes the non-exclusive, worldwide, free-of-charge
407             patent license to make, have made, use, offer to sell, sell, import and
408             otherwise transfer the Package with respect to any patent claims
409             licensable by the Copyright Holder that are necessarily infringed by the
410             Package. If you institute patent litigation (including a cross-claim or
411             counterclaim) against any party alleging that the Package constitutes
412             direct or contributory patent infringement, then this Artistic License
413             to you shall terminate on the date that such litigation is filed.
414              
415             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
416             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
417             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
418             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
419             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
420             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
421             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
422             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
423              
424             =cut
425              
426             1; # End of Data::Validate::Perl
427              
428             __DATA__