File Coverage

blib/lib/Data/Validate/Perl.pm
Criterion Covered Total %
statement 12 136 8.8
branch 0 66 0.0
condition 0 12 0.0
subroutine 4 6 66.6
pod 1 1 100.0
total 17 221 7.6


line stmt bran cond sub pod time code
1             package Data::Validate::Perl;
2              
3 1     1   25676 use 5.006;
  1         4  
  1         58  
4 1     1   16 use strict;
  1         2  
  1         44  
5 1     1   6 use warnings FATAL => 'all';
  1         8  
  1         57  
6 1     1   6 use Carp;
  1         2  
  1         2212  
7              
8             our $VERSION = '0.02';
9              
10             our @ISA = qw/Exporter/;
11             our @EXPORT = qw/gen_yp_rules/;
12             our @EXPORT_OK = qw/gen_yp_rules/;
13              
14             =head1 NAME
15              
16             Data::Validate::Perl - validates in-memory perl data using a specification
17              
18             =head1 SYNOPSIS
19              
20             Continue reading only when you want to generate L grammar
21             from the specification file and patch it, or understand how it works
22             internally, else please look into C command-line
23             utility documentation instead.
24              
25             use Data::Validate::Perl qw/gen_yp_rules/;
26              
27             my $yapp_grammar = gen_yp_rules($spec_file);
28              
29             =head1 EXPORTS
30              
31             =over
32              
33             =item gen_yp_rules
34              
35             =back
36              
37             =head1 SUBROUTINES
38              
39             =over
40              
41             =item gen_yp_rules
42              
43             This function contains the main logic to parse the data specification
44             and translate it to L grammar. Returns the grammar string
45             on success.
46              
47             =back
48              
49             =cut
50              
51              
52             sub gen_yp_rules {
53 0     0 1   my ( $spec_file, ) = @_;
54              
55             # contains all the rules defined
56 0           my %rule = ();
57             # contains all the rules being references in rule body
58 0           my %rule_required = ();
59 0           my $start;
60             {
61 0           my %lhs_type = ('%' => 'HASH', '@' => 'ARRAY', '$' => 'SCALAR');
  0            
62 0           my %rhs_type = (%lhs_type, '\'' => 'SYMBOL',);
63 0           my $lhs_type_regex = join('|', map { '\\'. $_ } sort keys %lhs_type);
  0            
64 0           my $rhs_type_regex = join('|', map { '\\'. $_ } sort keys %rhs_type);
  0            
65 0 0         open my $F, '<', $spec_file or croak "cannot open file to read: $!";
66 0           my %rule_map = ();
67 0           while (<$F>) {
68 0           chomp;
69 0 0         next if /^\s*#/io;
70 0           my @l = split /\s*\:\s*/io, $_;
71 0 0         croak "invalid rule line: $_" if @l != 2;
72 0           my ( $k, $v ) = @l;
73 0 0         croak "invalid rule name: $k" if $k !~ /^($lhs_type_regex)(\w+)$/io;
74             # key = name:type
75 0           my $type = $lhs_type{$1};
76 0           my $name = $2;
77 0           my $key = join(':', $name, $type);
78 0           my @v = ();
79 0           foreach my $i (split /\s+/io, $v) {
80 0 0         if ($i =~ /^($rhs_type_regex)(?:\(((?:\w+|\*))\))?(\w+)$/io) {
81 0           my $t = $rhs_type{$1};
82 0           my $k = $2;
83 0           my $n = $3;
84 0 0 0       croak "left-hand side must be a hash: $name" if $k and $type ne 'HASH';
85 0 0 0       $k = $n if $type eq 'HASH' and !$k;
86             # [ name, type ]
87 0 0         push @v, $type eq 'HASH' ? [ $n, $t, $k ] : [ $n, $t ];
88 0           $rule_required{join(':', $n, $t)} = $t;
89             }
90             else {
91 0           croak "invalid rule item: $i";
92             }
93             }
94 0 0         croak "duplicate rule declaration: $k" if exists $rule_map{$key};
95 0           $rule{$key} = [ @v ];
96             # first declared rule is start
97 0 0         $start = $key if !defined $start;
98 0           $rule_map{$key}++;
99             }
100 0           close $F;
101             }
102             # create the rules which have been referenced but not declared
103             # they are simple arrays or hashes which contains text key/value
104 0           foreach my $k (keys %rule_required) {
105 0 0         if (!exists $rule{$k}) {
106 0 0 0       if ($rule_required{$k} eq 'ARRAY' or $rule_required{$k} eq 'HASH') {
    0          
107             # simple array or hash
108 0           $rule{$k} = [];
109             }
110             elsif ($rule_required{$k} eq 'SCALAR') {
111 0           $rule{$k} = [];
112             }
113             }
114             }
115 0 0         croak "$start rule declaration not found" if !exists $rule{$start};
116             # if ($::opt{d}) {
117             # require Data::Dumper;
118             # no warnings 'once';
119             # local $Data::Dumper::Indent = 1;
120             # print STDERR Data::Dumper::Dumper(\%rule), "\n";
121             # }
122              
123 0           my $yapp = "%%\n";
124 0           my $count = 0;
125 0           my @stack = ( [ $start, $count++ ], );
126             my $cb_process_children = sub {
127 0     0     my ( $children, ) = @_;
128              
129 0           for (my $i = 0; $i < @$children; $i++) {
130 0           my $child= $children->[$i];
131 0           my $key = join(':', $child->[0], $child->[1]);
132 0           my $name = $child->[0];
133 0           my $type = $child->[1];
134 0           my $cnt = $count++;
135 0 0         if (exists $rule{$key}) {
136 0 0 0       if ($type eq 'ARRAY' or $type eq 'HASH') {
    0          
    0          
137 0           push @stack, [ $key, $cnt ];
138             }
139             elsif ($type eq 'SCALAR') {
140 0           push @stack, [ $key, $cnt ];
141             }
142             elsif ($type eq 'SYMBOL') {
143             # NOOP: skip
144             }
145             else {
146 0           croak "unknown rule type of $name: $type";
147             }
148             }
149             else {
150 0           croak "internal state error, no such rule: $key";
151             }
152             }
153 0           };
154 0           my $has_list_enum = 0;
155 0           my $has_scalar_enum = 0;
156 0           my $has_simple_hash = 0;
157 0           my $rule_format = 'rule%04d';
158 0           while (@stack) {
159 0           my $item = shift @stack;
160 0           my $k = $item->[0];
161 0           my $c = $item->[1];
162              
163 0           my ( $name, $type, ) = split /:/io, $k, 2;
164 0           my $children = $rule{$k};
165 0           my $rule = sprintf($rule_format, $c);
166 0 0         if ($type eq 'HASH') {
    0          
    0          
    0          
167             # there shouldn't be any enum (such as 'value) in hash declaration
168 0 0         croak "invalid hash declaration for $name: scalar item found" if grep { $_->[1] eq 'SYMBOL' } @$children;
  0            
169 0 0         if (@$children == 0) {
170             # simple hash
171 0           $has_simple_hash++;
172 0           $yapp .= "$rule: '{' my_begin_simple_hash ${rule}_elements my_end_simple_hash '}';\n";
173 0           $yapp .= "${rule}_elements: TEXT ${rule}_elements | TEXT;\n";
174             }
175             else {
176 0           $yapp .= "$rule: '{' ${rule}_elements '}';\n";
177 0           $yapp .= "${rule}_elements: ${rule}_element ${rule}_elements | ${rule}_element;\n";
178 0           $yapp .= "${rule}_element: ". join(
179 0           ' | ', map { "'". $children->[$_][2]. "' ". sprintf($rule_format, $count+$_) } 0 .. $#{$children}). ";\n";
  0            
180 0           $cb_process_children->($children);
181             }
182             # NOREACH
183             }
184             elsif ($type eq 'ARRAY') {
185 0 0         if (grep { $_->[1] eq 'SYMBOL' } @$children) {
  0            
186             # enum array, all the children must be enum in this case
187 0           croak "invalid array declaration for $name: non scalar item found" if
188 0 0         grep { $_->[1] ne 'SYMBOL' } @$children;
189 0           $has_list_enum++;
190 0           $yapp .= "$rule: '[' my_begin_list_enum ${rule}_items my_end_list_enum ']';\n";
191 0           $yapp .= "${rule}_items: ${rule}_item ${rule}_items | ${rule}_item;\n";
192 0           $yapp .= "${rule}_item: ". join(' | ', map { "'$_'" } map { $_->[0] } @$children). ";\n";
  0            
  0            
193             }
194             else {
195 0           $yapp .= "$rule: '[' ${rule}_items ']';\n";
196 0 0         if (@$children == 0) {
197             # simple array
198 0           $yapp .= "${rule}: '[' ${rule}_items ']';\n";
199 0           $yapp .= "${rule}_items: TEXT ${rule}_items | TEXT;\n";
200             }
201             else {
202 0           $yapp .= "${rule}_items: ${rule}_item ${rule}_items | ${rule}_item;\n";
203 0           $yapp .= "${rule}_item: ". join(
204 0           ' | ', map { sprintf($rule_format, $count+$_) } 0 .. $#{$children}). ";\n";
  0            
205 0           $cb_process_children->($children);
206             }
207             }
208             }
209             elsif ($type eq 'SCALAR') {
210 0 0         if (@$children == 0) {
211 0           $yapp .= "${rule}: ;\n";
212             }
213             else {
214 0 0         croak "only constant values permitted for scalar rule" if grep { $_->[1] ne 'SYMBOL' } @$children;
  0            
215 0           $has_scalar_enum++;
216 0           $yapp .= "${rule}: my_begin_scalar_enum ${rule}_value my_end_scalar_enum;\n";
217 0           $yapp .= "${rule}_value: ". join(' | ', map { "'". $_->[0]. "'" } @$children). ";\n";
  0            
218             }
219             }
220             elsif ($type eq 'SYMBOL') {
221             # there shouldn't be any symbol item being pushed onto stack
222 0           croak "internal state error: $type item on stack";
223             }
224             else {
225 0           croak "unknown type of rule $name: $type";
226             }
227             }
228 0 0         $yapp .= <<'EOL' if $has_list_enum;
229             my_begin_list_enum: { $_[0]->YYData->{_flag}->{list_enum} = 1 };
230             my_end_list_enum: { $_[0]->YYData->{_flag}->{list_enum} = 0 };
231             EOL
232 0 0         $yapp .= <<'EOL' if $has_simple_hash;
233             my_begin_simple_hash: { $_[0]->YYData->{_flag}->{simple_hash} = 1 };
234             my_end_simple_hash : { $_[0]->YYData->{_flag}->{simple_hash} = 0 };
235             EOL
236 0 0         $yapp .= <<'EOL' if $has_scalar_enum;
237             my_begin_scalar_enum: { $_[0]->YYData->{_flag}->{scalar_enum} = 1 };
238             my_end_scalar_enum : { $_[0]->YYData->{_flag}->{scalar_enum} = 0 };
239             EOL
240 0           $yapp .= "%%\n";
241 0 0         print STDERR $yapp if $::opt{v};
242 0           $yapp .= do { local $/; };
  0            
  0            
243 0           return $yapp;
244             }
245              
246             =head1 DESCRIPTION
247              
248             In order to understand internal of this module, working knowledge of
249             parsing, especially Yacc is required. Stop and grab a book on topic if
250             you are unsure what this is.
251              
252             A common parsing mechanism applies state machine onto a string, such
253             as regular expression. This part is easy to follow. In this module a
254             Yacc state machine is used, the target is not plain text but a
255             in-memory data structure - a tree made up by several perl
256             scalar/array/hash items.
257              
258             The process to validate a data structure like that is a tree
259             traversal. The biggest challenge is how to put these 2 things
260             together.
261              
262             The best way to figure a solution is, imagine each step to perform a
263             depth-first iteration on a tree. Each move can be abstracted as a
264             'token'. This is the key idea behind.
265              
266             To elaborate, think how to validate a simple perl hash like below:
267              
268             my %hash = (key1 => value1, key2 => value2, key3 => value3);
269              
270             To iterate the hash key/value pairs, use a cursor to describe the
271             following states:
272              
273             1. initial state: place the cursor onto hash itself;
274             2. 1st state: move cursor to key1;
275             3. 2nd state: move cursor to value1;
276             4. 3rd state: move cursor to key2;
277             5. 4th state: move cursor to value2;
278             6. 5th state: move cursor to key3;
279             7. 6th state: move cursor to value3;
280              
281             A draft Yacc grammar written as:
282              
283             root_of_hash: key1 value1 | key2 value2 | key3 value3
284              
285             The state machine needs token to decide which sub-rule to walk
286             into. Looking onto the key1/2/3, the corresponding token can
287             simply be the value of themselves. That is:
288              
289             root_of_hash: 'key1' value1 | 'key2' value2 | 'key3' value3
290              
291             Note the quotes, they mark key1/2/3 as tokens. Next move to the hash
292             value. When the cursor points to a value, I do not care about the
293             actual value, instead I just want to hint the state machine that it is
294             a value. It requires another token to accept the state. How about a
295             plain text token - 'TEXT'. Finally the grammar to be:
296              
297             root_of_hash: 'key1' 'TEXT' | 'key2' 'TEXT' | 'key3' 'TEXT'
298              
299             How to apply the generated state machine to the hash validation then?
300             Each time the parser cannot determine which is next state, it asks the
301             lexer for a token. The simplest form of a lexer is just a function to
302             return the corresponding tokens for each state. At this point, you
303             might be able to guess how it works:
304              
305             1. state machine initialized, it wants to move to next state, so it asks lexer;
306             2. the lexer holds the hash itself, it calls keys function, returns the first key as token, put the key returned into its memory;
307             3. by the time state machine got key1, it moves the cursor onto 'key1', then asks lexer again;
308             4. the lexer checks its memory and figures it returned 'key1' just now, time to return its vlaue, as the state machine has no interest on the actual value, it returns 'TEXT';
309             5. state machine finished the iteration of key1/value1 pair, asks for another token;
310             6. lexer returns 'key2' and keeps it in its own memory;
311             7. state machine steps into the sub-rule 'key2' 'TEXT';
312             ...
313              
314             The state loop is fairly straightforward. Parsing isn't that
315             difficult, huh :-)
316              
317             To iterate a nested tree full of scalar/array/hash, other tokens are
318             introduced:
319              
320             1. '[' ']' indicates start/end state of array traversal;
321             2. '{' '}' indicates start/end state of hash traversal;
322             3. to meet special need, certain rule actions are defined to set some state flags, which influence the decision that the lexer returns the value as 'TEXT', or the actual value string itself;
323              
324             The state maintenance in lexer is made up by a stack, the stack
325             simulates a depth-first traversal:
326              
327             1. when meets array, iterates array items one by one, if any item is another array or hash, push current array onto the stack together with an index marking where we are in this array. Iterates that item recursively;
328             2. similar strategy is applied to hash;
329              
330             The left piece is a DSL to describe the tree structure. By the time
331             you read here, I am fairly confident you are able to figure it out
332             yourself by exercising various pieces of this module, below is a small
333             leaf-note:
334              
335             1. gen_yp_rules function handles translation from data structure spec to corresponding Yacc grammar;
336             2. bottom section of this module contains the Lexer function and other routines L requires to work (browse the module source to read);
337             3. the command-line utility C reads the spec file, calls gen_yp_rules to generate grammar, fits it into a file and calls C to create the parser module;
338              
339             Wish you like this little article and enjoy playing with this module.
340              
341             =head1 SEE ALSO
342              
343             * L
344              
345             =head1 AUTHOR
346              
347             Dongxu Ma, C<< >>
348              
349             =head1 BUGS
350              
351             Please report any bugs or feature requests to C
352             at rt.cpan.org>, or through the web interface at
353             L.
354             I will be notified, and then you'll automatically be notified of
355             progress on your bug as I make changes.
356              
357             =head1 SUPPORT
358              
359             You can find documentation for this module with the perldoc command.
360              
361             perldoc Data::Validate::Perl
362              
363             You can also look for information at:
364              
365             =over 4
366              
367             =item * RT: CPAN's request tracker (report bugs here)
368              
369             L
370              
371             =item * AnnoCPAN: Annotated CPAN documentation
372              
373             L
374              
375             =item * CPAN Ratings
376              
377             L
378              
379             =item * Search CPAN
380              
381             L
382              
383             =back
384              
385             =head1 LICENSE AND COPYRIGHT
386              
387             Copyright 2014 Dongxu Ma.
388              
389             This program is free software; you can redistribute it and/or modify it
390             under the terms of the the Artistic License (2.0). You may obtain a
391             copy of the full license at:
392              
393             L
394              
395             Any use, modification, and distribution of the Standard or Modified
396             Versions is governed by this Artistic License. By using, modifying or
397             distributing the Package, you accept this license. Do not use, modify,
398             or distribute the Package, if you do not accept this license.
399              
400             If your Modified Version has been derived from a Modified Version made
401             by someone other than you, you are nevertheless required to ensure that
402             your Modified Version complies with the requirements of this license.
403              
404             This license does not grant you the right to use any trademark, service
405             mark, tradename, or logo of the Copyright Holder.
406              
407             This license includes the non-exclusive, worldwide, free-of-charge
408             patent license to make, have made, use, offer to sell, sell, import and
409             otherwise transfer the Package with respect to any patent claims
410             licensable by the Copyright Holder that are necessarily infringed by the
411             Package. If you institute patent litigation (including a cross-claim or
412             counterclaim) against any party alleging that the Package constitutes
413             direct or contributory patent infringement, then this Artistic License
414             to you shall terminate on the date that such litigation is filed.
415              
416             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
417             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
418             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
419             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
420             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
421             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
422             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
423             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
424              
425             =cut
426              
427             1; # End of Data::Validate::Perl
428              
429             __DATA__