File Coverage

blib/lib/XML/Easy/SimpleSchemaUtil.pm
Criterion Covered Total %
statement 72 77 93.5
branch 35 38 92.1
condition n/a
subroutine 15 16 93.7
pod 4 4 100.0
total 126 135 93.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema
4              
5             =head1 SYNOPSIS
6              
7             use XML::Easy::SimpleSchemaUtil qw(
8             xml_s_canonise_chars xml_c_canonise_chars
9             xml_c_subelements xml_c_chardata);
10              
11             $chardata = xml_s_canonise_chars($chardata);
12             $content = xml_c_canonise_chars($content);
13             $subelements = xml_c_subelements($content);
14             $chars = xml_c_chardata($content);
15              
16             =head1 DESCRIPTION
17              
18             The rules by which some class of thing is encoded in XML constitute a
19             schema. (A schema does not need to be codified in a formal language such
20             as Schematron: a natural-language specification can also be a schema.
21             Even if there is no explicit specification at all, the behaviour of
22             the interoperating processors of related XML documents constitutes a de
23             facto schema.) Certain kinds of rule are commonly used in all manner
24             of schemata. This module supplies functions that help to implement such
25             common kinds of rule, regardless of how a schema is specified.
26              
27             This module processes XML data in the form used by L,
28             consisting of C and C objects
29             and twine arrays. In this form, character data are stored fully decoded,
30             so they can be manipulated with no knowledge of XML syntax.
31              
32             =cut
33              
34             package XML::Easy::SimpleSchemaUtil;
35              
36 4     4   243108 { use 5.006; }
  4         13  
37 4     4   22 use warnings;
  4         7  
  4         111  
38 4     4   21 use strict;
  4         13  
  4         113  
39              
40 4     4   292 use Params::Classify 0.000 qw(is_ref);
  4         2166  
  4         207  
41 4     4   271 use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine);
  4         16277  
  4         204  
42             use XML::Easy::NodeBasics 0.007
43 4     4   291 qw(xml_content_object xml_content_twine xml_c_content_twine);
  4         3537  
  4         216  
44 4     4   25 use XML::Easy::Syntax 0.000 qw($xml10_s_rx);
  4         45  
  4         461  
45              
46             our $VERSION = "0.003";
47              
48 4     4   26 use parent "Exporter";
  4         11  
  4         25  
49             our @EXPORT_OK = qw(
50             xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon
51             xml_c_subelements xc_subelems xml_c_chardata xc_chars
52             );
53              
54             sub _throw_data_error($) {
55 0     0   0 my($msg) = @_;
56 0         0 die "invalid XML data: $msg\n";
57             }
58              
59             sub _throw_schema_error($) {
60 44     44   92 my($msg) = @_;
61 44         272 die "XML schema error: $msg\n";
62             }
63              
64             =head1 FUNCTIONS
65              
66             Each function has two names. There is a longer descriptive name, and
67             a shorter name to spare screen space and the programmer's fingers.
68              
69             =over
70              
71             =item xml_s_canonise_chars(STRING, OPTIONS)
72              
73             =item xs_charcanon(STRING, OPTIONS)
74              
75             This function is intended to help in parsing XML data, in situations
76             where the schema states that some aspects of characters are not
77             entirely significant. I must be a plain Perl string consisting
78             of character data that is valid for XML. The function examines the
79             characters, processes them as specified in the I, and returns
80             a modified version of the string.
81             I must be a reference to a hash, in which the permitted keys are:
82              
83             =over
84              
85             =item B
86              
87             =item B
88              
89             =item B
90              
91             Controls handling of sequences of whitespace characters. The three
92             keys control, respectively, whitespace at the beginning of the string,
93             whitespace that is at neither the beginning nor the end, and whitespace at
94             the end of the string. If the entire content of the string is whitespace,
95             it is treated as both leading and trailing.
96              
97             The whitespace characters, for this purpose, are tab, linefeed/newline,
98             carriage return, and space. This is the same set of characters that
99             are whitespace for the purposes of the XML syntax.
100              
101             The value for each key may be:
102              
103             =over
104              
105             =item B
106              
107             Completely remove the whitespace. For situations where the whitespace is
108             of no significance at all. (Common for leading and trailing whitespace,
109             but rare for intermediate whitespace.)
110              
111             =item B
112              
113             Replace the whitespace sequence with a single space character.
114             For situations where the presence of whitespace is significant but the
115             length and type are not. (Common for intermediate whitespace.)
116              
117             =item B (default)
118              
119             Leave the whitespace unchanged. For situations where the exact type of
120             whitespace is significant.
121              
122             =back
123              
124             =back
125              
126             =cut
127              
128             sub _canonise_chars($$) {
129 140     140   263 my($string, $options) = @_;
130             my $leading_wsp = exists($options->{leading_wsp}) ?
131 140 100       290 $options->{leading_wsp} : "PRESERVE";
132 140 100       360 if($leading_wsp eq "DELETE") {
    100          
    50          
133 14         89 $string =~ s/\A$xml10_s_rx//o;
134             } elsif($leading_wsp eq "COMPRESS") {
135 14         74 $string =~ s/\A$xml10_s_rx/ /o;
136             } elsif($leading_wsp ne "PRESERVE") {
137 0         0 _throw_data_error("bad character canonicalisation option");
138             }
139             my $intermediate_wsp = exists($options->{intermediate_wsp}) ?
140 140 100       270 $options->{intermediate_wsp} : "PRESERVE";
141 140 100       304 if($intermediate_wsp eq "DELETE") {
    100          
    50          
142 14         156 $string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z)
143             /$1/xsog;
144             } elsif($intermediate_wsp eq "COMPRESS") {
145 14         167 $string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z)
146             /$1 /xsog;
147             } elsif($intermediate_wsp ne "PRESERVE") {
148 0         0 _throw_data_error("bad character canonicalisation option");
149             }
150             my $trailing_wsp = exists($options->{trailing_wsp}) ?
151 140 100       278 $options->{trailing_wsp} : "PRESERVE";
152 140 100       366 if($trailing_wsp eq "DELETE") {
    100          
    50          
153 14         102 $string =~ s/$xml10_s_rx\z//o;
154             } elsif($trailing_wsp eq "COMPRESS") {
155 14         93 $string =~ s/$xml10_s_rx\z/ /o;
156             } elsif($trailing_wsp ne "PRESERVE") {
157 0         0 _throw_data_error("bad character canonicalisation option");
158             }
159 140         520 return $string;
160             }
161              
162             sub xml_s_canonise_chars($$) {
163 22     22 1 15421 check_xml_chardata($_[0]);
164 20         189 return &_canonise_chars;
165             }
166              
167             *xs_charcanon = \&xml_s_canonise_chars;
168              
169             =item xml_c_canonise_chars(CONTENT, OPTIONS)
170              
171             =item xc_charcanon(CONTENT, OPTIONS)
172              
173             This function is intended to help in parsing XML data, in situations
174             where the schema states that some aspects of characters are not
175             entirely significant. I must be a reference to either an
176             L object or a twine array. The function processes its
177             top-level character content in the same way as L,
178             and returns the resulting modified version of the content in the same
179             form that the input supplied.
180              
181             Any element inside the content chunk acts like a special character that
182             will not be modified. It interrupts any character sequence of interest.
183             Elements are not processed recursively: they are treated as atomic.
184              
185             =cut
186              
187             sub _canonise_chars_twine($$) {
188 60     60   496 my($twine, $options) = @_;
189 60 100       165 return [ _canonise_chars($twine->[0], $options) ]
190             if @$twine == 1;
191 20         60 my $leading_options = {%$options};
192 20         45 my $intermediate_options = {%$options};
193 20         36 my $trailing_options = {%$options};
194             $leading_options->{trailing_wsp} =
195             $intermediate_options->{leading_wsp} =
196             $intermediate_options->{trailing_wsp} =
197             $trailing_options->{leading_wsp} =
198             exists($options->{intermediate_wsp}) ?
199 20 100       59 $options->{intermediate_wsp} : "PRESERVE";
200 20         51 my @output = @$twine;
201 20         41 $output[0] = _canonise_chars($output[0], $leading_options);
202 20         76 $output[-1] = _canonise_chars($output[-1], $trailing_options);
203 20         57 for(my $i = @output - 3; $i != 0; $i--) {
204 40         72 $output[$i] =
205             _canonise_chars($output[$i], $intermediate_options);
206             }
207 20         103 return \@output;
208             }
209              
210             sub xml_c_canonise_chars($$) {
211 60 100   60 1 27473 if(is_ref($_[0], "ARRAY")) {
212 30         105 check_xml_content_twine($_[0]);
213 30         710 return xml_content_twine(&_canonise_chars_twine);
214             } else {
215 30         88 return xml_content_object(_canonise_chars_twine(
216             xml_c_content_twine($_[0]), $_[1]));
217             }
218             }
219              
220             *xc_charcanon = \&xml_c_canonise_chars;
221              
222             =item xml_c_subelements(CONTENT, ALLOW_WSP)
223              
224             =item xc_subelems(CONTENT, ALLOW_WSP)
225              
226             This function is intended to help in parsing XML data, in situations
227             where the schema calls for an element to contain only subelements,
228             possibly with optional whitespace around and between them.
229              
230             I must be a reference to either an L object
231             or a twine array. The function checks whether the content includes
232             any unpermitted characters at the top level, and Cs if it does.
233             If the content is of permitted form, the function returns a reference
234             to an array listing all the subelements.
235              
236             I is a truth value controlling whether whitespace is permitted
237             around and between the subelements. The characters recognised as
238             whitespace are the same as those for XML syntax. Allowing whitespace in
239             this way is easier (and slightly more efficient) than first filtering
240             it out via L. Non-whitespace characters are
241             never permitted.
242              
243             =cut
244              
245             sub xml_c_subelements($$) {
246 58     58 1 33735 my($content, $allow_wsp) = @_;
247 58         187 $content = xml_c_content_twine($content);
248 56         876 my $clen = @$content;
249 56         216 for(my $i = $clen-1; $i >= 0; $i -= 2) {
250 112 100       233 if($allow_wsp) {
251 62 100       350 _throw_schema_error("non-whitespace characters ".
252             "where not permitted")
253             unless $content->[$i] =~ /\A$xml10_s_rx?\z/o;
254             } else {
255 50 100       174 _throw_schema_error("characters where not permitted")
256             unless $content->[$i] eq "";
257             }
258             }
259 22         48 my @subelem;
260 22         60 for(my $i = 1; $i < $clen; $i += 2) {
261 28         79 push @subelem, $content->[$i];
262             }
263 22         112 return \@subelem;
264             }
265              
266             *xc_subelems = \&xml_c_subelements;
267              
268             =item xml_c_chardata(CONTENT)
269              
270             =item xc_chars(CONTENT)
271              
272             This function is intended to help in parsing XML data, in situations
273             where the schema calls for an element to contain only character data.
274             I must be a reference to either an L object
275             or a twine array. The function Cs if it contains any subelements.
276             If the content is of permitted form, the function returns a string
277             containing all the character content.
278              
279             =cut
280              
281             sub xml_c_chardata($) {
282 26     26 1 13033 my($content) = @_;
283 26         58 $content = xml_c_content_twine($content);
284 24 100       289 _throw_schema_error("subelement where not permitted")
285             unless @$content == 1;
286 14         45 return $content->[0];
287             }
288              
289             *xc_chars = \&xml_c_chardata;
290              
291             =back
292              
293             =head1 SEE ALSO
294              
295             L
296              
297             =head1 AUTHOR
298              
299             Andrew Main (Zefram)
300              
301             =head1 COPYRIGHT
302              
303             Copyright (C) 2010 PhotoBox Ltd
304              
305             Copyright (C) 2011, 2017 Andrew Main (Zefram)
306              
307             =head1 LICENSE
308              
309             This module is free software; you can redistribute it and/or modify it
310             under the same terms as Perl itself.
311              
312             =cut
313              
314             1;