File Coverage

blib/lib/XML/Easy/SimpleSchemaUtil.pm
Criterion Covered Total %
statement 73 78 93.5
branch 35 38 92.1
condition n/a
subroutine 15 16 93.7
pod 4 4 100.0
total 127 136 93.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             XML::Easy::SimpleSchemaUtil - help with simple kinds of XML schema
4              
5             =head1 SYNOPSIS
6              
7             use XML::Easy::SimpleSchemaUtil qw(
8             xml_s_canonise_chars xml_c_canonise_chars
9             xml_c_subelements xml_c_chardata
10             );
11              
12             $chardata = xml_s_canonise_chars($chardata);
13             $content = xml_c_canonise_chars($content);
14             $subelements = xml_c_subelements($content);
15             $chars = xml_c_chardata($content);
16              
17             =head1 DESCRIPTION
18              
19             The rules by which some class of thing is encoded in XML constitute a
20             schema. (A schema does not need to be codified in a formal language such
21             as Schematron: a natural-language specification can also be a schema.
22             Even if there is no explicit specification at all, the behaviour of
23             the interoperating processors of related XML documents constitutes a de
24             facto schema.) Certain kinds of rule are commonly used in all manner
25             of schemata. This module supplies functions that help to implement such
26             common kinds of rule, regardless of how a schema is specified.
27              
28             This module processes XML data in the form used by L,
29             consisting of C and C objects
30             and twine arrays. In this form, character data are stored fully decoded,
31             so they can be manipulated with no knowledge of XML syntax.
32              
33             =cut
34              
35             package XML::Easy::SimpleSchemaUtil;
36              
37 4     4   202279 { use 5.006; }
  4         15  
  4         195  
38 4     4   24 use warnings;
  4         7  
  4         138  
39 4     4   22 use strict;
  4         18  
  4         189  
40              
41 4     4   1234 use Params::Classify 0.000 qw(is_ref);
  4         2803  
  4         259  
42 4     4   833 use XML::Easy::Classify 0.006 qw(check_xml_chardata check_xml_content_twine);
  4         42368  
  4         291  
43             use XML::Easy::NodeBasics 0.007
44 4     4   969 qw(xml_content_object xml_content_twine xml_c_content_twine);
  4         6287  
  4         291  
45 4     4   27 use XML::Easy::Syntax 0.000 qw($xml10_s_rx);
  4         67  
  4         601  
46              
47             our $VERSION = "0.002";
48              
49 4     4   23 use parent "Exporter";
  4         6  
  4         22  
50             our @EXPORT_OK = qw(
51             xml_s_canonise_chars xs_charcanon xml_c_canonise_chars xc_charcanon
52             xml_c_subelements xc_subelems xml_c_chardata xc_chars
53             );
54              
55             sub _throw_data_error($) {
56 0     0   0 my($msg) = @_;
57 0         0 die "invalid XML data: $msg\n";
58             }
59              
60             sub _throw_schema_error($) {
61 44     44   74 my($msg) = @_;
62 44         318 die "XML schema error: $msg\n";
63             }
64              
65             =head1 FUNCTIONS
66              
67             Each function has two names. There is a longer descriptive name, and
68             a shorter name to spare screen space and the programmer's fingers.
69              
70             =over
71              
72             =item xml_s_canonise_chars(STRING, OPTIONS)
73              
74             =item xs_charcanon(STRING, OPTIONS)
75              
76             This function is intended to help in parsing XML data, in situations
77             where the schema states that some aspects of characters are not
78             entirely significant. I must be a plain Perl string consisting
79             of character data that is valid for XML. The function examines the
80             characters, processes them as specified in the I, and returns
81             a modified version of the string.
82             I must be a reference to a hash, in which the permitted keys are:
83              
84             =over
85              
86             =item B
87              
88             =item B
89              
90             =item B
91              
92             Controls handling of sequences of whitespace characters. The three
93             keys control, respectively, whitespace at the beginning of the string,
94             whitespace that is at neither the beginning nor the end, and whitespace at
95             the end of the string. If the entire content of the string is whitespace,
96             it is treated as both leading and trailing.
97              
98             The whitespace characters, for this purpose, are tab, linefeed/newline,
99             carriage return, and space. This is the same set of characters that
100             are whitespace for the purposes of the XML syntax.
101              
102             The value for each key may be:
103              
104             =over
105              
106             =item B
107              
108             Completely remove the whitespace. For situations where the whitespace is
109             of no significance at all. (Common for leading and trailing whitespace,
110             but rare for intermediate whitespace.)
111              
112             =item B
113              
114             Replace the whitespace sequence with a single space character.
115             For situations where the presence of whitespace is significant but the
116             length and type are not. (Common for intermediate whitespace.)
117              
118             =item B (default)
119              
120             Leave the whitespace unchanged. For situations where the exact type of
121             whitespace is significant.
122              
123             =back
124              
125             =back
126              
127             =cut
128              
129             sub _canonise_chars($$) {
130 140     140   260 my($string, $options) = @_;
131 140 100       565 my $leading_wsp = exists($options->{leading_wsp}) ?
132             $options->{leading_wsp} : "PRESERVE";
133 140 100       440 if($leading_wsp eq "DELETE") {
    100          
    50          
134 14         96 $string =~ s/\A$xml10_s_rx//o;
135             } elsif($leading_wsp eq "COMPRESS") {
136 14         91 $string =~ s/\A$xml10_s_rx/ /o;
137             } elsif($leading_wsp ne "PRESERVE") {
138 0         0 _throw_data_error("bad character canonicalisation option");
139             }
140 140 100       292 my $intermediate_wsp = exists($options->{intermediate_wsp}) ?
141             $options->{intermediate_wsp} : "PRESERVE";
142 140 100       741 if($intermediate_wsp eq "DELETE") {
    100          
    50          
143 14         188 $string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z)
144             /$1/xsog;
145             } elsif($intermediate_wsp eq "COMPRESS") {
146 14         284 $string =~ s/(?!$xml10_s_rx)(.)$xml10_s_rx(?!$xml10_s_rx|\z)
147             /$1 /xsog;
148             } elsif($intermediate_wsp ne "PRESERVE") {
149 0         0 _throw_data_error("bad character canonicalisation option");
150             }
151 140 100       333 my $trailing_wsp = exists($options->{trailing_wsp}) ?
152             $options->{trailing_wsp} : "PRESERVE";
153 140 100       1610 if($trailing_wsp eq "DELETE") {
    100          
    50          
154 14         104 $string =~ s/$xml10_s_rx\z//o;
155             } elsif($trailing_wsp eq "COMPRESS") {
156 14         108 $string =~ s/$xml10_s_rx\z/ /o;
157             } elsif($trailing_wsp ne "PRESERVE") {
158 0         0 _throw_data_error("bad character canonicalisation option");
159             }
160 140         585 return $string;
161             }
162              
163             sub xml_s_canonise_chars($$) {
164 22     22 1 26896 check_xml_chardata($_[0]);
165 20         162 return &_canonise_chars;
166             }
167              
168             *xs_charcanon = \&xml_s_canonise_chars;
169              
170             =item xml_c_canonise_chars(CONTENT, OPTIONS)
171              
172             =item xc_charcanon(CONTENT, OPTIONS)
173              
174             This function is intended to help in parsing XML data, in situations
175             where the schema states that some aspects of characters are not
176             entirely significant. I must be a reference to either an
177             L object or a twine array. The function processes its
178             top-level character content in the same way as L,
179             and returns the resulting modified version of the content in the same
180             form that the input supplied.
181              
182             Any element inside the content chunk acts like a special character that
183             will not be modified. It interrupts any character sequence of interest.
184             Elements are not processed recursively: they are treated as atomic.
185              
186             =cut
187              
188             sub _canonise_chars_twine($$) {
189 60     60   624 my($twine, $options) = @_;
190 60 100       252 return [ _canonise_chars($twine->[0], $options) ]
191             if @$twine == 1;
192 20         82 my $leading_options = {%$options};
193 20         57 my $intermediate_options = {%$options};
194 20         42 my $trailing_options = {%$options};
195 20 100       91 $leading_options->{trailing_wsp} =
196             $intermediate_options->{leading_wsp} =
197             $intermediate_options->{trailing_wsp} =
198             $trailing_options->{leading_wsp} =
199             exists($options->{intermediate_wsp}) ?
200             $options->{intermediate_wsp} : "PRESERVE";
201 20         65 my @output = @$twine;
202 20         48 $output[0] = _canonise_chars($output[0], $leading_options);
203 20         46 $output[-1] = _canonise_chars($output[-1], $trailing_options);
204 20         63 for(my $i = @output - 3; $i != 0; $i--) {
205 40         80 $output[$i] =
206             _canonise_chars($output[$i], $intermediate_options);
207             }
208 20         130 return \@output;
209             }
210              
211             sub xml_c_canonise_chars($$) {
212 60 100   60 1 45447 if(is_ref($_[0], "ARRAY")) {
213 30         108 check_xml_content_twine($_[0]);
214 30         740 return xml_content_twine(&_canonise_chars_twine);
215             } else {
216 30         322 return xml_content_object(_canonise_chars_twine(
217             xml_c_content_twine($_[0]), $_[1]));
218             }
219             }
220              
221             *xc_charcanon = \&xml_c_canonise_chars;
222              
223             =item xml_c_subelements(CONTENT, ALLOW_WSP)
224              
225             =item xc_subelems(CONTENT, ALLOW_WSP)
226              
227             This function is intended to help in parsing XML data, in situations
228             where the schema calls for an element to contain only subelements,
229             possibly with optional whitespace around and between them.
230              
231             I must be a reference to either an L object
232             or a twine array. The function checks whether the content includes
233             any unpermitted characters at the top level, and Cs if it does.
234             If the content is of permitted form, the function returns a reference
235             to an array listing all the subelements.
236              
237             I is a truth value controlling whether whitespace is permitted
238             around and between the subelements. The characters recognised as
239             whitespace are the same as those for XML syntax. Allowing whitespace in
240             this way is easier (and slightly more efficient) than first filtering
241             it out via L. Non-whitespace characters are
242             never permitted.
243              
244             =cut
245              
246             sub xml_c_subelements($$) {
247 58     58 1 51803 my($content, $allow_wsp) = @_;
248 58         353 $content = xml_c_content_twine($content);
249 56         1082 my $clen = @$content;
250 56         192 for(my $i = $clen-1; $i >= 0; $i -= 2) {
251 112 100       375 if($allow_wsp) {
252 62 100       381 _throw_schema_error("non-whitespace characters ".
253             "where not permitted")
254             unless $content->[$i] =~ /\A$xml10_s_rx?\z/o;
255             } else {
256 50 100       356 _throw_schema_error("characters where not permitted")
257             unless $content->[$i] eq "";
258             }
259             }
260 22         207 my @subelem;
261 22         225 for(my $i = 1; $i < $clen; $i += 2) {
262 28         70 push @subelem, $content->[$i];
263             }
264 22         284 return \@subelem;
265             }
266              
267             *xc_subelems = \&xml_c_subelements;
268              
269             =item xml_c_chardata(CONTENT)
270              
271             =item xc_chars(CONTENT)
272              
273             This function is intended to help in parsing XML data, in situations
274             where the schema calls for an element to contain only character data.
275             I must be a reference to either an L object
276             or a twine array. The function Cs if it contains any subelements.
277             If the content is of permitted form, the function returns a string
278             containing all the character content.
279              
280             =cut
281              
282             sub xml_c_chardata($) {
283 26     26 1 15570 my($content) = @_;
284 26         75 $content = xml_c_content_twine($content);
285 24 100       322 _throw_schema_error("subelement where not permitted")
286             unless @$content == 1;
287 14         59 return $content->[0];
288             }
289              
290             *xc_chars = \&xml_c_chardata;
291              
292             =back
293              
294             =head1 SEE ALSO
295              
296             L
297              
298             =head1 AUTHOR
299              
300             Andrew Main (Zefram)
301              
302             =head1 COPYRIGHT
303              
304             Copyright (C) 2010 PhotoBox Ltd
305              
306             Copyright (C) 2011 Andrew Main (Zefram)
307              
308             =head1 LICENSE
309              
310             This module is free software; you can redistribute it and/or modify it
311             under the same terms as Perl itself.
312              
313             =cut
314              
315             1;