File Coverage

blib/lib/Search/Tools/XML.pm

Criterion	Covered	Total	%
statement	142	175	81.1
branch	60	96	62.5
condition	43	58	74.1
subroutine	25	27	92.5
pod	17	17	100.0
total	287	373	76.9

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::XML;
2	30			30		468054	use Moo;
	30					79880
	30					135
3	30			30		15484	use Carp;
	30					68
	30					1323
4	30			30		4217	use Search::Tools; # XS required
	30					79
	30					597
5	30			30		4923	use Search::Tools::UTF8;
	30					91
	30					2287
6
7	30			30		5030	use namespace::autoclean;
	30					137937
	30					193
8
9							our $VERSION = '1.006';
10
11							=pod
12
13							=head1 NAME
14
15							Search::Tools::XML - methods for playing nice with XML and HTML
16
17							=head1 SYNOPSIS
18
19							use Search::Tools::XML;
20
21							my $class = 'Search::Tools::XML';
22
23							my $text = 'the "quick brown" fox';
24
25							my $xml = $class->start_tag('foo');
26
27							$xml .= $class->utf8_safe( $text );
28
29							$xml .= $class->end_tag('foo');
30
31							# $xml: the "quick brown" fox
32
33							$xml = $class->escape( $xml );
34
35							# $xml: <foo>the "quick brown" fox</foo>
36
37							$xml = $class->unescape( $xml );
38
39							# $xml: the "quick brown" fox
40
41							my $plain = $class->no_html( $xml );
42
43							# $plain eq $text
44
45
46							=head1 DESCRIPTION
47
48							B The API for escape() and unescape() has changed as of version 0.16.
49							The text is no longer modified in place, as this was less intuitive.
50
51							Search::Tools::XML provides utility methods for dealing with XML and HTML.
52							There isn't really anything new here that CPAN doesn't provide via HTML::Entities
53							or similar modules. The difference is convenience: the most common methods you
54							need for search apps are in one place with no extra dependencies.
55
56							B To get full UTF-8 character set from chr() you must be using Perl >= 5.8.
57							This affects things like the unescape* methods.
58
59							=head1 VARIABLES
60
61							=head2 %HTML_ents
62
63							Complete map of all named HTML entities to their decimal values.
64
65							=cut
66
67							# regexp for what constitutes whitespace in an HTML doc
68							# it's not as simple as \s\| so we define it separately
69							my @white_hex_pts = qw(
70							0009
71							000a
72							000b
73							000c
74							000d
75							0020
76							00a0
77							2000
78							2001
79							2002
80							2003
81							2004
82							2005
83							2006
84							2007
85							2008
86							2009
87							200a
88							200b
89							2028
90							2029
91							202f
92							205f
93							2060
94							3000
95							);
96
97							my @whitesp = ( '\s', ' ' );
98
99							# NOTE that the pound sign # needs escaping because we use
100							# the 'x' flag in our regexp.
101
102							for my $w (@white_hex_pts) {
103							push @whitesp, sprintf( "&\\#x%s;", $w ); # hex entity
104							push @whitesp, sprintf( "&\\#%s;", hex($w) ); # dec entity
105							push @whitesp, sprintf( "\\%s", chr( hex($w) ) ); # byte value
106							}
107
108							my $HTML_WHITESPACE = join( '\|', @whitesp );
109							my $WHITESPACE = join( '\|', map { chr( hex($_) ) } @white_hex_pts );
110
111							# HTML entity table
112							# this just removes a dependency on another module...
113
114							our %HTML_ents = (
115							quot => 34,
116							amp => 38,
117							apos => 39,
118							'lt' => 60,
119							'gt' => 62,
120							nbsp => 160,
121							iexcl => 161,
122							cent => 162,
123							pound => 163,
124							curren => 164,
125							yen => 165,
126							brvbar => 166,
127							sect => 167,
128							uml => 168,
129							copy => 169,
130							ordf => 170,
131							laquo => 171,
132							not => 172,
133							shy => 173,
134							reg => 174,
135							macr => 175,
136							deg => 176,
137							plusmn => 177,
138							sup2 => 178,
139							sup3 => 179,
140							acute => 180,
141							micro => 181,
142							para => 182,
143							middot => 183,
144							cedil => 184,
145							sup1 => 185,
146							ordm => 186,
147							raquo => 187,
148							frac14 => 188,
149							frac12 => 189,
150							frac34 => 190,
151							iquest => 191,
152							Agrave => 192,
153							Aacute => 193,
154							Acirc => 194,
155							Atilde => 195,
156							Auml => 196,
157							Aring => 197,
158							AElig => 198,
159							Ccedil => 199,
160							Egrave => 200,
161							Eacute => 201,
162							Ecirc => 202,
163							Euml => 203,
164							Igrave => 204,
165							Iacute => 205,
166							Icirc => 206,
167							Iuml => 207,
168							ETH => 208,
169							Ntilde => 209,
170							Ograve => 210,
171							Oacute => 211,
172							Ocirc => 212,
173							Otilde => 213,
174							Ouml => 214,
175							'times' => 215,
176							Oslash => 216,
177							Ugrave => 217,
178							Uacute => 218,
179							Ucirc => 219,
180							Uuml => 220,
181							Yacute => 221,
182							THORN => 222,
183							szlig => 223,
184							agrave => 224,
185							aacute => 225,
186							acirc => 226,
187							atilde => 227,
188							auml => 228,
189							aring => 229,
190							aelig => 230,
191							ccedil => 231,
192							egrave => 232,
193							eacute => 233,
194							ecirc => 234,
195							euml => 235,
196							igrave => 236,
197							iacute => 237,
198							icirc => 238,
199							iuml => 239,
200							eth => 240,
201							ntilde => 241,
202							ograve => 242,
203							oacute => 243,
204							ocirc => 244,
205							otilde => 245,
206							ouml => 246,
207							divide => 247,
208							oslash => 248,
209							ugrave => 249,
210							uacute => 250,
211							ucirc => 251,
212							uuml => 252,
213							yacute => 253,
214							thorn => 254,
215							yuml => 255,
216							OElig => 338,
217							oelig => 339,
218							Scaron => 352,
219							scaron => 353,
220							Yuml => 376,
221							fnof => 402,
222							circ => 710,
223							tilde => 732,
224							Alpha => 913,
225							Beta => 914,
226							Gamma => 915,
227							Delta => 916,
228							Epsilon => 917,
229							Zeta => 918,
230							Eta => 919,
231							Theta => 920,
232							Iota => 921,
233							Kappa => 922,
234							Lambda => 923,
235							Mu => 924,
236							Nu => 925,
237							Xi => 926,
238							Omicron => 927,
239							Pi => 928,
240							Rho => 929,
241							Sigma => 931,
242							Tau => 932,
243							Upsilon => 933,
244							Phi => 934,
245							Chi => 935,
246							Psi => 936,
247							Omega => 937,
248							alpha => 945,
249							beta => 946,
250							gamma => 947,
251							delta => 948,
252							epsilon => 949,
253							zeta => 950,
254							eta => 951,
255							theta => 952,
256							iota => 953,
257							kappa => 954,
258							lambda => 955,
259							mu => 956,
260							nu => 957,
261							xi => 958,
262							omicron => 959,
263							pi => 960,
264							rho => 961,
265							sigmaf => 962,
266							sigma => 963,
267							tau => 964,
268							upsilon => 965,
269							phi => 966,
270							chi => 967,
271							psi => 968,
272							omega => 969,
273							thetasym => 977,
274							upsih => 978,
275							piv => 982,
276							ensp => 8194,
277							emsp => 8195,
278							thinsp => 8201,
279							zwnj => 8204,
280							zwj => 8205,
281							lrm => 8206,
282							rlm => 8207,
283							ndash => 8211,
284							mdash => 8212,
285							lsquo => 8216,
286							rsquo => 8217,
287							sbquo => 8218,
288							ldquo => 8220,
289							rdquo => 8221,
290							bdquo => 8222,
291							dagger => 8224,
292							Dagger => 8225,
293							bull => 8226,
294							hellip => 8230,
295							permil => 8240,
296							prime => 8242,
297							Prime => 8243,
298							lsaquo => 8249,
299							rsaquo => 8250,
300							oline => 8254,
301							frasl => 8260,
302							euro => 8364,
303							image => 8465,
304							weierp => 8472,
305							real => 8476,
306							trade => 8482,
307							alefsym => 8501,
308							larr => 8592,
309							uarr => 8593,
310							rarr => 8594,
311							darr => 8595,
312							harr => 8596,
313							crarr => 8629,
314							lArr => 8656,
315							uArr => 8657,
316							rArr => 8658,
317							dArr => 8659,
318							hArr => 8660,
319							forall => 8704,
320							part => 8706,
321							exist => 8707,
322							empty => 8709,
323							nabla => 8711,
324							isin => 8712,
325							notin => 8713,
326							ni => 8715,
327							prod => 8719,
328							'sum' => 8721,
329							'minus' => 8722,
330							lowast => 8727,
331							radic => 8730,
332							prop => 8733,
333							infin => 8734,
334							ang => 8736,
335							'and' => 8743,
336							'or' => 8744,
337							cap => 8745,
338							cup => 8746,
339							int => 8747,
340							there4 => 8756,
341							sim => 8764,
342							cong => 8773,
343							asymp => 8776,
344							ne => 8800,
345							equiv => 8801,
346							le => 8804,
347							ge => 8805,
348							sub => 8834,
349							sup => 8835,
350							nsub => 8836,
351							sube => 8838,
352							supe => 8839,
353							oplus => 8853,
354							otimes => 8855,
355							perp => 8869,
356							sdot => 8901,
357							lceil => 8968,
358							rceil => 8969,
359							lfloor => 8970,
360							rfloor => 8971,
361							lang => 9001,
362							rang => 9002,
363							loz => 9674,
364							spades => 9824,
365							clubs => 9827,
366							hearts => 9829,
367							diams => 9830,
368							);
369
370							my %char2entity = ();
371							while ( my ( $e, $n ) = each(%HTML_ents) ) {
372							my $char = chr($n);
373							$char2entity{$char} = "&$e;";
374							}
375							delete $char2entity{q/'/}; # only one-way decoding
376
377							# Fill in missing entities
378							# TODO does this only work under latin1 locale?
379							for ( 0 .. 255 ) {
380							next if exists $char2entity{ chr($_) };
381							$char2entity{ chr($_) } = "&#$_;";
382							}
383
384							=head1 METHODS
385
386							The following methods may be accessed either as object or class methods.
387
388							=head2 new
389
390							Create a Search::Tools::XML object.
391
392							=cut
393
394							=head2 tag_re
395
396							Returns a qr// regex for matching a SGML (XML, HTML, etc) tag.
397
398							=cut
399
400	36			36	1	222	sub tag_re {qr/<[^>]+>/s}
401
402							=head2 html_whitespace
403
404							Returns a regex for all whitespace characters and
405							HTML whitespace entities.
406
407							=cut
408
409	26			26	1	384	sub html_whitespace {$HTML_WHITESPACE}
410
411							=head2 char2ent_map
412
413							Returns a hash reference to the class data mapping chr() values to their
414							numerical entity equivalents.
415
416							=cut
417
418	26			26	1	66	sub char2ent_map { \%char2entity }
419
420							=head2 looks_like_html( I )
421
422							Returns true if I appears to have HTML-like markup in it.
423
424							Aliases for this method include:
425
426							=over
427
428							=item looks_like_xml
429
430							=item looks_like_markup
431
432							=back
433
434							=cut
435
436	31			31	1	1972	sub looks_like_html { return $_[1] =~ m/[<>]\|&[\#\w]+;/o }
437							*looks_like_xml = \&looks_like_html;
438							*looks_like_markup = \&looks_like_html;
439
440							=head2 start_tag( I [, I<\%attr> ] )
441
442							=head2 end_tag( I )
443
444							Returns I as a tag, either start or end. I will be escaped for any non-valid
445							chars using tag_safe().
446
447							If I<\%attr> is passed, XML-safe attributes are generated using attr_safe().
448
449							=head2 singleton( I [, I<\%attr> ] )
450
451							Like start_tag() but includes the closing slash.
452
453							=cut
454
455	54			54	1	685	sub start_tag { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . ">" }
456	53			53	1	70	sub end_tag { "" }
457	0			0	1	0	sub singleton { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . "/>" }
458
459							=pod
460
461							=head2 tag_safe( I )
462
463							Create a valid XML tag name, escaping/omitting invalid characters.
464
465							Example:
466
467							my $tag = Search::Tools::XML->tag_safe( '1 * ! tag foo' );
468							# $tag == '______tag_foo'
469
470							=cut
471
472							sub tag_safe {
473	111			111	1	118	my $t = pop;
474
475	111	50				148	return '_' unless length $t;
476
477	111					145	$t =~ s/::/_/g; # single colons ok, but doubles are not
478	111					132	$t =~ s/[^-\.\w:]/_/g;
479	111					142	$t =~ s/^(\d)/_$1/;
480
481	111					270	return $t;
482							}
483
484							=head2 attr_safe( I<\%attr> )
485
486							Returns stringified I<\%attr> as XML attributes.
487
488							=cut
489
490							sub attr_safe {
491	54			54	1	65	my $self = shift;
492	54					66	my $attr = shift;
493	54	100				122	return '' unless defined $attr;
494	11	50				22	if ( ref $attr ne "HASH" ) {
495	0					0	croak "attributes must be a hash ref";
496							}
497	11					19	my @xml = (''); # force space at start in return
498	11					26	for my $name ( sort keys %$attr ) {
499							my $val = _escape_xml( $attr->{$name},
500	4					15	is_flagged_utf8( $attr->{$name} ) );
501	4					10	push @xml, tag_safe($name) . qq{="$val"};
502							}
503	11					36	return join( ' ', @xml );
504							}
505
506							=pod
507
508							=head2 utf8_safe( I )
509
510							Return I with special XML chars and all
511							non-ASCII chars converted to numeric entities.
512
513							This is escape() on steroids. B
514							unless you know what you're doing. See the SYNOPSIS for an example.
515
516							=head2 escape_utf8
517
518							Alias for utf8_safe().
519
520							=cut
521
522							*escape_utf8 = \&utf8_safe;
523
524							sub utf8_safe {
525	27			27	1	561	my $t = pop;
526	27	50				39	$t = '' unless defined $t;
527
528							# converts all low chars except \t \n and \r
529							# to space because XML spec disallows <32
530	27					49	$t =~ s,[\x00-\x08\x0b-\x0c\x0e-\x1f], ,g;
531
532	27					48	$t =~ s{([^\x09\x0a\x0d\x20\x21\x23-\x25\x28-\x3b\x3d\x3F-\x5B\x5D-\x7E])}
	2					8
533							{'&#'.(ord($1)).';'}eg;
534	27					49
535							return $t;
536							}
537
538							=head2 no_html( I [, I] )
539
540							no_html() is a brute-force method for removing all tags and entities
541							from I. A simple regular expression is used, so things like
542							nested comments and the like will probably break. If you really
543							need to reliably filter out the tags and entities from a HTML text, use
544							HTML::Parser or similar.
545
546							I is returned with no markup in it.
547
548							If I is true (defaults to false) then
549							all whitespace is normalized away to ASCII space (U+0020).
550							This can be helpful if you have Unicode entities representing
551							line breaks or other layout instructions.
552
553							=cut
554
555	10			10	1	535	sub no_html {
556	10					18	my $class = shift;
557	10		100			56	my $text = shift;
558	10	50				103	my $normalize_whitespace = shift \|\| 0;
559	0					0	if ( !defined $text ) {
560							croak "text required";
561	10					34	}
562	10					2283	my $re = $class->tag_re;
563	10					37	$text =~ s,$re,,g;
564	10	100				28	$text = $class->unescape($text);
565	1					8	if ($normalize_whitespace) {
566							$text =~ s/\s+/ /g;
567	10					57	}
568							return $text;
569							}
570
571							=head2 strip_html
572
573							An alias for no_html().
574
575							=head2 strip_markup
576
577							An alias for no_html().
578
579							=cut
580
581							*strip_html = \&no_html;
582							*strip_markup = \&no_html;
583
584							=head2 escape( I )
585
586							Similar to escape() functions in more famous CPAN modules, but without the
587							added dependency. escape() will convert the special XML chars (><'"&) to their
588							named entity equivalents.
589
590							The escaped I is returned.
591
592							B The API for this method has changed as of version 0.16. I
593							is no longer modified in-place.
594
595							As of version 0.27 escape() is written in C/XS for speed.
596
597							=cut
598
599	2			2	1	959	sub escape {
600	2	50				8	my $text = pop;
601	2					6	return unless defined $text;
602							return _escape_xml( $text, is_flagged_utf8($text) );
603							}
604
605							=head2 unescape( I )
606
607							Similar to unescape() functions in more famous CPAN modules, but without the added
608							dependency. unescape() will convert all entities to their chr() equivalents.
609
610							B unescape() does more than reverse the effects of escape(). It attempts
611							to resolve B entities, not just the special XML entities (><'"&).
612
613							B The API for this method has changed as of version 0.16.
614							I is no longer modified in-place.
615
616							=cut
617
618	11			11	1	635	sub unescape {
619	11					35	my $text = pop;
620	11					34	$text = unescape_named($text);
621	11					25	$text = unescape_decimal($text);
622							return $text;
623							}
624
625							=head2 unescape_named( I )
626
627							Replace all named HTML entities with their chr() equivalents.
628
629							Returns modified copy of I.
630
631							=cut
632
633	13			13	1	27	sub unescape_named {
634	13	50				39	my $t = pop;
635							if ( defined($t) ) {
636
637	13	100				63	# named entities - check first to see if it is worth looping
638	7					274	if ( $t =~ m/&[a-zA-Z0-9]+;/ ) {
639	1771					2427	for my $e ( keys %HTML_ents ) {
640	1771	100				10203	my $dec = $HTML_ents{$e};
	163					354
641							if ( my $n = $t =~ s/&$e;/chr($dec)/eg ) {
642
643							#warn "replaced $e ($dec) -> $HTML_ents{$e} $n times in text";
644							}
645							}
646							}
647	13					92	}
648							return $t;
649							}
650
651							=head2 unescape_decimal( I )
652
653							Replace all decimal entities with their chr() equivalents.
654
655							Returns modified copy of I.
656
657							=cut
658
659	11			11	1	23	sub unescape_decimal {
660							my $t = pop;
661
662	11	50				55	# resolve numeric entities as best we can
	5					20
663	11					26	$t =~ s/&#(\d+);/chr($1)/ego if defined($t);
664							return $t;
665							}
666
667							=head2 perl_to_xml( I [, I] )
668
669							Similar to the XML::Simple XMLout() feature, perl_to_xml()
670							will take a Perl data structure I and convert it to XML.
671
672							I should be a hashref with the following supported key/value pairs:
673
674							=over
675
676							=item root I
677
678							The root element. If I is a string, it is used as the tag name. If
679							I is a hashref, two keys are required:
680
681							=over
682
683							=item tag
684
685							String indicating the element name.
686
687							=item attrs
688
689							Hash ref of attribute key/value pairs (see start_tag()).
690
691							=back
692
693							=item wrap_array I<1\|0>
694
695							If B is true (the default), arrayref items are wrapped
696							in an additional XML tag, keeping the array items enclosed in a logical set.
697							If B is false, each item in the array is treated individually.
698							See B below for the naming convention for arrayref items.
699
700							=item strip_plural I<1\|0>
701
702							The B option interacts with the B option.
703
704							If B is a true value and not a CODE ref,
705							any trailing C ~~character will be stripped from the enclosing tag name~~
706							whenever an array of hashrefs is found. Example:
707
708							my $data = {
709							values => [
710							{ two => 2,
711							three => 3,
712							},
713							{ four => 4,
714							five => 5,
715							},
716							],
717							};
718
719							my $xml = $utils->perl_to_xml($data, {
720							root => 'data',
721							wrap_array => 1,
722							strip_plural => 1,
723							});
724
725							# $xml DOM will look like:
726
727
728
729
730							3
731							2
732
733
734							5
735							4
736
737
738
739
740							Obviously stripping the final C ~~will not always render sensical tag names.~~
741							Pass a CODE ref instead, expecting one value (the tag name) and returning the
742							tag name to use:
743
744							my $xml = $utils->perl_to_xml($data, {
745							root => 'data',
746							wrap_array => 1,
747							strip_plural => sub {
748							my $tag = shift;
749							$tag =~ s/foo/BAR/;
750							return $tag;
751							},
752							});
753
754							=item escape I<1\|0>
755
756							If B is false, strings within the B value will not be passed
757							through escape(). Default is true.
758
759							=back
760
761							=cut
762
763							=head2 perl_to_xml( I, I [, I ][, I] )
764
765							This second usage is deprecated and here for backwards compatability only.
766							Use the named key/value I instead. Readers of your code (including you!) will
767							thank you.
768
769							=cut
770
771	4			4		5	sub _make_singular {
772	4					9	my ($t) = @_;
773	4	50				9	$t =~ s/ies$/y/i;
774	4	50				11	return $t if ( $t =~ s/ses$/s/i );
775	4					12	return $t if ( $t =~ /[aeiouy]ss$/i );
776	4	50				10	$t =~ s/s$//i;
777							return length $t ? $t : $_[0];
778							}
779
780	4			4	1	97	sub perl_to_xml {
781	4					7	my $self = shift;
782							my $perl = shift;
783	4					5
784	4	100	66			19	my ( $root, $wrap_array, $strip_plural, $escape );
785	2					3	if ( ref $_[0] eq 'HASH' and !exists $_[0]->{tag} ) {
	2					7
786	2		50			8	my %opts = %{ $_[0] };
787	2		100			6	$root = delete $opts{root} \|\| '_root';
788	2					4	$strip_plural = delete $opts{strip_plural} \|\| 0;
789	2	100				4	$wrap_array = delete $opts{wrap_array};
790	2					4	$wrap_array = 1 unless defined $wrap_array;
791	2	100				4	$escape = delete $opts{escape};
792							$escape = 1 unless defined $escape;
793							}
794	2		50			6	else {
795	2		100			6	$root = shift \|\| '_root';
796	2					4	$strip_plural = shift \|\| 0;
797							$escape = shift;
798
799	2	50	33			13	# backcompat means we need to reverse logic
		50	33
		50
800	0					0	if ( defined $escape and $escape == 1 ) {
801							$escape = 0;
802							}
803	0					0	elsif ( defined $escape and $escape == 0 ) {
804							$escape = 1;
805							}
806	2					3	elsif ( !defined $escape ) {
807							$escape = 1;
808							}
809	2					3
810							$wrap_array = 1; # old behavior
811	4	50				7	}
812	0					0	unless ( defined $perl ) {
813							croak "perl data struct required";
814							}
815	4	100	66			13
816	2					4	if ( $strip_plural and ref($strip_plural) ne 'CODE' ) {
817							$strip_plural = \&_make_singular;
818							}
819	4					5
820	4	100				8	my ( $root_tag, $attrs );
821	1	50				3	if ( ref $root ) {
822	1	50				3	$root_tag = delete $root->{tag} or croak 'tag key required in root';
823							$attrs = delete $root->{attrs} or croak 'attrs key required in root';
824							}
825	3					4	else {
826	3					5	$root_tag = $root;
827							$attrs = {};
828							}
829	4	50				8
830							if ( !ref $perl ) {
831	0	0				0	return
832							$self->start_tag( $root_tag, $attrs )
833							. ( $escape ? $self->utf8_safe($perl) : $perl )
834							. $self->end_tag($root_tag);
835							}
836	4					11
837	4					12	my $xml = $self->start_tag( $root_tag, $attrs );
838							$self->_ref_to_xml( $perl, '', \$xml, $strip_plural, $escape,
839	4					8	$wrap_array );
840	4					18	$xml .= $self->end_tag($root_tag);
841							return $xml;
842							}
843
844	58			58		118	sub _ref_to_xml {
845							my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array )
846	58					74	= @_;
847	58	100				98	my $type = ref $perl;
		50
		100
		100
848	31	50				60	if ( !$type ) {
849							( $$xml_ref .= $self->start_tag($root) )
850	31	100				70	if length($root);
851	31	50				67	$$xml_ref .= ( $escape ? $self->utf8_safe($perl) : $perl );
852							( $$xml_ref .= $self->end_tag($root) )
853							if length($root);
854
855							#$$xml_ref .= "\n"; # just for debugging
856							}
857	0					0	elsif ( $type eq 'SCALAR' ) {
858							$self->_scalar_to_xml( $perl, $root, $xml_ref, $strip_plural,
859							$escape, $wrap_array );
860							}
861	7					17	elsif ( $type eq 'ARRAY' ) {
862							$self->_array_to_xml( $perl, $root, $xml_ref, $strip_plural,
863							$escape, $wrap_array );
864							}
865	17					34	elsif ( $type eq 'HASH' ) {
866							$self->_hash_to_xml( $perl, $root, $xml_ref, $strip_plural, $escape,
867							$wrap_array );
868							}
869							else {
870	3					26	# assume blessed object, force it to stringify as a scalar
871							$self->_scalar_to_xml( "$perl", $root, $xml_ref, $strip_plural,
872							$escape, $wrap_array );
873							}
874
875							}
876
877	7			7		10	sub _array_to_xml {
878							my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array )
879	7					13	= @_;
880	25	100	66			89	for my $thing (@$perl) {
			100
			66
			100
881							if ( ref $thing
882							and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' )
883							and length($root)
884							and $wrap_array )
885							{
886	8					15	#warn "<$root> ref $thing == " . ref($thing);
887							$$xml_ref .= $self->start_tag($root);
888	25					49	}
889							$self->_ref_to_xml( $thing, $root, $xml_ref, $strip_plural, $escape,
890	25	100	66			95	$wrap_array );
			100
			66
			100
891							if ( ref $thing
892							and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' )
893							and length($root)
894							and $wrap_array )
895							{
896	8					14	#warn " ref $thing == " . ref($thing);
897							$$xml_ref .= $self->end_tag($root);
898							}
899							}
900							}
901
902	17			17		23	sub _hash_to_xml {
903							my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array )
904	17					35	= @_;
905	29					35	for my $key ( keys %$perl ) {
906	29	100				40	my $thing = $perl->{$key};
907	8					11	if ( ref $thing ) {
908	8					8	my $key_to_pass = $key;
909	8	100	100			25	my %attr;
910	4					10	if ( ref $thing eq 'ARRAY' && $strip_plural ) {
911	4					7	$key_to_pass = $strip_plural->($key_to_pass);
912							$attr{count} = scalar @$thing;
913	8	100	100			27	}
914	6					14	if ( ref $thing ne 'ARRAY' or $wrap_array ) {
915							$$xml_ref .= $self->start_tag( $key, \%attr );
916							}
917	8					19	$self->_ref_to_xml(
918							$thing, $key_to_pass, $xml_ref,
919							$strip_plural, $escape, $wrap_array
920	8	100	100			27	);
921	6					11	if ( ref $thing ne 'ARRAY' or $wrap_array ) {
922							$$xml_ref .= $self->end_tag($key);
923							}
924
925							#$$xml_ref .= "\n"; # just for debugging
926							}
927	21					41	else {
928							$self->_ref_to_xml( $thing, $key, $xml_ref, $strip_plural,
929							$escape, $wrap_array );
930							}
931							}
932							}
933
934	3			3		14	sub _scalar_to_xml {
935							my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array )
936	3	100				7	= @_;
937							$$xml_ref
938							.= $self->start_tag($root)
939							. ( $escape ? $self->utf8_safe($perl) : $perl )
940							. $self->end_tag($root);
941
942							#$$xml_ref .= "\n"; # just for debugging
943							}
944
945							=head2 tidy( I )
946
947							Attempts to indent I correctly to make
948							it more legible.
949
950							Returns the I tidied up.
951
952							B This is an experimental feature. It might be
953							really slow or eat your XML. You have been warned.
954
955							=cut
956
957	0			0	1		sub tidy {
958	0						my $xml = pop;
959	0						my $level = 2;
960	0						my $indent = 0;
961							my @tidy = ();
962
963	0						# normalize tag breaks
964							$xml =~ s,>\s*<,>\n<,gs;
965	0
966							my @xmlarr = split( m/\n/, $xml );
967
968	0	0	0				# shift off declaration
969	0						if ( scalar(@xmlarr) and $xmlarr[0] =~ m/^<\?\s*xml/ ) {
970							push @tidy, shift(@xmlarr);
971							}
972	0
973	0						my $count = 0;
974							for my $el (@xmlarr) {
975	0	0
976	0						if ( $count == 1 ) {
977							$indent = 2;
978	0	0					}
979	0						if ( $count == scalar(@xmlarr) - 1 ) {
980							$indent = 0;
981							}
982
983							#warn "el: $el\n";
984
985	0	0					# singletons get special treatment
		0
986							if ( $el =~ m/^<([\w])+[^>]*\/>$/ ) {
987	0
988							push @tidy, ( ' ' x $indent ) . $el;
989							}
990
991							# match opening tag
992							elsif ( $el =~ m/^<([\w])+[^>]*>$/ ) {
993
994	0						#warn "open $indent\n";
995	0						push @tidy, ( ' ' x $indent ) . $el;
996							$indent += $level;
997							}
998	0	0					else {
999							if ( $el =~ m/^<\// ) {
1000
1001	0						#warn "close $indent\n";
1002							$indent -= $level; # closing tag
1003	0	0					}
1004	0						if ( $indent < 0 ) {
1005							$indent += $level;
1006	0						}
1007							push @tidy, ( ' ' x $indent ) . $el;
1008							}
1009
1010							#warn "indent = $indent\n";
1011
1012	0						#Data::Dump::dump \@tidy;
1013							$count++;
1014							}
1015	0
1016							return join( "\n", @tidy );
1017
1018							}
1019
1020							1;
1021							__END__