File Coverage

lib/MKDoc/XML/Tagger.pm

Criterion	Covered	Total	%
statement	106	109	97.2
branch	4	4	100.0
condition	2	4	50.0
subroutine	12	13	92.3
pod	2	2	100.0
total	126	132	95.4

line	stmt	bran	cond	sub	pod	time	code
1							# -------------------------------------------------------------------------------------
2							# MKDoc::XML::Tagger
3							# -------------------------------------------------------------------------------------
4							# Author : Jean-Michel Hiver.
5							# Copyright : (c) MKDoc Holdings Ltd, 2003
6							#
7							# This module adds markup to an existing XML file / variable by matching expression.
8							# You could see it as an XML-compatible search and substitute module.
9							#
10							# The main reason it exists is to automagically hyperlink HTML in MKDoc, and also to
11							# mark up properly abbreviations based on glossaries.
12							#
13							# This module is distributed under the same license as Perl itself.
14							# -------------------------------------------------------------------------------------
15							package MKDoc::XML::Tagger;
16	6			6		50584	use MKDoc::XML::Tokenizer;
	6					19
	6					297
17	6			6		38	use strict;
	6					13
	6					203
18	6			6		29	use warnings;
	6					11
	6					151
19	6			6		8222	use utf8;
	6					106
	6					33
20
21							our $tags = [];
22							our $Ignorable_RE = qr /(?:\r\|\n\|\s\|(?:\&$\d+$))*/;
23
24							our @DONT_TAG = qw/a/;
25
26							##
27							# $class->process_data ($xml, @expressions);
28							# ------------------------------------------
29							# Tags $xml with @expressions, where expression is a list of hashes.
30							#
31							# For example:
32							#
33							# MKDoc::XML::Tagger->process (
34							# 'I like oranges and bananas',
35							# { _expr => 'oranges', _tag => 'a', href => 'http://www.google.com?q=oranges' },
36							# { _expr => 'bananas', _tag => 'a', href => 'http://www.google.com?q=bananas' },
37							#
38							# Will return
39							#
40							# 'I like oranges and \
41							# bananas.
42							##
43							sub process_data
44							{
45	14			14	1	8833	my $class = shift;
46	14					98	my $tokens = MKDoc::XML::Tokenizer->process_data (shift);
47	14					55	return _replace ($tokens, @_);
48							}
49
50
51							##
52							# $class->process_file ($file, @expressions);
53							# -------------------------------------------
54							# Same as $class->process_data ($data, @expressions), except that $data is read
55							# from $file.
56							##
57							sub process_file
58							{
59	0			0	1	0	my $class = shift;
60	0					0	my $tokens = MKDoc::XML::Tokenizer->process_file (shift);
61	0					0	return _replace ($tokens, @_);
62							}
63
64
65							##
66							# _replace ($tokens, @expressions);
67							# ---------------------------------
68							# This function constructs the newly marked up text from a list
69							# of XML $tokens and a list of @expressions and returns it.
70							#
71							# Longest expressions are applied first.
72							##
73							sub _replace
74							{
75	14			14		26	my $tokens = shift;
76	14					56	my @expr = sort { length ($b->{_expr}) <=> length ($a->{_expr}) } @_;
	2					12
77
78	16					33	@expr = map {
79	14					31	my $hash = \%{$_};
	16					21
80	16					38	for (keys %{$hash}) {
	16					69
81	51					185	$hash->{$_} =~ s/\&/\&/g;
82	51					163	$hash->{$_} =~ s/\
83	51					103	$hash->{$_} =~ s/\>/\>/g;
84	51					483	$hash->{$_} =~ s/\"/\"/g;
85							};
86	16					60	$hash;
87							} @expr;
88
89	14					24	my $text; local $tags;
	14					19
90	14					52	($text, $tags) = _segregate_markup_from_text ($tokens);
91
92							# once we have segregated markup from the text, we can safely
93							# encode < and > and "...
94							# $text =~ s/\&/\&/g; # seems to be already encoded... where do we encode this stuff !?!
95	14					36	$text =~ s/\
96	14					26	$text =~ s/\>/\>/g;
97	14					43	$text =~ s/\"/\"/g;
98
99							# but we don't want any '
100	14					35	$text =~ s/\'/\'/g;
101
102							# @expr = _filter_out ($text, @expr);
103	14					51	while (my $attr = shift (@expr))
104							{
105	16					24	my %attr = %{$attr};
	16					84
106	16		50			70	my $tag = delete $attr{_tag} \|\| next;
107	16		50			58	my $expr = delete $attr{_expr} \|\| next;
108	16					92	$text = _text_replace ($text, $expr, $tag, \%attr);
109							}
110
111	14					361	while ($text =~ /\&$\d+$/)
112							{
113	15					37	for (my $i = 0; $i < @{$tags}; $i++)
	57					193
114							{
115	42					137	my $c = $i + 1;
116	42					75	my $tag = $tags->[$i];
117	42					580	$text =~ s/\&$$c$/$tag/g;
118							}
119							}
120
121	14					140	return $text;
122							}
123
124
125							##
126							# _text_replace ($text, $expr, $tag, $attr);
127							# ------------------------------------------
128							# Replaces all $text, $expr, $tag, $attr.
129							##
130							sub _text_replace
131							{
132	16			16		31	my $text = shift;
133	16					26	my $expr = shift;
134	16					36	my $tag = shift;
135	16					32	my $attr = shift;
136
137	16					39	my $re = _expression_to_regex ($expr);
138	16					46	my $tag1 = _tag_open ($tag, $attr);
139	16					55	my $tag2 = _tag_close ($tag, $attr);
140
141							# let's treat beginning and end of string as spaces,
142							# it makes the regular expressions much easier.
143	16					63	$text = " $text ";
144
145	6			6		75	my %expr = map { $_ => 1 } $text =~
	6					11
	6					278
	16					614
	14					215385
146							/(?<=\p{IsSpace}\|\p{IsPunct}\|\&)($re)(?=\p{IsSpace}\|\p{IsPunct}\|\&)/gi;
147
148	16					26308	foreach (keys %expr)
149							{
150	13					35	my $to_replace = quotemeta ($_);
151	13					27	my $replacement = $_;
152	13					39	$replacement =~ s/(\&$\d+$)/$tag2$1$tag1/g;
153	13					40	$replacement = "$tag1$replacement$tag2";
154
155							# Double hyperlinking fix
156							# JM - 2004-01-23
157	13					125	push @{$tags}, $replacement;
	13					38
158	13					27	my $rep = '&(' . @{$tags} . ')';
	13					159
159	13					295	$text =~ s/(?<=\p{IsSpace}\|\p{IsPunct}\|\&)$to_replace(?=\p{IsSpace}\|\p{IsPunct}\|\&)/$rep/g;
160							# matching placeholders fix Bruno 2005-03-10
161	13					4738	my $rep_quoted = quotemeta ($rep);
162	13					144	$text =~ s/&$$rep_quoted$/&($to_replace)/g;
163							}
164
165							# remove the first and last space which we previously inserted for
166							# ease-of-regex purposes.
167	16					78	$text =~ s/^ //;
168	16					162	$text =~ s/ $//;
169	16					198	return $text;
170							}
171
172
173							##
174							# _segregate_markup_from_text ($tokens);
175							# --------------------------------------
176							# From an array reference of tokens, returns text with
177							# placeholders for markup, followed by an array reference
178							# of markup tokens.
179							#
180							# Example:
181							#
182							# [ '', 'Hello ', ' ', 'World', '' ]
183							#
184							# becomes
185							#
186							# ( '&(1)Hello &(2)World&(3)', [ '', ' ', '' ] )
187							##
188							sub _segregate_markup_from_text
189							{
190	15			15		34	my $tokens = shift;
191	15					26	my @tags = ();
192	15					38	my $res = '';
193
194	15					95	for (@{$tokens})
	15					36
195							{
196	50					108	$_ = $$_; # replace the token object by its value
197	50	100				364	/^
198	28					48	push @tags, $_;
199	28					152	$res .= '&(' . @tags . ')';
200	28					57	next;
201							};
202	22					55	$res .= $_;
203							}
204
205	15					62	return $res, \@tags;
206							}
207
208
209							##
210							# _expression_to_regex ($expr);
211							# -----------------------------
212							# Turns $expr into a regular expression that will match
213							# all segregated text which should match this expression.
214							##
215							sub _expression_to_regex
216							{
217	16			16		27	my $text = shift;
218	16					35	$text = lc ($text);
219	16					58	$text =~ s/^(?:\s\|\r\|\n)+//;
220	16					76	$text =~ s/(?:\s\|\r\|\n)+$//;
221
222	16					102	my @split = split /(?:\s\|\r\|\n)+/, $text;
223	16					36	$text = join $Ignorable_RE, map { quotemeta ($_) } @split;
	23					84
224
225	16					49	return $text;
226							}
227
228
229							##
230							# _tag_open ($tag_name, $tag_attributes);
231							# ---------------------------------------
232							# Turns a structure representing an opening tag into
233							# a string representing an opening tag.
234							##
235							sub _tag_open
236							{
237	18			18		2601	my $tag = shift;
238	18					30	my $attr = shift;
239
240	20					43	my $attr_str = join ' ', map { $_ . '=' . do {
	18					58
241	20					39	my $val = $attr->{$_};
242	20					82	"\"$val\"";
243	18					28	} } keys %{$attr};
244
245	18	100				412	return $attr_str ? "<$tag $attr_str>" : "<$tag>";
246							}
247
248
249							##
250							# _tag_close ($tag_name);
251							# -----------------------
252							# Turns a structure representing an closing tag into
253							# a string representing a closing tag.
254							##
255							sub _tag_close
256							{
257	17			17		41	my $tag = shift;
258	17					57	return "";
259							}
260
261
262							1;
263
264
265							__END__