| lib/MKDoc/XML/Tagger.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 106 | 109 | 97.2 |
| branch | 4 | 4 | 100.0 |
| condition | 2 | 4 | 50.0 |
| subroutine | 12 | 13 | 92.3 |
| pod | 2 | 2 | 100.0 |
| total | 126 | 132 | 95.4 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | # ------------------------------------------------------------------------------------- | ||||||
| 2 | # MKDoc::XML::Tagger | ||||||
| 3 | # ------------------------------------------------------------------------------------- | ||||||
| 4 | # Author : Jean-Michel Hiver. | ||||||
| 5 | # Copyright : (c) MKDoc Holdings Ltd, 2003 | ||||||
| 6 | # | ||||||
| 7 | # This module adds markup to an existing XML file / variable by matching expression. | ||||||
| 8 | # You could see it as an XML-compatible search and substitute module. | ||||||
| 9 | # | ||||||
| 10 | # The main reason it exists is to automagically hyperlink HTML in MKDoc, and also to | ||||||
| 11 | # mark up properly abbreviations based on glossaries. | ||||||
| 12 | # | ||||||
| 13 | # This module is distributed under the same license as Perl itself. | ||||||
| 14 | # ------------------------------------------------------------------------------------- | ||||||
| 15 | package MKDoc::XML::Tagger; | ||||||
| 16 | 6 | 6 | 50584 | use MKDoc::XML::Tokenizer; | |||
| 6 | 19 | ||||||
| 6 | 297 | ||||||
| 17 | 6 | 6 | 38 | use strict; | |||
| 6 | 13 | ||||||
| 6 | 203 | ||||||
| 18 | 6 | 6 | 29 | use warnings; | |||
| 6 | 11 | ||||||
| 6 | 151 | ||||||
| 19 | 6 | 6 | 8222 | use utf8; | |||
| 6 | 106 | ||||||
| 6 | 33 | ||||||
| 20 | |||||||
| 21 | our $tags = []; | ||||||
| 22 | our $Ignorable_RE = qr /(?:\r|\n|\s|(?:\&\(\d+\)))*/; | ||||||
| 23 | |||||||
| 24 | our @DONT_TAG = qw/a/; | ||||||
| 25 | |||||||
| 26 | ## | ||||||
| 27 | # $class->process_data ($xml, @expressions); | ||||||
| 28 | # ------------------------------------------ | ||||||
| 29 | # Tags $xml with @expressions, where expression is a list of hashes. | ||||||
| 30 | # | ||||||
| 31 | # For example: | ||||||
| 32 | # | ||||||
| 33 | # MKDoc::XML::Tagger->process ( | ||||||
| 34 | # 'I like oranges and bananas', | ||||||
| 35 | # { _expr => 'oranges', _tag => 'a', href => 'http://www.google.com?q=oranges' }, | ||||||
| 36 | # { _expr => 'bananas', _tag => 'a', href => 'http://www.google.com?q=bananas' }, | ||||||
| 37 | # | ||||||
| 38 | # Will return | ||||||
| 39 | # | ||||||
| 40 | # 'I like oranges and \ | ||||||
| 41 | # bananas. | ||||||
| 42 | ## | ||||||
| 43 | sub process_data | ||||||
| 44 | { | ||||||
| 45 | 14 | 14 | 1 | 8833 | my $class = shift; | ||
| 46 | 14 | 98 | my $tokens = MKDoc::XML::Tokenizer->process_data (shift); | ||||
| 47 | 14 | 55 | return _replace ($tokens, @_); | ||||
| 48 | } | ||||||
| 49 | |||||||
| 50 | |||||||
| 51 | ## | ||||||
| 52 | # $class->process_file ($file, @expressions); | ||||||
| 53 | # ------------------------------------------- | ||||||
| 54 | # Same as $class->process_data ($data, @expressions), except that $data is read | ||||||
| 55 | # from $file. | ||||||
| 56 | ## | ||||||
| 57 | sub process_file | ||||||
| 58 | { | ||||||
| 59 | 0 | 0 | 1 | 0 | my $class = shift; | ||
| 60 | 0 | 0 | my $tokens = MKDoc::XML::Tokenizer->process_file (shift); | ||||
| 61 | 0 | 0 | return _replace ($tokens, @_); | ||||
| 62 | } | ||||||
| 63 | |||||||
| 64 | |||||||
| 65 | ## | ||||||
| 66 | # _replace ($tokens, @expressions); | ||||||
| 67 | # --------------------------------- | ||||||
| 68 | # This function constructs the newly marked up text from a list | ||||||
| 69 | # of XML $tokens and a list of @expressions and returns it. | ||||||
| 70 | # | ||||||
| 71 | # Longest expressions are applied first. | ||||||
| 72 | ## | ||||||
| 73 | sub _replace | ||||||
| 74 | { | ||||||
| 75 | 14 | 14 | 26 | my $tokens = shift; | |||
| 76 | 14 | 56 | my @expr = sort { length ($b->{_expr}) <=> length ($a->{_expr}) } @_; | ||||
| 2 | 12 | ||||||
| 77 | |||||||
| 78 | 16 | 33 | @expr = map { | ||||
| 79 | 14 | 31 | my $hash = \%{$_}; | ||||
| 16 | 21 | ||||||
| 80 | 16 | 38 | for (keys %{$hash}) { | ||||
| 16 | 69 | ||||||
| 81 | 51 | 185 | $hash->{$_} =~ s/\&/\&/g; | ||||
| 82 | 51 | 163 | $hash->{$_} =~ s/\\</g; | ||||
| 83 | 51 | 103 | $hash->{$_} =~ s/\>/\>/g; | ||||
| 84 | 51 | 483 | $hash->{$_} =~ s/\"/\"/g; | ||||
| 85 | }; | ||||||
| 86 | 16 | 60 | $hash; | ||||
| 87 | } @expr; | ||||||
| 88 | |||||||
| 89 | 14 | 24 | my $text; local $tags; | ||||
| 14 | 19 | ||||||
| 90 | 14 | 52 | ($text, $tags) = _segregate_markup_from_text ($tokens); | ||||
| 91 | |||||||
| 92 | # once we have segregated markup from the text, we can safely | ||||||
| 93 | # encode < and > and "... | ||||||
| 94 | # $text =~ s/\&/\&/g; # seems to be already encoded... where do we encode this stuff !?! | ||||||
| 95 | 14 | 36 | $text =~ s/\\</g; | ||||
| 96 | 14 | 26 | $text =~ s/\>/\>/g; | ||||
| 97 | 14 | 43 | $text =~ s/\"/\"/g; | ||||
| 98 | |||||||
| 99 | # but we don't want any ' | ||||||
| 100 | 14 | 35 | $text =~ s/\'/\'/g; | ||||
| 101 | |||||||
| 102 | # @expr = _filter_out ($text, @expr); | ||||||
| 103 | 14 | 51 | while (my $attr = shift (@expr)) | ||||
| 104 | { | ||||||
| 105 | 16 | 24 | my %attr = %{$attr}; | ||||
| 16 | 84 | ||||||
| 106 | 16 | 50 | 70 | my $tag = delete $attr{_tag} || next; | |||
| 107 | 16 | 50 | 58 | my $expr = delete $attr{_expr} || next; | |||
| 108 | 16 | 92 | $text = _text_replace ($text, $expr, $tag, \%attr); | ||||
| 109 | } | ||||||
| 110 | |||||||
| 111 | 14 | 361 | while ($text =~ /\&\(\d+\)/) | ||||
| 112 | { | ||||||
| 113 | 15 | 37 | for (my $i = 0; $i < @{$tags}; $i++) | ||||
| 57 | 193 | ||||||
| 114 | { | ||||||
| 115 | 42 | 137 | my $c = $i + 1; | ||||
| 116 | 42 | 75 | my $tag = $tags->[$i]; | ||||
| 117 | 42 | 580 | $text =~ s/\&\($c\)/$tag/g; | ||||
| 118 | } | ||||||
| 119 | } | ||||||
| 120 | |||||||
| 121 | 14 | 140 | return $text; | ||||
| 122 | } | ||||||
| 123 | |||||||
| 124 | |||||||
| 125 | ## | ||||||
| 126 | # _text_replace ($text, $expr, $tag, $attr); | ||||||
| 127 | # ------------------------------------------ | ||||||
| 128 | # Replaces all $text, $expr, $tag, $attr. | ||||||
| 129 | ## | ||||||
| 130 | sub _text_replace | ||||||
| 131 | { | ||||||
| 132 | 16 | 16 | 31 | my $text = shift; | |||
| 133 | 16 | 26 | my $expr = shift; | ||||
| 134 | 16 | 36 | my $tag = shift; | ||||
| 135 | 16 | 32 | my $attr = shift; | ||||
| 136 | |||||||
| 137 | 16 | 39 | my $re = _expression_to_regex ($expr); | ||||
| 138 | 16 | 46 | my $tag1 = _tag_open ($tag, $attr); | ||||
| 139 | 16 | 55 | my $tag2 = _tag_close ($tag, $attr); | ||||
| 140 | |||||||
| 141 | # let's treat beginning and end of string as spaces, | ||||||
| 142 | # it makes the regular expressions much easier. | ||||||
| 143 | 16 | 63 | $text = " $text "; | ||||
| 144 | |||||||
| 145 | 6 | 6 | 75 | my %expr = map { $_ => 1 } $text =~ | |||
| 6 | 11 | ||||||
| 6 | 278 | ||||||
| 16 | 614 | ||||||
| 14 | 215385 | ||||||
| 146 | /(?<=\p{IsSpace}|\p{IsPunct}|\&)($re)(?=\p{IsSpace}|\p{IsPunct}|\&)/gi; | ||||||
| 147 | |||||||
| 148 | 16 | 26308 | foreach (keys %expr) | ||||
| 149 | { | ||||||
| 150 | 13 | 35 | my $to_replace = quotemeta ($_); | ||||
| 151 | 13 | 27 | my $replacement = $_; | ||||
| 152 | 13 | 39 | $replacement =~ s/(\&\(\d+\))/$tag2$1$tag1/g; | ||||
| 153 | 13 | 40 | $replacement = "$tag1$replacement$tag2"; | ||||
| 154 | |||||||
| 155 | # Double hyperlinking fix | ||||||
| 156 | # JM - 2004-01-23 | ||||||
| 157 | 13 | 125 | push @{$tags}, $replacement; | ||||
| 13 | 38 | ||||||
| 158 | 13 | 27 | my $rep = '&(' . @{$tags} . ')'; | ||||
| 13 | 159 | ||||||
| 159 | 13 | 295 | $text =~ s/(?<=\p{IsSpace}|\p{IsPunct}|\&)$to_replace(?=\p{IsSpace}|\p{IsPunct}|\&)/$rep/g; | ||||
| 160 | # matching placeholders fix Bruno 2005-03-10 | ||||||
| 161 | 13 | 4738 | my $rep_quoted = quotemeta ($rep); | ||||
| 162 | 13 | 144 | $text =~ s/&\($rep_quoted\)/&($to_replace)/g; | ||||
| 163 | } | ||||||
| 164 | |||||||
| 165 | # remove the first and last space which we previously inserted for | ||||||
| 166 | # ease-of-regex purposes. | ||||||
| 167 | 16 | 78 | $text =~ s/^ //; | ||||
| 168 | 16 | 162 | $text =~ s/ $//; | ||||
| 169 | 16 | 198 | return $text; | ||||
| 170 | } | ||||||
| 171 | |||||||
| 172 | |||||||
| 173 | ## | ||||||
| 174 | # _segregate_markup_from_text ($tokens); | ||||||
| 175 | # -------------------------------------- | ||||||
| 176 | # From an array reference of tokens, returns text with | ||||||
| 177 | # placeholders for markup, followed by an array reference | ||||||
| 178 | # of markup tokens. | ||||||
| 179 | # | ||||||
| 180 | # Example: | ||||||
| 181 | # | ||||||
| 182 | # [ '', 'Hello ', ' ', 'World', '' ] |
||||||
| 183 | # | ||||||
| 184 | # becomes | ||||||
| 185 | # | ||||||
| 186 | # ( '&(1)Hello &(2)World&(3)', [ '', ' ', '' ] ) |
||||||
| 187 | ## | ||||||
| 188 | sub _segregate_markup_from_text | ||||||
| 189 | { | ||||||
| 190 | 15 | 15 | 34 | my $tokens = shift; | |||
| 191 | 15 | 26 | my @tags = (); | ||||
| 192 | 15 | 38 | my $res = ''; | ||||
| 193 | |||||||
| 194 | 15 | 95 | for (@{$tokens}) | ||||
| 15 | 36 | ||||||
| 195 | { | ||||||
| 196 | 50 | 108 | $_ = $$_; # replace the token object by its value | ||||
| 197 | 50 | 100 | 364 | /^ and do { | |||
| 198 | 28 | 48 | push @tags, $_; | ||||
| 199 | 28 | 152 | $res .= '&(' . @tags . ')'; | ||||
| 200 | 28 | 57 | next; | ||||
| 201 | }; | ||||||
| 202 | 22 | 55 | $res .= $_; | ||||
| 203 | } | ||||||
| 204 | |||||||
| 205 | 15 | 62 | return $res, \@tags; | ||||
| 206 | } | ||||||
| 207 | |||||||
| 208 | |||||||
| 209 | ## | ||||||
| 210 | # _expression_to_regex ($expr); | ||||||
| 211 | # ----------------------------- | ||||||
| 212 | # Turns $expr into a regular expression that will match | ||||||
| 213 | # all segregated text which should match this expression. | ||||||
| 214 | ## | ||||||
| 215 | sub _expression_to_regex | ||||||
| 216 | { | ||||||
| 217 | 16 | 16 | 27 | my $text = shift; | |||
| 218 | 16 | 35 | $text = lc ($text); | ||||
| 219 | 16 | 58 | $text =~ s/^(?:\s|\r|\n)+//; | ||||
| 220 | 16 | 76 | $text =~ s/(?:\s|\r|\n)+$//; | ||||
| 221 | |||||||
| 222 | 16 | 102 | my @split = split /(?:\s|\r|\n)+/, $text; | ||||
| 223 | 16 | 36 | $text = join $Ignorable_RE, map { quotemeta ($_) } @split; | ||||
| 23 | 84 | ||||||
| 224 | |||||||
| 225 | 16 | 49 | return $text; | ||||
| 226 | } | ||||||
| 227 | |||||||
| 228 | |||||||
| 229 | ## | ||||||
| 230 | # _tag_open ($tag_name, $tag_attributes); | ||||||
| 231 | # --------------------------------------- | ||||||
| 232 | # Turns a structure representing an opening tag into | ||||||
| 233 | # a string representing an opening tag. | ||||||
| 234 | ## | ||||||
| 235 | sub _tag_open | ||||||
| 236 | { | ||||||
| 237 | 18 | 18 | 2601 | my $tag = shift; | |||
| 238 | 18 | 30 | my $attr = shift; | ||||
| 239 | |||||||
| 240 | 20 | 43 | my $attr_str = join ' ', map { $_ . '=' . do { | ||||
| 18 | 58 | ||||||
| 241 | 20 | 39 | my $val = $attr->{$_}; | ||||
| 242 | 20 | 82 | "\"$val\""; | ||||
| 243 | 18 | 28 | } } keys %{$attr}; | ||||
| 244 | |||||||
| 245 | 18 | 100 | 412 | return $attr_str ? "<$tag $attr_str>" : "<$tag>"; | |||
| 246 | } | ||||||
| 247 | |||||||
| 248 | |||||||
| 249 | ## | ||||||
| 250 | # _tag_close ($tag_name); | ||||||
| 251 | # ----------------------- | ||||||
| 252 | # Turns a structure representing an closing tag into | ||||||
| 253 | # a string representing a closing tag. | ||||||
| 254 | ## | ||||||
| 255 | sub _tag_close | ||||||
| 256 | { | ||||||
| 257 | 17 | 17 | 41 | my $tag = shift; | |||
| 258 | 17 | 57 | return "$tag>"; | ||||
| 259 | } | ||||||
| 260 | |||||||
| 261 | |||||||
| 262 | 1; | ||||||
| 263 | |||||||
| 264 | |||||||
| 265 | __END__ |