File Coverage

blib/lib/LaTeX/Recode.pm
Criterion Covered Total %
statement 155 175 88.5
branch 46 70 65.7
condition 2 4 50.0
subroutine 22 22 100.0
pod 2 2 100.0
total 227 273 83.1


line stmt bran cond sub pod time code
1             package LaTeX::Recode;
2              
3 2     2   207763 use strict;
  2         21  
  2         61  
4 2     2   11 use warnings;
  2         4  
  2         62  
5 2     2   1010 use parent qw(Exporter);
  2         591  
  2         11  
6 2     2   118 use re 'eval';
  2         4  
  2         126  
7 2     2   1193 use Encode;
  2         22195  
  2         137  
8 2     2   1197 use File::Slurp;
  2         56017  
  2         136  
9 2     2   20 use File::Spec;
  2         5  
  2         45  
10 2     2   1272 use Unicode::Normalize;
  2         4592  
  2         142  
11 2     2   15 use List::Util qw (first);
  2         4  
  2         141  
12 2     2   1129 use XML::LibXML::Simple;
  2         89154  
  2         136  
13 2     2   670 use utf8;
  2         19  
  2         16  
14              
15 2     2   1193 use File::ShareDir 'module_file';
  2         49802  
  2         729  
16              
17             our @EXPORT = qw(latex_encode latex_decode);
18              
19             sub import {
20 2     2   41 my ($self, %opts) = @_;
21            
22 2         10 my $sets = qr/^(?:null|base|full)$/i;
23 2 50 50     28 my $encode_set = ($opts{encode_set}||"") =~ m!$sets! ? lc $opts{encode_set} : "base";
24 2 50 50     31 my $decode_set = ($opts{decode_set}||"") =~ m!$sets! ? lc $opts{decode_set} : "base";
25            
26 2         7 _init_sets($decode_set, $encode_set);
27              
28 2         3740 $self->export_to_level(1, undef, @EXPORT);
29             }
30              
31             =encoding utf-8
32              
33             =head1 NAME
34              
35             LaTeX::Recode - Encode/Decode chars to/from UTF-8/macros in LaTeX
36              
37             =head1 SYNOPSIS
38              
39             use LaTeX::Recode;
40              
41             my $string = 'Muḥammad ibn Mūsā al-Khwārizmī';
42             my $latex_string = latex_encode($string);
43             # => 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}'
44              
45             my $string = 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}';
46             my $utf8_string = latex_decode($string);
47             # => 'Muḥammad ibn Mūsā al-Khwārizmī'
48              
49              
50             # if you want to define a different conversion set (either
51             # for encoding or decoding):
52             use LaTeX::Recode encode_set => 'full', decode_set => 'base';
53              
54              
55             =head1 DESCRIPTION
56              
57             Allows conversion between Unicode chars and LaTeX macros.
58              
59             =head1 GLOBAL OPTIONS
60              
61             Possible values for the encoding/decoding set to use are 'null', 'base' and 'full';
62             default value is 'base'.
63              
64             null => No conversion
65              
66             base => Most common macros and diacritics (sufficient for Western languages
67             and common symbols)
68              
69             full => Also converts punctuation, larger range of diacritics and macros
70             (e.g. for IPA, Latin Extended Additional, etc.), symbols, Greek letters,
71             dingbats, negated symbols, and superscript characters and symbols ...
72              
73             =cut
74              
75             our ($remap_d, $remap_e, $remap_e_raw, $set_d, $set_e);
76              
77              
78              
79              
80             =head2 latex_decode($text, @options)
81              
82             Converts LaTeX macros in the $text to Unicode characters.
83              
84             The function accepts a number of options:
85              
86             * normalize => $bool (default 1)
87             whether the output string should be normalized with Unicode::Normalize
88              
89             * normalization => (default 'NFD')
90             and if yes, the normalization form to use (see the Unicode::Normalize documentation)
91              
92             =cut
93              
94             sub latex_decode {
95 4     4 1 3644 my $text = shift;
96              
97             # Optimisation - if there are no macros, no point doing anything
98 4 50       21 return $text unless $text =~ m/\\/;
99              
100             # Optimisation - if virtual null set was specified, do nothing
101 4 50       12 return $text if $set_d eq 'null';
102              
103 4         8 my %opts = @_;
104 4 50       11 my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
105             my $norm_form
106 4 50       10 = exists $opts{normalization} ? $opts{normalization} : 'NFD';
107              
108             # Deal with raw TeX \char macros.
109 2     2   15 $text =~ s/\\char"(\p{ASCII_Hex_Digit}+)/"chr(0x$1)"/gee; # hex chars
  2         4  
  2         28  
  4         9  
  0         0  
110 4         9 $text =~ s/\\char'(\d+)/"chr(0$1)"/gee; # octal chars
  0         0  
111 4         6 $text =~ s/\\char(\d+)/"chr($1)"/gee; # decimal chars
  0         0  
112              
113 4         14 $text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
114 4         26 $text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
115              
116 4         11 foreach my $type (
117             'greek', 'dings',
118             'punctuation', 'symbols',
119             'negatedsymbols', 'superscripts',
120             'cmdsuperscripts', 'letters',
121             'diacritics'
122             )
123             {
124 36         135 my $map = $remap_d->{$type}{map};
125 36         61 my $re = $remap_d->{$type}{re};
126 36 50       71 next unless $re; # Might not be present depending on set
127              
128 36 100       167 if ( $type eq 'negatedsymbols' ) {
    100          
    100          
    100          
    100          
    100          
    50          
129 4         62 $text =~ s/\\not\\($re)/$map->{$1}/ge;
  0         0  
130             }
131             elsif ( $type eq 'superscripts' ) {
132 4         48 $text =~ s/\\textsuperscript\{($re)\}/$map->{$1}/ge;
  0         0  
133             }
134             elsif ( $type eq 'cmdsuperscripts' ) {
135 4         45 $text =~ s/\\textsuperscript\{\\($re)\}/$map->{$1}/ge;
  0         0  
136             }
137             elsif ( $type eq 'dings' ) {
138 4         11 $text =~ s/\\ding\{([2-9AF][0-9A-F])\}/$map->{$1}/ge;
  0         0  
139             }
140             elsif ( $type eq 'letters' ) {
141 4         306 $text =~ s/\\($re)(?:\{\}|\s+|\b)/$map->{$1}/ge;
  1         10  
142             }
143 36     36   68 elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
144             {
145 12         2368 $text =~ s/\\($re)(?: \{\}|\s+|\b)/$map->{$1}/ge;
  0         0  
146             }
147             elsif ( $type eq 'diacritics' ) {
148 4         114 $text =~ s/\\($re)\s*\{(\pL\pM*)\}/$2 . $map->{$1}/ge;
  12         1077  
149              
150             # Conditional regexp with code-block condition
151             # non letter macros for diacritics (e.g. \=) can be followed by any letter
152             # but letter diacritic macros (e.g \c) can't (\cS) horribly Broken
153             #
154             # If the RE for the macro doesn't end with a basic LaTeX macro letter (\=), then
155             # next char can be any letter (\=d)
156             # Else if it did end with a normal LaTeX macro letter (\c), then
157             # If this was followed by a space (\c )
158             # Any letter is allowed after the space (\c S)
159             # Else
160             # Only a non basic LaTeX letter is allowed (\c-)
161 4         130 $text =~ s/\\# slash
162             ($re)# the diacritic
163             (\s*)# optional space
164             (# capture paren
165 0         0 (?(?{$1 !~ m:[A-Za-z]$:})# code block condition (is not a letter?)
166             \pL # yes pattern
167             | # no pattern
168             (?(?{$2}) # code block condition (space matched earlier after diacritic?)
169             \pL # yes pattern
170             | # no pattern
171             [^A-Za-z]
172             ) # close conditional
173             ) # close conditional
174             \pM* # optional marks
175             ) # capture paren
176 0         0 /$3 . $map->{$1}/gxe;
177             }
178             }
179              
180             # Now remove braces around single letters with diacritics (which the replace above
181             # can result in). Things like '{á}'. Such things can break kerning. We can't do this in
182             # the RE above as we can't determine if the braces are wrapping a phrase because this
183             # match is on an entire file string. So we can't in one step tell the difference between:
184             #
185             # author = {Andr\'e}
186             # and
187             # author = {Andr\'{e}}
188             #
189             # when this is part of a (much) larger string
190             #
191             # We don't want to do this if it would result in a broken macro name like with
192             # \textupper{é}
193             #
194             # Workaround perl's lack of variable-width negative look-behind -
195             # Reverse string (and therefore some of the Re) and use variable width negative look-ahead
196 4         664 $text = reverse $text;
197 4         13 $text =~ s/}(\pM+\pL)\{(?!\pL+\\)/$1/g;
198 4         9 $text = reverse $text;
199              
200 4 50       21 return $norm ? Unicode::Normalize::normalize( $norm_form, $text ) : $text;
201            
202             }
203              
204             =head2 latex_encode($text, @options)
205              
206             Converts UTF-8 to LaTeX
207              
208             =cut
209              
210             sub latex_encode {
211 4     4 1 3500 my $text = shift;
212              
213             # Optimisation - if virtual null set was specified, do nothing
214 4 50       15 return $text if $set_e eq 'null';
215              
216 4         42 $text = NFD($text);
217              
218 4         9 foreach my $type (
219             qw'greek dings negatedsymbols superscripts cmdsuperscripts
220             diacritics letters punctuation symbols'
221             )
222             {
223 36         300 my $map = $remap_e->{$type}{map};
224 36         63 my $re = $remap_e->{$type}{re};
225 36 50       81 next unless $re; # Might not be present depending on set
226              
227 36 100       177 if ( $type eq 'negatedsymbols' ) {
    100          
    100          
    100          
    100          
    100          
    50          
228 4         85 $text =~ s/($re)/"{\$\\not\\" . $map->{$1} . '$}'/ge;
  0         0  
229             }
230             elsif ( $type eq 'superscripts' ) {
231 4         71 $text =~ s/($re)/'\textsuperscript{' . $map->{$1} . '}'/ge;
  0         0  
232             }
233             elsif ( $type eq 'cmdsuperscripts' ) {
234 4         43 $text =~ s/($re)/"\\textsuperscript{\\" . $map->{$1} . "}"/ge;
  0         0  
235             }
236             elsif ( $type eq 'dings' ) {
237 4         1008 $text =~ s/($re)/'\ding{' . $map->{$1} . '}'/ge;
  0         0  
238             }
239             elsif ( $type eq 'letters' ) {
240              
241             # General macros (excluding special encoding excludes)
242 4         765 $text
243 1 50       17 =~ s/($re)/($remap_e_raw->{$1} ? '' : "\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '{}')/ge;
    50          
244             }
245 36     36   80 elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
246             {
247             # Math mode macros (excluding special encoding excludes)
248 12         5597 $text
249 0 0       0 =~ s/($re)/($remap_e_raw->{$1} ? '' : "{\$\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '$}')/ge;
    0          
250             }
251             elsif ( $type eq 'diacritics' ) {
252              
253             # special case such as "i\x{304}" -> '\={\i}' -> "i" needs the dot removing for accents
254 4         135 $text =~ s/i($re)/"\\" . $map->{$1} . '{\i}'/ge;
  0         0  
255              
256 4         56 $text =~ s/\{(\pL\pM*)\}($re)/"\\" . $map->{$2} . "{$1}"/ge;
  0         0  
257 4         490 $text =~ s/(\pL\pM*)($re)/"\\" . $map->{$2} . "{$1}"/ge;
  12         393  
258              
259 4         84 $text =~ s{
260             (\PM)($re)($re)($re)
261             }{
262 0         0 "\\" . $map->{$4} . "{\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}}'
263             }gex;
264 4         403 $text =~ s{
265             (\PM)($re)($re)
266             }{
267 0         0 "\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}'
268             }gex;
269 4         392 $text =~ s{
270             (\PM)($re)
271             }{
272 0         0 "\\" . $map->{$2} . "{$1}"
273             }gex;
274             }
275             }
276              
277 4         25 return $text;
278             }
279              
280              
281             =head2 _init_sets(, )
282              
283             Initialise recoding sets.
284             This is a private method, and its direct usage should not be needed
285             in normal circunstances.
286              
287             =cut
288              
289              
290             sub _init_sets {
291 2     2   5 ( $set_d, $set_e ) = @_;
292 2     2   49998 no autovivification;
  2         1734  
  2         10  
293              
294             # Reset these, mostly for tests which call init_sets more than once
295 2         4 $remap_d = {};
296 2         5 $remap_e = {};
297 2         4 $remap_e_raw = {};
298              
299 2         13 my $mapdata = module_file( 'LaTeX::Recode' => "recode_data.xml" );
300              
301             # Read driver config file
302 2 50       728 my $xml = File::Slurp::read_file($mapdata)
303             or die("Can't read file $mapdata");
304 2         559 my $doc = XML::LibXML->load_xml( string => decode( 'UTF-8', $xml ) );
305 2         7974 my $xpc = XML::LibXML::XPathContext->new($doc);
306              
307 2         11 my @types = qw(letters diacritics punctuation symbols negatedsymbols
308             superscripts cmdsuperscripts dings greek);
309              
310             # Have to have separate loops for decode/recode or you can't have independent
311             # decode/recode sets
312              
313             # Construct decode set
314 2         6 foreach my $type (@types) {
315 18         615 foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
316 20         1659 my @set = split( /\s*,\s*/, $maps->getAttribute('set') );
317 20 50   28   411 next unless first { $set_d eq $_ } @set;
  28         77  
318 20         98 foreach my $map ( $maps->findnodes('map') ) {
319 1668         31691 my $from = $map->findnodes('from')->shift();
320 1668         50961 my $to = $map->findnodes('to')->shift();
321 1668         59692 $remap_d->{$type}{map}{ NFD( $from->textContent() ) }
322             = NFD( $to->textContent() );
323             }
324             }
325              
326             # Things we don't want to change when decoding as this breaks some things
327 18         10849 foreach my $d ( $xpc->findnodes('/texmap/decode_exclude/char') ) {
328 72         2313 delete( $remap_d->{$type}{map}{ NFD( $d->textContent() ) } );
329             }
330             }
331              
332             # Construct encode set
333 2         80 foreach my $type (@types) {
334 18         2932 foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
335 20         1090 my @set = split( /\s*,\s*/, $maps->getAttribute('set') );
336 20 50   28   404 next unless first { $set_e eq $_ } @set;
  28         79  
337 20         91 foreach my $map ( $maps->findnodes('map') ) {
338 1668         32129 my $from = $map->findnodes('from')->shift();
339 1668         51197 my $to = $map->findnodes('to')->shift();
340 1668         60063 $remap_e->{$type}{map}{ NFD( $to->textContent() ) }
341             = NFD( $from->textContent() );
342             }
343              
344             # There are some duplicates in the data to handle preferred encodings.
345 20         379 foreach my $map ( $maps->findnodes('map[from[@preferred]]') ) {
346 90         10725 my $from = $map->findnodes('from')->shift();
347 90         2646 my $to = $map->findnodes('to')->shift();
348 90         3085 $remap_e->{$type}{map}{ NFD( $to->textContent() ) }
349             = NFD( $from->textContent() );
350             }
351              
352             # Some things might need to be inserted as is rather than wrapped
353             # in some macro/braces
354 20         4618 foreach my $map ( $maps->findnodes('map[from[@raw]]') ) {
355 10         320 my $from = $map->findnodes('from')->shift();
356 10         298 my $to = $map->findnodes('to')->shift();
357 10         352 $remap_e_raw->{ NFD( $to->textContent() ) } = 1;
358             }
359              
360             }
361              
362             # Things we don't want to change when encoding as this would break LaTeX
363 18         2506 foreach my $e ( $xpc->findnodes('/texmap/encode_exclude/char') ) {
364 450         3926 delete( $remap_e->{$type}{map}{ NFD( $e->textContent() ) } );
365             }
366             }
367              
368             # Populate the decode regexps
369             # sort by descending length of macro name to avoid shorter macros which
370             # are substrings of longer ones damaging the longer ones
371 2         352 foreach my $type (@types) {
372 18 50       89 next unless exists $remap_d->{$type};
373             $remap_d->{$type}{re} = join( '|',
374 1650 100       3489 map { /[\.\^\|\+\-\)\(]/ ? '\\' . $_ : $_ }
375 7771         10102 sort { length($b) <=> length($a) }
376 18         31 keys %{$remap_d->{$type}{map}} );
  18         573  
377 18         6327 $remap_d->{$type}{re} = qr|$remap_d->{$type}{re}|;
378             }
379              
380             # Populate the encode regexps
381 2         18 foreach my $type (@types) {
382 18 50       99 next unless exists $remap_e->{$type};
383             $remap_e->{$type}{re} = join( '|',
384 1528 50       2753 map { /[\.\^\|\+\-\)\(]/ ? '\\' . $_ : $_ }
385 18         31 sort keys %{ $remap_e->{$type}{map} } );
  18         661  
386 18         8935 $remap_e->{$type}{re} = qr|$remap_e->{$type}{re}|;
387             }
388             }
389              
390             1;
391              
392             __END__