File Coverage

blib/lib/LaTeX/Recode.pm
Criterion Covered Total %
statement 28 30 93.3
branch n/a
condition n/a
subroutine 10 10 100.0
pod n/a
total 38 40 95.0


line stmt bran cond sub pod time code
1             package LaTeX::Recode;
2              
3 2     2   26517 use strict;
  2         3  
  2         44  
4 2     2   6 use warnings;
  2         2  
  2         45  
5 2     2   708 use parent qw(Exporter);
  2         430  
  2         8  
6 2     2   75 use re 'eval';
  2         2  
  2         73  
7 2     2   900 use Encode;
  2         14359  
  2         149  
8 2     2   1041 use File::Slurp;
  2         20129  
  2         107  
9 2     2   9 use File::Spec;
  2         3  
  2         29  
10 2     2   968 use Unicode::Normalize;
  2         323625  
  2         252  
11 2     2   1360 use List::AllUtils qw (first);
  2         30960  
  2         137  
12 2     2   827 use XML::LibXML::Simple;
  0            
  0            
13             use utf8;
14              
15             use File::ShareDir 'module_file';
16              
17             our @EXPORT = qw(latex_encode latex_decode);
18              
19             sub import {
20             my ($self, %opts) = @_;
21            
22             my $sets = qr/^(?:null|base|full)$/i;
23             my $encode_set = ($opts{encode_set}||"") =~ m!$sets! ? lc $opts{encode_set} : "base";
24             my $decode_set = ($opts{decode_set}||"") =~ m!$sets! ? lc $opts{decode_set} : "base";
25            
26             _init_sets($decode_set, $encode_set);
27              
28             $self->export_to_level(1, undef, @EXPORT);
29             }
30              
31             =encoding utf-8
32              
33             =head1 NAME
34              
35             LaTeX::Recode - Encode/Decode chars to/from UTF-8/macros in LaTeX
36              
37             =head1 SYNOPSIS
38              
39             use LaTeX::Recode;
40              
41             my $string = 'Muḥammad ibn Mūsā al-Khwārizmī';
42             my $latex_string = latex_encode($string);
43             # => 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}'
44              
45             my $string = 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}';
46             my $utf8_string = latex_decode($string);
47             # => 'Muḥammad ibn Mūsā al-Khwārizmī'
48              
49              
50             # if you want to define a different conversion set (either
51             # for encoding or decoding):
52             use LaTeX::Recode encode_set => 'full', decode_set => 'base';
53              
54              
55             =head1 DESCRIPTION
56              
57             Allows conversion between Unicode chars and LaTeX macros.
58              
59             =head1 GLOBAL OPTIONS
60              
61             Possible values for the encoding/decoding set to use are 'null', 'base' and 'full';
62             default value is 'base'.
63              
64             null => No conversion
65              
66             base => Most common macros and diacritics (sufficient for Western languages
67             and common symbols)
68              
69             full => Also converts punctuation, larger range of diacritics and macros
70             (e.g. for IPA, Latin Extended Additional, etc.), symbols, Greek letters,
71             dingbats, negated symbols, and superscript characters and symbols ...
72              
73             =cut
74              
75             our ($remap_d, $remap_e, $remap_e_raw, $set_d, $set_e);
76              
77              
78              
79              
80             =head2 latex_decode($text, @options)
81              
82             Converts LaTeX macros in the $text to Unicode characters.
83              
84             The function accepts a number of options:
85              
86             * normalize => $bool (default 1)
87             whether the output string should be normalized with Unicode::Normalize
88              
89             * normalization => (default 'NFD')
90             and if yes, the normalization form to use (see the Unicode::Normalize documentation)
91              
92             =cut
93              
94             sub latex_decode {
95             my $text = shift;
96              
97             # Optimisation - if there are no macros, no point doing anything
98             return $text unless $text =~ m/\\/;
99              
100             # Optimisation - if virtual null set was specified, do nothing
101             return $text if $set_d eq 'null';
102              
103             my %opts = @_;
104             my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
105             my $norm_form
106             = exists $opts{normalization} ? $opts{normalization} : 'NFD';
107              
108             # Deal with raw TeX \char macros.
109             $text =~ s/\\char"(\p{ASCII_Hex_Digit}+)/"chr(0x$1)"/gee; # hex chars
110             $text =~ s/\\char'(\d+)/"chr(0$1)"/gee; # octal chars
111             $text =~ s/\\char(\d+)/"chr($1)"/gee; # decimal chars
112              
113             $text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
114             $text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
115              
116             foreach my $type (
117             'greek', 'dings',
118             'punctuation', 'symbols',
119             'negatedsymbols', 'superscripts',
120             'cmdsuperscripts', 'letters',
121             'diacritics'
122             )
123             {
124             my $map = $remap_d->{$type}{map};
125             my $re = $remap_d->{$type}{re};
126             next unless $re; # Might not be present depending on set
127              
128             if ( $type eq 'negatedsymbols' ) {
129             $text =~ s/\\not\\($re)/$map->{$1}/ge;
130             }
131             elsif ( $type eq 'superscripts' ) {
132             $text =~ s/\\textsuperscript\{($re)\}/$map->{$1}/ge;
133             }
134             elsif ( $type eq 'cmdsuperscripts' ) {
135             $text =~ s/\\textsuperscript\{\\($re)\}/$map->{$1}/ge;
136             }
137             elsif ( $type eq 'dings' ) {
138             $text =~ s/\\ding\{([2-9AF][0-9A-F])\}/$map->{$1}/ge;
139             }
140             elsif ( $type eq 'letters' ) {
141             $text =~ s/\\($re)(?:\{\}|\s+|\b)/$map->{$1}/ge;
142             }
143             elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
144             {
145             $text =~ s/\\($re)(?: \{\}|\s+|\b)/$map->{$1}/ge;
146             }
147             elsif ( $type eq 'diacritics' ) {
148             $text =~ s/\\($re)\s*\{(\pL\pM*)\}/$2 . $map->{$1}/ge;
149              
150             # Conditional regexp with code-block condition
151             # non letter macros for diacritics (e.g. \=) can be followed by any letter
152             # but letter diacritic macros (e.g \c) can't (\cS) horribly Broken
153             #
154             # If the RE for the macro doesn't end with a basic LaTeX macro letter (\=), then
155             # next char can be any letter (\=d)
156             # Else if it did end with a normal LaTeX macro letter (\c), then
157             # If this was followed by a space (\c )
158             # Any letter is allowed after the space (\c S)
159             # Else
160             # Only a non basic LaTeX letter is allowed (\c-)
161             $text =~ s/\\# slash
162             ($re)# the diacritic
163             (\s*)# optional space
164             (# capture paren
165             (?(?{$1 !~ m:[A-Za-z]$:})# code block condition (is not a letter?)
166             \pL # yes pattern
167             | # no pattern
168             (?(?{$2}) # code block condition (space matched earlier after diacritic?)
169             \pL # yes pattern
170             | # no pattern
171             [^A-Za-z]
172             ) # close conditional
173             ) # close conditional
174             \pM* # optional marks
175             ) # capture paren
176             /$3 . $map->{$1}/gxe;
177             }
178             }
179              
180             # Now remove braces around single letters with diacritics (which the replace above
181             # can result in). Things like '{á}'. Such things can break kerning. We can't do this in
182             # the RE above as we can't determine if the braces are wrapping a phrase because this
183             # match is on an entire file string. So we can't in one step tell the difference between:
184             #
185             # author = {Andr\'e}
186             # and
187             # author = {Andr\'{e}}
188             #
189             # when this is part of a (much) larger string
190             #
191             # We don't want to do this if it would result in a broken macro name like with
192             # \textupper{é}
193             #
194             # Workaround perl's lack of variable-width negative look-behind -
195             # Reverse string (and therefore some of the Re) and use variable width negative look-ahead
196             $text = reverse $text;
197             $text =~ s/}(\pM+\pL){(?!\pL+\\)/$1/g;
198             $text = reverse $text;
199              
200             return $norm ? Unicode::Normalize::normalize( $norm_form, $text ) : $text;
201            
202             }
203              
204             =head2 latex_encode($text, @options)
205              
206             Converts UTF-8 to LaTeX
207              
208             =cut
209              
210             sub latex_encode {
211             my $text = shift;
212              
213             # Optimisation - if virtual null set was specified, do nothing
214             return $text if $set_e eq 'null';
215              
216             $text = NFD($text);
217              
218             foreach my $type (
219             qw'greek dings negatedsymbols superscripts cmdsuperscripts
220             diacritics letters punctuation symbols'
221             )
222             {
223             my $map = $remap_e->{$type}{map};
224             my $re = $remap_e->{$type}{re};
225             next unless $re; # Might not be present depending on set
226              
227             if ( $type eq 'negatedsymbols' ) {
228             $text =~ s/($re)/"{\$\\not\\" . $map->{$1} . '$}'/ge;
229             }
230             elsif ( $type eq 'superscripts' ) {
231             $text =~ s/($re)/'\textsuperscript{' . $map->{$1} . '}'/ge;
232             }
233             elsif ( $type eq 'cmdsuperscripts' ) {
234             $text =~ s/($re)/"\\textsuperscript{\\" . $map->{$1} . "}"/ge;
235             }
236             elsif ( $type eq 'dings' ) {
237             $text =~ s/($re)/'\ding{' . $map->{$1} . '}'/ge;
238             }
239             elsif ( $type eq 'letters' ) {
240              
241             # General macros (excluding special encoding excludes)
242             $text
243             =~ s/($re)/($remap_e_raw->{$1} ? '' : "\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '{}')/ge;
244             }
245             elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
246             {
247             # Math mode macros (excluding special encoding excludes)
248             $text
249             =~ s/($re)/($remap_e_raw->{$1} ? '' : "{\$\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '$}')/ge;
250             }
251             elsif ( $type eq 'diacritics' ) {
252              
253             # special case such as "i\x{304}" -> '\={\i}' -> "i" needs the dot removing for accents
254             $text =~ s/i($re)/"\\" . $map->{$1} . '{\i}'/ge;
255              
256             $text =~ s/\{(\pL\pM*)\}($re)/"\\" . $map->{$2} . "{$1}"/ge;
257             $text =~ s/(\pL\pM*)($re)/"\\" . $map->{$2} . "{$1}"/ge;
258              
259             $text =~ s{
260             (\PM)($re)($re)($re)
261             }{
262             "\\" . $map->{$4} . "{\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}}'
263             }gex;
264             $text =~ s{
265             (\PM)($re)($re)
266             }{
267             "\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}'
268             }gex;
269             $text =~ s{
270             (\PM)($re)
271             }{
272             "\\" . $map->{$2} . "{$1}"
273             }gex;
274             }
275             }
276              
277             return $text;
278             }
279              
280              
281             =head2 _init_sets(, )
282              
283             Initialise recoding sets.
284             This is a private method, and its direct usage should not be needed
285             in normal circunstances.
286              
287             =cut
288              
289              
290             sub _init_sets {
291             ( $set_d, $set_e ) = @_;
292             no autovivification;
293              
294             # Reset these, mostly for tests which call init_sets more than once
295             $remap_d = {};
296             $remap_e = {};
297             $remap_e_raw = {};
298              
299             my $mapdata = module_file( 'LaTeX::Recode' => "recode_data.xml" );
300              
301             # Read driver config file
302             my $xml = File::Slurp::read_file($mapdata)
303             or die("Can't read file $mapdata");
304             my $doc = XML::LibXML->load_xml( string => decode( 'UTF-8', $xml ) );
305             my $xpc = XML::LibXML::XPathContext->new($doc);
306              
307             my @types = qw(letters diacritics punctuation symbols negatedsymbols
308             superscripts cmdsuperscripts dings greek);
309              
310             # Have to have separate loops for decode/recode or you can't have independent
311             # decode/recode sets
312              
313             # Construct decode set
314             foreach my $type (@types) {
315             foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
316             my @set = split( /\s*,\s*/, $maps->getAttribute('set') );
317             next unless first { $set_d eq $_ } @set;
318             foreach my $map ( $maps->findnodes('map') ) {
319             my $from = $map->findnodes('from')->shift();
320             my $to = $map->findnodes('to')->shift();
321             $remap_d->{$type}{map}{ NFD( $from->textContent() ) }
322             = NFD( $to->textContent() );
323             }
324             }
325              
326             # Things we don't want to change when decoding as this breaks some things
327             foreach my $d ( $xpc->findnodes('/texmap/decode_exclude/char') ) {
328             delete( $remap_d->{$type}{map}{ NFD( $d->textContent() ) } );
329             }
330             }
331              
332             # Construct encode set
333             foreach my $type (@types) {
334             foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
335             my @set = split( /\s*,\s*/, $maps->getAttribute('set') );
336             next unless first { $set_e eq $_ } @set;
337             foreach my $map ( $maps->findnodes('map') ) {
338             my $from = $map->findnodes('from')->shift();
339             my $to = $map->findnodes('to')->shift();
340             $remap_e->{$type}{map}{ NFD( $to->textContent() ) }
341             = NFD( $from->textContent() );
342             }
343              
344             # There are some duplicates in the data to handle preferred encodings.
345             foreach my $map ( $maps->findnodes('map[from[@preferred]]') ) {
346             my $from = $map->findnodes('from')->shift();
347             my $to = $map->findnodes('to')->shift();
348             $remap_e->{$type}{map}{ NFD( $to->textContent() ) }
349             = NFD( $from->textContent() );
350             }
351              
352             # Some things might need to be inserted as is rather than wrapped
353             # in some macro/braces
354             foreach my $map ( $maps->findnodes('map[from[@raw]]') ) {
355             my $from = $map->findnodes('from')->shift();
356             my $to = $map->findnodes('to')->shift();
357             $remap_e_raw->{ NFD( $to->textContent() ) } = 1;
358             }
359              
360             }
361              
362             # Things we don't want to change when encoding as this would break LaTeX
363             foreach my $e ( $xpc->findnodes('/texmap/encode_exclude/char') ) {
364             delete( $remap_e->{$type}{map}{ NFD( $e->textContent() ) } );
365             }
366             }
367              
368             # Populate the decode regexps
369             # sort by descending length of macro name to avoid shorter macros which
370             # are substrings of longer ones damaging the longer ones
371             foreach my $type (@types) {
372             next unless exists $remap_d->{$type};
373             $remap_d->{$type}{re} = join( '|',
374             map { /[\.\^\|\+\-\)\(]/ ? '\\' . $_ : $_ }
375             sort { length($b) <=> length($a) }
376             keys %{$remap_d->{$type}{map}} );
377             $remap_d->{$type}{re} = qr|$remap_d->{$type}{re}|;
378             }
379              
380             # Populate the encode regexps
381             foreach my $type (@types) {
382             next unless exists $remap_e->{$type};
383             $remap_e->{$type}{re} = join( '|',
384             map { /[\.\^\|\+\-\)\(]/ ? '\\' . $_ : $_ }
385             sort keys %{ $remap_e->{$type}{map} } );
386             $remap_e->{$type}{re} = qr|$remap_e->{$type}{re}|;
387             }
388             }
389              
390             1;
391              
392             __END__