File Coverage

blib/lib/LaTeX/Recode.pm

Criterion	Covered	Total	%
statement	28	30	93.3
branch			n/a
condition			n/a
subroutine	10	10	100.0
pod			n/a
total	38	40	95.0

line	stmt	sub	time	code
1				package LaTeX::Recode;
2
3	2	2	26517	use strict;
	2		3
	2		44
4	2	2	6	use warnings;
	2		2
	2		45
5	2	2	708	use parent qw(Exporter);
	2		430
	2		8
6	2	2	75	use re 'eval';
	2		2
	2		73
7	2	2	900	use Encode;
	2		14359
	2		149
8	2	2	1041	use File::Slurp;
	2		20129
	2		107
9	2	2	9	use File::Spec;
	2		3
	2		29
10	2	2	968	use Unicode::Normalize;
	2		323625
	2		252
11	2	2	1360	use List::AllUtils qw (first);
	2		30960
	2		137
12	2	2	827	use XML::LibXML::Simple;
	0
	0
13				use utf8;
14
15				use File::ShareDir 'module_file';
16
17				our @EXPORT = qw(latex_encode latex_decode);
18
19				sub import {
20				my ($self, %opts) = @_;
21
22				my $sets = qr/^(?:null\|base\|full)$/i;
23				my $encode_set = ($opts{encode_set}\|\|"") =~ m!$sets! ? lc $opts{encode_set} : "base";
24				my $decode_set = ($opts{decode_set}\|\|"") =~ m!$sets! ? lc $opts{decode_set} : "base";
25
26				_init_sets($decode_set, $encode_set);
27
28				$self->export_to_level(1, undef, @EXPORT);
29				}
30
31				=encoding utf-8
32
33				=head1 NAME
34
35				LaTeX::Recode - Encode/Decode chars to/from UTF-8/macros in LaTeX
36
37				=head1 SYNOPSIS
38
39				use LaTeX::Recode;
40
41				my $string = 'Muḥammad ibn Mūsā al-Khwārizmī';
42				my $latex_string = latex_encode($string);
43				# => 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}'
44
45				my $string = 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}';
46				my $utf8_string = latex_decode($string);
47				# => 'Muḥammad ibn Mūsā al-Khwārizmī'
48
49
50				# if you want to define a different conversion set (either
51				# for encoding or decoding):
52				use LaTeX::Recode encode_set => 'full', decode_set => 'base';
53
54
55				=head1 DESCRIPTION
56
57				Allows conversion between Unicode chars and LaTeX macros.
58
59				=head1 GLOBAL OPTIONS
60
61				Possible values for the encoding/decoding set to use are 'null', 'base' and 'full';
62				default value is 'base'.
63
64				null => No conversion
65
66				base => Most common macros and diacritics (sufficient for Western languages
67				and common symbols)
68
69				full => Also converts punctuation, larger range of diacritics and macros
70				(e.g. for IPA, Latin Extended Additional, etc.), symbols, Greek letters,
71				dingbats, negated symbols, and superscript characters and symbols ...
72
73				=cut
74
75				our ($remap_d, $remap_e, $remap_e_raw, $set_d, $set_e);
76
77
78
79
80				=head2 latex_decode($text, @options)
81
82				Converts LaTeX macros in the $text to Unicode characters.
83
84				The function accepts a number of options:
85
86				* normalize => $bool (default 1)
87				whether the output string should be normalized with Unicode::Normalize
88
89				* normalization => (default 'NFD')
90				and if yes, the normalization form to use (see the Unicode::Normalize documentation)
91
92				=cut
93
94				sub latex_decode {
95				my $text = shift;
96
97				# Optimisation - if there are no macros, no point doing anything
98				return $text unless $text =~ m/\\/;
99
100				# Optimisation - if virtual null set was specified, do nothing
101				return $text if $set_d eq 'null';
102
103				my %opts = @_;
104				my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
105				my $norm_form
106				= exists $opts{normalization} ? $opts{normalization} : 'NFD';
107
108				# Deal with raw TeX \char macros.
109				$text =~ s/\\char"(\p{ASCII_Hex_Digit}+)/"chr(0x$1)"/gee; # hex chars
110				$text =~ s/\\char'(\d+)/"chr(0$1)"/gee; # octal chars
111				$text =~ s/\\char(\d+)/"chr($1)"/gee; # decimal chars
112
113				$text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
114				$text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
115
116				foreach my $type (
117				'greek', 'dings',
118				'punctuation', 'symbols',
119				'negatedsymbols', 'superscripts',
120				'cmdsuperscripts', 'letters',
121				'diacritics'
122				)
123				{
124				my $map = $remap_d->{$type}{map};
125				my $re = $remap_d->{$type}{re};
126				next unless $re; # Might not be present depending on set
127
128				if ( $type eq 'negatedsymbols' ) {
129				$text =~ s/\\not\\($re)/$map->{$1}/ge;
130				}
131				elsif ( $type eq 'superscripts' ) {
132				$text =~ s/\\textsuperscript\{($re)\}/$map->{$1}/ge;
133				}
134				elsif ( $type eq 'cmdsuperscripts' ) {
135				$text =~ s/\\textsuperscript\{\\($re)\}/$map->{$1}/ge;
136				}
137				elsif ( $type eq 'dings' ) {
138				$text =~ s/\\ding\{([2-9AF][0-9A-F])\}/$map->{$1}/ge;
139				}
140				elsif ( $type eq 'letters' ) {
141				$text =~ s/\\($re)(?:\{\}\|\s+\|\b)/$map->{$1}/ge;
142				}
143				elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
144				{
145				$text =~ s/\\($re)(?: \{\}\|\s+\|\b)/$map->{$1}/ge;
146				}
147				elsif ( $type eq 'diacritics' ) {
148				$text =~ s/\\($re)\s\{(\pL\pM)\}/$2 . $map->{$1}/ge;
149
150				# Conditional regexp with code-block condition
151				# non letter macros for diacritics (e.g. \=) can be followed by any letter
152				# but letter diacritic macros (e.g \c) can't (\cS) horribly Broken
153				#
154				# If the RE for the macro doesn't end with a basic LaTeX macro letter (\=), then
155				# next char can be any letter (\=d)
156				# Else if it did end with a normal LaTeX macro letter (\c), then
157				# If this was followed by a space (\c )
158				# Any letter is allowed after the space (\c S)
159				# Else
160				# Only a non basic LaTeX letter is allowed (\c-)
161				$text =~ s/\\# slash
162				($re)# the diacritic
163				(\s*)# optional space
164				(# capture paren
165				(?(?{$1 !~ m:[A-Za-z]$:})# code block condition (is not a letter?)
166				\pL # yes pattern
167				\| # no pattern
168				(?(?{$2}) # code block condition (space matched earlier after diacritic?)
169				\pL # yes pattern
170				\| # no pattern
171				[^A-Za-z]
172				) # close conditional
173				) # close conditional
174				\pM* # optional marks
175				) # capture paren
176				/$3 . $map->{$1}/gxe;
177				}
178				}
179
180				# Now remove braces around single letters with diacritics (which the replace above
181				# can result in). Things like '{á}'. Such things can break kerning. We can't do this in
182				# the RE above as we can't determine if the braces are wrapping a phrase because this
183				# match is on an entire file string. So we can't in one step tell the difference between:
184				#
185				# author = {Andr\'e}
186				# and
187				# author = {Andr\'{e}}
188				#
189				# when this is part of a (much) larger string
190				#
191				# We don't want to do this if it would result in a broken macro name like with
192				# \textupper{é}
193				#
194				# Workaround perl's lack of variable-width negative look-behind -
195				# Reverse string (and therefore some of the Re) and use variable width negative look-ahead
196				$text = reverse $text;
197				$text =~ s/}(\pM+\pL){(?!\pL+\\)/$1/g;
198				$text = reverse $text;
199
200				return $norm ? Unicode::Normalize::normalize( $norm_form, $text ) : $text;
201
202				}
203
204				=head2 latex_encode($text, @options)
205
206				Converts UTF-8 to LaTeX
207
208				=cut
209
210				sub latex_encode {
211				my $text = shift;
212
213				# Optimisation - if virtual null set was specified, do nothing
214				return $text if $set_e eq 'null';
215
216				$text = NFD($text);
217
218				foreach my $type (
219				qw'greek dings negatedsymbols superscripts cmdsuperscripts
220				diacritics letters punctuation symbols'
221				)
222				{
223				my $map = $remap_e->{$type}{map};
224				my $re = $remap_e->{$type}{re};
225				next unless $re; # Might not be present depending on set
226
227				if ( $type eq 'negatedsymbols' ) {
228				$text =~ s/($re)/"{\$\\not\\" . $map->{$1} . '$}'/ge;
229				}
230				elsif ( $type eq 'superscripts' ) {
231				$text =~ s/($re)/'\textsuperscript{' . $map->{$1} . '}'/ge;
232				}
233				elsif ( $type eq 'cmdsuperscripts' ) {
234				$text =~ s/($re)/"\\textsuperscript{\\" . $map->{$1} . "}"/ge;
235				}
236				elsif ( $type eq 'dings' ) {
237				$text =~ s/($re)/'\ding{' . $map->{$1} . '}'/ge;
238				}
239				elsif ( $type eq 'letters' ) {
240
241				# General macros (excluding special encoding excludes)
242				$text
243				=~ s/($re)/($remap_e_raw->{$1} ? '' : "\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '{}')/ge;
244				}
245				elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
246				{
247				# Math mode macros (excluding special encoding excludes)
248				$text
249				=~ s/($re)/($remap_e_raw->{$1} ? '' : "{\$\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '$}')/ge;
250				}
251				elsif ( $type eq 'diacritics' ) {
252
253				# special case such as "i\x{304}" -> '\={\i}' -> "i" needs the dot removing for accents
254				$text =~ s/i($re)/"\\" . $map->{$1} . '{\i}'/ge;
255
256				$text =~ s/\{(\pL\pM*)\}($re)/"\\" . $map->{$2} . "{$1}"/ge;
257				$text =~ s/(\pL\pM*)($re)/"\\" . $map->{$2} . "{$1}"/ge;
258
259				$text =~ s{
260				(\PM)($re)($re)($re)
261				}{
262				"\\" . $map->{$4} . "{\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}}'
263				}gex;
264				$text =~ s{
265				(\PM)($re)($re)
266				}{
267				"\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}'
268				}gex;
269				$text =~ s{
270				(\PM)($re)
271				}{
272				"\\" . $map->{$2} . "{$1}"
273				}gex;
274				}
275				}
276
277				return $text;
278				}
279
280
281				=head2 _init_sets(, )
282
283				Initialise recoding sets.
284				This is a private method, and its direct usage should not be needed
285				in normal circunstances.
286
287				=cut
288
289
290				sub _init_sets {
291				( $set_d, $set_e ) = @_;
292				no autovivification;
293
294				# Reset these, mostly for tests which call init_sets more than once
295				$remap_d = {};
296				$remap_e = {};
297				$remap_e_raw = {};
298
299				my $mapdata = module_file( 'LaTeX::Recode' => "recode_data.xml" );
300
301				# Read driver config file
302				my $xml = File::Slurp::read_file($mapdata)
303				or die("Can't read file $mapdata");
304				my $doc = XML::LibXML->load_xml( string => decode( 'UTF-8', $xml ) );
305				my $xpc = XML::LibXML::XPathContext->new($doc);
306
307				my @types = qw(letters diacritics punctuation symbols negatedsymbols
308				superscripts cmdsuperscripts dings greek);
309
310				# Have to have separate loops for decode/recode or you can't have independent
311				# decode/recode sets
312
313				# Construct decode set
314				foreach my $type (@types) {
315				foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
316				my @set = split( /\s,\s/, $maps->getAttribute('set') );
317				next unless first { $set_d eq $_ } @set;
318				foreach my $map ( $maps->findnodes('map') ) {
319				my $from = $map->findnodes('from')->shift();
320				my $to = $map->findnodes('to')->shift();
321				$remap_d->{$type}{map}{ NFD( $from->textContent() ) }
322				= NFD( $to->textContent() );
323				}
324				}
325
326				# Things we don't want to change when decoding as this breaks some things
327				foreach my $d ( $xpc->findnodes('/texmap/decode_exclude/char') ) {
328				delete( $remap_d->{$type}{map}{ NFD( $d->textContent() ) } );
329				}
330				}
331
332				# Construct encode set
333				foreach my $type (@types) {
334				foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
335				my @set = split( /\s,\s/, $maps->getAttribute('set') );
336				next unless first { $set_e eq $_ } @set;
337				foreach my $map ( $maps->findnodes('map') ) {
338				my $from = $map->findnodes('from')->shift();
339				my $to = $map->findnodes('to')->shift();
340				$remap_e->{$type}{map}{ NFD( $to->textContent() ) }
341				= NFD( $from->textContent() );
342				}
343
344				# There are some duplicates in the data to handle preferred encodings.
345				foreach my $map ( $maps->findnodes('map[from[@preferred]]') ) {
346				my $from = $map->findnodes('from')->shift();
347				my $to = $map->findnodes('to')->shift();
348				$remap_e->{$type}{map}{ NFD( $to->textContent() ) }
349				= NFD( $from->textContent() );
350				}
351
352				# Some things might need to be inserted as is rather than wrapped
353				# in some macro/braces
354				foreach my $map ( $maps->findnodes('map[from[@raw]]') ) {
355				my $from = $map->findnodes('from')->shift();
356				my $to = $map->findnodes('to')->shift();
357				$remap_e_raw->{ NFD( $to->textContent() ) } = 1;
358				}
359
360				}
361
362				# Things we don't want to change when encoding as this would break LaTeX
363				foreach my $e ( $xpc->findnodes('/texmap/encode_exclude/char') ) {
364				delete( $remap_e->{$type}{map}{ NFD( $e->textContent() ) } );
365				}
366				}
367
368				# Populate the decode regexps
369				# sort by descending length of macro name to avoid shorter macros which
370				# are substrings of longer ones damaging the longer ones
371				foreach my $type (@types) {
372				next unless exists $remap_d->{$type};
373				$remap_d->{$type}{re} = join( '\|',
374				map { /[\.\^\\|\+\-\)\(]/ ? '\\' . $_ : $_ }
375				sort { length($b) <=> length($a) }
376				keys %{$remap_d->{$type}{map}} );
377				$remap_d->{$type}{re} = qr\|$remap_d->{$type}{re}\|;
378				}
379
380				# Populate the encode regexps
381				foreach my $type (@types) {
382				next unless exists $remap_e->{$type};
383				$remap_e->{$type}{re} = join( '\|',
384				map { /[\.\^\\|\+\-\)\(]/ ? '\\' . $_ : $_ }
385				sort keys %{ $remap_e->{$type}{map} } );
386				$remap_e->{$type}{re} = qr\|$remap_e->{$type}{re}\|;
387				}
388				}
389
390				1;
391
392				__END__