File Coverage

blib/lib/LaTeX/Recode.pm

Criterion	Covered	Total	%
statement	155	175	88.5
branch	46	70	65.7
condition	2	4	50.0
subroutine	22	22	100.0
pod	2	2	100.0
total	227	273	83.1

line	stmt	bran	cond	sub	pod	time	code
1							package LaTeX::Recode;
2
3	2			2		207763	use strict;
	2					21
	2					61
4	2			2		11	use warnings;
	2					4
	2					62
5	2			2		1010	use parent qw(Exporter);
	2					591
	2					11
6	2			2		118	use re 'eval';
	2					4
	2					126
7	2			2		1193	use Encode;
	2					22195
	2					137
8	2			2		1197	use File::Slurp;
	2					56017
	2					136
9	2			2		20	use File::Spec;
	2					5
	2					45
10	2			2		1272	use Unicode::Normalize;
	2					4592
	2					142
11	2			2		15	use List::Util qw (first);
	2					4
	2					141
12	2			2		1129	use XML::LibXML::Simple;
	2					89154
	2					136
13	2			2		670	use utf8;
	2					19
	2					16
14
15	2			2		1193	use File::ShareDir 'module_file';
	2					49802
	2					729
16
17							our @EXPORT = qw(latex_encode latex_decode);
18
19							sub import {
20	2			2		41	my ($self, %opts) = @_;
21
22	2					10	my $sets = qr/^(?:null\|base\|full)$/i;
23	2	50	50			28	my $encode_set = ($opts{encode_set}\|\|"") =~ m!$sets! ? lc $opts{encode_set} : "base";
24	2	50	50			31	my $decode_set = ($opts{decode_set}\|\|"") =~ m!$sets! ? lc $opts{decode_set} : "base";
25
26	2					7	_init_sets($decode_set, $encode_set);
27
28	2					3740	$self->export_to_level(1, undef, @EXPORT);
29							}
30
31							=encoding utf-8
32
33							=head1 NAME
34
35							LaTeX::Recode - Encode/Decode chars to/from UTF-8/macros in LaTeX
36
37							=head1 SYNOPSIS
38
39							use LaTeX::Recode;
40
41							my $string = 'Muḥammad ibn Mūsā al-Khwārizmī';
42							my $latex_string = latex_encode($string);
43							# => 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}'
44
45							my $string = 'Mu\d{h}ammad ibn M\=us\=a al-Khw\=arizm\={\i}';
46							my $utf8_string = latex_decode($string);
47							# => 'Muḥammad ibn Mūsā al-Khwārizmī'
48
49
50							# if you want to define a different conversion set (either
51							# for encoding or decoding):
52							use LaTeX::Recode encode_set => 'full', decode_set => 'base';
53
54
55							=head1 DESCRIPTION
56
57							Allows conversion between Unicode chars and LaTeX macros.
58
59							=head1 GLOBAL OPTIONS
60
61							Possible values for the encoding/decoding set to use are 'null', 'base' and 'full';
62							default value is 'base'.
63
64							null => No conversion
65
66							base => Most common macros and diacritics (sufficient for Western languages
67							and common symbols)
68
69							full => Also converts punctuation, larger range of diacritics and macros
70							(e.g. for IPA, Latin Extended Additional, etc.), symbols, Greek letters,
71							dingbats, negated symbols, and superscript characters and symbols ...
72
73							=cut
74
75							our ($remap_d, $remap_e, $remap_e_raw, $set_d, $set_e);
76
77
78
79
80							=head2 latex_decode($text, @options)
81
82							Converts LaTeX macros in the $text to Unicode characters.
83
84							The function accepts a number of options:
85
86							* normalize => $bool (default 1)
87							whether the output string should be normalized with Unicode::Normalize
88
89							* normalization => (default 'NFD')
90							and if yes, the normalization form to use (see the Unicode::Normalize documentation)
91
92							=cut
93
94							sub latex_decode {
95	4			4	1	3644	my $text = shift;
96
97							# Optimisation - if there are no macros, no point doing anything
98	4	50				21	return $text unless $text =~ m/\\/;
99
100							# Optimisation - if virtual null set was specified, do nothing
101	4	50				12	return $text if $set_d eq 'null';
102
103	4					8	my %opts = @_;
104	4	50				11	my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
105							my $norm_form
106	4	50				10	= exists $opts{normalization} ? $opts{normalization} : 'NFD';
107
108							# Deal with raw TeX \char macros.
109	2			2		15	$text =~ s/\\char"(\p{ASCII_Hex_Digit}+)/"chr(0x$1)"/gee; # hex chars
	2					4
	2					28
	4					9
	0					0
110	4					9	$text =~ s/\\char'(\d+)/"chr(0$1)"/gee; # octal chars
	0					0
111	4					6	$text =~ s/\\char(\d+)/"chr($1)"/gee; # decimal chars
	0					0
112
113	4					14	$text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
114	4					26	$text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
115
116	4					11	foreach my $type (
117							'greek', 'dings',
118							'punctuation', 'symbols',
119							'negatedsymbols', 'superscripts',
120							'cmdsuperscripts', 'letters',
121							'diacritics'
122							)
123							{
124	36					135	my $map = $remap_d->{$type}{map};
125	36					61	my $re = $remap_d->{$type}{re};
126	36	50				71	next unless $re; # Might not be present depending on set
127
128	36	100				167	if ( $type eq 'negatedsymbols' ) {
		100
		100
		100
		100
		100
		50
129	4					62	$text =~ s/\\not\\($re)/$map->{$1}/ge;
	0					0
130							}
131							elsif ( $type eq 'superscripts' ) {
132	4					48	$text =~ s/\\textsuperscript\{($re)\}/$map->{$1}/ge;
	0					0
133							}
134							elsif ( $type eq 'cmdsuperscripts' ) {
135	4					45	$text =~ s/\\textsuperscript\{\\($re)\}/$map->{$1}/ge;
	0					0
136							}
137							elsif ( $type eq 'dings' ) {
138	4					11	$text =~ s/\\ding\{([2-9AF][0-9A-F])\}/$map->{$1}/ge;
	0					0
139							}
140							elsif ( $type eq 'letters' ) {
141	4					306	$text =~ s/\\($re)(?:\{\}\|\s+\|\b)/$map->{$1}/ge;
	1					10
142							}
143	36			36		68	elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
144							{
145	12					2368	$text =~ s/\\($re)(?: \{\}\|\s+\|\b)/$map->{$1}/ge;
	0					0
146							}
147							elsif ( $type eq 'diacritics' ) {
148	4					114	$text =~ s/\\($re)\s\{(\pL\pM)\}/$2 . $map->{$1}/ge;
	12					1077
149
150							# Conditional regexp with code-block condition
151							# non letter macros for diacritics (e.g. \=) can be followed by any letter
152							# but letter diacritic macros (e.g \c) can't (\cS) horribly Broken
153							#
154							# If the RE for the macro doesn't end with a basic LaTeX macro letter (\=), then
155							# next char can be any letter (\=d)
156							# Else if it did end with a normal LaTeX macro letter (\c), then
157							# If this was followed by a space (\c )
158							# Any letter is allowed after the space (\c S)
159							# Else
160							# Only a non basic LaTeX letter is allowed (\c-)
161	4					130	$text =~ s/\\# slash
162							($re)# the diacritic
163							(\s*)# optional space
164							(# capture paren
165	0					0	(?(?{$1 !~ m:[A-Za-z]$:})# code block condition (is not a letter?)
166							\pL # yes pattern
167							\| # no pattern
168							(?(?{$2}) # code block condition (space matched earlier after diacritic?)
169							\pL # yes pattern
170							\| # no pattern
171							[^A-Za-z]
172							) # close conditional
173							) # close conditional
174							\pM* # optional marks
175							) # capture paren
176	0					0	/$3 . $map->{$1}/gxe;
177							}
178							}
179
180							# Now remove braces around single letters with diacritics (which the replace above
181							# can result in). Things like '{á}'. Such things can break kerning. We can't do this in
182							# the RE above as we can't determine if the braces are wrapping a phrase because this
183							# match is on an entire file string. So we can't in one step tell the difference between:
184							#
185							# author = {Andr\'e}
186							# and
187							# author = {Andr\'{e}}
188							#
189							# when this is part of a (much) larger string
190							#
191							# We don't want to do this if it would result in a broken macro name like with
192							# \textupper{é}
193							#
194							# Workaround perl's lack of variable-width negative look-behind -
195							# Reverse string (and therefore some of the Re) and use variable width negative look-ahead
196	4					664	$text = reverse $text;
197	4					13	$text =~ s/}(\pM+\pL)\{(?!\pL+\\)/$1/g;
198	4					9	$text = reverse $text;
199
200	4	50				21	return $norm ? Unicode::Normalize::normalize( $norm_form, $text ) : $text;
201
202							}
203
204							=head2 latex_encode($text, @options)
205
206							Converts UTF-8 to LaTeX
207
208							=cut
209
210							sub latex_encode {
211	4			4	1	3500	my $text = shift;
212
213							# Optimisation - if virtual null set was specified, do nothing
214	4	50				15	return $text if $set_e eq 'null';
215
216	4					42	$text = NFD($text);
217
218	4					9	foreach my $type (
219							qw'greek dings negatedsymbols superscripts cmdsuperscripts
220							diacritics letters punctuation symbols'
221							)
222							{
223	36					300	my $map = $remap_e->{$type}{map};
224	36					63	my $re = $remap_e->{$type}{re};
225	36	50				81	next unless $re; # Might not be present depending on set
226
227	36	100				177	if ( $type eq 'negatedsymbols' ) {
		100
		100
		100
		100
		100
		50
228	4					85	$text =~ s/($re)/"{\$\\not\\" . $map->{$1} . '$}'/ge;
	0					0
229							}
230							elsif ( $type eq 'superscripts' ) {
231	4					71	$text =~ s/($re)/'\textsuperscript{' . $map->{$1} . '}'/ge;
	0					0
232							}
233							elsif ( $type eq 'cmdsuperscripts' ) {
234	4					43	$text =~ s/($re)/"\\textsuperscript{\\" . $map->{$1} . "}"/ge;
	0					0
235							}
236							elsif ( $type eq 'dings' ) {
237	4					1008	$text =~ s/($re)/'\ding{' . $map->{$1} . '}'/ge;
	0					0
238							}
239							elsif ( $type eq 'letters' ) {
240
241							# General macros (excluding special encoding excludes)
242	4					765	$text
243	1	50				17	=~ s/($re)/($remap_e_raw->{$1} ? '' : "\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '{}')/ge;
		50
244							}
245	36			36		80	elsif ( first { $type eq $_ } ( 'punctuation', 'symbols', 'greek' ) )
246							{
247							# Math mode macros (excluding special encoding excludes)
248	12					5597	$text
249	0	0				0	=~ s/($re)/($remap_e_raw->{$1} ? '' : "{\$\\") . $map->{$1} . ($remap_e_raw->{$1} ? '' : '$}')/ge;
		0
250							}
251							elsif ( $type eq 'diacritics' ) {
252
253							# special case such as "i\x{304}" -> '\={\i}' -> "i" needs the dot removing for accents
254	4					135	$text =~ s/i($re)/"\\" . $map->{$1} . '{\i}'/ge;
	0					0
255
256	4					56	$text =~ s/\{(\pL\pM*)\}($re)/"\\" . $map->{$2} . "{$1}"/ge;
	0					0
257	4					490	$text =~ s/(\pL\pM*)($re)/"\\" . $map->{$2} . "{$1}"/ge;
	12					393
258
259	4					84	$text =~ s{
260							(\PM)($re)($re)($re)
261							}{
262	0					0	"\\" . $map->{$4} . "{\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}}'
263							}gex;
264	4					403	$text =~ s{
265							(\PM)($re)($re)
266							}{
267	0					0	"\\" . $map->{$3} . "{\\" . $map->{$2} . "{$1}" . '}'
268							}gex;
269	4					392	$text =~ s{
270							(\PM)($re)
271							}{
272	0					0	"\\" . $map->{$2} . "{$1}"
273							}gex;
274							}
275							}
276
277	4					25	return $text;
278							}
279
280
281							=head2 _init_sets(, )
282
283							Initialise recoding sets.
284							This is a private method, and its direct usage should not be needed
285							in normal circunstances.
286
287							=cut
288
289
290							sub _init_sets {
291	2			2		5	( $set_d, $set_e ) = @_;
292	2			2		49998	no autovivification;
	2					1734
	2					10
293
294							# Reset these, mostly for tests which call init_sets more than once
295	2					4	$remap_d = {};
296	2					5	$remap_e = {};
297	2					4	$remap_e_raw = {};
298
299	2					13	my $mapdata = module_file( 'LaTeX::Recode' => "recode_data.xml" );
300
301							# Read driver config file
302	2	50				728	my $xml = File::Slurp::read_file($mapdata)
303							or die("Can't read file $mapdata");
304	2					559	my $doc = XML::LibXML->load_xml( string => decode( 'UTF-8', $xml ) );
305	2					7974	my $xpc = XML::LibXML::XPathContext->new($doc);
306
307	2					11	my @types = qw(letters diacritics punctuation symbols negatedsymbols
308							superscripts cmdsuperscripts dings greek);
309
310							# Have to have separate loops for decode/recode or you can't have independent
311							# decode/recode sets
312
313							# Construct decode set
314	2					6	foreach my $type (@types) {
315	18					615	foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
316	20					1659	my @set = split( /\s,\s/, $maps->getAttribute('set') );
317	20	50		28		411	next unless first { $set_d eq $_ } @set;
	28					77
318	20					98	foreach my $map ( $maps->findnodes('map') ) {
319	1668					31691	my $from = $map->findnodes('from')->shift();
320	1668					50961	my $to = $map->findnodes('to')->shift();
321	1668					59692	$remap_d->{$type}{map}{ NFD( $from->textContent() ) }
322							= NFD( $to->textContent() );
323							}
324							}
325
326							# Things we don't want to change when decoding as this breaks some things
327	18					10849	foreach my $d ( $xpc->findnodes('/texmap/decode_exclude/char') ) {
328	72					2313	delete( $remap_d->{$type}{map}{ NFD( $d->textContent() ) } );
329							}
330							}
331
332							# Construct encode set
333	2					80	foreach my $type (@types) {
334	18					2932	foreach my $maps ( $xpc->findnodes("/texmap/maps[\@type='$type']") ) {
335	20					1090	my @set = split( /\s,\s/, $maps->getAttribute('set') );
336	20	50		28		404	next unless first { $set_e eq $_ } @set;
	28					79
337	20					91	foreach my $map ( $maps->findnodes('map') ) {
338	1668					32129	my $from = $map->findnodes('from')->shift();
339	1668					51197	my $to = $map->findnodes('to')->shift();
340	1668					60063	$remap_e->{$type}{map}{ NFD( $to->textContent() ) }
341							= NFD( $from->textContent() );
342							}
343
344							# There are some duplicates in the data to handle preferred encodings.
345	20					379	foreach my $map ( $maps->findnodes('map[from[@preferred]]') ) {
346	90					10725	my $from = $map->findnodes('from')->shift();
347	90					2646	my $to = $map->findnodes('to')->shift();
348	90					3085	$remap_e->{$type}{map}{ NFD( $to->textContent() ) }
349							= NFD( $from->textContent() );
350							}
351
352							# Some things might need to be inserted as is rather than wrapped
353							# in some macro/braces
354	20					4618	foreach my $map ( $maps->findnodes('map[from[@raw]]') ) {
355	10					320	my $from = $map->findnodes('from')->shift();
356	10					298	my $to = $map->findnodes('to')->shift();
357	10					352	$remap_e_raw->{ NFD( $to->textContent() ) } = 1;
358							}
359
360							}
361
362							# Things we don't want to change when encoding as this would break LaTeX
363	18					2506	foreach my $e ( $xpc->findnodes('/texmap/encode_exclude/char') ) {
364	450					3926	delete( $remap_e->{$type}{map}{ NFD( $e->textContent() ) } );
365							}
366							}
367
368							# Populate the decode regexps
369							# sort by descending length of macro name to avoid shorter macros which
370							# are substrings of longer ones damaging the longer ones
371	2					352	foreach my $type (@types) {
372	18	50				89	next unless exists $remap_d->{$type};
373							$remap_d->{$type}{re} = join( '\|',
374	1650	100				3489	map { /[\.\^\\|\+\-\)\(]/ ? '\\' . $_ : $_ }
375	7771					10102	sort { length($b) <=> length($a) }
376	18					31	keys %{$remap_d->{$type}{map}} );
	18					573
377	18					6327	$remap_d->{$type}{re} = qr\|$remap_d->{$type}{re}\|;
378							}
379
380							# Populate the encode regexps
381	2					18	foreach my $type (@types) {
382	18	50				99	next unless exists $remap_e->{$type};
383							$remap_e->{$type}{re} = join( '\|',
384	1528	50				2753	map { /[\.\^\\|\+\-\)\(]/ ? '\\' . $_ : $_ }
385	18					31	sort keys %{ $remap_e->{$type}{map} } );
	18					661
386	18					8935	$remap_e->{$type}{re} = qr\|$remap_e->{$type}{re}\|;
387							}
388							}
389
390							1;
391
392							__END__