File Coverage

blib/lib/LaTeX/ToUnicode.pm

Criterion	Covered	Total	%
statement	118	171	69.0
branch	18	48	37.5
condition	0	3	0.0
subroutine	17	20	85.0
pod	2	3	66.6
total	155	245	63.2

line	stmt	bran	cond	sub	pod	time	code
1	1			1		69972	use strict;
	1					2
	1					29
2	1			1		5	use warnings;
	1					2
	1					41
3							package LaTeX::ToUnicode;
4							BEGIN {
5	1			1		59	$LaTeX::ToUnicode::VERSION = '0.53';
6							}
7							#ABSTRACT: Convert LaTeX commands to Unicode (simplistically)
8
9							require Exporter;
10							our @ISA = qw(Exporter);
11							our @EXPORT_OK = qw( convert debuglevel $endcw );
12
13	1			1		6	use utf8;
	1					10
	1					6
14	1			1		593	use Encode;
	1					10561
	1					70
15	1			1		476	use LaTeX::ToUnicode::Tables;
	1					3
	1					2605
16
17							# Terminating a control word (not symbol) the way TeX does: at the
18							# boundary between a letter (lookbehind) and a nonletter (lookahead),
19							# and then ignore any following whitespace.
20							our $endcw = qr/(?<=[a-zA-Z])(?=[^a-zA-Z]\|$)\s*/;
21
22							# all we need for is debugging being on and off. And it's pretty random
23							# what gets output.
24							my $debug = 0;
25	0			0	1	0	sub debuglevel { $debug = shift; }
26							sub _debug {
27	510	50		510		1066	return unless $debug;
28	0					0	my ($pkgname,$filename,$line,$subr) = caller(1);
29	0					0	warn @_, " at $filename:$line (${pkgname}::$subr)\n";
30							}
31
32							# The main conversion function.
33							#
34							sub convert {
35	102			102	1	57375	my ($string, %options) = @_;
36							#warn debug_hash_as_string("starting with: $string", %options);
37
38							# First, remove leading and trailing horizontal whitespace
39							# on each line of the possibly-multiline string we're given.
40	102					558	$string =~ s/^[ \t]*//m;
41	102					520	$string =~ s/[ \t]*$//m;
42
43							# For HTML output, must convert special characters that were in the
44							# TeX text (&<>) to their entities to avoid misparsing. We want to
45							# do this first, because conversion of the markup commands might
46							# output HTML tags like `, and we don't want to convert those <>.`
47							# Although <tt> works, better to keep the output HTML as
48							# human-readable as we can.
49							#
50	102	50				313	if ($options{html}) {
51	0					0	$string =~ s/([^\\]\|^)&/$1&/g;
52	0					0	$string =~ s/
53	0					0	$string =~ s/>/>/g;
54							}
55
56	102					167	my $user_hook = $options{hook};
57	102	50				187	if ($user_hook) {
58	0					0	$string = &$user_hook($string, \%options);
59	0					0	_debug("after user hook: $string");
60							}
61
62							# Convert general commands that take arguments, since (1) they might
63							# insert TeX commands that need to be converted, and (2) because
64							# their arguments could well contain constructs that will map to a
65							# Perl string \x{nnnn} for Unicode character nnnn; those Perl braces
66							# for the \x will confuse further parsing of the TeX.
67							#
68	102					187	$string = _convert_commands_with_arg($string);
69	102					354	_debug("after commands with arg: $string");
70
71							# Convert markups (\texttt, etc.); they have the same brace-parsing issue.
72	102					248	$string = _convert_markups($string, \%options);
73	102					320	_debug("after markups: $string");
74
75							# And urls, a special case of commands with arguments.
76	102					211	$string = _convert_urls($string, \%options);
77	102					288	_debug("after urls: $string");
78
79	102					173	$string = _convert_control_words($string);
80	102					360	_debug("after control words: $string");
81
82	102					183	$string = _convert_control_symbols($string);
83	102					311	_debug("after control symbols: $string");
84
85	102					187	$string = _convert_accents($string);
86	102	100				279	$string = _convert_german($string) if $options{german};
87	102					209	$string = _convert_symbols($string);
88	102					209	$string = _convert_ligatures($string);
89
90							# Let's handle ties here, after all the other conversions, since
91							# they don't fit well with any of the tables.
92							#
93							# /~, or ~ at the beginning of a line, is probably part of a url or
94							# path, not a tie. Otherwise, consider it a space, since a no-break
95							# spot in TeX is most likely fine to break in text or HTML.
96							#
97	102					186	$string =~ s,([^/])~,$1 ,g;
98
99							# Remove kerns. Clearly needs generalizing/sharpening to recognize
100							# dimens better, and plenty of other commands could use it.
101							#_debug("before kern: $string");
102	102					301	my $dimen_re = qr/[-+]?[0-9., ]+[a-z][a-z]\s*/;
103	102					369	$string =~ s!\\kern${endcw}${dimen_re}!!g;
104
105							# What the heck, let's do \hfuzz and \vfuzz too. They come up pretty
106							# often and practically the same thing (plus ignore optional =)..
107	102					344	$string =~ s!\\[hv]fuzz${endcw}=?\s*${dimen_re}!!g;
108
109							# After all the conversions, $string contains \x{....} constructs
110							# (Perl Unicode characters) where translations have happened. Change
111							# those to the desired output format. Thus we assume that the
112							# Unicode \x{....}'s are not themselves involved in further
113							# translations, which is, so far, true.
114							#
115	102	50				290	if (! $options{entities}) {
		0
116							# Convert our \x strings from Tables.pm to the binary characters.
117							# Assume no more than four hex digits.
118	102					531	$string =~ s/\\x\{(.{1,4})\}/ pack('U*', hex($1))/eg;
	85					577
119
120							} elsif ($options{entities}) {
121							# Convert the XML special characters that appeared in the input,
122							# e.g., from a TeX \&. Unless we're generating HTML output, in
123							# which case they have already been converted.
124	0	0				0	if (! $options{html}) {
125	0					0	$string =~ s/&/&/g;
126	0					0	$string =~ s/
127	0					0	$string =~ s/>/>/g;
128							}
129
130							# Our values in Tables.pm are simple ASCII strings \x{....},
131							# so we can replace them with hex entities with no trouble.
132							# Fortunately TeX does not have a standard \x control sequence.
133	0					0	$string =~ s/\\x\{(....)\}/&#x$1;/g;
134
135							# The rest of the job is about binary Unicode characters in the
136							# input. We want to transform them into entities also. As always
137							# in Perl, there's more than one way to do it, and several are
138							# described here, just for the fun of it.
139	0					0	my $ret = "";
140							#
141							# decode_utf8 is described in https://perldoc.perl.org/Encode.
142							# Without the decode_utf8, all of these methods output each byte
143							# separately; apparently $string is a byte string at this point,
144							# not a Unicode string. I don't know why that is.
145	0					0	$ret = decode_utf8($string);
146							#
147							# Transform everything that's not printable ASCII or newline into
148							# entities.
149	0					0	$ret =~ s/([^ -~\n])/ sprintf("&#x%04x;", ord($1)) /eg;
	0					0
150							#
151							# This method leaves control characters as literal; doesn't matter
152							# for XML output, since control characters aren't allowed, but
153							# let's use the regexp method anyway.
154							#$ret = encode("ascii", decode_utf8($string), Encode::FB_XMLCREF);
155							#
156							# The nice_string function from perluniintro also works.
157							#
158							# This fails, just outputs numbers (that is, ord values):
159							# foreach my $c (unpack("U*", $ret)) {
160							#
161							# Without the decode_utf8, outputs each byte separately.
162							# With the decode_utf8, works, but the above seems cleaner.
163							#foreach my $c (split(//, $ret)) {
164							# if (ord($c) <= 31 \|\| ord($c) >= 128) {
165							# $ret .= sprintf("&#x%04x;", ord($c));
166							# } else {
167							# $ret .= $c;
168							# }
169							#}
170							#
171	0					0	$string = $ret; # assigned from above.
172							}
173
174	102	50				271	if ($string =~ /\\x\{/) {
175	0					0	warn "LaTeX::ToUnicode::convert: untranslated \\x remains: $string\n";
176	0					0	warn "LaTeX::ToUnicode::convert: please report as bug.\n";
177							}
178
179							# Drop all braces.
180	102					290	$string =~ s/[{}]//g;
181
182							# Backslashes might remain. Don't remove them, as it makes for a
183							# useful way to find unhandled commands.
184
185							# leave newlines alone, but trim spaces and tabs.
186	102					223	$string =~ s/^[ \t]+//s; # remove leading whitespace
187	102					201	$string =~ s/[ \t]+$//s; # remove trailing whitespace
188	102					163	$string =~ s/[ \t]+/ /gs; # collapse all remaining whitespace to one space
189
190	102					641	$string;
191							}
192
193							# Convert commands that take a single braced argument. The table
194							# defines text we're supposed to insert before and after the argument.
195							# We let future processing handle conversion of both the inserted text
196							# and the argument.
197							#
198							sub _convert_commands_with_arg {
199	102			102		174	my $string = shift;
200
201	102					298	foreach my $cmd ( keys %LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS ) {
202	306					643	my $repl = $LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS{$cmd};
203	306					445	my $lft = $repl->[0]; # ref to two-element list
204	306					393	my $rht = $repl->[1];
205							# \cmd{foo} -> LFT foo RHT
206	306					6360	$string =~ s/\\$cmd${endcw}\{(.*?)\}/$lft$1$rht/g;
207							#warn "replaced arg $cmd, yielding $string\n";
208							}
209
210	102					302	$string;
211							}
212
213							# Convert url commands in STRING. This is a special case of commands
214							# with arguments: \url{u} and \href{u}{desc text}. The HTML output
215							# (generated if $OPTIONS{html} is set) is just too special to be handled
216							# in a table; further, \href is the only two-argument command we are
217							# currently handling.
218							#
219							sub _convert_urls {
220	102			102		186	my ($string,$options) = @_;
221
222	102	50				223	if ($options->{html}) {
223							# HTML output.
224							# \url{URL} -> URL
225	0					0	$string =~ s,\\url$endcw\{([^}]*)\}
226							,$1,gx;
227							#
228							# \href{URL}{TEXT} -> TEXT
229	0					0	$string =~ s,\\href$endcw\{([^}])\}\s\{([^}]*)\}
230							,$2,gx;
231
232							} else {
233							# plain text output.
234							# \url{URL} -> URL
235	102					309	$string =~ s/\\url$endcw\{([^}]*)\}/$1/g;
236							#
237							# \href{URL}{TEXT} -> TEXT (URL)
238							# but, as a special case, if URL ends with TEXT, just output URL,
239							# as in:
240							# \href{https://doi.org/10/fjzzc8}{10/fjzzc8}
241							# ->
242							# https://doi.org/10/fjzzc8
243							#
244							# Yet more specialness: the TEXT might have extra braces, as in
245							# \href{https://doi.org/10/fjzzc8}{{10/fjzzc8}}
246							# left over from previous markup commands (\path) which got
247							# removed. We want to accept and ignore such extra braces,
248							# hence the \{+ ... \}+ in recognizing TEXT.
249							#
250							#warn "txt url: starting with $string\n";
251	102	50				388	if ($string =~ m/\\href$endcw\{([^}])\}\s\{+([^}]*)\}+/) {
252	0					0	my $url = $1;
253	0					0	my $text = $2;
254							#warn " url: $url\n";
255							#warn " text: $text\n";
256	0	0				0	my $repl = ($url =~ m!$text$!) ? $url : "$text ($url)";
257							#warn " repl: $repl\n";
258	0					0	$string =~ s/\\href$endcw\{([^}])\}\s\{+([^}]*)\}+/$repl/;
259							#warn " str: $string\n";
260							}
261							}
262
263	102					203	$string;
264							}
265
266							# Convert control words (not symbols), that is, a backslash and an
267							# alphabetic sequence of characters terminated by a non-alphabetic
268							# character. Following whitespace is ignored.
269							#
270							sub _convert_control_words {
271	102			102		159	my $string = shift;
272
273	102					692	foreach my $command ( keys %LaTeX::ToUnicode::Tables::CONTROL_WORDS ) {
274	5814					12500	my $repl = $LaTeX::ToUnicode::Tables::CONTROL_WORDS{$command};
275							# replace {\CMD}, whitespace ignored after \CMD.
276	5814					82552	$string =~ s/\{\\$command$endcw\}/$repl/g;
277
278							# replace \CMD, preceded by not-consumed non-backslash.
279	5814					99539	$string =~ s/(?<=[^\\])\\$command$endcw/$repl/g;
280
281							# replace \CMD at beginning of whole string, which otherwise
282							# wouldn't be matched. Two separate regexps to avoid
283							# variable-length lookbehind.
284	5814					87222	$string =~ s/^\\$command$endcw/$repl/g;
285							}
286
287	102					521	$string;
288							}
289
290							# Convert control symbols, other than accents. Much simpler than
291							# control words, since are self-delimiting, don't take arguments, and
292							# don't consume any following text.
293							#
294							sub _convert_control_symbols {
295	102			102		167	my $string = shift;
296
297	102					557	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS ) {
298	2754					5840	my $repl = $LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS{$symbol};
299
300							# because these are not alphabetic, we can quotemeta them,
301							# and we need to because "\" is one of the symbols.
302	2754					3669	my $rx = quotemeta($symbol);
303
304							# the preceding character must not be a backslash, else "\\ "
305							# could have the "\ " seen first as a control space, leaving
306							# a spurious \ behind. Don't consume the preceding.
307							# Or it could be at the beginning of a line.
308							#
309	2754					34846	$string =~ s/(^\|(?<=[^\\]))\\$rx/$repl/g;
310							#warn "after sym $symbol (\\$rx -> $repl), have: $string\n";
311							}
312
313	102					382	$string;
314							}
315
316							# Convert accents.
317							#
318							sub _convert_accents {
319	102			102		161	my $string = shift;
320
321							# first the non-alphabetic accent commands, like \".
322	102					490	my %tbl = %LaTeX::ToUnicode::Tables::ACCENT_SYMBOLS;
323	102	100				394	$string =~ s/(\{\\(.)\s*\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} \|\| $1/eg; #{\"{a}}
	29					234
324	102	100				533	$string =~ s/(\{\\(.)\s*(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # {\"a}
	47					365
325	102	50				360	$string =~ s/(\\(.)\s*(\\?\w{1,1}))/ $tbl{$2}{$3} \|\| $1/eg; # \"a
	6					40
326	102	100				293	$string =~ s/(\\(.)\s*\{(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # \"{a}
	20					111
327
328							# second the alphabetic commands, like \c. They have be handled
329							# differently because \cc is not \c{c}! The only difference in the
330							# regular expressions is using $endcw instead of just \s*.
331							#
332	102					898	%tbl = %LaTeX::ToUnicode::Tables::ACCENT_LETTERS;
333	102	50				490	$string =~ s/(\{\\(.)$endcw\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} \|\| $1/eg; #{\"{a}}
	19					114
334	102	0				347	$string =~ s/(\{\\(.)$endcw(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # {\"a}
	0					0
335	102	0				443	$string =~ s/(\\(.)$endcw(\\?\w{1,1}))/ $tbl{$2}{$3} \|\| $1/eg; # \"a
	0					0
336	102	0				404	$string =~ s/(\\(.)$endcw\{(\\?\w{1,2})\})/ $tbl{$2}{$3} \|\| $1/eg; # \"{a}
	0					0
337
338
339							# The argument is just one \w character for the \"a case, not two,
340							# because otherwise we might consume a following character that is
341							# not part of the accent, e.g., a backslash (\"a\'e).
342							#
343							# Others can be two because of the \t tie-after accent. Even {\t oo} is ok.
344							#
345							# Allow whitespace after the \CMD in all cases, e.g., "\c c". Even
346							# for the control symbols, it turns out spaces are ignored there
347							# (as in \" o), unlike the usual syntax.
348							#
349							# Some non-word constituents would work, but in practice we hope
350							# everyone just uses letters.
351
352	102					375	$string;
353							}
354
355							# For the [n]german package.
356							sub _convert_german {
357	3			3		6	my $string = shift;
358
359	3					18	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::GERMAN ) {
360	87					542	$string =~ s/\Q$symbol\E/$LaTeX::ToUnicode::Tables::GERMAN{$symbol}/g;
361							}
362	3					10	$string;
363							}
364
365							# Control words that produce printed symbols (and letters in languages
366							# other than English), that is.
367							#
368							sub _convert_symbols {
369	102			102		170	my $string = shift;
370
371	102					437	foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::SYMBOLS ) {
372	2550					5424	my $repl = $LaTeX::ToUnicode::Tables::SYMBOLS{$symbol};
373							# preceded by a (non-consumed) non-backslash,
374							# usual termination for a control word.
375							# These commands don't take arguments.
376	2550					44337	$string =~ s/(?<=[^\\])\\$symbol$endcw/$repl/g;
377
378							# or the beginning of the whole string:
379	2550					39948	$string =~ s/^\\$symbol$endcw/$repl/g;
380							}
381	102					395	$string;
382							}
383
384							# Special character sequences, not \commands. They aren't all
385							# technically ligatures, but no matter.
386							#
387							sub _convert_ligatures {
388	102			102		169	my $string = shift;
389
390							# have to convert these in order specified.
391	102					397	my @ligs = @LaTeX::ToUnicode::Tables::LIGATURES;
392	102					304	for (my $i = 0; $i < @ligs; $i+=2) {
393	816					1449	my $in = $ligs[$i];
394	816					1143	my $out = $ligs[$i+1];
395	816					5880	$string =~ s/\Q$in\E/$out/g;
396							}
397	102					311	$string;
398							}
399
400							#
401							# Convert LaTeX markup commands in STRING like \textbf{...} and
402							# {\bfshape ...} and {\bf ...}.
403							#
404							# If we're aiming for plain text output, they are just cleared away (the
405							# braces are not removed).
406							#
407							# If we're generating HTML output ("html" key is set in $OPTIONS hash
408							# ref), we use the value in the hash, so that \textbf{foo} becomes
409							# foo. Nested markup doesn't work.
410							#
411							sub _convert_markups {
412	102			102		222	my ($string, $options) = @_;
413
414							# HTML is different.
415	102	50				221	return _convert_markups_html($string) if $options->{html};
416							# Ok, we'll "convert" to plain text by removing the markup commands.
417
418							# we can do all markup commands at once.
419	102					478	my $markups = join('\|', keys %LaTeX::ToUnicode::Tables::MARKUPS);
420
421							# Remove \textMARKUP{...}, leaving just the {...}
422	102					461	$string =~ s/\\text($markups)$endcw//g;
423
424							# Similarly remove \MARKUPshape.
425	102					365	$string =~ s/\\($markups)shape$endcw//g;
426
427							# Remove braces and \command in: {... \MARKUP ...}
428	102					375	$string =~ s/(\{[^{}]+)\\(?:$markups)$endcw([^{}]+\})/$1$2/g;
429
430							# Remove braces and \command in: {\MARKUP ...}
431	102					624	$string =~ s/\{\\(?:$markups)$endcw([^{}]*)\}/$1/g;
432
433							# Remove: {\MARKUP
434							# Although this will leave unmatched } chars behind, there's no
435							# alternative without full parsing, since the bib entry will often
436							# look like: {\em {The TeX{}book}}. Also might, in principle, be
437							# at the end of a line.
438	102					406	$string =~ s/\{\\(?:$markups)$endcw//g;
439
440							# Ultimately we remove all braces in ltx2crossrefxml SanitizeText fns,
441							# so the unmatched braces don't matter ... that code should be moved.
442
443	102					332	$string;
444							}
445
446							# Convert \markup in STRING to html. We can't always figure out where to
447							# put the end tag, but we always put it somewhere. We don't even attempt
448							# to handle nested markup.
449							#
450							sub _convert_markups_html {
451	0			0			my ($string) = @_;
452
453	0						my %MARKUPS = %LaTeX::ToUnicode::Tables::MARKUPS;
454							# have to consider each markup \command separately.
455	0						for my $markup (keys %MARKUPS) {
456	0						my $hcmd = $MARKUPS{$markup}; # some TeX commands don't translate
457	0	0					my $tag = $hcmd ? "<$hcmd>" : "";
458	0	0					my $end_tag = $hcmd ? "" : "";
459
460							# The easy one: \textMARKUP{...}
461	0						$string =~ s/\\text$markup$endcw\{(.*?)\}/$tag$1$end_tag/g;
462
463							# {x\MARKUP(shape) y} -> xy (leave out braces)
464	0						$string =~ s/\{([^{}]+)\\$markup(shape)?$endcw([^{}]+)\}
465							/$1$tag$3$end_tag/gx;
466
467							# {\MARKUP(shape) y} -> y. Same as previous but without
468							# the x part. Could do it in one regex but this seems clearer.
469	0						$string =~ s/\{\\$markup(shape)?$endcw([^{}]+)\}
470							/$tag$2$end_tag/gx;
471
472							# for {\MARKUP(shape) ... with no matching brace, we don't know
473							# where to put the end tag, so seems best to do nothing.
474							}
475
476	0						$string;
477							}
478
479
480							##############################################################
481							# debug_hash_as_string($LABEL, HASH)
482							#
483							# Return LABEL followed by HASH elements, followed by a newline, as a
484							# single string. If HASH is a reference, it is followed (but no recursive
485							# derefencing).
486							###############################################################
487							sub debug_hash_as_string {
488	0			0	0		my ($label) = shift;
489	0	0	0				my (%hash) = (ref $_[0] && $_[0] =~ /.HASH./) ? %{$_[0]} : @_;
	0
490
491	0						my $str = "$label: {";
492	0						my @items = ();
493	0						for my $key (sort keys %hash) {
494	0						my $val = $hash{$key};
495	0	0					$val = ".undef" if ! defined $val;
496	0						$key =~ s/\n/\\n/g;
497	0						$val =~ s/\n/\\n/g;
498	0						push (@items, "$key:$val");
499							}
500	0						$str .= join (",", @items);
501	0						$str .= "}";
502
503	0						return "$str\n";
504							}
505
506							1;
507
508							__END__