File Coverage

blib/lib/LaTeX/ToUnicode.pm
Criterion Covered Total %
statement 118 171 69.0
branch 18 48 37.5
condition 0 3 0.0
subroutine 17 20 85.0
pod 2 3 66.6
total 155 245 63.2


line stmt bran cond sub pod time code
1 1     1   69972 use strict;
  1         2  
  1         29  
2 1     1   5 use warnings;
  1         2  
  1         41  
3             package LaTeX::ToUnicode;
4             BEGIN {
5 1     1   59 $LaTeX::ToUnicode::VERSION = '0.53';
6             }
7             #ABSTRACT: Convert LaTeX commands to Unicode (simplistically)
8              
9             require Exporter;
10             our @ISA = qw(Exporter);
11             our @EXPORT_OK = qw( convert debuglevel $endcw );
12              
13 1     1   6 use utf8;
  1         10  
  1         6  
14 1     1   593 use Encode;
  1         10561  
  1         70  
15 1     1   476 use LaTeX::ToUnicode::Tables;
  1         3  
  1         2605  
16              
17             # Terminating a control word (not symbol) the way TeX does: at the
18             # boundary between a letter (lookbehind) and a nonletter (lookahead),
19             # and then ignore any following whitespace.
20             our $endcw = qr/(?<=[a-zA-Z])(?=[^a-zA-Z]|$)\s*/;
21              
22             # all we need for is debugging being on and off. And it's pretty random
23             # what gets output.
24             my $debug = 0;
25 0     0 1 0 sub debuglevel { $debug = shift; }
26             sub _debug {
27 510 50   510   1066 return unless $debug;
28 0         0 my ($pkgname,$filename,$line,$subr) = caller(1);
29 0         0 warn @_, " at $filename:$line (${pkgname}::$subr)\n";
30             }
31              
32             # The main conversion function.
33             #
34             sub convert {
35 102     102 1 57375 my ($string, %options) = @_;
36             #warn debug_hash_as_string("starting with: $string", %options);
37              
38             # First, remove leading and trailing horizontal whitespace
39             # on each line of the possibly-multiline string we're given.
40 102         558 $string =~ s/^[ \t]*//m;
41 102         520 $string =~ s/[ \t]*$//m;
42            
43             # For HTML output, must convert special characters that were in the
44             # TeX text (&<>) to their entities to avoid misparsing. We want to
45             # do this first, because conversion of the markup commands might
46             # output HTML tags like , and we don't want to convert those <>.
47             # Although <tt> works, better to keep the output HTML as
48             # human-readable as we can.
49             #
50 102 50       313 if ($options{html}) {
51 0         0 $string =~ s/([^\\]|^)&/$1&/g;
52 0         0 $string =~ s/
53 0         0 $string =~ s/>/>/g;
54             }
55            
56 102         167 my $user_hook = $options{hook};
57 102 50       187 if ($user_hook) {
58 0         0 $string = &$user_hook($string, \%options);
59 0         0 _debug("after user hook: $string");
60             }
61            
62             # Convert general commands that take arguments, since (1) they might
63             # insert TeX commands that need to be converted, and (2) because
64             # their arguments could well contain constructs that will map to a
65             # Perl string \x{nnnn} for Unicode character nnnn; those Perl braces
66             # for the \x will confuse further parsing of the TeX.
67             #
68 102         187 $string = _convert_commands_with_arg($string);
69 102         354 _debug("after commands with arg: $string");
70            
71             # Convert markups (\texttt, etc.); they have the same brace-parsing issue.
72 102         248 $string = _convert_markups($string, \%options);
73 102         320 _debug("after markups: $string");
74            
75             # And urls, a special case of commands with arguments.
76 102         211 $string = _convert_urls($string, \%options);
77 102         288 _debug("after urls: $string");
78              
79 102         173 $string = _convert_control_words($string);
80 102         360 _debug("after control words: $string");
81              
82 102         183 $string = _convert_control_symbols($string);
83 102         311 _debug("after control symbols: $string");
84              
85 102         187 $string = _convert_accents($string);
86 102 100       279 $string = _convert_german($string) if $options{german};
87 102         209 $string = _convert_symbols($string);
88 102         209 $string = _convert_ligatures($string);
89            
90             # Let's handle ties here, after all the other conversions, since
91             # they don't fit well with any of the tables.
92             #
93             # /~, or ~ at the beginning of a line, is probably part of a url or
94             # path, not a tie. Otherwise, consider it a space, since a no-break
95             # spot in TeX is most likely fine to break in text or HTML.
96             #
97 102         186 $string =~ s,([^/])~,$1 ,g;
98            
99             # Remove kerns. Clearly needs generalizing/sharpening to recognize
100             # dimens better, and plenty of other commands could use it.
101             #_debug("before kern: $string");
102 102         301 my $dimen_re = qr/[-+]?[0-9., ]+[a-z][a-z]\s*/;
103 102         369 $string =~ s!\\kern${endcw}${dimen_re}!!g;
104            
105             # What the heck, let's do \hfuzz and \vfuzz too. They come up pretty
106             # often and practically the same thing (plus ignore optional =)..
107 102         344 $string =~ s!\\[hv]fuzz${endcw}=?\s*${dimen_re}!!g;
108              
109             # After all the conversions, $string contains \x{....} constructs
110             # (Perl Unicode characters) where translations have happened. Change
111             # those to the desired output format. Thus we assume that the
112             # Unicode \x{....}'s are not themselves involved in further
113             # translations, which is, so far, true.
114             #
115 102 50       290 if (! $options{entities}) {
    0          
116             # Convert our \x strings from Tables.pm to the binary characters.
117             # Assume no more than four hex digits.
118 102         531 $string =~ s/\\x\{(.{1,4})\}/ pack('U*', hex($1))/eg;
  85         577  
119              
120             } elsif ($options{entities}) {
121             # Convert the XML special characters that appeared in the input,
122             # e.g., from a TeX \&. Unless we're generating HTML output, in
123             # which case they have already been converted.
124 0 0       0 if (! $options{html}) {
125 0         0 $string =~ s/&/&/g;
126 0         0 $string =~ s/
127 0         0 $string =~ s/>/>/g;
128             }
129            
130             # Our values in Tables.pm are simple ASCII strings \x{....},
131             # so we can replace them with hex entities with no trouble.
132             # Fortunately TeX does not have a standard \x control sequence.
133 0         0 $string =~ s/\\x\{(....)\}/&#x$1;/g;
134            
135             # The rest of the job is about binary Unicode characters in the
136             # input. We want to transform them into entities also. As always
137             # in Perl, there's more than one way to do it, and several are
138             # described here, just for the fun of it.
139 0         0 my $ret = "";
140             #
141             # decode_utf8 is described in https://perldoc.perl.org/Encode.
142             # Without the decode_utf8, all of these methods output each byte
143             # separately; apparently $string is a byte string at this point,
144             # not a Unicode string. I don't know why that is.
145 0         0 $ret = decode_utf8($string);
146             #
147             # Transform everything that's not printable ASCII or newline into
148             # entities.
149 0         0 $ret =~ s/([^ -~\n])/ sprintf("&#x%04x;", ord($1)) /eg;
  0         0  
150             #
151             # This method leaves control characters as literal; doesn't matter
152             # for XML output, since control characters aren't allowed, but
153             # let's use the regexp method anyway.
154             #$ret = encode("ascii", decode_utf8($string), Encode::FB_XMLCREF);
155             #
156             # The nice_string function from perluniintro also works.
157             #
158             # This fails, just outputs numbers (that is, ord values):
159             # foreach my $c (unpack("U*", $ret)) {
160             #
161             # Without the decode_utf8, outputs each byte separately.
162             # With the decode_utf8, works, but the above seems cleaner.
163             #foreach my $c (split(//, $ret)) {
164             # if (ord($c) <= 31 || ord($c) >= 128) {
165             # $ret .= sprintf("&#x%04x;", ord($c));
166             # } else {
167             # $ret .= $c;
168             # }
169             #}
170             #
171 0         0 $string = $ret; # assigned from above.
172             }
173              
174 102 50       271 if ($string =~ /\\x\{/) {
175 0         0 warn "LaTeX::ToUnicode::convert: untranslated \\x remains: $string\n";
176 0         0 warn "LaTeX::ToUnicode::convert: please report as bug.\n";
177             }
178            
179             # Drop all braces.
180 102         290 $string =~ s/[{}]//g;
181            
182             # Backslashes might remain. Don't remove them, as it makes for a
183             # useful way to find unhandled commands.
184              
185             # leave newlines alone, but trim spaces and tabs.
186 102         223 $string =~ s/^[ \t]+//s; # remove leading whitespace
187 102         201 $string =~ s/[ \t]+$//s; # remove trailing whitespace
188 102         163 $string =~ s/[ \t]+/ /gs; # collapse all remaining whitespace to one space
189            
190 102         641 $string;
191             }
192              
193             # Convert commands that take a single braced argument. The table
194             # defines text we're supposed to insert before and after the argument.
195             # We let future processing handle conversion of both the inserted text
196             # and the argument.
197             #
198             sub _convert_commands_with_arg {
199 102     102   174 my $string = shift;
200              
201 102         298 foreach my $cmd ( keys %LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS ) {
202 306         643 my $repl = $LaTeX::ToUnicode::Tables::ARGUMENT_COMMANDS{$cmd};
203 306         445 my $lft = $repl->[0]; # ref to two-element list
204 306         393 my $rht = $repl->[1];
205             # \cmd{foo} -> LFT foo RHT
206 306         6360 $string =~ s/\\$cmd${endcw}\{(.*?)\}/$lft$1$rht/g;
207             #warn "replaced arg $cmd, yielding $string\n";
208             }
209            
210 102         302 $string;
211             }
212              
213             # Convert url commands in STRING. This is a special case of commands
214             # with arguments: \url{u} and \href{u}{desc text}. The HTML output
215             # (generated if $OPTIONS{html} is set) is just too special to be handled
216             # in a table; further, \href is the only two-argument command we are
217             # currently handling.
218             #
219             sub _convert_urls {
220 102     102   186 my ($string,$options) = @_;
221              
222 102 50       223 if ($options->{html}) {
223             # HTML output.
224             # \url{URL} -> URL
225 0         0 $string =~ s,\\url$endcw\{([^}]*)\}
226             ,$1,gx;
227             #
228             # \href{URL}{TEXT} -> TEXT
229 0         0 $string =~ s,\\href$endcw\{([^}]*)\}\s*\{([^}]*)\}
230             ,$2,gx;
231              
232             } else {
233             # plain text output.
234             # \url{URL} -> URL
235 102         309 $string =~ s/\\url$endcw\{([^}]*)\}/$1/g;
236             #
237             # \href{URL}{TEXT} -> TEXT (URL)
238             # but, as a special case, if URL ends with TEXT, just output URL,
239             # as in:
240             # \href{https://doi.org/10/fjzzc8}{10/fjzzc8}
241             # ->
242             # https://doi.org/10/fjzzc8
243             #
244             # Yet more specialness: the TEXT might have extra braces, as in
245             # \href{https://doi.org/10/fjzzc8}{{10/fjzzc8}}
246             # left over from previous markup commands (\path) which got
247             # removed. We want to accept and ignore such extra braces,
248             # hence the \{+ ... \}+ in recognizing TEXT.
249             #
250             #warn "txt url: starting with $string\n";
251 102 50       388 if ($string =~ m/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/) {
252 0         0 my $url = $1;
253 0         0 my $text = $2;
254             #warn " url: $url\n";
255             #warn " text: $text\n";
256 0 0       0 my $repl = ($url =~ m!$text$!) ? $url : "$text ($url)";
257             #warn " repl: $repl\n";
258 0         0 $string =~ s/\\href$endcw\{([^}]*)\}\s*\{+([^}]*)\}+/$repl/;
259             #warn " str: $string\n";
260             }
261             }
262            
263 102         203 $string;
264             }
265              
266             # Convert control words (not symbols), that is, a backslash and an
267             # alphabetic sequence of characters terminated by a non-alphabetic
268             # character. Following whitespace is ignored.
269             #
270             sub _convert_control_words {
271 102     102   159 my $string = shift;
272              
273 102         692 foreach my $command ( keys %LaTeX::ToUnicode::Tables::CONTROL_WORDS ) {
274 5814         12500 my $repl = $LaTeX::ToUnicode::Tables::CONTROL_WORDS{$command};
275             # replace {\CMD}, whitespace ignored after \CMD.
276 5814         82552 $string =~ s/\{\\$command$endcw\}/$repl/g;
277            
278             # replace \CMD, preceded by not-consumed non-backslash.
279 5814         99539 $string =~ s/(?<=[^\\])\\$command$endcw/$repl/g;
280            
281             # replace \CMD at beginning of whole string, which otherwise
282             # wouldn't be matched. Two separate regexps to avoid
283             # variable-length lookbehind.
284 5814         87222 $string =~ s/^\\$command$endcw/$repl/g;
285             }
286              
287 102         521 $string;
288             }
289              
290             # Convert control symbols, other than accents. Much simpler than
291             # control words, since are self-delimiting, don't take arguments, and
292             # don't consume any following text.
293             #
294             sub _convert_control_symbols {
295 102     102   167 my $string = shift;
296              
297 102         557 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS ) {
298 2754         5840 my $repl = $LaTeX::ToUnicode::Tables::CONTROL_SYMBOLS{$symbol};
299              
300             # because these are not alphabetic, we can quotemeta them,
301             # and we need to because "\" is one of the symbols.
302 2754         3669 my $rx = quotemeta($symbol);
303            
304             # the preceding character must not be a backslash, else "\\ "
305             # could have the "\ " seen first as a control space, leaving
306             # a spurious \ behind. Don't consume the preceding.
307             # Or it could be at the beginning of a line.
308             #
309 2754         34846 $string =~ s/(^|(?<=[^\\]))\\$rx/$repl/g;
310             #warn "after sym $symbol (\\$rx -> $repl), have: $string\n";
311             }
312              
313 102         382 $string;
314             }
315              
316             # Convert accents.
317             #
318             sub _convert_accents {
319 102     102   161 my $string = shift;
320            
321             # first the non-alphabetic accent commands, like \".
322 102         490 my %tbl = %LaTeX::ToUnicode::Tables::ACCENT_SYMBOLS;
323 102 100       394 $string =~ s/(\{\\(.)\s*\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}}
  29         234  
324 102 100       533 $string =~ s/(\{\\(.)\s*(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a}
  47         365  
325 102 50       360 $string =~ s/(\\(.)\s*(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a
  6         40  
326 102 100       293 $string =~ s/(\\(.)\s*\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a}
  20         111  
327            
328             # second the alphabetic commands, like \c. They have be handled
329             # differently because \cc is not \c{c}! The only difference in the
330             # regular expressions is using $endcw instead of just \s*.
331             #
332 102         898 %tbl = %LaTeX::ToUnicode::Tables::ACCENT_LETTERS;
333 102 50       490 $string =~ s/(\{\\(.)$endcw\{(\\?\w{1,2})\}\})/$tbl{$2}{$3} || $1/eg; #{\"{a}}
  19         114  
334 102 0       347 $string =~ s/(\{\\(.)$endcw(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # {\"a}
  0         0  
335 102 0       443 $string =~ s/(\\(.)$endcw(\\?\w{1,1}))/ $tbl{$2}{$3} || $1/eg; # \"a
  0         0  
336 102 0       404 $string =~ s/(\\(.)$endcw\{(\\?\w{1,2})\})/ $tbl{$2}{$3} || $1/eg; # \"{a}
  0         0  
337            
338            
339             # The argument is just one \w character for the \"a case, not two,
340             # because otherwise we might consume a following character that is
341             # not part of the accent, e.g., a backslash (\"a\'e).
342             #
343             # Others can be two because of the \t tie-after accent. Even {\t oo} is ok.
344             #
345             # Allow whitespace after the \CMD in all cases, e.g., "\c c". Even
346             # for the control symbols, it turns out spaces are ignored there
347             # (as in \" o), unlike the usual syntax.
348             #
349             # Some non-word constituents would work, but in practice we hope
350             # everyone just uses letters.
351              
352 102         375 $string;
353             }
354              
355             # For the [n]german package.
356             sub _convert_german {
357 3     3   6 my $string = shift;
358              
359 3         18 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::GERMAN ) {
360 87         542 $string =~ s/\Q$symbol\E/$LaTeX::ToUnicode::Tables::GERMAN{$symbol}/g;
361             }
362 3         10 $string;
363             }
364              
365             # Control words that produce printed symbols (and letters in languages
366             # other than English), that is.
367             #
368             sub _convert_symbols {
369 102     102   170 my $string = shift;
370              
371 102         437 foreach my $symbol ( keys %LaTeX::ToUnicode::Tables::SYMBOLS ) {
372 2550         5424 my $repl = $LaTeX::ToUnicode::Tables::SYMBOLS{$symbol};
373             # preceded by a (non-consumed) non-backslash,
374             # usual termination for a control word.
375             # These commands don't take arguments.
376 2550         44337 $string =~ s/(?<=[^\\])\\$symbol$endcw/$repl/g;
377            
378             # or the beginning of the whole string:
379 2550         39948 $string =~ s/^\\$symbol$endcw/$repl/g;
380             }
381 102         395 $string;
382             }
383              
384             # Special character sequences, not \commands. They aren't all
385             # technically ligatures, but no matter.
386             #
387             sub _convert_ligatures {
388 102     102   169 my $string = shift;
389              
390             # have to convert these in order specified.
391 102         397 my @ligs = @LaTeX::ToUnicode::Tables::LIGATURES;
392 102         304 for (my $i = 0; $i < @ligs; $i+=2) {
393 816         1449 my $in = $ligs[$i];
394 816         1143 my $out = $ligs[$i+1];
395 816         5880 $string =~ s/\Q$in\E/$out/g;
396             }
397 102         311 $string;
398             }
399              
400             #
401             # Convert LaTeX markup commands in STRING like \textbf{...} and
402             # {\bfshape ...} and {\bf ...}.
403             #
404             # If we're aiming for plain text output, they are just cleared away (the
405             # braces are not removed).
406             #
407             # If we're generating HTML output ("html" key is set in $OPTIONS hash
408             # ref), we use the value in the hash, so that \textbf{foo} becomes
409             # foo. Nested markup doesn't work.
410             #
411             sub _convert_markups {
412 102     102   222 my ($string, $options) = @_;
413            
414             # HTML is different.
415 102 50       221 return _convert_markups_html($string) if $options->{html};
416             # Ok, we'll "convert" to plain text by removing the markup commands.
417              
418             # we can do all markup commands at once.
419 102         478 my $markups = join('|', keys %LaTeX::ToUnicode::Tables::MARKUPS);
420            
421             # Remove \textMARKUP{...}, leaving just the {...}
422 102         461 $string =~ s/\\text($markups)$endcw//g;
423              
424             # Similarly remove \MARKUPshape.
425 102         365 $string =~ s/\\($markups)shape$endcw//g;
426              
427             # Remove braces and \command in: {... \MARKUP ...}
428 102         375 $string =~ s/(\{[^{}]+)\\(?:$markups)$endcw([^{}]+\})/$1$2/g;
429              
430             # Remove braces and \command in: {\MARKUP ...}
431 102         624 $string =~ s/\{\\(?:$markups)$endcw([^{}]*)\}/$1/g;
432              
433             # Remove: {\MARKUP
434             # Although this will leave unmatched } chars behind, there's no
435             # alternative without full parsing, since the bib entry will often
436             # look like: {\em {The TeX{}book}}. Also might, in principle, be
437             # at the end of a line.
438 102         406 $string =~ s/\{\\(?:$markups)$endcw//g;
439              
440             # Ultimately we remove all braces in ltx2crossrefxml SanitizeText fns,
441             # so the unmatched braces don't matter ... that code should be moved.
442              
443 102         332 $string;
444             }
445              
446             # Convert \markup in STRING to html. We can't always figure out where to
447             # put the end tag, but we always put it somewhere. We don't even attempt
448             # to handle nested markup.
449             #
450             sub _convert_markups_html {
451 0     0     my ($string) = @_;
452            
453 0           my %MARKUPS = %LaTeX::ToUnicode::Tables::MARKUPS;
454             # have to consider each markup \command separately.
455 0           for my $markup (keys %MARKUPS) {
456 0           my $hcmd = $MARKUPS{$markup}; # some TeX commands don't translate
457 0 0         my $tag = $hcmd ? "<$hcmd>" : "";
458 0 0         my $end_tag = $hcmd ? "" : "";
459            
460             # The easy one: \textMARKUP{...}
461 0           $string =~ s/\\text$markup$endcw\{(.*?)\}/$tag$1$end_tag/g;
462              
463             # {x\MARKUP(shape) y} -> xy (leave out braces)
464 0           $string =~ s/\{([^{}]+)\\$markup(shape)?$endcw([^{}]+)\}
465             /$1$tag$3$end_tag/gx;
466              
467             # {\MARKUP(shape) y} -> y. Same as previous but without
468             # the x part. Could do it in one regex but this seems clearer.
469 0           $string =~ s/\{\\$markup(shape)?$endcw([^{}]+)\}
470             /$tag$2$end_tag/gx;
471            
472             # for {\MARKUP(shape) ... with no matching brace, we don't know
473             # where to put the end tag, so seems best to do nothing.
474             }
475            
476 0           $string;
477             }
478              
479            
480             ##############################################################
481             # debug_hash_as_string($LABEL, HASH)
482             #
483             # Return LABEL followed by HASH elements, followed by a newline, as a
484             # single string. If HASH is a reference, it is followed (but no recursive
485             # derefencing).
486             ###############################################################
487             sub debug_hash_as_string {
488 0     0 0   my ($label) = shift;
489 0 0 0       my (%hash) = (ref $_[0] && $_[0] =~ /.*HASH.*/) ? %{$_[0]} : @_;
  0            
490              
491 0           my $str = "$label: {";
492 0           my @items = ();
493 0           for my $key (sort keys %hash) {
494 0           my $val = $hash{$key};
495 0 0         $val = ".undef" if ! defined $val;
496 0           $key =~ s/\n/\\n/g;
497 0           $val =~ s/\n/\\n/g;
498 0           push (@items, "$key:$val");
499             }
500 0           $str .= join (",", @items);
501 0           $str .= "}";
502              
503 0           return "$str\n";
504             }
505              
506             1;
507              
508             __END__