File Coverage

blib/lib/Biblio/bp/lib/bp-cs-tex.pl
Criterion Covered Total %
statement 72 154 46.7
branch 31 64 48.4
condition 0 15 0.0
subroutine 2 3 66.6
pod n/a
total 105 236 44.4


line stmt bran cond sub pod time code
1             #
2             # bibliography package for Perl
3             #
4             # TeX character set.
5             #
6             # Dana Jacobsen (dana@acm.org)
7             # 22 January 1995 (last modified on 14 March 1996)
8             #
9             # These routines have gone through a major update in November 1995.
10             #
11             # This is still in beta.
12             # There are many characters not implemented, and the underlying charset
13             # code is not solid yet.
14             #
15             # Some ugly convolutions are gone through to make it run at a decent
16             # speed. This code is _very_ timing sensitive. On a typical 1043 record
17             # run, the first implementation ran at 83 seconds for tocanon, 28 seconds
18             # for fromcanon. Two days of work brought this down to 1 second and 2
19             # seconds.
20             # Lesson:
21             # If you're not careful, you may find the charset code dominating
22             # your entire conversion time since it is run for every _field_, but
23             # with some careful profiling, it can be very fast.
24             #
25              
26              
27             ####
28             #
29             # ToDo's identified by ptandler, 02-07-18
30             #
31             # - Unknown TeX characters in 'ACM SIG{\-}PLAN Notices' --> \- is an optional hyphen
32             # - Unknown TeX characters in '\{lopez,borning\}@cs' --> \{ and \} protect braces
33             # - braces are not removed ... in bibtex they are often needed to protect the case of words
34             # that should not be converted to lowercase in titles.
35             # - tex commands like \cite{....} in bibtex entries are treated as unknown tex characters
36             #
37             ####
38              
39             package bp_cs_tex;
40              
41             ######
42              
43             $bib'charsets{'tex', 'i_name'} = 'tex';
44              
45             $bib'charsets{'tex', 'tocanon'} = "bp_cs_tex'tocanon";
46             $bib'charsets{'tex', 'fromcanon'} = "bp_cs_tex'fromcanon";
47              
48             $bib'charsets{'tex', 'toesc'} = "[\$\\\\]";
49             # XXXXX We have so many characters to protect, should we even bother?
50             $bib'charsets{'tex', 'fromesc'} = "[\\#\$\%\&{}_\|><\^~\200-\377]|${bib'cs_ext}|${bib'cs_meta}";
51              
52             ######
53              
54             $cs_init = 0;
55              
56             # package variables for anyone to use
57             $mine = '';
58             $unicode = '';
59             $can = '';
60              
61             ######
62              
63             sub init_cs {
64              
65             # Thorn and eth are really nasty since they don't exist in the standard TeX
66             # fonts. This is what I came up with in r2b to fake it. Fortunately they
67             # aren't used often. Get the cmoer fonts if you want to do them right.
68             # My eth is pretty nice, but the thorn leaves a little to be desired.
69              
70 2     2   245 %charmap = (
71             '00A1', "!'",
72             '00A2', '\leavevmode\hbox{\rm\rlap/c}',
73             '00A3', '{\pounds}',
74             '00A4', '$\spadesuit$',
75             '00A5', '\leavevmode\hbox{\rm\rlap=Y}',
76             '00A6', '\leavevmode
77             \hbox{\hskip.4ex\hbox{\ooalign{\vrule width.2ex height.5ex depth.4ex\crcr
78             \hfil\raise.8ex\hbox{\vrule width.2ex height.9ex depth0ex}\hfil}}}',
79             '00A7', '\S ',
80             '00A8', '{\"{ }}',
81             '00A9', '\leavevmode\hbox{\raise.6em\hbox{\copyright}}',
82             '00AA', '${}^{\b{\scriptsize a}}$',
83             '00AB', '$\scriptscriptstyle\ll$',
84             '00AC', '$\neg$',
85             '00AE', '\leavevmode\hbox{\raise.6em\hbox{\ooalign{{\mathhexbox20D}\crcr
86             \hfil\raise.07ex\hbox{r}\hfil}}}',
87             '00AF', '{\={ }}',
88             '00B0', '${}^\circ$',
89             '00B1', '$\pm$',
90             '00B2', '${}^2$',
91             '00B3', '${}^3$',
92             '00B4', '{\'{ }}',
93             '00B5', '$\mu$',
94             '00B6', '\P ',
95             '00B7', '$\cdot$',
96             '00B8', '{\c{ }}',
97             '00B9', '${}^1$',
98             '00BA', '${}^{\b{\scriptsize o}}$',
99             '00BB', '$\scriptscriptstyle\gg$',
100             '00BC', '$1\over4$',
101             '00BD', '$1\over2$',
102             '00BE', '$3\over4$',
103             '00BF', '?`',
104             '00C0', '{\`A}',
105             '00C1', q-{\'A}-,
106             '00C2', '{\^A}',
107             '00C3', '{\~A}',
108             '00C4', '{\"A}',
109             '00C5', '{\AA}',
110             '00C6', '{\AE}',
111             '00C7', '{\c{C}}',
112             '00C8', '{\`E}',
113             '00C9', q-{\'E}-,
114             '00CA', '{\^E}',
115             '00CB', '{\"E}',
116             '00CC', '{\`I}',
117             '00CD', q-{\'I}-,
118             '00CE', '{\^I}',
119             '00CF', '{\"I}',
120             '00D0', '\leavevmode\hbox{\ooalign{{D}\crcr
121             \hskip.2ex\raise.25ex\hbox{-}\hfil}}',
122             '00D1', '{\~N}',
123             '00D2', '{\`O}',
124             '00D3', q-{\'O}-,
125             '00D4', '{\^O}',
126             '00D5', '{\~O}',
127             '00D6', '{\"O}',
128             '00D7', '$\times$',
129             '00D8', '{\O}',
130             '00D9', '{\`U}',
131             '00DA', q-{\'U}-,
132             '00DB', '{\^U}',
133             '00DC', '{\"U}',
134             '00DD', q-{\'Y}-,
135             '00DE', '\leavevmode\hbox{I\hskip-.6ex\raise.5ex\hbox{$\scriptscriptstyle\supset$}}',
136             '00DF', '{\ss}',
137             '00E0', '{\`a}',
138             '00E1', q-{\'a}-,
139             '00E2', '{\^a}',
140             '00E3', '{\~a}',
141             '00E4', '{\"a}',
142             '00E5', '{\aa}',
143             '00E6', '{\ae}',
144             '00E7', '{\c{c}}',
145             '00E8', '{\`e}',
146             '00E9', q-{\'e}-,
147             '00EA', '{\^e}',
148             '00EB', '{\"e}',
149             '00EC', '{\`i}',
150             '00ED', q-{\'i}-,
151             '00EE', '{\^i}',
152             '00EF', '{\"i}',
153             '00F0', '\leavevmode\hbox{\ooalign{$\partial$\crcr\hskip.8ex\raise.7ex\hbox{-}\hfil}}',
154             '00F1', '{\~n}',
155             '00F2', '{\`o}',
156             '00F3', q-{\'o}-,
157             '00F4', '{\^o}',
158             '00F5', '{\~o}',
159             '00F6', '{\"o}',
160             '00F7', '$\div$',
161             '00F8', '{\o}',
162             '00F9', '{\`u}',
163             '00FA', q-{\'u}-,
164             '00FB', '{\^u}',
165             '00FC', '{\"u}',
166             '00FD', q-{\'y}-,
167             '00FE', '\leavevmode\hbox{{\lower.3ex\hbox{\large l}}\hskip-.52ex o}',
168             '00FF', '{\"y}',
169             '0107', q-{\'c}-,
170             '010C', '{\vC}',
171             '010D', '{\vc}',
172             '0159', '{\vr}',
173             '015F', '{\c{s}}',
174             '0160', '{\vS}',
175             '0161', '{\vs}',
176             '017A', q-{\'z}-,
177             '017E', '{\vz}',
178             # XXXXX
179             # Should these be surrounded by $ (math mode)?
180             # Also, what to do with \mu, which is listed twice?
181             '03B1', '\alpha',
182             '03B2', '\beta',
183             '03B3', '\gamma',
184             '03B4', '\delta',
185             '03B5', '\epsilon',
186             '03B6', '\zeta',
187             '03B7', '\eta',
188             '03B8', '\theta',
189             '03B9', '\iota',
190             '03BA', '\kappa',
191             '03BB', '\lambda',
192             '03BC', '\mu',
193             '03BD', '\nu',
194             '03BE', '\xi',
195             '03C0', '\pi',
196             '03C1', '\rho',
197             '03C2', '\varsigma',
198             '03C3', '\sigma',
199             '03C4', '\tau',
200             '03C5', '\upsilon',
201             '03C6', '\phi',
202             '03C7', '\chi',
203             '03C8', '\psi',
204             '03C9', '\omega',
205             '2007', '$\:$',
206             '2009', '$\,$',
207             '201C', '``',
208             '201D', '\'\'',
209             );
210              
211             # This mapping is only used in the from section. We'll do these by hand
212             # in the to mapping.
213 2         14 %charmap2 = (
214             '00A0', '~',
215             '00AD', '-',
216             '2002', '\ ',
217             '2003', '\ \ ',
218             '2014', '---',
219             '03BF', 'o',
220             );
221              
222             # Blah. TeX has such a non-uniform way of handling characters that this is
223             # really slow. I'm going to try some optimizations for the tocanon code
224             # since that will be heavily used. It makes this stuff less uniform though.
225             # Remember that we don't have a full TeX parser, or even a partial one.
226              
227             # Build up a search string to do the reverse map.
228 2         5 $cmap_to_eval = '';
229 2         4 $cmap_from8_eval = '';
230 2         5 $cmap_to_eval_1 = '';
231 2         5 $cmap_to_eval_2 = '';
232 2         5 %rmap = ();
233 2         6 %accent = ();
234              
235             # Step 1: Build a reverse map
236 2         26 while (($unicode, $mine) = each %charmap) {
237 262         899 $rmap{$mine} = $unicode;
238             }
239             # Step 2: walk through the keys in sorted order
240 2         5 local($mineE);
241 2         150 foreach $mine (sort keys %rmap) {
242 262         706 $can = &bib'unicode_to_canon( $rmap{$mine} );
243 262         429 $mineE = $mine;
244 262         1781 $mineE =~ s/(\W)/\\$1/g;
245             # The various maps for tocanon
246 262 100       1088 if ($mine =~ /^{\\([`'^"~])([\w])}$/) {
    100          
    100          
    100          
    100          
247 102         354 $accent{$1 . $2} = $can;
248             } elsif ($mine =~ /^{\\([vc])(\w)}$/) {
249 12         39 $accent{$1 . $2} = $can;
250             } elsif ($mine =~ /^{\\([vc]){(\w)}}$/) {
251 6         20 $accent{$1 . $2} = $can;
252             } elsif ($mine =~ /leavevmode/) {
253 18         56 $cmap_to_eval_1 .= "s/$mineE/$can/g;\n";
254             } elsif ($mine =~ /\$/) {
255 40         97 $cmap_to_eval_2 .= "s/$mineE/$can/g;\n";
256             } else {
257 84         176 $cmap_to_eval .= "s/$mineE/$can/g;\n";
258             }
259 262 100       567 if ( length($can) == 1 ) {
260 188         434 $cmap_from8_eval .= "s/$can/$mineE/g;\n";
261             }
262             }
263 2         19 $cmap_from8_eval .= "s/\\240/\\~/g;\ns/\\255/-/g;";
264             # leave rmap
265              
266             #%map_diac = (
267             #'tilde', '\~{}',
268             #'circ', '\^{}',
269             #'lcub', '$\lbrace$',
270             #'rcub', '$\rbrace$',
271             #'bsol', '$\backslash$',
272             #);
273              
274             # Careful. This is from only.
275 2         47 %metamap = (
276             '3100', '{', # Begin protection
277             '3110', '}', # End protection
278             # fonts
279             '0101', '{\rm ',
280             '0102', '{\it ',
281             '0103', '{\bf ',
282             '0111', '}',
283             '0112', '}',
284             '0113', '}',
285             '0110', '}', # previous font. We don't need a font stack to handle it.
286             '2102', '{\em ',
287             '2112', '}',
288             );
289              
290 2         10 $cs_init = 1;
291             }
292              
293             ######
294              
295             sub tocanon {
296 84     84   191 local($_, $protect) = @_;
297              
298             # unprotect the TeX characters
299 84 50       188 if ($protect) {
300             # input is assumed to be in TeX format, before _any_ canon processing.
301             # output is TeX format, but with raw magic characters.
302 84         187 s/\$>\$/>/g;
303 84         139 s/\$<\$/
304 84         145 s/\$\|\$/\|/g;
305 84         114 s/\\_/_/g;
306 84         129 s/\$\\rbrace\$/}/g;
307 84         107 s/\$\\lbrace\$/{/g;
308 84         119 s/\\\&/\&/g;
309 84         113 s/\\\%/\%/g;
310 84         105 s/\\\$/\$/g;
311 84         176 s/\\#/#/g;
312             }
313              
314 84 100       218 if (/-/) {
315 18         41 s/\$-\$/${bib'cs_ext}2212/go;
316 18         34 s/\b---\b/${bib'cs_ext}2014/go;
317 18         36 s/\b--\b/${bib'cs_ext}2013/go;
318             # leave -
319             }
320 84 100       269 if (/~/) {
321 8         35 1 while s/([^\\])~/$1\240/g;
322             }
323 84         122 s/\\ \\ /${bib'cs_ext}2003/go;
324 84         123 s/\\ /${bib'cs_ext}2002/go;
325              
326             # Can we go now?
327 84 100       395 return $_ unless /\\/;
328              
329 38 100       97 &init_cs unless $cs_init;
330              
331 38 100       172 if (/\\[`'^"~vc][{ ]?[\w]/) {
332             # ISO -- we try {\"{c}}, {\"c}, \"{c}, \"c
333             # ^^^^^
334             # preferred
335             #
336             # XXXXX What do we do about all the other ways they can try?
337             # mgnet.bib uses {\" u} a lot. (got this way now)
338              
339 30         128 while (/{\\([`'^"~vc])( ?)([\w])}/) {
340 32         142 $can = $accent{$1 . $3};
341 32         112 $mine = "{\\$1$2$3}";
342 32 50       72 if (!defined $can) {
343 0         0 &bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
344 0         0 $can = '';
345             }
346 32         337 $mine =~ s/(\W)/\\$1/g;
347 32         573 s/$mine/$can/g;
348             }
349 30         96 while (/{\\([`'^"~vc]){([\w])}}/) {
350 2         11 $can = $accent{$1 . $2};
351 2         8 $mine = "{\\$1\{$2\}}";
352 2 50       11 if (!defined $can) {
353 0         0 &bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
354 0         0 $can = '';
355             }
356 2         18 $mine =~ s/(\W)/\\$1/g;
357 2         25 s/$mine/$can/g;
358             }
359 30         93 while (/\\([`'^"~vc]){([\w])}/) {
360 0         0 $can = $accent{$1 . $2};
361 0         0 $mine = "\\$1\{$2\}";
362 0 0       0 if (!defined $can) {
363 0         0 &bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
364 0         0 $can = '';
365             }
366 0         0 $mine =~ s/(\W)/\\$1/g;
367 0         0 s/$mine/$can/g;
368             }
369 30         91 while (/\\([`'^"~])( ?)([\w])/) {
370 0         0 $can = $accent{$1 . $3};
371 0         0 $mine = "\\$1$2$3";
372 0 0       0 if (!defined $can) {
373 0         0 &bib'gotwarn("Can't convert TeX '$mine' in $_ to canon");
374 0         0 $can = '';
375             }
376 0         0 $mine =~ s/(\W)/\\$1/g;
377 0         0 s/$mine/$can/g;
378             }
379              
380             # This unfortunately matches \cr and \circ. We aren't doing a loop
381             # any more, so it's not even necessary anymore. Let the standard
382             # routine try to match and give the normal error message on failure.
383             #while (s/(\\[`'^"~vc][{ ]?[\w])//) {
384             # &bib'gotwarn("Couldn't parse TeX accented character: $1!");
385             #}
386              
387 30 100       231 return $_ unless /\\/;
388             } # end of standard accented characters
389              
390             # XXXXX What about the v, c, and other accents? Do we need another
391             # section for those, or can we fit them in above?
392              
393 12 100       43 if (/leavevmode/) {
394 2         417 eval $cmap_to_eval_1;
395             }
396 12 50       46 if (/\$/) {
397 0         0 eval $cmap_to_eval_2;
398             }
399 12         6691 eval $cmap_to_eval;
400              
401 12         44 s/\\\^{}/\^/g;
402 12         55 s/\\~{\s?}/~/g;
403              
404             # hopefully we're done by now
405 12 50       108 return $_ unless /\\/;
406              
407             # font changes
408             # This doesn't work all that well, but most bibliographies are simple
409 0           s/\{\\rm ([^{}]*)\}/${bib'cs_meta}0101$1${bib'cs_meta}0110/g;
410 0           s/\{\\it ([^{}]*)\}/${bib'cs_meta}0102$1${bib'cs_meta}0110/g;
411 0           s/\{\\bf ([^{}]*)\}/${bib'cs_meta}0103$1${bib'cs_meta}0110/g;
412 0           s/\{\\em ([^{}]*)\}/${bib'cs_meta}2102$1${bib'cs_meta}2112/g;
413 0 0         $_ = &bib'font_check($_) if /${bib'cs_meta}01/o;
414             # done with font changing
415              
416 0 0         return $_ unless /\\/;
417              
418 0           s/\$\\backslash\$/$bib'cs_temp/g;
419 0 0         if (!/\\/) {
420 0           s/$bib'cs_temp/\\/go;
421 0           return $_;
422             }
423 0           s/$bib'cs_temp/\\/go;
424              
425             # I give up.
426             # XXXXX We really ought to remove the escape and meta characters we have
427             # converted when we give them this warning.
428 0           &bib'gotwarn("Unknown TeX characters in '$_'");
429 0           $_;
430             }
431              
432             ######
433              
434             sub fromcanon {
435 0     0     local($_, $protect) = @_;
436 0           local($repl);
437             # We no longer check for font matching here, as that should be done by a
438             # call to bib'font_check in the tocanon code.
439              
440 0 0         if ($protect) {
441 0           s/\\/$bib'cs_temp/go;
442 0           s/#/\\#/g;
443 0           s/\$/\\\$/g;
444 0           s/\%/\\\%/g;
445 0           s/\&/\\\&/g;
446 0           s/{/\$\\lbrace\$/g;
447 0           s/}/\$\\rbrace\$/g;
448 0           s/_/\\_/g;
449 0           s/\|/\$\|\$/g;
450 0           s/>/\$>\$/g;
451 0           s/
452 0           s/\^/\\^{}/g;
453 0           s/~/\\~{}/g;
454 0           s/$bib'cs_temp/\$\\backslash\$/go;
455             }
456              
457 0           while (/([\200-\237])/) {
458 0           $repl = $1;
459 0           $unicode = &bib'canon_to_unicode($repl);
460 0           &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
461 0           s/$repl//g;
462             }
463              
464 0 0         &init_cs unless $cs_init;
465              
466             #if (/[\240-\377]/) {
467             # eval $cmap_from8_eval;
468             #}
469 0           s/\240/~/g;
470 0           s/\255/-/g;
471 0           while (/([\240-\377])/) {
472 0           $repl = $1;
473 0           $unicode = &bib'canon_to_unicode($repl);
474 0           s/$repl/$charmap{$unicode}/g;
475             }
476              
477             # Maybe we can go now?
478 0 0         return $_ unless /$bib'cs_escape/o;
479              
480 0           while (/${bib'cs_ext}(....)/) {
481 0           $unicode = $1;
482 0 0         if ($unicode =~ /^00[0-7]/) {
483 0           1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
  0            
484 0           next;
485             }
486 0 0 0       defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
487             && next;
488 0 0 0       defined $charmap2{$unicode} && s/${bib'cs_ext}$unicode/$charmap2{$unicode}/g
489             && next;
490              
491 0           $can = &bib'unicode_approx($unicode);
492 0 0 0       defined $can && s/$bib'cs_ext$unicode/$can/g && next;
493              
494 0           &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
495 0           s/${bib'cs_ext}$unicode//g;
496             }
497              
498 0           while (/${bib'cs_meta}(....)/) {
499 0           $repl = $1;
500 0 0 0       defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
501             && next;
502              
503 0           $can = &bib'meta_approx($repl);
504 0 0 0       defined $can && s/$bib'cs_meta$repl/$can/g && next;
505              
506 0           &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to TeX");
507 0           s/${bib'cs_meta}$repl//g;
508             }
509              
510 0           $_;
511             }
512              
513             ######
514              
515              
516             #######################
517             # end of package
518             #######################
519              
520             1;