File Coverage

blib/lib/LaTeX/Decode.pm

Criterion	Covered	Total	%
statement	71	87	81.6
branch	16	26	61.5
condition	2	6	33.3
subroutine	11	11	100.0
pod	1	1	100.0
total	101	131	77.1

line	stmt	bran	cond	sub	pod	time	code
1							package LaTeX::Decode;
2
3	2			2		32171	use 5.008;
	2					6
	2					84
4	2			2		12	use warnings;
	2					3
	2					67
5	2			2		7	use strict;
	2					6
	2					55
6	2			2		12	use Carp;
	2					2
	2					158
7
8							=encoding utf-8
9
10							=head1 NAME
11
12							LaTeX::Decode - Decode from LaTeX to Unicode
13
14							=head1 VERSION
15
16							Version 0.04
17
18							=cut
19
20	2			2		11	use base qw(Exporter);
	2					5
	2					277
21							our $VERSION = '0.04';
22							our @EXPORT = qw(latex_decode);
23	2			2		1110	use LaTeX::Decode::Data;
	2					4
	2					495
24	2			2		1362	use Unicode::Normalize;
	2					4101
	2					1305
25
26							=head1 SYNOPSIS
27
28							use LaTeX::Decode;
29
30							my $latex_string = 'Mu\\d{h}ammad ibn M\\=us\=a al-Khw\\=arizm\\={\\i}';
31							my $new_string = latex_decode($latex_string); # => 'Muḥammad ibn Mūsā al-Khwārizmī'
32
33							=head1 DESCRIPTION
34
35							=head1 EXPORT
36
37							=head1 FUNCTIONS
38
39							=head2 latex_decode($text, %options)
40
41							Decodes the given text from LaTeX to Unicode.
42
43							The function accepts a number of options:
44
45							* normalize => $bool (default 0)
46							whether the output string should be normalized with Unicode::Normalize
47
48							* normalization => (default 'NFC')
49							and if yes, the normalization form to use (see the Unicode::Normalize documentation)
50
51							* strip_outer_braces => $bool (default 0)
52							whether the outer curly braces around letters+combining marks should be
53							stripped off. By default "fut{\\'e}" becomes fut{é}, to prevent something
54							like '\\textuppercase{\\'e}' to become '\\textuppercaseé'. Setting this option to
55							TRUE can be useful for instance when converting BibTeX files.
56
57							=head1 GLOBAL OPTIONS
58
59							The decoding scheme can be set with
60							$LaTeX::Decode::DefaultScheme = '';
61							Possible values are 'base', 'extra' and 'full'; default value is 'extra'.
62
63							TODO : explain these scheme!
64
65							base => Most common macros and diacritics (sufficient for Western languages
66							and common symbols)
67
68							extra => Also converts punctuation, larger range of diacritics and macros (e.g. for IPA, Latin Extended
69							Additional, etc.)
70
71							full => Also converts symbols, Greek letters, dingbats, negated symbols, and
72							superscript characters and symbols ...
73
74							=cut
75
76							our $DefaultScheme = 'extra';
77
78							sub _get_diac {
79	3			3		4	my $scheme = shift;
80	3	50				7	if ( $scheme eq 'base' ) {
81	0					0	return %DIACRITICS;
82							}
83							else {
84	3					178	return ( %DIACRITICS, %DIACRITICSEXTRA );
85							}
86							}
87
88							sub _get_mac {
89	3			3		4	my $scheme = shift;
90	3	50				13	if ( $scheme eq 'base' ) {
		50
91	0					0	return %WORDMACROS;
92							}
93							elsif ( $scheme eq 'full' ) {
94	0					0	return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION, %SYMBOLS,
95							%GREEK );
96							}
97							else {
98	3					434	return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION );
99							}
100							}
101
102							sub latex_decode {
103	3			3	1	1624	my $text = shift;
104	3					8	my %opts = @_;
105	3	50				10	my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
106	3	50				11	my $norm_form = exists $opts{normalization} ? $opts{normalization} : 'NFC';
107	3	50				10	my $scheme = exists $opts{scheme} ? $opts{scheme} : $DefaultScheme;
108	3	50	33			31	croak "invalid scheme name '$scheme'"
			33
109							unless ( $scheme eq 'full' or $scheme eq 'base' or $scheme eq 'extra' );
110	3	100				9	my $strip_outer_braces =
111							exists $opts{strip_outer_braces} ? $opts{strip_outer_braces} : 0;
112
113	3					13	my %DIAC = _get_diac($scheme);
114	3					21	my %WORDMAC = _get_mac($scheme);
115
116							# a regex with all possible word macros
117	5455					4312	my $WORDMAC_RE =
118	3					143	join( '\|', sort { length $b <=> length $a } keys %WORDMAC );
119	3					2607	$WORDMAC_RE = qr{$WORDMAC_RE};
120
121	3					131	my $DIAC_RE;
122	3	50				15	if ( $scheme eq 'base' ) {
123	0					0	$DIAC_RE = $DIAC_RE_BASE;
124							}
125							else {
126	3					7	$DIAC_RE = $DIAC_RE_EXTRA;
127							}
128
129	3	50				7	if ( $scheme eq 'full' ) {
130	0					0	$text =~ s/\\not\\($NEG_SYMB_RE)/$NEGATEDSYMBOLS{$1}/ge;
	0					0
131	0					0	$text =~ s/\\textsuperscript{($SUPER_RE)}/$SUPERSCRIPTS{$1}/ge;
	0					0
132	0					0	$text =~ s/\\textsuperscript{\\($SUPERCMD_RE)}/$CMDSUPERSCRIPTS{$1}/ge;
	0					0
133	0					0	$text =~ s/\\dings{([2-9AF][0-9A-F])}/$DINGS{$1}/ge;
	0					0
134							}
135
136	3					38	$text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
137	3					20	$text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
138	3					93	$text =~ s/(\\(?:$DIAC_RE_BASE\|$ACCENTS_RE)){\\i}/$1\{i\}/g;
139							# special cases such as '\={\i}' -> '\={i}' -> "i\x{304}"
140
141							## remove {} around macros that print one character
142							## by default we skip that, as it would break constructions like \foo{\i}
143	3	100				10	if ($strip_outer_braces) {
144	1					354	$text =~ s/ \{\\($WORDMAC_RE)\} / $WORDMAC{$1} /gxe;
	0					0
145							}
146	3					1051	$text =~ s/ \\($WORDMAC_RE)(?: \{\} \| \s+ \| \b) / $WORDMAC{$1} /gxe;
	6					29
147
148	3					53	$text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;
	6					1012
149
150	3					39	$text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;
	5					162
151
152	3					112	$text =~ s/\\($DIAC_RE)\s\{(\p{L}\p{M})\}/$2 . $DIAC{$1}/ge;
	6					456
153
154	3					76	$text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;
	4					314
155
156	3					36	$text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;
	4					13
157
158	3					166	$text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;
	0					0
159
160	3					229	$text =~ s/\\($DIAC_RE)\s\{(\p{L}\p{M})\}/$2 . $DIAC{$1}/ge;
	1					4
161
162	3					424	$text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;
	0					0
163
164							## remove {} around letter+combining mark(s)
165							## by default we skip that, as it would destroy constructions like \foo{\`e}
166	3	100				318	if ($strip_outer_braces) {
167	2			2		553	$text =~ s/{(\PM\pM+)}/$1/g;
	2					8
	2					27
	1					12
168							}
169
170	3	50				8	if ($norm) {
171	3					13	return Unicode::Normalize::normalize( $norm_form, $text );
172							}
173							else {
174	0						return $text;
175							}
176							}
177
178							=head1 AUTHOR
179
180							François Charette, C<< >>
181
182							=head1 BUGS
183
184							Please report any bugs or feature requests to C
185							rt.cpan.org>, or through the web interface at
186							L. I will be
187							notified, and then you'll automatically be notified of progress on your bug as
188							I make changes.
189
190							=head1 COPYRIGHT & LICENSE
191
192							Copyright 2009-2015 François Charette, all rights reserved.
193
194							This module is free software. You can redistribute it and/or
195							modify it under the terms of the Artistic License 2.0.
196
197							This program is distributed in the hope that it will be useful,
198							but without any warranty; without even the implied warranty of
199							merchantability or fitness for a particular purpose.
200
201							=cut
202
203							1;
204
205							# vim: set tabstop=4 shiftwidth=4 expandtab: