File Coverage

blib/lib/LaTeX/Decode.pm

Criterion	Covered	Total	%
statement	71	87	81.6
branch	16	26	61.5
condition	2	6	33.3
subroutine	11	11	100.0
pod	1	1	100.0
total	101	131	77.1

line	stmt	bran	cond	sub	pod	time	code
1							package LaTeX::Decode;
2
3	2			2		45250	use 5.008;
	2					8
	2					83
4	2			2		13	use warnings;
	2					5
	2					84
5	2			2		12	use strict;
	2					8
	2					78
6	2			2		19	use Carp;
	2					7
	2					187
7
8							=encoding utf-8
9
10							=head1 NAME
11
12							LaTeX::Decode - Decode from LaTeX to Unicode
13
14							=head1 VERSION
15
16							Version 0.03
17
18							=cut
19
20	2			2		10	use base qw(Exporter);
	2					9
	2					280
21							our $VERSION = '0.03';
22							our @EXPORT = qw(latex_decode);
23	2			2		1438	use LaTeX::Decode::Data;
	2					5
	2					883
24	2			2		2709	use Unicode::Normalize;
	2					15068
	2					2720
25
26							=head1 SYNOPSIS
27
28							use LaTeX::Decode;
29
30							my $latex_string = 'Mu\\d{h}ammad ibn M\\=us\=a al-Khw\\=arizm\\={\\i}';
31							my $new_string = latex_decode($latex_string); # => 'Muḥammad ibn Mūsā al-Khwārizmī'
32
33							=head1 DESCRIPTION
34
35							=head1 EXPORT
36
37							=head1 FUNCTIONS
38
39							=head2 latex_decode($text, %options)
40
41							Decodes the given text from LaTeX to Unicode.
42
43							The function accepts a number of options:
44
45							* normalize => $bool (default 1)
46							whether the output string should be normalized with Unicode::Normalize
47
48							* normalization => (default 'NFC')
49							and if yes, the normalization form to use (see the Unicode::Normalize documentation)
50
51							* strip_outer_braces => $bool (default 0)
52							whether the outer curly braces around letters+combining marks should be
53							stripped off. By default "fut{\\'e}" becomes fut{é}, to prevent something
54							like '\\textuppercase{\\'e}' to become '\\textuppercaseé'. Setting this option to
55							TRUE can be useful for instance when converting BibTeX files.
56
57							=head1 GLOBAL OPTIONS
58
59							The decoding scheme can be set with
60
61							$LaTeX::Decode::DefaultScheme = '';
62
63							Possible values are 'base', 'extra' and 'full'; default value is 'extra'.
64
65							base => Most common macros and diacritics (sufficient for Western languages
66							and common symbols)
67
68							extra => Also converts punctuation, larger range of diacritics and macros
69							(e.g. for IPA, Latin Extended Additional, etc.)
70
71							full => Also converts symbols, Greek letters, dingbats, negated symbols, and
72							superscript characters and symbols ...
73
74							=cut
75
76							our $DefaultScheme = 'extra';
77
78							sub _get_diac {
79	3			3		5	my $scheme = shift;
80	3	50				7	if ( $scheme eq 'base' ) {
81	0					0	return %DIACRITICS;
82							}
83							else {
84	3					208	return ( %DIACRITICS, %DIACRITICSEXTRA );
85							}
86							}
87
88							sub _get_mac {
89	3			3		7	my $scheme = shift;
90	3	50				14	if ( $scheme eq 'base' ) {
		50
91	0					0	return %WORDMACROS;
92							}
93							elsif ( $scheme eq 'full' ) {
94	0					0	return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION, %SYMBOLS,
95							%GREEK );
96							}
97							else {
98	3					564	return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION );
99							}
100							}
101
102							sub latex_decode {
103	3			3	1	2040	my $text = shift;
104	3					10	my %opts = @_;
105	3	50				12	my $norm = exists $opts{normalize} ? $opts{normalize} : 1;
106	3	50				11	my $norm_form = exists $opts{normalization} ? $opts{normalization} : 'NFC';
107	3	50				10	my $scheme = exists $opts{scheme} ? $opts{scheme} : $DefaultScheme;
108	3	50	33			37	croak "invalid scheme name '$scheme'"
			33
109							unless ( $scheme eq 'full' or $scheme eq 'base' or $scheme eq 'extra' );
110	3	100				9	my $strip_outer_braces =
111							exists $opts{strip_outer_braces} ? $opts{strip_outer_braces} : 0;
112
113	3					17	my %DIAC = _get_diac($scheme);
114	3					31	my %WORDMAC = _get_mac($scheme);
115
116							# a regex with all possible word macros
117	5429					5160	my $WORDMAC_RE =
118	3					177	join( '\|', sort { length $b <=> length $a } keys %WORDMAC );
119	3					3171	$WORDMAC_RE = qr{$WORDMAC_RE};
120
121	3					168	my $DIAC_RE;
122	3	50				12	if ( $scheme eq 'base' ) {
123	0					0	$DIAC_RE = $DIAC_RE_BASE;
124							}
125							else {
126	3					6	$DIAC_RE = $DIAC_RE_EXTRA;
127							}
128
129	3	50				10	if ( $scheme eq 'full' ) {
130	0					0	$text =~ s/\\not\\($NEG_SYMB_RE)/$NEGATEDSYMBOLS{$1}/ge;
	0					0
131	0					0	$text =~ s/\\textsuperscript{($SUPER_RE)}/$SUPERSCRIPTS{$1}/ge;
	0					0
132	0					0	$text =~ s/\\textsuperscript{\\($SUPERCMD_RE)}/$CMDSUPERSCRIPTS{$1}/ge;
	0					0
133	0					0	$text =~ s/\\dings{([2-9AF][0-9A-F])}/$DINGS{$1}/ge;
	0					0
134							}
135
136	3					34	$text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g; # \foo\ bar -> \foo{} bar
137	3					19	$text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g; #} Aaaa\o, -> Aaaa\o{},
138	3					103	$text =~ s/(\\(?:$DIAC_RE_BASE\|$ACCENTS_RE)){\\i}/$1\{i\}/g;
139							# special cases such as '\={\i}' -> '\={i}' -> "i\x{304}"
140
141							## remove {} around macros that print one character
142							## by default we skip that, as it would break constructions like \foo{\i}
143	3	100				10	if ($strip_outer_braces) {
144	1					499	$text =~ s/ \{\\($WORDMAC_RE)\} / $WORDMAC{$1} /gxe;
	0					0
145							}
146	3					1503	$text =~ s/ \\($WORDMAC_RE)(?: \{\} \| \s+ \| \b) / $WORDMAC{$1} /gxe;
	6					37
147
148	3					62	$text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;
	6					1497
149
150	3					42	$text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;
	5					211
151
152	3					120	$text =~ s/\\($DIAC_RE)\s\{(\p{L}\p{M})\}/$2 . $DIAC{$1}/ge;
	6					587
153
154	3					81	$text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;
	4					417
155
156	3					37	$text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;
	4					16
157
158	3					178	$text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;
	0					0
159
160	3					200	$text =~ s/\\($DIAC_RE)\s\{(\p{L}\p{M})\}/$2 . $DIAC{$1}/ge;
	1					6
161
162	3					494	$text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;
	0					0
163
164							## remove {} around letter+combining mark(s)
165							## by default we skip that, as it would destroy constructions like \foo{\`e}
166	3	100				424	if ($strip_outer_braces) {
167	2			2		1362	$text =~ s/{(\PM\pM+)}/$1/g;
	2					14
	2					33
	1					44
168							}
169
170	3	50				9	if ($norm) {
171	3					17	return Unicode::Normalize::normalize( $norm_form, $text );
172							}
173							else {
174	0						return $text;
175							}
176							}
177
178							=head1 AUTHOR
179
180							François Charette, C<< >>
181
182							=head1 BUGS
183
184							Please report any bugs or feature requests to C
185							rt.cpan.org>, or through the web interface at
186							L. I will be
187							notified, and then you'll automatically be notified of progress on your bug as
188							I make changes.
189
190							=head1 COPYRIGHT & LICENSE
191
192							Copyright 2009-2010 François Charette, all rights reserved.
193
194							This module is free software. You can redistribute it and/or
195							modify it under the terms of the Artistic License 2.0.
196
197							This program is distributed in the hope that it will be useful,
198							but without any warranty; without even the implied warranty of
199							merchantability or fitness for a particular purpose.
200
201							=cut
202
203							1;
204
205							# vim: set tabstop=4 shiftwidth=4 expandtab: