File Coverage

blib/lib/Lingua/Deva.pm

Criterion	Covered	Total	%
statement	193	219	88.1
branch	99	118	83.9
condition	37	57	64.9
subroutine	17	17	100.0
pod	6	6	100.0
total	352	417	84.4

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Deva;
2
3	6			6		290138	use v5.12.1;
	6					26
	6					471
4	6			6		34	use strict;
	6					12
	6					233
5	6			6		32	use warnings;
	6					25
	6					215
6	6			6		48	use utf8;
	6					10
	6					63
7	6			6		18275	use charnames qw( :full );
	6					279430
	6					65
8	6			6		10156	use open qw( :encoding(UTF-8) :std );
	6					10626
	6					58
9	6			6		134430	use Unicode::Normalize qw( NFD NFC );
	6					15892
	6					643
10	6			6		145	use Carp qw( croak carp );
	6					15
	6					323
11
12	6			6		5964	use Lingua::Deva::Aksara;
	6					19
	6					256
13	6					3024	use Lingua::Deva::Maps qw( %Consonants %Vowels %Diacritics %Finals
14	6			6		43	$Inherent $Virama $Avagraha );
	6					11
15
16							=encoding UTF-8
17
18							=head1 NAME
19
20							Lingua::Deva - Convert between Latin and Devanagari Sanskrit text
21
22							=cut
23
24							our $VERSION = '1.20';
25
26							=head1 SYNOPSIS
27
28							use v5.12.1;
29							use strict;
30							use utf8;
31							use charnames ':full';
32							use Lingua::Deva;
33
34							# Basic usage
35							my $d = Lingua::Deva->new();
36							say $d->to_latin('आसीद्राजा'); # prints 'āsīdrājā'
37							say $d->to_deva('Nalo nāma'); # prints 'नलो नाम'
38
39							# With configuration: strict, allow Danda, 'w' for 'v'
40							my %c = %Lingua::Deva::Maps::Consonants;
41							$d = Lingua::Deva->new(
42							strict => 1,
43							allow => [ "\N{DEVANAGARI DANDA}" ],
44							C => do { $c{'w'} = delete $c{'v'}; \%c },
45							);
46							say $d->to_deva('ziwāya'); # 'zइवाय', warning for 'z'
47							say $d->to_latin('सर्वम्।'); # 'sarwam।', no warnings
48
49							=head1 DESCRIPTION
50
51							The C module provides facilities for converting Sanskrit in
52							various Latin transliterations to Devanagari and vice-versa. "Deva" is the
53							name for the Devanagari (I) script according to ISO 15924.
54
55							The facilities of this module are exposed through a simple interface in the
56							form of instances of the L class. A number of configuration
57							options can be passed to it during initialization.
58
59							Using the module is as simple as creating a C instance and
60							calling its methods L or L with appropriate string
61							arguments.
62
63							my $d = Lingua::Deva->new();
64							say $d->to_latin('कामसूत्र');
65							say $d->to_deva('Kāmasūtra');
66
67							By default, transliteration follows the widely used IAST conventions. Three
68							other ready-made transliteration schemes are also included with this module,
69							ISO 15919 (C), Harvard-Kyoto (C), and ITRANS.
70
71							my $d = Lingua::Deva->new(map => 'HK');
72							say $d->to_latin('कामसूत्र'); # prints 'kAmasUtra'
73
74							For additional flexibility all mappings can be completely customized; users
75							can also provide their own.
76
77							use Lingua::Deva::Maps::ISO15919;
78							my %f = %Lingua::Deva::Maps::ISO15919::Finals;
79							my $d = Lingua::Deva->new(
80							map => 'IAST', # use IAST transliteration
81							casesensitive => 1, # do not case fold
82							F => \%f, # ISO 15919 mappings for finals
83							);
84							say $d->to_deva('Vṛtraṁ'); # prints 'Vऋत्रं'
85
86							For more information on customization see L.
87
88							Behind the scenes, all translation is done via an intermediate object
89							representation called "Aksara" (Sanskrit I). These objects are
90							instances of L, which provides an interface to inspect
91							and manipulate individual Aksaras.
92
93							# Create an array of Aksaras
94							my $a = $d->l_to_aksaras('Kāmasūtra');
95
96							# Print vowel in the fourth Aksara
97							say $a->[3]->vowel();
98
99							The methods and options of C are described below.
100
101							=head2 Methods
102
103							=over 4
104
105							=item new()
106
107							Constructor. Takes the following optional arguments.
108
109							=over 4
110
111							=item C<< map => 'IAST'\|'ISO15919'\|'HK'\|'ITRANS' >>
112
113							Selects one of the ready-made transliteration schemes.
114
115							=item C<< casesensitive => (0\|1) >>
116
117							Determines whether case is treated as distinctive or not. Some schemes (eg.
118							Harvard-Kyoto) set this to C<1> while others (eg. IAST) set it to C<0>.
119
120							Default is C<0>.
121
122							=item C<< strict => (0\|1) >>
123
124							In I mode invalid input is flagged with warnings. Invalid means
125							either not a Devanagari token (eg. I) or structurally ill-formed (eg. a
126							Devanagari diacritic vowel following an independent vowel).
127
128							Default is C<0>.
129
130							=item C<< allow => [ ... ] >>
131
132							In strict mode, the C array can be used to exempt certain characters
133							from being flagged as invalid even though they normally would be.
134
135							=item C<< avagraha => "'" >>
136
137							Specifies the Latin character used for the transcription of I (ऽ).
138
139							Default is C<"'"> (apostrophe).
140
141							=item C<< C => { consonants map } >>
142
143							=item C<< V => { independent vowels map } >>
144
145							=item C<< D => { diacritic vowels map } >>
146
147							=item C<< F => { finals map } >>
148
149							Transliteration maps in the direction from Latin to Devanagari script.
150
151							=item C<< DC => { consonants map } >>
152
153							=item C<< DV => { independent vowels map } >>
154
155							=item C<< DD => { diacritic vowels map } >>
156
157							=item C<< DF => { finals map } >>
158
159							Transliteration maps in the direction from Devanagari to Latin script. When
160							these are not given, reversed versions of the Latin to Devanagari maps are
161							used.
162
163							The default maps are in L. To customize, make a copy of
164							an existing mapping hash (or create your own) and pass it to one of these
165							parameters.
166
167							=back
168
169							=cut
170
171							sub new {
172	16			16	1	3768	my ($class, %opts) = @_;
173
174	16					202	my $self = {
175							casesensitive => 0,
176							strict => 0,
177							allow => [], # converted to a hash for efficiency
178							avagraha => "'",
179							C => \%Consonants,
180							V => \%Vowels,
181							D => \%Diacritics,
182							F => \%Finals,
183							%opts,
184							};
185
186							# Transliteration scheme setup
187	16	100				80	if (defined $self->{map}) {
188	4	100				27	if ($self->{map} =~ /^(ISO15919\|ITRANS\|IAST\|HK)$/) {
189	6			6		45	no strict 'refs';
	6					12
	6					8115
190	3					12	my $pkg = "Lingua::Deva::Maps::$1";
191	3					321	eval "require $pkg";
192	3					29	for (qw(Consonants Vowels Diacritics Finals)) {
193	12					19	my $k = substr $_, 0, 1;
194	12	100				27	$self->{$k} = do { my %c = %{"${pkg}::$_"}; \%c } unless defined $opts{$k};
	10					12
	10					132
	10					39
195							}
196	3	100	66			16	if (!defined $opts{casesensitive} and defined ${"${pkg}::CASE"}) {
	2					15
197	2					3	$self->{casesensitive} = ${"${pkg}::CASE"};
	2					10
198							}
199							}
200							else {
201	1					205	carp("Invalid transliteration map, using default");
202							}
203							}
204
205							# By default use reversed maps for the opposite direction (DC DV DD DF)
206	16					81	for (qw( C V D F )) {
207	64	100				182	$self->{"D$_"} = do { my %m = reverse %{$self->{$_}}; \%m } if !defined $self->{"D$_"};
	63					78
	63					1183
	63					260
208							}
209
210							# Make the inherent vowel translate to '' in the D map
211	16					55	$self->{D}->{$Inherent} = '';
212
213							# Convert the 'allow' array to a hash for fast lookup
214	16					25	my %allow = map { $_ => 1 } @{ $self->{allow} };
	5					17
	16					44
215	16					36	$self->{allow} = \%allow;
216
217							# Make consonants, vowels, and finals available as tokens
218	16					30	my %tokens = (%{ $self->{C} }, %{ $self->{V} }, %{ $self->{F} });
	16					98
	16					65
	16					1312
219	16					244	$self->{T} = \%tokens;
220
221	16					99	return bless $self, $class;
222							}
223
224							=item l_to_tokens()
225
226							Converts a string of Latin characters into tokens and returns a reference to
227							an array of tokens. A "token" is either a character sequence which may
228							constitute a single Devanagari grapheme or a single non-Devanagari character.
229							In the first sense, a token is simply any key in the transliteration maps.
230
231							my $t = $d->l_to_tokens("Bhārata\n");
232							# $t now refers to the array ['Bh','ā','r','a','t','a',"\n"]
233
234							The input string is normalized with
235							L. No chomping takes place.
236							Upper case and lower case distinctions are preserved.
237
238							=cut
239
240							sub l_to_tokens {
241	20016			20016	1	71452	my ($self, $text) = @_;
242	20016	100				49883	return unless defined $text;
243
244	20015					248435	my $nfdtext = NFD($text);
245
246	20015					31006	my $re = join '\|', reverse sort { length $a <=> length $b } keys %{$self->{T}};
	4103201					5119049
	20015					227972
247	20015	100				4731812	my $reobject = $self->{casesensitive} ? qr/($re\|.)/s : qr/($re\|.)/is;
248
249	20015					76983	my @tokens;
250	20015					139969	while ($nfdtext =~ /$reobject/gc) {
251	1244245					15230580	push @tokens, $1;
252							}
253
254	20015					133648	return \@tokens;
255							}
256
257							=item l_to_aksaras()
258
259							Converts a Latin string (or a reference to an array of tokens) into
260							L and returns a reference to an array of
261							Aksaras.
262
263							my $a = $d->l_to_aksaras('hyaḥ');
264							is( ref($a->[0]), 'Lingua::Deva::Aksara', 'one aksara object' );
265							done_testing();
266
267							Input tokens which can not be part of an Aksara pass through untouched. Thus,
268							the resulting array can contain both C objects and
269							separate tokens.
270
271							In I mode warnings for invalid tokens are output.
272
273							=cut
274
275							sub l_to_aksaras {
276	15014			15014	1	85512	my ($self, $input) = @_;
277
278							# Input can be either a string (scalar) or an array reference
279	15014	100				59226	my $tokens = ref($input) eq '' ? $self->l_to_tokens($input) : $input;
280
281	15014					30286	my @aksaras;
282							my $a;
283	15014					30115	my $state = 0;
284	15014					45633	my ($C, $V, $F) = ($self->{C}, $self->{V}, $self->{F});
285
286							# Aksarization is implemented with a state machine.
287							# State 0: Not currently constructing an aksara, ready for any input
288							# State 1: Constructing consonantal onset
289							# State 2: Onset and vowel read, ready for final or end of aksara
290
291	15014					27810	for my $t (@$tokens) {
292	933222	100				2379750	my $lct = $self->{casesensitive} ? $t : lc $t;
293	933222	100				2343663	if ($state == 0) {
		100
		50
294	189053	100				487581	if (exists $C->{$lct}) { # consonant: new aksara
		100
295	114013					552203	$a = Lingua::Deva::Aksara->new( onset => [ $lct ] );
296	114013					238844	$state = 1;
297							}
298							elsif (exists $V->{$lct}) { # vowel: vowel-initial aksara
299	27013					95121	$a = Lingua::Deva::Aksara->new( vowel => $lct );
300	27013					65529	$state = 2;
301							}
302							else { # final/space/avagraha/other
303	48027	100	100			307835	if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
			100
			100
304							and $self->{strict} and !exists $self->{allow}->{$t}) {
305	1004					144415	carp("Invalid token $t read");
306							}
307	48027					246876	push @aksaras, $t;
308							}
309							}
310							elsif ($state == 1) {
311	414094	100				1200858	if (exists $C->{$lct}) { # consonant: part of onset
		100
312	84028					98272	push @{ $a->onset() }, $lct;
	84028					246849
313							}
314							elsif (exists $V->{$lct}) { # vowel: vowel nucleus
315	309066					886958	$a->vowel( $lct );
316	309066					576158	$state = 2;
317							}
318							else { # final/space/avagraha/other
319	21000	0	33			87136	if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
			33
			0
320							and $self->{strict} and !exists $self->{allow}->{$t}) {
321	0					0	carp("Invalid token $t read");
322							}
323	21000					56403	push @aksaras, $a, $t;
324	21000					36881	$state = 0;
325							}
326							}
327							elsif ($state == 2) {
328	330075	100				967348	if (exists $C->{$lct}) { # consonant: new aksara
		50
		100
329	216054					573740	push @aksaras, $a;
330	216054					3295907	$a = Lingua::Deva::Aksara->new( onset => [ $lct ] );
331	216054					466792	$state = 1;
332							}
333							elsif (exists $V->{$lct}) { # vowel: new vowel-initial aksara
334	0					0	push @aksaras, $a;
335	0					0	$a = Lingua::Deva::Aksara->new( vowel => $lct );
336	0					0	$state = 2;
337							}
338							elsif (exists $F->{$lct}) { # final: end of aksara
339	33007					98027	$a->final( $lct );
340	33007					55366	push @aksaras, $a;
341	33007					59856	$state = 0;
342							}
343							else { # space/avagraha/other
344	81014	100	66			368750	if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
			100
			66
345							and $self->{strict} and !exists $self->{allow}->{$t}) {
346	2000					231307	carp("Invalid token $t read");
347							}
348	81014					341228	push @aksaras, $a, $t;
349	81014					143398	$state = 0;
350							}
351							}
352							}
353
354							# Finish aksara currently under construction
355	15014	100	100			88748	push @aksaras, $a if $state == 1 or $state == 2;
356
357	15014					375844	return \@aksaras;
358							}
359
360							*l_to_aksara = \&l_to_aksaras; # alias
361
362							=item d_to_aksaras()
363
364							Converts a Devanagari string into L and returns
365							a reference to an array of Aksaras.
366
367							my $aksaras = $d->d_to_aksaras('बुद्धः');
368							my $onset = $aksaras->[1]->onset();
369							is_deeply( $onset, ['d', 'dh'], 'onset of second aksara' );
370							done_testing();
371
372							Input tokens which can not be part of an Aksara pass through untouched. Thus,
373							the resulting array can contain both C objects and
374							separate tokens.
375
376							In I mode warnings for invalid tokens are output.
377
378							=cut
379
380							sub d_to_aksaras {
381	5006			5006	1	24579	my ($self, $input) = @_;
382
383	5006					81859	my @chars = split //, $input;
384	5006					16051	my @aksaras;
385							my $a;
386	5006					22898	my $state = 0;
387	5006					14480	my ($DC, $DV, $DD, $DF) = ( $self->{DC}, $self->{DV},
388							$self->{DD}, $self->{DF} );
389
390							# Aksarization is implemented with a state machine.
391							# State 0: Not currently constructing an aksara, ready for any input
392							# State 1: Consonant with inherent vowel, ready for vowel, Virama, final
393							# State 2: Virama read, ready for consonant or end of aksara
394							# State 3: Vowel read, ready for final or end of aksara
395							# The inherent vowel needs to be taken into account specially
396
397	5006					7715	for my $c (@chars) {
398	310113	100				783902	if ($state == 0) {
		100
		100
		50
399	63028	100				164508	if (exists $DC->{$c}) { # consonant: new aksara
		100
		100
400	38006					158510	$a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
401	38006					63737	$state = 1;
402							}
403							elsif (exists $DV->{$c}) { # vowel: vowel-initial aksara
404	9007					30528	$a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
405	9007					15130	$state = 3;
406							}
407							elsif ($c =~ /$Avagraha/) { # Avagraha
408	1000					2608	push @aksaras, $self->{avagraha};
409							}
410							else { # final or other: invalid
411	15015	100	100			77501	if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
			100
412	1					132	carp("Invalid character $c read");
413							}
414	15015					33369	push @aksaras, $c;
415							}
416							}
417							elsif ($state == 1) {
418	138050	100				513768	if ($c =~ /$Virama/) { # Virama: consonant-final
		100
		50
		100
		100
		50
419	35013					53157	$state = 2;
420							}
421							elsif (exists $DD->{$c}) { # diacritic: vowel nucleus
422	67016					213527	$a->vowel( $DD->{$c} );
423	67016					107454	$state = 3;
424							}
425							elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
426	0					0	$a->vowel( $Inherent );
427	0					0	push @aksaras, $a;
428	0					0	$a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
429	0					0	$state = 3;
430							}
431							elsif (exists $DC->{$c}) { # consonant: new aksara
432	20012					55870	$a->vowel( $Inherent );
433	20012					28520	push @aksaras, $a;
434	20012					78767	$a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
435							}
436							elsif (exists $DF->{$c}) { # final: end of aksara
437	7004					17516	$a->vowel( $Inherent );
438	7004					19290	$a->final( $DF->{$c} );
439	7004					9412	push @aksaras, $a;
440	7004					10715	$state = 0;
441							}
442							elsif ($c =~ /$Avagraha/) { # Avagraha
443	0					0	$a->vowel( $Inherent );
444	0					0	push @aksaras, $a, $self->{avagraha};
445	0					0	$state = 0;
446							}
447							else { # other: invalid
448	9005					24371	$a->vowel( $Inherent );
449	9005	50	66			32938	if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
			33
450	0					0	carp("Invalid character $c read");
451							}
452	9005					16797	push @aksaras, $a, $c;
453	9005					12957	$state = 0;
454							}
455							}
456							elsif ($state == 2) {
457	35013	100				99425	if (exists $DC->{$c}) { # consonant: cluster
		50
		50
458	28013					28111	push @{ $a->onset() }, $DC->{$c};
	28013					75180
459	28013					45484	$state = 1;
460							}
461							elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
462	0					0	push @aksaras, $a;
463	0					0	$a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
464	0					0	$state = 3;
465							}
466							elsif ($c =~ /$Avagraha/) { # Avagraha
467	0					0	push @aksaras, $a, $self->{avagraha};
468	0					0	$state = 0;
469							}
470							else { # other: invalid
471	7000	0	33			64353	if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
			33
472	0					0	carp("Invalid character $c read");
473							}
474	7000					11510	push @aksaras, $a, $c;
475	7000					10183	$state = 0;
476							}
477							}
478							elsif ($state == 3) { # final: end of aksara
479	74022	100				246388	if (exists $DF->{$c}) {
		100
		50
		50
480	4000					15585	$a->final( $DF->{$c} );
481	4000					5427	push @aksaras, $a;
482	4000					6091	$state = 0;
483							}
484							elsif (exists $DC->{$c}) { # consonant: new aksara
485	52019					84923	push @aksaras, $a;
486	52019					225735	$a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
487	52019					86310	$state = 1;
488							}
489							elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
490	0					0	push @aksaras, $a;
491	0					0	$a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
492	0					0	$state = 3;
493							}
494							elsif ($c =~ /$Avagraha/) { # Avagraha
495	0					0	push @aksaras, $a, $self->{avagraha};
496	0					0	$state = 0;
497							}
498							else { # other: invalid
499	18003	50	66			68180	if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
			33
500	0					0	carp("Invalid character $c read");
501							}
502	18003					30860	push @aksaras, $a, $c;
503	18003					26456	$state = 0;
504							}
505							}
506							}
507
508							# Finish aksara currently under construction
509	5006					8446	given ($state) {
510	5006					7636	when (1) { $a->vowel( $Inherent ); continue }
	0					0
	0					0
511	5006					15064	when ([1..3]) { push @aksaras, $a }
	2001					4140
512							}
513
514	5006					110962	return \@aksaras;
515							}
516
517							*d_to_aksara = \&d_to_aksaras; # alias
518
519							=item to_deva()
520
521							Converts a Latin string (or a reference to an array of
522							L) into Devanagari and returns a Devanagari
523							string.
524
525							say $d->to_deva('Kāmasūtra');
526
527							# same as
528							my $a = $d->l_to_aksaras('Kāmasūtra');
529							say $d->to_deva($a);
530
531							Aksaras are assumed to be well-formed.
532
533							=cut
534
535							sub to_deva {
536	6			6	1	208	my ($self, $input) = @_;
537
538							# Input can be either a string (scalar) or an array reference
539	6	100				38	my $aksaras = ref($input) eq '' ? $self->l_to_aksaras($input) : $input;
540
541	6					14	my $s = '';
542	6					29	my ($C, $V, $D, $F) = ($self->{C}, $self->{V}, $self->{D}, $self->{F});
543
544	6					18	for my $a (@$aksaras) {
545	48	100				111	if (ref($a) ne 'Lingua::Deva::Aksara') {
546	13	50				46	$s .= $a eq $self->{avagraha} ? $Avagraha : $a;
547							}
548							else {
549	35	100				95	if (defined $a->{onset}) {
		50
550	30					38	$s .= join($Virama, map { $C->{$_} } @{ $a->onset() });
	41					149
	30					172
551	30	50				106	$s .= defined $a->vowel() ? $D->{$a->vowel()} : $Virama;
552							}
553							elsif (defined $a->vowel()) {
554	5					17	$s .= $V->{$a->vowel()};
555							}
556	35	100				156	$s .= $F->{$a->final()} if defined $a->final();
557							}
558							}
559
560	6					104	return $s;
561							}
562
563							=item to_latin()
564
565							Converts a Devanagari string (or a reference to an array of
566							L) into Latin transliteration and returns a
567							Latin string.
568
569							Aksaras are assumed to be well-formed.
570
571							=cut
572
573							sub to_latin {
574	4			4	1	987	my ($self, $input) = @_;
575
576							# Input can be either a string (scalar) or an array reference
577	4	100				29	my $aksaras = ref($input) eq '' ? $self->d_to_aksaras($input) : $input;
578
579	4					8	my $s = '';
580	4					11	for my $a (@$aksaras) {
581	33	100				69	if (ref($a) eq 'Lingua::Deva::Aksara') {
582	24	100				67	$s .= join '', @{ $a->onset() } if defined $a->onset();
	21					51
583	24	50				115	$s .= $a->vowel() if defined $a->vowel();
584	24	100				88	$s .= $a->final() if defined $a->final();
585							}
586							else {
587	9					16	$s .= $a;
588							}
589							}
590
591	4					404	return $s;
592							}
593
594							=back
595
596							=cut
597
598							1;
599							__END__