File Coverage

blib/lib/Lingua/Deva.pm
Criterion Covered Total %
statement 193 219 88.1
branch 99 118 83.9
condition 37 57 64.9
subroutine 17 17 100.0
pod 6 6 100.0
total 352 417 84.4


line stmt bran cond sub pod time code
1             package Lingua::Deva;
2              
3 6     6   290138 use v5.12.1;
  6         26  
  6         471  
4 6     6   34 use strict;
  6         12  
  6         233  
5 6     6   32 use warnings;
  6         25  
  6         215  
6 6     6   48 use utf8;
  6         10  
  6         63  
7 6     6   18275 use charnames qw( :full );
  6         279430  
  6         65  
8 6     6   10156 use open qw( :encoding(UTF-8) :std );
  6         10626  
  6         58  
9 6     6   134430 use Unicode::Normalize qw( NFD NFC );
  6         15892  
  6         643  
10 6     6   145 use Carp qw( croak carp );
  6         15  
  6         323  
11              
12 6     6   5964 use Lingua::Deva::Aksara;
  6         19  
  6         256  
13 6         3024 use Lingua::Deva::Maps qw( %Consonants %Vowels %Diacritics %Finals
14 6     6   43 $Inherent $Virama $Avagraha );
  6         11  
15              
16             =encoding UTF-8
17              
18             =head1 NAME
19              
20             Lingua::Deva - Convert between Latin and Devanagari Sanskrit text
21              
22             =cut
23              
24             our $VERSION = '1.20';
25              
26             =head1 SYNOPSIS
27              
28             use v5.12.1;
29             use strict;
30             use utf8;
31             use charnames ':full';
32             use Lingua::Deva;
33              
34             # Basic usage
35             my $d = Lingua::Deva->new();
36             say $d->to_latin('आसीद्राजा'); # prints 'āsīdrājā'
37             say $d->to_deva('Nalo nāma'); # prints 'नलो नाम'
38              
39             # With configuration: strict, allow Danda, 'w' for 'v'
40             my %c = %Lingua::Deva::Maps::Consonants;
41             $d = Lingua::Deva->new(
42             strict => 1,
43             allow => [ "\N{DEVANAGARI DANDA}" ],
44             C => do { $c{'w'} = delete $c{'v'}; \%c },
45             );
46             say $d->to_deva('ziwāya'); # 'zइवाय', warning for 'z'
47             say $d->to_latin('सर्वम्।'); # 'sarwam।', no warnings
48              
49             =head1 DESCRIPTION
50              
51             The C module provides facilities for converting Sanskrit in
52             various Latin transliterations to Devanagari and vice-versa. "Deva" is the
53             name for the Devanagari (I) script according to ISO 15924.
54              
55             The facilities of this module are exposed through a simple interface in the
56             form of instances of the L class. A number of configuration
57             options can be passed to it during initialization.
58              
59             Using the module is as simple as creating a C instance and
60             calling its methods L or L with appropriate string
61             arguments.
62              
63             my $d = Lingua::Deva->new();
64             say $d->to_latin('कामसूत्र');
65             say $d->to_deva('Kāmasūtra');
66              
67             By default, transliteration follows the widely used IAST conventions. Three
68             other ready-made transliteration schemes are also included with this module,
69             ISO 15919 (C), Harvard-Kyoto (C), and ITRANS.
70              
71             my $d = Lingua::Deva->new(map => 'HK');
72             say $d->to_latin('कामसूत्र'); # prints 'kAmasUtra'
73              
74             For additional flexibility all mappings can be completely customized; users
75             can also provide their own.
76              
77             use Lingua::Deva::Maps::ISO15919;
78             my %f = %Lingua::Deva::Maps::ISO15919::Finals;
79             my $d = Lingua::Deva->new(
80             map => 'IAST', # use IAST transliteration
81             casesensitive => 1, # do not case fold
82             F => \%f, # ISO 15919 mappings for finals
83             );
84             say $d->to_deva('Vṛtraṁ'); # prints 'Vऋत्रं'
85              
86             For more information on customization see L.
87              
88             Behind the scenes, all translation is done via an intermediate object
89             representation called "Aksara" (Sanskrit I). These objects are
90             instances of L, which provides an interface to inspect
91             and manipulate individual Aksaras.
92              
93             # Create an array of Aksaras
94             my $a = $d->l_to_aksaras('Kāmasūtra');
95              
96             # Print vowel in the fourth Aksara
97             say $a->[3]->vowel();
98              
99             The methods and options of C are described below.
100              
101             =head2 Methods
102              
103             =over 4
104              
105             =item new()
106              
107             Constructor. Takes the following optional arguments.
108              
109             =over 4
110              
111             =item C<< map => 'IAST'|'ISO15919'|'HK'|'ITRANS' >>
112              
113             Selects one of the ready-made transliteration schemes.
114              
115             =item C<< casesensitive => (0|1) >>
116              
117             Determines whether case is treated as distinctive or not. Some schemes (eg.
118             Harvard-Kyoto) set this to C<1> while others (eg. IAST) set it to C<0>.
119              
120             Default is C<0>.
121              
122             =item C<< strict => (0|1) >>
123              
124             In I mode invalid input is flagged with warnings. Invalid means
125             either not a Devanagari token (eg. I) or structurally ill-formed (eg. a
126             Devanagari diacritic vowel following an independent vowel).
127              
128             Default is C<0>.
129              
130             =item C<< allow => [ ... ] >>
131              
132             In strict mode, the C array can be used to exempt certain characters
133             from being flagged as invalid even though they normally would be.
134              
135             =item C<< avagraha => "'" >>
136              
137             Specifies the Latin character used for the transcription of I (ऽ).
138              
139             Default is C<"'"> (apostrophe).
140              
141             =item C<< C => { consonants map } >>
142              
143             =item C<< V => { independent vowels map } >>
144              
145             =item C<< D => { diacritic vowels map } >>
146              
147             =item C<< F => { finals map } >>
148              
149             Transliteration maps in the direction from Latin to Devanagari script.
150              
151             =item C<< DC => { consonants map } >>
152              
153             =item C<< DV => { independent vowels map } >>
154              
155             =item C<< DD => { diacritic vowels map } >>
156              
157             =item C<< DF => { finals map } >>
158              
159             Transliteration maps in the direction from Devanagari to Latin script. When
160             these are not given, reversed versions of the Latin to Devanagari maps are
161             used.
162              
163             The default maps are in L. To customize, make a copy of
164             an existing mapping hash (or create your own) and pass it to one of these
165             parameters.
166              
167             =back
168              
169             =cut
170              
171             sub new {
172 16     16 1 3768 my ($class, %opts) = @_;
173              
174 16         202 my $self = {
175             casesensitive => 0,
176             strict => 0,
177             allow => [], # converted to a hash for efficiency
178             avagraha => "'",
179             C => \%Consonants,
180             V => \%Vowels,
181             D => \%Diacritics,
182             F => \%Finals,
183             %opts,
184             };
185              
186             # Transliteration scheme setup
187 16 100       80 if (defined $self->{map}) {
188 4 100       27 if ($self->{map} =~ /^(ISO15919|ITRANS|IAST|HK)$/) {
189 6     6   45 no strict 'refs';
  6         12  
  6         8115  
190 3         12 my $pkg = "Lingua::Deva::Maps::$1";
191 3         321 eval "require $pkg";
192 3         29 for (qw(Consonants Vowels Diacritics Finals)) {
193 12         19 my $k = substr $_, 0, 1;
194 12 100       27 $self->{$k} = do { my %c = %{"${pkg}::$_"}; \%c } unless defined $opts{$k};
  10         12  
  10         132  
  10         39  
195             }
196 3 100 66     16 if (!defined $opts{casesensitive} and defined ${"${pkg}::CASE"}) {
  2         15  
197 2         3 $self->{casesensitive} = ${"${pkg}::CASE"};
  2         10  
198             }
199             }
200             else {
201 1         205 carp("Invalid transliteration map, using default");
202             }
203             }
204              
205             # By default use reversed maps for the opposite direction (DC DV DD DF)
206 16         81 for (qw( C V D F )) {
207 64 100       182 $self->{"D$_"} = do { my %m = reverse %{$self->{$_}}; \%m } if !defined $self->{"D$_"};
  63         78  
  63         1183  
  63         260  
208             }
209              
210             # Make the inherent vowel translate to '' in the D map
211 16         55 $self->{D}->{$Inherent} = '';
212              
213             # Convert the 'allow' array to a hash for fast lookup
214 16         25 my %allow = map { $_ => 1 } @{ $self->{allow} };
  5         17  
  16         44  
215 16         36 $self->{allow} = \%allow;
216              
217             # Make consonants, vowels, and finals available as tokens
218 16         30 my %tokens = (%{ $self->{C} }, %{ $self->{V} }, %{ $self->{F} });
  16         98  
  16         65  
  16         1312  
219 16         244 $self->{T} = \%tokens;
220              
221 16         99 return bless $self, $class;
222             }
223              
224             =item l_to_tokens()
225              
226             Converts a string of Latin characters into tokens and returns a reference to
227             an array of tokens. A "token" is either a character sequence which may
228             constitute a single Devanagari grapheme or a single non-Devanagari character.
229             In the first sense, a token is simply any key in the transliteration maps.
230              
231             my $t = $d->l_to_tokens("Bhārata\n");
232             # $t now refers to the array ['Bh','ā','r','a','t','a',"\n"]
233              
234             The input string is normalized with
235             L. No chomping takes place.
236             Upper case and lower case distinctions are preserved.
237              
238             =cut
239              
240             sub l_to_tokens {
241 20016     20016 1 71452 my ($self, $text) = @_;
242 20016 100       49883 return unless defined $text;
243              
244 20015         248435 my $nfdtext = NFD($text);
245              
246 20015         31006 my $re = join '|', reverse sort { length $a <=> length $b } keys %{$self->{T}};
  4103201         5119049  
  20015         227972  
247 20015 100       4731812 my $reobject = $self->{casesensitive} ? qr/($re|.)/s : qr/($re|.)/is;
248              
249 20015         76983 my @tokens;
250 20015         139969 while ($nfdtext =~ /$reobject/gc) {
251 1244245         15230580 push @tokens, $1;
252             }
253              
254 20015         133648 return \@tokens;
255             }
256              
257             =item l_to_aksaras()
258              
259             Converts a Latin string (or a reference to an array of tokens) into
260             L and returns a reference to an array of
261             Aksaras.
262              
263             my $a = $d->l_to_aksaras('hyaḥ');
264             is( ref($a->[0]), 'Lingua::Deva::Aksara', 'one aksara object' );
265             done_testing();
266              
267             Input tokens which can not be part of an Aksara pass through untouched. Thus,
268             the resulting array can contain both C objects and
269             separate tokens.
270              
271             In I mode warnings for invalid tokens are output.
272              
273             =cut
274              
275             sub l_to_aksaras {
276 15014     15014 1 85512 my ($self, $input) = @_;
277              
278             # Input can be either a string (scalar) or an array reference
279 15014 100       59226 my $tokens = ref($input) eq '' ? $self->l_to_tokens($input) : $input;
280              
281 15014         30286 my @aksaras;
282             my $a;
283 15014         30115 my $state = 0;
284 15014         45633 my ($C, $V, $F) = ($self->{C}, $self->{V}, $self->{F});
285              
286             # Aksarization is implemented with a state machine.
287             # State 0: Not currently constructing an aksara, ready for any input
288             # State 1: Constructing consonantal onset
289             # State 2: Onset and vowel read, ready for final or end of aksara
290              
291 15014         27810 for my $t (@$tokens) {
292 933222 100       2379750 my $lct = $self->{casesensitive} ? $t : lc $t;
293 933222 100       2343663 if ($state == 0) {
    100          
    50          
294 189053 100       487581 if (exists $C->{$lct}) { # consonant: new aksara
    100          
295 114013         552203 $a = Lingua::Deva::Aksara->new( onset => [ $lct ] );
296 114013         238844 $state = 1;
297             }
298             elsif (exists $V->{$lct}) { # vowel: vowel-initial aksara
299 27013         95121 $a = Lingua::Deva::Aksara->new( vowel => $lct );
300 27013         65529 $state = 2;
301             }
302             else { # final/space/avagraha/other
303 48027 100 100     307835 if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
      100        
      100        
304             and $self->{strict} and !exists $self->{allow}->{$t}) {
305 1004         144415 carp("Invalid token $t read");
306             }
307 48027         246876 push @aksaras, $t;
308             }
309             }
310             elsif ($state == 1) {
311 414094 100       1200858 if (exists $C->{$lct}) { # consonant: part of onset
    100          
312 84028         98272 push @{ $a->onset() }, $lct;
  84028         246849  
313             }
314             elsif (exists $V->{$lct}) { # vowel: vowel nucleus
315 309066         886958 $a->vowel( $lct );
316 309066         576158 $state = 2;
317             }
318             else { # final/space/avagraha/other
319 21000 0 33     87136 if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
      33        
      0        
320             and $self->{strict} and !exists $self->{allow}->{$t}) {
321 0         0 carp("Invalid token $t read");
322             }
323 21000         56403 push @aksaras, $a, $t;
324 21000         36881 $state = 0;
325             }
326             }
327             elsif ($state == 2) {
328 330075 100       967348 if (exists $C->{$lct}) { # consonant: new aksara
    50          
    100          
329 216054         573740 push @aksaras, $a;
330 216054         3295907 $a = Lingua::Deva::Aksara->new( onset => [ $lct ] );
331 216054         466792 $state = 1;
332             }
333             elsif (exists $V->{$lct}) { # vowel: new vowel-initial aksara
334 0         0 push @aksaras, $a;
335 0         0 $a = Lingua::Deva::Aksara->new( vowel => $lct );
336 0         0 $state = 2;
337             }
338             elsif (exists $F->{$lct}) { # final: end of aksara
339 33007         98027 $a->final( $lct );
340 33007         55366 push @aksaras, $a;
341 33007         59856 $state = 0;
342             }
343             else { # space/avagraha/other
344 81014 100 66     368750 if ($t !~ /\p{Space}/ and $t ne $self->{avagraha}
      100        
      66        
345             and $self->{strict} and !exists $self->{allow}->{$t}) {
346 2000         231307 carp("Invalid token $t read");
347             }
348 81014         341228 push @aksaras, $a, $t;
349 81014         143398 $state = 0;
350             }
351             }
352             }
353              
354             # Finish aksara currently under construction
355 15014 100 100     88748 push @aksaras, $a if $state == 1 or $state == 2;
356              
357 15014         375844 return \@aksaras;
358             }
359              
360             *l_to_aksara = \&l_to_aksaras; # alias
361              
362             =item d_to_aksaras()
363              
364             Converts a Devanagari string into L and returns
365             a reference to an array of Aksaras.
366              
367             my $aksaras = $d->d_to_aksaras('बुद्धः');
368             my $onset = $aksaras->[1]->onset();
369             is_deeply( $onset, ['d', 'dh'], 'onset of second aksara' );
370             done_testing();
371              
372             Input tokens which can not be part of an Aksara pass through untouched. Thus,
373             the resulting array can contain both C objects and
374             separate tokens.
375              
376             In I mode warnings for invalid tokens are output.
377              
378             =cut
379              
380             sub d_to_aksaras {
381 5006     5006 1 24579 my ($self, $input) = @_;
382              
383 5006         81859 my @chars = split //, $input;
384 5006         16051 my @aksaras;
385             my $a;
386 5006         22898 my $state = 0;
387 5006         14480 my ($DC, $DV, $DD, $DF) = ( $self->{DC}, $self->{DV},
388             $self->{DD}, $self->{DF} );
389              
390             # Aksarization is implemented with a state machine.
391             # State 0: Not currently constructing an aksara, ready for any input
392             # State 1: Consonant with inherent vowel, ready for vowel, Virama, final
393             # State 2: Virama read, ready for consonant or end of aksara
394             # State 3: Vowel read, ready for final or end of aksara
395             # The inherent vowel needs to be taken into account specially
396              
397 5006         7715 for my $c (@chars) {
398 310113 100       783902 if ($state == 0) {
    100          
    100          
    50          
399 63028 100       164508 if (exists $DC->{$c}) { # consonant: new aksara
    100          
    100          
400 38006         158510 $a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
401 38006         63737 $state = 1;
402             }
403             elsif (exists $DV->{$c}) { # vowel: vowel-initial aksara
404 9007         30528 $a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
405 9007         15130 $state = 3;
406             }
407             elsif ($c =~ /$Avagraha/) { # Avagraha
408 1000         2608 push @aksaras, $self->{avagraha};
409             }
410             else { # final or other: invalid
411 15015 100 100     77501 if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
      100        
412 1         132 carp("Invalid character $c read");
413             }
414 15015         33369 push @aksaras, $c;
415             }
416             }
417             elsif ($state == 1) {
418 138050 100       513768 if ($c =~ /$Virama/) { # Virama: consonant-final
    100          
    50          
    100          
    100          
    50          
419 35013         53157 $state = 2;
420             }
421             elsif (exists $DD->{$c}) { # diacritic: vowel nucleus
422 67016         213527 $a->vowel( $DD->{$c} );
423 67016         107454 $state = 3;
424             }
425             elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
426 0         0 $a->vowel( $Inherent );
427 0         0 push @aksaras, $a;
428 0         0 $a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
429 0         0 $state = 3;
430             }
431             elsif (exists $DC->{$c}) { # consonant: new aksara
432 20012         55870 $a->vowel( $Inherent );
433 20012         28520 push @aksaras, $a;
434 20012         78767 $a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
435             }
436             elsif (exists $DF->{$c}) { # final: end of aksara
437 7004         17516 $a->vowel( $Inherent );
438 7004         19290 $a->final( $DF->{$c} );
439 7004         9412 push @aksaras, $a;
440 7004         10715 $state = 0;
441             }
442             elsif ($c =~ /$Avagraha/) { # Avagraha
443 0         0 $a->vowel( $Inherent );
444 0         0 push @aksaras, $a, $self->{avagraha};
445 0         0 $state = 0;
446             }
447             else { # other: invalid
448 9005         24371 $a->vowel( $Inherent );
449 9005 50 66     32938 if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
      33        
450 0         0 carp("Invalid character $c read");
451             }
452 9005         16797 push @aksaras, $a, $c;
453 9005         12957 $state = 0;
454             }
455             }
456             elsif ($state == 2) {
457 35013 100       99425 if (exists $DC->{$c}) { # consonant: cluster
    50          
    50          
458 28013         28111 push @{ $a->onset() }, $DC->{$c};
  28013         75180  
459 28013         45484 $state = 1;
460             }
461             elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
462 0         0 push @aksaras, $a;
463 0         0 $a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
464 0         0 $state = 3;
465             }
466             elsif ($c =~ /$Avagraha/) { # Avagraha
467 0         0 push @aksaras, $a, $self->{avagraha};
468 0         0 $state = 0;
469             }
470             else { # other: invalid
471 7000 0 33     64353 if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
      33        
472 0         0 carp("Invalid character $c read");
473             }
474 7000         11510 push @aksaras, $a, $c;
475 7000         10183 $state = 0;
476             }
477             }
478             elsif ($state == 3) { # final: end of aksara
479 74022 100       246388 if (exists $DF->{$c}) {
    100          
    50          
    50          
480 4000         15585 $a->final( $DF->{$c} );
481 4000         5427 push @aksaras, $a;
482 4000         6091 $state = 0;
483             }
484             elsif (exists $DC->{$c}) { # consonant: new aksara
485 52019         84923 push @aksaras, $a;
486 52019         225735 $a = Lingua::Deva::Aksara->new( onset => [ $DC->{$c} ] );
487 52019         86310 $state = 1;
488             }
489             elsif (exists $DV->{$c}) { # vowel: new vowel-initial aksara
490 0         0 push @aksaras, $a;
491 0         0 $a = Lingua::Deva::Aksara->new( vowel => $DV->{$c} );
492 0         0 $state = 3;
493             }
494             elsif ($c =~ /$Avagraha/) { # Avagraha
495 0         0 push @aksaras, $a, $self->{avagraha};
496 0         0 $state = 0;
497             }
498             else { # other: invalid
499 18003 50 66     68180 if ($c !~ /\p{Space}/ and $self->{strict} and !exists $self->{allow}->{$c}) {
      33        
500 0         0 carp("Invalid character $c read");
501             }
502 18003         30860 push @aksaras, $a, $c;
503 18003         26456 $state = 0;
504             }
505             }
506             }
507              
508             # Finish aksara currently under construction
509 5006         8446 given ($state) {
510 5006         7636 when (1) { $a->vowel( $Inherent ); continue }
  0         0  
  0         0  
511 5006         15064 when ([1..3]) { push @aksaras, $a }
  2001         4140  
512             }
513              
514 5006         110962 return \@aksaras;
515             }
516              
517             *d_to_aksara = \&d_to_aksaras; # alias
518              
519             =item to_deva()
520              
521             Converts a Latin string (or a reference to an array of
522             L) into Devanagari and returns a Devanagari
523             string.
524              
525             say $d->to_deva('Kāmasūtra');
526              
527             # same as
528             my $a = $d->l_to_aksaras('Kāmasūtra');
529             say $d->to_deva($a);
530              
531             Aksaras are assumed to be well-formed.
532              
533             =cut
534              
535             sub to_deva {
536 6     6 1 208 my ($self, $input) = @_;
537              
538             # Input can be either a string (scalar) or an array reference
539 6 100       38 my $aksaras = ref($input) eq '' ? $self->l_to_aksaras($input) : $input;
540              
541 6         14 my $s = '';
542 6         29 my ($C, $V, $D, $F) = ($self->{C}, $self->{V}, $self->{D}, $self->{F});
543              
544 6         18 for my $a (@$aksaras) {
545 48 100       111 if (ref($a) ne 'Lingua::Deva::Aksara') {
546 13 50       46 $s .= $a eq $self->{avagraha} ? $Avagraha : $a;
547             }
548             else {
549 35 100       95 if (defined $a->{onset}) {
    50          
550 30         38 $s .= join($Virama, map { $C->{$_} } @{ $a->onset() });
  41         149  
  30         172  
551 30 50       106 $s .= defined $a->vowel() ? $D->{$a->vowel()} : $Virama;
552             }
553             elsif (defined $a->vowel()) {
554 5         17 $s .= $V->{$a->vowel()};
555             }
556 35 100       156 $s .= $F->{$a->final()} if defined $a->final();
557             }
558             }
559              
560 6         104 return $s;
561             }
562              
563             =item to_latin()
564              
565             Converts a Devanagari string (or a reference to an array of
566             L) into Latin transliteration and returns a
567             Latin string.
568              
569             Aksaras are assumed to be well-formed.
570              
571             =cut
572              
573             sub to_latin {
574 4     4 1 987 my ($self, $input) = @_;
575              
576             # Input can be either a string (scalar) or an array reference
577 4 100       29 my $aksaras = ref($input) eq '' ? $self->d_to_aksaras($input) : $input;
578              
579 4         8 my $s = '';
580 4         11 for my $a (@$aksaras) {
581 33 100       69 if (ref($a) eq 'Lingua::Deva::Aksara') {
582 24 100       67 $s .= join '', @{ $a->onset() } if defined $a->onset();
  21         51  
583 24 50       115 $s .= $a->vowel() if defined $a->vowel();
584 24 100       88 $s .= $a->final() if defined $a->final();
585             }
586             else {
587 9         16 $s .= $a;
588             }
589             }
590              
591 4         404 return $s;
592             }
593              
594             =back
595              
596             =cut
597              
598             1;
599             __END__