File Coverage

blib/lib/Lingua/Translit.pm
Criterion Covered Total %
statement 81 83 97.5
branch 42 50 84.0
condition 12 18 66.6
subroutine 13 13 100.0
pod 6 6 100.0
total 154 170 90.5


line stmt bran cond sub pod time code
1             package Lingua::Translit;
2              
3             #
4             # Copyright (C) 2007-2008 ...
5             # Alex Linke
6             # Rona Linke
7             # Copyright (C) 2009-2016 Lingua-Systems Software GmbH
8             # Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
9             # Copyright (C) 2017-2022 Netzum Sorglos Software GmbH
10             #
11              
12 26     26   1533049 use strict;
  26         303  
  26         699  
13 26     26   158 use warnings;
  26         48  
  26         967  
14              
15             require 5.008;
16              
17 26     26   132 use Carp qw/croak/;
  26         44  
  26         1530  
18 26     26   9575 use Encode qw/encode decode/;
  26         184485  
  26         1762  
19              
20 26     26   20305 use Lingua::Translit::Tables;
  26         1233  
  26         24116  
21              
22             our $VERSION = '0.29';
23              
24             =pod
25              
26             =encoding utf8
27              
28             =head1 NAME
29              
30             Lingua::Translit - transliterates text between writing systems
31              
32             =head1 SYNOPSIS
33              
34             use Lingua::Translit;
35              
36             my $tr = new Lingua::Translit("ISO 843");
37              
38             my $text_tr = $tr->translit("character oriented string");
39              
40             if ($tr->can_reverse()) {
41             $text_tr = $tr->translit_reverse("character oriented string");
42             }
43              
44             =head1 DESCRIPTION
45              
46             Lingua::Translit can be used to convert text from one writing system to
47             another, based on national or international transliteration tables.
48             Where possible a reverse transliteration is supported.
49              
50             The term C describes the conversion of text from one
51             writing system or alphabet to another one.
52             The conversion is ideally unique, mapping one character to exactly one
53             character, so the original spelling can be reconstructed.
54             Practically this is not always the case and one single letter of the
55             original alphabet can be transcribed as two, three or even more letters.
56              
57             Furthermore there is more than one transliteration scheme for one writing
58             system.
59             Therefore it is an important and necessary information, which scheme will be
60             or has been used to transliterate a text, to work integrative and be able to
61             reconstruct the original data.
62              
63             Reconstruction is a problem though for non-unique transliterations, if no
64             language specific knowledge is available as the resulting clusters of
65             letters may be ambiguous.
66             For example, the Greek character "PSI" maps to "ps", but "ps" could also
67             result from the sequence "PI", "SIGMA" since "PI" maps to "p" and "SIGMA"
68             maps to s.
69             If a transliteration table leads to ambiguous conversions, the provided
70             table cannot be used reverse.
71              
72             Otherwise the table can be used in both directions, if appreciated.
73             So if ISO 9 is originally created to convert Cyrillic letters to
74             the Latin alphabet, the reverse transliteration will transform Latin
75             letters to Cyrillic.
76              
77             =head1 METHODS
78              
79             =head2 new(I<"name of table">)
80              
81             Initializes an object with the specific transliteration table, e.g. "ISO 9".
82              
83             =cut
84              
85             sub new {
86 41     41 1 10437 my $class = shift();
87 41         81 my $name = shift();
88              
89 41         69 my $self;
90              
91             # Assure that a table name was set
92 41 100       258 croak("No transliteration name given.") unless $name;
93              
94             # Stay compatible with programs that use Lingua::Translit < 0.05
95 40 100       211 if ( $name =~ /^DIN 5008$/i ) {
96 1         2 $name = "Common DEU";
97             }
98              
99 40         167 my $table = Lingua::Translit::Tables::_get_table_reference($name);
100              
101             # Check that a table reference was assigned to the object
102 40 50       144 croak("No table found for $name.") unless $table;
103              
104             # Assure the table's data is complete
105 40 50       128 croak("$name table: missing 'name'") unless defined $table->{name};
106 40 50       118 croak("$name table: missing 'desc'") unless defined $table->{desc};
107 40 50       118 croak("$name table: missing 'reverse'") unless defined $table->{reverse};
108 40 50       117 croak("$name table: missing 'rules'") unless defined $table->{rules};
109              
110             # Copy over the table's data
111 40         124 $self->{name} = $table->{name};
112 40         89 $self->{desc} = $table->{desc};
113 40         96 $self->{rules} = $table->{rules};
114              
115             # Set a truth value of the transliteration's reversibility according to
116             # the natural language string in the original transliteration table
117 40 100       203 $self->{reverse} = ( $table->{reverse} =~ /^true$/i ) ? 1 : 0;
118              
119 40         82 undef($table);
120              
121 40         152 return bless $self, $class;
122             }
123              
124             =head2 translit(I<"character oriented string">)
125              
126             Transliterates the given text according to the object's transliteration
127             table.
128             Returns the transliterated text.
129              
130             =cut
131              
132             sub translit {
133 160     160 1 26895 my $self = shift();
134 160         264 my $text = shift();
135              
136             # Return if no input was given
137 160 100       358 return undef if ! defined $text;
138 158 100       349 return '' if $text eq '';
139              
140 157         429 my $utf8_flag_on = Encode::is_utf8($text);
141              
142 157 100       305 unless ($utf8_flag_on) {
143 63         205 $text = decode( "UTF-8", $text );
144             }
145              
146 157         5530 foreach my $rule ( @{ $self->{rules} } ) {
  157         383  
147 14620 100       419783 if ( defined $rule->{context} ) {
148 541         761 my $c = $rule->{context};
149              
150             # single context rules
151 541 100 100     2393 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
152 131         1877 $text =~ s/$rule->{from}(?=$c->{before})/$rule->{to}/g;
153             }
154             elsif ( defined $c->{after} && !defined $c->{before} ) {
155 4     4   28 $text =~ s/(?<=$c->{after})$rule->{from}/$rule->{to}/g;
  4         7  
  4         47  
  227         3127  
156             }
157              
158             # double context rules: logical "inbetween"
159             elsif ( defined $c->{before} && defined $c->{after} ) {
160 183     8   2579 $text =~ s/
  8         49  
  8         15  
  8         94  
161             (?<=$c->{after})$rule->{from}(?=$c->{before})
162             /$rule->{to}/gx;
163             }
164              
165             else {
166 0         0 croak("incomplete rule context");
167             }
168             }
169             else {
170 14079         183905 $text =~ s/$rule->{from}/$rule->{to}/g;
171             }
172             }
173              
174 157 100       361 unless ($utf8_flag_on) {
175 63         261 return encode( "UTF-8", $text );
176             }
177             else {
178 94         335 return $text;
179             }
180             }
181              
182             =head2 translit_reverse(I<"character oriented string">)
183              
184             Transliterates the given text according to the object's transliteration
185             table, but uses it the other way round. For example table ISO 9 is a
186             transliteration scheme for the conversion of Cyrillic letters to the Latin
187             alphabet. So if used reverse, Latin letters will be mapped to Cyrillic ones.
188              
189             Returns the transliterated text.
190              
191             =cut
192              
193             sub translit_reverse {
194 25     25 1 15099 my $self = shift();
195 25         55 my $text = shift();
196              
197             # Return if no input was given
198 25 100       124 return undef if ! defined $text;
199 23 100       80 return '' if $text eq '';
200              
201             # Is this transliteration reversible?
202 22 50       93 croak("$self->{name} cannot be reversed") unless $self->{reverse};
203              
204 22         89 my $utf8_flag_on = Encode::is_utf8($text);
205              
206 22 100       83 unless ($utf8_flag_on) {
207 19         69 $text = decode( "UTF-8", $text );
208             }
209              
210 22         951 foreach my $rule ( @{ $self->{rules} } ) {
  22         67  
211 1815 100       41040 if ( defined $rule->{context} ) {
212 226         292 my $c = $rule->{context};
213              
214             # single context rules
215 226 100 100     1211 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
216 78         1129 $text =~ s/$rule->{to}(?=$c->{before})/$rule->{from}/g;
217             }
218             elsif ( defined $c->{after} && !defined $c->{before} ) {
219 23         252 $text =~ s/(?<=$c->{after})$rule->{to}/$rule->{from}/g;
220             }
221              
222             # double context rules: logical "inbetween"
223             elsif ( defined $c->{before} && defined $c->{after} ) {
224 125         1696 $text =~ s/
225             (?<=$c->{after})$rule->{to}(?=$c->{before})
226             /$rule->{from}/gx;
227             }
228              
229             else {
230 0         0 croak("incomplete rule context");
231             }
232             }
233             else {
234 1589         10374 $text =~ s/$rule->{to}/$rule->{from}/g;
235             }
236             }
237              
238 22 100       66 unless ($utf8_flag_on) {
239 19         88 return encode( "UTF-8", $text );
240             }
241             else {
242 3         15 return $text;
243             }
244             }
245              
246             =head2 can_reverse()
247              
248             Returns true (1), iff reverse transliteration is possible.
249             False (0) otherwise.
250              
251             =cut
252              
253             sub can_reverse {
254 23     23 1 840 return $_[0]->{reverse};
255             }
256              
257             =head2 name()
258              
259             Returns the name of the chosen transliteration table, e.g. "ISO 9".
260              
261             =cut
262              
263             sub name {
264 12     12 1 5152 return $_[0]->{name};
265             }
266              
267             =head2 desc()
268              
269             Returns a description for the transliteration,
270             e.g. "ISO 9:1995, Cyrillic to Latin".
271              
272             =cut
273              
274             sub desc {
275 12     12 1 46 return $_[0]->{desc};
276             }
277              
278             =head1 SUPPORTED TRANSLITERATIONS
279              
280             =over 4
281              
282             =item Cyrillic
283              
284             I, not reversible, ALA-LC:1997, Cyrillic to Latin, Russian
285              
286             I, reversible, ISO 9:1995, Cyrillic to Latin
287              
288             I, reversible, ISO 9:1954, Cyrillic to Latin
289              
290             I, reversible, DIN 1460:1982, Cyrillic to Latin, Russian
291              
292             I, reversible, DIN 1460:1982, Cyrillic to Latin, Ukrainian
293              
294             I, reversible, DIN 1460:1982, Cyrillic to Latin, Bulgarian
295              
296             I, not reversible, The Streamlined System: 2006,
297             Cyrillic to Latin, Bulgarian
298              
299             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
300             Russian
301              
302             I, not reversible, GOST 7.79:2000 (table B), Cyrillic to
303             Latin with support for Old Russian (pre 1918), Russian
304              
305             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
306             Ukrainian
307              
308             I, not reversible, BGN/PCGN:1947 (Standard Variant),
309             Cyrillic to Latin, Russian
310              
311             I, not reversible, BGN/PCGN:1947 (Strict Variant),
312             Cyrillic to Latin, Russian
313              
314             =item Greek
315              
316             I, not reversible, ISO 843:1997, Greek to Latin
317              
318             I, not reversible, DIN 31634:1982, Greek to Latin
319              
320             I, not reversible, Greeklish (Phonetic), Greek to Latin
321              
322             =item Latin
323              
324             I, not reversible, Czech without diacritics
325              
326             I, not reversible, German without umlauts
327              
328             I, not reversible, Unaccented Polish
329              
330             I, not reversible, Romanian without diacritics as commonly used
331              
332             I, not reversible, Slovak without diacritics
333              
334             I, not reversible, Slovenian without diacritics
335              
336             I, reversible, Romanian with appropriate diacritics
337              
338             =item Arabic
339              
340             I, not reversible, Common Romanization of Arabic
341              
342             =item Sanskrit
343              
344             I, not reversible, IAST Romanization to Devanāgarī
345              
346             I, not reversible, Devanāgarī to IAST Romanization
347              
348             =back
349              
350             =head1 ADDING NEW TRANSLITERATIONS
351              
352             In case you want to add your own transliteration tables to
353             L, have a look at the developer documentation at
354             L.
355              
356             A template of a transliteration table is provided as well
357             (F) so you can easily start developing.
358              
359              
360             =head1 RESTRICTIONS
361              
362             L is suited to handle B and utilizes comparisons
363             and regular expressions that rely on B.
364             Therefore, any input is supposed to be B
365             (C, ...) instead of byte oriented.
366              
367             However, if your data is byte oriented, be sure to pass it
368             B to translit() and/or translit_reverse() - it will be
369             converted internally.
370              
371             =head1 BUGS
372              
373             None known.
374              
375             Please report bugs using CPAN's request tracker at
376             L.
377              
378             =head1 SEE ALSO
379              
380             L, L, L
381              
382             C's manpage
383              
384             L
385              
386             =head1 CREDITS
387              
388             Thanks to Dr. Daniel Eiwen, Romanisches Seminar, Universitaet Koeln for his
389             help on Romanian transliteration.
390              
391             Thanks to Dmitry Smal and Rusar Publishing for contributing the "ALA-LC RUS"
392             transliteration table.
393              
394             Thanks to Ahmed Elsheshtawy for his help implementing the "Common ARA" Arabic
395             transliteration.
396              
397             Thanks to Dusan Vuckovic for contributing the "ISO/R 9" transliteration table.
398              
399             Thanks to Ștefan Suciu for contributing the "ISO 8859-16 RON" transliteration
400             table.
401              
402             Thanks to Philip Kime for contributing the "IAST Devanagari" and "Devanagari
403             IAST" transliteration tables.
404              
405             Thanks to Nikola Lečić for contributing the "BGN/PCGN RUS Standard" and
406             "BGN/PCGN RUS Strict" transliteration tables.
407              
408             =head1 AUTHORS
409              
410             Alex Linke
411              
412             Rona Linke
413              
414             =head1 LICENSE AND COPYRIGHT
415              
416             Copyright (C) 2007-2008 Alex Linke and Rona Linke
417              
418             Copyright (C) 2009-2016 Lingua-Systems Software GmbH
419              
420             Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
421              
422             Copyright (C) 2017-2022 Netzum Sorglos Software GmbH
423              
424             This module is free software; you can redistribute it and/or modify it under
425             the same terms as Perl itself.
426              
427             =cut
428              
429             1;
430              
431             # vim: set ft=perl sts=4 sw=4 ts=4 ai et: