File Coverage

blib/lib/Lingua/Translit.pm
Criterion Covered Total %
statement 79 81 97.5
branch 36 46 78.2
condition 12 18 66.6
subroutine 13 13 100.0
pod 6 6 100.0
total 146 164 89.0


line stmt bran cond sub pod time code
1             package Lingua::Translit;
2              
3             #
4             # Copyright (C) 2007-2008 ...
5             # Alex Linke
6             # Rona Linke
7             # Copyright (C) 2009-2016 Lingua-Systems Software GmbH
8             # Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
9             #
10              
11 25     25   381243 use strict;
  25         58  
  25         624  
12 25     25   114 use warnings;
  25         49  
  25         786  
13              
14             require 5.008;
15              
16 25     25   124 use Carp qw/croak/;
  25         56  
  25         1407  
17 25     25   9197 use Encode qw/encode decode/;
  25         159680  
  25         4356  
18              
19 25     25   20479 use Lingua::Translit::Tables;
  25         778  
  25         20696  
20              
21             our $VERSION = '0.27';
22              
23             =pod
24              
25             =encoding utf8
26              
27             =head1 NAME
28              
29             Lingua::Translit - transliterates text between writing systems
30              
31             =head1 SYNOPSIS
32              
33             use Lingua::Translit;
34              
35             my $tr = new Lingua::Translit("ISO 843");
36              
37             my $text_tr = $tr->translit("character oriented string");
38              
39             if ($tr->can_reverse()) {
40             $text_tr = $tr->translit_reverse("character oriented string");
41             }
42              
43             =head1 DESCRIPTION
44              
45             Lingua::Translit can be used to convert text from one writing system to
46             another, based on national or international transliteration tables.
47             Where possible a reverse transliteration is supported.
48              
49             The term C describes the conversion of text from one
50             writing system or alphabet to another one.
51             The conversion is ideally unique, mapping one character to exactly one
52             character, so the original spelling can be reconstructed.
53             Practically this is not always the case and one single letter of the
54             original alphabet can be transcribed as two, three or even more letters.
55              
56             Furthermore there is more than one transliteration scheme for one writing
57             system.
58             Therefore it is an important and necessary information, which scheme will be
59             or has been used to transliterate a text, to work integrative and be able to
60             reconstruct the original data.
61              
62             Reconstruction is a problem though for non-unique transliterations, if no
63             language specific knowledge is available as the resulting clusters of
64             letters may be ambiguous.
65             For example, the Greek character "PSI" maps to "ps", but "ps" could also
66             result from the sequence "PI", "SIGMA" since "PI" maps to "p" and "SIGMA"
67             maps to s.
68             If a transliteration table leads to ambiguous conversions, the provided
69             table cannot be used reverse.
70              
71             Otherwise the table can be used in both directions, if appreciated.
72             So if ISO 9 is originally created to convert Cyrillic letters to
73             the Latin alphabet, the reverse transliteration will transform Latin
74             letters to Cyrillic.
75              
76             =head1 METHODS
77              
78             =head2 new(I<"name of table">)
79              
80             Initializes an object with the specific transliteration table, e.g. "ISO 9".
81              
82             =cut
83              
84             sub new {
85 40     40 1 9761 my $class = shift();
86 40         87 my $name = shift();
87              
88 40         81 my $self;
89              
90             # Assure that a table name was set
91 40 100       266 croak("No transliteration name given.") unless $name;
92              
93             # Stay compatible with programs that use Lingua::Translit < 0.05
94 39 100       193 if ( $name =~ /^DIN 5008$/i ) {
95 1         3 $name = "Common DEU";
96             }
97              
98 39         189 my $table = Lingua::Translit::Tables::_get_table_reference($name);
99              
100             # Check that a table reference was assigned to the object
101 39 50       172 croak("No table found for $name.") unless $table;
102              
103             # Assure the table's data is complete
104 39 50       140 croak("$name table: missing 'name'") unless defined $table->{name};
105 39 50       140 croak("$name table: missing 'desc'") unless defined $table->{desc};
106 39 50       135 croak("$name table: missing 'reverse'") unless defined $table->{reverse};
107 39 50       130 croak("$name table: missing 'rules'") unless defined $table->{rules};
108              
109             # Copy over the table's data
110 39         121 $self->{name} = $table->{name};
111 39         104 $self->{desc} = $table->{desc};
112 39         132 $self->{rules} = $table->{rules};
113              
114             # Set a truth value of the transliteration's reversibility according to
115             # the natural language string in the original transliteration table
116 39 100       192 $self->{reverse} = ( $table->{reverse} =~ /^true$/i ) ? 1 : 0;
117              
118 39         93 undef($table);
119              
120 39         143 return bless $self, $class;
121             }
122              
123             =head2 translit(I<"character oriented string">)
124              
125             Transliterates the given text according to the object's transliteration
126             table.
127             Returns the transliterated text.
128              
129             =cut
130              
131             sub translit {
132 153     153 1 21682 my $self = shift();
133 153         294 my $text = shift();
134              
135             # Return if no input was given
136 153 50       444 return unless $text;
137              
138 153         647 my $utf8_flag_on = Encode::is_utf8($text);
139              
140 153 100       387 unless ($utf8_flag_on) {
141 59         229 $text = decode( "UTF-8", $text );
142             }
143              
144 153         5951 foreach my $rule ( @{ $self->{rules} } ) {
  153         434  
145 14308 100       262762 if ( defined $rule->{context} ) {
146 493         816 my $c = $rule->{context};
147              
148             # single context rules
149 493 100 100     3068 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
150 107         1683 $text =~ s/$rule->{from}(?=$c->{before})/$rule->{to}/g;
151             }
152             elsif ( defined $c->{after} && !defined $c->{before} ) {
153 4     4   26 $text =~ s/(?<=$c->{after})$rule->{from}/$rule->{to}/g;
  4         8  
  4         42  
  223         3587  
154             }
155              
156             # double context rules: logical "inbetween"
157             elsif ( defined $c->{before} && defined $c->{after} ) {
158 163     7   2380 $text =~ s/
  7         47  
  7         13  
  7         78  
159             (?<=$c->{after})$rule->{from}(?=$c->{before})
160             /$rule->{to}/gx;
161             }
162              
163             else {
164 0         0 croak("incomplete rule context");
165             }
166             }
167             else {
168 13815         192181 $text =~ s/$rule->{from}/$rule->{to}/g;
169             }
170             }
171              
172 153 100       409 unless ($utf8_flag_on) {
173 59         224 return encode( "UTF-8", $text );
174             }
175             else {
176 94         398 return $text;
177             }
178             }
179              
180             =head2 translit_reverse(I<"character oriented string">)
181              
182             Transliterates the given text according to the object's transliteration
183             table, but uses it the other way round. For example table ISO 9 is a
184             transliteration scheme for the converion of Cyrillic letters to the Latin
185             alphabet. So if used reverse, Latin letters will be mapped to Cyrillic ones.
186              
187             Returns the transliterated text.
188              
189             =cut
190              
191             sub translit_reverse {
192 18     18 1 8315 my $self = shift();
193 18         44 my $text = shift();
194              
195             # Return if no input was given
196 18 50       80 return unless $text;
197              
198             # Is this transliteration reversible?
199 18 50       88 croak("$self->{name} cannot be reversed") unless $self->{reverse};
200              
201 18         68 my $utf8_flag_on = Encode::is_utf8($text);
202              
203 18 100       61 unless ($utf8_flag_on) {
204 15         58 $text = decode( "UTF-8", $text );
205             }
206              
207 18         916 foreach my $rule ( @{ $self->{rules} } ) {
  18         64  
208 1503 100       37370 if ( defined $rule->{context} ) {
209 178         283 my $c = $rule->{context};
210              
211             # single context rules
212 178 100 100     1263 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
213 54         819 $text =~ s/$rule->{to}(?=$c->{before})/$rule->{from}/g;
214             }
215             elsif ( defined $c->{after} && !defined $c->{before} ) {
216 19         183 $text =~ s/(?<=$c->{after})$rule->{to}/$rule->{from}/g;
217             }
218              
219             # double context rules: logical "inbetween"
220             elsif ( defined $c->{before} && defined $c->{after} ) {
221 105         1365 $text =~ s/
222             (?<=$c->{after})$rule->{to}(?=$c->{before})
223             /$rule->{from}/gx;
224             }
225              
226             else {
227 0         0 croak("incomplete rule context");
228             }
229             }
230             else {
231 1325         9596 $text =~ s/$rule->{to}/$rule->{from}/g;
232             }
233             }
234              
235 18 100       59 unless ($utf8_flag_on) {
236 15         56 return encode( "UTF-8", $text );
237             }
238             else {
239 3         10 return $text;
240             }
241             }
242              
243             =head2 can_reverse()
244              
245             Returns true (1), iff reverse transliteration is possible.
246             False (0) otherwise.
247              
248             =cut
249              
250             sub can_reverse {
251 22     22 1 897 return $_[0]->{reverse};
252             }
253              
254             =head2 name()
255              
256             Returns the name of the chosen transliteration table, e.g. "ISO 9".
257              
258             =cut
259              
260             sub name {
261 12     12 1 5746 return $_[0]->{name};
262             }
263              
264             =head2 desc()
265              
266             Returns a description for the transliteration,
267             e.g. "ISO 9:1995, Cyrillic to Latin".
268              
269             =cut
270              
271             sub desc {
272 12     12 1 60 return $_[0]->{desc};
273             }
274              
275             =head1 SUPPORTED TRANSLITERATIONS
276              
277             =over 4
278              
279             =item Cyrillic
280              
281             I, not reversible, ALA-LC:1997, Cyrillic to Latin, Russian
282              
283             I, reversible, ISO 9:1995, Cyrillic to Latin
284              
285             I, reversible, ISO 9:1954, Cyrillic to Latin
286              
287             I, reversible, DIN 1460:1982, Cyrillic to Latin, Russian
288              
289             I, reversible, DIN 1460:1982, Cyrillic to Latin, Ukrainian
290              
291             I, reversible, DIN 1460:1982, Cyrillic to Latin, Bulgarian
292              
293             I, not reversible, The Streamlined System: 2006,
294             Cyrillic to Latin, Bulgarian
295              
296             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
297             Russian
298              
299             I, not reversible, GOST 7.79:2000 (table B), Cyrillic to
300             Latin with support for Old Russian (pre 1918), Russian
301              
302             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
303             Ukrainian
304              
305             I, not reversible, BGN/PCGN:1947 (Standard Variant),
306             Cyrillic to Latin, Russian
307              
308             I, not reversible, BGN/PCGN:1947 (Strict Variant),
309             Cyrillic to Latin, Russian
310              
311             =item Greek
312              
313             I, not reversible, ISO 843:1997, Greek to Latin
314              
315             I, not reversible, DIN 31634:1982, Greek to Latin
316              
317             I, not reversible, Greeklish (Phonetic), Greek to Latin
318              
319             =item Latin
320              
321             I, not reversible, Czech without diacritics
322              
323             I, not reversible, German without umlauts
324              
325             I, not reversible, Unaccented Polish
326              
327             I, not reversible, Romanian without diacritics as commonly used
328              
329             I, not reversible, Slovak without diacritics
330              
331             I, not reversible, Slovenian without diacritics
332              
333             I, reversible, Romanian with appropriate diacritics
334              
335             =item Arabic
336              
337             I, not reversible, Common Romanization of Arabic
338              
339             =item Sanskrit
340              
341             I, not reversible, IAST Romanization to Devanāgarī
342              
343             I, not reversible, Devanāgarī to IAST Romanization
344              
345             =back
346              
347             =head1 ADDING NEW TRANSLITERATIONS
348              
349             In case you want to add your own transliteration tables to
350             L, have a look at the developer documentation at
351             L.
352              
353             A template of a transliteration table is provided as well
354             (F) so you can easily start developing.
355              
356              
357             =head1 RESTRICTIONS
358              
359             L is suited to handle B and utilizes comparisons
360             and regular expressions that rely on B.
361             Therefore, any input is supposed to be B
362             (C, ...) instead of byte oriented.
363              
364             However, if your data is byte oriented, be sure to pass it
365             B to translit() and/or translit_reverse() - it will be
366             converted internally.
367              
368             =head1 BUGS
369              
370             None known.
371              
372             Please report bugs using CPAN's request tracker at
373             L.
374              
375             =head1 SEE ALSO
376              
377             L, L, L
378              
379             C's manpage
380              
381             L
382              
383             =head1 CREDITS
384              
385             Thanks to Dr. Daniel Eiwen, Romanisches Seminar, Universitaet Koeln for his
386             help on Romanian transliteration.
387              
388             Thanks to Dmitry Smal and Rusar Publishing for contributing the "ALA-LC RUS"
389             transliteration table.
390              
391             Thanks to Ahmed Elsheshtawy for his help implementing the "Common ARA" Arabic
392             transliteration.
393              
394             Thanks to Dusan Vuckovic for contributing the "ISO/R 9" transliteration table.
395              
396             Thanks to Ștefan Suciu for contributing the "ISO 8859-16 RON" transliteration
397             table.
398              
399             Thanks to Philip Kime for contributing the "IAST Devanagari" and "Devanagari
400             IAST" transliteration tables.
401              
402             Thanks to Nikola Lečić for contributing the "BGN/PCGN RUS Standard" and
403             "BGN/PCGN RUS Strict" transliteration tables.
404              
405             =head1 AUTHORS
406              
407             Alex Linke
408              
409             Rona Linke
410              
411             =head1 LICENSE AND COPYRIGHT
412              
413             Copyright (C) 2007-2008 Alex Linke and Rona Linke
414              
415             Copyright (C) 2009-2016 Lingua-Systems Software GmbH
416              
417             Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
418              
419             This module is free software; you can redistribute it and/or modify it under
420             the same terms as Perl itself.
421              
422             =cut
423              
424             1;
425              
426             # vim: set ft=perl sts=4 sw=4 ts=4 ai et: