File Coverage

blib/lib/MILA/Transliterate.pm
Criterion Covered Total %
statement 25 25 100.0
branch 2 2 100.0
condition n/a
subroutine 9 9 100.0
pod 6 7 85.7
total 42 43 97.6


line stmt bran cond sub pod time code
1             package MILA::Transliterate;
2 3     3   95613 use utf8;
  3         34  
  3         24  
3 3     3   99 use Exporter;
  3         10  
  3         3549  
4             our @ISA = qw(Exporter);
5             our @EXPORT_OK = qw(hebrew2treebank treebank2hebrew hebrew2erel erel2hebrew hebrew2fsma fsma2hebrew);
6             our $VERSION = 0.01;
7             =head1 NAME
8              
9             MILA::Transliterate - A Perl Module for transliterating text from Hebrew to various transliterations used in the Knowledge Center for Processing Hebrew (MILA) and vice versa
10              
11             =head1 SYNOPSIS
12              
13             use MILA::Transliterate qw(hebrew2treebank hebrew2erel hebrew2fsma);
14             my $erel_transliterated = hebrew2erel($utf8_encoded_hebrew_text);
15             my $treebank_transliterated = hebrew2treebank($utf8_encoded_hebrew_text);
16             my $fsma_transliterated = hebrew2fsma($utf8_encoded_hebrew_text);
17              
18             # note that the reverse transliteration does NOT maintain final Hebrew letters!
19              
20             =head1 DESCRIPTION
21              
22             Before Unicode was widely used, applications that manipulated Hebrew text usually used some transliteration into ASCII characters instead of using Hebrew letters. This was particularly true for software developed in academia. MILA is a nickname for the Knowledge Center for Processing Hebrew (see: http://mila.cs.technion.ac.il/). This knowledge center develops software and standards that result from research in natural language processing for Hebrew. As a result, some legacy software also needs to be maintained, and such legacy software usually used transliteration.
23              
24             This module contains mappings from UTF-8 encoded Hebrew to the various transliteration schemes that MILA needs to support, and also contains the reverse mappings.
25              
26             =head1 FUNCTIONS
27              
28             =head2 $treebank_transliterated = hebrew2treebank( $utf8_encoded_hebrew_text )
29              
30             This function maps UTF-8 encoded Hebrew text into the treebank transliteration. Every character not in the mapping is copied as is, without any conversion.
31              
32             =head2 $erel_transliterated = hebrew2erel( $utf8_encoded_hebrew_text )
33              
34             This function maps UTF-8 encoded Hebrew text into the erel transliteration. Every character not in the mapping is copied as is, without any conversion.
35              
36             =head2 $fsma_transliterated = hebrew2fsma( $utf8_encoded_hebrew_text )
37              
38             This function maps UTF-8 encoded Hebrew text into the fsma transliteration. Every character not in the mapping is copied as is, without any conversion.
39              
40             =head2 $utf8_encoded_hebrew_text = treebank2hebrew( $treebank_transliterated )
41              
42             This function provides the reverse transliteration that is provided by hebrew2treebank(). Note that final letters are not preserved and are lost.
43              
44             =head2 $utf8_encoded_hebrew_text = erel2hebrew( $erel_transliterated )
45              
46             This function provides the reverse transliteration that is provided by hebrew2erel(). Note that final letters are not preserved and are lost.
47              
48             =head2 $utf8_encoded_hebrew_text = fsma2hebrew( $fsma_transliterated )
49              
50             This function provides the reverse transliteration that is provided by hebrew2fsma(). Note that final letters are not preserved and are lost.
51              
52             =head1 AUTHOR
53              
54             Shlomo Yona yona@cs.technion.ac.il http://cs.haifa.ac.il/~shlomo/
55              
56             =head1 COPYRIGHT
57              
58             Copyright (c) 2004 Shlomo Yona. All rights reserved.
59              
60             This library is free software.
61             You can redistribute it and/or modify it under the same terms as Perl itself.
62              
63             =head1 CVS INFO
64              
65             $Revision: 1.1 $
66             $Date: 2004/12/17 09:17:37 $
67              
68             =cut
69              
70             # UTF-8 Encoded Hebrew letters mapped to Treebank alphabet
71             my %h2t =(
72             א => 'A',
73             ב => 'B',
74             ג => 'G',
75             ד => 'D',
76             ה => 'H',
77             ו => 'W',
78             ז => 'Z',
79             ח => 'X',
80             ט => 'J',
81             י => 'I',
82             ך => 'K',
83             כ => 'K',
84             ל => 'L',
85             ם => 'M',
86             מ => 'M',
87             ן => 'N',
88             נ => 'N',
89             ס => 'S',
90             ע => 'E',
91             ף => 'P',
92             פ => 'P',
93             ץ => 'C',
94             צ => 'C',
95             ק => 'Q',
96             ר => 'R',
97             ש => 'F',
98             ת => 'T',
99             '"' => 'U',
100             '%' => 'O',
101             );
102              
103             # Treebank alphabet mapped to UTF-8 Encoded Hebrew letters
104             my %t2h=(
105             'A' => 'א',
106             'B' => 'ב',
107             'G' => 'ג',
108             'D' => 'ד',
109             'H' => 'ה',
110             'W' => 'ו',
111             'Z' => 'ז',
112             'X' => 'ח',
113             'J' => 'ט',
114             'I' => 'י',
115             'K' => 'כ',
116             'L' => 'ל',
117             'M' => 'ם',
118             'M' => 'מ',
119             'N' => 'נ',
120             'S' => 'ס',
121             'E' => 'ע',
122             'P' => 'פ',
123             'C' => 'צ',
124             'Q' => 'ק',
125             'R' => 'ר',
126             'F' => 'ש',
127             'T' => 'ת',
128             'U' => '"',
129             'O' => '%',
130             );
131              
132             # UTF-8 encoded Hebrew letters mapped to Erel's alphabet
133             my %h2e =(
134             א => 'A',
135             ב => 'B',
136             ג => 'G',
137             ד => 'D',
138             ה => 'H',
139             ו => 'W',
140             ז => 'Z',
141             ח => 'X',
142             ט => '@',
143             י => 'I',
144             ך => 'K',
145             כ => 'K',
146             ל => 'L',
147             ם => 'M',
148             מ => 'M',
149             ן => 'N',
150             נ => 'N',
151             ס => 'S',
152             ע => '&',
153             ף => 'P',
154             פ => 'P',
155             ץ => 'C',
156             צ => 'C',
157             ק => 'Q',
158             ר => 'R',
159             ש => '$',
160             ת => 'T',
161             );
162              
163             # Erel's alphabet mapped to UTF-8 encoded Hebrew letters
164             my %e2h=(
165             'A' => 'א',
166             'B' => 'ב',
167             'G' => 'ג',
168             'D' => 'ד',
169             'H' => 'ה',
170             'W' => 'ו',
171             'Z' => 'ז',
172             'X' => 'ח',
173             '@' => 'ט',
174             'I' => 'י',
175             'K' => 'כ',
176             'L' => 'ל',
177             'M' => 'ם',
178             'M' => 'מ',
179             'N' => 'נ',
180             'S' => 'ס',
181             '&' => 'ע',
182             'P' => 'פ',
183             'C' => 'צ',
184             'Q' => 'ק',
185             'R' => 'ר',
186             '$' => 'ש',
187             'T' => 'ת',
188             );
189              
190             # UTF-8 encoded Hebrew letters mapped to FSMA's alphabet
191             my %h2l =(
192             א => 'a',
193             ב => 'b',
194             ג => 'g',
195             ד => 'd',
196             ה => 'h',
197             ו => 'w',
198             ז => 'z',
199             ח => 'x',
200             ט => 'v',
201             י => 'i',
202             ך => 'k',
203             כ => 'k',
204             ל => 'l',
205             ם => 'm',
206             מ => 'm',
207             ן => 'n',
208             נ => 'n',
209             ס => 's',
210             ע => 'y',
211             ף => 'p',
212             פ => 'p',
213             ץ => 'c',
214             צ => 'c',
215             ק => 'q',
216             ר => 'r',
217             ש => 'e',
218             ת => 't',
219             );
220              
221             # FSMA's alphabet mapped to UTF-8 encoded Hebrew letters
222             my %l2h=(
223             'a' => 'א',
224             'b' => 'ב',
225             'g' => 'ג',
226             'd' => 'ד',
227             'h' => 'ה',
228             'w' => 'ו',
229             'z' => 'ז',
230             'x' => 'ח',
231             'v' => 'ט',
232             'i' => 'י',
233             'k' => 'כ',
234             'l' => 'ל',
235             'm' => 'ם',
236             'm' => 'מ',
237             'n' => 'נ',
238             's' => 'ס',
239             'y' => 'ע',
240             'p' => 'פ',
241             'c' => 'צ',
242             'q' => 'ק',
243             'r' => 'ר',
244             'e' => 'ש',
245             't' => 'ת',
246             );
247              
248             sub generic_translation {
249 9     9 0 16 my ($from_string,$mapping_hash) = @_;
250 9         150 my $to_string='';
251 9         1307 foreach my $c (split //,$from_string) {
252 1278 100       2614 if (exists $mapping_hash->{$c}) {
253 849         1748 $to_string.= $mapping_hash->{$c};
254             } else{
255 429         750 $to_string.=$c;
256             }
257             }
258 9         290 return $to_string;
259             }
260              
261             sub hebrew2treebank {
262 1     1 1 14 my ($hebrew_string) = @_;
263 1         7 return generic_translation($hebrew_string,\%h2t);
264             }
265              
266             sub treebank2hebrew {
267 2     2 1 5 my ($treebank_string) = @_;
268 2         6 return generic_translation($treebank_string,\%t2h);
269             }
270              
271             sub hebrew2erel {
272 1     1 1 4 my ($hebrew_string) = @_;
273 1         4 return generic_translation($hebrew_string,\%h2e);
274             }
275              
276             sub erel2hebrew {
277 2     2 1 7 my ($treebank_string) = @_;
278 2         7 return generic_translation($treebank_string,\%e2h);
279             }
280              
281             sub hebrew2fsma {
282 1     1 1 4 my ($hebrew_string) = @_;
283 1         19 return generic_translation($hebrew_string,\%h2l);
284             }
285              
286             sub fsma2hebrew {
287 2     2 1 6 my ($treebank_string) = @_;
288 2         770 return generic_translation($treebank_string,\%l2h);
289             }
290              
291              
292             1;