File Coverage

blib/lib/Lingua/EN/Alphabet/Shaw.pm
Criterion Covered Total %
statement 143 151 94.7
branch 43 50 86.0
condition 17 26 65.3
subroutine 24 24 100.0
pod 6 7 85.7
total 233 258 90.3


line stmt bran cond sub pod time code
1             package Lingua::EN::Alphabet::Shaw;
2              
3 7     7   258167 use 5.005;
  7         28  
  7         307  
4 7     7   42 use strict;
  7         14  
  7         301  
5 7     7   53 use warnings;
  7         32  
  7         221  
6 7     7   393275 use DBI;
  7         267728  
  7         1983  
7 7     7   9239 use Encode;
  7         167369  
  7         1315  
8 7     7   21478 use File::ShareDir qw(dist_file);
  7         82554  
  7         669  
9 7     7   12218 use HTML::Parser;
  7         104934  
  7         27687  
10              
11             our $VERSION = 0.64;
12              
13             sub new {
14 7     7 1 4914 my ($class) = @_;
15             my $self = {
16             dbh => undef,
17             sth => undef,
18             map => undef,
19             # default behaviour for "unknown" is to return its argument
20 3     3   33 unknown => sub { $_[0]; },
21 7         84 };
22 7         42 return bless($self, $class);
23             }
24              
25             sub unknown_handler {
26 2     2 1 17 my ($self, $handler) = @_;
27              
28 2 100       13 $self->{unknown} = $handler if defined $handler;
29              
30 2         13 return $self->{unknown};
31             }
32              
33             my %_source_to_bank = (
34             0 => 'W', # Shavian wiki
35             1 => 'C', # CMUDict
36             2 => 'A', # Androcles and the Lion
37             );
38              
39             sub transliterate_details {
40              
41 23     23 0 44 my @result;
42              
43 23         57 my ($self, @texts) = @_;
44              
45 23 100       137 unless (defined $self->{dbh}) {
46 6         13 my $filename;
47              
48             # allow a local override
49 6         12846 $filename = glob('~/.cache/shavian/shavian-set.sqlite');
50 6 50       140 $filename = dist_file('Lingua-EN-Alphabet-Shaw', 'shavian-set.sqlite') unless -e $filename;
51              
52 6         2115 $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$filename","","");
53 6         129373 $self->{sth} = $self->{dbh}->prepare('select shaw, pos, dab, source from words where latn=?');
54             }
55              
56 23         2827 my $prevpos = 'n'; # sensible default
57              
58             my $lookup_word = sub {
59 113     113   149 my ($word) = @_;
60              
61 113         14329 $self->{sth}->execute(lc $word);
62 113         2064 my $homonyms = $self->{sth}->fetchall_arrayref();
63             return {
64 113 100       319 bank => 'U',
65             src => $word,
66             text => $self->{'unknown'}->($word, $word),
67             } unless @$homonyms;
68 109         158 my $candidate = $homonyms->[0];
69 109         214 for (@$homonyms) {
70 122 100       8024 $candidate = $_ if $_->[2] =~ $prevpos;
71 122 100 100     383 $candidate = $_ if $_->[2] eq 'g' && $word =~ /^[A-Z]/;
72 122 100 100     475 $candidate = $_ if $_->[2] eq 'h' && $word =~ /^[a-z]/;
73             }
74              
75 109         173 $prevpos = $candidate->[1];
76              
77 109   50     643 my $result = {
78             bank => $_source_to_bank{$candidate->[3]} || '?',
79             src => $word,
80             text => decode_utf8($candidate->[0]),
81             };
82              
83 109 100       5995 $result->{'dab'}=1 if scalar(@$homonyms)>1;
84              
85 109         585 return $result;
86 23         153 };
87              
88             my $store_literal = sub {
89 162     162   241 my ($literal) = @_;
90 162 100       366 return if $literal eq '';
91              
92 139 100 100     716 if (@result && $result[-1]->{'bank'} eq 'L') {
93 24         89 $result[-1]->{'text'} .= $literal;
94             } else {
95 115         452 push @result, { bank=>'L', text=>$literal };
96             }
97 23         91 };
98              
99 23         87 while (@texts) {
100 43         81 my $text = shift @texts;
101              
102 43         586 my @splittext = split(m/(?
103              
104 43         121 while (@splittext) {
105 141         331 $store_literal->(shift @splittext);
106              
107 141 100       2351 push @result, $lookup_word->(shift @splittext) if @splittext;
108             }
109              
110 43 100       4819 $store_literal->(shift @texts) if @texts;
111             }
112              
113 23 50       469 return @result if wantarray;
114 0         0 return [@result];
115             }
116              
117             sub transliterate {
118 22     22 1 2714 my ($self, @texts) = @_;
119              
120 22         85 return join('', map { $_->{'text'} } $self->transliterate_details(@texts) );
  211         1368  
121             }
122              
123             sub mapping {
124              
125 5     5 1 8 my ($self, $text) = @_;
126              
127 5 100       15 unless (defined $self->{map}) {
128 1         21 $self->{map} = {};
129 1         3 my $codepoint = 66640;
130 1         5 for (qw(p t k f T s S c j N b d g v H z
131             Z J w h l m i e A a o U Q y r n
132             I E F u O M q Y R P X x D C W V)) {
133 48         90 $self->{map}->{chr($codepoint)} = $_;
134 48         98 $self->{map}->{$_} = chr($codepoint);
135 48         51 $codepoint++;
136             }
137              
138 1         3 my $naming_dot = chr(0xB7);
139 1         2 $self->{map}->{$naming_dot} = 'G';
140 1         2 $self->{map}->{'G'} = $naming_dot;
141 1         3 $self->{map}->{'B'} = $naming_dot;
142             # some standards also map it to the solidus
143             # but that will stop this function being
144             # its own inverse
145             }
146              
147             my $remap = sub {
148 82     82   100 my ($char) = @_;
149 82 100       291 return $self->{map}->{$char} if defined $self->{map}->{$char};
150 24         53 return $char;
151 5         21 };
152              
153 5         25 $text =~ s/(.)/$remap->($1)/ge;
  82         113  
154 5         40 return $text;
155             }
156              
157             sub normalise {
158 7     7 1 15992 my ($self, $shaw) = @_;
159              
160 7         76 my %mappings = (
161             chr(66664).chr(66670) => chr(66680), # ash + roar = are
162             chr(66666).chr(66670) => chr(66681), # on + roar = or
163             chr(66663).chr(66670) => chr(66682), # egg + roar = air
164             chr(66675).chr(66670) => chr(66683), # up + roar = err
165             chr(66665).chr(66670) => chr(66684), # ado + roar = array
166             chr(66662).chr(66670) => chr(66685), # if + roar = ear
167             chr(66662).chr(66665) => chr(66686), # if + ado = ian
168             chr(66648).chr(66677) => chr(66687), # yea + ooze = yew
169             );
170              
171 7         62 for (keys %mappings) {
172 56         600 $shaw =~ s/$_/$mappings{$_}/g;
173             }
174              
175 7         55 return $shaw;
176             }
177              
178             sub transliterate_html {
179 1     1 1 952 my ($self, $html, %flags) = @_;
180              
181 1         3 my @content;
182             my $result;
183              
184 1         5 my %toplevel_tags = map {$_=>1} qw(p div h1 h2 h3 h4 h5 h6 ul ol li dt dd dl title);
  15         35  
185 1         4 my %text_attrs = map {$_=>1} qw(alt title);
  2         16  
186              
187 1         4 my $generator_seen = 0;
188 1         3 my $generator_name = ref($self);
189              
190             my $output = sub {
191 27     27   44 my ($repr, $tag) = @_;
192              
193 27 100 100     328 if (!$tag || $toplevel_tags{$tag}) {
194 9         16 my $want_tag = 0;
195 9         26 my @ordered = ('');
196 9         22 for (@content) {
197 39         97 my $is_tag = /^
198 39 100       81 if ($want_tag != $is_tag) {
199 31         48 push @ordered, '';
200 31         35 $want_tag = $is_tag;
201             }
202 39         85 $ordered[-1] .= $_;
203             }
204 9 50       21 if ($flags{'titles'}) {
205             # FIXME we should also include class="dab" if they ask for it
206 0         0 my $entity = 0;
207 0         0 for my $detail ($self->transliterate_details(@ordered)) {
208 0 0 0     0 if (defined $detail->{'src'} && !$entity) {
209 0         0 $result .= ' 210             $detail->{'src'} .
211             '">' .
212             $detail->{'text'} .
213             '';
214             } else {
215 0         0 $result .= $detail->{'text'};
216 0         0 $entity = ($detail->{'text'} =~ /&$/);
217             }
218             }
219             } else {
220 9         42 $result .= $self->transliterate(@ordered);
221             }
222 9         80 @content = ();
223 9         209 $result .= $repr;
224             } else {
225 18         112 push @content, $repr;
226             }
227 1         8 };
228              
229             my $p = HTML::Parser->new( api_version => 3,
230             handlers => {
231             text => [sub {
232 20     20   36 my ($text) = @_;
233 20         117 push @content, $text;
234             }, 'text'],
235             start => [sub {
236 13     13   29 my ($tag, $attrs) = @_;
237 13         28 my $repr = "<$tag";
238 13         64 for my $attr (sort keys %$attrs) {
239 5 100       14 next if $attr eq '/';
240 4         12 my $value = $attrs->{$attr};
241 4 100       19 $value = $self->transliterate($value)
242             if $text_attrs{$attr};
243 4         36 $repr .= " $attr=\"$value\"";
244             }
245 13 100       43 $repr .= '/' if $attrs->{'/'};
246 13         22 $repr .= '>';
247              
248 13 0 33     55 if ($tag eq 'meta' &&
      33        
249             lc($attrs->{'name'}) eq 'generator' &&
250             lc($attrs->{'content'}) eq lc($generator_name)) {
251            
252 0         0 $generator_seen = 1;
253             }
254              
255 13         29 $output->($repr, $tag);
256             }, 'tagname, attr'],
257             end => [sub {
258 12     12   25 my ($tag) = @_;
259 12         34 my $repr .= "";
260              
261 12 100 66     43 if ($tag eq 'head' && !$generator_seen) {
262 1         5 $output->("",
263             $tag);
264             }
265              
266 12         28 $output->($repr, $tag);
267             }, 'tagname'],
268             comment => [sub {
269 1     1   3 my ($text) = @_;
270 1         5 push @content, $text;
271 1         40 }, 'text'],
272             },
273             marked_sections => 1,
274             );
275              
276 1         112 $p->parse($html);
277 1         12 $p->eof();
278 1         3 $output->('');
279              
280 1         1216 return $result;
281             }
282              
283             sub DESTROY {
284 7     7   8738 my ($self) = @_;
285              
286 7 100       2061 $self->{sth}->finish() if defined $self->{sth};
287             }
288              
289             1;
290             =head1 NAME
291              
292             Lingua::EN::Alphabet::Shaw - transliterate the Latin to Shavian alphabets
293              
294             =head1 AUTHOR
295              
296             Thomas Thurman
297              
298             =head1 SYNOPSIS
299              
300             use Lingua::EN::Alphabet::Shaw;
301              
302             my $shaw = Lingua::EN::Alphabet::Shaw->new();
303             print $shaw->transliterate('I live near a live wire.');
304              
305             =head1 DESCRIPTION
306              
307             The Shaw or Shavian alphabet was commissioned by the will of the playwright
308             George Bernard Shaw in the early 1960s as a replacement for the Latin
309             alphabet for representing English. It is designed to have a one-to-one
310             phonemic (not phonetic) mapping with the sounds of English.
311              
312             Its ISO 15924 code is "Shaw" 281.
313              
314             This module transliterates English text from the Latin alphabet into the
315             Shavian alphabet.
316              
317             The API has changed since version 0.03 to be object-based.
318              
319             If you find an error in the translation database, you can change it
320             yourself at http://shavian.org.uk/wiki/ . You may download a current
321             copy of the dataset at http://shavian.org.uk/set/ .
322             If you want to override the database shipped with this module,
323             place the new copy at ~/.cache/shavian/shavian-set.sqlite and it will
324             be used in preference.
325              
326             =head1 METHODS
327              
328             =head2 Lingua::EN::Alphabet::Shaw->new()
329              
330             Constructor. Currently takes no arguments.
331              
332             =head2 $shaw->transliterate($phrase)
333              
334             Returns the transliteration of the given phrase into the Shavian alphabet.
335             Can handle multi-word phrases. Does a reasonable job resolving homonym
336             ambiguity ("does he like does?").
337              
338             If you pass multiple arguments, the results will be concatenated, and only the
339             odd-numbered arguments will be transliterated. The state of homonym
340             resolution is maintained. This allows you to embed chunks of text
341             which should not be transliterated into the line, such as XML tags.
342              
343             =head2 $shaw->unknown_handler([$handler])
344              
345             If a word is not found in the translation database, the transliteration
346             routines will call a particular handler to find out what to do, with the
347             unknown word as both its first and second arguments. (This is to allow
348             later expansion; see BUGS AND ISSUES, below.)
349             The result of the handler should be
350             a string, which will be inserted into the result of the transliteration
351             routine at the correct place.
352              
353             This method allows you to set a new handler by passing it as an argument.
354             If you pass no argument, this method returns the current handler.
355              
356             The default handler only returns its argument. A replacement handler could,
357             for example, make an attempt at guessing the transliteration; it could die,
358             to abort the transliteration process; it could return its argument but
359             also store the new value in a table so that a list of missing words could
360             later be reported to the user.
361              
362             =head2 $shaw->mapping($phrase)
363              
364             There is a quasi-standard mapping of the conventional alphabet onto the Shavian
365             alphabet. This method maps Shavian text into the conventional alphabet
366             and vice versa. It does not transliterate.
367             Think of this as a kind of ASCII-armouring.
368              
369             Various versions of the standard map the naming dot to "G", "B", and "/".
370             This method does not support "/", but maps both "G" and "B" to the naming
371             dot; in reverse, it maps the naming dot to "G".
372              
373             The letters "K" and "L" have no mapping to Shavian letters, and are
374             left alone.
375              
376             =head2 $shaw->normalise($shavian_text)
377              
378             Certain letters in the Shavian alphabet are ligatures of pairs of
379             other letters: because of this, these pairs should not exist separately.
380             (For example, the letter YEW is a ligature of YEA and OOZE.) This method
381             replaces these pairs with their ligature equivalents.
382              
383             =head2 $shaw->transliterate_html($html)
384              
385             Given a block of text in the conventional alphabet which is formatted
386             as HTML, this will make a reasonable attempt at returning the same text
387             transliterated into the Shavian alphabet. It is aware of which tags
388             commonly break the flow of sentences, and handles homonym resolution
389             accordingly.
390              
391             =head1 BUGS AND ISSUES
392              
393             There should be a version of the main transliteration method which
394             returned a list of hashes, each of which gave the source and
395             destination forms of a word, part of speech and disambiguation
396             information, and a marking of the source (CMUDict or
397             Shavian Wiki).
398              
399             It should probably be possible to transliterate in reverse,
400             from Shavian to the conventional alphabet.
401              
402             It should be possible to handle other alternative scripts, such as
403             Deseret and Tengwar. This shouldn't be very difficult.
404             It would also allow representation in the IPA, which would mean
405             this module could be used for simple text-to-speech processing.
406              
407             The portion of the database which is taken from CMUdict exhibits
408             unhelpful mergers (notably father/bother). There isn't much that
409             can be done about this except extending the Shavian wiki further.
410             In addition, in some cases it does not use the letters ARRAY and
411             ADO in unstressed syllables as they should be; this could and should be
412             fixed.
413              
414             It would be useful on initialisation to read a text file
415             in a standard location, which gave a local mapping overriding the
416             database for given words.
417              
418             It would be helpful if there was a callback for any words found
419             from the CMUDict data rather than from the Shavian Wiki data, so that
420             the wiki could be updated.
421              
422             The HTML transliterator should mark its output as being
423             encoded in UTF-8, whatever the source encoding. (Shavian cannot
424             be represented in any other standard encoding.)
425              
426             The HTML transliterator should have an option which put a span
427             around each word whose title was the word's spelling in the
428             conventional alphabet, in the manner of translate.google.com.
429              
430             The HTML transliterator should have an option to rewrite the
431             destinations of links, and to add a target to them, so that
432             it can be used by a web script to link back to itself.
433              
434             The HTML transliterator should add a "generator" META tag
435             referencing itself, if one is not already present.
436              
437             The HTML transliterator should ignore sections marked as
438             being written in non-English languages.
439              
440             The HTML transliterator should have an option to
441             allow loading documents in chunks, as C already does.
442              
443             The mapping() method should have an extra parameter to
444             cause it to map in one direction only.
445              
446             Most of these will be implemented before this module reaches
447             version 1.00.
448              
449             =head1 FONTS
450              
451             You will need a Shavian Unicode font to use this module.
452             There are several such fonts at http://marnanel.org/shavian/fonts/ .
453             Please be sure to get a Unicode font and not one with the "Latin mapping".
454              
455             However, the Mac can handle the Shavian alphabet out of the box.
456              
457             =head1 COPYRIGHT
458              
459             This Perl module is copyright (C) Thomas Thurman, 2009-2010.
460             This is free software, and can be used/modified under the same terms as
461             Perl itself.
462              
463             The transliteration data is available under various free licences,
464             which are reproduced below.
465              
466             =head1 LICENCES
467              
468             =head2 Androcles and the Lion
469              
470             Part of the transliteration data was taken from the 1962 Shavian alphabet
471             edition of "Androcles and the Lion"; this data is in the public domain.
472              
473             =head2 Shavian Wiki
474              
475             Part of the transliteration data was taken from the Shavian Wiki, and
476             this is available under the Creative Commons cc-by-sa licence.
477              
478             =head2 CMUdict
479              
480             Another part of the transliteration data was taken from CMUdict. Its
481             licence is reproduced below.
482              
483             Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
484              
485             Redistribution and use in source and binary forms, with or without
486             modification, are permitted provided that the following conditions
487             are met:
488              
489             1. Redistributions of source code must retain the above copyright
490             notice, this list of conditions and the following disclaimer.
491             The contents of this file are deemed to be source code.
492              
493             2. Redistributions in binary form must reproduce the above copyright
494             notice, this list of conditions and the following disclaimer in
495             the documentation and/or other materials provided with the
496             distribution.
497              
498             This work was supported in part by funding from the Defense Advanced
499             Research Projects Agency, the Office of Naval Research and the National
500             Science Foundation of the United States of America, and by member
501             companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
502             the contributions of many volunteers to the expansion and improvement of
503             this dictionary.
504              
505             THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
506             ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
507             THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
508             PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
509             NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
510             SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
511             LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
512             DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
513             THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
514             (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
515             OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
516              
517             =head2 Brown tagger
518              
519             The part-of-speech data was taken from the Brown tagger (although the
520             tagger built into this model is not the Brown tagger, so its first
521             sentence is inaccurate). Its licence is also reproduced below:
522              
523             This software was written by Eric Brill.
524              
525             This software is being provided to you, the LICENSEE, by the
526             Massachusetts Institute of Technology (M.I.T.) under the following
527             license. By obtaining, using and/or copying this software, you agree
528             that you have read, understood, and will comply with these terms and
529             conditions:
530              
531             Permission to [use, copy, modify and distribute, including the right to
532             grant others rights to distribute at any tier, this software and its
533             documentation for any purpose and without fee or royalty] is hereby
534             granted, provided that you agree to comply with the following copyright
535             notice and statements, including the disclaimer, and that the same
536             appear on ALL copies of the software and documentation, including
537             modifications that you make for internal use or for distribution:
538              
539             Copyright 1993 by the Massachusetts Institute of Technology and the
540             University of Pennsylvania. All rights reserved.
541              
542             THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS
543             OR WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not
544             limitation, M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF
545             MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
546             THE LICENSED SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
547             PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
548              
549             The name of the Massachusetts Institute of Technology or M.I.T. may NOT
550             be used in advertising or publicity pertaining to distribution of the
551             software. Title to copyright in this software and any associated
552             documentation shall at all times remain with M.I.T., and USER agrees to
553             preserve same.