File Coverage

blib/lib/MARC/Detrans/Rules.pm
Criterion Covered Total %
statement 59 59 100.0
branch 31 32 96.8
condition 8 11 72.7
subroutine 9 9 100.0
pod 4 7 57.1
total 111 118 94.0


line stmt bran cond sub pod time code
1             package MARC::Detrans::Rules;
2              
3 9     9   1978 use strict;
  9         20  
  9         370  
4 9     9   55 use warnings;
  9         20  
  9         9890  
5              
6             =head1 NAME
7              
8             MARC::Detrans::Rules - A set of detransliteration rules
9              
10             =head1 SYNOPSIS
11              
12             use MARC::Detrans::Rules;
13             my $rules = MARC::Detrans::Rules->new();
14             $rules->addRule( MARC::Detrans::Rule->new( from=>'a', to='b' ) );
15              
16             =head1 DESCRIPTION
17              
18             MARC::Detrans::Rules provides the core functionality for managing
19             detransliteration rules and for converting transliterated text to
20             MARC-8. A MARC::Detrans::Rules object is essentially a collection of
21             MARC::Detrans::Rule objects which are consulted during a call to convert().
22              
23             =head1 METHODS
24              
25             =cut
26              
27             =head2 new()
28              
29             Create an empty rules object to add individual rules to.
30              
31             =cut
32              
33             sub new {
34 9     9 1 1240 my $class = shift;
35 9         58 my $self = { rules => {}, error => undef };
36 9   33     102 return bless $self, ref( $class ) || $class;
37             }
38              
39             =head2 addRule()
40              
41             Add a MARC::Detrans::Rule to the rules object.
42              
43             =cut
44              
45             sub addRule {
46 465     465 1 715 my ( $self, $rule ) = @_;
47             ## get first character off the source for lookup
48             ## since we'll be processing a character at a time
49 465         1465 my $key = substr( $rule->from(), 0, 1 );
50             ## look for existing rules with this key
51 465 100       2562 my $rules = exists($self->{rules}{$key}) ? $self->{rules}{$key} : [];
52             ## and the new rule and sort the rules so that the longest come first.
53             ## this will mean that when we go to use the rules in convert()
54             ## that the longest match will occur first.
55 465         1035 push( @$rules, $rule );
56 465         2886 @$rules = sort byRule @$rules;
57             ## stash away the new rules
58 465         2123 $self->{rules}{$key} = $rules;
59             }
60              
61             sub byRule {
62             return
63 309     309 0 1000 length( $b->from() . $b->position() )
64             <=>
65             length( $a->from() . $a->position() )
66             }
67              
68             =head2 convert()
69              
70             convert() applies the rules contained in the MARC::Detrans::Rules object
71             to convert a string that is passed in.
72              
73             =cut
74              
75             sub convert {
76 82     82 1 1463 my ( $self, $in ) = @_;
77             ## ok, this is probably the most complicated bit of the distro
78             ## and it's not really that bad.
79 82         243 my $inLength = length( $in );
80 82         107 my $out = '';
81 82         104 my $pos = 0;
82 82         112 my $currentEscape = '';
83             ## we're going to step through the source string and build up $out
84             ## to contain the de-transliterated text
85 82         834 while ( $pos < $inLength ) {
86             ## extract the character at the current position
87             ## and look to see if we have a rule for it
88 1117         2740 my $key = substr( $in, $pos, 1 );
89 1117 100       6144 my $rules = exists $self->{rules}{$key} ? $self->{rules}{$key} : [];
90 1117         3371 pos($in) = $pos;
91 1117         1424 my $foundRule;
92             ## go through each of the rules and see if we've got a match
93 1117         1851 foreach my $rule ( @$rules ) {
94 1513         3952 my $from = $rule->from();
95             ## if the rule matches remember it for later and jump out of
96             ## the loop since we've got what we needed
97             ## \G anchors the match at our current position
98             ## \Q...\E makes sure that metacharacters in our pattern are escaped
99 1513 100       26696 if ( $in =~ m/\G\Q$from\E/ ) {
100 1128   100     6786 my $position = $rule->position() || '';
101 1128 100       7513 if ( $position eq 'initial' ) {
    100          
    100          
102 22 100       61 next unless isInitial( $in, $pos );
103             }
104             elsif ( $position eq 'medial' ) {
105 8 100 66     13 next if isInitial( $in, $pos ) or isFinal( $in, $pos );
106             }
107             elsif ( $position eq 'final' ) {
108 4 50       8 next unless isFinal( $in, $pos );
109             }
110 1115         2296 $foundRule = $rule;
111 1115         3051 last;
112             }
113             }
114             ## no matched rule, then we've got a character in the source
115             ## data which doesn't map. Store the error and return asap.
116 1117 100       3280 if ( ! defined($foundRule) ) {
117 2         14 $self->{error} = sprintf(
118             qq(no matching rule found for "%s" [0x%x] at position %i),
119             $key, ord($key), $pos+1 );
120 2         14 return;
121             }
122             ## advance the position the amount of characters that we matched
123 1115         3391 $pos += length( $foundRule->from() );
124             ## if the rule has an associated MARC-8 escape character tag it
125             ## onto the output text
126 1115 100 100     4326 if ($foundRule->escape() and $foundRule->escape() ne $currentEscape) {
127 77         239 $out .= chr(0x1B).$foundRule->escape();
128 77         392 $currentEscape = $foundRule->escape();
129             }
130             ## append the new text
131 1115         3464 $out .= $foundRule->to();
132             }
133             ## escape back to ASCII if approriate
134 80 100       261 if ( $currentEscape ) { $out .= chr(0x1B).chr(0x28).chr(0x42); }
  73         703  
135             ## make sure error flag is undef since we're ok now
136 80         153 $self->{error} = undef;
137             ## return the new text!
138 80         534 return( $out );
139             }
140              
141             =head2 error()
142              
143             Will return the latest error encountered during a call to convert(). Can
144             be useful for determining why a call to convert() failed. A side effect
145             of calling error() is that the error slot is reset.
146              
147             =cut
148              
149             sub error {
150 3     3 1 8 my $self = shift;
151 3         6 my $error = $self->{error};
152 3         5 $self->{error} = undef;
153 3         16 return( $error );
154             }
155              
156             =head1 AUTHORS
157              
158             =over 4
159              
160             =item * Ed Summers
161              
162             =cut
163              
164             ## helper functions to determine whether a specific positon in a string
165             ## is at the start or at the end of a word.
166              
167             sub isInitial {
168 30     30 0 48 my ($string,$position) = @_;
169 30 100       82 return 1 if $position == 0;
170 24 100       93 return 1 if substr($string,$position-1,1) =~ /\W/;
171 17         61 return 0;
172             }
173              
174             sub isFinal {
175 12     12 0 16 my ($string,$position) = @_;
176 12 100       39 return 1 if $position == length($string)-1;
177 6 100       39 return 1 if substr($string,$position+1,1) =~ /\W/;
178             }
179              
180             1;