File Coverage

blib/lib/MARC/Detrans.pm
Criterion Covered Total %
statement 124 145 85.5
branch 47 60 78.3
condition 6 12 50.0
subroutine 17 20 85.0
pod 7 15 46.6
total 201 252 79.7


line stmt bran cond sub pod time code
1             package MARC::Detrans;
2              
3 6     6   169795 use strict;
  6         14  
  6         380  
4 6     6   36 use warnings;
  6         12  
  6         223  
5 6     6   35 use Carp qw( croak );
  6         13  
  6         357  
6 6     6   4216 use MARC::Detrans::Config;
  6         331  
  6         51  
7              
8             our $VERSION = '1.41';
9              
10             =head1 NAME
11              
12             MARC::Detrans - De-transliterate text and MARC records
13              
14             =head1 SYNOPSIS
15              
16             use MARC::Batch;
17             use MARC::Detrans;
18              
19             my $batch = MARC::Batch->new( 'marc.dat' );
20             my $detrans = MARC::Detrans->new( 'config.xml' );
21              
22             while ( my $record = $batch->next() ) {
23             my $newRecord = $detrans->convert( $record );
24             }
25              
26             =head1 DESCRIPTION
27              
28             MARC::Detrans is an eclectic addition to the already eclectic MARC::Record
29             distribution for de-transliterating MARC::Records. What is detransliteration
30             you ask? Well it's the opposite of transliteration, which according to the
31             Merriam-Webster:
32              
33             to represent or spell in the characters of another alphabet
34              
35             Traditionally when librarians catalog an item that has a title in a non-Roman
36             script they will follow transliteration rules for converting the title
37             into the Roman alphabet, so that the bibliographic record could be filed
38             into the card catalog or database index appropriately. These Romanization
39             Rules are published by the Library of Congress
40             http://www.loc.gov/catdir/cpso/roman.html.
41              
42             Now that computer screens can display Unicode fairly well it is now
43             desirable to display the original script for library users who are
44             more familiar with the original script. MARC::Detrans provides a framework
45             for detransliterating MARC records so that the orginal script is available
46             MARC-8 encoded in 880 fields. Very esoteric right?
47              
48             =head1 CONFIGURATION
49              
50             MARC::Detrans behavior is controlled by an XML configuration file. An
51             example of this configuration file can be found in the examples directory
52             of the MARC::Detrans distribution. The configuration determines the
53             detransliteration rules that will be used to add 880 fields to existing
54             records. It is hoped that people will contribute their configurations
55             for various languages to the MARC::Detrans project so that they can
56             be distributed with this package. For more information about the
57             configuration file see L.
58              
59             In addition a sample driver program which uses MARC::Detrans has also
60             been included in the examples directory. This script is meant as a
61             jumping off point showing how to use the MARC::Detrans framework.
62              
63             =head1 METHODS
64              
65             =head2 new()
66              
67             The constructor which you should pass the path to your configuration file.
68              
69             my $detrans = MARC::Detrans->new( config => 'config.xml' );
70              
71             =cut
72              
73             sub new {
74 6     6 1 9343 my ($class,%args) = @_;
75 6 100       58 croak( "must supply config parameter" ) if ! exists $args{config};
76 5 100       168 croak( "config file doesn't exist" ) if ! -f $args{config};
77 4         34 my $config = MARC::Detrans::Config->new( $args{config} );
78             ## verify a few things
79 4 50       18 croak( $args{config} . ": missing code attribute in language element" )
80             if ! $config->languageCode();
81 4         83 return _init( $class, $config );
82             }
83              
84             =head2 newFromConfig()
85              
86             If you want to supply your own MARC::Detran::Config object instead of
87             an XML file configuration as in new() you can use newFromConfig().
88             It's unlikely that you'll ever need to use this method.
89              
90             =cut
91              
92             sub newFromConfig {
93 2     2 1 42 my ($class,$config) = @_;
94 2 50 33     28 croak( "must supply MARC::Detrans::Config object" )
95             if ! ref($config) or ! $config->isa( 'MARC::Detrans::Config' );
96 2         10 return _init( $class, $config );
97             }
98              
99             ## helper to initialize an object
100             sub _init {
101 6     6   16 my ($class,$config) = @_;
102 6   33     102 return bless {
103             config => $config,
104             errors => [],
105             tallyAdd880 => 0,
106             tallyDetrans => {},
107             tallyCopy => {},
108             }, ref($class) || $class;
109             }
110              
111             =head2 convert()
112              
113             Pass a MARC::Record into convert() and you will be returned a
114             the same object with portions of it modified according to your
115             configuration file.
116              
117             IMPORTANT: if the record was not modified or an error was encountered
118             you will be returned undef instead of the MARC::Record object. You
119             will want to use the errors() method for diagnosing what happened.
120              
121             =cut
122              
123             sub convert {
124 24     24 1 211027 my ($self,$record) = @_;
125 24 50 33     304 croak( "must pass in MARC::Record object" )
126             if ! ref($record) or ! $record->isa( 'MARC::Record' );
127 24         86 my $config = $self->{config};
128              
129             ## make sure the script isn't already present
130 24 100       93 if ( $self->scriptAlreadyPresent($record) ) {
131 1         5 $self->addError( "target script already present" );
132 1         3 return;
133             }
134              
135             ## check the language of the record
136 23         89 my $f008 = $record->field( '008' );
137 23 50       855 if ( ! $f008 ) {
138 0         0 $self->addError( "can't determine language in record: missing 008" );
139 0         0 return;
140             }
141 23         115 my $lang = substr( $f008->data(), 35, 3 );
142 23 50       410 if ( $lang ne $config->languageCode() ) {
143 0         0 $self->addError( "record is not correct language: $lang instead of ".
144             $config->languageCode() );
145 0         0 return;
146             }
147              
148             ## add 880 fields and return if the record was edited
149 23 100       485 return $record if $self->add880s( $record );
150              
151             ## otherwise return undef since the record was not modified
152 4         12 return;
153             }
154              
155             ## internal helper for adding 880 fields to a record
156             ## will return 1 if the record is modified and 0 if it isn't
157              
158             sub add880s {
159 23     23 0 60 my ($self,$r) = @_;
160 23         44 my $config = $self->{config};
161 23         99 my $rules = $config->rules();
162 23         251 my $names = $config->names();
163 23         348 my $scriptCode = $config->scriptCode();
164 23         259 my $scriptOrientation = $config->scriptOrientation();
165 23         244 my $count = 0;
166 23         38 my $edited = 0;
167              
168             ## see if the record is for a translation
169             ## since we'll need to skip some fields below if it is
170 23         91 my $isTranslation = isTranslation( $r );
171              
172 23         137 foreach my $tag ( $config->detransFields() ) {
173 42         456 FIELD: foreach my $field ( $r->field($tag) ) {
174 39         14353 my @newSubfields = ();
175              
176             ## we don't process parallel titles
177 39 100       119 if ( isParallelTitle($field) ) {
178 3         66 $self->addError( "field=$tag: skipped parallel title" );
179 3         14 next FIELD;
180             }
181              
182             ## we don't process 1XX and 7XX fields the record
183             ### is for a translation
184 36 100 100     377 if ( $isTranslation and $tag =~ /(1|7)\d\d/ ) {
185 3         62 $self->addError( "field=$tag: skipped because of translation" );
186 3         15 next FIELD;
187             }
188            
189             ## if it's a field that might contain a name look it up
190             ## to see if it has a non-standard detransliteration
191 33 100       103 if ( isNameField($tag) ) {
192 15         120 my $nameData = $names->convert( $field );
193 15 50       61 if ( $nameData ) {
194 0         0 $self->{tallyAdd880}++;
195 0         0 $count++;
196 0         0 add880( $r, $count, $field, $nameData, $scriptCode,
197             $scriptOrientation );
198 0         0 $edited = 1;
199 0         0 next FIELD;
200             }
201             }
202              
203 33         134 SUBFIELD: foreach my $subfield ( $field->subfields() ) {
204 39         845 my ($code,$data) = @$subfield;
205 39 100       248 if ($config->needsDetrans(field=>$tag,subfield=>$code)) {
    50          
206 35         175 my $new = $rules->convert( $data );
207 35 100       116 if ( ! defined $new ) {
208 1         7 $self->addError( "field=$tag subfield=$code: " .
209             $rules->error() );
210 1         4 next FIELD;
211             }
212 34         484 $self->{tallyDetrans}{"$tag-$code"}++;
213 34         129 push( @newSubfields, $code, $rules->convert($data) );
214             }
215             elsif ($config->needsCopy(field=>$tag,subfield=>$code)) {
216 4         18 $self->{tallyCopy}{"$tag-$code"}++;
217 4         16 push( @newSubfields, $code, $data);
218             }
219             }
220              
221 32 50       147 if ( @newSubfields ) {
222 32         89 $self->{tallyAdd880}++;
223 32         53 $count++;
224 32         139 add880($r, $count, $field, \@newSubfields, $scriptCode,
225             $scriptOrientation );
226 32         3499 $edited = 1;
227             }
228             }
229              
230             }
231              
232 23 100       855 if ( $edited ) {
233 19         408 $self->add066($r);
234             }
235              
236 23         3335 return $edited;
237             }
238              
239             sub scriptAlreadyPresent {
240 24     24 0 52 my ($self,$record ) = @_;
241 24         49 my $config = $self->{config};
242 24         98 my $f066 = $record->field( '066' );
243 24 100       3066 return 0 if ! $f066;
244 1         7 foreach my $subfield( $f066->subfields() ) {
245 1 50       51 return 1 if grep { $_ eq $subfield->[1] } $config->allEscapeCodes();
  2         14  
246             }
247 0         0 return 0;
248             }
249              
250             sub isNameField {
251 33     33 0 65 my $tag = shift;
252 33         992 return grep /^$tag$/, qw( 100 110 600 700 810 800 );
253             }
254              
255             sub isParallelTitle {
256 39     39 0 61 my $field = shift;
257 39 100       120 return if $field->tag() ne 246;
258 3 100       26 return 1 if $field->indicator(2) =~ /1|5/;
259 1 50       15 return 1 if ( $field->subfields() )[0]->[0] eq 'i';
260 0         0 return;
261             }
262              
263             sub isTranslation {
264 23     23 0 43 my $r = shift;
265 23         72 my $f041 = $r->field( '041' );
266 23 100       3743 return if ! $f041;
267 2 50       10 return if ! $f041->subfield( 'h' );
268 2         52 return 1;
269             }
270              
271             ## private helper function to add a single 880 based on the
272             ## tag and indicators of another field
273              
274             sub add880 {
275 32     32 0 92 my ( $record, $count, $field, $subfields, $scriptCode, $orientation ) = @_;
276 32         173 my $tag = $field->tag();
277 32         3171 my $occurrence = sprintf( '%02d', $count );
278 32         89 my $sub6 = "$tag-$occurrence";
279 32 100       137 $sub6 .= "/$scriptCode" if defined $scriptCode;
280 32 100       101 $sub6 .= "/$orientation" if defined $orientation;
281 32         177 my $f880 = MARC::Field->new(
282             '880',
283             $field->indicator(1),
284             $field->indicator(2),
285             6 => $sub6, ## subfield 6
286             @$subfields ## the reset of the subfields
287             );
288 32         3462 $record->insert_grouped_field( $f880 );
289              
290             ## now add to the original field
291             ## by creating a new field with the subfield 6
292             ## and replacing the old field with it
293 32         7398 my @subfields = map { $_->[0], $_->[1] } $field->subfields();
  38         752  
294 32         129 unshift( @subfields, '6' => "880-$occurrence" );
295 32         117 $field->replace_with(
296             MARC::Field->new(
297             $tag,
298             $field->indicator(1), $field->indicator(2),
299             @subfields
300             )
301             );
302             }
303              
304             ## private helper function for adding a 066 indicating which
305             ## additional character sets were used in this record
306              
307             sub add066 {
308 19     19 0 42 my ($self,$record) = @_;
309 19         58 my $config = $self->{config};
310              
311             ## get a list of all the 066 fields used in this mapping
312             ## techically we should probably only list here the ones
313             ## that are *actually* used in this record...but there's
314             ## probably no harm in listing all of the ones used in this
315             ## configuration.
316 19         35 my @subfields;
317 19         110 foreach ( $config->allEscapeCodes() ) {
318             ## ignore (B
319 36 50       88 next if $_ eq '(B';
320 36         87 push( @subfields, 'c', $_ );
321             }
322              
323 19 100       68 return if @subfields == 0;
324              
325             ## don't obliterate an 066 that's already present
326 18         79 my $f066 = $record->field( '066' );
327 18 50       3017 if ( $f066 ) {
328 0         0 unshift( @subfields, map { $_->[0], $_->[1] } $f066->subfields() );
  0         0  
329 0         0 my $new066 = MARC::Field->new( '066', '', '', @subfields );
330 0         0 $f066->replace_with( $new066 );
331             } else {
332 18         73 $f066 = MARC::Field->new( '066', '', '', @subfields );
333 18         1098 $record->insert_grouped_field( $f066 );
334             }
335             }
336              
337              
338             =head2 errors()
339              
340             Will return the latest errors encountered during a call to convert(). Can
341             be useful for determining why a call to convert() returned undef. A side
342             effect of calling errors() is that the errors storage is reset.
343              
344             =cut
345              
346             sub errors {
347 7     7 1 843 my $self = shift;
348 7         11 my @errors = @{ $self->{errors} };
  7         18  
349 7         17 $self->{errors} = [];
350 7         31 return @errors;
351             }
352              
353             ## this really should just be used internally...hence no POD
354             sub addError {
355 8     8 0 93 my ($self,$msg) = @_;
356 8         15 push( @{ $self->{errors} }, $msg );
  8         69  
357             }
358              
359             =head2 stats880sAdded()
360              
361             Returns the total amount of 880 fields added to records so far by
362             this MARC::Detrans object.
363              
364             =cut
365              
366             sub stats880sAdded {
367 0     0 1   my $self = shift;
368 0           return $self->{tallyAdd880};
369             }
370              
371             =head2 statsDetransliterated()
372              
373             Returns a hash of stats on the field_subfield combinations that
374             have been detransliterated by a MARC::Detrans object.
375              
376             =cut
377              
378             sub statsDetransliterated {
379 0     0 1   return %{ shift->{tallyDetrans} };
  0            
380             }
381              
382             =head2 statsCopied()
383              
384             Returns a hash of stats on the field_subfield combinations that
385             have been copied by a MARC::Detrans object.
386              
387             =cut
388              
389             sub statsCopied {
390 0     0 1   return %{ shift->{tallyCopy} };
  0            
391             }
392              
393             =head1 AUTHORS
394              
395             MARC::Detrans was developed as part of a project funded by the Queens
396             Borough Public Library in New York City under the direction of Jane Jacobs.
397             It is their generosity that allowed this package to be released on CPAN.
398              
399             =over 4
400              
401             =item * Ed Summers
402              
403             =back
404              
405             =cut
406              
407             1;