File Coverage

blib/lib/MARC/SubjectMap.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             package MARC::SubjectMap;
2              
3 6     6   33664 use strict;
  6         14  
  6         348  
4 6     6   33 use warnings;
  6         11  
  6         202  
5 6     6   28 use Carp qw( croak );
  6         10  
  6         395  
6 6     6   3669 use MARC::Field;
  6         21051  
  6         209  
7 6     6   3816 use MARC::SubjectMap::XML qw( startTag endTag element comment );
  6         17  
  6         499  
8 6     6   3689 use MARC::SubjectMap::Rules;
  0            
  0            
9             use MARC::SubjectMap::Handler;
10             use XML::SAX::ParserFactory;
11             use IO::File;
12              
13             our $VERSION = '0.93';
14              
15             =head1 NAME
16              
17             MARC::SubjectMap - framework for translating subject headings
18              
19             =head1 SYNOPSIS
20              
21             use MARC::SubjectMap;
22             my $map = MARC::SubjectMap->newFromConfig( "config.xml" );
23              
24             my $batch = MARC::Batch->new( 'USMARC', 'batch.dat' );
25             while ( my $record = $batch->next() ) {
26             my $new = $map->translateRecord( $record );
27             ...
28             }
29              
30             =head1 DESCRIPTION
31              
32             MARC::SubjectMap is a framework for providing translations of subject
33             headings. MARC::SubjectMap is essentially a configuration which contains
34             a list of fields/subfields to translate or copy, and a list of rules
35             for translating one field/subfield value into another.
36              
37             Typical usage of the framework will be to use the C
38             command line application to generate a template XML configuration from a
39             batch of MARC records. You tell C the fields you'd like
40             to translate and/or copy and it will look through the records and extract
41             and add rule templates for the unique values. For example:
42              
43             subjmap-template --in=marc.dat --out=config.xml --translate=650ab
44              
45             Once the template configuration has been filled in with translations,
46             the MARC batch file can be run through another command line utility called
47             C which will add new subject headings where possible using
48             the configuration file. If a subject headings can't be translated it will be
49             logged to a file so that the configuration file can be improved if necessary.
50            
51             subjmap --in=marc.dat --out=new.dat --config=config.xml --log=log.txt
52              
53             The idea is that all the configuration is done in the XML file, and the
54             command line programs take care of driving these modules for you. Methods
55             and related modules are listed below for the sake of completeness, and if
56             you want to write your own driving program for some reason.
57              
58             =head1 METHODS
59              
60             =head2 new()
61              
62             The constructor which accepts no arguments.
63              
64             =cut
65              
66             sub new {
67             my ($class) = @_;
68             my $self = {
69             fields => [],
70             sourceLanguage => '',
71             error => '',
72             stats => { recordsProcessed=>0, fieldsAdded=>0, errors=>0 }
73             };
74             return bless $self, ref($class) || $class;
75             }
76              
77             =head2 newFromConfig()
78              
79             Factory method for creating a MARC::SubjectMap object from an XML
80             configuration. If there is an error you will get it on STDERR.
81              
82             my $mapper = MARC::SubjectMap->new( 'config.xml' );
83              
84             =cut
85              
86             sub newFromConfig {
87             my ($package,$file) = @_;
88             my $handler = MARC::SubjectMap::Handler->new();
89             my $parser = XML::SAX::ParserFactory->parser( Handler => $handler );
90             eval { $parser->parse_uri( $file ) };
91             croak( "invalid configuration file: $file: $@" ) if $@;
92             return $handler->config();
93             }
94              
95             =head2 writeConfig()
96              
97             Serializes the configuration to disk as XML.
98              
99             =cut
100              
101             sub writeConfig {
102             my ($self,$file) = @_;
103             my $fh = IO::File->new( ">$file" )
104             or croak( "unable to write to file $file: $! " );
105             $self->toXML($fh);
106             }
107              
108             =head2 addField()
109              
110             Adds a field specification to the configuration. Each specification defines the
111             fields and subfields to look for and copy/translate in MARC data. The
112             information is bundled up in a MARC::SubjectMap::Field object.
113              
114             =cut
115              
116             sub addField {
117             my ($self,$field) = @_;
118             croak( "must supply MARC::SubjectMap::Field object" )
119             if ref($field) ne 'MARC::SubjectMap::Field';
120             push( @{ $self->{fields} }, $field );
121             }
122              
123             =head2 fields()
124              
125             Returns a list of MARC::SubjectMap::Field objects which specify the
126             fields/subfields in MARC data that will be copied and/or translated.
127              
128             =cut
129              
130             sub fields {
131             my ($self) = @_;
132             return @{ $self->{fields} };
133             }
134              
135             =head2 rules()
136              
137             Get/set the rules being used in this configuration. You should pass
138             in a MARC::SubjectMap::Rules object if you are setting the rules.
139              
140             $map->rules( $rules );
141              
142             The reason why a sepearte object is used to hold the Rules as opposed to the
143             fields being contained in the MARC::SubjectMap object is that there can be
144             many (thousands perhaps) of rules -- which need to be stored differently than
145             the handful of fields.
146              
147             =cut
148              
149             sub rules {
150             my ($self,$rules) = @_;
151             croak( "must supply MARC::SubjectMap::Rules object if setting rules" )
152             if $rules and ref($rules) ne 'MARC::SubjectMap::Rules';
153             $self->{rules} = $rules if $rules;
154             return $self->{rules};
155             }
156              
157             =head2 sourceLanguage()
158              
159             Option for specifying the three digit language code to be expected in
160             translation records. If a record is passed is translated that is not of the
161             expected source language then a log message will be generated.
162              
163             =cut
164              
165             sub sourceLanguage {
166             my ($self,$lang) = @_;
167             $self->{sourceLanguage} = $lang if defined $lang;
168             return $self->{sourceLanguage};
169             }
170              
171             =head2 translateRecord()
172              
173             Accepts a MARC::Record object and returns a translated version of it
174             if there were any translations that could be performed. If no translations
175             were possible undef will be returned.
176              
177             =cut
178              
179             sub translateRecord {
180             my ($self,$record) = @_;
181             croak( "must supply MARC::Record object to translateRecord()" )
182             if ! ref($record) or ! $record->isa( 'MARC::Record' );
183              
184             my $record_id = $record->field('001') ? $record->field('001')->data() : '';
185             $record_id =~ s/ +$//;
186              
187             $self->{stats}{recordsProcessed}++;
188              
189             ## log message if the record isn't the expected language
190             if ( language($record) ne $self->sourceLanguage() ) {
191             $self->log( sprintf( "record language=%s instead of %s",
192             language($record), $self->sourceLanguage() ) );
193             }
194              
195             ## create a copy of the record to add to
196             my $clone = $record->clone();
197             my $found = 0;
198              
199             # process each field that we need to look at
200             foreach my $field ( $self->fields() ) {
201              
202             my @marcFields = $record->field( $field->tag() );
203             my $fieldCount = 0;
204              
205             foreach my $marcField ( @marcFields ) {
206             $fieldCount++;
207              
208             # do the translation
209             my $new = $self->translateField( $marcField, $field );
210             my $error = $self->error();
211              
212             if ( $new ) {
213             $clone->insert_grouped_field($new);
214             $self->{stats}{fieldsAdded}++;
215             $found = 1;
216             $self->log("record $record_id: translated \"" .
217             $marcField->as_string() . '" to "' .
218             $new->as_string() . '"') ;
219             }
220             elsif ( $error ) {
221             $self->log("record $record_id: $error");
222             }
223             else {
224             # the field didn't match subfield filters or
225             # it only had copy actions and no translations
226             # so we just continue along
227             }
228             }
229             }
230             return $clone if $found;
231             return;
232             }
233              
234             # you won't want to call this directly so there's no POD for it
235             # warning: subroutine that's longer than your console window alert
236             # TODO: break this up
237              
238             sub translateField {
239             # args are MARC::SubjectMap object, the MARC::Field to translate
240             # and the MARC::SubjectMap::Field object which defines how we translate
241             my ($self,$field,$fieldConfig) = @_;
242             croak( "must supply MARC::Field object to translateField()" )
243             if !ref($field) or !$field->isa('MARC::Field');
244             croak( "must pass in MARC::SubjectMap::Field" )
245             if !ref($fieldConfig) or !$fieldConfig->isa('MARC::SubjectMap::Field');
246              
247             # make sure error flag is undef
248             $self->error( undef );
249              
250             ## subfields with subfield 2 already present are not translated
251             if ($field->subfield(2)) {
252             $self->error( "subfield 2 already present" );
253             return;
254             }
255              
256             ## don't bother translating if it doesn't meet indicator criteria
257             ## no error set here since it really isn't an error just a filter
258             my $indicator1 = $fieldConfig->indicator1();
259             my $indicator2 = $fieldConfig->indicator2();
260             return if defined $indicator1 and $indicator1 ne $field->indicator(1) ;
261             return if defined $indicator2 and $indicator2 ne $field->indicator(2) ;
262              
263             ## these are subfields we can copy wholesale
264             my @copySubfields = $fieldConfig->copy();
265              
266             my (@subfields,%sources,$lastSource,$didTranslation);
267             foreach my $subfield ( $field->subfields() ) {
268             my ($subfieldCode,$subfieldValue) = @$subfield;
269              
270             ## remove trailing period if present
271             $subfieldValue =~ s|\.$||;
272              
273             ## if we just copy this subfield lets do it and move on
274             if ( grep /^$subfieldCode$/, @copySubfields ) {
275             push( @subfields, $subfieldCode, $subfieldValue );
276             next;
277             }
278              
279             ## remove trailing whitespace since all rules have had their
280             ## original , but remember it so we can add it
281             ## back on to the translated subfield
282             my $trailingSpaces = '';
283             if ( $subfieldValue =~ /( +)$/ ) {
284             $trailingSpaces = $1;
285             }
286            
287             ## look up the rule!
288             my $rule = $self->{rules}->getRule(
289             field => $field->tag(),
290             subfield => $subfieldCode,
291             original => $subfieldValue, );
292              
293             ## if we have a matching rule
294             if ( $rule ) {
295             if ( $rule->translation() ) {
296             $didTranslation = 1;
297             push( @subfields, $subfieldCode,
298             $rule->translation() . $trailingSpaces );
299             } else {
300             $self->{stats}{errors}++;
301             $self->error("missing translation for rule: ".$rule->toString);
302             return;
303             }
304            
305             ## if a subfield a store away the source
306             $sources{ $subfieldCode } = $rule->source();
307             $lastSource = $rule->source();
308             }
309              
310             ## uhoh we don't know what to do with this subfield
311             else {
312             $self->{stats}{errors}++;
313             $self->error(
314             sprintf(
315             'could not translate "%s" from %s $%s',
316             $subfieldValue, $field->tag(), $subfieldCode
317             )
318             );
319             return;
320             }
321             }
322              
323             ## if we didn't translate anything no need to make a new field
324             ## note we dont' set an error message since this isn't really an error
325             return if ! $didTranslation;
326              
327             ## if the last subfield doesn't end in a <.> or a <)> add a period
328             $subfields[-1] .= '.' if ( $subfields[-1] !~ /[.)]/ );
329              
330             ## the configuration determines what subfield should have precedence
331             ## in determining the source of the subfield.
332             my $sourceSubfield = $fieldConfig->sourceSubfield();
333             if ( exists $sources{ $sourceSubfield } ) {
334             push( @subfields, '2', $sources{ $sourceSubfield } );
335             } elsif ( defined $lastSource ) {
336             push( @subfields, '2', $lastSource );
337             } else {
338             $self->{stats}{errors}++;
339             $self->log( "missing source for new field: ".join('', @subfields ) );
340             }
341            
342             return MARC::Field->new($field->tag(),$field->indicator(1),7,@subfields);
343             }
344              
345             =head2 stats()
346              
347             Returns a hash of statistics for conversions performed by a MARC::SubjectMap
348             object.
349              
350             =cut
351              
352             sub stats {
353             return %{ shift->{stats} };
354             }
355              
356             =head2 setLog()
357              
358             Set a file to send diagnostic messages to. If unspecified messages will go to
359             STDERR. Alternatively you can pass in a IO::Handle object.
360              
361             =cut
362              
363             ## logging methods
364              
365             sub setLog {
366             my ($self,$f) = @_;
367             if ( ref($f) ) {
368             $self->{log} = $f;
369             } else {
370             $self->{log} = IO::File->new( ">$f" );
371             }
372             }
373              
374             sub log {
375             my ($self,$msg) = @_;
376             $msg .= "\n";
377             if ( $self->{log} ) {
378             $self->{log}->print( $msg );
379             } else {
380             print STDERR $msg;
381             }
382             }
383              
384             # returns entire object as XML
385             # this is essentially the configuration
386             # since it can be big a filehandle must be passed in
387              
388             sub toXML {
389             my ($self,$fh) = @_;
390             print $fh qq(\n);
391             print $fh startTag( "config" ),"\n\n";
392            
393             # language limiter if present
394             my $lang = $self->sourceLanguage() || '';
395             print $fh comment( "the original language" ), "\n";
396             print $fh element( "sourceLanguage", $self->sourceLanguage() ), "\n\n";
397              
398             ## add fields
399             print $fh comment( "the fields and subfields to be processed" )."\n";
400             print $fh startTag( "fields" ), "\n\n";
401             foreach my $field ( $self->fields() ) {
402             print $fh $field->toXML(), "\n";
403             }
404             print $fh endTag( "fields" ), "\n\n";
405              
406             ## add rules
407             if ( $self->rules() ) {
408             $self->rules()->toXML( $fh );
409             }
410              
411             print $fh "\n", endTag( "config" ), "\n";
412             }
413              
414             # helper to extract the language code from the 008
415              
416             sub language {
417             my $r = shift;
418             my $f008 = $r->field('008');
419             return '' if ! $f008;
420             return substr( $f008->data(), 35, 3 );
421             }
422              
423              
424             # helper to store a single error message, not really for public use
425              
426             sub error {
427             my ($self,$msg) = @_;
428             if ( $msg ) { $self->{error} = $msg; }
429             return $self->{error};
430             }
431              
432             sub DESTROY {
433             my $self = shift;
434             ## close log file handle if its open
435             $self->{log}->close() if exists( $self->{log} );
436             }
437              
438             =head1 SEE ALSO
439              
440             =over 4
441              
442             =item * L
443              
444             =item * L
445              
446             =item * L
447              
448             =back
449              
450             =head1 AUTHORS
451              
452             =over 4
453              
454             =item * Ed Summers
455              
456             =back
457              
458             =cut
459              
460             1;