File Coverage

blib/lib/MARC/File/XML.pm
Criterion Covered Total %
statement 90 174 51.7
branch 11 60 18.3
condition 8 69 11.5
subroutine 18 30 60.0
pod 0 3 0.0
total 127 336 37.8


line stmt bran cond sub pod time code
1             package MARC::File::XML;
2              
3 4     4   2548 use warnings;
  4         6  
  4         145  
4 4     4   19 use strict;
  4         5  
  4         137  
5 4     4   29 use vars qw( $VERSION %_load_args );
  4         5  
  4         257  
6 4     4   19 use base qw( MARC::File );
  4         5  
  4         3300  
7 4     4   9617 use MARC::Record;
  4         31993  
  4         219  
8 4     4   29 use MARC::Field;
  4         6  
  4         77  
9 4     4   2099 use MARC::File::SAX;
  4         12  
  4         146  
10 4     4   34 use XML::SAX qw(Namespaces Validation);
  4         6  
  4         203  
11              
12 4     4   18 use MARC::Charset qw( marc8_to_utf8 utf8_to_marc8 );
  4         5  
  4         142  
13 4     4   2241 use IO::File;
  4         26303  
  4         482  
14 4     4   27 use Carp qw( croak );
  4         6  
  4         133  
15 4     4   17 use Encode ();
  4         6  
  4         6512  
16              
17             $VERSION = '0.88_1';
18              
19             my $handler = MARC::File::SAX->new();
20              
21             my $factory = XML::SAX::ParserFactory->new();
22             $factory->require_feature(Namespaces);
23              
24             my $parser = $factory->parser( Handler => $handler, ProtocolEncoding => 'UTF-8' );
25              
26             sub import {
27 0     0   0 my $class = shift;
28 0         0 %_load_args = @_;
29 0   0     0 $_load_args{ DefaultEncoding } ||= 'UTF-8';
30 0   0     0 $_load_args{ RecordFormat } ||= 'USMARC';
31              
32 0         0 $parser = $factory->parser( Handler => $handler, ProtocolEncoding => $_load_args{DefaultEncoding} );
33             }
34              
35             =head1 NAME
36              
37             MARC::File::XML - Work with MARC data encoded as XML
38              
39             =head1 SYNOPSIS
40              
41             ## Loading with USE options
42             use MARC::File::XML ( BinaryEncoding => 'utf8', RecordFormat => 'UNIMARC' );
43              
44             ## Setting the record format without USE options
45             MARC::File::XML->default_record_format('USMARC');
46            
47             ## reading with MARC::Batch
48             my $batch = MARC::Batch->new( 'XML', $filename );
49             my $record = $batch->next();
50              
51             ## or reading with MARC::File::XML explicitly
52             my $file = MARC::File::XML->in( $filename );
53             my $record = $file->next();
54              
55             ## serialize a single MARC::Record object as XML
56             print $record->as_xml();
57              
58             ## write a bunch of records to a file
59             my $file = MARC::File::XML->out( 'myfile.xml' );
60             $file->write( $record1 );
61             $file->write( $record2 );
62             $file->write( $record3 );
63             $file->close();
64              
65             ## instead of writing to disk, get the xml directly
66             my $xml = join( "\n",
67             MARC::File::XML::header(),
68             MARC::File::XML::record( $record1 ),
69             MARC::File::XML::record( $record2 ),
70             MARC::File::XML::footer()
71             );
72              
73             =head1 DESCRIPTION
74              
75             The MARC-XML distribution is an extension to the MARC-Record distribution for
76             working with MARC21 data that is encoded as XML. The XML encoding used is the
77             MARC21slim schema supplied by the Library of Congress. More information may
78             be obtained here: http://www.loc.gov/standards/marcxml/
79              
80             You must have MARC::Record installed to use MARC::File::XML. In fact
81             once you install the MARC-XML distribution you will most likely not use it
82             directly, but will have an additional file format available to you when you
83             use MARC::Batch.
84              
85             This version of MARC-XML supersedes an the versions ending with 0.25 which
86             were used with the MARC.pm framework. MARC-XML now uses MARC::Record
87             exclusively.
88              
89             If you have any questions or would like to contribute to this module please
90             sign on to the perl4lib list. More information about perl4lib is available
91             at L.
92              
93             =head1 METHODS
94              
95             When you use MARC::File::XML your MARC::Record objects will have two new
96             additional methods available to them:
97              
98             =head2 MARC::File::XML->default_record_format([$format])
99              
100             Sets or returns the default record format used by MARC::File::XML. Valid
101             formats are B, B, B and B.
102              
103             MARC::File::XML->default_record_format('UNIMARC');
104              
105             =cut
106              
107             sub default_record_format {
108 0     0   0 my $self = shift;
109 0         0 my $format = shift;
110              
111 0 0       0 $_load_args{RecordFormat} = $format if ($format);
112              
113 0         0 return $_load_args{RecordFormat};
114             }
115              
116              
117             =head2 as_xml()
118              
119             Returns a MARC::Record object serialized in XML. You can pass an optional format
120             parameter to tell MARC::File::XML what type of record (USMARC, UNIMARC, UNIMARCAUTH) you are
121             serializing.
122              
123             print $record->as_xml([$format]);
124              
125             =cut
126              
127             sub MARC::Record::as_xml {
128 1     1 0 507 my $record = shift;
129 1   33     11 my $format = shift || $_load_args{RecordFormat};
130 1         4 return( MARC::File::XML::encode( $record, $format ) );
131             }
132              
133             =head2 as_xml_record([$format])
134              
135             Returns a MARC::Record object serialized in XML without a collection wrapper.
136             You can pass an optional format parameter to tell MARC::File::XML what type of
137             record (USMARC, UNIMARC, UNIMARCAUTH) you are serializing.
138              
139             print $record->as_xml_record('UNIMARC');
140              
141             =cut
142              
143             sub MARC::Record::as_xml_record {
144 0     0 0 0 my $record = shift;
145 0   0     0 my $format = shift || $_load_args{RecordFormat};
146 0         0 return( MARC::File::XML::encode( $record, $format, 1 ) );
147             }
148              
149             =head2 new_from_xml([$encoding, $format])
150              
151             If you have a chunk of XML and you want a record object for it you can use
152             this method to generate a MARC::Record object. You can pass an optional
153             encoding parameter to specify which encoding (UTF-8 or MARC-8) you would like
154             the resulting record to be in. You can also pass a format parameter to specify
155             the source record type, such as UNIMARC, UNIMARCAUTH, USMARC or MARC21.
156              
157             my $record = MARC::Record->new_from_xml( $xml, $encoding, $format );
158              
159             Note: only works for single record XML chunks.
160              
161             =cut
162              
163             sub MARC::Record::new_from_xml {
164 0     0 0 0 my $xml = shift;
165             ## to allow calling as MARC::Record::new_from_xml()
166             ## or MARC::Record->new_from_xml()
167 0 0 0     0 $xml = shift if ( ref($xml) || ($xml eq "MARC::Record") );
168              
169 0   0     0 my $enc = shift || $_load_args{BinaryEncoding};
170 0   0     0 my $format = shift || $_load_args{RecordFormat};
171 0         0 return( MARC::File::XML::decode( $xml, $enc, $format ) );
172             }
173              
174             =pod
175              
176             If you want to write records as XML to a file you can use out() with write()
177             to serialize more than one record as XML.
178              
179             =head2 out()
180              
181             A constructor for creating a MARC::File::XML object that can write XML to a
182             file. You must pass in the name of a file to write XML to. If the $encoding
183             parameter or the DefaultEncoding (see above) is set to UTF-8 then the binmode
184             of the output file will be set appropriately.
185              
186             my $file = MARC::File::XML->out( $filename [, $encoding] );
187              
188             =cut
189              
190             sub out {
191 0     0   0 my ( $class, $filename, $enc ) = @_;
192 0 0       0 my $fh = IO::File->new( ">$filename" ) or croak( $! );
193 0   0     0 $enc ||= $_load_args{DefaultEncoding};
194              
195 0 0       0 if ($enc =~ /^utf-?8$/oi) {
196 0         0 $fh->binmode(':utf8');
197             } else {
198 0         0 $fh->binmode(':raw');
199             }
200              
201 0         0 my %self = (
202             filename => $filename,
203             fh => $fh,
204             header => 0,
205             encoding => $enc
206             );
207 0   0     0 return( bless \%self, ref( $class ) || $class );
208             }
209              
210             =head2 write()
211              
212             Used in tandem with out() to write records to a file.
213              
214             my $file = MARC::File::XML->out( $filename );
215             $file->write( $record1 );
216             $file->write( $record2 );
217              
218             =cut
219              
220             sub write {
221 0     0   0 my ( $self, $record, $enc ) = @_;
222 0 0       0 if ( ! $self->{ fh } ) {
223 0         0 croak( "MARC::File::XML object not open for writing" );
224             }
225 0 0       0 if ( ! $record ) {
226 0         0 croak( "must pass write() a MARC::Record object" );
227             }
228             ## print the XML header if we haven't already
229 0 0       0 if ( ! $self->{ header } ) {
230 0   0     0 $enc ||= $self->{ encoding } || $_load_args{DefaultEncoding};
      0        
231 0         0 $self->{ fh }->print( header( $enc ) );
232 0         0 $self->{ header } = 1;
233             }
234             ## print out the record
235 0 0       0 $self->{ fh }->print( record( $record ) ) || croak( $! );
236 0         0 return( 1 );
237             }
238              
239             =head2 close()
240              
241             When writing records to disk the filehandle is automatically closed when you
242             the MARC::File::XML object goes out of scope. If you want to close it explicitly
243             use the close() method.
244              
245             =cut
246              
247             sub close {
248 0     0   0 my $self = shift;
249 0 0       0 if ( $self->{ fh } ) {
250 0 0       0 $self->{ fh }->print( footer() ) if $self->{ header };
251 0         0 $self->{ fh } = undef;
252 0         0 $self->{ filename } = undef;
253 0         0 $self->{ header } = undef;
254             }
255 0         0 return( 1 );
256             }
257              
258             ## makes sure that the XML file is closed off
259              
260             sub DESTROY {
261 0     0   0 shift->close();
262             }
263              
264             =pod
265              
266             If you want to generate batches of records as XML, but don't want to write to
267             disk you'll have to use header(), record() and footer() to generate the
268             different portions.
269              
270             $xml = join( "\n",
271             MARC::File::XML::header(),
272             MARC::File::XML::record( $record1 ),
273             MARC::File::XML::record( $record2 ),
274             MARC::File::XML::record( $record3 ),
275             MARC::File::XML::footer()
276             );
277              
278             =head2 header()
279              
280             Returns a string of XML to use as the header to your XML file.
281              
282             =cut
283              
284             sub header {
285 1     1   1 my $enc = shift;
286 1 0 0     3 $enc = shift if ( $enc && (ref($enc) || ($enc eq "MARC::File::XML")) );
      33        
287 1   50     6 $enc ||= 'UTF-8';
288 1         4 return( <
289            
290            
291             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
292             xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
293             xmlns="http://www.loc.gov/MARC21/slim">
294             MARC_XML_HEADER
295             }
296              
297             =head2 footer()
298              
299             Returns a string of XML to use at the end of your XML file.
300              
301             =cut
302              
303             sub footer {
304 1     1   2 return( "" );
305             }
306              
307             =head2 record()
308              
309             Returns a chunk of XML suitable for placement between the header and the footer.
310              
311             =cut
312              
313             sub record {
314 1     1   2 my $record = shift;
315 1         2 my $format = shift;
316 1         2 my $without_header = shift;
317 1         1 my $enc = shift;
318              
319 1   33     7 $format ||= $_load_args{RecordFormat};
320              
321 1         2 my $_transcode = 0;
322 1         6 my $ldr = $record->leader;
323 1         9 my $original_encoding = substr($ldr,9,1);
324              
325             # Does the record think it is already Unicode?
326 1 50 33     49 if ($original_encoding ne 'a' && lc($format) !~ /^unimarc/o) {
327             # If not, we'll make it so
328 1         2 $_transcode++;
329             }
330              
331 1         3 my @xml = ();
332              
333 1 50       4 if ($without_header) {
334 0         0 push @xml, <
335            
336            
337             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
338             xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/ standards/marcxml/schema/MARC21slim.xsd"
339             xmlns="http://www.loc.gov/MARC21/slim">
340             HEADER
341              
342             } else {
343 1         2 push( @xml, "" );
344             }
345              
346 1         5 push( @xml, " " . escape( $record->leader ) . "" );
347              
348 1         7 foreach my $field ( $record->fields() ) {
349 2         14 my $tag = $field->tag();
350 2 100       15 if ( $field->is_control_field() ) {
351 1         8 my $data = $field->data;
352 1 50       23 push( @xml, qq( ) .
353             escape( ($_transcode ? marc8_to_utf8($data) : $data) ). qq() );
354             } else {
355 1         8 my $i1 = $field->indicator( 1 );
356 1         15 my $i2 = $field->indicator( 2 );
357 1         12 push( @xml, qq( ) );
358 1         5 foreach my $subfield ( $field->subfields() ) {
359 1         20 my ( $code, $data ) = @$subfield;
360 1 50       9 push( @xml, qq( ).
361             escape( ($_transcode ? marc8_to_utf8($data) : $data) ).qq() );
362             }
363 1         4 push( @xml, " " );
364             }
365             }
366 1         3 push( @xml, "\n" );
367              
368 1 50       4 if ($_transcode) {
369 1         3 substr($ldr,9,1,$original_encoding);
370 1         4 $record->leader( $ldr );
371             }
372              
373 1         19 return( join( "\n", @xml ) );
374             }
375              
376             my %ESCAPES = (
377             '&' => '&',
378             '<' => '<',
379             '>' => '>',
380             );
381             my $ESCAPE_REGEX =
382             eval 'qr/' .
383             join( '|', map { $_ = "\Q$_\E" } keys %ESCAPES ) .
384             '/;'
385             ;
386              
387             sub escape {
388 6     6   2136 my $string = shift;
389 6 50 33     70 return '' if ! defined $string or $string eq '';
390 6         89 $string =~ s/($ESCAPE_REGEX)/$ESCAPES{$1}/oge;
  69         620  
391 6         54 return( $string );
392             }
393              
394             sub _next {
395 0     0   0 my $self = shift;
396 0         0 my $fh = $self->{ fh };
397              
398             ## return undef at the end of the file
399 0 0       0 return if eof($fh);
400              
401             ## get a chunk of xml for a record
402 0         0 local $/ = '';
403 0         0 my $xml = <$fh>;
404              
405             ## trim stuff before the start record element
406 0         0 $xml =~ s/.*//s;
407              
408             ## return undef if there isn't a good chunk of xml
409 0 0       0 return if ( $xml !~ m|.*|s );
410            
411             ## return the chunk of xml
412 0         0 return( $xml );
413             }
414              
415             =head2 decode()
416              
417             You probably don't ever want to call this method directly. If you do
418             you should pass in a chunk of XML as the argument.
419              
420             It is normally invoked by a call to next(), see L or L.
421              
422             =cut
423              
424             sub decode {
425              
426 0     0   0 my $text;
427 0         0 my $location = '';
428 0         0 my $self = shift;
429              
430             ## see MARC::File::USMARC::decode for explanation of what's going on
431             ## here
432 0 0       0 if ( ref($self) =~ /^MARC::File/ ) {
433 0         0 $location = 'in record '.$self->{recnum};
434 0         0 $text = shift;
435             } else {
436 0         0 $location = 'in record 1';
437 0 0       0 $text = $self=~/MARC::File/ ? shift : $self;
438             }
439              
440 0   0     0 my $enc = shift || $_load_args{BinaryEncoding};
441 0   0     0 my $format = shift || $_load_args{RecordFormat};
442              
443 0         0 $parser->{ tagStack } = [];
444 0         0 $parser->{ subfields } = [];
445 0         0 $parser->{ Handler }{ record } = MARC::Record->new();
446 0         0 $parser->{ Handler }{ toMARC8 } = decideMARC8Binary($format,$enc);
447              
448 0         0 $parser->parse_string( $text );
449              
450 0         0 return( $parser->{ Handler }{ record } );
451            
452             }
453              
454             sub decideMARC8Binary {
455 0     0   0 my $format = shift;
456 0         0 my $enc = shift;
457              
458 0 0 0     0 return 0 if (defined($format) && lc($format) =~ /^unimarc/o);
459 0 0 0     0 return 0 if (defined($enc) && lc($enc) =~ /^utf-?8/o);
460 0         0 return 1;
461             }
462              
463              
464             =head2 encode()
465              
466             You probably want to use the as_xml() method on your MARC::Record object
467             instead of calling this directly. But if you want to you just need to
468             pass in the MARC::Record object you wish to encode as XML, and you will be
469             returned the XML as a scalar.
470              
471             =cut
472              
473             sub encode {
474 1     1   2 my $record = shift;
475 1   33     6 my $format = shift || $_load_args{RecordFormat};
476 1         1 my $without_header = shift;
477 1   33     7 my $enc = shift || $_load_args{DefaultEncoding};
478              
479 1 50       34 if (lc($format) =~ /^unimarc/o) {
480 0         0 $enc = _unimarc_encoding( $format => $record );
481             }
482              
483 1         4 my @xml = ();
484 1 50       6 push( @xml, header( $enc ) ) unless ($without_header);
485 1         4 push( @xml, record( $record, $format, $without_header, $enc ) );
486 1 50       6 push( @xml, footer() ) unless ($without_header);
487              
488 1         8 return( join( "\n", @xml ) );
489             }
490              
491             sub _unimarc_encoding {
492 0     0     my $f = shift;
493 0           my $r = shift;
494              
495 0           my $pos = 26;
496 0 0         $pos = 13 if (lc($f) eq 'unimarcauth');
497              
498 0           my $enc = substr( $r->subfield(100 => 'a'), $pos, 2 );
499              
500 0 0 0       if ($enc eq '01' || $enc eq '03') {
    0          
501 0           return 'ISO-8859-1';
502             } elsif ($enc eq '50') {
503 0           return 'UTF-8';
504             } else {
505 0           die "Unsupported UNIMARC character encoding [$enc] for XML output for $f; 100$a -> " . $r->subfield(100 => 'a');
506             }
507             }
508              
509             =head1 TODO
510              
511             =over 4
512              
513             =item * Support for callback filters in decode().
514              
515             =item * Command line utilities marc2xml, etc.
516              
517             =back
518              
519             =head1 SEE ALSO
520              
521             =over 4
522              
523             =item L
524              
525             =item L
526              
527             =item L
528              
529             =item L
530              
531             =back
532              
533             =head1 AUTHORS
534              
535             =over 4
536              
537             =item * Ed Summers
538              
539             =back
540              
541             =cut
542              
543             1;