File Coverage

blib/lib/MARC/File/USMARC.pm
Criterion Covered Total %
statement 122 131 93.1
branch 37 52 71.1
condition 4 9 44.4
subroutine 17 18 94.4
pod 3 3 100.0
total 183 213 85.9


line stmt bran cond sub pod time code
1             package MARC::File::USMARC;
2              
3             =head1 NAME
4              
5             MARC::File::USMARC - USMARC-specific file handling
6              
7             =cut
8              
9 28     28   330225 use strict;
  28         67  
  28         1199  
10 28     28   167 use warnings;
  28         60  
  28         10885  
11 28     28   3354 use integer;
  28         85  
  28         181  
12              
13 28     28   868 use vars qw( $ERROR );
  28         57  
  28         1539  
14 28     28   15870 use MARC::File::Encode qw( marc_to_utf8 );
  28         388  
  28         2472  
15              
16 28     28   16697 use MARC::File;
  28         80  
  28         883  
17 28     28   189 use vars qw( @ISA ); @ISA = qw( MARC::File );
  28         58  
  28         1767  
18              
19 28     28   13474 use MARC::Record qw( LEADER_LEN );
  28         80  
  28         2848  
20 28     28   249 use MARC::Field;
  28         164  
  28         1266  
21 28     28   195 use constant SUBFIELD_INDICATOR => "\x1F";
  28         53  
  28         2266  
22 28     28   164 use constant END_OF_FIELD => "\x1E";
  28         55  
  28         1479  
23 28     28   159 use constant END_OF_RECORD => "\x1D";
  28         59  
  28         1340  
24 28     28   177 use constant DIRECTORY_ENTRY_LEN => 12;
  28         55  
  28         76525  
25              
26             =head1 SYNOPSIS
27              
28             use MARC::File::USMARC;
29              
30             my $file = MARC::File::USMARC->in( $filename );
31              
32             while ( my $marc = $file->next() ) {
33             # Do something
34             }
35             $file->close();
36             undef $file;
37              
38             =head1 EXPORT
39              
40             None.
41              
42             =head1 METHODS
43              
44             =cut
45              
46             sub _next {
47 176     176   286 my $self = shift;
48 176         567 my $fh = $self->{fh};
49              
50 176         251 my $reclen;
51 176 100       2125 return if eof($fh);
52              
53 164         11974 local $/ = END_OF_RECORD;
54 164         2439 my $usmarc = <$fh>;
55              
56             # remove illegal garbage that sometimes occurs between records
57 164         819 $usmarc =~ s/^[ \x00\x0a\x0d\x1a]+//;
58              
59 164         1814 return $usmarc;
60             }
61              
62             =head2 decode( $string [, \&filter_func ] )
63              
64             Constructor for handling data from a USMARC file. This function takes care of
65             all the tag directory parsing & mangling.
66              
67             Any warnings or coercions can be checked in the C function.
68              
69             The C<$filter_func> is an optional reference to a user-supplied function
70             that determines on a tag-by-tag basis if you want the tag passed to it
71             to be put into the MARC record. The function is passed the tag number
72             and the raw tag data, and must return a boolean. The return of a true
73             value tells MARC::File::USMARC::decode that the tag should get put into
74             the resulting MARC record.
75              
76             For example, if you only want title and subject tags in your MARC record,
77             try this:
78              
79             sub filter {
80             my ($tagno,$tagdata) = @_;
81              
82             return ($tagno == 245) || ($tagno >= 600 && $tagno <= 699);
83             }
84              
85             my $marc = MARC::File::USMARC->decode( $string, \&filter );
86              
87             Why would you want to do such a thing? The big reason is that creating
88             fields is processor-intensive, and if your program is doing read-only
89             data analysis and needs to be as fast as possible, you can save time by
90             not creating fields that you'll be ignoring anyway.
91              
92             Another possible use is if you're only interested in printing certain
93             tags from the record, then you can filter them when you read from disc
94             and not have to delete unwanted tags yourself.
95              
96             =cut
97              
98             sub decode {
99              
100 168     168 1 2905 my $text;
101 168         948 my $location = '';
102              
103             ## decode can be called in a variety of ways
104             ## $object->decode( $string )
105             ## MARC::File::USMARC->decode( $string )
106             ## MARC::File::USMARC::decode( $string )
107             ## this bit of code covers all three
108              
109 168         298 my $self = shift;
110 168 100       935 if ( ref($self) =~ /^MARC::File/ ) {
111 162         601 $location = 'in record '.$self->{recnum};
112 162         439 $text = shift;
113             } else {
114 6         13 $location = 'in record 1';
115 6 100       31 $text = $self=~/MARC::File/ ? shift : $self;
116             }
117 168         407 my $filter_func = shift;
118              
119             # ok this the empty shell we will fill
120 168         1549 my $marc = MARC::Record->new();
121              
122             # Check for an all-numeric record length
123 168 100       1439 ($text =~ /^(\d{5})/)
124             or return $marc->_warn( "Record length \"", substr( $text, 0, 5 ), "\" is not numeric $location" );
125              
126 165         1044 my $reclen = $1;
127 165         955 my $realLength = bytes::length( $text );
128 165 50       34295 $marc->_warn( "Invalid record length $location: Leader says $reclen " .
129             "bytes but it's actually $realLength" ) unless $reclen == $realLength;
130              
131 165 50       1090 (substr($text, -1, 1) eq END_OF_RECORD)
132             or $marc->_warn( "Invalid record terminator $location" );
133              
134 165         994 $marc->leader( substr( $text, 0, LEADER_LEN ) );
135              
136             # bytes 12 - 16 of leader give offset to the body of the record
137 165         1728 my $data_start = 0 + bytes::substr( $text, 12, 5 );
138              
139             # immediately after the leader comes the directory (no separator)
140 165         3477 my $dir = substr( $text, LEADER_LEN, $data_start - LEADER_LEN - 1 ); # -1 to allow for \x1e at end of directory
141              
142             # character after the directory must be \x1e
143 165 100       956 (substr($text, $data_start-1, 1) eq END_OF_FIELD)
144             or $marc->_warn( "No directory found $location" );
145              
146             # all directory entries 12 bytes long, so length % 12 must be 0
147 165 50       833 (length($dir) % DIRECTORY_ENTRY_LEN == 0)
148             or $marc->_warn( "Invalid directory length $location" );
149              
150              
151             # go through all the fields
152 165         537 my $nfields = length($dir)/DIRECTORY_ENTRY_LEN;
153 165         648 for ( my $n = 0; $n < $nfields; $n++ ) {
154 2723         24662 my ( $tagno, $len, $offset ) = unpack( "A3 A4 A5", substr($dir, $n*DIRECTORY_ENTRY_LEN, DIRECTORY_ENTRY_LEN) );
155              
156             # Check directory validity
157 2723 50       12791 ($tagno =~ /^[0-9A-Za-z]{3}$/)
158             or $marc->_warn( "Invalid tag in directory $location: \"$tagno\"" );
159              
160 2723 50       15959 ($len =~ /^\d{4}$/)
161             or $marc->_warn( "Invalid length in directory $location tag $tagno: \"$len\"" );
162              
163 2723 50       8277 ($offset =~ /^\d{5}$/)
164             or $marc->_warn( "Invalid offset in directory $location tag $tagno: \"$offset\"" );
165              
166 2723 50       13472 ($offset + $len <= $reclen)
167             or $marc->_warn( "Directory entry $location runs off the end of the record tag $tagno" );
168              
169 2723         11471 my $tagdata = bytes::substr( $text, $data_start+$offset, $len );
170              
171             # if utf8 the we encode the string as utf8
172 2723 100       39371 if ( $marc->encoding() eq 'UTF-8' ) {
173 1         5 $tagdata = marc_to_utf8( $tagdata );
174             }
175              
176 2723 50       8332 $marc->_warn( "Invalid length in directory for tag $tagno $location" )
177             unless ( $len == bytes::length($tagdata) );
178              
179 2723 50       21816 if ( substr($tagdata, -1, 1) eq END_OF_FIELD ) {
180             # get rid of the end-of-tag character
181 2723         5383 chop $tagdata;
182 2723         5988 --$len;
183             } else {
184 0         0 $marc->_warn( "field does not end in end of field character in tag $tagno $location" );
185             }
186              
187 2723 50       6037 warn "Specs: ", join( "|", $tagno, $len, $offset, $tagdata ), "\n" if $MARC::Record::DEBUG;
188              
189 2723 100       5616 if ( $filter_func ) {
190 167 100       426 next unless $filter_func->( $tagno, $tagdata );
191             }
192              
193 2569 100       14135 if ( MARC::Field->is_controlfield_tag($tagno) ) {
194 559         2080 $marc->append_fields( MARC::Field->new( $tagno, $tagdata ) );
195             } else {
196 2010         13351 my @subfields = split( SUBFIELD_INDICATOR, $tagdata );
197 2010         5651 my $indicators = shift @subfields;
198 2010         3699 my ($ind1, $ind2);
199              
200 2010 50 33     14012 if ( length( $indicators ) > 2 or length( $indicators ) == 0 ) {
201 0         0 $marc->_warn( "Invalid indicators \"$indicators\" forced to blanks $location for tag $tagno\n" );
202 0         0 ($ind1,$ind2) = (" ", " ");
203             } else {
204 2010         5564 $ind1 = substr( $indicators,0, 1 );
205 2010         5564 $ind2 = substr( $indicators,1, 1 );
206             }
207              
208             # Split the subfield data into subfield name and data pairs
209 2010         6332 my @subfield_data;
210 2010         4101 for ( @subfields ) {
211 3341 50       8318 if ( length > 0 ) {
212 3341         17845 push( @subfield_data, substr($_,0,1),substr($_,1) );
213             } else {
214 0         0 $marc->_warn( "Entirely empty subfield found in tag $tagno" );
215             }
216             }
217              
218 2010 50       5289 if ( !@subfield_data ) {
219 0         0 $marc->_warn( "no subfield data found $location for tag $tagno" );
220 0         0 next;
221             }
222              
223 2010         7558 my $field = MARC::Field->new($tagno, $ind1, $ind2, @subfield_data );
224 2010 100       6558 if ( $field->warnings() ) {
225 2         8 $marc->_warn( $field->warnings() );
226             }
227 2010         6165 $marc->append_fields( $field );
228             }
229             } # looping through all the fields
230              
231              
232 165         2617 return $marc;
233             }
234              
235             =head2 update_leader()
236              
237             If any changes get made to the MARC record, the first 5 bytes of the
238             leader (the length) will be invalid. This function updates the
239             leader with the correct length of the record as it would be if
240             written out to a file.
241              
242             =cut
243              
244             sub update_leader {
245 0     0 1 0 my $self = shift;
246              
247 0         0 my (undef,undef,$reclen,$baseaddress) = $self->_build_tag_directory();
248              
249 0         0 $self->_set_leader_lengths( $reclen, $baseaddress );
250             }
251              
252             =head2 _build_tag_directory()
253              
254             Function for internal use only: Builds the tag directory that gets
255             put in front of the data in a MARC record.
256              
257             Returns two array references, and two lengths: The tag directory, and the data fields themselves,
258             the length of all data (including the Leader that we expect will be added),
259             and the size of the Leader and tag directory.
260              
261             =cut
262              
263             sub _build_tag_directory {
264 10     10   17 my $marc = shift;
265 10 50 33     51 $marc = shift if (ref($marc)||$marc) =~ /^MARC::File/;
266 10 50       36 die "Wanted a MARC::Record but got a ", ref($marc) unless ref($marc) eq "MARC::Record";
267              
268 10         13 my @fields;
269             my @directory;
270              
271 10         55 my $dataend = 0;
272 10         41 for my $field ( $marc->fields() ) {
273             # Dump data into proper format
274 44         141 my $str = $field->as_usmarc;
275 44         91 push( @fields, $str );
276              
277             # Create directory entry
278 44         134 my $len = bytes::length( $str );
279              
280 44         5415 my $direntry = sprintf( "%03s%04d%05d", $field->tag, $len, $dataend );
281 44         93 push( @directory, $direntry );
282 44         92 $dataend += $len;
283             }
284              
285 10         31 my $baseaddress =
286             LEADER_LEN + # better be 24
287             ( @directory * DIRECTORY_ENTRY_LEN ) +
288             # all the directory entries
289             1; # end-of-field marker
290              
291              
292 10         23 my $total =
293             $baseaddress + # stuff before first field
294             $dataend + # Length of the fields
295             1; # End-of-record marker
296              
297              
298              
299 10         37 return (\@fields, \@directory, $total, $baseaddress);
300             }
301              
302             =head2 encode()
303              
304             Returns a string of characters suitable for writing out to a USMARC file,
305             including the leader, directory and all the fields.
306              
307             =cut
308              
309             sub encode {
310 10     10 1 394 my $marc = shift;
311 10 100 66     96 $marc = shift if (ref($marc)||$marc) =~ /^MARC::File/;
312              
313 10         32 my ($fields,$directory,$reclen,$baseaddress) = _build_tag_directory($marc);
314 10         47 $marc->set_leader_lengths( $reclen, $baseaddress );
315              
316             # Glomp it all together
317 10         39 return join("",$marc->leader, @$directory, END_OF_FIELD, @$fields, END_OF_RECORD);
318             }
319             1;
320              
321             __END__