File Coverage

blib/lib/MARC/Lint.pm
Criterion Covered Total %
statement 284 305 93.1
branch 150 188 79.7
condition 43 64 67.1
subroutine 21 22 95.4
pod 9 9 100.0
total 507 588 86.2


line stmt bran cond sub pod time code
1             package MARC::Lint;
2              
3 5     5   425697 use strict;
  5         47  
  5         148  
4 5     5   26 use warnings;
  5         9  
  5         130  
5 5     5   23 use integer;
  5         12  
  5         30  
6 5     5   1692 use MARC::Record;
  5         8989  
  5         216  
7 5     5   30 use MARC::Field;
  5         10  
  5         146  
8              
9 5     5   2527 use MARC::Lint::CodeData qw(%GeogAreaCodes %ObsoleteGeogAreaCodes %LanguageCodes %ObsoleteLanguageCodes);
  5         15  
  5         4668  
10              
11             our $VERSION = 1.53;
12              
13             =head1 NAME
14              
15             MARC::Lint - Perl extension for checking validity of MARC records
16              
17             =head1 SYNOPSIS
18              
19             use MARC::File::USMARC;
20             use MARC::Lint;
21              
22             my $lint = new MARC::Lint;
23             my $filename = shift;
24              
25             my $file = MARC::File::USMARC->in( $filename );
26             while ( my $marc = $file->next() ) {
27             $lint->check_record( $marc );
28              
29             # Print the title tag
30             print $marc->title, "\n";
31              
32             # Print the errors that were found
33             print join( "\n", $lint->warnings ), "\n";
34             } # while
35              
36             Given the following MARC record:
37              
38             LDR 00000nam 22002538a 4500
39             040 _aMdSSJTT
40             _cMdSSJTT
41             040 _aMdSSJTT
42             _beng
43             _cMdSSJTT
44             100 14 _aWall, Larry.
45             110 1 _aO'Reilly & Associates.
46             245 90 _aProgramming Perl /
47             _aBig Book of Perl /
48             _cLarry Wall, Tom Christiansen & Jon Orwant.
49             250 _a3rd ed.
50             250 _a3rd ed.
51             260 _aCambridge, Mass. :
52             _bO'Reilly,
53             _r2000.
54             590 4 _aPersonally signed by Larry.
55             856 43 _uhttp://www.perl.com/
56              
57             the following errors are generated:
58              
59             1XX: Only one 1XX tag is allowed, but I found 2 of them.
60             100: Indicator 2 must be blank but it's "4"
61             245: Indicator 1 must be 0 or 1 but it's "9"
62             245: Subfield _a is not repeatable.
63             040: Field is not repeatable.
64             260: Subfield _r is not allowed.
65             856: Indicator 2 must be blank, 0, 1, 2 or 8 but it's "3"
66              
67             =head1 DESCRIPTION
68              
69             Module for checking validity of MARC records. 99% of the users will want to do
70             something like is shown in the synopsis. The other intrepid 1% will overload the
71             C module's methods and provide their own special field-level checking.
72              
73             What this means is that if you have certain requirements, such as making sure that
74             all 952 tags have a certain call number in them, you can write a function that
75             checks for that, and still get all the benefits of the MARC::Lint framework.
76              
77             =head1 EXPORT
78              
79             None. Everything is done through objects.
80              
81             =head1 METHODS
82              
83             =head2 new()
84              
85             No parms needed. The C object is little more than a list of warnings
86             and a bunch of rules.
87              
88             =cut
89              
90             sub new {
91 6     6 1 6580 my $class = shift;
92              
93 6         24 my $self = {
94             _warnings => [],
95             };
96 6         16 bless $self, $class;
97              
98 6         30 $self->_read_rules();
99              
100 6         44 return $self;
101             }
102              
103             =head2 warnings()
104              
105             Returns a list of warnings found by C and its brethren.
106              
107             =cut
108              
109             sub warnings {
110 64     64 1 1114 my $self = shift;
111              
112 64 50       116 return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}};
  64         191  
  0         0  
113             }
114              
115             =head2 clear_warnings()
116              
117             Clear the list of warnings for this linter object. It's automatically called
118             when you call C.
119              
120             =cut
121              
122             sub clear_warnings {
123 64     64 1 180 my $self = shift;
124              
125 64         214 $self->{_warnings} = [];
126             }
127              
128             =head2 warn( $str [, $str...] )
129              
130             Create a warning message, built from strings passed, like a C
131             statement.
132              
133             Typically, you'll leave this to C, but industrious
134             programmers may want to do their own checking as well.
135              
136             =cut
137              
138             sub warn {
139 59     59 1 1681 my $self = shift;
140              
141 59         76 push( @{$self->{_warnings}}, join( "", @_ ) );
  59         197  
142              
143 59         225 return;
144             }
145              
146             =head2 check_record( $marc )
147              
148             Does all sorts of lint-like checks on the MARC record I<$marc>,
149             both on the record as a whole, and on the individual fields &
150             subfields.
151              
152             =cut
153              
154             sub check_record {
155 13     13 1 36395 my $self = shift;
156 13         24 my $marc = shift;
157              
158 13         33 $self->clear_warnings();
159              
160 13 50 33     95 ( (ref $marc) && $marc->isa('MARC::Record') )
161             or return $self->warn( "Must pass a MARC::Record object to check_record" );
162              
163 13         42 my @_1xx = $marc->field( "1.." );
164 13         1384 my $n1xx = scalar @_1xx;
165 13 100       38 if ( $n1xx > 1 ) {
166 1         7 $self->warn( "1XX: Only one 1XX tag is allowed, but I found $n1xx of them." );
167             }
168              
169 13 50       28 if ( not $marc->field( 245 ) ) {
170 0         0 $self->warn( "245: No 245 tag." );
171             }
172              
173              
174 13         972 my %field_seen;
175 13         33 my $rules = $self->{_rules};
176 13         30 for my $field ( $marc->fields ) {
177 208         467 my $tagno = $field->tag;
178              
179 208         848 my $tagrules = '';
180             #if 880 field, inherit rules from tagno in subfield _6
181 208         252 my $is_880 = 0;
182 208 100       354 if ($tagno eq '880') {
183 3         6 $is_880 = 1;
184 3 100       7 if ($field->subfield('6')) {
185 2         48 my $sub6 = $field->subfield('6');
186 2         44 $tagno = substr($sub6, 0, 3);
187              
188 2 50       7 $tagrules = $rules->{$tagno} or next;
189             #880 is repeatable, but its linked field may not be
190 2 50 66     15 if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{'880.'.$tagno} ) {
      66        
191 0         0 $self->warn( "$tagno: Field is not repeatable." );
192             } #if repeatability
193             } #if subfield 6 present
194             else {
195 1         23 $self->warn( "880: No subfield 6." );
196 1         2 next;
197             } #else no subfield 6 in 880 field
198             } #if this is 880 field
199             else {
200 205 100       446 $tagrules = $rules->{$tagno} or next;
201              
202 204 50 66     851 if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{$tagno} ) {
      66        
203 0         0 $self->warn( "$tagno: Field is not repeatable." );
204             } #if repeatability
205             } #else not 880
206              
207 206 100       440 if ( $tagno >= 10 ) {
    50          
208 159         274 for my $ind ( 1..2 ) {
209 318         586 my $indvalue = $field->indicator($ind);
210 318 100       4519 if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) {
211             $self->warn(
212             "$tagno: Indicator $ind must be ",
213 5         43 $tagrules->{"ind$ind" . "_desc"},
214             " but it's \"$indvalue\""
215             );
216             }
217             } # for
218              
219 159         222 my %sub_seen;
220 159         301 for my $subfield ( $field->subfields ) {
221 264         3006 my ($code,$data) = @$subfield;
222              
223 264         393 my $rule = $tagrules->{$code};
224 264 100 66     734 if ( not defined $rule ) {
    100          
225 3         31 $self->warn( "$tagno: Subfield _$code is not allowed." );
226             } elsif ( ($rule eq "NR") && $sub_seen{$code} ) {
227 1         6 $self->warn( "$tagno: Subfield _$code is not repeatable." );
228             }
229              
230 264 50       556 if ( $data =~ /[\t\r\n]/ ) {
231 0         0 $self->warn( "$tagno: Subfield _$code has an invalid control character" );
232             }
233              
234 264         547 ++$sub_seen{$code};
235             } # for $subfields
236             } # if $tagno >= 10
237              
238             elsif ($tagno < 10) {
239             #check for subfield characters
240 47 100       93 if ($field->data() =~ /\x1F/) {
241 1         36 $self->warn( "$tagno: Subfields are not allowed in fields lower than 010" );
242             } #if control field has subfield delimiter
243             } #elsif $tagno < 10
244              
245             # Check to see if a check_xxx() function exists, and call it on the field if it does
246 206         812 my $checker = "check_$tagno";
247 206 100       708 if ( $self->can( $checker ) ) {
248 25         84 $self->$checker( $field );
249             }
250              
251 206 100       3290 if ($is_880) {
252 2         7 ++$field_seen{'880.'.$tagno};
253             } #if 880 field
254             else {
255 204         408 ++$field_seen{$tagno};
256             }
257             } # for my $fields
258              
259 13         54 return;
260             }
261              
262             =head2 check_I( $field )
263              
264             Various functions to check the different fields. If the function doesn't exist,
265             then it doesn't get checked.
266              
267             =head2 check_020()
268              
269             Looks at 020$a and reports errors if the check digit is wrong.
270             Looks at 020$z and validates number if hyphens are present.
271              
272             Uses Business::ISBN to do validation. Thirteen digit checking is currently done
273             with the internal sub _isbn13_check_digit(), based on code from Business::ISBN.
274              
275             TO DO (check_020):
276              
277             Fix 13-digit ISBN checking.
278              
279             =cut
280              
281             sub check_020 {
282              
283              
284 5     5   2869 use Business::ISBN;
  5         257226  
  5         18335  
285              
286 22     22 1 12920 my $self = shift;
287 22         45 my $field = shift;
288              
289             ###################################################
290              
291             # break subfields into code-data array and validate data
292              
293 22         50 my @subfields = $field->subfields();
294              
295 22         394 while (my $subfield = pop(@subfields)) {
296 22         44 my ($code, $data) = @$subfield;
297 22         37 my $isbnno = $data;
298             #remove any hyphens
299 22         62 $isbnno =~ s/\-//g;
300             #remove nondigits
301 22         138 $isbnno =~ s/^\D*(\d{9,12}[X\d])\b.*$/$1/;
302              
303             #report error if this is subfield 'a'
304             #and the first 10 or 13 characters are not a match for $isbnno
305 22 100       61 if ($code eq 'a') {
    50          
306 21 100       58 if ((substr($data,0,length($isbnno)) ne $isbnno)) {
307 2         5 $self->warn( "020: Subfield a may have invalid characters.");
308             } #if first characters don't match
309              
310             #report error if no space precedes a qualifier in subfield a
311 21 100       59 if ($data =~ /\(/) {
312 8 100       36 $self->warn( "020: Subfield a qualifier must be preceded by space, $data.") unless ($data =~ /[X0-9] \(/);
313             } #if data has parenthetical qualifier
314              
315             #report error if unable to find 10-13 digit string of digits in subfield 'a'
316 21 100       68 if (($isbnno !~ /(?:^\d{10}$)|(?:^\d{13}$)|(?:^\d{9}X$)/)) {
317 3         12 $self->warn( "020: Subfield a has the wrong number of digits, $data.");
318             } # if subfield 'a' but not 10 or 13 digit isbn
319             #otherwise, check 10 and 13 digit checksums for validity
320             else {
321 18 100       47 if ((length ($isbnno) == 10)) {
    50          
322              
323 16 50 33     45 if (($Business::ISBN::VERSION gt '2.02_01') || ($Business::ISBN::VERSION gt '2.009')) {
    0          
324 16 100       56 $self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::valid_isbn_checksum($isbnno) != 1);
325             } #if Business::ISBN version higher than 2.02_01 or 2.009
326             elsif ($Business::ISBN::VERSION lt '2') {
327 0 0       0 $self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::is_valid_checksum($isbnno) != 1);
328             } #elsif Business::ISBN version lower than 2
329             else {
330 0         0 $self->warn( "Business::ISBN version must be below 2 or above 2.02_02 or 2.009." );
331             } #else Business::ISBN version between 2 and 2.02_02
332             } #if 10 digit ISBN has invalid check digit
333             # do validation check for 13 digit isbn
334             #########################################
335             ### Not yet fully implemented ###########
336             #########################################
337             elsif (length($isbnno) == 13){
338             #change line below once Business::ISBN handles 13-digit ISBNs
339 2         4 my $is_valid_13 = _isbn13_check_digit($isbnno);
340 2 100       16 $self->warn( "020: Subfield a has bad checksum (13 digit), $data.") unless ($is_valid_13 == 1);
341             } #elsif 13 digit ISBN has invalid check digit
342             ###################################################
343             } #else subfield 'a' has 10 or 13 digits
344             } #if subfield 'a'
345             #look for valid isbn in 020$z
346             elsif ($code eq 'z') {
347 1 50 33     13 if (($data =~ /^ISBN/) || ($data =~ /^\d*\-\d+/)){
348             ##################################################
349             ## Turned on for now--Comment to unimplement ####
350             ##################################################
351 0 0 0     0 $self->warn( "020: Subfield z is numerically valid.") if ((length ($isbnno) == 10) && (Business::ISBN::is_valid_checksum($isbnno) == 1));
352             } #if 10 digit ISBN has invalid check digit
353             } #elsif subfield 'z'
354              
355             } # while @subfields
356              
357             } #check_020
358              
359             =head2 _isbn13_check_digit($ean)
360              
361             Internal sub to determine if 13-digit ISBN has a valid checksum. The code is
362             taken from Business::ISBN::as_ean. It is expected to be temporary until
363             Business::ISBN is updated to check 13-digit ISBNs itself.
364              
365             =cut
366              
367             sub _isbn13_check_digit {
368              
369 2     2   4 my $ean = shift;
370             #remove and store current check digit
371 2         4 my $check_digit = chop($ean);
372              
373             #calculate valid checksum
374 2         4 my $sum = 0;
375 2         4 foreach my $index ( 0, 2, 4, 6, 8, 10 )
376             {
377 12         16 $sum += substr($ean, $index, 1);
378 12         24 $sum += 3 * substr($ean, $index + 1, 1);
379             }
380              
381             #take the next higher multiple of 10 and subtract the sum.
382             #if $sum is 37, the next highest multiple of ten is 40. the
383             #check digit would be 40 - 37 => 3.
384 2         10 my $valid_check_digit = ( 10 * ( int( $sum / 10 ) + 1 ) - $sum ) % 10;
385              
386 2 100       6 return $check_digit == $valid_check_digit ? 1 : 0;
387              
388             } # _isbn13_check_digit
389              
390             #########################################
391              
392             =head2 check_041( $field )
393              
394             Warns if subfields are not evenly divisible by 3 unless second indicator is 7
395             (future implementation would ensure that each subfield is exactly 3 characters
396             unless ind2 is 7--since subfields are now repeatable. This is not implemented
397             here due to the large number of records needing to be corrected.). Validates
398             against the MARC Code List for Languages (L) using the
399             MARC::Lint::CodeData data pack to MARC::Lint (%LanguageCodes,
400             %ObsoleteLanguageCodes).
401              
402             =cut
403              
404             sub check_041 {
405              
406              
407 3     3 1 3506 my $self = shift;
408 3         7 my $field = shift;
409              
410             # break subfields into code-data array (so the entire field is in one array)
411              
412 3         11 my @subfields = $field->subfields();
413 3         66 my @newsubfields = ();
414              
415 3         11 while (my $subfield = pop(@subfields)) {
416 7         14 my ($code, $data) = @$subfield;
417 7         28 unshift (@newsubfields, $code, $data);
418             } # while
419              
420             #warn if length of each subfield is not divisible by 3 unless ind2 is 7
421 3 50       14 unless ($field->indicator(2) eq '7') {
422 3         55 for (my $index = 0; $index <=$#newsubfields; $index+=2) {
423 7 100       23 if (length ($newsubfields[$index+1]) %3 != 0) {
424 3         18 $self->warn( "041: Subfield _$newsubfields[$index] must be evenly divisible by 3 or exactly three characters if ind2 is not 7, ($newsubfields[$index+1])." );
425             } #if field length not divisible evenly by 3
426             ##############################################
427             # validation against code list data
428             ## each subfield has a multiple of 3 chars
429             # need to look at each group of 3 characters
430             else {
431              
432             #break each character of the subfield into an array position
433 4         19 my @codechars = split '', $newsubfields[$index+1];
434              
435 4         8 my $pos = 0;
436             #store each 3 char code in a slot of @codes041
437 4         7 my @codes041 = ();
438 4         23 while ($pos <= $#codechars) {
439 6         22 push @codes041, (join '', @codechars[$pos..$pos+2]);
440 6         16 $pos += 3;
441             }
442              
443              
444 4         8 foreach my $code041 (@codes041) {
445             #see if language code matches valid code
446 6 50       28 my $validlang = $LanguageCodes{$code041} ? 1 : 0;
447             #look for invalid code match if valid code was not matched
448 6 100       15 my $obsoletelang = $ObsoleteLanguageCodes{$code041} ? 1 : 0;
449              
450             # skip valid subfields
451 6 50       25 unless ($validlang) {
452             #report invalid matches as possible obsolete codes
453 6 100       13 if ($obsoletelang) {
454 1         5 $self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1], may be obsolete.");
455             }
456             else {
457 5         49 $self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1] ($code041), is not valid.");
458             } #else code not found
459             } # unless found valid code
460             } #foreach code in 041
461             } # else subfield has multiple of 3 chars
462             ##############################################
463             } # foreach subfield
464             } #unless ind2 is 7
465             } #check_041
466              
467             =head2 check_043( $field )
468              
469             Warns if each subfield a is not exactly 7 characters. Validates each code
470             against the MARC code list for Geographic Areas (L)
471             using the MARC::Lint::CodeData data pack to MARC::Lint (%GeogAreaCodes,
472             %ObsoleteGeogAreaCodes).
473              
474             =cut
475              
476             sub check_043 {
477              
478 2     2 1 2485 my $self = shift;
479 2         5 my $field = shift;
480              
481             # break subfields into code-data array (so the entire field is in one array)
482              
483 2         7 my @subfields = $field->subfields();
484 2         41 my @newsubfields = ();
485              
486 2         8 while (my $subfield = pop(@subfields)) {
487 5         11 my ($code, $data) = @$subfield;
488 5         17 unshift (@newsubfields, $code, $data);
489             } # while
490              
491             #warn if length of subfield a is not exactly 7
492 2         10 for (my $index = 0; $index <=$#newsubfields; $index+=2) {
493 5 100 66     36 if (($newsubfields[$index] eq 'a') && (length ($newsubfields[$index+1]) != 7)) {
    50          
494 2         7 $self->warn( "043: Subfield _a must be exactly 7 characters, $newsubfields[$index+1]" );
495             } # if suba and length is not 7
496             #check against code list for geographic areas.
497             elsif ($newsubfields[$index] eq 'a') {
498              
499             #see if geog area code matches valid code
500 3 50       15 my $validgac = $GeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
501             #look for obsolete code match if valid code was not matched
502 3 100       13 my $obsoletegac = $ObsoleteGeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
503              
504             # skip valid subfields
505 3 50       31 unless ($validgac) {
506             #report invalid matches as possible obsolete codes
507 3 100       11 if ($obsoletegac) {
508 1         5 $self->warn( "043: Subfield _a, $newsubfields[$index+1], may be obsolete.");
509             }
510             else {
511 2         11 $self->warn( "043: Subfield _a, $newsubfields[$index+1], is not valid.");
512             } #else code not found
513             } # unless found valid code
514              
515             } #elsif suba
516             } #foreach subfield
517             } #check_043
518              
519             =head2 check_245( $field )
520              
521             -Makes sure $a exists (and is first subfield).
522             -Warns if last character of field is not a period
523             --Follows LCRI 1.0C, Nov. 2003 rather than MARC21 rule
524             -Verifies that $c is preceded by / (space-/)
525             -Verifies that initials in $c are not spaced
526             -Verifies that $b is preceded by :;= (space-colon, space-semicolon, space-equals)
527             -Verifies that $h is not preceded by space unless it is dash-space
528             -Verifies that data of $h is enclosed in square brackets
529             -Verifies that $n is preceded by . (period)
530             --As part of that, looks for no-space period, or dash-space-period (for replaced elipses)
531             -Verifies that $p is preceded by , (no-space-comma) when following $n and . (period) when following other subfields.
532             -Performs rudimentary article check of 245 2nd indicator vs. 1st word of 245$a (for manual verification).
533              
534             Article checking is done by internal _check_article method, which should work for 130, 240, 245, 440, 630, 730, and 830.
535              
536             =cut
537              
538             sub check_245 {
539              
540 49     49 1 27701 my $self = shift;
541 49         71 my $field = shift;
542              
543             #set tagno for reporting
544 49         76 my $tagno = '245';
545            
546 49 100       146 if ( not $field->subfield( "a" ) ) {
547 1         25 $self->warn( "245: Must have a subfield _a." );
548             }
549              
550             # break subfields into code-data array (so the entire field is in one array)
551              
552 49         1192 my @subfields = $field->subfields();
553 49         896 my @newsubfields = ();
554 49         70 my $has_sub_6 = 0;
555              
556 49         162 while (my $subfield = pop(@subfields)) {
557 90         154 my ($code, $data) = @$subfield;
558             #check for subfield 6 being present
559 90 100       161 $has_sub_6 = 1 if ($code eq '6');
560 90         303 unshift (@newsubfields, $code, $data);
561             } # while
562            
563             # 245 must end in period (may want to make this less restrictive by allowing trailing spaces)
564             #do 2 checks--for final punctuation (MARC21 rule), and for period (LCRI 1.0C, Nov. 2003; LCPS 1.7.1)
565 49 100       301 if ($newsubfields[$#newsubfields] !~ /[.?!]$/) {
    100          
566 1         5 $self->warn ( "245: Must end with . (period).");
567             }
568             elsif($newsubfields[$#newsubfields] =~ /[?!]$/) {
569 2         7 $self->warn ( "245: MARC21 allows ? or ! as final punctuation but LCRI 1.0C, Nov. 2003 (LCPS 1.7.1 for RDA records), requires period.");
570             }
571              
572             ##Check for first subfield
573             #subfield a should be first subfield (or 2nd if subfield '6' is present)
574 49 100       96 if ($has_sub_6) {
575             #make sure there are at least 2 subfields
576 2 50       5 if ($#newsubfields < 3) {
577 0         0 $self->warn ("$tagno: May have too few subfields.");
578             } #if fewer than 2 subfields
579             else {
580 2 50       5 if ($newsubfields[0] ne '6') {
581 0         0 $self->warn ( "$tagno: First subfield must be _6, but it is $newsubfields[0]");
582             } #if 1st subfield not '6'
583 2 50       4 if ($newsubfields[2] ne 'a') {
584 0         0 $self->warn ( "$tagno: First subfield after subfield _6 must be _a, but it is _$newsubfields[2]");
585             } #if 2nd subfield not 'a'
586             } #else at least 2 subfields
587             } #if has subfield 6
588             else {
589             #1st subfield must be 'a'
590 47 100       119 if ($newsubfields[0] ne 'a') {
591 1         5 $self->warn ( "$tagno: First subfield must be _a, but it is _$newsubfields[0]");
592             } #if 2nd subfield not 'a'
593             } #else no subfield _6
594             ##End check for first subfield
595            
596             #subfield c, if present, must be preceded by /
597             #also look for space between initials
598 49 100       134 if ($field->subfield("c")) {
599            
600 14         367 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
601             # 245 subfield c must be preceded by / (space-/)
602 17 100       46 if ($newsubfields[$index] eq 'c') {
603 14 100       65 $self->warn ( "245: Subfield _c must be preceded by /") if ($newsubfields[$index-1] !~ /\s\/$/);
604             # 245 subfield c initials should not have space
605 14 100 66     70 $self->warn ( "245: Subfield _c initials should not have a space.") if (($newsubfields[$index+1] =~ /\b\w\. \b\w\./) && ($newsubfields[$index+1] !~ /\[\bi\.e\. \b\w\..*\]/));
606 14         31 last;
607             } #if
608             } #for
609             } # subfield c exists
610              
611             #each subfield b, if present, should be preceded by :;= (colon, semicolon, or equals sign)
612             ### Are there others? ###
613 49 100       761 if ($field->subfield("b")) {
614              
615             # 245 subfield b should be preceded by space-:;= (colon, semicolon, or equals sign)
616 13         307 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
617             #report error if subfield 'b' is not preceded by space-:;= (colon, semicolon, or equals sign)
618 16 100 100     108 if (($newsubfields[$index] eq 'b') && ($newsubfields[$index-1] !~ / [:;=]$/)) {
619 4         10 $self->warn ( "245: Subfield _b should be preceded by space-colon, space-semicolon, or space-equals sign.");
620             } #if
621             } #for
622             } # subfield b exists
623              
624              
625             #each subfield h, if present, should be preceded by non-space
626 49 100       753 if ($field->subfield("h")) {
627              
628             # 245 subfield h should not be preceded by space
629 4         94 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
630             #report error if subfield 'h' is preceded by space (unless dash-space)
631 6 100 100     59 if (($newsubfields[$index] eq 'h') && ($newsubfields[$index-1] !~ /(\S$)|(\-\- $)/)) {
632 1         6 $self->warn ( "245: Subfield _h should not be preceded by space.");
633             } #if h and not preceded by no-space (unless dash)
634             #report error if subfield 'h' does not start with open square bracket with a matching close bracket
635             ##could have check against list of valid values here
636 6 100 100     40 if (($newsubfields[$index] eq 'h') && ($newsubfields[$index+1] !~ /^\[\w*\s*\w*\]/)) {
637 1         6 $self->warn ( "245: Subfield _h must have matching square brackets, $newsubfields[$index].");
638             }
639             } #for
640             } # subfield h exists
641              
642             #each subfield n, if present, must be preceded by . (period)
643 49 100       935 if ($field->subfield("n")) {
644              
645             # 245 subfield n must be preceded by . (period)
646 4         92 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
647             #report error if subfield 'n' is not preceded by non-space-period or dash-space-period
648 6 100 100     47 if (($newsubfields[$index] eq 'n') && ($newsubfields[$index-1] !~ /(\S\.$)|(\-\- \.$)/)) {
649 1         5 $self->warn ( "245: Subfield _n must be preceded by . (period).");
650             } #if
651             } #for
652             } # subfield n exists
653              
654             #each subfield p, if present, must be preceded by a , (no-space-comma) if it follows subfield n, or by . (no-space-period or dash-space-period) following other subfields
655 49 100       926 if ($field->subfield("p")) {
656              
657             # 245 subfield p must be preceded by . (period) or , (comma)
658 4         90 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
659             #only looking for subfield p
660 6 100       16 if ($newsubfields[$index] eq 'p') {
661             # case for subfield 'n' being field before this one (allows dash-space-comma)
662 4 100 100     51 if (($newsubfields[$index-2] eq 'n') && ($newsubfields[$index-1] !~ /(\S,$)|(\-\- ,$)/)) {
    100 100        
663 1         3 $self->warn ( "245: Subfield _p must be preceded by , (comma) when it follows subfield _n.");
664             } #if subfield n precedes this one
665             # elsif case for subfield before this one is not n
666             elsif (($newsubfields[$index-2] ne 'n') && ($newsubfields[$index-1] !~ /(\S\.$)|(\-\- \.$)/)) {
667 1         5 $self->warn ( "245: Subfield _p must be preceded by . (period) when it follows a subfield other than _n.");
668             } #elsif subfield p preceded by non-period when following a non-subfield 'n'
669             } #if index is looking at subfield p
670             } #for
671             } # subfield p exists
672              
673             ######################################
674             #check for invalid 2nd indicator
675 49         971 $self->_check_article($field);
676              
677             } # check_245
678              
679              
680              
681              
682             ############
683             # Internal #
684             ############
685              
686             =head2 _check_article
687              
688             Check of articles is based on code from Ian Hamilton. This version is more
689             limited in that it focuses on English, Spanish, French, Italian and German
690             articles. Certain possible articles have been removed if they are valid English
691             non-articles. This version also disregards 008_language/041 codes and just uses
692             the list of articles to provide warnings/suggestions.
693              
694             source for articles = L
695              
696             Should work with fields 130, 240, 245, 440, 630, 730, and 830. Reports error if
697             another field is passed in.
698              
699             =cut
700              
701             sub _check_article {
702              
703 49     49   76 my $self = shift;
704 49         81 my $field = shift;
705              
706             #add articles here as needed
707             ##Some omitted due to similarity with valid words (e.g. the German 'die').
708 49         744 my %article = (
709             'a' => 'eng glg hun por',
710             'an' => 'eng',
711             'das' => 'ger',
712             'dem' => 'ger',
713             'der' => 'ger',
714             'ein' => 'ger',
715             'eine' => 'ger',
716             'einem' => 'ger',
717             'einen' => 'ger',
718             'einer' => 'ger',
719             'eines' => 'ger',
720             'el' => 'spa',
721             'en' => 'cat dan nor swe',
722             'gl' => 'ita',
723             'gli' => 'ita',
724             'il' => 'ita mlt',
725             'l' => 'cat fre ita mlt',
726             'la' => 'cat fre ita spa',
727             'las' => 'spa',
728             'le' => 'fre ita',
729             'les' => 'cat fre',
730             'lo' => 'ita spa',
731             'los' => 'spa',
732             'os' => 'por',
733             'the' => 'eng',
734             'um' => 'por',
735             'uma' => 'por',
736             'un' => 'cat spa fre ita',
737             'una' => 'cat spa ita',
738             'une' => 'fre',
739             'uno' => 'ita',
740             );
741              
742             #add exceptions here as needed
743             # may want to make keys lowercase
744 49         531 my %exceptions = (
745             'A & E' => 1,
746             'A & ' => 1,
747             'A-' => 1,
748             'A+' => 1,
749             'A is ' => 1,
750             'A isn\'t ' => 1,
751             'A l\'' => 1,
752             'A la ' => 1,
753             'A posteriori' => 1,
754             'A priori' => 1,
755             'A to ' => 1,
756             'El Nino' => 1,
757             'El Salvador' => 1,
758             'L is ' => 1,
759             'L-' => 1,
760             'La Salle' => 1,
761             'Las Vegas' => 1,
762             'Lo cual' => 1,
763             'Lo mein' => 1,
764             'Lo que' => 1,
765             'Los Alamos' => 1,
766             'Los Angeles' => 1,
767             );
768              
769             #get tagno to determine which indicator to check and for reporting
770 49         126 my $tagno = $field->tag();
771             #retrieve tagno from subfield 6 if 880 field
772 49 100       282 if ($tagno eq '880') {
773 1 50       13 if ($field->subfield('6')) {
774 1         31 my $sub6 = $field->subfield('6');
775 1         23 $tagno = substr($sub6, 0, 3);
776             } #if subfield 6
777             } #if 880 field
778              
779             #$ind holds nonfiling character indicator value
780 49         91 my $ind = '';
781             #$first_or_second holds which indicator is for nonfiling char value
782 49         71 my $first_or_second = '';
783 49 50       344 if ($tagno !~ /^(?:130|240|245|440|630|730|830)$/) {
    50          
    50          
784 0         0 print $tagno, " is not a valid field for article checking\n";
785 0         0 return;
786             } #if field is not one of those checked for articles
787             #130, 630, 730 => ind1
788             elsif ($tagno =~ /^(?:130|630|730)$/) {
789 0         0 $ind = $field->indicator(1);
790 0         0 $first_or_second = '1st';
791             } #if field is 130, 630, or 730
792             #240, 245, 440, 830 => ind2
793             elsif ($tagno =~ /^(?:240|245|440|830)$/) {
794 49         107 $ind = $field->indicator(2);
795 49         553 $first_or_second = '2nd';
796             } #if field is 240, 245, 440, or 830
797              
798              
799             #report non-numeric non-filing indicators as invalid
800 49 50       121 $self->warn ( $tagno, ": Non-filing indicator is non-numeric" ) unless ($ind =~ /^[0-9]$/);
801             #get subfield 'a' of the title field
802 49   100     116 my $title = $field->subfield('a') || '';
803              
804              
805 49         1026 my $char1_notalphanum = 0;
806             #check for apostrophe, quote, bracket, or parenthesis, before first word
807             #remove if found and add to non-word counter
808 49         155 while ($title =~ /^["'\[\(*]/){
809 4         11 $char1_notalphanum++;
810 4         16 $title =~ s/^["'\[\(*]//;
811             }
812             # split title into first word + rest on space, parens, bracket, apostrophe, quote, or hyphen
813 49         238 my ($firstword, $separator, $etc) = $title =~ /^([^ \(\)\[\]'"\-]+)([ \(\)\[\]'"\-])?(.*)/i;
814 49 100       122 $firstword = '' if ! defined( $firstword );
815 49 100       82 $separator = '' if ! defined( $separator );
816 49 100       90 $etc = '' if ! defined( $etc );
817              
818             #get length of first word plus the number of chars removed above plus one for the separator
819 49         83 my $nonfilingchars = length($firstword) + $char1_notalphanum + 1;
820              
821             #check to see if first word is an exception
822 49         59 my $isan_exception = 0;
823 49         229 $isan_exception = grep {$title =~ /^\Q$_\E/i} (keys %exceptions);
  1078         6258  
824              
825             #lowercase chars of $firstword for comparison with article list
826 49         146 $firstword = lc($firstword);
827              
828 49         66 my $isan_article = 0;
829              
830             #see if first word is in the list of articles and not an exception
831 49 100 100     166 $isan_article = 1 if (($article{$firstword}) && !($isan_exception));
832              
833             #if article then $nonfilingchars should match $ind
834 49 100       101 if ($isan_article) {
835             #account for quotes, apostrophes, parens, or brackets before 2nd word
836             # if (($separator eq ' ') && ($etc =~ /^['"]/)) {
837 9 100 66     38 if (($separator) && ($etc =~ /^[ \(\)\[\]'"\-]+/)) {
838 4         11 while ($etc =~ /^[ "'\[\]\(\)*]/){
839 6         9 $nonfilingchars++;
840 6         23 $etc =~ s/^[ "'\[\]\(\)*]//;
841             } #while etc starts with nonfiling chars
842             } #if separator defined and etc starts with nonfiling chars
843             #special case for 'en' (unsure why)
844 9 50       72 if ($firstword eq 'en') {
    100          
845 0 0 0     0 $self->warn ( $tagno, ": First word, , $firstword, may be an article, check $first_or_second indicator ($ind)." ) unless (($ind eq '3') || ($ind eq '0'));
846             }
847             elsif ($nonfilingchars ne $ind) {
848 3         14 $self->warn ( $tagno, ": First word, $firstword, may be an article, check $first_or_second indicator ($ind)." );
849             } #unless ind is same as length of first word and nonfiling characters
850             } #if first word is in article list
851             #not an article so warn if $ind is not 0
852             else {
853 40 100       384 unless ($ind eq '0') {
854 1         7 $self->warn ( $tagno, ": First word, $firstword, does not appear to be an article, check $first_or_second indicator ($ind)." );
855             } #unless ind is 0
856             } #else not in article list
857              
858             #######################################
859              
860             } #_check_article
861              
862              
863             ############
864              
865             =head1 SEE ALSO
866              
867             Check the docs for L. All software links are there.
868              
869             =head1 TODO
870              
871             =over 4
872              
873             =item * Subfield 6
874              
875             For subfield 6, it should always be the 1st subfield according to MARC 21 specifications. Perhaps a generic check should be added that warns if subfield 6 is not the 1st subfield.
876              
877             =item * Subfield 8.
878              
879             This subfield could be the 1st or 2nd subfield, so the code that checks for the 1st few subfields (check_245, check_250) should take that into account.
880              
881             =item * Subfield 9
882              
883             This subfield is not officially allowed in MARC, since it is locally defined. Some way needs to be made to allow messages/warnings about this subfield to be turned off (or otherwise deal with records using/allowing locally defined subfield 9).
884              
885             =item * 008 length and presence check
886              
887             Currently, 008 validation is not implemented in MARC::Lint, but is left to MARC::Errorchecks. It might be useful if MARC::Lint's basic validation checks included a verification that the 008 exists and is exactly 40 characters long. Additional 008-related checking and byte validation would remain in MARC::Errorchecks.
888              
889             =item * ISBN and ISSN checking
890              
891             020 and 022 fields are validated with the C and
892             C modules, respectively. Business::ISBN versions between 2 and
893             2.02_01 are incompatible with MARC::Lint.
894              
895             =item * check_041 cleanup
896              
897             Splitting subfield code strings every 3 chars could probably be written more efficiently.
898              
899             =item * check_245 cleanup
900              
901             The article checking in particular.
902              
903             =item * Method for turning off checks
904              
905             Provide a way for users to skip checks more easily when using check_record, or a
906             specific check_xxx method (e.g. skip article checking).
907              
908             =back
909              
910             =head1 LICENSE
911              
912             This code may be distributed under the same terms as Perl itself.
913              
914             Please note that these modules are not products of or supported by the
915             employers of the various contributors to the code.
916              
917             =cut
918              
919             # Used only to read the stuff from __DATA__
920             sub _read_rules {
921 6     6   14 my $self = shift;
922              
923 6         24 my $tell = tell(DATA); # Stash the position so we can reset it for next time
924              
925 6         31 local $/ = "";
926 6         129 while ( my $tagblock = ) {
927 1464         6942 my @lines = split( /\n/, $tagblock );
928 1464         17880 s/\s+$// for @lines;
929              
930 1464 100       3270 next unless @lines >= 4; # Some of our entries are tag-only
931              
932 1362         2255 my $tagline = shift @lines;
933 1362         4142 my @keyvals = split( /\s+/, $tagline, 3 );
934 1362         2265 my $tagno = shift @keyvals;
935 1362         1870 my $repeatable = shift @keyvals;
936              
937 1362         2875 $self->_parse_tag_rules( $tagno, $repeatable, @lines );
938             } # while
939              
940             # Set the pointer back to where it was, in case we do this again
941 6         97 seek( DATA, $tell, 0 );
942             }
943              
944             sub _parse_tag_rules {
945 1362     1362   1834 my $self = shift;
946 1362         1739 my $tagno = shift;
947 1362         1635 my $repeatable = shift;
948 1362         4234 my @lines = @_;
949              
950 1362   50     7992 my $rules = ($self->{_rules}->{$tagno} ||= {});
951 1362         2861 $rules->{'repeatable'} = $repeatable;
952              
953 1362         2062 for my $line ( @lines ) {
954 17190         50461 my @keyvals = split( /\s+/, $line, 3 );
955 17190         26909 my $key = shift @keyvals;
956 17190         23211 my $val = shift @keyvals;
957              
958             # Do magic for indicators
959 17190 100       28758 if ( $key =~ /^ind/ ) {
960 2724         5017 $rules->{$key} = $val;
961              
962 2724         3535 my $desc;
963             my $regex;
964              
965 2724 100       4256 if ( $val eq "blank" ) {
966 1674         2039 $desc = "blank";
967 1674         5028 $regex = qr/^ $/;
968             } else {
969 1050         1717 $desc = _nice_list($val);
970 1050         2419 $val =~ s/^b/ /;
971 1050         12108 $regex = qr/^[$val]$/;
972             }
973              
974 2724         6720 $rules->{$key."_desc"} = $desc;
975 2724         6086 $rules->{$key."_regex"} = $regex;
976             } # if indicator
977             else {
978 14466 50       19686 if ( $key =~ /(.)-(.)/ ) {
979 0         0 my ($min,$max) = ($1,$2);
980 0         0 $rules->{$_} = $val for ($min..$max);
981             } else {
982 14466         39826 $rules->{$key} = $val;
983             }
984             } # not an indicator
985             } # for $line
986             }
987              
988              
989             sub _nice_list {
990 1050     1050   1355 my $str = shift;
991              
992 1050 100       2471 if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) {
993 60         202 return $str;
994             }
995              
996 990         2380 my @digits = split( //, $str );
997 990 100       1899 $digits[0] = "blank" if $digits[0] eq "b";
998 990         1413 my $last = pop @digits;
999 990         3149 return join( ", ", @digits ) . " or $last";
1000             }
1001              
1002             sub _ind_regex {
1003 0     0     my $str = shift;
1004              
1005 0 0         return qr/^ $/ if $str eq "blank";
1006              
1007 0           return qr/^[$str]$/;
1008             }
1009              
1010              
1011             1;
1012              
1013             __DATA__