File Coverage

blib/lib/MARC/Lint.pm
Criterion Covered Total %
statement 284 304 93.4
branch 150 188 79.7
condition 41 64 64.0
subroutine 21 22 95.4
pod 9 9 100.0
total 505 587 86.0


line stmt bran cond sub pod time code
1             package MARC::Lint;
2              
3 5     5   2013194 use strict;
  5         9  
  5         149  
4 5     5   27 use warnings;
  5         8  
  5         142  
5 5     5   23 use integer;
  5         15  
  5         29  
6 5     5   2962 use MARC::Record;
  5         9440  
  5         212  
7 5     5   30 use MARC::Field;
  5         8  
  5         142  
8              
9 5     5   183630 use MARC::Lint::CodeData qw(%GeogAreaCodes %ObsoleteGeogAreaCodes %LanguageCodes %ObsoleteLanguageCodes);
  5         13  
  5         5566  
10              
11             our $VERSION = 1.50 ;
12              
13             =head1 NAME
14              
15             MARC::Lint - Perl extension for checking validity of MARC records
16              
17             =head1 SYNOPSIS
18              
19             use MARC::File::USMARC;
20             use MARC::Lint;
21              
22             my $lint = new MARC::Lint;
23             my $filename = shift;
24              
25             my $file = MARC::File::USMARC->in( $filename );
26             while ( my $marc = $file->next() ) {
27             $lint->check_record( $marc );
28              
29             # Print the title tag
30             print $marc->title, "\n";
31              
32             # Print the errors that were found
33             print join( "\n", $lint->warnings ), "\n";
34             } # while
35              
36             Given the following MARC record:
37              
38             LDR 00000nam 22002538a 4500
39             040 _aMdSSJTT
40             _cMdSSJTT
41             040 _aMdSSJTT
42             _beng
43             _cMdSSJTT
44             100 14 _aWall, Larry.
45             110 1 _aO'Reilly & Associates.
46             245 90 _aProgramming Perl /
47             _aBig Book of Perl /
48             _cLarry Wall, Tom Christiansen & Jon Orwant.
49             250 _a3rd ed.
50             250 _a3rd ed.
51             260 _aCambridge, Mass. :
52             _bO'Reilly,
53             _r2000.
54             590 4 _aPersonally signed by Larry.
55             856 43 _uhttp://www.perl.com/
56              
57             the following errors are generated:
58              
59             1XX: Only one 1XX tag is allowed, but I found 2 of them.
60             100: Indicator 2 must be blank but it's "4"
61             245: Indicator 1 must be 0 or 1 but it's "9"
62             245: Subfield _a is not repeatable.
63             040: Field is not repeatable.
64             260: Subfield _r is not allowed.
65             856: Indicator 2 must be blank, 0, 1, 2 or 8 but it's "3"
66              
67             =head1 DESCRIPTION
68              
69             Module for checking validity of MARC records. 99% of the users will want to do
70             something like is shown in the synopsis. The other intrepid 1% will overload the
71             C module's methods and provide their own special field-level checking.
72              
73             What this means is that if you have certain requirements, such as making sure that
74             all 952 tags have a certain call number in them, you can write a function that
75             checks for that, and still get all the benefits of the MARC::Lint framework.
76              
77             =head1 EXPORT
78              
79             None. Everything is done through objects.
80              
81             =head1 METHODS
82              
83             =head2 new()
84              
85             No parms needed. The C object is little more than a list of warnings
86             and a bunch of rules.
87              
88             =cut
89              
90             sub new {
91 6     6 1 5698 my $class = shift;
92              
93 6         24 my $self = {
94             _warnings => [],
95             };
96 6         17 bless $self, $class;
97              
98 6         29 $self->_read_rules();
99              
100 6         34 return $self;
101             }
102              
103             =head2 warnings()
104              
105             Returns a list of warnings found by C and its brethren.
106              
107             =cut
108              
109             sub warnings {
110 64     64 1 949 my $self = shift;
111              
112 64 50       141 return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}};
  64         218  
  0         0  
113             }
114              
115             =head2 clear_warnings()
116              
117             Clear the list of warnings for this linter object. It's automatically called
118             when you call C.
119              
120             =cut
121              
122             sub clear_warnings {
123 64     64 1 198 my $self = shift;
124              
125 64         215 $self->{_warnings} = [];
126             }
127              
128             =head2 warn( $str [, $str...] )
129              
130             Create a warning message, built from strings passed, like a C
131             statement.
132              
133             Typically, you'll leave this to C, but industrious
134             programmers may want to do their own checking as well.
135              
136             =cut
137              
138             sub warn {
139 56     56 1 1262 my $self = shift;
140              
141 56         70 push( @{$self->{_warnings}}, join( "", @_ ) );
  56         189  
142              
143 56         242 return;
144             }
145              
146             =head2 check_record( $marc )
147              
148             Does all sorts of lint-like checks on the MARC record I<$marc>,
149             both on the record as a whole, and on the individual fields &
150             subfields.
151              
152             =cut
153              
154             sub check_record {
155 13     13 1 807132 my $self = shift;
156 13         25 my $marc = shift;
157              
158 13         44 $self->clear_warnings();
159              
160 13 50 33     129 ( (ref $marc) && $marc->isa('MARC::Record') )
161             or return $self->warn( "Must pass a MARC::Record object to check_record" );
162              
163 13         47 my @_1xx = $marc->field( "1.." );
164 13         1455 my $n1xx = scalar @_1xx;
165 13 100       41 if ( $n1xx > 1 ) {
166 1         8 $self->warn( "1XX: Only one 1XX tag is allowed, but I found $n1xx of them." );
167             }
168              
169 13 50       42 if ( not $marc->field( 245 ) ) {
170 0         0 $self->warn( "245: No 245 tag." );
171             }
172              
173              
174 13         988 my %field_seen;
175 13         28 my $rules = $self->{_rules};
176 13         50 for my $field ( $marc->fields ) {
177 206         606 my $tagno = $field->tag;
178              
179 206         843 my $tagrules = '';
180             #if 880 field, inherit rules from tagno in subfield _6
181 206         292 my $is_880 = 0;
182 206 100       426 if ($tagno eq '880') {
183 1         2 $is_880 = 1;
184 1 50       3 if ($field->subfield('6')) {
185 1         21 my $sub6 = $field->subfield('6');
186 1         19 $tagno = substr($sub6, 0, 3);
187              
188 1 50       3 $tagrules = $rules->{$tagno} or next;
189             #880 is repeatable, but its linked field may not be
190 1 50 33     12 if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{'880.'.$tagno} ) {
      33        
191 0         0 $self->warn( "$tagno: Field is not repeatable." );
192             } #if repeatability
193             } #if subfield 6 present
194             else {
195 0         0 $self->warn( "880: No subfield 6." );
196             } #else no subfield 6 in 880 field
197             } #if this is 880 field
198             else {
199 205 100       587 $tagrules = $rules->{$tagno} or next;
200              
201 204 50 66     1266 if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{$tagno} ) {
      66        
202 0         0 $self->warn( "$tagno: Field is not repeatable." );
203             } #if repeatability
204             } #else not 880
205              
206 205 100       527 if ( $tagno >= 10 ) {
    50          
207 158         293 for my $ind ( 1..2 ) {
208 316         882 my $indvalue = $field->indicator($ind);
209 316 100       4426 if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) {
210             $self->warn(
211             "$tagno: Indicator $ind must be ",
212 4         21 $tagrules->{"ind$ind" . "_desc"},
213             " but it's \"$indvalue\""
214             );
215             }
216             } # for
217              
218 158         246 my %sub_seen;
219 158         415 for my $subfield ( $field->subfields ) {
220 262         2915 my ($code,$data) = @$subfield;
221              
222 262         500 my $rule = $tagrules->{$code};
223 262 100 66     1082 if ( not defined $rule ) {
    100          
224 2         9 $self->warn( "$tagno: Subfield _$code is not allowed." );
225             } elsif ( ($rule eq "NR") && $sub_seen{$code} ) {
226 1         5 $self->warn( "$tagno: Subfield _$code is not repeatable." );
227             }
228              
229 262 50       685 if ( $data =~ /[\t\r\n]/ ) {
230 0         0 $self->warn( "$tagno: Subfield _$code has an invalid control character" );
231             }
232              
233 262         766 ++$sub_seen{$code};
234             } # for $subfields
235             } # if $tagno >= 10
236              
237             elsif ($tagno < 10) {
238             #check for subfield characters
239 47 100       173 if ($field->data() =~ /\x1F/) {
240 1         12 $self->warn( "$tagno: Subfields are not allowed in fields lower than 010" );
241             } #if control field has subfield delimiter
242             } #elsif $tagno < 10
243              
244             # Check to see if a check_xxx() function exists, and call it on the field if it does
245 205         847 my $checker = "check_$tagno";
246 205 100       904 if ( $self->can( $checker ) ) {
247 25         75 $self->$checker( $field );
248             }
249              
250 205 100       2959 if ($is_880) {
251 1         4 ++$field_seen{'880.'.$tagno};
252             } #if 880 field
253             else {
254 204         545 ++$field_seen{$tagno};
255             }
256             } # for my $fields
257              
258 13         65 return;
259             }
260              
261             =head2 check_I( $field )
262              
263             Various functions to check the different fields. If the function doesn't exist,
264             then it doesn't get checked.
265              
266             =head2 check_020()
267              
268             Looks at 020$a and reports errors if the check digit is wrong.
269             Looks at 020$z and validates number if hyphens are present.
270              
271             Uses Business::ISBN to do validation. Thirteen digit checking is currently done
272             with the internal sub _isbn13_check_digit(), based on code from Business::ISBN.
273              
274             TO DO (check_020):
275              
276             Fix 13-digit ISBN checking.
277              
278             =cut
279              
280             sub check_020 {
281              
282              
283 5     5   4479 use Business::ISBN;
  5         244205  
  5         18226  
284              
285 22     22 1 9915 my $self = shift;
286 22         34 my $field = shift;
287              
288             ###################################################
289              
290             # break subfields into code-data array and validate data
291              
292 22         57 my @subfields = $field->subfields();
293              
294 22         360 while (my $subfield = pop(@subfields)) {
295 22         42 my ($code, $data) = @$subfield;
296 22         29 my $isbnno = $data;
297             #remove any hyphens
298 22         56 $isbnno =~ s/\-//g;
299             #remove nondigits
300 22         120 $isbnno =~ s/^\D*(\d{9,12}[X\d])\b.*$/$1/;
301              
302             #report error if this is subfield 'a'
303             #and the first 10 or 13 characters are not a match for $isbnno
304 22 100       62 if ($code eq 'a') {
    50          
305 21 100       68 if ((substr($data,0,length($isbnno)) ne $isbnno)) {
306 2         8 $self->warn( "020: Subfield a may have invalid characters.");
307             } #if first characters don't match
308              
309             #report error if no space precedes a qualifier in subfield a
310 21 100       58 if ($data =~ /\(/) {
311 8 100       41 $self->warn( "020: Subfield a qualifier must be preceded by space, $data.") unless ($data =~ /[X0-9] \(/);
312             } #if data has parenthetical qualifier
313              
314             #report error if unable to find 10-13 digit string of digits in subfield 'a'
315 21 100       86 if (($isbnno !~ /(?:^\d{10}$)|(?:^\d{13}$)|(?:^\d{9}X$)/)) {
316 3         16 $self->warn( "020: Subfield a has the wrong number of digits, $data.");
317             } # if subfield 'a' but not 10 or 13 digit isbn
318             #otherwise, check 10 and 13 digit checksums for validity
319             else {
320 18 100       52 if ((length ($isbnno) == 10)) {
    50          
321              
322 16 50 33     101 if (($Business::ISBN::VERSION gt '2.02_01') || ($Business::ISBN::VERSION gt '2.009')) {
    0          
323 16 100       63 $self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::valid_isbn_checksum($isbnno) != 1);
324             } #if Business::ISBN version higher than 2.02_01 or 2.009
325             elsif ($Business::ISBN::VERSION lt '2') {
326 0 0       0 $self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::is_valid_checksum($isbnno) != 1);
327             } #elsif Business::ISBN version lower than 2
328             else {
329 0         0 $self->warn( "Business::ISBN version must be below 2 or above 2.02_02 or 2.009." );
330             } #else Business::ISBN version between 2 and 2.02_02
331             } #if 10 digit ISBN has invalid check digit
332             # do validation check for 13 digit isbn
333             #########################################
334             ### Not yet fully implemented ###########
335             #########################################
336             elsif (length($isbnno) == 13){
337             #change line below once Business::ISBN handles 13-digit ISBNs
338 2         7 my $is_valid_13 = _isbn13_check_digit($isbnno);
339 2 100       14 $self->warn( "020: Subfield a has bad checksum (13 digit), $data.") unless ($is_valid_13 == 1);
340             } #elsif 13 digit ISBN has invalid check digit
341             ###################################################
342             } #else subfield 'a' has 10 or 13 digits
343             } #if subfield 'a'
344             #look for valid isbn in 020$z
345             elsif ($code eq 'z') {
346 1 50 33     13 if (($data =~ /^ISBN/) || ($data =~ /^\d*\-\d+/)){
347             ##################################################
348             ## Turned on for now--Comment to unimplement ####
349             ##################################################
350 0 0 0     0 $self->warn( "020: Subfield z is numerically valid.") if ((length ($isbnno) == 10) && (Business::ISBN::is_valid_checksum($isbnno) == 1));
351             } #if 10 digit ISBN has invalid check digit
352             } #elsif subfield 'z'
353              
354             } # while @subfields
355              
356             } #check_020
357              
358             =head2 _isbn13_check_digit($ean)
359              
360             Internal sub to determine if 13-digit ISBN has a valid checksum. The code is
361             taken from Business::ISBN::as_ean. It is expected to be temporary until
362             Business::ISBN is updated to check 13-digit ISBNs itself.
363              
364             =cut
365              
366             sub _isbn13_check_digit {
367              
368 2     2   3 my $ean = shift;
369             #remove and store current check digit
370 2         6 my $check_digit = chop($ean);
371              
372             #calculate valid checksum
373 2         3 my $sum = 0;
374 2         5 foreach my $index ( 0, 2, 4, 6, 8, 10 )
375             {
376 12         19 $sum += substr($ean, $index, 1);
377 12         17 $sum += 3 * substr($ean, $index + 1, 1);
378             }
379              
380             #take the next higher multiple of 10 and subtract the sum.
381             #if $sum is 37, the next highest multiple of ten is 40. the
382             #check digit would be 40 - 37 => 3.
383 2         6 my $valid_check_digit = ( 10 * ( int( $sum / 10 ) + 1 ) - $sum ) % 10;
384              
385 2 100       7 return $check_digit == $valid_check_digit ? 1 : 0;
386              
387             } # _isbn13_check_digit
388              
389             #########################################
390              
391             =head2 check_041( $field )
392              
393             Warns if subfields are not evenly divisible by 3 unless second indicator is 7
394             (future implementation would ensure that each subfield is exactly 3 characters
395             unless ind2 is 7--since subfields are now repeatable. This is not implemented
396             here due to the large number of records needing to be corrected.). Validates
397             against the MARC Code List for Languages (L) using the
398             MARC::Lint::CodeData data pack to MARC::Lint (%LanguageCodes,
399             %ObsoleteLanguageCodes).
400              
401             =cut
402              
403             sub check_041 {
404              
405              
406 3     3 1 2383 my $self = shift;
407 3         5 my $field = shift;
408              
409             # break subfields into code-data array (so the entire field is in one array)
410              
411 3         11 my @subfields = $field->subfields();
412 3         60 my @newsubfields = ();
413              
414 3         12 while (my $subfield = pop(@subfields)) {
415 7         15 my ($code, $data) = @$subfield;
416 7         29 unshift (@newsubfields, $code, $data);
417             } # while
418              
419             #warn if length of each subfield is not divisible by 3 unless ind2 is 7
420 3 50       11 unless ($field->indicator(2) eq '7') {
421 3         48 for (my $index = 0; $index <=$#newsubfields; $index+=2) {
422 7 100       23 if (length ($newsubfields[$index+1]) %3 != 0) {
423 3         15 $self->warn( "041: Subfield _$newsubfields[$index] must be evenly divisible by 3 or exactly three characters if ind2 is not 7, ($newsubfields[$index+1])." );
424             } #if field length not divisible evenly by 3
425             ##############################################
426             # validation against code list data
427             ## each subfield has a multiple of 3 chars
428             # need to look at each group of 3 characters
429             else {
430              
431             #break each character of the subfield into an array position
432 4         18 my @codechars = split '', $newsubfields[$index+1];
433              
434 4         9 my $pos = 0;
435             #store each 3 char code in a slot of @codes041
436 4         6 my @codes041 = ();
437 4         13 while ($pos <= $#codechars) {
438 6         22 push @codes041, (join '', @codechars[$pos..$pos+2]);
439 6         15 $pos += 3;
440             }
441              
442              
443 4         9 foreach my $code041 (@codes041) {
444             #see if language code matches valid code
445 6 50       18 my $validlang = $LanguageCodes{$code041} ? 1 : 0;
446             #look for invalid code match if valid code was not matched
447 6 100       16 my $obsoletelang = $ObsoleteLanguageCodes{$code041} ? 1 : 0;
448              
449             # skip valid subfields
450 6 50       15 unless ($validlang) {
451             #report invalid matches as possible obsolete codes
452 6 100       14 if ($obsoletelang) {
453 1         13 $self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1], may be obsolete.");
454             }
455             else {
456 5         48 $self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1] ($code041), is not valid.");
457             } #else code not found
458             } # unless found valid code
459             } #foreach code in 041
460             } # else subfield has multiple of 3 chars
461             ##############################################
462             } # foreach subfield
463             } #unless ind2 is 7
464             } #check_041
465              
466             =head2 check_043( $field )
467              
468             Warns if each subfield a is not exactly 7 characters. Validates each code
469             against the MARC code list for Geographic Areas (L)
470             using the MARC::Lint::CodeData data pack to MARC::Lint (%GeogAreaCodes,
471             %ObsoleteGeogAreaCodes).
472              
473             =cut
474              
475             sub check_043 {
476              
477 2     2 1 1725 my $self = shift;
478 2         5 my $field = shift;
479              
480             # break subfields into code-data array (so the entire field is in one array)
481              
482 2         8 my @subfields = $field->subfields();
483 2         36 my @newsubfields = ();
484              
485 2         10 while (my $subfield = pop(@subfields)) {
486 5         10 my ($code, $data) = @$subfield;
487 5         21 unshift (@newsubfields, $code, $data);
488             } # while
489              
490             #warn if length of subfield a is not exactly 7
491 2         10 for (my $index = 0; $index <=$#newsubfields; $index+=2) {
492 5 100 66     65 if (($newsubfields[$index] eq 'a') && (length ($newsubfields[$index+1]) != 7)) {
    50          
493 2         7 $self->warn( "043: Subfield _a must be exactly 7 characters, $newsubfields[$index+1]" );
494             } # if suba and length is not 7
495             #check against code list for geographic areas.
496             elsif ($newsubfields[$index] eq 'a') {
497              
498             #see if geog area code matches valid code
499 3 50       12 my $validgac = $GeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
500             #look for obsolete code match if valid code was not matched
501 3 100       11 my $obsoletegac = $ObsoleteGeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
502              
503             # skip valid subfields
504 3 50       14 unless ($validgac) {
505             #report invalid matches as possible obsolete codes
506 3 100       8 if ($obsoletegac) {
507 1         5 $self->warn( "043: Subfield _a, $newsubfields[$index+1], may be obsolete.");
508             }
509             else {
510 2         9 $self->warn( "043: Subfield _a, $newsubfields[$index+1], is not valid.");
511             } #else code not found
512             } # unless found valid code
513              
514             } #elsif suba
515             } #foreach subfield
516             } #check_043
517              
518             =head2 check_245( $field )
519              
520             -Makes sure $a exists (and is first subfield).
521             -Warns if last character of field is not a period
522             --Follows LCRI 1.0C, Nov. 2003 rather than MARC21 rule
523             -Verifies that $c is preceded by / (space-/)
524             -Verifies that initials in $c are not spaced
525             -Verifies that $b is preceded by :;= (space-colon, space-semicolon, space-equals)
526             -Verifies that $h is not preceded by space unless it is dash-space
527             -Verifies that data of $h is enclosed in square brackets
528             -Verifies that $n is preceded by . (period)
529             --As part of that, looks for no-space period, or dash-space-period (for replaced elipses)
530             -Verifies that $p is preceded by , (no-space-comma) when following $n and . (period) when following other subfields.
531             -Performs rudimentary article check of 245 2nd indicator vs. 1st word of 245$a (for manual verification).
532              
533             Article checking is done by internal _check_article method, which should work for 130, 240, 245, 440, 630, 730, and 830.
534              
535             =cut
536              
537             sub check_245 {
538              
539 49     49 1 24512 my $self = shift;
540 49         68 my $field = shift;
541              
542             #set tagno for reporting
543 49         76 my $tagno = '245';
544            
545 49 100       144 if ( not $field->subfield( "a" ) ) {
546 1         24 $self->warn( "245: Must have a subfield _a." );
547             }
548              
549             # break subfields into code-data array (so the entire field is in one array)
550              
551 49         1068 my @subfields = $field->subfields();
552 49         833 my @newsubfields = ();
553 49         73 my $has_sub_6 = 0;
554              
555 49         147 while (my $subfield = pop(@subfields)) {
556 90         146 my ($code, $data) = @$subfield;
557             #check for subfield 6 being present
558 90 100       198 $has_sub_6 = 1 if ($code eq '6');
559 90         412 unshift (@newsubfields, $code, $data);
560             } # while
561            
562             # 245 must end in period (may want to make this less restrictive by allowing trailing spaces)
563             #do 2 checks--for final punctuation (MARC21 rule), and for period (LCRI 1.0C, Nov. 2003; LCPS 1.7.1)
564 49 100       293 if ($newsubfields[$#newsubfields] !~ /[.?!]$/) {
    100          
565 1         5 $self->warn ( "245: Must end with . (period).");
566             }
567             elsif($newsubfields[$#newsubfields] =~ /[?!]$/) {
568 2         7 $self->warn ( "245: MARC21 allows ? or ! as final punctuation but LCRI 1.0C, Nov. 2003 (LCPS 1.7.1 for RDA records), requires period.");
569             }
570              
571             ##Check for first subfield
572             #subfield a should be first subfield (or 2nd if subfield '6' is present)
573 49 100       111 if ($has_sub_6) {
574             #make sure there are at least 2 subfields
575 2 50       5 if ($#newsubfields < 3) {
576 0         0 $self->warn ("$tagno: May have too few subfields.");
577             } #if fewer than 2 subfields
578             else {
579 2 50       6 if ($newsubfields[0] ne '6') {
580 0         0 $self->warn ( "$tagno: First subfield must be _6, but it is $newsubfields[0]");
581             } #if 1st subfield not '6'
582 2 50       6 if ($newsubfields[2] ne 'a') {
583 0         0 $self->warn ( "$tagno: First subfield after subfield _6 must be _a, but it is _$newsubfields[2]");
584             } #if 2nd subfield not 'a'
585             } #else at least 2 subfields
586             } #if has subfield 6
587             else {
588             #1st subfield must be 'a'
589 47 100       122 if ($newsubfields[0] ne 'a') {
590 1         5 $self->warn ( "$tagno: First subfield must be _a, but it is _$newsubfields[0]");
591             } #if 2nd subfield not 'a'
592             } #else no subfield _6
593             ##End check for first subfield
594            
595             #subfield c, if present, must be preceded by /
596             #also look for space between initials
597 49 100       146 if ($field->subfield("c")) {
598            
599 14         325 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
600             # 245 subfield c must be preceded by / (space-/)
601 17 100       57 if ($newsubfields[$index] eq 'c') {
602 14 100       73 $self->warn ( "245: Subfield _c must be preceded by /") if ($newsubfields[$index-1] !~ /\s\/$/);
603             # 245 subfield c initials should not have space
604 14 100 66     73 $self->warn ( "245: Subfield _c initials should not have a space.") if (($newsubfields[$index+1] =~ /\b\w\. \b\w\./) && ($newsubfields[$index+1] !~ /\[\bi\.e\. \b\w\..*\]/));
605 14         30 last;
606             } #if
607             } #for
608             } # subfield c exists
609              
610             #each subfield b, if present, should be preceded by :;= (colon, semicolon, or equals sign)
611             ### Are there others? ###
612 49 100       730 if ($field->subfield("b")) {
613              
614             # 245 subfield b should be preceded by space-:;= (colon, semicolon, or equals sign)
615 13         287 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
616             #report error if subfield 'b' is not preceded by space-:;= (colon, semicolon, or equals sign)
617 16 100 100     137 if (($newsubfields[$index] eq 'b') && ($newsubfields[$index-1] !~ / [:;=]$/)) {
618 4         14 $self->warn ( "245: Subfield _b should be preceded by space-colon, space-semicolon, or space-equals sign.");
619             } #if
620             } #for
621             } # subfield b exists
622              
623              
624             #each subfield h, if present, should be preceded by non-space
625 49 100       709 if ($field->subfield("h")) {
626              
627             # 245 subfield h should not be preceded by space
628 4         95 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
629             #report error if subfield 'h' is preceded by space (unless dash-space)
630 6 100 100     65 if (($newsubfields[$index] eq 'h') && ($newsubfields[$index-1] !~ /(\S$)|(\-\- $)/)) {
631 1         4 $self->warn ( "245: Subfield _h should not be preceded by space.");
632             } #if h and not preceded by no-space (unless dash)
633             #report error if subfield 'h' does not start with open square bracket with a matching close bracket
634             ##could have check against list of valid values here
635 6 100 100     58 if (($newsubfields[$index] eq 'h') && ($newsubfields[$index+1] !~ /^\[\w*\s*\w*\]/)) {
636 1         8 $self->warn ( "245: Subfield _h must have matching square brackets, $newsubfields[$index].");
637             }
638             } #for
639             } # subfield h exists
640              
641             #each subfield n, if present, must be preceded by . (period)
642 49 100       863 if ($field->subfield("n")) {
643              
644             # 245 subfield n must be preceded by . (period)
645 4         82 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
646             #report error if subfield 'n' is not preceded by non-space-period or dash-space-period
647 6 100 100     53 if (($newsubfields[$index] eq 'n') && ($newsubfields[$index-1] !~ /(\S\.$)|(\-\- \.$)/)) {
648 1         4 $self->warn ( "245: Subfield _n must be preceded by . (period).");
649             } #if
650             } #for
651             } # subfield n exists
652              
653             #each subfield p, if present, must be preceded by a , (no-space-comma) if it follows subfield n, or by . (no-space-period or dash-space-period) following other subfields
654 49 100       844 if ($field->subfield("p")) {
655              
656             # 245 subfield p must be preceded by . (period) or , (comma)
657 4         82 for (my $index = 2; $index <=$#newsubfields; $index+=2) {
658             #only looking for subfield p
659 6 100       59 if ($newsubfields[$index] eq 'p') {
660             # case for subfield 'n' being field before this one (allows dash-space-comma)
661 4 100 100     49 if (($newsubfields[$index-2] eq 'n') && ($newsubfields[$index-1] !~ /(\S,$)|(\-\- ,$)/)) {
    100 100        
662 1         4 $self->warn ( "245: Subfield _p must be preceded by , (comma) when it follows subfield _n.");
663             } #if subfield n precedes this one
664             # elsif case for subfield before this one is not n
665             elsif (($newsubfields[$index-2] ne 'n') && ($newsubfields[$index-1] !~ /(\S\.$)|(\-\- \.$)/)) {
666 1         3 $self->warn ( "245: Subfield _p must be preceded by . (period) when it follows a subfield other than _n.");
667             } #elsif subfield p preceded by non-period when following a non-subfield 'n'
668             } #if index is looking at subfield p
669             } #for
670             } # subfield p exists
671              
672             ######################################
673             #check for invalid 2nd indicator
674 49         823 $self->_check_article($field);
675              
676             } # check_245
677              
678              
679              
680              
681             ############
682             # Internal #
683             ############
684              
685             =head2 _check_article
686              
687             Check of articles is based on code from Ian Hamilton. This version is more
688             limited in that it focuses on English, Spanish, French, Italian and German
689             articles. Certain possible articles have been removed if they are valid English
690             non-articles. This version also disregards 008_language/041 codes and just uses
691             the list of articles to provide warnings/suggestions.
692              
693             source for articles = L
694              
695             Should work with fields 130, 240, 245, 440, 630, 730, and 830. Reports error if
696             another field is passed in.
697              
698             =cut
699              
700             sub _check_article {
701              
702 49     49   78 my $self = shift;
703 49         119 my $field = shift;
704              
705             #add articles here as needed
706             ##Some omitted due to similarity with valid words (e.g. the German 'die').
707 49         868 my %article = (
708             'a' => 'eng glg hun por',
709             'an' => 'eng',
710             'das' => 'ger',
711             'dem' => 'ger',
712             'der' => 'ger',
713             'ein' => 'ger',
714             'eine' => 'ger',
715             'einem' => 'ger',
716             'einen' => 'ger',
717             'einer' => 'ger',
718             'eines' => 'ger',
719             'el' => 'spa',
720             'en' => 'cat dan nor swe',
721             'gl' => 'ita',
722             'gli' => 'ita',
723             'il' => 'ita mlt',
724             'l' => 'cat fre ita mlt',
725             'la' => 'cat fre ita spa',
726             'las' => 'spa',
727             'le' => 'fre ita',
728             'les' => 'cat fre',
729             'lo' => 'ita spa',
730             'los' => 'spa',
731             'os' => 'por',
732             'the' => 'eng',
733             'um' => 'por',
734             'uma' => 'por',
735             'un' => 'cat spa fre ita',
736             'una' => 'cat spa ita',
737             'une' => 'fre',
738             'uno' => 'ita',
739             );
740              
741             #add exceptions here as needed
742             # may want to make keys lowercase
743 49         565 my %exceptions = (
744             'A & E' => 1,
745             'A & ' => 1,
746             'A-' => 1,
747             'A+' => 1,
748             'A is ' => 1,
749             'A isn\'t ' => 1,
750             'A l\'' => 1,
751             'A la ' => 1,
752             'A posteriori' => 1,
753             'A priori' => 1,
754             'A to ' => 1,
755             'El Nino' => 1,
756             'El Salvador' => 1,
757             'L is ' => 1,
758             'L-' => 1,
759             'La Salle' => 1,
760             'Las Vegas' => 1,
761             'Lo cual' => 1,
762             'Lo mein' => 1,
763             'Lo que' => 1,
764             'Los Alamos' => 1,
765             'Los Angeles' => 1,
766             );
767              
768             #get tagno to determine which indicator to check and for reporting
769 49         137 my $tagno = $field->tag();
770             #retrieve tagno from subfield 6 if 880 field
771 49 100       270 if ($tagno eq '880') {
772 1 50       3 if ($field->subfield('6')) {
773 1         19 my $sub6 = $field->subfield('6');
774 1         18 $tagno = substr($sub6, 0, 3);
775             } #if subfield 6
776             } #if 880 field
777              
778             #$ind holds nonfiling character indicator value
779 49         79 my $ind = '';
780             #$first_or_second holds which indicator is for nonfiling char value
781 49         72 my $first_or_second = '';
782 49 50       395 if ($tagno !~ /^(?:130|240|245|440|630|730|830)$/) {
    50          
    50          
783 0         0 print $tagno, " is not a valid field for article checking\n";
784 0         0 return;
785             } #if field is not one of those checked for articles
786             #130, 630, 730 => ind1
787             elsif ($tagno =~ /^(?:130|630|730)$/) {
788 0         0 $ind = $field->indicator(1);
789 0         0 $first_or_second = '1st';
790             } #if field is 130, 630, or 730
791             #240, 245, 440, 830 => ind2
792             elsif ($tagno =~ /^(?:240|245|440|830)$/) {
793 49         144 $ind = $field->indicator(2);
794 49         502 $first_or_second = '2nd';
795             } #if field is 240, 245, 440, or 830
796              
797              
798             #report non-numeric non-filing indicators as invalid
799 49 50       158 $self->warn ( $tagno, ": Non-filing indicator is non-numeric" ) unless ($ind =~ /^[0-9]$/);
800             #get subfield 'a' of the title field
801 49   100     130 my $title = $field->subfield('a') || '';
802              
803              
804 49         928 my $char1_notalphanum = 0;
805             #check for apostrophe, quote, bracket, or parenthesis, before first word
806             #remove if found and add to non-word counter
807 49         172 while ($title =~ /^["'\[\(*]/){
808 4         5 $char1_notalphanum++;
809 4         18 $title =~ s/^["'\[\(*]//;
810             }
811             # split title into first word + rest on space, parens, bracket, apostrophe, quote, or hyphen
812 49         240 my ($firstword, $separator, $etc) = $title =~ /^([^ \(\)\[\]'"\-]+)([ \(\)\[\]'"\-])?(.*)/i;
813 49 100       124 $firstword = '' if ! defined( $firstword );
814 49 100       109 $separator = '' if ! defined( $separator );
815 49 100       107 $etc = '' if ! defined( $etc );
816              
817             #get length of first word plus the number of chars removed above plus one for the separator
818 49         83 my $nonfilingchars = length($firstword) + $char1_notalphanum + 1;
819              
820             #check to see if first word is an exception
821 49         101 my $isan_exception = 0;
822 49         243 $isan_exception = grep {$title =~ /^\Q$_\E/i} (keys %exceptions);
  1078         7953  
823              
824             #lowercase chars of $firstword for comparison with article list
825 49         153 $firstword = lc($firstword);
826              
827 49         64 my $isan_article = 0;
828              
829             #see if first word is in the list of articles and not an exception
830 49 100 100     206 $isan_article = 1 if (($article{$firstword}) && !($isan_exception));
831              
832             #if article then $nonfilingchars should match $ind
833 49 100       102 if ($isan_article) {
834             #account for quotes, apostrophes, parens, or brackets before 2nd word
835             # if (($separator eq ' ') && ($etc =~ /^['"]/)) {
836 9 100 66     47 if (($separator) && ($etc =~ /^[ \(\)\[\]'"\-]+/)) {
837 4         18 while ($etc =~ /^[ "'\[\]\(\)*]/){
838 6         7 $nonfilingchars++;
839 6         28 $etc =~ s/^[ "'\[\]\(\)*]//;
840             } #while etc starts with nonfiling chars
841             } #if separator defined and etc starts with nonfiling chars
842             #special case for 'en' (unsure why)
843 9 50       88 if ($firstword eq 'en') {
    100          
844 0 0 0     0 $self->warn ( $tagno, ": First word, , $firstword, may be an article, check $first_or_second indicator ($ind)." ) unless (($ind eq '3') || ($ind eq '0'));
845             }
846             elsif ($nonfilingchars ne $ind) {
847 3         13 $self->warn ( $tagno, ": First word, $firstword, may be an article, check $first_or_second indicator ($ind)." );
848             } #unless ind is same as length of first word and nonfiling characters
849             } #if first word is in article list
850             #not an article so warn if $ind is not 0
851             else {
852 40 100       542 unless ($ind eq '0') {
853 1         6 $self->warn ( $tagno, ": First word, $firstword, does not appear to be an article, check $first_or_second indicator ($ind)." );
854             } #unless ind is 0
855             } #else not in article list
856              
857             #######################################
858              
859             } #_check_article
860              
861              
862             ############
863              
864             =head1 SEE ALSO
865              
866             Check the docs for L. All software links are there.
867              
868             =head1 TODO
869              
870             =over 4
871              
872             =item * Subfield 6
873              
874             For subfield 6, it should always be the 1st subfield according to MARC 21 specifications. Perhaps a generic check should be added that warns if subfield 6 is not the 1st subfield.
875              
876             =item * Subfield 8.
877              
878             This subfield could be the 1st or 2nd subfield, so the code that checks for the 1st few subfields (check_245, check_250) should take that into account.
879              
880             =item * Subfield 9
881              
882             This subfield is not officially allowed in MARC, since it is locally defined. Some way needs to be made to allow messages/warnings about this subfield to be turned off (or otherwise deal with records using/allowing locally defined subfield 9).
883              
884             =item * 008 length and presence check
885              
886             Currently, 008 validation is not implemented in MARC::Lint, but is left to MARC::Errorchecks. It might be useful if MARC::Lint's basic validation checks included a verification that the 008 exists and is exactly 40 characters long. Additional 008-related checking and byte validation would remain in MARC::Errorchecks.
887              
888             =item * ISBN and ISSN checking
889              
890             020 and 022 fields are validated with the C and
891             C modules, respectively. Business::ISBN versions between 2 and
892             2.02_01 are incompatible with MARC::Lint.
893              
894             =item * check_041 cleanup
895              
896             Splitting subfield code strings every 3 chars could probably be written more efficiently.
897              
898             =item * check_245 cleanup
899              
900             The article checking in particular.
901              
902             =item * Method for turning off checks
903              
904             Provide a way for users to skip checks more easily when using check_record, or a
905             specific check_xxx method (e.g. skip article checking).
906              
907             =back
908              
909             =head1 LICENSE
910              
911             This code may be distributed under the same terms as Perl itself.
912              
913             Please note that these modules are not products of or supported by the
914             employers of the various contributors to the code.
915              
916             =cut
917              
918             # Used only to read the stuff from __DATA__
919             sub _read_rules {
920 6     6   14 my $self = shift;
921              
922 6         38 my $tell = tell(DATA); # Stash the position so we can reset it for next time
923              
924 6         26 local $/ = "";
925 6         87 while ( my $tagblock = ) {
926 1428         7121 my @lines = split( /\n/, $tagblock );
927 1428         36566 s/\s+$// for @lines;
928              
929 1428 100       3391 next unless @lines >= 4; # Some of our entries are tag-only
930              
931 1326         1972 my $tagline = shift @lines;
932 1326         4493 my @keyvals = split( /\s+/, $tagline, 3 );
933 1326         1874 my $tagno = shift @keyvals;
934 1326         1905 my $repeatable = shift @keyvals;
935              
936 1326         3232 $self->_parse_tag_rules( $tagno, $repeatable, @lines );
937             } # while
938              
939             # Set the pointer back to where it was, in case we do this again
940 6         54 seek( DATA, $tell, 0 );
941             }
942              
943             sub _parse_tag_rules {
944 1326     1326   1811 my $self = shift;
945 1326         1760 my $tagno = shift;
946 1326         1743 my $repeatable = shift;
947 1326         4045 my @lines = @_;
948              
949 1326   50     6795 my $rules = ($self->{_rules}->{$tagno} ||= {});
950 1326         2780 $rules->{'repeatable'} = $repeatable;
951              
952 1326         2121 for my $line ( @lines ) {
953 15816         48805 my @keyvals = split( /\s+/, $line, 3 );
954 15816         22185 my $key = shift @keyvals;
955 15816         21935 my $val = shift @keyvals;
956              
957             # Do magic for indicators
958 15816 100       30593 if ( $key =~ /^ind/ ) {
959 2652         5335 $rules->{$key} = $val;
960              
961 2652         3404 my $desc;
962             my $regex;
963              
964 2652 100       4859 if ( $val eq "blank" ) {
965 1620         1944 $desc = "blank";
966 1620         5282 $regex = qr/^ $/;
967             } else {
968 1032         1786 $desc = _nice_list($val);
969 1032         2123 $val =~ s/^b/ /;
970 1032         10205 $regex = qr/^[$val]$/;
971             }
972              
973 2652         6092 $rules->{$key."_desc"} = $desc;
974 2652         7486 $rules->{$key."_regex"} = $regex;
975             } # if indicator
976             else {
977 13164 100       20903 if ( $key =~ /(.)-(.)/ ) {
978 18         48 my ($min,$max) = ($1,$2);
979 18         367 $rules->{$_} = $val for ($min..$max);
980             } else {
981 13146         43364 $rules->{$key} = $val;
982             }
983             } # not an indicator
984             } # for $line
985             }
986              
987              
988             sub _nice_list {
989 1032     1032   1511 my $str = shift;
990              
991 1032 100       2504 if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) {
992 66         188 return $str;
993             }
994              
995 966         2214 my @digits = split( //, $str );
996 966 100       2141 $digits[0] = "blank" if $digits[0] eq "b";
997 966         1329 my $last = pop @digits;
998 966         3304 return join( ", ", @digits ) . " or $last";
999             }
1000              
1001             sub _ind_regex {
1002 0     0     my $str = shift;
1003              
1004 0 0         return qr/^ $/ if $str eq "blank";
1005              
1006 0           return qr/^[$str]$/;
1007             }
1008              
1009              
1010             1;
1011              
1012             __DATA__