File Coverage

blib/lib/Convert/MRC.pm
Criterion Covered Total %
statement 57 457 12.4
branch 6 238 2.5
condition 0 91 0.0
subroutine 16 31 51.6
pod 6 6 100.0
total 85 823 10.3


line stmt bran cond sub pod time code
1             #
2             # This file is part of Convert-MRC
3             #
4             # This software is copyright (c) 2013 by Alan K. Melby.
5             #
6             # This is free software; you can redistribute it and/or modify it under
7             # the same terms as the Perl 5 programming language system itself.
8             #
9             # MRC to TBX converter
10             # written June-Nov 2008 by Nathan E. Rasmussen
11             # Modified 2013 by Nathan G. Glenn
12              
13             # Example input data follows:
14              
15             # TEST DATA HERE
16              
17             package Convert::MRC;
18 5     5   445899 use strict;
  5         17  
  5         179  
19 5     5   28 use warnings;
  5         8  
  5         134  
20 5     5   2422 use Data::Dumper;
  5         6796  
  5         386  
21 5     5   32 use Carp;
  5         9  
  5         320  
22 5     5   12265 use English qw(-no_match_vars);
  5         31921  
  5         42  
23              
24 5     5   7358 use Log::Message::Simple qw (:STD);
  5         391518  
  5         1117  
25              
26             #import global constants used in processing
27 5     5   4030 use Convert::MRC::Variables;
  5         17  
  5         1527  
28              
29             # ABSTRACT: CONVERT MRC TO TBX-BASIC
30             our $VERSION = '4.03'; # VERSION
31              
32 5     5   6633 use open ':encoding(utf8)', ':std'; # incoming/outgoing data will be UTF-8
  5         8819  
  5         38  
33              
34             our @origARGV = @ARGV;
35             local @ARGV = (q{-}) unless @ARGV; # if no filenames given, take std input
36              
37             #use batch() if called as a script
38             __PACKAGE__->new->batch(@ARGV) unless caller;
39              
40             #allows us to get some kind of version string during development, when $VERSION is undefined
41             #($VERSION is inserted by a dzil plugin at build time)
42             sub _version {
43             ## no critic (ProhibitNoStrict)
44 5     5   131448 no strict 'vars';
  5         17  
  5         7446  
45 0   0 0   0 return $VERSION || q{??};
46             }
47              
48              
49             sub new {
50 5     5 1 40348 my ($class) = @_;
51 5         28 my $self = bless {}, $class;
52 5         64 $self->_init;
53 5         20 return $self;
54             }
55              
56             sub _init {
57 5     5   14 my ($self) = @_;
58 5         36 $self->input_fh( \*STDIN );
59 5         35 $self->tbx_fh( \*STDOUT );
60 5         26 $self->log_fh( \*STDERR );
61 5         9 return;
62             }
63              
64              
65             sub tbx_fh {
66             ## no critic (RequireBriefOpen)
67 5     5 1 15 my ( $application, $fh ) = @_;
68 5 50       23 if ($fh) {
69 5 50       25 if ( ref($fh) eq 'GLOB' ) {
70 5         15 $application->{tbx_fh} = $fh;
71             }
72             else {
73 0 0       0 open my $fh2, '>', $fh or die "Couldn't open $fh";
74 0         0 $application->{tbx_fh} = $fh2;
75             }
76             }
77 5         16 return $application->{tbx_fh};
78             }
79              
80              
81             sub log_fh {
82             ## no critic (RequireBriefOpen)
83 5     5 1 18 my ( $application, $fh ) = @_;
84 5 50       93 if ($fh) {
85 5 50       25 if ( ref($fh) eq 'GLOB' ) {
86 5         17 $application->{log_fh} = $fh;
87             }
88             else {
89 0 0       0 open my $fh2, '>', $fh or die "Couldn't open $fh";
90 0         0 $application->{log_fh} = $fh2;
91             }
92             }
93 5         15 return $application->{log_fh};
94             }
95              
96             #same thing as Log::Message::Simple::error, but verbose is always off.
97             sub _error {
98 0     0   0 my ($msg) = @_;
99 0         0 error $msg, 0;
100 0         0 return;
101             }
102              
103             #prints the given message to the current log file handle.
104             sub _log {
105 0     0   0 my ( $self, $message ) = @_;
106 0         0 print { $self->{log_fh} } $message;
  0         0  
107 0         0 return;
108             }
109              
110              
111             sub input_fh {
112             ## no critic (RequireBriefOpen)
113 5     5 1 13 my ( $application, $fh ) = @_;
114 5 50       41 if ($fh) {
115 5 50       46 if ( ref($fh) eq 'GLOB' ) {
    0          
116 5         43 $application->{input_fh} = $fh;
117             }
118             #emulate diamond operator
119             elsif ($fh eq q{-}){
120 0         0 $application->{input_fh} = \*STDIN;
121             }
122             else {
123 0 0       0 open my $fh2, '<', $fh or die "Couldn't open $fh";
124 0         0 $application->{input_fh} = $fh2;
125             }
126             }
127 5         17 return $application->{input_fh};
128             }
129              
130              
131             sub batch {
132 0     0 1   my ( $self, @mrc_files ) = @_;
133             ## no critic (ProhibitOneArgSelect)
134 0           for my $mrc (@mrc_files) {
135              
136             # find an appropriate name for output and warning files
137 0           my $suffix = _get_suffix($mrc);
138              
139             #set output, error and input files
140 0           my $outTBX = "$mrc$suffix.tbx";
141 0           my $outWarn = "$mrc$suffix.log";
142              
143             # print STDERR "See $outTBX and $outWarn for output.\n";
144 0           $self->input_fh($mrc);
145 0           $self->log_fh($outWarn);
146 0           $self->tbx_fh($outTBX);
147              
148             #convert the input file, sending output to appropriate file handles
149 0           $self->convert;
150              
151             # close these so that they are written.
152 0           close $self->log_fh();
153 0           close $self->tbx_fh();
154              
155             # close input too, since it's been exhausted.
156 0           close $self->input_fh();
157              
158             # print STDERR "Finished processing $mrc.\n";
159             }
160 0           return;
161             }
162              
163             # return a file suffix to ensure nothing is overwritten
164             sub _get_suffix {
165 0     0     my ($file_name) = @_;
166 0           my $suffix = q{};
167 0   0       $suffix--
168             while ( -e "$file_name$suffix.tbx" or -e "$file_name$suffix.log" );
169 0           return $suffix;
170             }
171              
172              
173             sub convert {
174             ## no critic (ProhibitOneArgSelect)
175 0     0 1   my ($self) = @_;
176 0           my $select = select $self->{tbx_fh};
177              
178             # informative header for the log file
179 0           my $version = _version();
180 0           msg("MRC2TBX converter version $version");
181              
182             #if called as a script, output this
183             # if ( not caller ) {
184             # msg( "Called with "
185             # . scalar @origARGV
186             # . " argument"
187             # . ( @origARGV == 1 ? '' : 's' ) . ":\n\t"
188             # . ( join "\t", @origARGV ) );
189             # }
190              
191             # set up per-file status flags
192 0           my %header; # contains the header information
193 0           my $segment = 'header'; # header, body, back
194              
195             # what's open; need to be accessible in all methods
196 0           $self->{concept} = undef;
197 0           $self->{langSet} = undef;
198 0           $self->{term} = undef;
199 0           $self->{party} = undef;
200 0           $self->{langSetDefined} = 0;
201              
202             #array containing all rows for an ID
203 0           $self->{unsortedTerm} = undef;
204              
205 0           my @party; # collect all rows for a responsible party
206             my %responsible; # accumulate parties by type
207 0           my ( @idsUsed, @linksMade ); # track these
208 0           my $started = 0; # flag for MRCTermTable line (start-of-processing)
209 0           my $aborted = 0; # flag for early end-of-processing
210             # process the file
211 0           while ( readline( $self->{input_fh} ) ) {
212              
213             # eliminate a (totally superfluous) byte-order mark
214 0 0         s/^(?:\xef\xbb\xbf|\x{feff})// if $INPUT_LINE_NUMBER == 1;
215              
216             #check for =MRCtermTable at the beginning of the file to begin processing
217 0 0         if (/^=MRCtermTable/i) { # start processing
218 0           $started = 1;
219 0           next;
220             }
221 0 0         next unless $started;
222              
223 0 0         next if (/^\s*$/); # if it's only whitespace
224 0           my $row;
225 0 0         next unless $row = $self->_parseRow($_);
226              
227             # (if the row won't parse, _parseRow() returns undef)
228              
229             # if in header, A row?
230              
231             # print STDOUT $segment;
232             # print STDOUT Dumper $row;
233             # A-row: build header
234 0 0 0       if ( $segment eq 'header' && $row->{'ID'} eq 'A' ) {
235 0 0         $self->_buildHeader( $self->_parseRow($_), \%header )
236             or _error "Could not interpret header line $INPUT_LINE_NUMBER, skipped.";
237             }
238              
239             # not A-row: print header, segment = body
240 0 0 0       if ( $segment eq 'header' && $row->{'ID'} ne 'A' ) {
241              
242             # better have enough for a header!
243 0 0         unless ( $self->_printHeader( \%header ) ) {
244 0           _error
245             "TBX header could not be completed because a required A-row is missing or malformed.";
246 0           $aborted = 1;
247 0           last;
248             }
249 0           $segment = 'body';
250             }
251              
252             # if in body, C row?
253              
254             # C-row: lots to do
255 0 0 0       if ( $segment eq 'body' && exists $row->{'Concept'} ) {
256              
257             # catch a misordered-input problem
258              
259             # The next 3 if tests are one action in principle.
260             # Each depends on the preceding, and all depend on the
261             # closeX() subs being no-ops if it's already closed,
262             # and on the fact that nothing follows terms in langSet
263             # or follows langSet in termEntry. Meddle not, blah blah.
264              
265             {
266             ## no critic (ProhibitNoWarnings)
267 5     5   47 no warnings 'uninitialized';
  5         10  
  5         37092  
  0            
268             ## use critic
269              
270             # concept, langSet, term might be undef
271             # if new concept, close old and open new
272 0 0         if ( $row->{'Concept'} ne $self->{concept} ) {
273 0           $self->_closeTerm();
274 0           $self->_closeLangSet();
275 0           $self->_closeConcept();
276              
277             # open concept
278 0           $self->{concept} = $row->{'Concept'};
279 0           print '\n";
280              
281             # (not row ID, which may go further)
282 0           push @idsUsed, 'C' . $self->{concept};
283             }
284              
285             # if new langSet ...
286 0 0 0       if ( exists $row->{'LangSet'}
287             && $row->{'LangSet'} ne $self->{langSet} )
288             {
289 0           $self->_closeTerm();
290 0           $self->_closeLangSet();
291              
292             # open langSet
293 0           $self->{langSet} = $row->{'LangSet'};
294 0           print '\n";
295             }
296              
297             # if new term ...
298 0 0 0       if ( exists $row->{'Term'}
299             && $row->{'Term'} ne $self->{term} )
300             {
301 0           $self->_closeTerm();
302              
303             # open term
304 0           $self->{term} = $row->{'Term'};
305 0           undef $self->{unsortedTerm}; # redundant
306 0           push @idsUsed,
307             'C' . $self->{concept} . $self->{langSet} . $self->{term};
308             }
309             } # resume warnings on uninitialized values
310              
311             # verify legal insertion
312 0           my $level; # determine where we are from row ID
313 0 0         if ( defined $row->{'Term'} ) {
    0          
    0          
314 0           $level = 'Term';
315             }
316             elsif ( defined $row->{'LangSet'} ) {
317 0 0         if ( defined $self->{term} ) {
318 0           _error
319             "LangSet-level row out of order in line $INPUT_LINE_NUMBER, skipped.";
320 0           next;
321             }
322 0           $level = 'LangSet';
323             }
324             elsif ( defined $row->{'Concept'} ) {
325 0 0         if ( defined $self->{langSet} ) {
326 0           _error
327             "Concept-level row out of order in line $INPUT_LINE_NUMBER, skipped.";
328 0           next;
329             }
330 0           $level = 'Concept';
331             }
332             else {
333             #this should never happen; missing level is found when reading the row in
334 0           croak "Can't find level in row $INPUT_LINE_NUMBER, stopped";
335             }
336              
337             # (can't happen)
338              
339             # is the datcat allowed at the level of the ID?
340 0 0         unless ( $legalIn{$level}{ $row->{'DatCat'} } ) {
341 0           _error
342             "Data category '$row->{'DatCat'}' not allowed at the $level level in line $INPUT_LINE_NUMBER, skipped.";
343 0           next;
344             }
345              
346             # set langSetDefined if definition (legal only at langSet level)
347 0 0         $self->{langSetDefined} = 1 if ( $row->{'DatCat'} eq 'definition' );
348              
349             # bookkeeping: record links made
350 0 0         push @linksMade, $row->{'Link'}->{'Value'}
351             if ( defined $row->{'Link'} );
352              
353             # print item, or push into pre-tig list, depending
354 0 0         if ( $level eq 'Term' ) {
355 0           push @{ $self->{unsortedTerm} }, $row;
  0            
356             }
357             else {
358 0           $self->_printRow($row);
359             }
360              
361             } # end if (in body, reading C-row)
362              
363             # not C-row: close any structures, segment = back
364 0 0 0       if ( $segment eq 'body' && !exists $row->{'Concept'} ) {
365 0           $self->_closeTerm();
366 0           $self->_closeLangSet();
367 0           $self->_closeConcept();
368 0           print "\n";
369 0           $segment = 'back';
370 0           print "\n";
371             }
372              
373             # if in back, R row?
374             # R-row: separate parties, verify legality, stack it up
375 0 0 0       if ( $segment eq 'back' && exists $row->{'Party'} ) {
376              
377             # have we changed parties?
378 0 0 0       if ( defined $self->{party} && $row->{'Party'} ne $self->{party} ) {
379              
380             # change parties
381 0           my $type;
382              
383             # what kind of party is the old one?
384 0           my $topRow = shift @party;
385 0 0         if ( $topRow->{'DatCat'} eq 'type' ) {
386 0           $type = $topRow->{'Value'};
387             }
388             else {
389 0           unshift @party, $topRow;
390 0           $type = 'unknown';
391             }
392              
393             # file its info under its type and clean it out
394 0           push @{ $responsible{$type} }, [@party];
  0            
395 0           undef @party;
396             }
397              
398             # no? OK, add it to the current party.
399 0           $self->{party} = $row->{'Party'}; # the party don't stop!
400             # article says the first row must be type, but we can sort:
401 0 0         if ( $row->{'DatCat'} eq 'type' ) {
402 0           unshift @party, $row;
403             }
404             else {
405 0           push @party, $row;
406             }
407             } # end if (in back and reading R-row)
408              
409             # not R-row: warn file is misordered, last line
410             # this code only runs if the A C R order is broken
411 0 0 0       if ( $segment eq 'back' && !exists $row->{'Party'} ) {
412 0           _error
413             "Don't know what to do with line $INPUT_LINE_NUMBER, processing stopped. The rows in your file are not in proper A C R order.";
414 0           $aborted = 1;
415 0           last;
416             }
417              
418             } # end while (<$self->input_fh>)
419              
420             # finish up
421              
422             # if in body, close structures, body
423 0 0         if ( $segment eq 'body' ) {
424 0           $self->_closeTerm();
425 0           $self->_closeLangSet();
426 0           $self->_closeConcept();
427 0           print "\n";
428             }
429              
430             # if in back, sort and print parties, close back
431 0 0         if ( $segment eq 'back' ) {
432              
433             # file the last party under its type
434 0           my $type;
435 0           my $topRow = shift @party;
436 0 0         if ( $topRow->{'DatCat'} eq 'type' ) {
437 0           $type = $topRow->{'Value'};
438             }
439             else {
440 0           unshift @party, $topRow;
441 0           $type = 'unknown';
442             }
443 0           push @{ $responsible{$type} }, [@party];
  0            
444              
445             # print a refObjectList for each type of party,
446             # within which each arrayref gets noted and _printRow()ed.
447 0 0         if ( exists $responsible{'person'} ) {
448 0           print "\n";
449 0           push @idsUsed, $_->[0]->{'ID'} foreach @{ $responsible{'person'} };
  0            
450 0           $self->_printRow($_) foreach @{ $responsible{'person'} };
  0            
451 0           print "\n";
452             }
453 0 0         if ( exists $responsible{'organization'} ) {
454 0           print "\n";
455 0           push @idsUsed, $_->[0]->{'ID'}
456 0           foreach @{ $responsible{'organization'} };
457 0           $self->_printRow($_) foreach @{ $responsible{'organization'} };
  0            
458 0           print "\n";
459             }
460 0 0         if ( exists $responsible{'unknown'} ) {
461 0           _error
462             "At least one of your responsible parties has no type (person, organization, etc.) and has been provisionally printed as a respParty. To conform to TBX-Basic, you must list each party as either a person or an organization.";
463 0           print "\n";
464 0           push @idsUsed, $_->[0]->{'ID'} foreach @{ $responsible{'unknown'} };
  0            
465 0           $self->_printRow($_) foreach @{ $responsible{'unknown'} };
  0            
466 0           print "\n";
467             }
468 0           print "\n";
469             }
470              
471             # closing formalities
472 0 0         if ( not $started ) {
473 0           my $err =
474             "The input MRC is missing a line beginning with =MRCTermTable. You must include such a line to switch on the TBX converter -- all preceding material is ignored.";
475              
476 0           carp $err;
477 0           _error $err;
478              
479 0           $self->_finish_processing($select);
480 0           return;
481             }
482              
483             #in case the file was header only
484 0 0 0       if ( $segment eq 'header' and not $aborted ) {
485              
486             #check and print header
487 0 0         unless ( $self->_printHeader( \%header ) ) {
488 0           _error
489             "TBX header could not be completed because a required A-row is missing or malformed.";
490 0           $aborted = 1;
491             }
492              
493             #alert user to lack of content
494 0           _error('The file contained no concepts or parties.');
495              
496             #close the opened, and empty, body element
497 0           print "\n";
498             }
499              
500 0 0         if ($aborted) {
501 0           carp "See log -- processing could not be completed.\n";
502 0           $self->_finish_processing($select);
503 0           return;
504             }
505              
506 0           print "\n\n";
507 0 0         msg( "File includes links to:\n\t" . ( join "\n\t", @linksMade ) )
508             if @linksMade;
509              
510 0 0         msg "File includes IDs:\n\t" . ( join "\n\t", @idsUsed )
511             if @idsUsed;
512              
513             # TODO: is this necessary? also look for tbx_fh and input_fh
514             # next open would close implicitly but not reset $INPUT_LINE_NUMBER
515 0           $self->_finish_processing($select);
516 0           return;
517             }
518              
519             sub _finish_processing {
520             ## no critic (ProhibitOneArgSelect)
521 0     0     my ( $self, $select ) = @_;
522              
523             #clear all processing data
524 0           delete $self->{concept};
525 0           delete $self->{langSet};
526 0           delete $self->{term};
527 0           delete $self->{party};
528 0           delete $self->{unsortedTerm};
529 0           delete $self->{party};
530 0           delete $self->{langSetDefined};
531              
532             #print all messages to the object's log
533 0           $self->_log( Log::Message::Simple->stack_as_string() );
534 0           Log::Message::Simple->flush();
535              
536 0           select $select;
537              
538             # user's responsibility to close the various filehandles
539 0           return;
540             }
541              
542              
543             # do nothing if no term level is open
544             sub _closeTerm {
545 0     0     my ($self) = @_;
546 0 0         if ( defined $self->{term} ) {
547              
548             # print STDOUT Dumper $self->{unsortedTerm} ;
549             # print STDOUT Dumper $self;
550 0   0       my $id = ${ $self->{unsortedTerm} }[0]->{'ID'} ||
551              
552             #necessary for error reporting; $ID might be undef
553             'C' . $self->{concept} . $self->{langSet} . $self->{term};
554 0           my $tig = $self->_sortRefs( @{ $self->{unsortedTerm} } );
  0            
555 0           my $posContext = pop @$tig;
556 0 0 0       unless ( $posContext || $self->{langSetDefined} ) {
557 0           _error
558 0           "Term $id (see line @{[$INPUT_LINE_NUMBER - 1]}) is lacking an element necessary for TBX-Basic.\n\tTo make it valid for human use only, add one of:\n\t\ta definition (at the language level)\n\t\tan example of use in context (at the term level).\n\tTo make it valid for human or machine processing, add its part of speech (at the term level).";
559             }
560 0           $self->_printRow($tig);
561 0           undef $self->{term};
562 0           undef $self->{unsortedTerm};
563             }
564 0           return;
565             }
566              
567             # nothing if no lang level is open
568             sub _closeLangSet {
569 0     0     my ($self) = @_;
570 0 0         if ( defined $self->{langSet} ) {
571 0           print "\n";
572 0           undef $self->{langSet};
573 0           undef $self->{langSetDefined};
574             }
575 0           return;
576             }
577              
578             # nothing if no concept level is open
579             sub _closeConcept {
580 0     0     my ($self) = @_;
581 0 0         if ( defined $self->{concept} ) {
582 0           print "\n";
583 0           undef $self->{concept};
584             }
585 0           return;
586             }
587              
588              
589             my $NUM_MONTHS = 12;
590             sub _parseRow {
591 0     0     my ( $self, $row_text ) = @_;
592 0           $row_text =~ s/\s*$//; # super-chomp: cut off any trailing whitespace at all
593             # later, split will eliminate between-field whitespace
594             # and the keyword and langtag parsers will eliminate other space
595             # outside of values
596              
597             # fields are delimited by at least one tab and possibly spaces
598 0           my @field = split / *(?:\t *)+/, $row_text;
599              
600             # grab the three mandatory fields
601 0           my %row;
602 0           $row{'ID'} = shift @field;
603 0           $row{'DatCat'} = shift @field;
604 0           $row{'Value'} = shift @field;
605              
606             # verify essential completeness
607 0 0 0       unless ( $row{'ID'} && $row{'DatCat'} && $row{'Value'} ) {
      0        
608 0           _error "Incomplete row in line $INPUT_LINE_NUMBER, skipped.";
609 0           return;
610             }
611              
612             # verify well-formed ID and extract its semantics
613 0 0         if ( $row{'ID'} =~ /^[Cc] *(\d{3}) *($langCode)? *(\d*)$/ ) {
    0          
    0          
614 0 0 0       if ( $3 && !$2 ) {
615 0           _error
616             "Bad ID '$row{'ID'}' (no language section) in line $INPUT_LINE_NUMBER, skipped.";
617 0           return;
618             }
619 0           $row{'Concept'} = $1;
620 0 0         $row{'LangSet'} = "\L$2" if ($2); # smash to lowercase
621 0 0 0       $row{'Term'} = 0 + $3 if ( $2 && $3 ne q{} ); # cast to int
622             # clean up the ID itself
623 0           $row{'ID'} = "C$row{'Concept'}";
624 0 0         $row{'ID'} .= $row{'LangSet'} if $row{'LangSet'};
625 0 0         $row{'ID'} .= $row{'Term'} if defined $row{'Term'};
626             }
627             elsif ( $row{'ID'} =~ /^[Rr] *(\d{3})$/ ) {
628 0           $row{'Party'} = $1;
629 0           $row{'ID'} = "R$1";
630             }
631             elsif ( $row{'ID'} =~ /^[Aa]$/ ) {
632              
633             # this is a header line and okey-dokey
634 0           $row{'ID'} = 'A';
635             }
636             else {
637 0           _error
638             "Bad ID '$row{'ID'}' (format not recognized) in line $INPUT_LINE_NUMBER, skipped.";
639 0           return;
640             }
641              
642             # correct case of the datcat, or warn and skip if can't match
643 0 0         if ( my $correct = $correctCaps{'DatCat'}{ lc( $row{'DatCat'} ) } ) {
644              
645             # the datcat is recognized
646 0 0         unless ( $row{'DatCat'} eq $correct ) {
647 0           _error "Correcting '$row{'DatCat'}' to '$correct' in line $INPUT_LINE_NUMBER.";
648 0           $row{'DatCat'} = $correct;
649             }
650             }
651             else {
652 0           _error "Unknown data category '$row{'DatCat'}' in line $INPUT_LINE_NUMBER, skipped.";
653 0           return;
654             }
655              
656             # parse off any local language override in Value
657 0 0         if ( $row{'Value'} =~ /^\[($langCode)] *(.*)$/ ) {
658 0           $row{'RowLang'} = " xml:lang=\"\L$1\""; # lower case
659 0           $row{'Value'} = $2;
660             } # otherwise RowLang will (warn and) print nothing when asked
661              
662             # check certain Values against picklists and case-correct
663 0 0         if ( $row{'DatCat'} eq 'termLocation' ) {
    0          
664 0 0         if ( my $correct = $correctCaps{'termLocation'}{ lc( $row{'Value'} ) } )
665             {
666             # the value is a recognized termLocation
667 0 0         unless ( $row{'Value'} eq $correct ) {
668 0           _error "Correcting '$row{'Value'}' to '$correct' in line $INPUT_LINE_NUMBER.";
669 0           $row{'Value'} = $correct;
670             }
671             }
672             else {
673 0           _error
674             "Unfamiliar termLocation '$row{'Value'}' in line $INPUT_LINE_NUMBER. If this is a location in a user interface, consult the suggested values in the TBX spec.";
675              
676             # but DON'T return undef, because this should not
677             # lead to skipping the row, unlike other picklists
678             }
679             }
680             elsif ( $correctCaps{ $row{'DatCat'} } ) {
681 0           my %caps = %{ $correctCaps{ $row{'DatCat'} } };
  0            
682              
683             # grab a correction hash appropriate to DatCat,
684             # if one exists
685 0 0         if ( my $correct = $caps{ lc( $row{'Value'} ) } ) {
686 0 0         unless ( $row{'Value'} eq $correct ) {
687 0           _error "Correcting '$row{'Value'}' to '$correct' in line $INPUT_LINE_NUMBER.";
688 0           $row{'Value'} = $correct;
689             }
690             }
691             else {
692 0           _error
693             "'$row{'Value'}' not a valid $row{'DatCat'} in line $INPUT_LINE_NUMBER, skipped. See picklist for valid values.";
694 0           return;
695             }
696             } # else it's not a correctible datcat, so let it be
697              
698             # get additional fields and language tags off of the row
699             # forcing the keyword to one initial cap and prewriting the XMLattr
700 0           foreach (@field) {
701 0           my $keyword;
702 0 0         if (/^([^:]+): *(?:\[($langCode)])? *(.+)$/) {
703 0           $keyword = "\u\L$1";
704 0           $row{$keyword}{'Value'} = $3;
705 0 0         $row{$keyword}{'FieldLang'} = " xml:lang=\"\L$2\"" if $2;
706             }
707             else {
708 0           _error "Can't parse additional field '$_' in line $INPUT_LINE_NUMBER, ignored.";
709 0           next;
710             }
711              
712             # check if a FieldLang makes sense
713 0 0 0       if ( $row{$keyword}{'FieldLang'} && !$allowed{$keyword}{'FieldLang'} ) {
714 0           _error
715             "Language tag makes no sense with keyword '$keyword' in line $INPUT_LINE_NUMBER, ignored.";
716 0           delete $row{$keyword}{'FieldLang'};
717             }
718              
719             # check if this datcat can have this keyword
720             # this bit might be better done in the controller?
721             # heh. Too late now.
722 0 0         unless ( $allowed{ $row{'DatCat'} }{$keyword} ) {
723 0           _error
724             "Data category $row{'DatCat'} does not allow keyword '$keyword', ignored in line $INPUT_LINE_NUMBER.";
725 0 0 0       if ( $keyword eq 'Source' or $keyword eq 'Note' ) {
726 0           _error
727             "You may attach a source or note to an entire term entry (or a language section or concept entry) by placing it on its own line with the appropriate ID, like this: \n\t$row{ 'ID' }\t\l$keyword\t$row{ $keyword }{ 'Value' }";
728             }
729 0           delete $row{$keyword};
730             }
731             }
732             # check for malformed Date
733 0 0         if ( $row{'Date'} ) {
734 0 0         if ( $row{'Date'}{'Value'} =~ /^(\d{4})-(\d{2})-(\d{2})$/ ) {
735 0 0 0       if ( $1 eq '0000' || $2 eq '00' || $3 eq '00' ) {
    0 0        
    0 0        
736 0           _error
737             "Consider correcting: Zeroes in date '$row{'Date'}{'Value'}', line $INPUT_LINE_NUMBER.";
738             }
739             elsif ( $2 <= $NUM_MONTHS && $3 <= $NUM_MONTHS ) {
740 0           _error
741             "Consider double-checking: Month and day are ambiguous in '$row{'Date'}{'Value'}', line $INPUT_LINE_NUMBER.";
742             }
743             elsif ( $2 > $NUM_MONTHS ) {
744 0           _error "Consider correcting: Month $2 is nonsense in line $INPUT_LINE_NUMBER.";
745             }
746             }
747             else {
748 0           _error
749             "Date '$row{'Date'}{'Value'}' not in ISO format (yyyy-mm-dd) in line $INPUT_LINE_NUMBER, ignored.";
750 0           delete $row{'Date'};
751             }
752             }
753              
754             # check for Link where it's needed
755 0 0 0       if ( $row{'DatCat'} eq 'transactionType' ) {
    0          
756 0 0         _error
757             "Consider adding information: No responsible party linked in line $INPUT_LINE_NUMBER."
758             unless $row{'Link'};
759             }
760             elsif (
761             $row{'DatCat'} =~ /^(?:crossReference|externalCrossReference|xGraphic)$/
762             && !$row{'Link'} )
763             {
764 0           _error "$row{'DatCat'} without Link in line $INPUT_LINE_NUMBER, skipped.";
765 0           return;
766             }
767              
768 0           return \%row;
769             }
770              
771             sub _buildHeader {
772 0     0     my ( $self, $srcRef, $destRef ) = @_;
773 0           my $destKey;
774 0 0         return unless ( $destKey = $corresp{ $srcRef->{'DatCat'} } );
775              
776             # print STDOUT "$destKey\n" . Dumper ($destRef) . "\n" . Dumper ($srcRef) . "\n";
777             # a validity check, not just a pointless translation
778 0 0 0       if ( $destKey eq 'Language' and defined $destRef->{'Language'} ) {
779 0           _error "Duplicate workingLanguage ignored in line $INPUT_LINE_NUMBER.";
780 0           return;
781             }
782 0           push @{ $destRef->{$destKey} }, $srcRef->{'Value'};
  0            
783 0           return 1;
784             }
785              
786             sub _printHeader {
787 0     0     my ( $self, $info ) = @_;
788              
789             # my $info = %{shift}; # that's a copy, but the hash is small
790 0 0 0       return unless ( defined $info->{'Language'} && defined $info->{'Source'} );
791 0           print <<"REQUIRED1";
792            
793            
794            
795            
796            
797            
798             termbase from MRC file
799             REQUIRED1
800              
801             # print termbase-wide subjects, if such there be
802 0           _error
803             "Termbase-wide subject fields are recorded in the element of the TBX header."
804 0 0 0       if ( exists $info->{'Subject'} and scalar @{ $info->{'Subject'} } );
805 0           my $sbj;
806 0           print <<"SUBJECT" while $sbj = shift @{ $info->{'Subject'} };
  0            
807             entire termbase concerns subject: $sbj
808             SUBJECT
809 0           my $version = _version();
810 0           print <<"REQUIRED2";
811            
812            
813            

generated by Convert::MRC version $version

814            
815             REQUIRED2
816 0           while ( my $src = shift @{ $info->{'Source'} } ) {
  0            
817 0           print <<"SOURCE";
818            
819            

$src

820            
821             SOURCE
822             }
823              
824 0           print <<'REQUIRED3';
825            
826            
827            

TBXBasicXCSV02.xcs

828             REQUIRED3
829              
830             # my $sbj;
831             # print <{'Subject'}};
832             #

$sbj

833             #SUBJECT
834              
835 0           print <<'REQUIRED3';
836            
837            
838            
839            
840             REQUIRED3
841              
842 0           return 1;
843             }
844              
845             # structure a term's worth of data rows for printing
846             sub _sortRefs {## no critic (RequireArgUnpacking)
847 0     0     my ( $self, @rows ) = @_;
848 0           my ( @termGrp, @auxInfo, $term, $pos, $context, $ID );
849              
850 0   0       $ID = $_[0]->{'ID'}
851              
852             #this is necessary for printing diagnostics when something has gone wrong ($ID would be undef otherwise)
853             || 'C' . $self->{concept} . $self->{langSet} . $self->{term};
854              
855             # print STDOUT Dumper $_[0];
856             # print STDOUT Dumper \@rows;
857             # print STDOUT Dumper $self;
858 0           for my $row (@rows) {
859 0 0         if ( not defined $row->{'DatCat'} ) {
860              
861             #this should never happen; it should be caught during row parsing.
862 0           next;
863             }
864 0           my $datCat = $row->{'DatCat'};
865 0 0         if ( $datCat eq 'term' ) {
    0          
866 0           unshift @termGrp, $row; # stick it on the front
867 0           $term = 1;
868             }
869             elsif ( my $position = $position{$datCat} ) {
870 0 0         if ( 'termGrp' eq $position ) {
    0          
871 0           push @termGrp, $row; # stick it on the back
872 0 0         $pos = 1 if $datCat eq 'partOfSpeech';
873             }
874             elsif ( 'auxInfo' eq $position ) {
875 0           push @auxInfo, $row;
876 0 0         $context = 1 if $datCat eq 'context';
877             }
878             }
879             else {
880             #should never happen; should be caught during row parsing
881 0           _error "Data category '$datCat' is not allowed at the term level.";
882             }
883             }
884              
885 0 0         if ( not $term ) {
886 0           _error
887 0           "There is no term row for '$ID', although other data categories describe such a term. See line @{[$INPUT_LINE_NUMBER - 1]}.";
888             }
889              
890 0 0         if ( not $pos ) {
891 0           _error
892 0           "Term $ID lacks a partOfSpeech row. This TBX file may not be machine processed. See line @{[$INPUT_LINE_NUMBER - 1]}.";
893             }
894              
895 0           unshift @auxInfo, \@termGrp;
896 0   0       push @auxInfo, ( $pos || $context ); # 1 or undef
897 0           return \@auxInfo;
898             }
899              
900             sub _printRow {
901 0     0     my ( $self, $item ) = @_;
902             ## no critic (ProhibitNoWarnings)
903 5     5   69 no warnings 'uninitialized'; # for undefined language attributes
  5         12  
  5         6623  
904             ## use critic
905 0 0         if ( ref $item eq 'HASH' ) { # printing a single item
    0          
906             # print as appropriate
907 0           my $datCat;
908 0           $datCat = $item->{'DatCat'};
909 0 0         if ( not defined $datCat ) {
910              
911             #should never happen; rows with undefined datcats are skipped.
912 0           _error "Data category undefined. Cannot print row at $INPUT_LINE_NUMBER";
913 0           return;
914             }
915              
916             # sort by datcat
917 0 0 0       if ( $datCat eq 'term' ) {
    0          
    0          
    0          
    0          
    0          
    0          
    0          
918 0           print "$item->{'Value'}\n";
919              
920             # we deliberately ignore RowLang, because LangSet
921             # should give the language of a term entry
922             }
923              
924             # note and source as standalones, not keyword-fields
925             elsif ( $datCat eq 'note' ) {
926 0           print "{'RowLang'}>$item->{'Value'}\n";
927             }
928              
929             elsif ( $datCat =~ /^(?:source|customerSubset|projectSubset)$/ ) {
930 0           print
931             "{'RowLang'}>$item->{'Value'}\n";
932             }
933              
934             # sorry this one's so gross, but it is
935             elsif ( $datCat eq 'transactionType' ) {
936 0           print "\n";
937 0           print
938             "\t$item->{'Value'}\n";
939 0 0         print "\t$item->{'Date'}->{'Value'}\n"
940             if $item->{'Date'};
941              
942             #I don't think Note is allowed in transationType (Nate G)
943 0 0         print
944             "\t{'Note'}->{'FieldLang'}>$item->{'Note'}->{'Value'}\n"
945             if $item->{'Note'};
946 0 0 0       if ( $item->{'Responsibility'} || $item->{'Link'} ) {
947 0           print "\t
948 0 0         print " target=\"$item->{'Link'}->{'Value'}\""
949             if $item->{'Link'};
950 0           print
951             "$item->{'Responsibility'}->{'FieldLang'}>$item->{'Responsibility'}->{'Value'}";
952 0 0         print "Responsible Party"
953             unless $item->{'Responsibility'}->{'Value'};
954 0           print "\n";
955             }
956 0           print "\n";
957             }
958              
959             elsif ( $datCat eq 'crossReference' ) {
960 0           print
961             "{'Link'}->{'Value'}\"$item->{'RowLang'}>$item->{'Value'}\n";
962             }
963              
964             elsif ($datCat eq 'externalCrossReference'
965             || $datCat eq 'xGraphic' )
966             {
967 0           print
968             "{'Link'}->{'Value'}\"$item->{'RowLang'}>$item->{'Value'}\n";
969             }
970              
971             elsif ( $datCat =~ /^(?:email|title|role|org|uid|tel|adr|fn)$/ ) {
972 0           print "\t$item->{'Value'}\n";
973              
974             # RowLang is ignored here too -- attr not allowed
975             }
976              
977             elsif ( $meta{$datCat} eq 'termNote' ) {
978 0           print
979             "{'RowLang'}>$item->{'Value'}\n"
980             ; # using tigs means no termNoteGrp
981             }
982              
983             else { # everything else is easy
984 0           my $meta;
985 0 0         $meta = $meta{$datCat}
986             or die "_printRow() can't print a $datCat "; # shouldn't happen
987 0           print "<${meta}Grp>\n";
988 0           print
989             "\t<$meta type=\"$datCat\"$item->{'RowLang'}>$item->{'Value'}\n";
990              
991             #I don't think Note is allowed in transationType (Nate G)
992 0 0         print
993             "\t{'Note'}->{'FieldLang'}>$item->{'Note'}->{'Value'}\n"
994             if $item->{'Note'};
995 0 0         print
996             "\t{'Source'}->{'FieldLang'}>$item->{'Source'}->{'Value'}\n"
997             if $item->{'Source'};
998 0           print "\n";
999             }
1000              
1001             }
1002             elsif ( ref $item eq 'ARRAY' ) {
1003              
1004             # if first item isn't arrayref, it's a resp-party
1005 0 0         if ( ref $item->[0] ne 'ARRAY' ) {
1006 0           print "[0]->{'ID'}\">\n";
1007 0           $self->_printRow($_) foreach @$item;
1008 0           print "\n";
1009             }
1010             else {
1011             # then it's a tig
1012 0           my $termGrp = shift @$item;
1013 0           my $id;
1014 0 0         if ( exists $termGrp->[0] ) {
1015              
1016             # if there's a term or any termNote
1017 0           $id = $termGrp->[0]->{'ID'};
1018             }
1019             else {
1020             #should never happen (right? Nate G)
1021             # if must, get the ID from an auxInfo
1022             # (implies the input is defective)
1023 0           $id = $item->[0]->{'ID'};
1024             }
1025 0           print "\n";
1026              
1027             # if this were an ntig
1028 0           $self->_printRow($_) foreach @$termGrp;
1029              
1030             #
1031 0           $self->_printRow($_) foreach @$item;
1032 0           print "\n";
1033             }
1034             }
1035             else {
1036             #this should never happen
1037 0           die "_printRow() called incorrectly, stopped";
1038             }
1039 0           return;
1040             }
1041              
1042             1;
1043              
1044             __END__