File Coverage

blib/lib/Convert/TBX/Basic.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             #
2             # This file is part of Convert-TBX-Basic
3             #
4             # This software is copyright (c) 2014 by Alan K. Melby.
5             #
6             # This is free software; you can redistribute it and/or modify it under
7             # the same terms as the Perl 5 programming language system itself.
8             #
9             package Convert::TBX::Basic;
10 1     1   43518 use strict;
  1         3  
  1         34  
11 1     1   6 use warnings;
  1         3  
  1         32  
12             # ABSTRACT: Convert TBX-Basic data into TBX-Min
13             our $VERSION = '0.02'; # VERSION
14 1     1   1542 use XML::Twig;
  0            
  0            
15             use autodie;
16             use Path::Tiny;
17             use Carp;
18             use Log::Any '$log';
19             use TBX::Min 0.06;
20             use Try::Tiny;
21             use Exporter::Easy (
22             OK => ['basic2min']
23             );
24              
25             my %status_map = (
26             'preferredTerm-admn-sts' => 'preferred',
27             'admittedTerm-admn-sts' => 'admitted',
28             'deprecatedTerm-admn-sts' => 'notRecommended',
29             'supersededTerm-admn-st' => 'obsolete'
30             );
31              
32             sub basic2min {
33             (my ($data, $source, $target) = @_) == 3 or
34             croak 'Usage: basic2min(data, source-language, target-language)';
35              
36             my $fh = _get_handle($data);
37              
38             # build a twig out of the input document
39             my $twig = XML::Twig->new(
40             output_encoding => 'UTF-8',
41             do_not_chain_handlers => 1,
42             keep_spaces => 0,
43              
44             # these store new entries, langGroups and termGroups
45             start_tag_handlers => {
46             termEntry => \&_entry_start,
47             langSet => \&_langStart,
48             tig => \&_termGrpStart,
49             },
50              
51             TwigHandlers => {
52             # header attributes
53             title => \&_title,
54             sourceDesc => \&_source_desc,
55             'titleStmt/note' => \&_title_note,
56              
57             # decide whether to add a new entry
58             termEntry => \&_entry,
59              
60             # becomes part of the current TBX::Min::ConceptEntry object
61             'termEntry/descrip[@type="subjectField"]' => sub {
62             shift->{tbx_min_min_current_entry}->
63             subject_field($_->text)},
64              
65             # these become attributes of the current
66             # TBX::Min::TermGroup object
67             'tig/termNote[@type="administrativeStatus"]' => \&_status,
68             term => sub {shift->{tbx_min_current_term_grp}->
69             term($_->text)},
70             'tig/termNote[@type="partOfSpeech"]' => sub {
71             shift->{tbx_min_current_term_grp}->
72             part_of_speech($_->text)},
73             'tig/note' => \&_as_note,
74             'tig/admin[@type="customerSubset"]' => sub {
75             shift->{tbx_min_current_term_grp}->customer($_->text)},
76              
77             # the information which cannot be converted faithfully
78             # gets added as a note to the current TBX::Min::TermGroup,
79             # with its data category prepended
80             'tig/admin' => \&_as_note,
81             'tig/descrip' => \&_as_note,
82             'tig/termNote' => \&_as_note,
83             'tig/transac' => \&_as_note,
84             'tig/transacNote' => \&_as_note,
85             'tig/transacGrp/date' => \&_as_note,
86              
87             # add no-op handlers for twigs not needing conversion
88             # so that they aren't logged as being skipped
89             'sourceDesc/p' => sub {}, # treated in sourceDesc handler
90             titleStmt => sub {},
91             fileDesc => sub {},
92             martifHeader => sub {},
93             text => sub {},
94             body => sub {},
95             martif => sub {},
96             langSet => sub {},
97             tig => sub {},
98             transacGrp => sub {},
99              
100             # log anything that wasn't converted
101             _default_ => \&_log_missed,
102             }
103             );
104              
105             # provide language info to the handlers via storage in the twig
106             $twig->{tbx_languages} = [lc($source), lc($target)];
107              
108             my $min = TBX::Min->new();
109             $min->source_lang($source);
110             $min->target_lang($target);
111              
112             # use handlers to process individual tags and
113             # add information to $min
114             $twig->{tbx_min} = $min;
115             $twig->parse($fh);
116              
117             # warn if the document didn't have tig's of the given source and
118             # target language
119             if(keys %{ $twig->{tbx_found_languages} } != 2 and
120             $log->is_warn){
121             # find the difference between the expected languages
122             # and those found in the TBX document
123             my %missing;
124             @missing{ lc $min->source_lang, lc $min->target_lang() } = undef;
125             delete @missing{ keys %{$twig->{tbx_found_languages}} };
126             $log->warn('could not find langSets for language(s): ' .
127             join ', ', sort keys %missing);
128             }
129              
130             return $min;
131             }
132              
133             sub _get_handle {
134             my ($data) = @_;
135             my $fh;
136             if((ref $data) eq 'SCALAR'){
137             open $fh, '<', $data; ## no critic(RequireBriefOpen)
138             }else{
139             $fh = path($data)->filehandle('<');
140             }
141             return $fh;
142             }
143              
144             ######################
145             ### XML TWIG HANDLERS
146             ######################
147             # all of the twig handlers store state on the XML::Twig object. A bit kludgy,
148             # but it works.
149              
150             sub _title {
151             my ($twig, $node) = @_;
152             $twig->{tbx_min}->id($node->text);
153             return 0;
154             }
155              
156             sub _title_note {
157             my ($twig, $node) = @_;
158             my $description = $twig->{tbx_min}->description || '';
159             $twig->{tbx_min}->description($description . $node->text . "\n");
160             return 0;
161             }
162              
163             sub _source_desc {
164             my ($twig, $node) = @_;
165             for my $p ($node->children('p')){
166             my $description = $twig->{tbx_min}->description || '';
167             $twig->{tbx_min}->description(
168             $description . $p->text . "\n");
169             }
170             return 0;
171             }
172              
173             # remove whitespace and convert to TBX-Min picklist value
174             sub _status {
175             my ($twig, $node) = @_;
176             my $status = $node->text;
177             $status =~ s/[\s\v]//g;
178             $twig->{tbx_min_current_term_grp}->status($status_map{$status});
179             return 0;
180             }
181              
182             # turn the node info into a note labeled with the type;
183             # paste this at the end of any existing note
184             sub _as_note {
185             my ($twig, $node) = @_;
186             my $grp = $twig->{tbx_min_current_term_grp};
187             my $note = $grp->note() || '';
188             my $type = $node->att('type') || '';
189             $type .= ':' if $type;
190              
191             $grp->note($note . "\n" .
192             $type . $node->text);
193             if($log->is_info and $node->name ne 'note'){
194             $log->info('element ' . $node->xpath . ' pasted in note');
195             }
196              
197             return 1;
198             }
199              
200             # add a new entry to the list of those found in this file
201             sub _entry_start {
202             my ($twig, $node) = @_;
203             my $entry = TBX::Min::Entry->new();
204             if($node->att('id')){
205             $entry->id($node->att('id'));
206             }else{
207             carp 'found entry missing id attribute';
208             }
209             $twig->{tbx_min_min_current_entry} = $entry;
210             return 1;
211             }
212              
213             # add the entry to the TBX::Min object if it has any langGroups
214             sub _entry {
215             my ($twig, $node) = @_;
216             my $entry = $twig->{tbx_min_min_current_entry};
217             if(@{$entry->lang_groups}){
218             $twig->{tbx_min}->add_entry($entry);
219             }elsif($log->is_info){
220             $log->info('element ' . $node->xpath . ' not converted');
221             }
222             return;
223             }
224              
225             #just set the subject_field of the current entry
226             sub _subjectField {
227             my ($twig, $node) = @_;
228             $twig->{tbx_min_min_current_entry}->subject_field($node->text);
229             return 1;
230             }
231              
232             # Create a new LangGroup, add it to the current entry,
233             # and set it as the current LangGroup.
234             # This langSet is ignored if its language is different from
235             # the source and target languages specified to basic2min
236             sub _langStart {
237             my ($twig, $node) = @_;
238             my $lang_grp;
239             my $lang = $node->att('xml:lang');
240             if(!$lang){
241             # skip if missing language
242             $log->warn('skipping langSet without language: ' .
243             $node->xpath) if $log->is_warn;
244             $node->ignore;
245             return 1;
246             }elsif(!grep {$_ eq lc $lang} @{$twig->{tbx_languages}}){
247             # skip if non-applicable language
248             $node->ignore;
249             return 1;
250             }
251              
252             $lang_grp = TBX::Min::LangGroup->new();
253             $lang_grp->code($lang);
254             $twig->{tbx_found_languages}{lc $lang} = undef;
255             $twig->{tbx_min_min_current_entry}->add_lang_group($lang_grp);
256             $twig->{tbx_min_current_lang_grp} = $lang_grp;
257             return 1;
258             }
259              
260             # Create a new termGroup, add it to the current langGroup,
261             # and set it as the current termGroup.
262             sub _termGrpStart {
263             my ($twig) = @_;
264             my $term = TBX::Min::TermGroup->new();
265             $twig->{tbx_min_current_lang_grp}->add_term_group($term);
266             $twig->{tbx_min_current_term_grp} = $term;
267             return 1;
268             }
269              
270             # log that an element was not converted
271             sub _log_missed {
272             my (undef, $node) = @_;
273             $log->info('element ' . $node->xpath . ' not converted')
274             if $log->is_info();
275             return;
276             }
277              
278             1;
279              
280             __END__