File Coverage

blib/lib/Lingua/Thesaurus/IO/LivelinkCollectionServer.pm
Criterion Covered Total %
statement 55 56 98.2
branch 20 24 83.3
condition 13 16 81.2
subroutine 5 5 100.0
pod 1 1 100.0
total 94 102 92.1


line stmt bran cond sub pod time code
1             package Lingua::Thesaurus::IO::LivelinkCollectionServer;
2 4     4   1879 use Moose;
  4         5  
  4         19  
3             with 'Lingua::Thesaurus::IO';
4              
5             has '_term_rev_idx' => (is => 'bare',
6             documentation => "internal reverse index while parsing terms");
7              
8             has '_rel_types' => (is => 'ro',
9             documentation => "default reltypes for Livelink Collection Server",
10             default => sub { {
11             # rel description inverse is_external
12             # === =========== ======= ===========
13             AB => ['Abbreviation' => AF => undef],
14             AF => ['Abbreviation For' => AB => undef],
15             EQ => ['See Also' => UF => undef],
16             UF => ['Used For' => EQ => undef],
17             EQA => ['See AND' => UFA => undef],
18             UFA => ['Used For AND' => EQA => undef],
19             BT => ['Broad Term' => NT => undef],
20             NT => ['Narrow Term' => BT => undef],
21             RT => ['Related Term' => RT => undef],
22             SN => ['Scope Note' => undef , 1 ],
23             HN => ['History Note' => undef , 1 ],
24             }});
25              
26              
27             sub load {
28 3     3 1 4 my $self = shift;
29              
30             # files are given either as ->load($f1, $f2, ..), or
31             # as ->load({$ori1 => $f1, $ori2 => $f2, ...});
32 3         5 my @files; # list of pairs [$filename, $origin]
33 3 100 66     20 if (@_ == 1 && ref $_[0] eq 'HASH') {
34 1         1 my $files_hash = shift;
35 1         3 @files = map {[$files_hash->{$_}, $_]} keys %$files_hash;
  2         5  
36             }
37             else {
38 2         5 @files = map {[$_, undef]} @_;
  2         7  
39             }
40              
41             # initialize storage structure
42 3         87 my $storage = $self->storage;
43 3         24 $storage->initialize;
44              
45             # store relation types
46 3         13 while (my ($rel_id, $rel_data) = each %{$self->_rel_types}) {
  33         2356  
47 30         45 my ($descr, $is_external) = @{$rel_data}[0, 2];
  30         96  
48 30         170 $storage->store_rel_type($rel_id, $descr, $is_external);
49             }
50              
51             # load each file
52 3     4   74 $storage->do_transaction(sub {$self->_load_file(@$_)}) foreach @files;
  4         49  
53              
54             # cleanup internal reverse index
55 3         138 $self->{_term_rev_idx} = {};
56              
57             }
58              
59             sub _load_file {
60 4     4   9 my ($self, $file, $origin) = @_;
61              
62             # lecture du fichier (force :crlf IO mode so that Win32 dumpfiles also work)
63 4 50       296 open my $fh, "<:crlf", $file or die "open $file: $!";
64              
65 4         7 my %term;
66             my $term_count;
67 0         0 my $thesaurus_name;
68              
69             LINE:
70 4         140 while (<$fh>) {
71              
72             # skip initial lines until thesaurus name declaration
73 3231 100 100     6349 $thesaurus_name //= do {s/^BEGIN_REL THES_NAME=(.*)//; $1}
  44         48  
  44         329  
74             or next LINE;
75              
76             # stop at ending line
77 3191 100       4716 last LINE if /^END_REL/;
78              
79             # unfold continuation lines
80             CONTINUATION_LINE:
81 3187         2303 while (1) {
82 3609 100       6399 s/\+\n$/<$fh>/e or last CONTINUATION_LINE;
  422         1669  
83             }
84              
85             # suppress comments
86 3187         2986 s/###.*//;
87              
88             # skip empty lines
89 3187 100       6088 next LINE if /^\s*$/;
90              
91 3174 50       18286 my ($rel_id, $term_string) = ($_ =~ /^([A-Z]+)\d*\s*=\s*(.+?)\s*$/)
92             or die "incorrect thesaurus syntax at $file line $.: $_\n";
93              
94 3174 100       4712 if ($rel_id eq 'LT') {
95             # insert last term
96 1233 100       3080 $self->_insert_term(\%term) if keys %term;
97              
98             # build a new term
99 1233         11944 %term = (LT => $term_string, origin => $origin);
100             }
101             else {
102             # store relation info into current term
103 1941         1201 push @{$term{$rel_id}}, $term_string;
  1941         11687  
104             }
105             }
106             # insert last term
107 4 50       19 $self->_insert_term(\%term) if keys %term;
108             }
109              
110              
111             sub _insert_term {
112 1233     1233   1053 my ($self, $term_hash) = @_;
113 1233         40346 my $storage = $self->storage;
114              
115 1233         1472 my $origin = delete $term_hash->{origin};
116              
117             # store the lead term
118 1233         1316 my $term_string = delete $term_hash->{LT};
119 1233   100     7164 my $term_id = $self->{_term_rev_idx}{$origin || ''}{$term_string}
      66        
120             //= $storage->store_term($term_string, $origin);
121              
122             # store each collection of relations
123 1233         35491 my $rel_types = $self->_rel_types;
124 1233         3477 while (my ($rel_id, $related) = each %$term_hash) {
125 606 50       1131 my $rel_type = $rel_types->{$rel_id}
126             or die "unknown relation type: $rel_id\n";
127 606         480 my ($inverse_id, $is_external) = @{$rel_type}[1, 2];
  606         898  
128              
129             # for internal relations, replace strings by ids of related terms
130 606 100       922 unless ($is_external) {
131 379         461 foreach my $rel (@$related) {
132 1705   100     8721 $rel = $self->{_term_rev_idx}{$origin || ''}{$rel}
      66        
133             //= $storage->store_term($rel, $origin);
134             }
135             }
136              
137             # store it
138 606         1464 $storage->store_relation($term_id, $rel_id, $related,
139             $is_external, $inverse_id);
140             }
141             }
142              
143             1;
144              
145             __END__
146              
147              
148             =encoding ISO8859-1
149              
150             =head1 NAME
151              
152             Lingua::Thesaurus::IO::LivelinkCollectionServer - IO class for Livelink Collection Server thesaurus files
153              
154             =head1 DESCRIPTION
155              
156             This class implements the L<Lingua::Thesaurus::IO> role for
157             files issued from the I<Livelink Collection Server> database
158             (formerly known as I<Basis Plus>). Parsing is quite rudimentary
159             and does not claim to comply with the full BasisPlus specification.
160              
161             =head2 File syntax
162              
163             Files start with a header of shape:
164              
165             BEGIN_LAYOUT
166             FORMAT=FREE
167             DATA_TERM_SEPARATOR='&&&'
168             END_OF_DATA_STATEMENT='@@@'
169             DATA_QUAL_SEPARATOR='###'
170             END_LAYOUT
171             ACTION_CODE=A
172             <<<
173             <<< Thesaurus dump in FREE format.
174             <<<
175              
176             which is completely ignored.
177              
178             Actual data starts with
179              
180             BEGIN_REL THES_NAME=<thesaurus_name>
181              
182             and ends with
183              
184             END_REL.
185              
186             All data lines are of shape
187              
188             <relation_name> = <target>
189              
190             If the last character on the line is '+', then this indicates that
191             the next line is a continuation line, which should be concatenated
192             to the current line.
193              
194             The relation name 'LT' (for I<Lead Term>) introduces a new term.
195             The following lines are relations for this term, until the next LT.
196             Relations may be :
197              
198             # rel description reverse is_external
199             # === =========== ======= ===========
200             [AB => 'Abbreviation' => AF => undef],
201             [AF => 'Abbreviation For' => AB => undef],
202             [EQ => 'See Also' => UF => undef],
203             [UF => 'Used For' => EQ => undef],
204             [EQA => 'See AND' => UFA => undef],
205             [UFA => 'Used For AND' => EQA => undef],
206             [BT => 'Broad Term' => NT => undef],
207             [NT => 'Narrow Term' => BT => undef],
208             [RT => 'Related Term' => RT => undef],
209             [SN => 'Scope Note' => undef , 1 ],
210             [HN => 'History Note' => undef , 1 ],
211             );
212              
213             =head1 METHODS
214              
215             =head2 load
216              
217             Loading a thesaurus file in LivelinkCollectionServer format.