File Coverage

blib/lib/Lingua/Thesaurus.pm
Criterion Covered Total %
statement 26 26 100.0
branch 1 2 50.0
condition 1 3 33.3
subroutine 8 8 100.0
pod 2 2 100.0
total 38 41 92.6


line stmt bran cond sub pod time code
1             package Lingua::Thesaurus;
2 6     6   83844 use 5.010;
  6         15  
3 6     6   2822 use Moose;
  6         1775850  
  6         36  
4 6     6   31952 use Module::Load ();
  6         4899  
  6         101  
5 6     6   29 use Carp;
  6         8  
  6         398  
6 6     6   2371 use namespace::clean -except => 'meta';
  6         24391  
  6         23  
7              
8             our $VERSION = '0.13';
9              
10             has 'storage' => (is => 'ro', does => 'Lingua::Thesaurus::Storage',
11             handles => [qw/search_terms fetch_term
12             rel_types fetch_rel_type/],
13             documentation => "storage engine for the thesaurus");
14              
15             sub BUILDARGS {
16 6     6 1 1631 my $class = shift;
17 6 50       25 @_ > 1 or croak "not enough arguments";
18              
19             # load the storage subclass
20 6         22 my $storage_class = $class->_load_component_class(Storage => shift);
21              
22             # instanciate storage, passing all our args, and get this as input for new()
23 6         47 return {storage => $storage_class->new(@_)};
24             }
25              
26             sub load {
27 3     3 1 3951 my $self = shift;
28              
29             # load and instanciate the IO subclass
30 3         11 my $io_class = $self->_load_component_class(IO => shift);
31 3         81 my $io_obj = $io_class->new(storage => $self->storage);
32              
33             # forward the call to the IO object
34 3         1651 $io_obj->load(@_);
35             }
36              
37             sub _load_component_class {
38 9     9   20 my ($class, $family, $subclass) = @_;
39              
40             # prefix $subclass by the family namespace, unless it starts with '+'
41 9   33     100 s/^\+// or s/^/Lingua::Thesaurus::${family}::/ for $subclass;
42              
43             # load that class and return
44 9         39 Module::Load::load($subclass);
45 9         147 return $subclass;
46             }
47              
48             1; # End of Lingua::Thesaurus
49              
50              
51             __END__
52              
53             =encoding ISO8859-1
54              
55             =head1 NAME
56              
57             Lingua::Thesaurus - Thesaurus management
58              
59             =head1 SYNOPSIS
60              
61             =head2 Creating a thesaurus
62              
63             my $thesaurus = Lingua::Thesaurus->new(SQLite => $dbname);
64             $thesaurus->load($io_class => @files);
65             $thesaurus->load($io_class => {$origin1 => $file1, ...});
66             $thesaurus->load($io_class => {files => \@files,
67             params => {termClass => ..,
68             relTypeClass => ..}});
69              
70             =head2 Using a thesaurus
71              
72             my $thesaurus = Lingua::Thesaurus->new(SQLite => $dbname);
73              
74             my @terms = $thesaurus->search_terms('*foo*');
75             my $term = $thesaurus->fetch_term('foobar');
76              
77             my $scope_note = $term->SN; # returns a string
78             my @synonyms = $term->UF; # returns a list of other terms
79              
80             foreach my $pair ($term->related(qw/NT RT/)) {
81             my ($rel_type, $item) = @$pair;
82             printf " %s(%s) = %s\n", $rel_type->description, $rel_type->rel_id, $item;
83             }
84              
85             # transitive search
86             foreach my $quadruple ($term->transitively_related(qw/NT/)) {
87             my ($rel_type, $related_term, $through_term, $level) = @$quadruple;
88             printf " %s($level): %s (through %s)\n",
89             $rel_type->rel_id,
90             $level,
91             $related_term->string,
92             $through_term->string;
93             }
94              
95             =head1 DESCRIPTION
96              
97             This distribution manages I<thesauri>. A thesaurus is a list of
98             terms, with some relations (like for example "broader term" /
99             "narrower term"). Relations are either "internal" (between two terms),
100             or "external" (between a term and some external data, like for example
101             a "Scope Note"). Relations may have a reciprocal;
102             see L<Lingua::Thesaurus::RelType>.
103              
104             Thesauri are loaded from one or several I<IO formats>; usually this will be
105             the ISO 2788 format, or some derivative from it. See classes under the
106             L<Lingua::Thesaurus::IO> namespace for various implementations.
107              
108             Once loaded, thesauri are stored via a I<storage class>; this is
109             meant to be an efficient internal structure for supporting searches.
110             Currently, only L<Lingua::Thesaurus::Storage::SQLite> is implemented;
111             but the architecture allows for other storage classes to be defined,
112             as long as they comply with the L<Lingua::Thesaurus::Storage> role.
113              
114             Terms are retrieved through the L</"search_terms"> and L</"fetch_term">
115             methods. The results are instances of L<Lingua::Thesaurus::Term>;
116             these objects have navigation methods for retrieving related terms.
117              
118             This distribution was originally targeted for dealing with the
119             Swiss thesaurus for justice "Jurivoc"
120             (see L<Lingua::Thesaurus::IO::Jurivoc>).
121             However, the framework should be easily extensible to other needs.
122             Other Perl modules for thesauri are briefly discussed below
123             in the L</"SEE ALSO"> section.
124              
125             Side note: another motivation for writing this distribution was also
126             to experiment with L<Moose> meta-programming possibilities.
127             Subclasses of L<Lingua::Thesaurus::Term> are created dynamically
128             for implementing relation methods C<NT>, C<BT>, etc. ---
129             see L<Lingua::Thesaurus::Storage> source code.
130              
131             B<Caveat>: at the moment, IO classes only implement loading and
132             searching; methods for editing and dumping a thesaurus will be added in a
133             future version.
134              
135              
136             =head1 METHODS
137              
138             =head2 new
139              
140             my $thesaurus = Lingua::Thesaurus->new($storage_class => @storage_args);
141              
142             Instanciates a thesaurus on a given storage.
143             The C<$storage_class> will be automatically prefixed by
144             C<Lingua::Thesaurus::Storage::>, unless the classname contains
145             an initial C<'+'>. The remaining arguments are transmitted to the
146             storage class. Since L<Lingua::Thesaurus::Storage::SQLite> is the default
147             storage class supplied with this distribution, thesauri are usually opened
148             as
149              
150             my $dbname = '/path/to/some/file.sqlite';
151             my $thesaurus = Lingua::Thesaurus->new(SQLite => $dbname);
152              
153             =head2 load
154              
155             $thesaurus->load($io_class => @files);
156             $thesaurus->load($io_class => {$origin1 => $file1, ...});
157             $thesaurus->load($io_class => {files => \@files,
158             params => {termClass => ..,
159             relTypeClass => ..}});
160              
161             Populates a thesaurus database with data from thesauri dumpfiles. The
162             job of parsing these files is delegated to some C<IO> subclass, given
163             as first argument. The C<$io_class> will be automatically prefixed by
164             C<Lingua::Thesaurus::IO::>, unless the classname contains an initial
165             C<'+'>. The remaining arguments are transmitted to the IO class; the
166             simplest form is just a list of dumpfiles, or a hashref of pairs C<<
167             {$origin1 => $dumpfile1, ...} >>. Each C<$origin> is a string for
168             tagging terms coming from that dumpfile; while interrogating the
169             thesaurus, origins can be retrieved from C<< $term->origin >>. See IO
170             subclasses in the L<Lingua::Thesaurus::IO> namespace for more details.
171              
172             =head3 search_terms
173              
174             my @terms = $thesaurus->search_terms($pattern, $origin);
175              
176             Searches the term database according to C<$pattern>, where
177             the pattern may contain C<'*'> to mean word completion.
178              
179             The interpretation of patterns depends on the storage
180             engine; by default, this is implemented using SQLite's
181             "LIKE" function (see L<http://www.sqlite.org/lang_expr.html#like>).
182             Characters C<'*'> in the pattern are translated into
183             C<'%'> for the LIKE function to work as expected.
184              
185             It is also possible to configure the storage to use fulltext
186             searches, so that a pattern such as C<'sci*'> would also match
187             C<'computer science'>; see
188             L<Lingua::Thesaurus::Storage::SQLite/use_fulltext>.
189              
190             If C<$pattern> is empty, the method returns the list
191             of all terms in the thesaurus.
192              
193             The second argument C<$origin> is optional; it may be used
194             to restrict the search on terms loaded from one specific origin.
195              
196             Results are instances of L<Lingua::Thesaurus::Term>.
197              
198             =head3 fetch_term
199              
200             my $term = $thesaurus->fetch_term($term_string, $origin);
201              
202             Retrieves a specific term and
203             returns an instance of L<Lingua::Thesaurus::Term>
204             (or C<undef> if the term is unknown). The second argument C<$origin>
205             is optional.
206              
207              
208             =head3 rel_types
209              
210             Returns the list of ids of relation types stored in this thesaurus
211             (i.e. 'NT', 'RT', etc.).
212              
213             =head3 fetch_rel_type
214              
215             my $rel_type = $thesaurus->fetch_rel_type($rel_type_id);
216              
217             Returns the L<Lingua::Thesaurus::RelType> object
218             corresponding to C<$rel_type_id>.
219              
220              
221             =head3 storage
222              
223             Returns the internal object playing role L<Lingua::Thesaurus::Storage>.
224              
225             =head1 FURTHER DOCUMENTATION
226              
227             More details can be found in the various implementation classes :
228              
229             =over
230              
231             =item *
232              
233             L<Lingua::Thesaurus::IO> : Role for input/output operations on a thesaurus
234              
235             =item *
236              
237             L<Lingua::Thesaurus::IO::ISO2788> :
238             IO class for ISO thesauri (not implemented yet)
239              
240             =item *
241              
242             L<Lingua::Thesaurus::IO::Jurivoc> :
243             IO class for "Jurivoc", the Swiss thesaurus for justice
244              
245              
246             =item *
247              
248             L<Lingua::Thesaurus::IO::LivelinkCollectionServer> :
249             IO class for Livelink Collection Server thesaurus files
250              
251             =item *
252              
253             L<Lingua::Thesaurus::RelType> :
254             Relation type in a thesaurus
255              
256             =item *
257              
258             L<Lingua::Thesaurus::Storage>:
259             Role for thesaurus storage
260              
261             =item *
262              
263             L<Lingua::Thesaurus::Storage::SQLite>:
264             Thesaurus storage in an SQLite database
265              
266             =item *
267              
268             L<Lingua::Thesaurus::Term>:
269             parent class for thesaurus terms; in particular, this class
270             implements methods for navigating through relations.
271              
272             =back
273              
274              
275             =head1 SEE ALSO
276              
277             Here is a brief review of some other thesaurus modules on CPAN :
278              
279             =over
280              
281             =item *
282              
283             L<Thesaurus> has several backend implementations
284             (CSV, BerkeleyDB, DBI), but it just handles synonyms (a single relation
285             between terms).
286              
287             =item *
288              
289             L<Text::Thesaurus::ISO> is quite old (1998), uses obsolete technology
290             (C<dbmopen>), and has a fixed number of relations, some of which are
291             apparently targeted to the specific needs of UK electronic libraries.
292              
293             =item *
294              
295             L<Biblio::Thesaurus> has a rich set of features, not only for
296             reading and searching, but also for editing and exporting a thesaurus.
297             Storage is directly in hashes in memory; those can be saved into
298             files in L<Storable> format. The set of relations is flexible; it
299             is read from the ISO dumpfiles. If it fits directly your needs, it's
300             probably a good choice; but if you need to adapt/extend it, it's not
301             totally obvious because all features are mingled into one monolithic
302             module.
303              
304             =item *
305              
306             L<Biblio::Thesaurus::SQLite> has an unclear status : it sits in the
307             same namespace as L<Biblio::Thesaurus>, and actually calls it in the
308             source code, but doesn't inherit or call it.
309             A separate API is provided for storing some thesaurus data into
310             an SQLite database; but the full features of L<Biblio::Thesaurus> are absent.
311              
312             =back
313              
314              
315             =head1 AUTHOR
316              
317             Laurent Dami, C<< <dami at cpan.org> >>
318              
319             =head1 BUGS
320              
321             Please report any bugs or feature requests to C<bug-lingua-thesaurus at rt.cpan.org>, or through
322             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Thesaurus>. I will be notified, and then you'll
323             automatically be notified of progress on your bug as I make changes.
324              
325              
326              
327             =head1 SUPPORT
328              
329             You can find documentation for this module with the perldoc command.
330              
331             perldoc Lingua::Thesaurus
332              
333              
334             You can also look for information at:
335              
336             =over 4
337              
338             =item * RT: CPAN's request tracker (report bugs here)
339              
340             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Thesaurus>
341              
342             =item * AnnoCPAN: Annotated CPAN documentation
343              
344             L<http://annocpan.org/dist/Lingua-Thesaurus>
345              
346             =item * CPAN Ratings
347              
348             L<http://cpanratings.perl.org/d/Lingua-Thesaurus>
349              
350             =item * Search MetaCPAN
351              
352             L<https://metacpan.org/module/Lingua::Thesaurus>
353              
354             =back
355              
356              
357             =head1 LICENSE AND COPYRIGHT
358              
359             Copyright 2013 Laurent Dami.
360              
361             This program is free software; you can redistribute it and/or modify it
362             under the terms of the the Artistic License (2.0). You may obtain a
363             copy of the full license at:
364              
365             L<http://www.perlfoundation.org/artistic_license_2_0>
366              
367             The test suite contains a short excerpt from the Swiss Jurivoc thesaurus,
368             copyright 1999-2012 Tribunal fédéral Suisse
369             (see L<http://www.bger.ch/fr/index/juridiction/jurisdiction-inherit-template/jurisdiction-jurivoc-home.htm>).
370              
371              
372             =head1 TODO
373              
374             =head2 Thesaurus
375              
376             - support for multiple thesauri files (a term belongs to one-to-many
377             thesaurus files; a relation belongs to exactly one thesaurus file)
378              
379             =head2 SQLite
380              
381             - use_unaccent without fulltext ==> use collation sequence or redefine LIKE
382             - store thesaurus name for each term
383             => adapt search_terms($pattern, $thes_name);
384              
385              
386              
387              
388             =cut
389              
390