File Coverage

blib/lib/RDF/Generator/Void.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package RDF::Generator::Void;
2              
3 1     1   20412 use 5.006;
  1         4  
  1         34  
4 1     1   6 use strict;
  1         1  
  1         29  
5 1     1   4 use warnings;
  1         6  
  1         31  
6 1     1   413 use Moose;
  0            
  0            
7             use Moose::Util::TypeConstraints;
8             use Data::UUID;
9             use RDF::Trine qw[iri literal blank variable statement];
10             use RDF::Generator::Void::Stats;
11             # use less ();
12             use utf8;
13             use URI::Split qw(uri_split uri_join);
14              
15             use aliased 'RDF::Generator::Void::Meta::Attribute::ObjectList';
16              
17             # Define some namespace prefixes, used below to build predicate and class IRIs
18             my $void = RDF::Trine::Namespace->new('http://rdfs.org/ns/void#'); # VoID vocabulary
19             my $rdf = RDF::Trine::Namespace->new('http://www.w3.org/1999/02/22-rdf-syntax-ns#'); # RDF syntax (rdf:type)
20             my $xsd = RDF::Trine::Namespace->new('http://www.w3.org/2001/XMLSchema#'); # XML Schema datatypes (xsd:integer)
21             my $dct = RDF::Trine::Namespace->new('http://purl.org/dc/terms/'); # Dublin Core terms (title, license)
22             my $prov = RDF::Trine::Namespace->new('http://www.w3.org/ns/prov#'); # PROV vocabulary (provenance statements)
23              
24             =head1 NAME
25              
26             RDF::Generator::Void - Generate VoID descriptions based on data in an RDF model
27              
28             =head1 VERSION
29              
30             Version 0.13
31              
32             =cut
33              
34             our $VERSION = '0.13'; # Also embedded in the release URI and label that generate() emits.
35              
36             =head1 SYNOPSIS
37              
38             use RDF::Generator::Void;
39             use RDF::Trine::Model;
40             my $mymodel = RDF::Trine::Model->temporary_model;
41             [add some data to $mymodel here]
42             my $generator = RDF::Generator::Void->new(inmodel => $mymodel);
43             $generator->urispace('http://example.org');
44             $generator->add_endpoints('http://example.org/sparql');
45             my $voidmodel = $generator->generate;
46              
47             =head1 DESCRIPTION
48              
49             This module takes a L<RDF::Trine::Model> object as input to the
50             constructor, and based on the data in that model as well as data
51             supplied by the user, it creates a new model with a VoID description
52             of the data in the model.
53              
54             For a description of VoID, see L<http://www.w3.org/TR/void/>.
55              
56             =head1 METHODS
57              
58             =head2 new(inmodel => $mymodel, dataset_uri => URI->new($dataset_uri), level => 1);
59              
60             The constructor. It can be called with three parameters: C<inmodel>,
61             which is the model we want to describe; C<dataset_uri>, which is the
62             URI we want to use for the description; and C<level>, which sets the
63             level of detail. Users should make sure the dataset URI can be
64             retrieved with HTTP. If this is not possible, you may leave it unset
65             so that a simple URN is created for you as a default.
66              
67             =head2 C<inmodel>
68              
69             Read-only accessor for the model used in description creation.
70              
71             =head2 C<dataset_uri>
72              
73             Read-only accessor for the URI to the dataset.
74              
75             =cut
76              
77             has inmodel => ( # The model to be described; must be passed to the constructor.
78             is => 'ro',
79             isa => 'RDF::Trine::Model',
80             required => 1,
81             );
82              
83             # This sets up the dataset_uri method, and makes it possible to
84             # create a resource for it from strings or URI objects.
85             class_type 'URI'; # Declare the URI class as a type so the coercion below can name it.
86              
87             subtype 'DatasetURI', # Accepts only RDF::Trine resource nodes or blank nodes.
88             as 'Object',
89             where { $_->isa('RDF::Trine::Node::Resource') || $_->isa('RDF::Trine::Node::Blank') };
90              
91             coerce 'DatasetURI',
92             from 'URI', via { iri("$_") }, # Stringify the URI object, then wrap it with iri().
93             from 'Str', via { iri($_) }; # Plain strings are passed to iri() directly.
94              
95             has dataset_uri => (
96             is => 'ro',
97             isa => 'DatasetURI',
98             lazy => 1, # Built on first access when no value was supplied.
99             builder => '_build_dataset_uri',
100             coerce => 1, # Apply the URI and Str coercions declared for DatasetURI.
101             );
102              
103             # This will create a URN with a UUID by default
104             sub _build_dataset_uri { # Builder named by the lazy dataset_uri attribute.
105             my ($self) = @_; # $self is unpacked but not used further.
106             return iri sprintf('urn:uuid:%s', Data::UUID->new->create_str); # Newly created UUID string, prefixed with urn:uuid: and passed to iri().
107             }
108              
109             =head2 Property Attributes
110              
111             The below attributes concern some essential properties in the VoID
112             vocabulary. They are mostly arrays, and can be manipulated using array
113             methods. Methods starting with C<all_> will return an array of unique
114             values. Methods starting with C<add_> take a list of values to add,
115             and those starting with C<has_no_> return a boolean value, false if
116             the array is empty.
117              
118             =head3 C<all_vocabularies>, C<add_vocabularies>, C<has_no_vocabularies>
119              
120             Methods to manipulate a list of vocabularies used in the dataset. The
121             values should be a string that represents the URI of a vocabulary.
122              
123             =cut
124              
125             # All the following attributes have in common that they automatically
126             # get the method names specified in handles, to manipulate and query
127             # the data (handles presumably come from the ObjectList trait -- TODO confirm).
128             has _vocabularies => ( traits => [ObjectList] ); # Filled by _generate_most_common_vocabs; emitted as void:vocabulary.
129              
130             =head3 C<all_endpoints>, C<add_endpoints>, C<has_no_endpoints>
131              
132             Methods to manipulate a list of SPARQL endpoints that can be used to
133             query the dataset. The values should be a string that represents the
134             URI of a SPARQL endpoint.
135              
136             =cut
137              
138              
139             has _endpoints => ( traits => [ObjectList] ); # generate() passes each value to iri() for void:sparqlEndpoint.
140              
141             =head3 C<all_titles>, C<add_titles>, C<has_no_titles>
142              
143             Methods to manipulate the titles of the datasets. The values should be
144             L<RDF::Trine::Node::Literal> objects, and should be set with
145             language. Typically, you would have a value per language.
146              
147             =cut
148              
149              
150             has _titles => ( # generate() emits each value unchanged as the object of dct:title.
151             traits => [ObjectList],
152             isa => 'ArrayRef[RDF::Trine::Node::Literal]',
153             );
154              
155              
156             =head3 C<all_licenses>, C<add_licenses>, C<has_no_licenses>
157              
158             Methods to manipulate a list of licenses that regulate the use of the
159             dataset. The values should be a string that represents the URI of a
160             license.
161              
162             =cut
163              
164             has _licenses => ( traits => [ObjectList] ); # generate() passes each value to iri() for dct:license.
165              
166             =head3 C<urispace>, C<has_urispace>
167              
168             This method is used to set the URI prefix string that will match the
169             entities in your dataset. The computation of the number of entities
170             depends on this being set. C<has_urispace> can be used to check if it
171             is set.
172              
173             =cut
174              
175             # There should only be a single uriSpace per Dataset (but there may be
176             # more for subsets), thus this is a simple scalar attribute.
177             has urispace => ( # When set, generate() emits it as a void:uriSpace literal.
178             is => 'rw',
179             isa => 'Str',
180             predicate => 'has_urispace',
181             );
182              
183             =head2 Running this stuff
184              
185             =head3 C<level>, C<has_level>
186              
187             Set the level of detail. 0 doesn't do any statistics or heuristics, 1
188             has some statistics for the dataset as a whole, 2 will give some
189             partition statistics and 3 will give subject and object counts for
190             property partitions. Setting no level will give everything.
191              
192             =cut
193              
194             has level => (is => 'rw', isa => 'Int', predicate => 'has_level'); # generate() returns early for level 0 and for level <= 1; unset means no early return.
195              
196              
197             =head3 C<stats>, C<clear_stats>, C<has_stats>
198              
199             Method to compute a statistical summary for the data in the dataset,
200             such as the number of entities, predicates, etc. C<clear_stats> will
201             clear the statistics and C<has_stats> will return true if they exist.
202              
203             =cut
204              
205             # In practice, this method just calls the ::Stats class to do
206             # everything.
207             has stats => (
208             is => 'rw',
209             isa => 'RDF::Generator::Void::Stats',
210             lazy => 1, # Not built until first read; the _generate_* helpers test has_stats.
211             builder => '_build_stats',
212             clearer => 'clear_stats',
213             predicate => 'has_stats',
214             );
215              
216             sub _build_stats { # Builder named by the lazy stats attribute.
217             my ($self) = @_;
218             return RDF::Generator::Void::Stats->new(generator => $self); # The Stats object receives this generator as its 'generator' argument.
219             }
220              
221              
222             =head3 generate( [ $model ] )
223              
224             Returns the VoID as an RDF::Trine::Model. You may pass a model with
225             statements as argument to this method. The VoID description is added
226             to that model, which may thus contain arbitrary RDF of your own. If
227             you do not send a model, a temporary one will be created for you.
228              
229             =cut
230              
231             sub generate { # Add the VoID description to a model and return that model.
232             my $self = shift;
233             my $void_model = shift || RDF::Trine::Model->temporary_model; # Optional caller-supplied model; otherwise a temporary one.
234              
235             local $self->{void_model} = $void_model; # The _generate_* helpers read the model from here; local restores the slot on exit.
236              
237             # Start generating the actual VoID statements
238             $void_model->add_statement(statement(
239             $self->dataset_uri,
240             $rdf->type,
241             $void->Dataset,
242             ));
243              
244             my ($scheme, $auth, $path, $query, $frag) = uri_split($self->dataset_uri->uri_value);
245             if ($frag) { # Then, we have a document that could be described with provenance
246             my $uri = iri(uri_join($scheme, $auth, $path, $query, undef)); # The dataset URI with its fragment removed.
247             my $blank = blank();
248             $void_model->add_statement(statement($uri,
249             $prov->wasGeneratedBy,
250             $blank));
251             (my $ver = $VERSION) =~ s/\./-/; # Copy of $VERSION with the first '.' replaced by '-'.
252             my $release_uri = iri("http://purl.org/NET/cpan-uri/dist/RDF-Generator-Void/v_$ver");
253             $void_model->add_statement(statement($blank,
254             $prov->wasAssociatedWith,
255             $release_uri));
256             $void_model->add_statement(statement($release_uri,
257             $rdf->type,
258             $prov->SoftwareAgent));
259             $void_model->add_statement(statement($release_uri,
260             iri('http://www.w3.org/2000/01/rdf-schema#label'),
261             literal("RDF::Generator::Void, Version $VERSION", 'en')));
262             }
263              
264              
265             foreach my $endpoint ($self->all_endpoints) { # One void:sparqlEndpoint statement per endpoint.
266             $void_model->add_statement(statement(
267             $self->dataset_uri,
268             $void->sparqlEndpoint,
269             iri($endpoint)
270             ));
271             }
272              
273             foreach my $title ($self->all_titles) { # Titles are used as-is as the object of dct:title.
274             $void_model->add_statement(statement(
275             $self->dataset_uri,
276             $dct->title,
277             $title
278             ));
279             }
280            
281             foreach my $license ($self->all_licenses) { # One dct:license statement per license.
282             $void_model->add_statement(statement(
283             $self->dataset_uri,
284             $dct->license,
285             iri($license)
286             ));
287             }
288              
289              
290             $void_model->add_statement(statement( # void:triples is emitted at every level, from the input model's size.
291             $self->dataset_uri,
292             $void->triples,
293             literal($self->inmodel->size, undef, $xsd->integer),
294             ));
295              
296             if ($self->has_urispace) {
297             $void_model->add_statement(statement(
298             $self->dataset_uri,
299             $void->uriSpace,
300             literal($self->urispace)
301             ));
302             return $void_model if ($self->has_level && ($self->level == 0)); # Level 0: stop before reading any statistics.
303             $self->_generate_counts($void->entities, $self->stats->entities); # Reading $self->stats triggers the lazy builder.
304             }
305              
306             return $void_model if ($self->has_level && $self->level == 0); # Same level-0 exit for the case where no urispace is set.
307             $self->_generate_counts($void->distinctSubjects, $self->stats->subjects);
308             $self->_generate_counts($void->properties, $self->stats->properties);
309             $self->_generate_counts($void->distinctObjects, $self->stats->objects);
310              
311             $self->_generate_most_common_vocabs($self->stats) if $self->has_stats; # The callee unpacks only $self, so the argument is unused.
312              
313             return $void_model if ($self->has_level && $self->level <= 1); # Partitions are emitted only when level is unset or above 1.
314              
315             $self->_generate_propertypartitions;
316             $self->_generate_classpartitions;
317             return $void_model;
318             }
319              
320             sub _generate_counts { # Add one integer-typed count statement about the dataset.
321             my ($self, $predicate, $count) = @_; # $predicate: property node to use; $count: value for the literal.
322             return undef unless $self->has_stats; # Does nothing when no stats object has been built.
323             $self->{void_model}->add_statement(statement( # Writes to the model that generate() stored on the object.
324             $self->dataset_uri,
325             $predicate,
326             literal($count, undef, $xsd->integer),
327             ));
328             }
329              
330             sub _generate_propertypartitions { # Add one void:propertyPartition per entry in the stats' propertyPartitions.
331             my ($self) = @_;
332             return undef unless $self->has_stats; # Does nothing when no stats object has been built.
333             my $properties = $self->stats->propertyPartitions; # Used below as a hash ref: property URI => hash of counts.
334             while (my ($uri, $counts) = each(%{$properties})) {
335             my $blank = blank(); # Each partition is described by its own blank node.
336             $self->{void_model}->add_statement(statement(
337             $self->dataset_uri,
338             $void->propertyPartition,
339             $blank));
340             $self->{void_model}->add_statement(statement($blank,
341             $void->property,
342             iri($uri)));
343             $self->{void_model}->add_statement(statement($blank,
344             $void->triples,
345             literal($counts->{'triples'}, undef, $xsd->integer)));
346             # OK, so sometimes, one has to balance elegance and performance...
347             if ($counts->{'countsubjects'}) { # Only 'countsubjects' is tested; 'countobjects' is read in the same branch.
348             $self->{void_model}->add_statement(statement($blank,
349             $void->distinctSubjects,
350             literal(scalar keys %{$counts->{'countsubjects'}}, undef, $xsd->integer))); # The count is the number of keys in the hash.
351             $self->{void_model}->add_statement(statement($blank,
352             $void->distinctObjects,
353             literal(scalar keys %{$counts->{'countobjects'}}, undef, $xsd->integer)));
354             }
355              
356            
357              
358             }
359             }
360              
361             sub _generate_classpartitions { # Add one void:classPartition per entry in the stats' classPartitions.
362             my ($self) = @_;
363             return undef unless $self->has_stats; # Does nothing when no stats object has been built.
364             my $classes = $self->stats->classPartitions; # Used below as a hash ref: class URI => count.
365             while (my ($uri, $count) = each(%{$classes})) {
366             my $blank = blank(); # Each partition is described by its own blank node.
367             $self->{void_model}->add_statement(statement(
368             $self->dataset_uri,
369             $void->classPartition,
370             $blank));
371             $self->{void_model}->add_statement(statement($blank,
372             $void->class,
373             iri($uri)));
374             $self->{void_model}->add_statement(statement($blank,
375             $void->triples,
376             literal($count, undef, $xsd->integer)));
377             }
378             }
379              
380             sub _generate_most_common_vocabs { # Record frequently used vocabularies and emit void:vocabulary statements.
381             my ($self) = @_; # Any further arguments passed by the caller are ignored.
382              
383             # Which vocabularies are most commonly used for predicates in the
384             # dataset? Vocabularies used for less than 1% of triples need not
385             # apply.
386             my $threshold = $self->inmodel->size / 100; # 1% of the input model's size.
387             my %vocabs = %{ $self->stats->vocabularies }; # Values are compared numerically against the threshold below.
388             $self->add_vocabularies(grep { $vocabs{$_} > $threshold } keys %vocabs); # Strictly greater than the threshold; adds to the object's own list.
389            
390             foreach my $vocab ($self->all_vocabularies) { # Iterates the whole list, not only the entries added just above.
391             $self->{void_model}->add_statement(statement(
392             $self->dataset_uri,
393             $void->vocabulary,
394             iri($vocab),
395             ));
396             }
397             }
398              
399              
400             =head1 AUTHORS
401              
402             Kjetil Kjernsmo C<< <kjetilk@cpan.org> >>
403             Toby Inkster C<< <tobyink@cpan.org> >>
404              
405             =head1 TODO
406              
407             =over
408              
409             =item * URI regexps support.
410              
411             =item * Technical features (esp. serializations).
412              
413             =item * Example resources and root resources.
414              
415             =item * Data dumps.
416              
417             =item * Subject classification.
418              
419             =item * Method to disable heuristics.
420              
421             =item * More heuristics.
422              
423             =item * Linkset descriptions.
424              
425             =item * Set URI space on partitions.
426              
427             =item * Use L<CHI> to cache?
428              
429             =item * Use schema introspection to generate property attributes with L<MooseX::Semantics>.
430              
431              
432              
433             =back
434              
435              
436             =head1 BUGS
437              
438             Please report any bugs you find to L<https://github.com/kjetilk/RDF-Generator-Void/issues>
439              
440             Note that any claim that this module will generate a void in
441             spacetime, a wormhole, black hole, or funny philosophy is totally
442             bogus and without any scientific merit whatsoever. The lead author has
443             made elaborate precautions to avoid any such issues, and expects
444             everyone to take his word for it. Oh, BTW, should it just happen
445             anyway, it won't L<hurt much|http://news.sciencemag.org/sciencenow/2012/03/scienceshot-one-black-hole-wont-.html>.
446              
447              
448             =head1 SUPPORT
449              
450             You can find documentation for this module with the perldoc command.
451              
452             perldoc RDF::Generator::Void
453              
454             The Perl and RDF community website is at L<http://www.perlrdf.org/>
455             where you can also find a mailing list to direct questions to.
456              
457             You can also look for information at:
458              
459             =over 4
460              
461             =item * AnnoCPAN: Annotated CPAN documentation
462              
463             L<http://annocpan.org/dist/RDF-Generator-Void>
464              
465             =item * CPAN Ratings
466              
467             L<http://cpanratings.perl.org/d/RDF-Generator-Void>
468              
469             =item * MetaCPAN
470              
471             L<https://metacpan.org/module/RDF::Generator::Void>
472              
473             =back
474              
475              
476             =head1 ACKNOWLEDGEMENTS
477              
478             Many thanks to Konstantin Baierer for help with L<RDF::Generator::Void::Meta::Attribute::ObjectList>.
479              
480             =head1 LICENSE AND COPYRIGHT
481              
482             Copyright 2012 Toby Inkster.
483             Copyright 2012-2013 Kjetil Kjernsmo.
484              
485             This program is free software; you can redistribute it and/or modify it
486             under the terms of either: the GNU General Public License as published
487             by the Free Software Foundation; or the Artistic License.
488              
489             See http://dev.perl.org/licenses/ for more information.
490              
491              
492             =cut
493              
494             1; # End of RDF::Generator::Void