File Coverage

blib/lib/RDF/Generator/Void.pm
Criterion Covered Total %
statement 126 128 98.4
branch 16 22 72.7
condition 13 15 86.6
subroutine 18 19 94.7
pod 1 1 100.0
total 174 185 94.0


line stmt bran cond sub pod time code
1             package RDF::Generator::Void;
2              
3 5     5   53503 use 5.006;
  5         21  
4 5     5   17 use strict;
  5         5  
  5         78  
5 5     5   14 use warnings;
  5         13  
  5         97  
6 5     5   501 use Moose;
  5         298575  
  5         39  
7 5     5   24092 use Moose::Util::TypeConstraints;
  5         7  
  5         41  
8 5     5   6495 use Data::UUID;
  5         604  
  5         304  
9 5     5   489 use RDF::Trine qw[iri literal blank variable statement];
  5         132069  
  5         284  
10 5     5   2017 use RDF::Generator::Void::Stats;
  5         13  
  5         198  
11             # use less ();
12 5     5   34 use utf8;
  5         6  
  5         39  
13 5     5   2642 use URI::Split qw(uri_split uri_join);
  5         2744  
  5         312  
14 5     5   25 use Progress::Any;
  5         6  
  5         42  
15              
16 5     5   2113 use aliased 'RDF::Generator::Void::Meta::Attribute::ObjectList';
  5         2545  
  5         21  
17              
18             # Define some namespace prefixes
19             my $void = RDF::Trine::Namespace->new('http://rdfs.org/ns/void#');
20             my $rdf = RDF::Trine::Namespace->new('http://www.w3.org/1999/02/22-rdf-syntax-ns#');
21             my $xsd = RDF::Trine::Namespace->new('http://www.w3.org/2001/XMLSchema#');
22             my $dct = RDF::Trine::Namespace->new('http://purl.org/dc/terms/');
23             my $prov = RDF::Trine::Namespace->new('http://www.w3.org/ns/prov#');
24              
25             =head1 NAME
26              
27             RDF::Generator::Void - Generate VoID descriptions based on data in an RDF model
28              
29             =head1 VERSION
30              
31             Version 0.13_1
32              
33             =cut
34              
35             our $VERSION = '0.13_1';
36              
37             =head1 SYNOPSIS
38              
39             use RDF::Generator::Void;
40             use RDF::Trine::Model;
41             my $mymodel = RDF::Trine::Model->temporary_model;
42             [add some data to $mymodel here]
43             my $generator = RDF::Generator::Void->new(inmodel => $mymodel);
44             $generator->urispace('http://example.org');
45             $generator->add_endpoints('http://example.org/sparql');
46             my $voidmodel = $generator->generate;
47              
48             =head1 DESCRIPTION
49              
50             This module takes a L<RDF::Trine::Model> object as input to the
51             constructor, and based on the data in that model as well as data
52             supplied by the user, it creates a new model with a VoID description
53             of the data in the model.
54              
55             For a description of VoID, see L<http://www.w3.org/TR/void/>.
56              
57             =head1 METHODS
58              
59             =head2 new(inmodel => $mymodel, dataset_uri => URI->new($dataset_uri), level => 1);
60              
61             The constructor. It can be called with three parameters: C<inmodel>,
62             the model we want to describe; C<dataset_uri>, the URI we want to use
63             for the description; and C<level>, the level of detail (see C<level>
64             below). Users should make sure the dataset URI can be retrieved with
65             HTTP. If this is not possible, you may omit C<dataset_uri>, and a
66             simple URN will be created for you as a default.
67              
68             =head2 C<inmodel>
69              
70             Read-only accessor for the model used in description creation.
71              
72             =head2 C<dataset_uri>
73              
74             Read-only accessor for the URI to the dataset.
75              
76             =cut
77              
78             has inmodel => (
79             is => 'ro',
80             isa => 'RDF::Trine::Model',
81             required => 1,
82             );
83              
84             # This sets up the dataset_uri attribute, and makes it possible to
85             # create a resource for it from strings or URI objects.
86             class_type 'URI';
87              
88             subtype 'DatasetURI',
89             as 'Object',
90             where { $_->isa('RDF::Trine::Node::Resource') || $_->isa('RDF::Trine::Node::Blank') };
91              
92             coerce 'DatasetURI',
93             from 'URI', via { iri("$_") },
94             from 'Str', via { iri($_) };
95              
96             has dataset_uri => (
97             is => 'ro',
98             isa => 'DatasetURI',
99             lazy => 1,
100             builder => '_build_dataset_uri',
101             coerce => 1,
102             );
103              
104             our $progress; # Declared for everything in here
105              
106              
107             # This will create a URN with a UUID by default
108             sub _build_dataset_uri {
109 0     0   0 my ($self) = @_;
110 0         0 return iri sprintf('urn:uuid:%s', Data::UUID->new->create_str);
111             }
112              
113             =head2 Property Attributes
114              
115             The below attributes concern some essential properties in the VoID
116             vocabulary. They are mostly arrays, and can be manipulated using array
117             methods. Methods starting with C<all_> will return an array of unique
118             values. Methods starting with C<add_> take a list of values to add,
119             and those starting with C<has_no_> return a boolean value, false if
120             the array is empty.
121              
122             =head3 C<all_vocabularies>, C<add_vocabularies>, C<has_no_vocabularies>
123              
124             Methods to manipulate a list of vocabularies used in the dataset. The
125             values should be a string that represents the URI of a vocabulary.
126              
127             =cut
128              
129             # All the following attributes have in common that they
130             # automatically get the method names specified in handles, to
131             # manipulate and query the data.
132             has _vocabularies => ( traits => [ObjectList] );
133              
134             =head3 C<all_endpoints>, C<add_endpoints>, C<has_no_endpoints>
135              
136             Methods to manipulate a list of SPARQL endpoints that can be used to
137             query the dataset. The values should be a string that represents the
138             URI of a SPARQL endpoint.
139              
140             =cut
141              
142              
143             has _endpoints => ( traits => [ObjectList] );
144              
145             =head3 C<all_titles>, C<add_titles>, C<has_no_titles>
146              
147             Methods to manipulate the titles of the datasets. The values should be
148             L<RDF::Trine::Node::Literal> objects, and should be set with
149             language. Typically, you would have a value per language.
150              
151             =cut
152              
153              
154             has _titles => (
155             traits => [ObjectList],
156             isa => 'ArrayRef[RDF::Trine::Node::Literal]',
157             );
158              
159              
160             =head3 C<all_licenses>, C<add_licenses>, C<has_no_licenses>
161              
162             Methods to manipulate a list of licenses that regulate the use of the
163             dataset. The values should be a string that represents the URI of a
164             license.
165              
166             =cut
167              
168             has _licenses => ( traits => [ObjectList] );
169              
170             =head3 C<urispace>, C<has_urispace>
171              
172             This method is used to set the URI prefix string that will match the
173             entities in your dataset. The computation of the number of entities
174             depends on this being set. C<has_urispace> can be used to check if it
175             is set.
176              
177             =cut
178              
179             # There should only be a single uriSpace per Dataset (but there may be
180             # more for subsets), thus this is a simple scalar attribute.
181             has urispace => (
182             is => 'rw',
183             isa => 'Str',
184             predicate => 'has_urispace',
185             );
186              
187             =head2 Running this stuff
188              
189             =head3 C<level>, C<has_level>
190              
191             Set the level of detail. 0 doesn't do any statistics or heuristics, 1
192             has some statistics for the dataset as a whole, 2 will give some
193             partition statistics and 3 will give subject and object counts for
194             property partitions. Setting no level will give everything.
195              
196             =cut
197              
198             has level => (is => 'rw', isa => 'Int', predicate => 'has_level');
199              
200              
201             =head3 C<stats>, C<clear_stats>, C<has_stats>
202              
203             Method to compute a statistical summary for the data in the dataset,
204             such as the number of entities, predicates, etc. C<clear_stats> will
205             clear the statistics and C<has_stats> will return true if they exist.
206              
207             =cut
208              
209             # In practice, this method just calls the ::Stats class to do
210             # everything.
211             has stats => (
212             is => 'rw',
213             isa => 'RDF::Generator::Void::Stats',
214             lazy => 1,
215             builder => '_build_stats',
216             clearer => 'clear_stats',
217             predicate => 'has_stats',
218             );
219              
220             sub _build_stats {
221 6     6   17 my ($self) = @_;
222 6         48 return RDF::Generator::Void::Stats->new(generator => $self);
223             }
224              
225              
226             =head3 generate( [ $model ] )
227              
228             Returns the VoID as an RDF::Trine::Model. You may pass a model as
229             argument to this method. That model may already contain arbitrary
230             RDF; the VoID statements are added to it and it is returned. If you
231             do not pass a model, a temporary one will be created for you.
232              
233             =cut
234              
235             sub generate {
236 13     13 1 19530 my $self = shift;
237 13   66     88 my $void_model = shift || RDF::Trine::Model->temporary_model;
238 13         642 local $progress = Progress::Any->get_indicator(task => "compute");
239 13         479 $progress->pos(0);
240 13         326 my $target_size = 11;
241 13 100 100     439 if ($self->has_level && ($self->level > 0)) {
242 10         232 $target_size += $self->inmodel->size;
243             }
244 13         488 $progress->target($target_size);
245 13         258 $progress->update(message => "Adding base statements");
246 13         529 local $self->{void_model} = $void_model;
247              
248             # Start generating the actual VoID statements
249 13         349 $void_model->add_statement(statement(
250             $self->dataset_uri,
251             $rdf->type,
252             $void->Dataset,
253             ));
254 13         8840 $progress->update(message => "Adding base statements");
255              
256 13         784 my ($scheme, $auth, $path, $query, $frag) = uri_split($self->dataset_uri->uri_value);
257 13 100       166 if ($frag) { # Then, we have a document that could be described with provenance
258 7         19 my $uri = iri(uri_join($scheme, $auth, $path, $query, undef));
259 7         214 my $blank = blank();
260 7         416 $void_model->add_statement(statement($uri,
261             $prov->wasGeneratedBy,
262             $blank));
263 7         3721 (my $ver = $VERSION) =~ s/\./-/;
264 7         23 my $release_uri = iri("http://purl.org/NET/cpan-uri/dist/RDF-Generator-Void/v_$ver");
265 7         121 $void_model->add_statement(statement($blank,
266             $prov->wasAssociatedWith,
267             $release_uri));
268 7         3604 $void_model->add_statement(statement($release_uri,
269             $rdf->type,
270             $prov->SoftwareAgent));
271 7         3582 $void_model->add_statement(statement($release_uri,
272             iri('http://www.w3.org/2000/01/rdf-schema#label'),
273             literal("RDF::Generator::Void, Version $VERSION", 'en')));
274 7         3061 $progress->update(message => "Adding provenance statements");
275             }
276              
277 13         683 foreach my $endpoint ($self->all_endpoints) {
278 6         139 $void_model->add_statement(statement(
279             $self->dataset_uri,
280             $void->sparqlEndpoint,
281             iri($endpoint)
282             ));
283             }
284              
285 13         3398 foreach my $title ($self->all_titles) {
286 10         2739 $void_model->add_statement(statement(
287             $self->dataset_uri,
288             $dct->title,
289             $title
290             ));
291             }
292            
293 13         2609 foreach my $license ($self->all_licenses) {
294 4         93 $void_model->add_statement(statement(
295             $self->dataset_uri,
296             $dct->license,
297             iri($license)
298             ));
299             }
300              
301 13         1996 $progress->update(message => "Adding user-set statements");
302              
303 13         761 $void_model->add_statement(statement(
304             $self->dataset_uri,
305             $void->triples,
306             literal($self->inmodel->size, undef, $xsd->integer),
307             ));
308              
309 13         6752 $progress->update(message => "Adding base statements");
310 13 50       969 if ($self->has_urispace) {
311 13         301 $void_model->add_statement(statement(
312             $self->dataset_uri,
313             $void->uriSpace,
314             literal($self->urispace)
315             ));
316 13 100 100     6041 return $void_model if ($self->has_level && ($self->level == 0));
317 12         75 $self->_generate_counts($void->entities, $self->stats->entities);
318             }
319              
320              
321 12 50 66     821 return $void_model if ($self->has_level && $self->level == 0);
322 12         77 $self->_generate_counts($void->distinctSubjects, $self->stats->subjects);
323 12         498 $self->_generate_counts($void->properties, $self->stats->properties);
324 12         496 $self->_generate_counts($void->distinctObjects, $self->stats->objects);
325              
326 12 50       791 $self->_generate_most_common_vocabs($self->stats) if $self->has_stats;
327              
328 12 100 100     783 return $void_model if ($self->has_level && $self->level <= 1);
329              
330 4         7 $target_size += scalar(keys(%{$self->stats->propertyPartitions}));
  4         90  
331 4         7 $target_size += scalar(keys(%{$self->stats->classPartitions}));
  4         92  
332 4         17 $progress->target($target_size);
333              
334 4         83 $self->_generate_propertypartitions;
335 4         15 $self->_generate_classpartitions;
336 4         14 $progress->update(message => "Finishing");
337              
338 4         114 return $void_model;
339             }
340              
341             sub _generate_counts {
342 48     48   68 my ($self, $predicate, $count) = @_;
343 48 50       1191 return undef unless $self->has_stats;
344 48         1345 $self->{void_model}->add_statement(statement(
345             $self->dataset_uri,
346             $predicate,
347             literal($count, undef, $xsd->integer),
348             ));
349 48         22203 $progress->update(message => "Adding counts statements");
350             }
351              
352             sub _generate_propertypartitions {
353 4     4   7 my ($self) = @_;
354 4 50       103 return undef unless $self->has_stats;
355 4         87 my $properties = $self->stats->propertyPartitions;
356 4         6 while (my ($uri, $counts) = each(%{$properties})) {
  30         1090  
357 26         70 my $blank = blank();
358 26         1590 $self->{void_model}->add_statement(statement(
359             $self->dataset_uri,
360             $void->propertyPartition,
361             $blank));
362 26         10857 $self->{void_model}->add_statement(statement($blank,
363             $void->property,
364             iri($uri)));
365             $self->{void_model}->add_statement(statement($blank,
366             $void->triples,
367 26         13689 literal($counts->{'triples'}, undef, $xsd->integer)));
368             # OK, so sometimes, one has to balance elegance and performance...
369 26 100       11805 if ($counts->{'countsubjects'}) {
370             $self->{void_model}->add_statement(statement($blank,
371             $void->distinctSubjects,
372 23         243 literal(scalar keys %{$counts->{'countsubjects'}}, undef, $xsd->integer)));
  23         671  
373             $self->{void_model}->add_statement(statement($blank,
374             $void->distinctObjects,
375 23         9727 literal(scalar keys %{$counts->{'countobjects'}}, undef, $xsd->integer)));
  23         792  
376             }
377              
378 26         9667 $progress->update(message => "Adding property partition statements");
379              
380             }
381             }
382              
383             sub _generate_classpartitions {
384 4     4   7 my ($self) = @_;
385 4 50       134 return undef unless $self->has_stats;
386 4         97 my $classes = $self->stats->classPartitions;
387 4         6 while (my ($uri, $count) = each(%{$classes})) {
  13         392  
388 9         22 my $blank = blank();
389 9         465 $self->{void_model}->add_statement(statement(
390             $self->dataset_uri,
391             $void->classPartition,
392             $blank));
393 9         3892 $self->{void_model}->add_statement(statement($blank,
394             $void->class,
395             iri($uri)));
396 9         4953 $self->{void_model}->add_statement(statement($blank,
397             $void->triples,
398             literal($count, undef, $xsd->integer)));
399 9         4050 $progress->update(message => "Adding class partition statements");
400             }
401             }
402              
403             sub _generate_most_common_vocabs {
404 12     12   16 my ($self) = @_;
405              
406             # Which vocabularies are most commonly used for predicates in the
407             # dataset? Vocabularies used for less than 1% of triples need not
408             # apply.
409 12         263 my $threshold = $self->inmodel->size / 100;
410 12         502 my %vocabs = %{ $self->stats->vocabularies };
  12         278  
411 12         31 $self->add_vocabularies(grep { $vocabs{$_} > $threshold } keys %vocabs);
  39         423  
412            
413 12         360 foreach my $vocab ($self->all_vocabularies) {
414 39         14573 $self->{void_model}->add_statement(statement(
415             $self->dataset_uri,
416             $void->vocabulary,
417             iri($vocab),
418             ));
419             }
420 12         5747 $progress->update(message => "Adding vocabulary statements");
421            
422             }
423              
424              
425             =head1 AUTHORS
426              
427             Kjetil Kjernsmo C<< <kjetilk@cpan.org> >>
428             Toby Inkster C<< <tobyink@cpan.org> >>
429              
430             =head1 TODO
431              
432             =over
433              
434             =item * URI regexps support.
435              
436             =item * Technical features (esp. serializations).
437              
438             =item * Example resources and root resources.
439              
440             =item * Data dumps.
441              
442             =item * Subject classification.
443              
444             =item * Method to disable heuristics.
445              
446             =item * More heuristics.
447              
448             =item * Linkset descriptions.
449              
450             =item * Set URI space on partitions.
451              
452             =item * Use L<CHI> to cache?
453              
454             =item * Use schema introspection to generate property attributes with L<MooseX::Semantics>.
455              
456              
457              
458             =back
459              
460              
461             =head1 BUGS
462              
463             Please report any bugs you find to L<https://github.com/kjetilk/RDF-Generator-Void/issues>
464              
465             Note that any claim that this module will generate a void in
466             spacetime, a wormhole, black hole, or funny philosophy is totally
467             bogus and without any scientific merit whatsoever. The lead author has
468             made elaborate precautions to avoid any such issues, and expects
469             everyone to take his word for it. Oh, BTW, should it just happen
470             anyway, it won't L<hurt much|http://news.sciencemag.org/sciencenow/2012/03/scienceshot-one-black-hole-wont-.html>.
471              
472              
473             =head1 SUPPORT
474              
475             You can find documentation for this module with the perldoc command.
476              
477             perldoc RDF::Generator::Void
478              
479             The Perl and RDF community website is at L<http://www.perlrdf.org/>
480             where you can also find a mailing list to direct questions to.
481              
482             You can also look for information at:
483              
484             =over 4
485              
486             =item * AnnoCPAN: Annotated CPAN documentation
487              
488             L<http://annocpan.org/dist/RDF-Generator-Void>
489              
490             =item * CPAN Ratings
491              
492             L<http://cpanratings.perl.org/d/RDF-Generator-Void>
493              
494             =item * MetaCPAN
495              
496             L<https://metacpan.org/module/RDF::Generator::Void>
497              
498             =back
499              
500              
501             =head1 ACKNOWLEDGEMENTS
502              
503             Many thanks to Konstantin Baierer for help with L<RDF::Generator::Void::Meta::Attribute::ObjectList>.
504              
505             =head1 LICENSE AND COPYRIGHT
506              
507             Copyright 2012 Toby Inkster.
508             Copyright 2012-2013 Kjetil Kjernsmo.
509              
510             This program is free software; you can redistribute it and/or modify it
511             under the terms of either: the GNU General Public License as published
512             by the Free Software Foundation; or the Artistic License.
513              
514             See http://dev.perl.org/licenses/ for more information.
515              
516              
517             =cut
518              
519             1; # End of RDF::Generator::Void