File Coverage

blib/lib/RDF/Trine/Pattern.pm
Criterion Covered Total %
statement 184 193 95.3
branch 32 40 80.0
condition 5 15 33.3
subroutine 28 28 100.0
pod 14 14 100.0
total 263 290 90.6


line stmt bran cond sub pod time code
1             # RDF::Trine::Pattern
2             # -----------------------------------------------------------------------------
3              
4             =head1 NAME
5              
6             RDF::Trine::Pattern - Class for basic graph patterns
7              
8             =head1 VERSION
9              
10             This document describes RDF::Trine::Pattern version 1.018
11              
12             =cut
13              
14             package RDF::Trine::Pattern;
15              
16 68     68   401 use strict;
  68         144  
  68         1527  
17 68     68   305 use warnings;
  68         136  
  68         1465  
18 68     68   309 no warnings 'redefine';
  68         139  
  68         1681  
19              
20 68     68   349 use Data::Dumper;
  68         142  
  68         2530  
21 68     68   360 use Log::Log4perl;
  68         159  
  68         398  
22 68     68   3603 use Scalar::Util qw(blessed refaddr);
  68         162  
  68         2778  
23 68     68   392 use List::Util qw(any);
  68         158  
  68         3479  
24 68     68   377 use Carp qw(carp croak confess);
  68         173  
  68         3099  
25 68     68   369 use RDF::Trine::Iterator qw(smap);
  68         147  
  68         2324  
26 68     68   422 use RDF::Trine qw(iri);
  68         140  
  68         3099  
27              
28             ######################################################################
29              
30             our ($VERSION);
31             BEGIN {
32 68     68   93756 $VERSION = '1.018';
33             }
34              
35             ######################################################################
36              
37             =head1 METHODS
38              
39             =over 4
40              
41             =item C<< new ( @triples ) >>
42              
43             Returns a new BasicGraphPattern structure.
44              
45             =cut
46              
47             sub new {
48 990     990 1 7933 my $class = shift;
49 990         2125 my @triples = @_;
50 990         1798 foreach my $t (@triples) {
51 1038 100 100     7515 unless (blessed($t) and $t->isa('RDF::Trine::Statement')) {
52 2         17 throw RDF::Trine::Error -text => "Patterns belonging to a BGP must be triples";
53             }
54             }
55 988         3211 return bless( [ @triples ], $class );
56             }
57              
58             =item C<< construct_args >>
59              
60             Returns a list of arguments that, passed to this class' constructor,
61             will produce a clone of this algebra pattern.
62              
63             =cut
64              
65             sub construct_args {
66 1     1 1 501 my $self = shift;
67 1         4 return ($self->triples);
68             }
69              
70             =item C<< triples >>
71              
72             Returns a list of triples belonging to this BGP.
73              
74             =cut
75              
76             sub triples {
77 2950     2950 1 4998 my $self = shift;
78 2950         8546 return @$self;
79             }
80              
81             =item C<< type >>
82              
83             =cut
84              
85             sub type {
86 25     25 1 81 return 'BGP';
87             }
88              
89             =item C<< sse >>
90              
91             Returns the SSE string for this algebra expression.
92              
93             =cut
94              
95             sub sse {
96 31     31 1 81 my $self = shift;
97 31         70 my $context = shift;
98            
99             return sprintf(
100             '(bgp %s)',
101 31         117 join(' ', map { $_->sse( $context ) } $self->triples)
  54         235  
102             );
103             }
104              
105             =item C<< referenced_variables >>
106              
107             Returns a list of the variable names used in this algebra expression.
108              
109             =cut
110              
111             sub referenced_variables {
112 65     65 1 135 my $self = shift;
113 65         192 return RDF::Trine::_uniq(map { $_->referenced_variables } $self->triples);
  118         366  
114             }
115              
116             =item C<< definite_variables >>
117              
118             Returns a list of the variable names that will be bound after evaluating this algebra expression.
119              
120             =cut
121              
122             sub definite_variables {
123 1     1 1 3 my $self = shift;
124 1         3 return RDF::Trine::_uniq(map { $_->definite_variables } $self->triples);
  1         4  
125             }
126              
127             =item C<< clone >>
128              
129             =cut
130              
131             sub clone {
132 1     1 1 3 my $self = shift;
133 1         2 my $class = ref($self);
134 1         3 return $class->new( map { $_->clone } $self->triples );
  1         4  
135             }
136              
137             =item C<< bind_variables ( \%bound ) >>
138              
139             Returns a new pattern with variables named in %bound replaced by their corresponding bound values.
140              
141             =cut
142              
143             sub bind_variables {
144 1     1 1 3 my $self = shift;
145 1         2 my $class = ref($self);
146 1         2 my $bound = shift;
147 1         4 return $class->new( map { $_->bind_variables( $bound ) } $self->triples );
  1         3  
148             }
149              
150             =item C<< subsumes ( $statement ) >>
151              
152             Returns true if the pattern will subsume the $statement when matched against a
153             triple store.
154              
155             =cut
156              
157             sub subsumes {
158 5     5 1 436 my $self = shift;
159 5         8 my $st = shift;
160            
161 5         23 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
162 5         538 my @triples = $self->triples;
163 5         10 foreach my $t (@triples) {
164 5 100       16 if ($t->subsumes( $st )) {
165 4         10 $l->debug($self->sse . " \x{2292} " . $st->sse);
166 4         46 return 1;
167             }
168             }
169 1         5 return 0;
170             }
171              
172             =item C<< merge_patterns ( @patterns ) >>
173              
174             Given an array of patterns, this will merge them into one.
175              
176             =cut
177              
178             sub merge_patterns {
179 12     12 1 38 my ($class, @patterns) = @_;
180 12         21 my @all_triples;
181 12         24 foreach my $pattern (@patterns) {
182 12 50 33     84 unless (blessed($pattern) and $pattern->isa('RDF::Trine::Pattern')) {
183 0         0 throw RDF::Trine::Error -text => "Patterns to be merged must be patterns themselves";
184             }
185 12         39 push(@all_triples, $pattern->triples);
186             }
187 12         41 return $class->new(@all_triples);
188             }
189              
190             =item C<< sort_for_join_variables >>
191              
192             Returns a new pattern object with the subpatterns of the referrant
193             sorted based on heuristics that ensure firstly that patterns can be
194             joined on the same variable and secondly on the usual selectivity
195             (i.e. how quickly the engine can drill down to the answer) of triple
196             patterns. Calls C<< subgroup >>, C<< sort_triples >> and C<<
197             merge_patterns >> in that order.
198              
199             =cut
200              
201             sub sort_for_join_variables {
202 914     914 1 1586 my $self = shift;
203 914 100       1913 return $self if (scalar $self->triples == 1);
204              
205 12         30 my $class = ref($self);
206 12         96 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
207 12         2522 $l->debug('Reordering ' . scalar $self->triples . ' triples for heuristical optimizations');
208              
209 12         128 my @sorted_triple_patterns = $self->subgroup;
210              
211 12         20 my @patterns;
212 12         31 foreach my $pattern (@sorted_triple_patterns) {
213 12         36 my $sorted = $pattern->sort_triples;
214 12         32 push(@patterns, $sorted);
215             }
216 12         37 return $class->merge_patterns(@patterns);
217             }
218              
219              
220             =item C<< subgroup >>
221              
222             Splits the pattern object up in an array of pattern objects where the
223             same triple patterns occur. It will group on common variables, so that
224             triple patterns can be joined together is in a group together. It will
225             also group triples that have no connection to other triples in a
226             group. It will then order the groups, first by number triples with
227             common variables, then by number of literals, then by the total number
228             of terms that are not variables.
229              
230              
231             =cut
232              
233             sub subgroup {
234 12     12 1 26 my $self = shift;
235 12         32 my @triples = $self->triples;
236 12         44 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
237 12         279 my %structure_counts;
238             my %triples_by_tid;
239             # First, we loop the dataset to compile some numbers for the
240             # variables in each triple pattern. This is to break the pattern
241             # into subpatterns that can be joined on the same variable
242 12         29 foreach my $t (@triples) {
243 24         61 my $tid = refaddr($t);
244 24         67 $triples_by_tid{$tid} = $t;
245 24         43 my $not_variable = 0;
246 24         75 foreach my $n ($t->nodes) {
247 80 100       237 if ($n->isa('RDF::Trine::Node::Variable')) {
248 31         89 my $name = $n->name;
249 31         96 $structure_counts{ $name }{ 'name' } = $name; # TODO: Worth doing in an array?
250 31         48 push(@{$structure_counts{$name}{'claimed_patterns'}}, $tid);
  31         99  
251 31         67 $structure_counts{ $name }{ 'common_variable_count' }++;
252 31 100       91 $structure_counts{ $name }{ 'not_variable_count' } = 0 unless ($structure_counts{ $name }{ 'not_variable_count' });
253 31 50       87 $structure_counts{ $name }{ 'literal_count' } = 0 unless ($structure_counts{ $name }{ 'literal_count' });
254 31         84 foreach my $char (split(//, $n->as_string)) { # TODO: Use a more standard format
255 93         180 $structure_counts{ $name }{ 'string_sum' } += ord($char);
256             }
257 31         96 foreach my $o ($t->nodes) {
258 99 100 33     411 unless ($o->isa('RDF::Trine::Node::Variable')) {
259 48         92 $structure_counts{ $name }{ 'not_variable_count' }++;
260             }
261             elsif ($o->isa('RDF::Trine::Node::Literal')) {
262             $structure_counts{ $name }{ 'literal_count' }++;
263             }
264             }
265             } else {
266 49         80 $not_variable++;
267             }
268             }
269 24 100       85 if ($not_variable == 3) { # Then, there are no variables in the pattern
270 6         13 my $name = '_no_definite';
271 6         16 $structure_counts{ $name }{ 'not_variable_count' } = $not_variable;
272 6         14 $structure_counts{ $name }{ 'common_variable_count' } = 0;
273 6         14 $structure_counts{ $name }{ 'literal_count' } = 0; # Doesn't mean anything now
274 6         13 $structure_counts{ $name }{ 'string_sum' } = 0; # Doesn't mean anything now
275 6         10 push(@{$structure_counts{$name}{'claimed_patterns'}}, $tid);
  6         18  
276             }
277              
278             }
279              
280             # Group triple subpatterns with just one triple pattern
281 12         33 my $just_ones;
282 12         60 while (my ($name, $data) = each(%structure_counts)) {
283 27 100       105 if($data->{'common_variable_count'} <= 1) {
284 19         44 $just_ones->{'common_variable_count'} = 1;
285 19         29 $just_ones->{'string_sum'} = 1;
286 19         42 $just_ones->{'literal_count'} += $data->{'literal_count'};
287 19         39 $just_ones->{'not_variable_count'} += $data->{'not_variable_count'};
288 19         29 my @claimed = @{$data->{'claimed_patterns'}};
  19         46  
289 19 100   7   65 unless (any { $_ == $claimed[0] } @{$just_ones->{'claimed_patterns'}}) {
  7         19  
  19         87  
290 14         24 push(@{$just_ones->{'claimed_patterns'}}, $claimed[0]);
  14         38  
291             }
292 19         110 delete $structure_counts{$name};
293             }
294             }
295              
296 12         71 $l->trace('Results of structural analysis: ' . Dumper(\%structure_counts));
297 12         1255 $l->trace('Block of single-triple patterns: ' . Dumper($just_ones));
298              
299             # Now, sort the patterns in the order specified by first the number
300             # of occurances of common variables, then the number of literals
301             # and then the number of terms that are not variables
302 12         796 my @sorted_patterns = sort { $b->{'common_variable_count'} <=> $a->{'common_variable_count'}
303             or $b->{'literal_count'} <=> $a->{'literal_count'}
304             or $b->{'not_variable_count'} <=> $a->{'not_variable_count'}
305 0 0 0     0 or $b->{'string_sum'} <=> $a->{'string_sum'}
      0        
306             } values(%structure_counts);
307              
308 12         30 push (@sorted_patterns, $just_ones);
309              
310 12         36 my @sorted_triple_patterns;
311              
312             # Now, loop through the sorted patterns, let the one with most
313             # weight first select the triples it wants to join. Within those
314             # subpatterns, apply the sort order of triple pattern heuristic
315 12         33 foreach my $item (@sorted_patterns) {
316 12         20 my @triple_patterns;
317 12         29 my $triples_left = scalar keys(%triples_by_tid);
318 12 50       44 if ($triples_left > 2) {
319 0         0 foreach my $tid (@{$item->{'claimed_patterns'}}) {
  0         0  
320 0 0       0 if (defined($triples_by_tid{$tid})) {
321 0         0 push(@triple_patterns, $triples_by_tid{$tid});
322 0         0 delete $triples_by_tid{$tid};
323             }
324             }
325 0         0 $l->debug("There are $triples_left triples left");
326 0         0 push(@sorted_triple_patterns, RDF::Trine::Pattern->new(@triple_patterns)); # TODO: Better way to call ourselves?
327             } else {
328 12         59 $l->debug("There is a rest of $triples_left triples");
329 12         102 push(@sorted_triple_patterns, RDF::Trine::Pattern->new(values(%triples_by_tid)));
330 12         42 last;
331             }
332             }
333              
334 12         69 return @sorted_triple_patterns;
335             }
336              
337             =item C<< sort_triples >>
338              
339             Will sort the triple patterns based on heuristics that looks at how
340             many variables the patterns have, and where they occur, see REFERENCES
341             for details. Returns a new sorted pattern object.
342              
343             =cut
344              
345             sub sort_triples {
346 12     12 1 26 my $self = shift;
347 12         36 return $self->_hsp_heuristic_1_4_triple_pattern_order;
348             }
349              
350             sub _hsp_heuristic_1_4_triple_pattern_order { # Heuristic 1 and 4 of HSP
351 12     12   20 my $self = shift;
352 12         25 my $class = ref($self);
353 12         29 my @triples = @$self;
354 12 50       66 return $self if (scalar @triples == 1);
355 12         22 my %triples_by_tid;
356 12         28 foreach my $t (@triples) {
357 24         61 my $tid = refaddr($t);
358 24         83 $triples_by_tid{$tid}{'tid'} = $tid; # TODO: Worth doing this in an array?
359 24         48 $triples_by_tid{$tid}{'triple'} = $t;
360 24         65 $triples_by_tid{$tid}{'sum'} = _hsp_heuristic_triple_sum($t);
361             }
362 12         62 my @sorted_tids = sort { $a->{'sum'} <=> $b->{'sum'} } values(%triples_by_tid);
  12         54  
363 12         24 my @sorted_triples;
364 12         29 foreach my $entry (@sorted_tids) {
365 24         53 push(@sorted_triples, $triples_by_tid{$entry->{'tid'}}->{'triple'});
366             }
367 12         43 return $class->new(@sorted_triples);
368             }
369              
370             # The below function finds a number to aid sorting
371             # It takes into account Heuristic 1 and 4 of the HSP paper, see REFERENCES
372             # as well as that it was noted in the text that rdf:type is usually less selective.
373              
374             # By assigning the integers to nodes, depending on whether they are in
375             # triple (subject, predicate, object), variables, rdf:type and
376             # literals, and sum them, they may be sorted. See code for the actual
377             # values used.
378              
379             # Denoting s for bound subject, p for bound predicate, a for rdf:type
380             # as predicate, o for bound object and l for literal object and ? for
381             # variable, we get the following order, most of which are identical to
382             # the HSP:
383              
384             # spl: 6
385             # spo: 8
386             # sao: 10
387             # s?l: 14
388             # s?o: 16
389             # ?pl: 25
390             # ?po: 27
391             # sp?: 30
392             # sa?: 32
393             # ??l: 33
394             # ??o: 35
395             # s??: 38
396             # ?p?: 49
397             # ?a?: 51
398             # ???: 57
399              
400             # Note that this number is not intended as an estimate of selectivity,
401             # merely a sorting key, but further research may possibly create such
402             # numbers.
403              
404             sub _hsp_heuristic_triple_sum {
405 24     24   36 my $t = shift;
406 24         39 my $sum = 0;
407 24 100       82 if ($t->subject->is_variable) {
408 15         32 $sum = 20;
409             } else {
410 9         19 $sum = 1;
411             }
412 24 100       75 if ($t->predicate->is_variable) {
413 2         5 $sum += 10;
414             } else {
415 22 100       58 if ($t->predicate->equal(iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))) {
416 7         17 $sum += 4;
417             } else {
418 15         32 $sum += 2;
419             }
420             }
421 24 100       88 if ($t->object->is_variable) {
    100          
422 14         33 $sum += 27;
423             } elsif ($t->object->is_literal) {
424 3         8 $sum += 3;
425             } else {
426 7         13 $sum += 5;
427             }
428 24         100 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
429             # Now a trick to get an deterministic sort order, hard to test without.
430 24         619 $sum *= 10000000;
431 24         76 foreach my $c (split(//,$t->as_string)) {
432 1643         2145 $sum += ord($c);
433             }
434 24         132 $l->debug($t->as_string . " triple has sorting sum " . $sum);
435 24         242 return $sum;
436             }
437              
438              
439            
440              
441             1;
442              
443             __END__
444              
445             =back
446              
447             =head1 BUGS
448              
449             Please report any bugs or feature requests to through the GitHub web interface
450             at L<https://github.com/kasei/perlrdf/issues>.
451              
452             =head1 REFERENCES
453              
454             The heuristics to order triple patterns in this module is strongly
455             influenced by L<The ICS-FORTH Heuristics-based SPARQL Planner
456             (HSP)|http://www.ics.forth.gr/isl/index_main.php?l=e&c=645>.
457              
458             =head1 AUTHOR
459              
460             Gregory Todd Williams C<< <gwilliams@cpan.org> >>
461              
462             Kjetil Kjernsmo C<< <kjetilk@cpan.org> >>
463              
464             =head1 COPYRIGHT
465              
466             Copyright (c) 2006-2012 Gregory Todd Williams. This
467             program is free software; you can redistribute it and/or modify it under
468             the same terms as Perl itself.
469              
470             =cut