File Coverage

blib/lib/RDF/TrineX/Merge/Bnodes.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1 2     2   62584 use strict;
  2         6  
  2         129  
2             package RDF::TrineX::Merge::Bnodes;
3             #ABSTRACT: Merge blank nodes that obviously refer to the same resource
4             our $VERSION = '0.1.1'; #VERSION
5              
6 2     2   1484 use parent 'Exporter';
  2         602  
  2         13  
7             our @EXPORT = qw(merge_bnodes);
8              
9 2     2   7382 use Digest;
  2         1101  
  2         53  
10 2     2   3155 use RDF::Trine::Model;
  0            
  0            
11              
12             sub merge_bnodes { # copy statements into a model, keeping one bnode per group of bnodes whose triples differ only by bnode id
13             my ($iterator, %options) = @_; # $iterator: model or statement iterator; %options: 'digest' and 'model' are read below
14              
15             # configuration
16             # an RDF::Trine::Model is read through its statement stream
17             $iterator = $iterator->as_stream if $iterator->isa('RDF::Trine::Model');
18              
19             my $digest = $options{digest} || 'MD5'; # digest name, or a reference that is used as the digest object as-is
20             $digest = Digest->new($digest) unless ref $digest;
21              
22             my $model = $options{model} || RDF::Trine::Model->new; # result goes into the caller's model if given, else a new one
23              
24              
25             # iterate and buffer triples with a single blank node
26              
27             my %buffer; # bnode id => arrayref of buffered triples, or undef once the id is excluded from merging
28             while (my $triple = $iterator->next) {
29             my $id = undef; # set only if exactly one of subject/object is a blank node
30             my $subj = $triple->subject;
31             my $obj = $triple->object;
32              
33             if ( $subj->isa('RDF::Trine::Node::Blank') ) {
34             if ( $obj->isa('RDF::Trine::Node::Blank') ) {
35             # both blank => flush what was buffered for either id and stop buffering for them
36             my @ids = map { $_->blank_identifier } $subj, $obj;
37             foreach (@ids) {
38             foreach (@{ $buffer{$_} || [] }) {
39             $model->add_statement($_);
40             }
41             $buffer{$_} = undef; # $_ is the outer id again here; the key is kept with undef to mark the id as excluded
42             }
43             } else {
44             $id = $subj->blank_identifier;
45             }
46             } elsif ( $obj->isa('RDF::Trine::Node::Blank') ) {
47             $id = $obj->blank_identifier;
48             }
49             # buffer the triple unless its bnode was excluded (key exists but value is undef)
50             if ( defined $id and ($buffer{$id} or !exists $buffer{$id}) ) {
51             push @{ $buffer{$id} }, $triple;
52             next;
53             }
54              
55             $model->add_statement( $triple ); # not buffered: goes straight into the result
56             }
57              
58             my %id2digest; # bnode id => digest (filled below, not read again in this sub)
59             my %digest2id; # digest => arrayref of bnode ids sharing that digest
60              
61             while (my ($id, $triples) = each %buffer) {
62             next if !defined $triples; # excluded bnode, already written to the model
63             # blank nodes are replaced by '~' so the bnode id does not influence the digest
64             # calculate digest for the set of triples connected to bnode $id
65             my @canonical;
66             foreach (@$triples) {
67             my ($subj, $obj) = map {
68             $_->isa('RDF::Trine::Node::Blank') ? '~' : $_->as_ntriples
69             } $_->subject, $_->object;
70             push @canonical, join ' ', $subj, $_->predicate->as_ntriples, $obj;
71             }
72             # print "$_\n" for sort @canonical;
73              
74             $digest->reset;
75             $digest->add($_) for sort @canonical; # sorted so the order of triples does not matter
76             my $base64 = $digest->b64digest;
77              
78             $id2digest{$id} = $base64;
79             push @{$digest2id{$base64}}, $id;
80             }
81              
82             # use Data::Dumper; print Dumper(\%digest2id)."\n";
83              
84             # keep only one of each group of bnodes that obviously refer to the same resource
85              
86             foreach my $base64 ( keys %digest2id ) {
87             # sort only required for stable bnode ids (FIXME?)
88             my @ids = sort @{$digest2id{$base64}};
89              
90             shift @ids; # keep the first
91             foreach (@ids) {
92             $buffer{$_} = undef; # discard the triples of every other bnode with the same digest
93             }
94             }
95            
96             # undef entries (flushed or duplicate bnodes) are filtered out by the grep
97             # add remaining triples with bnodes
98            
99             foreach (grep { defined $_ } values %buffer) {
100             foreach ( @$_ ) {
101             $model->add_statement( $_ );
102             }
103             }
104              
105             return $model; # the model that received all kept statements
106             }
107              
108             __END__
109              
110             =pod
111              
112             =encoding UTF-8
113              
114             =head1 NAME
115              
116             RDF::TrineX::Merge::Bnodes - Merge blank nodes that obviously refer to the same resource
117              
118             =head1 VERSION
119              
120             version 0.1.1
121              
122             =head1 SYNOPSIS
123              
124             use RDF::TrineX::Merge::Bnodes;
125              
126             $model = merge_bnodes($model_or_iterator, %options);
127              
128             To give an example, applying C<merge_bnodes> on this graph:
129              
130             @prefix foaf: <http://xmlns.com/foaf/0.1/> .
131             @base <http://example.org/> .
132              
133             <Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] .
134             <Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . # obviously the same
135              
136             will remove the second Bob.
137              
138             =head1 DESCRIPTION
139              
140             This module exports the function B<merge_bnodes> to merge blank nodes that
141             obviously refer to the same resource in an RDF graph. The function gets passed
142             a L<RDF::Trine::Model> or L<RDF::Trine::Iterator>. The model or iterator
143             should only contain RDF-compatible statements (e.g. no blank node predicates).
144              
145             The function can be applied to get rid of obviously duplicated statements.
146             Obviously duplicated statements are defined as follows:
147              
148             =over
149              
150             =item
151              
152             The statements include either a blank node subject or a blank node object.
153              
154             =item
155              
156             The statements only differ by their blank node identifier.
157              
158             =item
159              
160             The blank nodes are not part of any other statement that includes two blank
161             nodes.
162              
163             =back
164              
165             In other words, the algorithm first finds all star subgraphs whose internal
166             node is the only blank node in the subgraph. Each subgraph is assigned a digest
167             value calculated from all triples and nodes except the blank nodes. Then
168             duplicated subgraphs with the same digest are removed.
169              
170             =head1 LIMITATIONS
171              
172             Statements that involve multiple blank nodes or blank nodes that are connected
173             to another blank node are never removed.
174              
175             Don't expect the algorithm to understand what is actually meant by the
176             existence of blank nodes in your data.
177              
178             =head1 CONFIGURATION
179              
180             Options can be passed as key-value pairs:
181              
182             =over
183              
184             =item digest
185              
186             A L<Digest> or the name of a Digest module, e.g. "C<MD4>". The default digest
187             is L<Digest::MD5>.
188              
189             =back
190              
191             Options not implemented yet:
192              
193             =over
194              
195             =item
196              
197             Option to skolemize blank nodes (IRIs with C<.well-known/genid/>).
198              
199             =item
200              
201             Option to also remove entailed statements with blank nodes:
202              
203             <Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] .
204             <Alice> foaf:knows [ a foaf:Person ] . # could also be removed
205              
206             =item
207              
208             =back
209              
210             =head1 AUTHOR
211              
212             Jakob Voß
213              
214             =head1 COPYRIGHT AND LICENSE
215              
216             This software is copyright (c) 2014 by Jakob Voß.
217              
218             This is free software; you can redistribute it and/or modify it under
219             the same terms as the Perl 5 programming language system itself.
220              
221             =cut