File Coverage

blib/lib/Plack/App/RDF/Files.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package Plack::App::RDF::Files;
2             {
3             $Plack::App::RDF::Files::VERSION = '0.02';
4             }
5             #ABSTRACT: Serve RDF data from files
6              
7 2     2   204519 use v5.14;
  2         8  
  2         85  
8              
9 2     2   10 use parent 'Plack::Component';
  2         3  
  2         17  
10 2         19 use Plack::Util::Accessor qw(
11             base_dir base_uri file_types path_map
12             include_index index_property namespaces
13 2     2   3010 );
  2         290  
14              
15 2     2   1980 use Plack::Request;
  2         102272  
  2         84  
16 2     2   456599 use RDF::Trine qw(statement iri);
  0            
  0            
17             use RDF::Trine::Model;
18             use RDF::Trine::Parser;
19             use RDF::Trine::Serializer;
20             use RDF::Trine::Iterator::Graph;
21             use File::Spec::Functions qw(catfile catdir);
22             use URI;
23             use Scalar::Util qw(blessed reftype);
24             use Carp qw(croak);
25             use Digest::MD5 qw(md5_hex);
26             use HTTP::Date;
27             use List::Util qw(max);
28              
29              
# Short format names mapped to the serializer names handed to
# RDF::Trine::Serializer->new. The negotiate method looks up the value of
# the PSGI environment variable 'negotiate.format' in this table and falls
# back to the raw value when there is no entry.
our %FORMATS = (
    json   => 'RDFJSON',
    n3     => 'Notation3',
    nt     => 'NTriples',
    rdfxml => 'RDFXML',
    ttl    => 'Turtle',
);
37              
# Validate and normalize the configuration; the body runs only once.
#
# Dies with "missing base_dir" unless base_dir is set and names an existing
# directory. Afterwards:
#   - base_uri, if given, has been passed through URI->new;
#   - file_types is a compiled regular expression that matches exactly one
#     of the configured extensions (default: rdfxml, nt, ttl);
#   - index_property, if include_index is enabled, defaults to rdfs:seeAlso
#     and is wrapped with iri() unless it is a false value;
#   - path_map defaults to the identity mapping.
#
# Returns the object on the first call and nothing on later calls.
sub prepare_app {
    my $self = shift;
    return if $self->{prepared};

    die "missing base_dir"
        unless $self->base_dir and -d $self->base_dir;

    $self->base_uri( URI->new( $self->base_uri ) )
        if $self->base_uri;

    # Extensions are literal strings: quote them and anchor the pattern at
    # both ends. A pattern anchored only at the start would also accept
    # longer extensions such as "ttlx" or "ntfoo".
    my $types = join '|',
        map { quotemeta($_) } @{ $self->file_types // [qw(rdfxml nt ttl)] };
    $self->file_types( qr/\A(?:$types)\z/ );

    if ( $self->include_index ) {
        $self->index_property('http://www.w3.org/2000/01/rdf-schema#seeAlso')
            unless defined $self->index_property;

        # A defined but false index_property disables the listing.
        $self->index_property( iri( $self->index_property ) )
            if $self->index_property;
    }

    $self->path_map( sub { shift } ) unless $self->path_map;

    $self->{prepared} = 1;

    return $self;
}
63              
64              
# Find the RDF files that belong to a request.
#
# The request is given as a PSGI environment (unblessed hash reference), as
# a Plack::Request, or as a string whose first character is dropped and
# whose remainder is used as the local path; strings require base_uri.
# Croaks with "expected PSGI request or string" for any other reference.
#
# The local path may only contain the characters a-z A-Z 0-9 : . _ @ / -
# and must not start with "/", contain "../" or end in a ".." segment.
# For a valid path the requested URI is stored in $env->{'rdf.uri'}.
#
# Returns
#   - nothing, if the path is invalid, if it is empty while include_index is
#     disabled, or if the mapped directory does not exist;
#   - ($dir), if the directory exists but cannot be read;
#   - ($dir, \%files) otherwise, where %files maps each file name with a
#     configured extension to { full => path, size => bytes, mtime => epoch }.
sub files {
    my ( $self, $request ) = @_;

    my ( $env, $req, $path );

    if ( !reftype($request) ) {    # $str
        return unless $self->base_uri and defined $request;

        # TODO: support full URIs via HTTP::Request
        $path = length $request ? substr( $request, 1 ) : '';
        $env  = {};
    } elsif ( !blessed($request) and reftype($request) eq 'HASH' ) {    # $env
        $env  = $request;
        $req  = Plack::Request->new($env);
        $path = substr( $req->path, 1 );
    } elsif ( blessed($request) and $request->isa('Plack::Request') ) {  # $req
        $req  = $request;
        $env  = $req->env;
        $path = substr( $req->path, 1 );
    } else {
        croak "expected PSGI request or string";
    }

    # \z instead of $ so that a trailing newline is not accepted. Besides
    # "../" anywhere, a final ".." segment is rejected too: "/.." would
    # otherwise resolve to the parent of base_dir and "/foo/.." to base_dir
    # itself, bypassing the include_index check below.
    return if $path !~ m{\A[a-z0-9:._\@/-]*\z}i;
    return if $path =~ m{\.\./|(?:\A|/)\.\.\z|\A/};

    $env->{'rdf.uri'} = URI->new( ( $self->base_uri // $req->base ) . $path );

    return if $path eq '' and !$self->include_index;

    my $dir = catdir( $self->base_dir, $self->path_map->($path) );

    return unless -d $dir;

    my $dh;
    return ($dir) unless -r $dir and opendir( $dh, $dir );

    my $types = $self->file_types;
    my $files = {};

    # A named lexical keeps the caller's $_ untouched.
    while ( defined( my $name = readdir $dh ) ) {
        my ($extension) = $name =~ /\.(\w+)\z/ or next;
        next unless $extension =~ $types;

        my $full = catfile( $dir, $name );

        # stat returns an empty list on failure (e.g. a dangling symlink).
        # Such entries cannot be read anyway, so they are left out.
        my @stat = stat $full or next;

        $files->{$name} = {
            full  => $full,
            size  => $stat[7],
            mtime => $stat[9],
        };
    }
    closedir $dh;

    return ( $dir => $files );
}
112              
113              
# PSGI entry point: merge all RDF files of the requested resource and
# return them serialized.
#
# Responses:
#   - 405 for methods other than GET and HEAD;
#   - 404 if files() finds no file list ("Not accesible" instead of
#     "Not found" if the directory exists but could not be read);
#   - 200 otherwise, with a weak ETag and (if known) Last-Modified derived
#     from the source files, plus the headers returned by negotiate().
#     If negotiate() yields no serializer, N-Triples is served as
#     text/plain and all other headers are dropped.
#
# Side effects on $env: 'rdf.uri' (set by files()), 'rdf.files' (the file
# hash, extended by 'triples' or 'error' per file) and, if the server
# supports psgi.streaming, 'rdf.iterator'.
sub call {
    my ( $self, $env ) = @_;

    use Encode;    # provides encode(); the PSGI body must consist of bytes

    my $req = Plack::Request->new($env);

    my $method = $req->method;
    return [ 405, [ 'Content-type' => 'text/plain' ], ['Method not allowed'] ]
        unless $method eq 'GET' or $method eq 'HEAD';

    my ( $dir, $files ) = $self->files($req);

    if ( !$files ) {
        my $message = $req->env->{'rdf.uri'}
            ? "Not found: " . $req->env->{'rdf.uri'}
            : "Not found";

        # The directory exists but is not readable. The spelling of the
        # message is kept as is, clients may already match on it.
        $message =~ s/found/accesible/ if $dir and -d $dir;

        return [ 404, [ 'Content-type' => 'text/plain' ], [$message] ];
    }

    my $uri = $env->{'rdf.uri'};
    my @headers;

    # TODO: show example with Plack::Middleware::ConditionalGET

    # Hash iteration order differs between processes, so the values are
    # fed to the digest in a fixed order. Otherwise the same set of files
    # would get different ETags from different server processes.
    my @names = sort keys %$files;
    my $md5   = md5_hex(
        map { $_ // '' }
        map { @{ $files->{$_} }{qw(full size mtime)} } @names
    );
    push @headers, ETag => "W/\"$md5\"";

    my $lastmod = max grep { defined } map { $files->{$_}{mtime} } @names;
    push @headers, 'Last-Modified' => HTTP::Date::time2str($lastmod)
        if $lastmod;

    # TODO: HEAD method

    # parse RDF
    my $model   = RDF::Trine::Model->new;
    my $triples = 0;
    for my $name (@names) {    # TODO: parse sorted by modification time?
        my $file = $files->{$name};

        my $parsed = eval {
            my $parser = RDF::Trine::Parser->guess_parser_by_filename(
                $file->{full} );
            $parser->parse_file_into_model( $uri, $file->{full}, $model );
            1;
        };

        if ($parsed) {
            $file->{triples} = $model->size - $triples;
        } else {
            $file->{error} = $@ || 'unknown error';
        }

        # Resynchronize after failures as well: a parser may have added
        # triples before it gave up (presumably; depends on the parser),
        # and those must not be counted for the next file.
        $triples = $model->size;
    }
    $env->{'rdf.files'} = $files;

    my $iter = $model->as_stream;

    # add listing on base URI
    if ( $self->index_property
        and "$uri" eq ( $self->base_uri // $req->base ) )
    {
        my $subject   = iri($uri);
        my $predicate = $self->index_property;
        my @stms;

        # files() could open the directory a moment ago; if it has become
        # unreadable in between, the listing is simply left empty.
        if ( opendir( my $dirhandle, $dir ) ) {
            for my $entry ( readdir $dirhandle ) {
                next if $entry =~ /^\.\.?$/;
                next unless -d catdir( $dir, $entry );
                push @stms, statement(
                    $subject,
                    $predicate,
                    RDF::Trine::Node::Resource->new("$uri$entry")
                );
            }
            closedir $dirhandle;
        }

        $iter = $iter->concat( RDF::Trine::Iterator::Graph->new( \@stms ) );
    }

    # add axiomatic triple to empty graphs
    if ( $iter->finished ) {
        $iter = RDF::Trine::Iterator::Graph->new( [
            statement(
                iri($uri),
                iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
                iri('http://www.w3.org/2000/01/rdf-schema#Resource')
            )
        ] );
    }

    # TODO: do not serialize at all on request

    # TODO: HTTP HEAD method

    # negotiate and serialize
    my ( $ser, @negotiated ) = $self->negotiate( $env, $uri );
    push @headers, @negotiated;

    if ( !$ser ) {
        $ser = RDF::Trine::Serializer->new( 'NTriples', base_uri => $uri );
        @headers = ( 'Content-type' => 'text/plain' );
    }

    if ( $env->{'psgi.streaming'} ) {
        $env->{'rdf.iterator'} = $iter;
        return sub {
            my $responder = shift;

            # TODO: use IO::Handle::Iterator to serialize as late as possible
            my $rdf = encode( "UTF8",
                $ser->serialize_iterator_to_string($iter) );
            $responder->( [ 200, [@headers], [$rdf] ] );
        };
    }

    # Encode here as well, the same way as in the streaming branch.
    my $rdf = encode( "UTF8", $ser->serialize_iterator_to_string($iter) );
    return [ 200, [@headers], [$rdf] ];
}
227              
228              
# Select an RDF serializer for a request.
#
# Arguments: the PSGI environment and, optionally, the URI to use as base
# for serialization (defaults to $env->{'rdf.uri'}).
#
# If $env->{'negotiate.format'} is set, a serializer for that format is
# created (short names are translated via %FORMATS) and returned without
# headers. Otherwise the serializer is chosen by RDF::Trine::Serializer's
# content negotiation on the request headers and returned together with
# the headers 'Content-type' and 'Vary'.
#
# Returns (undef) if no serializer could be created or negotiated.
sub negotiate {
    my ( $self, $env, $uri ) = @_;

    # call() passes the URI explicitly. The environment key must be
    # 'rdf.uri', which is what files() sets; nothing sets 'rdflow.uri'.
    $uri //= $env->{'rdf.uri'};

    if ( my $format = $env->{'negotiate.format'} ) {

        # TODO: catch RDF::Trine::Error::SerializationError and log
        my $ser = eval {
            RDF::Trine::Serializer->new(
                $FORMATS{$format} // $format,
                base       => $uri,
                namespaces => ( $self->namespaces // {} ),
            );
        };

        # TODO: push @headers, Vary => 'Accept'; ?
        return ($ser);
    }

    # Content negotiation throws if nothing acceptable is found (see the
    # RDF::Trine::Serializer documentation). Catch that, so the caller can
    # fall back to its default instead of failing the request.
    my ( $ctype, $ser ) = eval {
        RDF::Trine::Serializer->negotiate(
            request_headers => Plack::Request->new($env)->headers,
        );
    };
    return (undef) unless $ser;

    return ( $ser, 'Content-type' => $ctype, Vary => 'Accept' );
}
251              
252             1;
253              
254             __END__
255              
256             =pod
257              
258             =head1 NAME
259              
260             Plack::App::RDF::Files - Serve RDF data from files
261              
262             =head1 VERSION
263              
264             version 0.02
265              
266             =head1 SYNOPSIS
267              
268             my $app = Plack::App::RDF::Files->new(
269             base_dir => '/var/rdf/'
270             );
271              
272             # Requests URI => RDF files
273             # http://example.org/ => /var/rdf/*.(nt|ttl|rdfxml)
274             # http://example.org/foo => /var/rdf/foo/*.(nt|ttl|rdfxml)
275             # http://example.org/x/y => /var/rdf/x/y/*.(nt|ttl|rdfxml)
276              
277             =head1 DESCRIPTION
278              
279             This L<PSGI> application serves RDF from files. Each accessible RDF resource
280             corresponds to a (sub)directory, located in a common base directory. All RDF
281             files in a directory are merged and returned as one RDF graph.
282              
283             =head1 METHODS
284              
285             =head2 files( $env | $req | $str )
286              
287             Get a list of RDF files that will be read for a given request. The request can
288             be specified as L<PSGI> environment, as L<Plack::Request>, or as partial URI
289             that follows C<base_uri> (given as string). The requested URI is saved in field
290             C<rdf.uri> of the request environment. On success returns the directory and a
291             hash of files, each mapped to its full path, size, and last modification time.
292             Nothing is returned if the request contains characters other than
293             C<a-zA-Z0-9:._@/-> or the forbidden sequence C<../>, or if the request equals
294             the base URI and C<include_index> was not enabled.
295              
296             =head2 negotiate( $env )
297              
298             This internal method selects an RDF serializer based on the PSGI environment
299             variable C<negotiate.format> (see L<Plack::Middleware::Negotiate>) or the
300             C<negotiate> method of L<RDF::Trine::Serializer>. Returns first a
301             L<RDF::Trine::Serializer> (on success, or C<undef> on error) and second a
302             (possibly empty) list of HTTP response headers.
303              
304             =head1 CONFIGURATION
305              
306             =over 4
307              
308             =item base_dir
309              
310             Mandatory base directory that all resource directories are located in.
311              
312             =item base_uri
313              
314             The base URI of all resources. If no base URI has been specified, the
315             base URI is taken from the PSGI request.
316              
317             =item file_types
318              
319             An array of RDF file types (extensions) to look for. Set to
320             C<['rdfxml','nt','ttl']> by default.
321              
322             =item include_index
323              
324             By default a HTTP 404 error is returned if one tries to access the base
325             directory. Enable this option to also serve RDF data from this location.
326              
327             =item index_property
328              
329             RDF property to use for listing all resources connected to the base URI (if
330             C<include_index> is enabled). Set to C<rdfs:seeAlso> by default. Can be
331             disabled by setting a false value.
332              
333             =item path_map
334              
335             Optional code reference that maps a local part of an URI to a relative
336             directory. Set to the identity mapping by default.
337              
338             =item namespaces
339              
340             Optional namespaces for serialization, passed to L<RDF::Trine::Serializer>.
341              
342             =back
343              
344             =head1 PSGI environment variables
345              
346             The following PSGI environment variables are relevant:
347              
348             =over 4
349              
350             =item rdf.uri
351              
352             The requested URI
353              
354             =item rdf.iterator
355              
356             The L<RDF::Trine::Iterator> that will be used for serializing, if
357             C<psgi.streaming> is set. One can use this variable to catch the RDF
358             data in another post-processing middleware.
359              
360             =item rdf.files
361              
362             A hash of source filenames, each with the number of triples (on success)
363             as property C<triples>, an error message as C<error> if parsing failed, and
364             the timestamp of last modification as C<mtime>. C<triples> and C<error> may
365             not be given before parsing, if C<rdf.iterator> is set.
366              
367             =back
368              
369             =head1 LIMITATIONS
370              
371             B<This module is an early developer release. Be warned!>
372              
373             All resource URIs to be served must have a common URI prefix (such as
374             C<http://example.org/> above) and a local part that may be restricted to a
375             limited set of characters. For instance the character sequence C<../> is
376             not allowed.
377              
378             =head1 NOTES
379              
380             If an existing resource does not contain triples, the axiomatic triple
381             C<< ?uri rdf:type rdfs:Resource >> is returned.
382              
383             To update the files, add a middleware that catches 404 and 202 responses.
384              
385             =head1 TODO
386              
387             VoID descriptions could be added, possibly with L<RDF::Generator::Void>.
388              
389             =head1 SEE ALSO
390              
391             Use L<Plack::Middleware::Negotiate> to add content negotiation based on
392             an URL parameter and/or suffix.
393              
394             See L<RDF::LinkedData> for a different module to serve RDF as linked data.
395             See also L<RDF::Flow> and L<RDF::Lazy> for processing RDF data.
396              
397             See L<http://foafpress.org/> for a similar approach in PHP.
398              
399             =head1 AUTHOR
400              
401             Jakob Voß <voss@gbv.de>
402              
403             =head1 COPYRIGHT AND LICENSE
404              
405             This software is copyright (c) 2013 by Jakob Voß.
406              
407             This is free software; you can redistribute it and/or modify it under
408             the same terms as the Perl 5 programming language system itself.
409              
410             =cut