File Coverage

blib/lib/RDF/Flow/Source.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1 7     7   825 use strict;
  7         14  
  7         209  
2 7     7   34 use warnings;
  7         11  
  7         297  
3             package RDF::Flow::Source;
4             {
5             $RDF::Flow::Source::VERSION = '0.178';
6             }
7             #ABSTRACT: Source of RDF data
8              
9 7     7   6330 use Log::Contextual::WarnLogger;
  7         7367  
  7         373  
10 7         90 use Log::Contextual qw(:log), -default_logger
11 7     7   12278 => Log::Contextual::WarnLogger->new({ env_prefix => __PACKAGE__ });
  7         783420  
12              
13 7     7   451392 use 5.010;
  7         29  
  7         321  
14 7     7   43 use re qw(is_regexp);
  7         23  
  7         1260  
15              
16 7     7   11718 use RDF::Trine qw(iri);
  0            
  0            
17             use Scalar::Util qw(blessed refaddr reftype);
18             use Try::Tiny;
19             use Carp;
20              
21             use URI;
22             use URI::Escape;
23             use File::Spec::Functions;
24              
25             use parent 'Exporter';
26             our @EXPORT_OK = qw(sourcelist_args iterator_to_model empty_rdf rdflow_uri);
27             our %EXPORT_TAGS = (
28             util => [qw(sourcelist_args iterator_to_model empty_rdf rdflow_uri)],
29             );
30              
31             use RDF::Trine::Model;
32             use RDF::Trine::Parser;
33              
34             #require RDF::Flow::Pipeline;
35              
36             sub new {
37             my $class = shift;
38             my ($src, %args) = ref($_[0]) ? @_ : (undef,@_);
39              
40             $src = delete $args{from} unless defined $src;
41              
42             my $match = delete $args{match};
43             my $code;
44              
45             my $self = bless { }, $class;
46              
47             if ( $src and not ref $src ) { # load from file or directory
48             my $model = RDF::Trine::Model->new;
49             my @files;
50             if ( $src =~ /^https?:\/\// ) {
51             eval { RDF::Trine::Parser->parse_url_into_model( $src, $model ); };
52             goto CHECK;
53             } else {
54             if ( -d $src ) {
55             if ( opendir(DIR, $src) ) {
56             my $ext = join ('|', keys %RDF::Trine::Parser::file_extensions);
57             @files = map { catfile($src,$_) } grep(/\.($ext)$/,readdir(DIR));
58             closedir DIR;
59             } else {
60             log_warn { "failed to open directory $src"; }
61             }
62             } else {
63             @files = ($src);
64             }
65             }
66              
67             while ($src = shift @files) {
68             eval { RDF::Trine::Parser->parse_file_into_model( "file:///$src", $src, $model ); };
69              
70             CHECK: # yes, it's an evil goto statement jump target :-)
71             if ( @_ ) {
72             log_warn { "failed to load from $src"; }
73             } else {
74             log_info { "loaded from $src"; }
75             }
76             }
77              
78             $src = $model;
79             }
80              
81             if (blessed $src and $src->isa('RDF::Flow::Source')) {
82             $self->{from} = $src;
83             $code = sub {
84             $src->retrieve( @_ );
85             };
86             # return $src; # don't wrap
87             # TODO: use args to modify object!
88             } elsif ( blessed $src and $src->isa('RDF::Trine::Model') ) {
89             $self->{from} = $src;
90             $code = sub {
91             my $uri = rdflow_uri( shift );
92             iterator_to_model( $src->bounded_description(
93             iri( $uri )
94             ) );
95             };
96             } elsif ( ref $src and ref $src eq 'CODE' ) {
97             $code = $src;
98             } elsif (not defined $src) {
99             carp 'Missing RDF source in plain RDF::Flow::Source'
100             if $class eq 'RDF::Flow::Source';
101             $code = sub { };
102             } else {
103             croak 'expected RDF::Source, RDF::Trine::Model, or code reference'
104             }
105              
106             $self->{name} = $args{name} if defined $args{name};
107             $self->{code} = $code;
108              
109             $self->match( $match );
110              
111             $self->init();
112              
113             $self;
114             }
115              
116             sub init { }
117              
118             sub match { # accessor
119             my $self = shift;
120             return $self->{match} unless @_;
121              
122             my $match = shift;
123             if ( defined $match ) {
124             my $pattern = $match;
125             $match = sub { $_[0] =~ $pattern; }
126             if is_regexp($match);
127             croak 'url parameter must be code or regexp'.reftype($match). ": $match"
128             if reftype $match ne 'CODE';
129             $self->{match} = $match;
130             } else {
131             $self->{match} = undef;
132             }
133             }
134              
135             sub retrieve {
136             my ($self, $env) = @_;
137             $env = { 'rdflow.uri' => $env } if ($env and not ref $env);
138             log_trace {
139             sprintf "retrieve from %s with %s", about($self), rdflow_uri($env);
140             };
141             $self->timestamp( $env );
142              
143             my $result;
144             if ( $self->{match} ) {
145             my $uri = $env->{'rdflow.uri'};
146             if ( $self->{match}->( $env->{'rdflow.uri'} ) ) {
147             $result = $self->retrieve_rdf( $env );
148             $env->{'rdflow.uri'} = $uri;
149             } else {
150             log_trace { "URI did not match: " . $env->{'rdflow.uri'} };
151             $result = RDF::Trine::Model->new;
152             }
153             } else {
154             $result = $self->retrieve_rdf( $env );
155             }
156              
157             return $self->trigger_retrieved( $result );
158             }
159              
160             sub retrieve_rdf {
161             my ($self, $env) = @_;
162             return try {
163             $self->{code}->( $env );
164             } catch {
165             s/[.]?\s+$//s;
166             RDF::Flow::Source::trigger_error( $self, $_, $env );
167             RDF::Trine::Model->new;
168             }
169             }
170              
171             sub trigger_error {
172             my ($self, $message, $env) = @_;
173             $message = 'unknown error' unless $message;
174             $env->{'rdflow.error'} = $message if $env;
175             log_error { $message };
176             }
177              
178             sub trigger_retrieved {
179             my ($self, $result, $msg) = @_;
180             log_trace {
181             $msg = "%s returned %s" unless $msg;
182             my $size = 'no';
183             if ( $result ) {
184             $size = (blessed $result and $result->can('size'))
185             ? $result->size : 'some';
186             };
187             sprintf $msg, name($self), "$size triples";
188             };
189             return $result;
190             }
191              
192             sub id {
193             return "source".refaddr(shift);
194             }
195              
196              
197             sub graphviz {
198             return scalar shift->graphviz_addnode( @_ );
199             }
200              
201             sub graphviz_addnode {
202             my $self = shift;
203             my $g = ( blessed $_[0] and $_[0]->isa('GraphViz') )
204             ? shift : eval { GraphViz->new( @_ ) };
205             return unless $g;
206              
207             $g->add_node( $self->id, $self->_graphviz_nodeattr );
208              
209             my $i=1;
210             foreach my $s ( $self->inputs ) {
211             $s->graphviz($g);
212             $g->add_edge( $s->id, $self->id, $self->_graphviz_edgeattr($i++) );
213             }
214              
215             return $g;
216             }
217              
218             sub _graphviz_nodeattr {
219             return (label => shift->name);
220             }
221              
222             sub _graphviz_edgeattr { }
223              
224             use POSIX qw(strftime);
225              
226             sub timestamp {
227             my ($self, $env) = @_;
228             my $now = time();
229             my $tz = strftime("%z", localtime($now));
230             $tz =~ s/(\d{2})(\d{2})/$1:$2/;
231             $tz =~ s/00:00/Z/; # UTC aka Z-Time
232             my $timestamp = strftime("%Y-%m-%dT%H:%M:%S", localtime($now)) . $tz;
233             $env->{'rdflow.timestamp'} = $timestamp if $env;
234             return $timestamp;
235             }
236              
237             sub name {
238             shift->{name} || 'anonymous source';
239             }
240              
241             sub about {
242             shift->name;
243             }
244              
245             sub inputs {
246             my $self = shift;
247             return $self->{inputs} ? @{ $self->{inputs} } : ();
248             }
249              
250             sub size {
251             my $self = shift;
252             return 1 unless $self->{inputs};
253             return scalar @{ $self->{inputs} };
254             }
255              
256             sub sourcelist_args {
257             my ($inputs, $args) = ([],{});
258             while ( @_ ) {
259             my $s = shift @_;
260             if ( ref $s ) {
261             push @$inputs, map { RDF::Flow::Source->new($_) } $s;
262             } elsif ( defined $s ) {
263             $args->{$s} = shift @_;
264             } else {
265             croak 'undefined parameter';
266             }
267             }
268             return ($inputs, $args);
269             }
270              
271             sub iterator_to_model {
272             my $iterator = shift;
273             return $iterator if $iterator->isa('RDF::Trine::Model');
274              
275             my $model = shift || RDF::Trine::Model->new;
276              
277             $model->begin_bulk_ops;
278             while (my $st = $iterator->next) {
279             $model->add_statement( $st );
280             }
281             $model->end_bulk_ops;
282              
283             $model;
284             }
285              
286             sub empty_rdf {
287             my $rdf = shift;
288             return 1 unless blessed $rdf;
289             return !($rdf->isa('RDF::Trine::Model') and $rdf->size > 0) &&
290             !($rdf->isa('RDF::Trine::Iterator') and $rdf->peek);
291             }
292              
293             sub rdflow_uri {
294             my $env = shift;
295             return ($env || '') unless ref $env; # plain scalar or undef
296              
297             return $env->{'rdflow.uri'} if defined $env->{'rdflow.uri'};
298              
299             # a few lines of code from Plack::Request, so we don't require all of Plack
300             my $base = ($env->{'psgi.url_scheme'} || "http") .
301             "://" . ($env->{HTTP_HOST} || (($env->{SERVER_NAME} || "") .
302             ":" . ($env->{SERVER_PORT} || 80))) . ($env->{SCRIPT_NAME} || '/');
303             $base = URI->new($base)->canonical;
304              
305             my $path_escape_class = '^A-Za-z0-9\-\._~/';
306              
307             my $path = URI::Escape::uri_escape( $env->{PATH_INFO} || '', $path_escape_class );
308              
309             $path .= '?' . $env->{QUERY_STRING} if !$env->{'rdflow.ignorepath'} &&
310             defined $env->{QUERY_STRING} && $env->{QUERY_STRING} ne '';
311              
312             $base =~ s!/$!! if $path =~ m!^/!;
313              
314             $env->{'rdflow.uri'} = URI->new( $base . $path )->canonical->as_string;
315              
316             $env->{'rdflow.uri'} =~ s/^https?:\/\/\/$//;
317             $env->{'rdflow.uri'};
318             }
319              
320             # put at the end to prevent circular references in require
321             require RDF::Flow::Pipeline;
322              
323             sub pipe_to {
324             my ($self, $next) = @_;
325             return RDF::Flow::Pipeline->new( $self, $next );
326             }
327              
328             1;
329              
330              
331             __END__
332             =pod
333              
334             =head1 NAME
335              
336             RDF::Flow::Source - Source of RDF data
337              
338             =head1 VERSION
339              
340             version 0.178
341              
342             =head1 SYNOPSIS
343              
344             $src = rdflow( "mydata.ttl", name => "RDF file as source" );
345             $src = rdflow( "mydirectory", name => "directory with RDF files as source" );
346             $src = rdflow( \&mysource, name => "code reference as source" );
347             $src = rdflow( $model, name => "RDF::Trine::Model as source" );
348              
349             package MySource;
350             use parent 'RDF::Flow::Source';
351              
352             sub retrieve_rdf {
353             my ($self, $env) = @_;
354             my $uri = $env->{'rdflow.uri'};
355              
356             # ... your logic here ...
357              
358             return $model;
359             }
360              
361             =head1 DESCRIPTION
362              
363             Each RDF::Flow::Source provides a C<retrieve> method, which returns RDF data on
364             request. RDF data is always returned as instance of L<RDF::Trine::Model> or as
365             instance of L<RDF::Trine::Iterator> with simple statements. The
366             L<request format|/REQUEST FORMAT> is specified below. Sources can access RDF
367             for instance parsed from a file or multiple files in a directory, via HTTP,
368             from a L<RDF::Trine::Store>, or from a custom method. All sources share a set
369             of common configurations options.
370              
371             =head1 METHODS
372              
373             =head2 new ( $from {, %configuration } )
374              
375             Create a new RDF source by wrapping a code reference, a L<RDF::Trine::Model>,
376             or loading RDF data from a file or URL.
377              
378             If you pass an existing RDF::Flow::Source object, it will not be wrapped.
379              
380             A source returns RDF data as instance of L<RDF::Trine::Model> or
381             L<RDF::Trine::Iterator> when queried by a L<PSGI> requests. This is
382             similar to PSGI applications, which return HTTP responses instead of
383             RDF data. RDF::Light supports three types of sources: code references,
384             instances of RDF::Flow, and instances of RDF::Trine::Model.
385              
386             This constructor is exported as function C<rdflow> by L<RDF::Flow>:
387              
388             use RDF::Flow qw(rdflow);
389              
390             $src = rdflow( @args ); # short form
391             $src = RDF:Source->new( @args ); # explicit constructor
392              
393             =head2 init
394              
395             Called from the constructor. Can be used in your sources.
396              
397             =head2 retrieve
398              
399             Retrieve RDF data. Always returns an instance of L<RDF::Trine::Model> or
400             L<RDF::Trine::Iterator>. You can use the method L</empty_rdf> to check
401             whether the RDF data contains some triples or not.
402              
403             =head2 retrieve_rdf
404              
405             Internal method to retrieve RDF data. You should define this when
406             L<subclassing RDF::Flow::Source|RDF::Flow/DEFINING NEW SOURCE TYPES>, it
407             is called by method C<retrieve>.
408              
409             =head2 trigger_retrieved ( $source, $result [, $message ] )
410              
411             Creates a logging event at trace level to log that some result has been
412             retrieved from a source. Returns the result. By default the logging messages is
413             constructed from the source's name and the result's size. This function is
414             automatically called at the end of method C<retrieve>, so you do not have to
415             call it, if your source only implements the method C<retrieve_rdf>.
416              
417             =head2 name
418              
419             Returns the name of the source.
420              
421             =head2 about
422              
423             Returns a string with short information (name and size) of the source.
424              
425             =head2 size
426              
427             Returns the number of inputs (for multi-part sources, such as
428             L<RDF::Flow::Source::Union>).
429              
430             =head2 inputs
431              
432             Returns a list of inputs (unstable).
433              
434             =head2 id
435              
436             Returns a unique id of the source, based on its memory address.
437              
438             =head2 pipe_to
439              
440             Pipes the source to another source (L<RDF::Flow::Pipeline>).
441             C<< $a->pipe_to($b) >> is equivalent to C<< RDF::Flow::Pipeline->new($a,$b) >>.
442              
443             =head2 timestamp
444              
445             Returns an ISO 8601 timestamp and possibly sets in
446             C<rdflow.timestamp> environment variable.
447              
448             =head2 trigger_error
449              
450             Triggers an error and possibly sets the C<rdflow.error> environment variable.
451              
452             =head2 graphviz
453              
454             Purely experimental method for visualizing nets of sources.
455              
456             =head2 graphviz_addnode
457              
458             Purely experimental method for visualizing nets of sources.
459              
460             =head1 CONFIGURATION
461              
462             =over 4
463              
464             =item name
465              
466             Name of the source. Defaults to "anonymous source".
467              
468             =item from
469              
470             Filename, URL, directory, L<RDF::Trine::Model> or code reference to retrieve
471             RDF from. This option is not supported by all source types.
472              
473             =item match
474              
475             Optional regular expression or code reference to match and/or map request URIs.
476             For instance you can rewrite URNs to HTTP URIs like this:
477              
478             match => sub { $_[0] =~ s/^urn:isbn:/http://example.org/isbn/; }
479              
480             The URI in C<rdflow.uri> is set back to its original value after retrieval.
481              
482             =back
483              
484             =head1 REQUEST FORMAT
485              
486             A valid request can either by an URI (as byte string) or a hash reference, that
487             is called an environment. The environment must be a specific subset of a
488             L<PSGI> environment with the following variables:
489              
490             =over 4
491              
492             =item C<rdflow.uri>
493              
494             A request URI as byte string. If this variable is provided, no other variables
495             are needed and the following variables will not modify this value.
496              
497             =item C<psgi.url_scheme>
498              
499             A string C<http> (assumed if not set) or C<https>.
500              
501             =item C<HTTP_HOST>
502              
503             The base URL of the host for constructing an URI. This or SERVER_NAME is
504             required unless rdflow.uri is set.
505              
506             =item C<SERVER_NAME>
507              
508             Name of the host for construction an URI. Only used if HTTP_HOST is not set.
509              
510             =item C<SERVER_PORT>
511              
512             Port of the host for constructing an URI. By default C<80> is used, but not
513             kept as part of an HTTP-URI due to URI normalization.
514              
515             =item C<SCRIPT_NAME>
516              
517             Path for constructing an URI. Must start with C</> if given.
518              
519             =item C<QUERY_STRING>
520              
521             Portion of the request URI that follows the ?, if any.
522              
523             =item C<rdflow.ignorepath>
524              
525             If this variable is set, no query part is used when constructing an URI.
526              
527             =back
528              
529             The method reuses code from L<Plack::Request> by Tatsuhiko Miyagawa. Note that
530             the environment variable REQUEST_URI is not included. When this method
531             constructs a request URI from a given environment hash, it always sets the
532             variable C<rdflow.uri>, so it is always guaranteed to be set after calling.
533             However it may be the empty string, if an environment without HTTP_HOST or
534             SERVER_NAME was provided.
535              
536             =head1 FUNCTIONS
537              
538             The following functions are defined to be used in custom source types.
539              
540             =head2 rdflow_uri ( $env | $uri )
541              
542             Prepares and returns a request URI, as given by an evironment hash or by an
543             existing URI. Sets C<rdflow.uri> if an environment has been given. URI
544             construction is based on code from L<Plack>, as described in the L</REQUEST
545             FORMAT>. The following environment variables are used: C<psgi.url_scheme>,
546             C<HTTP_HOST> or C<SERVER_NAME>, C<SERVER_PORT>, C<SCRIPT_NAME>, C<PATH_INFO>,
547             C<QUERY_STRING>, and C<rdflow.ignorepath>.
548              
549             =head2 sourcelist_args ( @_ )
550              
551             Parses a list of inputs (code or other references) mixed with key-value pairs
552             and returns both separated in an array and and hash.
553              
554             =head2 iterator_to_model ( [ $iterator ] [, $model ] )
555              
556             Adds all statements from a L<RDF::Trine::Iterator> to a (possibly new)
557             L<RDF::Trine::Model> model and returns the model.
558              
559             =head2 empty_rdf ( $rdf )
560              
561             Returns true if the argument is an empty L<RDF::Trine::Model>, an
562             empty L<RDF::Trine::Iterator>, or no RDF data at all.
563              
564             =head1 AUTHOR
565              
566             Jakob Voß <voss@gbv.de>
567              
568             =head1 COPYRIGHT AND LICENSE
569              
570             This software is copyright (c) 2011 by Jakob Voß.
571              
572             This is free software; you can redistribute it and/or modify it under
573             the same terms as the Perl 5 programming language system itself.
574              
575             =cut
576