File Coverage

blib/lib/Catmandu/Importer/Pure.pm
Criterion Covered Total %
statement 63 157 40.1
branch 25 78 32.0
condition 22 71 30.9
subroutine 14 25 56.0
pod 1 2 50.0
total 125 333 37.5


line stmt bran cond sub pod time code
1             package Catmandu::Importer::Pure;
2              
3 2     2   221295 use Catmandu::Sane;
  2         404771  
  2         16  
4 2     2   648 use Catmandu::Util qw(:is);
  2         4  
  2         538  
5 2     2   1071 use URI::Escape;
  2         3121  
  2         119  
6 2     2   991 use MIME::Base64;
  2         1447  
  2         114  
7 2     2   954 use Furl;
  2         49616  
  2         64  
8 2     2   15 use Moo;
  2         6  
  2         16  
9 2     2   964 use Carp;
  2         4  
  2         125  
10 2     2   1392 use XML::LibXML;
  2         93779  
  2         16  
11 2     2   324 use XML::LibXML::XPathContext;
  2         6  
  2         49  
12 2     2   1052 use Data::Validate::URI qw(is_web_uri);
  2         86001  
  2         151  
13 2     2   21 use Scalar::Util qw(blessed);
  2         4  
  2         6084  
14              
15             our $VERSION = '0.03';
16              
17             with 'Catmandu::Importer';
18              
19             has base => ( is => 'ro' );
20             has endpoint => ( is => 'ro' );
21             has path => ( is => 'ro' );
22             has apiKey => ( is => 'ro' );
23             has user => ( is => 'ro' );
24             has password => ( is => 'ro' );
25             has post_xml => ( is => 'ro' );
26              
27             has handler =>
28             ( is => 'ro', default => sub { 'simple' }, coerce => \&_coerce_handler );
29             has options =>
30             ( is => 'ro', default => sub { +{} }, coerce => \&_coerce_options );
31             has fullResponse => ( is => 'ro', default => sub { 0 } );
32             has trim_text => ( is => 'ro', default => sub { 0 } );
33             has filter => ( is => 'ro' );
34             has userAgent => ( is => 'ro', default => sub { 'Mozilla/5.0' } );
35             has timeout => ( is => 'ro', default => sub { 50 } );
36             has furl => (
37             is => 'ro',
38             isa => sub {
39             Catmandu::BadVal->throw("Invalid furl, should be compatible with Furl")
40             unless is_maybe_able( $_[0], 'get' );
41             },
42             lazy => 1,
43             builder => \&_build_furl
44             );
45             has max_retries => ( is => 'ro', default => sub { 0 } );
46             has _currentRecordSet => ( is => 'ro' );
47             has _n => ( is => 'ro', default => sub { 0 } );
48             has _start => ( is => 'ro', default => sub { 0 } );
49             has _rs_size => ( is => 'ro', default => sub { 0 } );
50             has _total_size => ( is => 'ro', default => sub { 0 } );
51             has _next_url => ( is => 'ro');
52              
53              
54             sub BUILD {
55 24     24 0 122 my $self = shift;
56              
57 24 100 100     211 Catmandu::BadVal->throw("Base URL, endpoint and apiKey are required")
      100        
58             unless $self->base && $self->endpoint && $self->apiKey;
59              
60 21 100 100     79 Catmandu::BadVal->throw( "Password is needed for user " . $self->user )
61             if $self->user && !$self->password;
62              
63 20 100 100     71 Catmandu::BadVal->throw("Invalid filter, filter should be a CODE ref")
64             if $self->filter && !is_code_ref( $self->filter );
65              
66 19 100       63 Catmandu::BadVal->throw(
67             "Invalid value for timeout, should be non negative integer")
68             if !is_natural( $self->timeout );
69              
70 18 50       182 Catmandu::BadVal->throw(
71             "Invalid value for max_retries, should be non negative integer")
72             if !is_natural( $self->max_retries );
73              
74 18         119 my $url = $self->base;
75              
76             # remove first any username password:
77 18         79 $url =~ s|^(\w+://)[^\@/]+[:][^\@/]*\@|$1|;
78 18 100       419 if ( !is_web_uri($url) ) {
79 2         497 Catmandu::BadVal->throw( "Invalid base URL: " . $self->base );
80             }
81              
82 16         7509 my $options = $self->options;
83              
84 16 50 33     84 if ( !$self->fullResponse && $self->post_xml ) {
85 0 0 0     0 if ( $options->{offset} || $options->{page} ||
      0        
      0        
86             (defined $options->{size} && $options->{size}==0) ) {
87 0         0 $self->{fullResponse} = 1;
88             }
89             }
90 16 100 66     168 if ( !$self->fullResponse && $options->{offset} ) {
91 1         14 $self->{_start} = $options->{offset};
92             }
93             }
94              
95             sub _build_furl {
96 0     0   0 my ( $user, $password, $apiKey ) = ( $_[0]->user, $_[0]->password, $_[0]->apiKey );
97 0         0 my @headers;
98              
99 0 0       0 push @headers,
100             ( 'Authorization' => ( 'Basic ' . encode_base64("$user:$password") ) )
101             if $user;
102 0 0       0 push @headers, ( 'api-key' => $apiKey )
103             if $apiKey;
104 0         0 Furl->new(
105             agent => $_[0]->userAgent,
106             timeout => $_[0]->timeout,
107             headers => \@headers
108             );
109             }
110              
111             sub _coerce_handler {
112 27     27   152 my ($handler) = @_;
113              
114 27 100 100     224 return $handler if is_invocant($handler) or is_code_ref($handler);
115              
116 25 100 100     137 if ( is_string($handler) && !is_number($handler) ) {
117 23 100       88 my $class =
118             $handler =~ /^\+(.+)/
119             ? $1
120             : "Catmandu::Importer::Pure::Parser::$handler";
121              
122 23         35 my $handler;
123 23         37 eval { $handler = Catmandu::Util::require_package($class)->new; };
  23         74  
124 23 100       6034 if ($@) {
125 1         874 Catmandu::Error->throw("Unable to load handler $class: $@");
126             } else {
127 22         397 return $handler;
128             }
129             }
130              
131 2   50     6 $handler ||= '';
132 2         16 Catmandu::BadVal->throw("Invalid handler: '$handler'");
133             }
134              
135             sub _coerce_options {
136 24     24   69 my ($options) = @_;
137              
138 24 100       352 return $options if !%$options;
139            
140             return { # arrays to comman separated values
141 3 50       13 map { $_ => (ref $options->{$_} eq 'ARRAY' ? join (',', @{$options->{$_}}) : $options->{$_})}
  3         61  
  0            
142             keys %$options
143             };
144             }
145              
146             sub _request {
147 0     0     my ( $self, $url, $rcontent ) = @_;
148              
149 0           $self->log->debug("requesting $url\n");
150              
151 0           my $res;
152 0           my $tries = $self->max_retries;
153             try {
154 0   0 0     do {
      0        
155            
156 0 0         $res = $rcontent
157             ? $self->furl->post($url, ['Content-Type' => 'application/xml'], $$rcontent)
158             : $self->furl->get($url);
159             } while ( $res->status >= 500 && $tries-- && sleep(10) )
160             ; # Retry on server error;
161 0 0 0       die( $res->status_line )
      0        
162             unless $res->is_success
163             || ( $res->content && $res->content =~ m|<\?xml| );
164 0           return $res->content;
165             }
166             catch {
167 0     0     Catmandu::Error->throw(
168             "Requested '$url'\nStatus code: " . $res->status_line );
169 0           };
170             }
171              
172             sub _url {
173 0     0     my ( $self, $options ) = @_;
174              
175 0 0         my $url = $self->base . '/' . $self->endpoint
176             . ($self->path ? '/' . $self->path : '');
177              
178 0 0 0       if ($options && %$options) {
179             $url .= '?' . join '&',
180 0           map { "$_=" . uri_escape( $options->{$_}, "^A-Za-z0-9\-\._~," ) }
181 0           sort keys %{$options};
  0            
182             }
183 0           return $url;
184             }
185              
186             sub _nextRecordSet {
187 0     0     my ($self) = @_;
188              
189 0           my %options = %{ $self->options };
  0            
190            
191 0 0 0       if (!$self->fullResponse && $self->post_xml) {
192 0           $options{offset} = $self->_start;
193             }
194            
195 0   0       my $url = $self->_next_url || $self->_url( \%options );
196              
197 0 0         my $xml = $self->_request( $url, ($self->post_xml ? \$self->post_xml : undef) );
198              
199 0 0         if ( $self->filter ) {
200 0           &{ $self->filter }( \$xml );
  0            
201             }
202              
203 0           my $hash = $self->_hashify($xml);
204              
205 0 0         if ( exists $hash->{'error'} ) {
206             Catmandu::Error->throw(
207             "Requested '$url'\nPure REST Error ($hash->{error}{code}): "
208             . $hash->{error}{title}
209             . (
210             $hash->{'error'}{'description'}
211             ? "\nDescription:\n" . $hash->{error}{description}
212 0 0         : ''
213             )
214             );
215             }
216              
217 0 0         if ( $self->fullResponse ) {
218 0           $self->{_rs_size} = 1;
219 0           $self->{_total_size} = 1;
220 0           return $hash->{results}; #check
221             }
222            
223 0           $self->{_next_url} = $hash->{next_url}; #only GET requests
224              
225             # get total number of results
226 0           $self->{_total_size} = $hash->{count};
227              
228 0           my $set = $hash->{results};
229              
230 0           $self->{_rs_size} = scalar(@$set);
231              
232 0           return $set;
233             }
234              
235             # Internal: gets the next record from our current resultset.
236             # Returns a hash representation of the next record.
237             sub _nextRecord {
238 0     0     my ($self) = @_;
239              
240             # fetch recordset if we don't have one yet.
241 0   0       $self->{_currentRecordSet} ||= $self->_nextRecordSet || return;
      0        
242              
243             return
244 0 0 0       if (!$self->_next_url ) && $self->_total_size
      0        
245             && ( $self->_start + $self->_n ) >=
246             $self->_total_size; # no more results
247              
248             # check for a exhausted recordset.
249 0 0         if ( $self->_n >= $self->_rs_size ) {
250 0           $self->{_start} += $self->_rs_size;
251 0           $self->{_n} = 0;
252 0           $self->{_currentRecordSet} = $self->_nextRecordSet;
253             }
254            
255 0           my $record_dom = $self->_currentRecordSet->[ $self->{_n}++ ];
256            
257 0           return $self->_handle_record($record_dom);
258              
259             }
260              
261             # Internal: Converts XML to a perl hash.
262             # $in - the raw XML input.
263             # Returns a hash representation of the given XML.
264             sub _hashify {
265 0     0     my ( $self, $in ) = @_;
266              
267 0           my $parser = XML::LibXML->new();
268 0           my $doc = $parser->load_xml( string => $in );
269 0           my $root = $doc->documentElement;
270 0           my $xc = XML::LibXML::XPathContext->new($root);
271              
272 0 0         if ( $self->trim_text ) {
273 0           my $all_text_nodes = $doc->findnodes('//text()');
274             $all_text_nodes->foreach(
275             sub {
276 0     0     my $node = shift;
277 0           my $t = $node->data;
278 0   0       my $subs_done =
      0        
279             ( $t =~ s/\A\s+// || 0 ) + ( $t =~ s/\s+\Z// || 0 );
280 0 0         $node->setData($t) if $subs_done;
281             }
282 0           );
283             }
284              
285 0           my $out;
286              
287 0 0         if ( $xc->exists('/error') ) {
288 0           my $code = $xc->findvalue('/error/code');
289 0           my $title = $xc->findvalue('/error/title');
290 0           my $description = $xc->findvalue('/error/description');
291              
292 0           $out->{error} = { code => $code, title => $title, description => $description };
293 0           return $out;
294             }
295              
296 0           my $next_url = $xc->findvalue('/result/navigationLink[@ref="next"]/@href|/result/navigationLinks/navigationLink[@ref="next"]/@href');
297 0 0         $next_url =~ s/&amp;/&/g if $next_url;
298 0 0         $out->{next_url} = $next_url if $next_url;
299              
300 0           $out->{count} = $xc->findvalue("/result/count");
301              
302 0           my @result_nodes;
303              
304 0 0         if ( $xc->exists('/result/items') ) {
    0          
305 0           @result_nodes = $xc->findnodes('/result/items/*');
306             } elsif ($self->endpoint eq 'changes') {
307 0           @result_nodes = $xc->findnodes('/result/contentChange');
308             } else {
309 0           @result_nodes = $xc->findnodes('/result/*[@uuid]');
310             };
311              
312 0 0         if ( $self->fullResponse ) {
313 0           $out->{results} = [$root];
314 0           return $out;
315             }
316              
317 0           $out->{results} = \@result_nodes;
318              
319 0           return $out;
320             }
321              
322             sub _handle_record {
323 0     0     my ( $self, $dom ) = @_;
324 0 0         return unless $dom;
325              
326 0 0         return blessed( $self->handler )
327             ? $self->handler->parse($dom)
328             : $self->handler->($dom);
329             }
330              
331              
332             # Public Methods. --------------------------------------------------------------
333              
334             sub url {
335 0     0 1   my ($self) = @_;
336 0           return $self->_url( $self->options )
337             ;
338             }
339              
340             sub generator {
341             my ($self) = @_;
342              
343             return sub {
344             $self->_nextRecord;
345             };
346             }
347              
348             1;
349              
350             =head1 NAME
351              
352             Catmandu::Importer::Pure - Package that imports Pure data.
353              
354             =head1 SYNOPSIS
355              
356             # From the command line
357             $ catmandu convert Pure \
358             --base https://host/ws/api/... \
359             --endpoint research-outputs \
360             --apiKey "..."
361              
362             # In Perl
363             use Catmandu;
364              
365             my %attrs = (
366             base => 'http://host/path',
367             endpoint => 'research-outputs',
368             apiKey => '...',
369             );
370              
371             TODO: add options
372              
373              
374             my $importer = Catmandu->importer('Pure', %attrs);
375              
376             my $n = $importer->each(sub {
377             my $hashref = $_[0];
378             # ...
379             });
380              
381             # get number of valid and approved publications
382             my $count = Catmandu->importer(
383             'Pure',
384             base => base,
385             endpoint => 'research-outputs',
386             apiKey => '...',
387             fullResponse => 1,
388             post_xml => '<?xml version="1.0" encoding="utf-8"?>'
389             . '<researchOutputsQuery>',
390             . '<workflowSteps>approved</workflowSteps>',
391             . '</researchOutputsQuery>',
392             )->first->{result}{count};
393              
394             =head1 DESCRIPTION
395              
396             Catmandu::Importer::Pure is a Catmandu package that seamlessly imports data from Elsevier's Pure system using its REST service.
397             In order to use the Pure Web Service you need an API key. List of all available endpoints and further documentation can currently
398             be found under /ws on a webserver that is running Pure. Note that this version of the importer is tested with Pure API version
399             5.18 and might not work with later versions.
400              
401             =head1 CONFIGURATION
402              
403             =over
404              
405             =item base
406              
407             Base URL for the REST service is required, for example 'http://purehost.com/ws/api/518'
408              
409             =item endpoint
410              
411             Valid endpoint is required, like 'research-outputs'
412              
413             =item apiKey
414              
415             Valid API key is required for access
416              
417             =item path
418              
419             Path after the endpoint
420              
421             =item user
422              
423             User name if basic authentication is used
424              
425             =item password
426              
427             Password if basic authentication is used
428              
429             =item options
430              
431             Options passed as parameters to the REST service, for example:
432             {
433             'size' => 20,
434             'fields' => 'title,type,authors.*'
435             }
436              
437              
438             =item post_xml
439              
440             xml containing a query that will be submitted with a POST request
441              
442             =item fullResponse
443              
444             Optional flag. If true delivers the complete results as a single item (record), corresponding to the
445             XML response received. Only one request to the REST service is made in this case. Default is false.
446              
447             If the flag is false then the items are set to child
448             elements of the element 'result' or in case the 'result' element does not exist they are set to child elements
449             of the root element for each response.
450              
451             =item handler( sub {} | $object | 'NAME' | '+NAME' )
452              
453             Handler to transform each record from XML DOM (L<XML::LibXML::Element>) into
454             Perl hash.
455              
456             Handlers can be provided as function reference, an instance of a Perl
457             package that implements 'parse', or by a package NAME. Package names should
458             be prepended by C<+> or prefixed with C<Catmandu::Importer::Pure::Parser>. E.g
459             C<foobar> will create a C<Catmandu::Importer::Pure::Parser::foobar> instance.
460              
461             By default the handler L<Catmandu::Importer::Pure::Parser::simple> is used.
462             It provides a simple XML parsing, using XML::LibXML::Simple,
463              
464             Other possible values are L<Catmandu::Importer::Pure::Parser::struct> for XML::Struct
465             based structure that preserves order and L<Catmandu::Importer::Pure::Parser::raw> that
466             returns the XML as it is.
467              
468             =item userAgent
469              
470             HTTP user agent string, set to C<Mozilla/5.0> by default.
471              
472             =item furl
473              
474             Instance of L<Furl> or compatible class to fetch URLs with.
475              
476             =item timeout
477              
478             Timeout for HTTP requests in seonds. Defaults to 50.
479              
480             =item trim_text
481              
482             Optional flag. If true then all text nodes in the REST response are trimmed so that any leading and trailing whitespace is removed before parsing.
483             This is useful if you don't want to risk getting leading and trailing whitespace in your data, since Pure doesn't currently clean leading/trailing white space from
484             user input. Note that there is a small performance penalty when using this option. Default is false.
485              
486             =item filter( sub {} )
487              
488             Optional reference to function that processes the XML response before it is parsed. The argument to the function is a reference to the XML text,
489             which is then used to modify it. This is option is normally not needed but can helpful if there is a problem parsing the response due to a bug
490             in the REST service.
491              
492             =back
493              
494             =head1 METHODS
495              
496             In addition to methods inherited from Catmandu::Iterable, this module provides the following public methods:
497              
498             =over
499              
500             =item B<url >
501              
502             Return the current Pure REST request URL (useful for debugging).
503              
504             =back
505              
506             =head1 SEE ALSO
507              
508             L<Catmandu::Importer>
509              
510             L<Catmandu::Iterable>
511              
512             Furl
513              
514             http://librecat.org
515              
516             =head1 AUTHOR
517              
518             Snorri Briem E<lt>briem@cpan.orgE<gt>
519              
520             =head1 COPYRIGHT
521              
522             Copyright 2017- Lund University Library
523              
524             =head1 LICENSE
525              
526             This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.
527              
528             =cut