File Coverage

blib/lib/Catmandu/Importer/Pure.pm
Criterion Covered Total %
statement 63 157 40.1
branch 25 78 32.0
condition 22 71 30.9
subroutine 14 25 56.0
pod 1 2 50.0
total 125 333 37.5


line stmt bran cond sub pod time code
1             package Catmandu::Importer::Pure;
2              
3 2     2   180866 use Catmandu::Sane;
  2         330202  
  2         12  
4 2     2   828 use Catmandu::Util qw(:is);
  2         3  
  2         452  
5 2     2   834 use URI::Escape;
  2         2597  
  2         108  
6 2     2   799 use MIME::Base64;
  2         1118  
  2         92  
7 2     2   798 use Furl;
  2         41629  
  2         60  
8 2     2   13 use Moo;
  2         3  
  2         16  
9 2     2   889 use Carp;
  2         3  
  2         107  
10 2     2   1139 use XML::LibXML;
  2         77798  
  2         13  
11 2     2   292 use XML::LibXML::XPathContext;
  2         4  
  2         42  
12 2     2   861 use Data::Validate::URI qw(is_web_uri);
  2         72084  
  2         120  
13 2     2   16 use Scalar::Util qw(blessed);
  2         2  
  2         5072  
14              
15             our $VERSION = '0.04';
16              
17             with 'Catmandu::Importer';
18              
19             has base => ( is => 'ro' );
20             has endpoint => ( is => 'ro' );
21             has path => ( is => 'ro' );
22             has apiKey => ( is => 'ro' );
23             has user => ( is => 'ro' );
24             has password => ( is => 'ro' );
25             has post_xml => ( is => 'ro' );
26              
27             has handler =>
28             ( is => 'ro', default => sub { 'simple' }, coerce => \&_coerce_handler );
29             has options =>
30             ( is => 'ro', default => sub { +{} }, coerce => \&_coerce_options );
31             has fullResponse => ( is => 'ro', default => sub { 0 } );
32             has trim_text => ( is => 'ro', default => sub { 0 } );
33             has filter => ( is => 'ro' );
34             has userAgent => ( is => 'ro', default => sub { 'Mozilla/5.0' } );
35             has timeout => ( is => 'ro', default => sub { 50 } );
36             has furl => (
37             is => 'ro',
38             isa => sub {
39             Catmandu::BadVal->throw("Invalid furl, should be compatible with Furl")
40             unless is_maybe_able( $_[0], 'get' );
41             },
42             lazy => 1,
43             builder => \&_build_furl
44             );
45             has max_retries => ( is => 'ro', default => sub { 0 } );
46             has _currentRecordSet => ( is => 'ro' );
47             has _n => ( is => 'ro', default => sub { 0 } );
48             has _start => ( is => 'ro', default => sub { 0 } );
49             has _rs_size => ( is => 'ro', default => sub { 0 } );
50             has _total_size => ( is => 'ro', default => sub { 0 } );
51             has _next_url => ( is => 'ro');
52              
53              
54             sub BUILD {
55 24     24 0 109 my $self = shift;
56              
57 24 100 100     167 Catmandu::BadVal->throw("Base URL, endpoint and apiKey are required")
      100        
58             unless $self->base && $self->endpoint && $self->apiKey;
59              
60 21 100 100     75 Catmandu::BadVal->throw( "Password is needed for user " . $self->user )
61             if $self->user && !$self->password;
62              
63 20 100 100     61 Catmandu::BadVal->throw("Invalid filter, filter should be a CODE ref")
64             if $self->filter && !is_code_ref( $self->filter );
65              
66 19 100       51 Catmandu::BadVal->throw(
67             "Invalid value for timeout, should be non negative integer")
68             if !is_natural( $self->timeout );
69              
70 18 50       154 Catmandu::BadVal->throw(
71             "Invalid value for max_retries, should be non negative integer")
72             if !is_natural( $self->max_retries );
73              
74 18         94 my $url = $self->base;
75              
76             # remove first any username password:
77 18         65 $url =~ s|^(\w+://)[^\@/]+[:][^\@/]*\@|$1|;
78 18 100       345 if ( !is_web_uri($url) ) {
79 2         410 Catmandu::BadVal->throw( "Invalid base URL: " . $self->base );
80             }
81              
82 16         6215 my $options = $self->options;
83              
84 16 50 33     74 if ( !$self->fullResponse && $self->post_xml ) {
85 0 0 0     0 if ( $options->{offset} || $options->{page} ||
      0        
      0        
86             (defined $options->{size} && $options->{size}==0) ) {
87 0         0 $self->{fullResponse} = 1;
88             }
89             }
90 16 100 66     142 if ( !$self->fullResponse && $options->{offset} ) {
91 1         11 $self->{_start} = $options->{offset};
92             }
93             }
94              
95             sub _build_furl {
96 0     0   0 my ( $user, $password, $apiKey ) = ( $_[0]->user, $_[0]->password, $_[0]->apiKey );
97 0         0 my @headers;
98              
99 0 0       0 push @headers,
100             ( 'Authorization' => ( 'Basic ' . encode_base64("$user:$password") ) )
101             if $user;
102 0 0       0 push @headers, ( 'api-key' => $apiKey )
103             if $apiKey;
104 0         0 Furl->new(
105             agent => $_[0]->userAgent,
106             timeout => $_[0]->timeout,
107             headers => \@headers
108             );
109             }
110              
111             sub _coerce_handler {
112 27     27   121 my ($handler) = @_;
113              
114 27 100 100     198 return $handler if is_invocant($handler) or is_code_ref($handler);
115              
116 25 100 100     119 if ( is_string($handler) && !is_number($handler) ) {
117 23 100       72 my $class =
118             $handler =~ /^\+(.+)/
119             ? $1
120             : "Catmandu::Importer::Pure::Parser::$handler";
121              
122 23         27 my $handler;
123 23         33 eval { $handler = Catmandu::Util::require_package($class)->new; };
  23         65  
124 23 100       5199 if ($@) {
125 1         725 Catmandu::Error->throw("Unable to load handler $class: $@");
126             } else {
127 22         333 return $handler;
128             }
129             }
130              
131 2   50     6 $handler ||= '';
132 2         15 Catmandu::BadVal->throw("Invalid handler: '$handler'");
133             }
134              
135             sub _coerce_options {
136 24     24   59 my ($options) = @_;
137              
138 24 100       291 return $options if !%$options;
139            
140             return { # arrays to comman separated values
141 3 50       11 map { $_ => (ref $options->{$_} eq 'ARRAY' ? join (',', @{$options->{$_}}) : $options->{$_})}
  3         52  
  0            
142             keys %$options
143             };
144             }
145              
146             sub _request {
147 0     0     my ( $self, $url, $rcontent ) = @_;
148              
149 0           $self->log->debug("requesting $url\n");
150              
151 0           my $res;
152 0           my $tries = $self->max_retries;
153             try {
154 0   0 0     do {
      0        
155            
156 0 0         $res = $rcontent
157             ? $self->furl->post($url, ['Content-Type' => 'application/xml'], $$rcontent)
158             : $self->furl->get($url);
159             } while ( $res->status >= 500 && $tries-- && sleep(10) )
160             ; # Retry on server error;
161 0 0 0       die( $res->status_line )
      0        
162             unless $res->is_success
163             || ( $res->content && $res->content =~ m|<\?xml| );
164 0           return $res->content;
165             }
166             catch {
167 0     0     Catmandu::Error->throw(
168             "Requested '$url'\nStatus code: " . $res->status_line );
169 0           };
170             }
171              
172             sub _url {
173 0     0     my ( $self, $options ) = @_;
174              
175 0 0         my $url = $self->base . '/' . $self->endpoint
176             . ($self->path ? '/' . $self->path : '');
177              
178 0 0 0       if ($options && %$options) {
179             $url .= '?' . join '&',
180 0           map { "$_=" . uri_escape( $options->{$_}, "^A-Za-z0-9\-\._~," ) }
181 0           sort keys %{$options};
  0            
182             }
183 0           return $url;
184             }
185              
186             sub _nextRecordSet {
187 0     0     my ($self) = @_;
188              
189 0           my %options = %{ $self->options };
  0            
190            
191 0 0 0       if (!$self->fullResponse && $self->post_xml) {
192 0           $options{offset} = $self->_start;
193             }
194            
195 0   0       my $url = $self->_next_url || $self->_url( \%options );
196              
197 0 0         my $xml = $self->_request( $url, ($self->post_xml ? \$self->post_xml : undef) );
198              
199 0 0         if ( $self->filter ) {
200 0           &{ $self->filter }( \$xml );
  0            
201             }
202              
203 0           my $hash = $self->_hashify($xml);
204              
205 0 0         if ( exists $hash->{'error'} ) {
206             Catmandu::Error->throw(
207             "Requested '$url'\nPure REST Error ($hash->{error}{code}): "
208             . $hash->{error}{title}
209             . (
210             $hash->{'error'}{'description'}
211             ? "\nDescription:\n" . $hash->{error}{description}
212 0 0         : ''
213             )
214             );
215             }
216              
217 0 0         if ( $self->fullResponse ) {
218 0           $self->{_rs_size} = 1;
219 0           $self->{_total_size} = 1;
220 0           return $hash->{results}; #check
221             }
222            
223 0           $self->{_next_url} = $hash->{next_url}; #only GET requests
224              
225             # get total number of results
226 0           $self->{_total_size} = $hash->{count};
227              
228 0           my $set = $hash->{results};
229              
230 0           $self->{_rs_size} = scalar(@$set);
231              
232 0           return $set;
233             }
234              
235             # Internal: gets the next record from our current resultset.
236             # Returns a hash representation of the next record.
237             sub _nextRecord {
238 0     0     my ($self) = @_;
239              
240             # fetch recordset if we don't have one yet.
241 0   0       $self->{_currentRecordSet} ||= $self->_nextRecordSet || return;
      0        
242              
243             return
244 0 0 0       if (!$self->_next_url ) && $self->_total_size
      0        
245             && ( $self->_start + $self->_n ) >=
246             $self->_total_size; # no more results
247              
248             # check for a exhausted recordset.
249 0 0         if ( $self->_n >= $self->_rs_size ) {
250 0           $self->{_start} += $self->_rs_size;
251 0           $self->{_n} = 0;
252 0           $self->{_currentRecordSet} = $self->_nextRecordSet;
253             }
254            
255 0           my $record_dom = $self->_currentRecordSet->[ $self->{_n}++ ];
256            
257 0           return $self->_handle_record($record_dom);
258              
259             }
260              
261             # Internal: Converts XML to a perl hash.
262             # $in - the raw XML input.
263             # Returns a hash representation of the given XML.
264             sub _hashify {
265 0     0     my ( $self, $in ) = @_;
266              
267 0           my $parser = XML::LibXML->new();
268 0           my $doc = $parser->load_xml( string => $in );
269 0           my $root = $doc->documentElement;
270 0           my $xc = XML::LibXML::XPathContext->new($root);
271              
272 0 0         if ( $self->trim_text ) {
273 0           my $all_text_nodes = $doc->findnodes('//text()');
274             $all_text_nodes->foreach(
275             sub {
276 0     0     my $node = shift;
277 0           my $t = $node->data;
278 0   0       my $subs_done =
      0        
279             ( $t =~ s/\A\s+// || 0 ) + ( $t =~ s/\s+\Z// || 0 );
280 0 0         $node->setData($t) if $subs_done;
281             }
282 0           );
283             }
284              
285 0           my $out;
286              
287 0 0         if ( $xc->exists('/error') ) {
288 0           my $code = $xc->findvalue('/error/code');
289 0           my $title = $xc->findvalue('/error/title');
290 0           my $description = $xc->findvalue('/error/description');
291              
292 0           $out->{error} = { code => $code, title => $title, description => $description };
293 0           return $out;
294             }
295              
296 0           my $next_url = $xc->findvalue('/result/navigationLink[@ref="next"]/@href|/result/navigationLinks/navigationLink[@ref="next"]/@href');
297 0 0         $next_url =~ s/&amp;/&/g if $next_url;
298 0 0         $out->{next_url} = $next_url if $next_url;
299              
300 0           $out->{count} = $xc->findvalue("/result/count");
301              
302 0           my @result_nodes;
303              
304 0 0         if ( $xc->exists('/result/items') ) {
    0          
305 0           @result_nodes = $xc->findnodes('/result/items/*');
306             } elsif ($self->endpoint eq 'changes') {
307 0           @result_nodes = $xc->findnodes('/result/contentChange');
308             } else {
309 0           @result_nodes = $xc->findnodes('/result/*[@uuid]');
310             };
311              
312 0 0         if ( $self->fullResponse ) {
313 0           $out->{results} = [$root];
314 0           return $out;
315             }
316              
317 0           $out->{results} = \@result_nodes;
318              
319 0           return $out;
320             }
321              
322             sub _handle_record {
323 0     0     my ( $self, $dom ) = @_;
324 0 0         return unless $dom;
325              
326 0 0         return blessed( $self->handler )
327             ? $self->handler->parse($dom)
328             : $self->handler->($dom);
329             }
330              
331              
332             # Public Methods. --------------------------------------------------------------
333              
334             sub url {
335 0     0 1   my ($self) = @_;
336 0           return $self->_url( $self->options )
337             ;
338             }
339              
340             sub generator {
341             my ($self) = @_;
342              
343             return sub {
344             $self->_nextRecord;
345             };
346             }
347              
348             1;
349              
350             =head1 NAME
351              
352             Catmandu::Importer::Pure - Package that imports Pure data.
353              
354             =head1 SYNOPSIS
355              
356             # From the command line
357             $ catmandu convert Pure \
358             --base https://host/ws/api/... \
359             --endpoint research-outputs \
360             --apiKey "..."
361              
362             # In Perl
363             use Catmandu;
364              
365             my %attrs = (
366             base => 'https://host/path',
367             endpoint => 'research-outputs',
368             apiKey => '...',
369             options => { 'fields' => 'title,type,authors.*' }
370             );
371              
372             my $importer = Catmandu->importer('Pure', %attrs);
373              
374             my $n = $importer->each(sub {
375             my $hashref = $_[0];
376             # ...
377             });
378              
379             # get number of validated and approved publications
380             my $count = Catmandu->importer(
381             'Pure',
382             base => 'https://host/path',
383             endpoint => 'research-outputs',
384             apiKey => '...',
385             fullResponse => 1,
386             post_xml => '<?xml version="1.0" encoding="utf-8"?>'
387             . '<researchOutputsQuery>'
388             . '<size>0</size>'
389             . '<workflowSteps>'
390             . ' <workflowStep>approved</workflowStep>'
391             . ' <workflowStep>validated</workflowStep>'
392             . '</workflowSteps>'
393             . '</researchOutputsQuery>'
394             )->first->{result}[0]{count};
395              
396             =head1 DESCRIPTION
397              
398             Catmandu::Importer::Pure is a Catmandu package that seamlessly imports data from Elsevier's Pure system using its REST service.
399             In order to use the Pure Web Service you need an API key. List of all available endpoints and further documentation can currently
400             be found under /ws on a webserver that is running Pure. Note that this version of the importer is tested with Pure API version
401             5.18 and might not work with later versions.
402              
403             =head1 CONFIGURATION
404              
405             =over
406              
407             =item base
408              
409             Base URL for the REST service is required, for example 'http://purehost.com/ws/api/518'
410              
411             =item endpoint
412              
413             Valid endpoint is required, like 'research-outputs'
414              
415             =item apiKey
416              
417             Valid API key is required for access
418              
419             =item path
420              
421             Path after the endpoint
422              
423             =item user
424              
425             User name if basic authentication is used
426              
427             =item password
428              
429             Password if basic authentication is used
430              
431             =item options
432              
433             Options passed as parameters to the REST service, for example:
434             {
435             'size' => 20,
436             'fields' => 'title,type,authors.*'
437             }
438              
439              
440             =item post_xml
441              
442             xml containing a query that will be submitted with a POST request
443              
444             =item fullResponse
445              
446             Optional flag. If true delivers the complete results as a single item (record), corresponding to the
447             XML response received. Only one request to the REST service is made in this case. Default is false.
448              
449             If the flag is false then the items are set to child
450             elements of the element 'result' or in case the 'result' element does not exist they are set to child elements
451             of the root element for each response.
452              
453             =item handler( sub {} | $object | 'NAME' | '+NAME' )
454              
455             Handler to transform each record from XML DOM (L<XML::LibXML::Element>) into
456             Perl hash.
457              
458             Handlers can be provided as function reference, an instance of a Perl
459             package that implements 'parse', or by a package NAME. Package names should
460             be prepended by C<+> or prefixed with C<Catmandu::Importer::Pure::Parser>. E.g
461             C<foobar> will create a C<Catmandu::Importer::Pure::Parser::foobar> instance.
462              
463             By default the handler L<Catmandu::Importer::Pure::Parser::simple> is used.
464             It provides a simple XML parsing, using XML::LibXML::Simple,
465              
466             Other possible values are L<Catmandu::Importer::Pure::Parser::struct> for XML::Struct
467             based structure that preserves order and L<Catmandu::Importer::Pure::Parser::raw> that
468             returns the XML as it is.
469              
470             =item userAgent
471              
472             HTTP user agent string, set to C<Mozilla/5.0> by default.
473              
474             =item furl
475              
476             Instance of L<Furl> or compatible class to fetch URLs with.
477              
478             =item timeout
479              
480             Timeout for HTTP requests in seonds. Defaults to 50.
481              
482             =item trim_text
483              
484             Optional flag. If true then all text nodes in the REST response are trimmed so that any leading and trailing whitespace is removed before parsing.
485             This is useful if you don't want to risk getting leading and trailing whitespace in your data, since Pure doesn't currently clean leading/trailing white space from
486             user input. Note that there is a small performance penalty when using this option. Default is false.
487              
488             =item filter( sub {} )
489              
490             Optional reference to function that processes the XML response before it is parsed. The argument to the function is a reference to the XML text,
491             which is then used to modify it. This is option is normally not needed but can helpful if there is a problem parsing the response due to a bug
492             in the REST service.
493              
494             =back
495              
496             =head1 METHODS
497              
498             In addition to methods inherited from Catmandu::Iterable, this module provides the following public methods:
499              
500             =over
501              
502             =item B<url >
503              
504             Return the current Pure REST request URL (useful for debugging).
505              
506             =back
507              
508             =head1 SEE ALSO
509              
510             L<Catmandu>
511              
512             L<Catmandu::Importer>
513              
514             L<Catmandu::Iterable>
515              
516             L<Furl>
517              
518             =head1 AUTHOR
519              
520             Snorri Briem E<lt>briem@cpan.orgE<gt>
521              
522             =head1 COPYRIGHT
523              
524             Copyright 2017- Lund University Library
525              
526             =head1 LICENSE
527              
528             This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.
529              
530             =cut