File Coverage

blib/lib/Museum/Rijksmuseum/Object/Harvester.pm
Criterion Covered Total %
statement 27 60 45.0
branch 0 24 0.0
condition 0 17 0.0
subroutine 9 10 90.0
pod 1 1 100.0
total 37 112 33.0


line stmt bran cond sub pod time code
1             package Museum::Rijksmuseum::Object::Harvester;
2              
3 1     1   1270 use strictures 2;
  1         9  
  1         44  
4              
5 1     1   215 use Carp;
  1         2  
  1         64  
6 1     1   449 use HTTP::OAI;
  1         120260  
  1         38  
7 1     1   8 use Moo;
  1         4  
  1         10  
8 1     1   415 use Time::HiRes qw( sleep );
  1         2  
  1         10  
9 1     1   110 use URI;
  1         3  
  1         22  
10 1     1   6 use URI::QueryParam;
  1         2  
  1         22  
11              
12             =head1 NAME
13              
14             Museum::Rijksmuseum::Object::Harvester - Bulk-fetching of Rijksmuseum data via the OAI-PMH interface
15              
16             =head1 VERSION
17              
18             See L<Museum::Rijksmuseum::Object>
19              
20             =cut
21              
22 1     1   4 use Museum::Rijksmuseum::Object; our $VERSION = $Museum::Rijksmuseum::Object::VERSION;
  1         2  
  1         34  
23 1     1   6 use namespace::clean;
  1         2  
  1         11  
24              
25             =head1 SYNOPSIS
26              
27             Does a bulk fetch of the Rijksmuseum collection database using the OAI-PMH
28             interface. For each record a callback will be called with the data. Note that
29             the format of this data won't necessarily be the same as returned by the
30             L<Museum::Rijksmuseum::Object> calls, as it's coming from a different endpoint.
31              
32             use Museum::Rijksmuseum::Object::Harvester;
33              
34             my $h = Museum::Rijksmuseum::Object::Harvester->new( key => 'abc123xyz' );
35             my $status = $h->harvest(
36             set => 'subject:PublicDomainImages',
37             from => '2023-01-01',
38             type => 'identifiers',
39             callback => \&process_record,
40             );
41             if ( $status->{error} ) {
42             die "Error: $status->{error}\nLast resumption token: $status->{resumptionToken}\n";
43             }
44             if ( $status->{resumptionToken} ) {
45             print "Finished, token: $status->{resumptionToken}\n";
46             }
47              
48             =head1 SUBROUTINES/METHODS
49              
50             =head2 new
51              
52             my $h = Museum::Rijksmuseum::Object::Harvester->new( key => 'abc123xyz' );
53              
54             Create a new instance of the harvester. C<key> is required.
55              
56             =cut
57              
58             =head2 harvest
59              
60             my $status = $h->harvest(
61             set => 'subject:PublicDomainImages',
62             from => '2023-01-01',
63             to => '2023-01-31',
64             resumptionToken => $last_token_you_saw,
65             delay => 1_000, # 1 second
66             type => 'identifiers',
67             callback => \&process_record,
68             );
69            
70             Begins harvesting the records from the Rijksmuseum. The only required fields
71             are C<callback> and C<type>, but the default delay is 10 seconds so you
72             probably want to think about putting something sensible in there (or leave it
73             at 10 seconds if you don't mind being very polite.) If you have a resumption
74             token, perhaps you're recovering from a previous failure, you can supply that.
75             C<from> and C<to> are not defined in the API documentation, so it's uncertain
76             what they refer to. Latest update time maybe?
77              
78             C<type> can in theory be C<identifiers> or C<records> (mapping to
79             C<ListIdenifiers> and C<ListRecords> internally), but C<records> is currently
80             unsupported as at writing time I don't need it and it's a fair bit of work to
81             do right.
82              
83             C<callback> will be called for every identifier or record, in the case of
84             identifers it'll eceive a hashref containing C<identifier> and C<datestamp>.
85             If the callback returns a non-false value (i.e. any value), we quietly shut down.
86             Due to the way resumption tokens work (i.e. they can be the same for subsequent
87             requests), even if you request a shutdown, you'll still be fed the rest of the
88             batch. This helps avoid missing records.
89              
90             The return value is a hashref that contains C<error> if something went wrong,
91             and possibly a C<resumptionToken> to let you know how to pick up again.
92              
93             There is some basic retry logic with exponential backoff that'll hopefully help
94             seamlessly recover from transient network or service issues.
95              
96             =cut
97              
98             sub harvest {
99 0     0 1   my ( $self, %args ) = @_;
100              
101             my $params = {
102             $args{set} ? ( set => $args{set} ) : (),
103             $args{from} ? ( from => $args{from} ) : (),
104             $args{until} ? ( until => $args{until} ) : (),
105 0 0         $args{resumptionToken} ? ( resumptionToken => $args{resumptionToken} ) : (),
    0          
    0          
    0          
106             };
107              
108 0   0       my $delay = $args{delay} // 10_000;
109 0 0 0       if ( !$args{type} || $args{type} ne 'identifiers' ) {
110 0           croak 'Only type "identifiers" is currently supported, but you still have to say it';
111             }
112 0           my $verb = 'ListIdentifiers';
113 0           my $callback = $args{callback};
114 0 0         croak 'A "callback" parameter is required.' unless $callback;
115              
116 0           my $url = sprintf( 'https://www.rijksmuseum.nl/api/oai/%s', $self->key );
117 0           my $harv = HTTP::OAI::Harvester->new( baseURL => $url );
118             # We'll handle resume ourselves, because I think the default way
119             # wants to load _everything_ all in one go, or something. It's weird
120             # and not useful anyway.
121 0           $harv->resume(0);
122              
123 0           $params->{metadataPrefix} = 'edm_dc';
124 0           my $last_resumption_token = undef;
125 0           my ( $li, $shutdown );
126 0   0       do {
      0        
127 0           $li = $harv->ListIdentifiers(%$params);
128 0           my $retries = 10;
129 0           my $backoff_delay = 1;
130 0           while (ref($li) ne 'HTTP::OAI::Response') {
131             # TODO it'd be nice to put some proper logging in here.
132 0           sleep($backoff_delay);
133 0 0         if (--$retries <= 0) {
134 0           die "Error connecting to API server, all retries used up: " . $li->status_line . "\n";
135             }
136 0           $backoff_delay *= 1.5; # poor man's exponential backoff
137 0           $li = $harv->ListIdentifiers(%$params);
138             }
139              
140 0           while ( my $rec = $li->next ) {
141 0           my $sd = $callback->($rec);
142 0   0       $shutdown ||= $sd;
143             }
144 0 0         if ( $li->is_error ) {
    0          
145             return {
146 0 0         $last_resumption_token ? ( resumptionToken => $last_resumption_token ) : (),
147             error => $li->message,
148             };
149             } elsif ( !$shutdown ) {
150 0           $last_resumption_token = $li->resumptionToken;
151 0           $params = { resumptionToken => $last_resumption_token->resumptionToken };
152 0 0 0       sleep( $delay / 1000.0 ) unless !$delay || !$last_resumption_token;
153             }
154             } while ( !$shutdown && $li->is_success && $last_resumption_token );
155              
156             return {
157 0 0         resumptionToken => $last_resumption_token ? $last_resumption_token->resumptionToken : undef,
158             shutdownRequested => $shutdown,
159             };
160             }
161              
162             =head1 ATTRIBUTES
163              
164             =head2 key
165              
166             The API key provided by the Rijksmuseum.
167              
168             =cut
169              
170             has key => (
171             is => 'rw',
172             required => 1,
173             );
174              
175             =head1 AUTHOR
176              
177             Robin Sheat, C<< <rsheat at cpan.org> >>
178              
179             =head1 TODO
180              
181             =over 4
182              
183             =item Handle the ListRecords verb
184              
185             This'll require writing a parser for EDM-DC or similar.
186              
187             =item Implement logging
188              
189             A proper logging system would allow recording of transient failures to see if
190             they are becoming a problem. It would also allow the option for more
191             fine-grained progress information to be displayed.
192              
193             =back
194              
195             =cut
196              
197             =head1 BUGS
198              
199             Please report any bugs or feature requests to C<bug-museum-rijksmuseum-object at rt.cpan.org>, or through
200             the web interface at L<https://rt.cpan.org/NoAuth/ReportBug.html?Queue=Museum-Rijksmuseum-Object>. I will be notified, and then you'll
201             automatically be notified of progress on your bug as I make changes.
202              
203             Alternately, use the tracker on the repository page at L<https://gitlab.com/eythian/museum-rijksmuseum-object>.
204              
205              
206             =head1 SUPPORT
207              
208             You can find documentation for this module with the perldoc command.
209              
210             perldoc Museum::Rijksmuseum::Object::Harvester
211              
212              
213             You can also look for information at:
214              
215             =over 4
216              
217             =item * Repository page (report bugs here)
218              
219             L<https://gitlab.com/eythian/museum-rijksmuseum-object>
220              
221             =item * RT: CPAN's request tracker (or here)
222              
223             L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=Museum-Rijksmuseum-Object>
224              
225             =item * CPAN Ratings
226              
227             L<https://cpanratings.perl.org/d/Museum-Rijksmuseum-Object>
228              
229             =item * Search CPAN
230              
231             L<https://metacpan.org/release/Museum-Rijksmuseum-Object>
232              
233              
234             =back
235              
236              
237             =head1 ACKNOWLEDGEMENTS
238              
239              
240             =head1 LICENSE AND COPYRIGHT
241              
242             This software is Copyright (c) 2023 by Robin Sheat.
243              
244             This is free software, licensed under:
245              
246             The Artistic License 2.0 (GPL Compatible)
247              
248              
249             =cut
250              
251             1;