File Coverage

blib/lib/Net/OAI/Harvester.pm
Criterion Covered Total %
statement 240 268 89.5
branch 57 96 59.3
condition 11 27 40.7
subroutine 35 38 92.1
pod 11 12 91.6
total 354 441 80.2


line stmt bran cond sub pod time code
1             package Net::OAI::Harvester;
2              
3 19     19   101938 use strict;
  19         1407  
  19         572  
4 18     18   516 use warnings;
  18         607  
  18         692  
5              
6 17     17   102 use constant XMLNS_OAI => "http://www.openarchives.org/OAI/2.0/";
  17         31  
  17         1105  
7              
8 17     17   14058 use URI;
  17         78714  
  17         496  
9 17     17   16891 use LWP::UserAgent;
  17         726970  
  17         775  
10 17     17   13160 use XML::SAX qw( Namespaces Validation );
  17         103981  
  17         1095  
11 17     17   20383 use File::Temp qw( tempfile );
  17         311676  
  17         1251  
12 17     17   129 use Carp qw( carp croak );
  17         37  
  17         851  
13              
14 17     17   10839 use Net::OAI::Error;
  17         65  
  17         1002  
15 17     17   12062 use Net::OAI::ResumptionToken;
  17         51  
  17         440  
16 17     17   9568 use Net::OAI::Identify;
  17         48  
  17         538  
17 17     17   10038 use Net::OAI::ListMetadataFormats;
  17         43  
  17         448  
18 17     17   9977 use Net::OAI::ListIdentifiers;
  17         50  
  17         549  
19 17     17   10130 use Net::OAI::ListRecords;
  17         46  
  17         516  
20 17     17   9618 use Net::OAI::GetRecord;
  17         46  
  17         426  
21 17     17   9255 use Net::OAI::ListSets;
  17         47  
  17         428  
22 17     17   94 use Net::OAI::Record::Header;
  17         28  
  17         368  
23 17     17   9964 use Net::OAI::Record::OAI_DC;
  17         43  
  17         49971  
24              
25             our $VERSION = "1.16_12";
26             our $DEBUG = 0;
27             # compatibility mode for metadataHandler
28             our $OLDmetadataHandler = 0;
29              
30             =head1 NAME
31              
32             Net::OAI::Harvester - A package for harvesting metadata using OAI-PMH
33              
34             =head1 SYNOPSIS
35              
36             ## create a harvester for the Library of Congress
37             my $harvester = Net::OAI::Harvester->new(
38             'baseURL' => 'http://memory.loc.gov/cgi-bin/oai2_0'
39             );
40              
41             ## list all the records in a repository
42             my $records = $harvester->listRecords(
43             'metadataPrefix' => 'oai_dc'
44             );
45             while ( my $record = $records->next() ) {
46             my $header = $record->header();
47             my $metadata = $record->metadata();
48             print "identifier: ", $header->identifier(), "\n";
49             print "title: ", $metadata->title(), "\n";
50             }
51              
52             ## find out the name for a repository
53             my $identity = $harvester->identify();
54             print "name: ",$identity->repositoryName(),"\n";
55              
56             ## get a list of identifiers
57             my $identifiers = $harvester->listIdentifiers(
58             'metadataPrefix' => 'oai_dc'
59             );
60             while ( my $header = $identifiers->next() ) {
61             print "identifier: ",$header->identifier(), "\n";
62             }
63              
64             ## list all the records in a repository
65             my $records = $harvester->listRecords(
66             'metadataPrefix' => 'oai_dc'
67             );
68             while ( my $record = $records->next() ) {
69             my $header = $record->header();
70             my $metadata = $record->metadata();
71             print "identifier: ", $header->identifier(), "\n";
72             print "title: ", $metadata->title(), "\n";
73             }
74              
75             ## GetRecord, ListSets, ListMetadataFormats also supported
76              
77             =head1 DESCRIPTION
78              
79             Net::OAI::Harvester is a Perl extension for easily querying OAI-PMH
80             repositories. OAI-PMH is the Open Archives Initiative Protocol for Metadata
81             Harvesting. OAI-PMH allows data repositories to share metadata about their
82             digital assets. Net::OAI::Harvester is a OAI-PMH client, so it does for
83             OAI-PMH what LWP::UserAgent does for HTTP.
84              
85             You create a Net::OAI::Harvester object which you can then use to
86             retrieve metadata from a selected repository. Net::OAI::Harvester tries to keep
87             things simple by providing an API to get at the data you want; but it also has
88             a framework which is easy to extend should you need to get more fancy.
89              
90             The guiding principle behind OAI-PMH is to allow metadata about online
91             resources to be shared by data providers, so that the metadata can be harvested
92             by interested parties. The protocol is essentially XML over HTTP (much like
93             XMLRPC or SOAP). Net::OAI::Harvester does XML parsing for you
94             (using XML::SAX internally), but you can get at the raw XML if you want to do
95             your own XML processing, and you can drop in your own XML::SAX handler if you
96             would like to do your own parsing of metadata elements.
97              
98             A OAI-PMH repository supports 6 verbs: GetRecord, Identify, ListIdentifiers,
99             ListMetadataFormats, ListRecords, and ListSets. The verbs translate directly
100             into methods that you can call on a Net::OAI::Harvester object. More details
101             about these methods are supplied below, however for the real story please
102             consult the spec at http://www.openarchives.org.
103              
104             Net::OAI::Harvester has a few features that are worth mentioning:
105              
106             =over 4
107              
108             =item 1
109              
110             Since the OAI-PMH results can be arbitrarily large, a stream based (XML::SAX)
111             parser is used. As the document is parsed corresponding Perl objects are
112             created (records, headers, etc), which are then serialized on disk (using
113             Storable if you are curious). The serialized objects on disk can then be
114             iterated over one at a time. The benefit of this is a lower memory footprint
115             when (for example) a ListRecords verb is exercised on a repository that
116             returns 100,000 records.
117              
118              
119             =item 2
120              
121             XML::SAX filters are used which will allow interested developers to write
122             their own metadata parsing packages, and drop them into place. This is useful
123             because OAI-PMH is itself metadata schema agnostic, so you can use OAI-PMH
124             to distribute all kinds of metadata (Dublin Core, MARC, EAD, or your favorite
125             metadata schema). OAI-PMH does require that a repository at least provides
126             Dublin Core metadata as a baseline. Net::OAI::Harvester has built in support for
127             unqualified Dublin Core, and has a framework for dropping in your own parser
128             for other kinds of metadata. If you create a XML::Handler that you would like
129             to contribute back into the Net::OAI::Harvester project please get in touch!
130              
131             =back
132              
133             =head1 METHODS
134              
135             All the Net::OAI::Harvester methods return other objects. As you would expect
136             new() returns an Net::OAI::Harvester object; similarly getRecord() returns an
137             Net::OAI::Record object, listIdentifiers() returns a Net::OAI::ListIdentifiers
138             object, identify() returns an Net::OAI::Identify object, and so on. So when
139             you use one of these methods you'll probably want to check out the docs for
140             the object that gets returned so you can see what to do with it. Many
141             of these classes inherit from Net::OAI::Base which provides some base
142             functionality for retrieving errors, getting the raw XML, and the
143             temporary file where the XML is stored (see Net::OAI::Base documentation for
144             more details).
145              
146             =head2 new()
147              
148             The constructor which returns an Net::OAI::Harvester object. You must supply the
149             baseURL parameter, to tell Net::OAI::Harvester what data repository you are
150             going to be harvesting. For a list of data providers check out the directory
151             available on the Open Archives Initiative homepage.
152              
153             my $harvester = Net::OAI::Harvester->new(
154             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0'
155             );
156              
157             If you want to pull down all the XML files and keep them in a directory, rather
158             than having the stored as transient temp files pass in the dumpDir parameter.
159              
160             my $harvester = Net::OAI::Harvester->new(
161             baseUrl => 'http://memory.loc.gov/cgi-bin/oai2_0',
162             dumpDir => 'american-memory'
163             );
164              
165             Also if you would like to fine tune the HTTP client used by Net::OAI::Harvester
166             you can pass in a configured object. For example this can be handy if you
167             want to adjust the client timeout:
168              
169             my $ua = LWP::UserAgent->new();
170             $ua->timeout(20); ## set timeout to 20 seconds
171             my $harvester = Net::OAI::Harvester->new(
172             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0',
173             userAgent => $ua
174             );
175              
176             =cut
177              
178             sub new {
179 22     22 1 61677 my ( $class, %opts ) = @_;
180              
181             ## uppercase options
182 22         86 my %normalOpts = map { ( uc($_), $opts{$_} ) } keys( %opts );
  24         144  
183            
184             ## we must be told a baseURL
185 22 50       129 croak( "new() needs the baseUrl parameter" ) if !$normalOpts{ BASEURL };
186 22         202 my $baseURL = URI->new( $normalOpts{ BASEURL } );
187              
188 22   33     130752 my $self = bless( { baseURL => $baseURL }, ref( $class ) || $class );
189              
190             ## set the user agent
191 22 100       110 if ( $normalOpts{ USERAGENT } ) {
192 1         6 $self->userAgent( $normalOpts{ USERAGENT } );
193             } else {
194 21         226 my $ua = LWP::UserAgent->new();
195 21         48079 $ua->agent( $class );
196 21         1264 $self->userAgent( $ua );
197             }
198              
199             # set up some stuff if we are dumping xml to a directory
200 22 100       105 if ($normalOpts{ DUMPDIR }) {
201 1         2 my $dir = $normalOpts{ DUMPDIR };
202 1 50       20 croak "no such directory '$dir'" unless -d $dir;
203 1         2 $self->{ dumpDir } = $dir;
204 1         3 $self->{ lastDump } = 0;
205             }
206              
207 22         124 return( $self );
208             }
209              
210             =head2 identify()
211              
212             identify() is the OAI verb that tells a metadata repository to provide a
213             description of itself. A call to identify() returns a Net::OAI::Identify object
214             which you can then call methods on to retrieve the information you are
215             intersted in. For example:
216              
217             my $identity = $harvester->identify();
218             print "repository name: ",$identity->repositoryName(),"\n";
219             print "protocol version: ",$identity->protocolVersion(),"\n";
220             print "earliest date stamp: ",$identity->earliestDatestamp(),"\n";
221             print "admin email(s): ", join( ", ", $identity->adminEmail() ), "\n";
222             ...
223              
224             For more details see the L documentation.
225              
226             =cut
227              
228             sub identify {
229 6     6 1 2861 my $self = shift;
230 6         48 my $uri = $self->{ baseURL }->clone();
231 6         229 $uri->query_form( 'verb' => 'Identify' );
232              
233 6         542 my $identity = Net::OAI::Identify->new( $self->_get( $uri ) );
234 6 100       110 return $identity if $identity->{ error };
235              
236 1         13 my $error = Net::OAI::Error->new( Handler => $identity );
237 1         6 my $parser = _parser( $error );
238 1         18 debug( "parsing Identify response " . $identity->file() );
239 1         3 eval { $parser->parse_uri( $identity->file() ) };
  1         5  
240 1 50       354 if ( $@ ) { _xmlError( $error ); }
  0         0  
241 1         12 $error->set_handler( undef );
242 1         44 $identity->{ error } = $error;
243 1         27 return( $identity );
244             }
245              
246             =head2 listMetadataFormats()
247              
248             listMetadataFormats() asks the repository to return a list of metadata formats
249             that it supports. A call to listMetadataFormats() returns an
250             Net::OAI::ListMetadataFormats object.
251              
252             my $list = $harvester->listMetadataFormats();
253             print "archive supports metadata prefixes: ",
254             join( ',', $list->prefixes() ),"\n";
255              
256             If you are interested in the metadata formats available for
257             a particular resource identifier then you can pass in that identifier.
258            
259             my $list = $harvester->listMetadataFormats( identifier => '1234567' );
260             print "record identifier 1234567 can be retrieved as ",
261             join( ',', $list->prefixes() ),"\n";
262              
263             See documentation for L for more details.
264              
265             =cut
266              
267             sub listMetadataFormats {
268 2     2 1 1295 my ( $self, %opts ) = @_;
269 2         29 my $uri = $self->{ baseURL }->clone();
270              
271             $uri->query_form( verb => 'ListMetadataFormats',
272 2 100       98 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( identifier )
  2         21  
273             );
274              
275 2         189 my $list = Net::OAI::ListMetadataFormats->new( $self->_get( $uri ) );
276 2 50       15 return $list if $list->{ error };
277              
278 2         27 my $error = Net::OAI::Error->new( Handler => $list );
279 2         12 my $parser = _parser( $error );
280 2         26 debug( "parsing ListMetadataFormats response: ".$list->file() );
281 2         6 eval { $parser->parse_uri( $list->file() ) };
  2         8  
282 2 50       708 if ( $@ ) { _xmlError( $error ); }
  0         0  
283 2         17 $error->set_handler( undef );
284 2         61 $list->{ error } = $error;
285 2         50 return( $list );
286             }
287              
288             =head2 getRecord()
289              
290             getRecord() is used to retrieve a single record from a repository. You must pass
291             in the C and an optional C parameters to identify
292             the record, and the flavor of metadata you would like. Net::OAI::Harvester
293             includes a parser for OAI DublinCore, so if you do not specifiy a
294             metadataPrefix 'oai_dc' will be assumed. If you would like to drop in your own
295             XML::Handler for another type of metadata use either the C
296             or the C parameter, either the name of the class as string
297             or an already instantiated object of that class.
298              
299             my $result = $harvester->getRecord(
300             identifier => 'abc123',
301             );
302              
303             ## did something go wrong?
304             if ( my $oops = $result->errorCode() ) { ... };
305              
306             ## get the result as Net::OAI::Record object
307             my $record = $result->record(); # undef if error
308              
309             ## directly get the Net::OAI::Record::Header object
310             my $header = $result->header(); # undef if error
311             ## same as
312             my $header = $result->record()->header(); # undef if error
313              
314             ## get the metadata object
315             my $metadata = $result->metadata(); # undef if error or harvested with recordHandler
316              
317             ## or if you would rather use your own XML::Handler
318             ## pass in the package name for the object you would like to create
319             my $result = $harvester->getRecord(
320             identifier => 'abc123',
321             metadataHandler => 'MyHandler'
322             );
323             my $metadata = $result->metadata();
324            
325             my $result = $harvester->getRecord(
326             identifier => 'abc123',
327             recordHandler => 'MyCompleteHandler'
328             );
329             my $complete_record = $result->recorddata(); # undef if error or harvested with metadataHandler
330            
331             =cut
332              
333             sub getRecord {
334 9     9 1 19876 my ( $self, %opts ) = @_;
335              
336             croak( "getRecord(): the 'identifier' parameter is required" )
337 9 50       44 unless defined $opts{ 'identifier' };
338             croak( "getRecord(): the 'metadataPrefix' parameter is required" )
339 9 50       33 unless exists $opts{ 'metadataPrefix' };
340             croak( "getRecord(): recordHandler and metadataHandler are mutually exclusive" )
341 9 50 66     39 if $opts{ recordHandler } and $opts{ metadataHandler };
342              
343 9         105 my $uri = $self->{ baseURL }->clone();
344              
345             $uri->query_form( verb => 'GetRecord',
346 9 50       328 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( identifier metadataPrefix )
  18         118  
347             );
348              
349             my $record = Net::OAI::GetRecord->new( $self->_get( $uri ),
350             recordHandler => $opts{ recordHandler },
351             metadataHandler => $opts{ metadataHandler },
352 9         1336 );
353 9 50       65 return $record if $record->{ error };
354              
355 9         111 my $error = Net::OAI::Error->new( Handler => $record );
356 9         47 my $parser = _parser( $error );
357 9         111 debug( "parsing GetRecord response " . $record->file() );
358 9         25 eval { $parser->parse_uri( $record->file() ) };
  9         41  
359 9 50       2540 if ( $@ ) { _xmlError( $error ); }
  0         0  
360              
361 9         49 $error->set_handler( undef );
362 9         241 $record->{ error } = $error;
363 9         205 return( $record );
364              
365             }
366              
367              
368             =head2 listRecords()
369              
370             listRecords() allows you to retrieve all the records in a data repository.
371             You must supply the C parameter to tell your Net::OAI::Harvester
372             which type of records you are interested in. listRecords() returns an
373             Net::OAI::ListRecords object. There are four other optional parameters C,
374             C, C, and C which are better described in the
375             OAI-PMH spec.
376              
377             my $records = $harvester->listRecords(
378             metadataPrefix => 'oai_dc'
379             );
380              
381             ## iterate through the results with next()
382             while ( my $record = $records->next() ) {
383             my $metadata = $record->metadata();
384             ...
385             }
386              
387             If you would like to use your own metadata handler then you can specify
388             the package name of the handler as the C (will be exposed
389             to events below the C element) or C (will be
390             exposed to the C element and all its children) parameter, passing
391             either
392              
393             =over 4
394              
395             =item the name of the class as string, in that case a new instance
396             will be created for any OAI record encountered or
397              
398             =item an already instantiated object of that class which will
399             be reused for all records.
400              
401             =back
402              
403             my $records = $harvester->listRecords(
404             metadataPrefix => 'mods',
405             metadataHandler => 'MODS::Handler'
406             );
407              
408             while ( my $record = $records->next() ) {
409             my $metadata = $record->metadata();
410             # $metadata will be a MODS::Handler object
411             }
412              
413             If you want to automatically handle resumption tokens you can achieve
414             this with the listAllRecords() method. In this case the C
415             method transparently causes the next response to be fetched from
416             the repository if the current response ran out of records and
417             contained a resumptionToken.
418              
419             If you prefer you can handle resumption tokens yourself with a
420             loop, and the resumptionToken() method. You might want to do this
421             if you are working with a repository that wants you to wait between
422             requests or if connectivity problems become an issue during particulary
423             long harvesting runs and you want to implement a retransmission
424             strategy for failing requests.
425              
426             my $records = $harvester->listRecords( metadataPrefix => 'oai_dc' );
427             my $responseDate = $records->responseDate();
428             my $finished = 0;
429              
430             while ( ! $finished ) {
431              
432             while ( my $record = $records->next() ) { # a Net::OAI::Record object
433             my $metadata = $record->metadata();
434             # do interesting stuff here
435             }
436              
437             my $rToken = $records->resumptionToken();
438             if ( $rToken ) {
439             $records = $harvester->listRecords(
440             resumptionToken => $rToken->token()
441             );
442             } else {
443             $finished = 1;
444             }
445              
446             }
447              
448             Please note: Since C stashes away the individual
449             records it encounters with C, special care has to
450             be taken if the handlers you provided make use of XS modules
451             since these objects cannot be reliably handled. Therefore you will
452             have to provide the special serializing and deserializing methods
453             C and C for the objects
454             used by your filter(s).
455              
456              
457             =cut
458              
459             sub listRecords {
460 9     9 1 6073 my ( $self, %opts ) = @_;
461              
462             croak( "listRecords(): the 'metadataPrefix' parameter is required" )
463             unless ( exists $opts{ 'metadataPrefix' }
464 9 50 66     70 or defined $opts{ 'resumptionToken' } );
465             croak( "listRecords(): recordHandler and metadataHandler are mutually exclusive" )
466 9 50 66     50 if $opts{ recordHandler } and $opts{ metadataHandler };
467              
468 9         103 my $uri = $self->{ baseURL }->clone();
469              
470             $uri->query_form( verb => 'ListRecords',
471 9 100       340 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( metadataPrefix from until set resumptionToken )
  45         194  
472             );
473              
474             my $list = Net::OAI::ListRecords->new( $self->_get( $uri ),
475             metadataHandler => $opts{ metadataHandler },
476             recordHandler => $opts{ recordHandler },
477 9         940 );
478 9 50       81 return $list if $list->{ error };
479              
480 9         128 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
481 9         114 my $error = Net::OAI::Error->new( Handler => $token );
482 9         55 my $parser = _parser( $error );
483 9         122 debug( "parsing ListRecords response " . $list->file() );
484 9         22 eval { $parser->parse_uri( $list->file() ) };
  9         38  
485 9 50       2433 if ( $@ ) { _xmlError( $error ); }
  0         0  
486              
487 9         65 $token->set_handler( undef );
488 9 100       258 $list->{ token } = $token->token() ? $token : undef;
489              
490 9         52 $error->set_handler( undef );
491 9         191 $list->{ error } = $error;
492              
493 9         220 return( $list );
494             }
495              
496             =head2 listAllRecords()
497              
498             Does exactly what listRecords() does except the C method
499             will automatically submit resumption tokens as needed.
500              
501             my $records = $harvester->listAllRecords( metadataPrefix => 'oai_dc' );
502              
503             while ( my $record = $records->next() ) { # a Net::OAI::Record object until undef
504             my $metadata = $record->metadata();
505             # do interesting stuff here
506             }
507              
508              
509             =cut
510              
511             sub listAllRecords {
512 1     1 1 504 my $self = shift;
513 1         5 debug( "calling listRecords() as part of listAllRecords request" );
514 1         4 my $list = listRecords( $self, @_ );
515 1         4 $list->{ harvester } = $self;
516 1         4 return( $list );
517             }
518              
519             =head2 listIdentifiers()
520              
521             listIdentifiers() takes the same parameters that listRecords() takes, but it
522             returns only the record headers, allowing you to quickly retrieve all the
523             record identifiers for a particular repository. The object returned is a
524             L object.
525              
526             my $headers = $harvester->listIdentifiers(
527             metadataPrefix => 'oai_dc'
528             );
529              
530             ## iterate through the results with next()
531             while ( my $header = $identifiers->next() ) { # a Net::OAI::Record::Header object
532             print "identifier: ", $header->identifier(), "\n";
533             }
534              
535             If you want to automatically handle resumption tokens use listAllIdentifiers().
536             If you are working with a repository that encourages pauses between requests
537             you can handle the tokens yourself using the technique described above
538             in listRecords().
539              
540             =cut
541              
542             sub listIdentifiers {
543 6     6 1 12118 my ( $self, %opts ) = @_;
544             croak( "listIdentifiers(): the 'metadataPrefix' parameter is required" )
545             unless ( exists $opts{ 'metadataPrefix' }
546 6 50 66     40 or defined $opts{ 'resumptionToken' } );
547 6         65 my $uri = $self->{ baseURL }->clone();
548              
549             $uri->query_form( verb => 'ListIdentifiers',
550 6 100       327 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( metadataPrefix from until set resumptionToken )
  30         141  
551             );
552              
553 6         654 my $list = Net::OAI::ListIdentifiers->new( $self->_get( $uri ) );
554 6 50       50 return( $list ) if $list->{ error };
555              
556 6         88 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
557 6         135 my $error = Net::OAI::Error->new( Handler => $token );
558 6         44 my $parser = _parser( $error );
559 6         82 debug( "parsing ListIdentifiers response " . $list->file() );
560 6         16 eval { $parser->parse_uri( $list->file() ) };
  6         31  
561 6 50       2127 if ( $@ ) { _xmlError( $error ); }
  0         0  
562              
563 6         89 $token->set_handler( undef );
564 6 100       175 $list->{ token } = $token->token() ? $token : undef;
565              
566 6         33 $error->set_handler( undef );
567 6         122 $list->{ error } = $error;
568              
569 6         156 return( $list );
570             }
571              
572             =head2 listAllIdentifiers()
573              
574             Does exactly what listIdentifiers() does except C will automatically
575             submit resumption tokens as needed.
576              
577             =cut
578              
579             sub listAllIdentifiers {
580 1     1 1 429 my $self = shift;
581 1         5 debug( "calling listIdentifiers() as part of listAllIdentifiers() call" );
582 1         4 my $list = listIdentifiers( $self, @_ );
583 1         3 $list->{ harvester } = $self;
584 1         5 return( $list );
585             }
586              
587              
588             =head2 listSets()
589              
590             listSets() takes an optional C parameter, and returns a
591             Net::OAI::ListSets object. listSets() allows you to harvest a subset of a
592             particular repository with listRecords(). For more information see the OAI-PMH
593             spec and the Net::OAI::ListSets docs.
594              
595             my $sets = $harvester->listSets();
596             foreach ( $sets->setSpecs() ) {
597             print "set spec: $_ ; set name: ", $sets->setName( $_ ), "\n";
598             }
599              
600             =cut
601              
602             sub listSets {
603 1     1 1 406 my ( $self, %opts ) = @_;
604              
605 1         20 my $uri = $self->{ baseURL }->clone();
606              
607             $uri->query_form( verb => 'ListSets',
608 1 50       87 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( resumptionToken )
  1         10  
609             );
610              
611 1         93 my $list = Net::OAI::ListSets->new( $self->_get( $uri ) );
612 1 50       10 return( $list ) if $list->{ error };
613              
614 1         14 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
615 1         14 my $error = Net::OAI::Error->new( Handler => $token );
616 1         6 my $parser = _parser( $error );
617 1         19 debug( "parsing ListSets response " . $list->file() );
618 1         3 eval { $parser->parse_uri( $list->file() ) };
  1         5  
619 1 50       355 if ( $@ ) { _xmlError( $error ); }
  0         0  
620              
621 1         10 $token->set_handler( undef );
622 1 50       37 $list->{ token } = $token->token() ? $token : undef;
623 1         6 $error->set_handler( undef );
624 1         21 $list->{ error } = $error;
625 1         34 return( $list );
626             }
627              
628             =head2 baseURL()
629              
630             Gets or sets the base URL for the repository being harvested (as L).
631              
632             $harvester->baseURL( 'http://memory.loc.gov/cgi-bin/oai2_0' );
633              
634             Or if you want to know what the current baseURL is
635              
636             $baseURL = $harvester->baseURL();
637              
638             =cut
639              
640             sub baseURL {
641 0     0 1 0 my ( $self, $url ) = @_;
642 0 0       0 if ( $url ) { $self->{ baseURL } = URI->new( $url ); }
  0         0  
643             ## The HTTP UserAgent modifies its URI object upon execution,
644             ## therefore we'll always provide a clone have to reconstruct: trim the query part ...
645             # my $c = $self->{ baseURL }; # ->canonical();
646             # if ( $c && ($c =~ /^([^\?]*)\?/) ) { # $c might be undefined
647             # return $1};
648             # return $c;
649 0         0 return $self->{ baseURL };
650             }
651              
652             =head2 userAgent()
653              
654             Gets or sets the LWP::UserAgent object being used to perform the HTTP
655             transactions. This method could be useful if you wanted to change the
656             agent string, timeout, or some other feature.
657              
658             =cut
659              
660             sub userAgent {
661 23     23 1 740 my ( $self, $ua ) = @_;
662 23 100       144 if ( $ua ) {
663 22 50       235 $ua->isa('LWP::UserAgent') or croak( "userAgent() needs a valid LWP::UserAgent" );
664 22         189 $self->{ userAgent } = $ua;
665             }
666 23         68 return( $self->{ userAgent } );
667             }
668              
669             ## internal stuff
670              
671             sub _get {
672 33     33   81 my ($self,$uri) = @_;
673 33         95 my $ua = $self->{ userAgent };
674              
675 33         58 my ($fh, $file);
676 33 100       138 if ( $self->{ dumpDir } ) {
677 2         6 my $filePrefix = $self->{lastDump}++;
678 2         14 $file = sprintf("%s/%08d.xml", $self->{dumpDir}, $filePrefix);
679 2         23 $fh = IO::File->new($file, 'w');
680             } else {
681 31         218 ( $fh, $file ) = tempfile(UNLINK => 1);
682             }
683              
684 33         129129 debug( "fetching ".$uri->as_string() );
685 33         132 debug( "writing to file: $file" );
686 33         133 my $request = HTTP::Request->new( GET => $uri->as_string() );
687 33     1161   5741 my $response = $ua->request( $request, sub { print $fh shift; }, 8192 );
  1161         86454294  
688 33         128536100 close( $fh );
689              
690 33 100       281 if ( $response->is_error() ) {
691             # HTTP::Request does not provide a file in case of HTTP level errors,
692             # therefore we do not return the name of the non-existant file but
693             # rather the original HTTP::Response object
694 5         95 debug( "caught HTTP level error" . $response->message() );
695 5   50     27 my $error = Net::OAI::Error->new(
696             errorString => 'HTTP Level Error: ' . $response->message(),
697             errorCode => $response->code(),
698             HTTPError => $response,
699             HTTPRetryAfter => $response->header("Retry-After") || "",
700             );
701             return(
702             # file => $file,
703 5         73 error => $error
704             );
705             }
706 28 50       759 if ( my $ct = $response->header("Content-Type") ) {
707 28         1966 debug( "Content-type $ct in HTTP response" );
708 28 50       276 unless ( $ct =~ /^text\/xml(;|$)/ ) {
709 0   0     0 return (error => Net::OAI::Error->new(errorCode => 'xmlContentError',
710             errorString => "Content-Type: text/xml is mandatory (got: $ct)!"),
711             HTTPError => $response,
712             HTTPRetryAfter => $response->header("Retry-After") || "",
713             )
714             };
715 28 50       288 if ( $ct =~ /; charset=(\S+)/ ) {
716 28         134 my $cs = $1;
717 28 50 0     240 return (error => Net::OAI::Error->new(errorCode => 'xmlContentError',
718             errorString => "charset=UTF-8 is mandatory (got: $cs)!"),
719             HTTPError => $response,
720             HTTPRetryAfter => $response->header("Retry-After") || "",
721             ) unless $cs =~ /^utf-8/i;
722             };
723             }
724              
725             return(
726 28         1233 file => $file,
727             );
728              
729             }
730              
731             sub _parser {
732 30     30   1481 my $handler = shift;
733 30         404 my $factory = XML::SAX::ParserFactory->new();
734 30         7136 my $parser;
735 30         214 $factory->require_feature(Namespaces);
736 30         360 eval { $parser = $factory->parser( Handler => $handler ) };
  30         184  
737 30 50       646020 carp ref($factory)." threw an exception:\n\t$@" if $@;
738              
739 30 50 33     319 if ( $parser && ref($parser) ) {
740 30         630 debug( "using SAX parser " . ref($parser) . " " . $parser->VERSION );
741 30         350 return $parser;
742             };
743              
744 0         0 carp "!!! Please check your setup of XML::SAX, especially ParserDetails.ini !!!\n";
745 0         0 local($XML::SAX::ParserPackage) = "XML::SAX::PurePerl";
746 0         0 eval { $parser = $factory->parser( Handler => $handler ) };
  0         0  
747 0 0       0 carp ref($factory)." threw an exception again:\n\t$@" if $@;
748 0 0 0     0 if ( $parser && ref($parser) ) {
749 0         0 carp "Successfuly forced assignment of a parser: " . ref($parser) . " " . $parser->VERSION ."\n";
750 0         0 return $parser;
751             };
752              
753 0         0 croak( ref($factory)." on request did not even give us the default XML::SAX::PurePerl parser.\nGiving up." );
754             }
755              
756             sub _xmlError {
757 0     0   0 my $e = shift;
758 0         0 carp "caught xml parsing error: $@";
759 0         0 $e->errorString( "XML parsing error: $@" );
760 0         0 $e->errorCode( 'xmlParseError' );
761             }
762              
763              
764             sub _verifyHandler {
765 219     219   454 my $package_or_instance = shift;
766 219 100       845 if ( ref($package_or_instance) ) {
767 206 50       1274 $package_or_instance->isa('XML::SAX::Base')
768             or _fatal( "Handler $package_or_instance must inherit from XML::SAX::Base\n" )
769             }
770             else {
771 13     6   2889 eval( "use $package_or_instance" );
  6     4   505  
  6         334  
  6         181  
  4         1026  
  4         1263  
  4         120  
772 13 50       92 _fatal( "unable to locate Handler $package_or_instance in: " .
773             join( "\n\t", @INC ) ) if $@;
774             _fatal( "Handler $package_or_instance must inherit from XML::SAX::Base\n" )
775 13 50       931 if ( ! grep { 'XML::SAX::Base' } eval( '@' . $package_or_instance . '::ISA' ) );
  13         109  
776             }
777 219         589 return( 1 );
778             }
779              
780              
781             sub debug {
782 6642 50   6642 0 24077 return unless $Net::OAI::Harvester::DEBUG;
783 0         0 my $msg = shift;
784 0         0 carp "oai-harvester: " . localtime() . ": $msg\n";
785             }
786              
787             sub _fatal {
788 0     0   0 my $msg = shift;
789 0         0 croak "fatal: $msg";
790             }
791              
792             =head1 DIAGNOSTICS
793              
794             If you would like to see diagnostic information when harvesting is running
795             then set Net::OAI::Harvester::DEBUG to a true value.
796              
797             $Net::OAI::Harvester::DEBUG = 1;
798              
799              
800              
801              
802             =head1 PERFORMANCE
803              
804             XML::SAX is used for parsing, but it presents a generalized interface to many
805             parsers. It comes with XML::Parser::PurePerl by default, which is nice since
806             you don't have to worry about getting the right libraries installed. However
807             XML::Parser::PurePerl is rather slow compared to XML::LibXML. If you
808             are a speed freak install XML::LibXML from CPAN today.
809              
810             If you have a particular parser you want to use you can set the
811             $XML::SAX::ParserPackage variable appropriately. See XML::SAX::ParserFactory
812             documentation for details.
813              
814              
815              
816             =head1 ENVIRONMENT
817              
818             The modules use LWP for HTTP operations, thus C controls
819             wether the "_proxy" environment settings shall be honored.
820              
821              
822             =head1 TODO
823              
824             =over 4
825              
826             =item *
827              
828             Allow Net::OAI::ListMetadataFormats to store more than just the metadata
829             prefixes.
830              
831             =item *
832              
833             Implement Net::OAI::Set for iterator access to Net::OAI::ListSets.
834              
835             =item *
836              
837             Implement Net::OAI::Harvester::listAllSets().
838              
839             =item *
840              
841             More documentation of other classes.
842              
843             =item *
844              
845             Document custom XML::Handler creation.
846              
847             =item *
848              
849             Handle optional compression.
850              
851             =item *
852              
853             Create common handlers for other metadata formats (MARC, qualified DC, etc).
854              
855             =item *
856              
857             Or at least provide a generic record handler as fallback, since using
858             L makes absolutely no sense except for ... oai_dc records.
859              
860             =item *
861              
862             Selectively load Net::OAI::* classes as needed, rather than getting all of them
863             at once at the beginning of Net::OAI::Harvester.
864              
865             =back
866              
867             =head1 SEE ALSO
868              
869             =over 4
870              
871             =item *
872              
873             OAI-PMH Specification at L
874              
875             =item *
876              
877             L
878              
879             =item *
880              
881             L
882              
883             =item *
884              
885             L
886              
887             =item *
888              
889             L
890              
891             =item *
892              
893             L
894              
895             =item *
896              
897             L
898              
899             =item *
900              
901             L
902              
903             =item *
904              
905             L
906              
907             =item *
908              
909             L
910              
911             =item *
912              
913             L
914              
915             =item *
916              
917             L
918              
919             =item *
920              
921             L
922              
923             =item *
924              
925             L
926              
927             =item *
928              
929             L
930              
931             =item *
932              
933             L
934              
935             =back
936              
937              
938             =head1 AUTHORS
939              
940             Ed Summers
941              
942             Martin Emmerich
943              
944             Thomas Berger
945              
946             =head1 LICENSE
947              
948             This is free software, you may use it and distribute it under the same terms as Perl itself.
949              
950             =cut
951              
952             1;
953