File Coverage

blib/lib/Net/OAI/Harvester.pm
Criterion Covered Total %
statement 240 268 89.5
branch 57 96 59.3
condition 11 27 40.7
subroutine 35 38 92.1
pod 11 12 91.6
total 354 441 80.2


line stmt bran cond sub pod time code
1             package Net::OAI::Harvester;
2              
3 18     18   62434 use strict;
  18         1145  
  18         486  
4 17     17   457 use warnings;
  17         538  
  17         568  
5              
6 16     16   86 use constant XMLNS_OAI => "http://www.openarchives.org/OAI/2.0/";
  16         24  
  16         875  
7              
8 16     16   12598 use URI;
  16         70924  
  16         437  
9 16     16   15111 use LWP::UserAgent;
  16         618082  
  16         642  
10 16     16   11488 use XML::SAX qw( Namespaces Validation );
  16         92798  
  16         948  
11 16     16   18374 use File::Temp qw( tempfile );
  16         281276  
  16         1109  
12 16     16   112 use Carp qw( carp croak );
  16         29  
  16         756  
13              
14 16     16   10134 use Net::OAI::Error;
  16         56  
  16         818  
15 16     16   10473 use Net::OAI::ResumptionToken;
  16         47  
  16         399  
16 16     16   8906 use Net::OAI::Identify;
  16         43  
  16         467  
17 16     16   8951 use Net::OAI::ListMetadataFormats;
  16         41  
  16         392  
18 16     16   8590 use Net::OAI::ListIdentifiers;
  16         44  
  16         475  
19 16     16   9021 use Net::OAI::ListRecords;
  16         41  
  16         452  
20 16     16   8803 use Net::OAI::GetRecord;
  16         40  
  16         390  
21 16     16   8602 use Net::OAI::ListSets;
  16         102  
  16         480  
22 16     16   95 use Net::OAI::Record::Header;
  16         29  
  16         340  
23 16     16   9105 use Net::OAI::Record::OAI_DC;
  16         42  
  16         46291  
24              
25             our $VERSION = '1.16_10';
26             our $DEBUG = 0;
27              
28             =head1 NAME
29              
30             Net::OAI::Harvester - A package for harvesting metadata using OAI-PMH
31              
32             =head1 SYNOPSIS
33              
34             ## create a harvester for the Library of Congress
35             my $harvester = Net::OAI::Harvester->new(
36             'baseURL' => 'http://memory.loc.gov/cgi-bin/oai2_0'
37             );
38              
39             ## list all the records in a repository
40             my $records = $harvester->listRecords(
41             'metadataPrefix' => 'oai_dc'
42             );
43             while ( my $record = $records->next() ) {
44             my $header = $record->header();
45             my $metadata = $record->metadata();
46             print "identifier: ", $header->identifier(), "\n";
47             print "title: ", $metadata->title(), "\n";
48             }
49              
50             ## find out the name for a repository
51             my $identity = $harvester->identify();
52             print "name: ",$identity->repositoryName(),"\n";
53              
54             ## get a list of identifiers
55             my $identifiers = $harvester->listIdentifiers(
56             'metadataPrefix' => 'oai_dc'
57             );
58             while ( my $header = $identifiers->next() ) {
59             print "identifier: ",$header->identifier(), "\n";
60             }
61              
62             ## list all the records in a repository
63             my $records = $harvester->listRecords(
64             'metadataPrefix' => 'oai_dc'
65             );
66             while ( my $record = $records->next() ) {
67             my $header = $record->header();
68             my $metadata = $record->metadata();
69             print "identifier: ", $header->identifier(), "\n";
70             print "title: ", $metadata->title(), "\n";
71             }
72              
73             ## GetRecord, ListSets, ListMetadataFormats also supported
74              
75             =head1 DESCRIPTION
76              
77             Net::OAI::Harvester is a Perl extension for easily querying OAI-PMH
78             repositories. OAI-PMH is the Open Archives Initiative Protocol for Metadata
79             Harvesting. OAI-PMH allows data repositories to share metadata about their
80             digital assets. Net::OAI::Harvester is a OAI-PMH client, so it does for
81             OAI-PMH what LWP::UserAgent does for HTTP.
82              
83             You create a Net::OAI::Harvester object which you can then use to
84             retrieve metadata from a selected repository. Net::OAI::Harvester tries to keep
85             things simple by providing an API to get at the data you want; but it also has
86             a framework which is easy to extend should you need to get more fancy.
87              
88             The guiding principle behind OAI-PMH is to allow metadata about online
89             resources to be shared by data providers, so that the metadata can be harvested
90             by interested parties. The protocol is essentially XML over HTTP (much like
91             XMLRPC or SOAP). Net::OAI::Harvester does XML parsing for you
92             (using XML::SAX internally), but you can get at the raw XML if you want to do
93             your own XML processing, and you can drop in your own XML::SAX handler if you
94             would like to do your own parsing of metadata elements.
95              
96             A OAI-PMH repository supports 6 verbs: GetRecord, Identify, ListIdentifiers,
97             ListMetadataFormats, ListRecords, and ListSets. The verbs translate directly
98             into methods that you can call on a Net::OAI::Harvester object. More details
99             about these methods are supplied below, however for the real story please
100             consult the spec at http://www.openarchives.org.
101              
102             Net::OAI::Harvester has a few features that are worth mentioning:
103              
104             =over 4
105              
106             =item 1
107              
108             Since the OAI-PMH results can be arbitrarily large, a stream based (XML::SAX)
109             parser is used. As the document is parsed corresponding Perl objects are
110             created (records, headers, etc), which are then serialized on disk (using
111             Storable if you are curious). The serialized objects on disk can then be
112             iterated over one at a time. The benefit of this is a lower memory footprint
113             when (for example) a ListRecords verb is exercised on a repository that
114             returns 100,000 records.
115              
116              
117             =item 2
118              
119             XML::SAX filters are used which will allow interested developers to write
120             their own metadata parsing packages, and drop them into place. This is useful
121             because OAI-PMH is itself metadata schema agnostic, so you can use OAI-PMH
122             to distribute all kinds of metadata (Dublin Core, MARC, EAD, or your favorite
123             metadata schema). OAI-PMH does require that a repository at least provides
124             Dublin Core metadata as a baseline. Net::OAI::Harvester has built in support for
125             unqualified Dublin Core, and has a framework for dropping in your own parser
126             for other kinds of metadata. If you create a XML::Handler that you would like
127             to contribute back into the Net::OAI::Harvester project please get in touch!
128              
129             =back
130              
131             =head1 METHODS
132              
133             All the Net::OAI::Harvester methods return other objects. As you would expect
134             new() returns an Net::OAI::Harvester object; similarly getRecord() returns an
135             Net::OAI::Record object, listIdentifiers() returns a Net::OAI::ListIdentifiers
136             object, identify() returns an Net::OAI::Identify object, and so on. So when
137             you use one of these methods you'll probably want to check out the docs for
138             the object that gets returned so you can see what to do with it. Many
139             of these classes inherit from Net::OAI::Base which provides some base
140             functionality for retrieving errors, getting the raw XML, and the
141             temporary file where the XML is stored (see Net::OAI::Base documentation for
142             more details).
143              
144             =head2 new()
145              
146             The constructor which returns an Net::OAI::Harvester object. You must supply the
147             baseURL parameter, to tell Net::OAI::Harvester what data repository you are
148             going to be harvesting. For a list of data providers check out the directory
149             available on the Open Archives Initiative homepage.
150              
151             my $harvester = Net::OAI::Harvester->new(
152             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0'
153             );
154              
155             If you want to pull down all the XML files and keep them in a directory, rather
156             than having the stored as transient temp files pass in the dumpDir parameter.
157              
158             my $harvester = Net::OAI::Harvester->new(
159             baseUrl => 'http://memory.loc.gov/cgi-bin/oai2_0',
160             dumpDir => 'american-memory'
161             );
162              
163             Also if you would like to fine tune the HTTP client used by Net::OAI::Harvester
164             you can pass in a configured object. For example this can be handy if you
165             want to adjust the client timeout:
166              
167             my $ua = LWP::UserAgent->new();
168             $ua->timeout(20); ## set timeout to 20 seconds
169             my $harvester = Net::OAI::Harvester->new(
170             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0',
171             userAgent => $ua
172             );
173              
174             =cut
175              
176             sub new {
177 22     22 1 37745 my ( $class, %opts ) = @_;
178              
179             ## uppercase options
180 22         75 my %normalOpts = map { ( uc($_), $opts{$_} ) } keys( %opts );
  24         125  
181            
182             ## we must be told a baseURL
183 22 50       164 croak( "new() needs the baseUrl parameter" ) if !$normalOpts{ BASEURL };
184 22         187 my $baseURL = URI->new( $normalOpts{ BASEURL } );
185              
186 22   33     119371 my $self = bless( { baseURL => $baseURL }, ref( $class ) || $class );
187              
188             ## set the user agent
189 22 100       101 if ( $normalOpts{ USERAGENT } ) {
190 1         5 $self->userAgent( $normalOpts{ USERAGENT } );
191             } else {
192 21         203 my $ua = LWP::UserAgent->new();
193 21         44379 $ua->agent( $class );
194 21         1237 $self->userAgent( $ua );
195             }
196              
197             # set up some stuff if we are dumping xml to a directory
198 22 100       81 if ($normalOpts{ DUMPDIR }) {
199 1         2 my $dir = $normalOpts{ DUMPDIR };
200 1 50       18 croak "no such directory '$dir'" unless -d $dir;
201 1         2 $self->{ dumpDir } = $dir;
202 1         3 $self->{ lastDump } = 0;
203             }
204              
205 22         101 return( $self );
206             }
207              
208             =head2 identify()
209              
210             identify() is the OAI verb that tells a metadata repository to provide a
211             description of itself. A call to identify() returns a Net::OAI::Identify object
212             which you can then call methods on to retrieve the information you are
213             intersted in. For example:
214              
215             my $identity = $harvester->identify();
216             print "repository name: ",$identity->repositoryName(),"\n";
217             print "protocol version: ",$identity->protocolVersion(),"\n";
218             print "earliest date stamp: ",$identity->earliestDatestamp(),"\n";
219             print "admin email(s): ", join( ", ", $identity->adminEmail() ), "\n";
220             ...
221              
222             For more details see the L documentation.
223              
224             =cut
225              
226             sub identify {
227 6     6 1 2440 my $self = shift;
228 6         47 my $uri = $self->{ baseURL }->clone();
229 6         226 $uri->query_form( 'verb' => 'Identify' );
230              
231 6         503 my $identity = Net::OAI::Identify->new( $self->_get( $uri ) );
232 6 100       53 return $identity if $identity->{ error };
233              
234 1         10 my $error = Net::OAI::Error->new( Handler => $identity );
235 1         5 my $parser = _parser( $error );
236 1         15 debug( "parsing Identify response " . $identity->file() );
237 1         2 eval { $parser->parse_uri( $identity->file() ) };
  1         4  
238 1 50       326 if ( $@ ) { _xmlError( $error ); }
  0         0  
239 1         9 $error->set_handler( undef );
240 1         27 $identity->{ error } = $error;
241 1         27 return( $identity );
242             }
243              
244             =head2 listMetadataFormats()
245              
246             listMetadataFormats() asks the repository to return a list of metadata formats
247             that it supports. A call to listMetadataFormats() returns an
248             Net::OAI::ListMetadataFormats object.
249              
250             my $list = $harvester->listMetadataFormats();
251             print "archive supports metadata prefixes: ",
252             join( ',', $list->prefixes() ),"\n";
253              
254             If you are interested in the metadata formats available for
255             a particular resource identifier then you can pass in that identifier.
256            
257             my $list = $harvester->listMetadataFormats( identifier => '1234567' );
258             print "record identifier 1234567 can be retrieved as ",
259             join( ',', $list->prefixes() ),"\n";
260              
261             See documentation for L for more details.
262              
263             =cut
264              
265             sub listMetadataFormats {
266 2     2 1 1149 my ( $self, %opts ) = @_;
267 2         28 my $uri = $self->{ baseURL }->clone();
268              
269             $uri->query_form( verb => 'ListMetadataFormats',
270 2 100       94 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( identifier )
  2         20  
271             );
272              
273 2         179 my $list = Net::OAI::ListMetadataFormats->new( $self->_get( $uri ) );
274 2 50       11 return $list if $list->{ error };
275              
276 2         26 my $error = Net::OAI::Error->new( Handler => $list );
277 2         10 my $parser = _parser( $error );
278 2         23 debug( "parsing ListMetadataFormats response: ".$list->file() );
279 2         5 eval { $parser->parse_uri( $list->file() ) };
  2         10  
280 2 50       692 if ( $@ ) { _xmlError( $error ); }
  0         0  
281 2         13 $error->set_handler( undef );
282 2         55 $list->{ error } = $error;
283 2         43 return( $list );
284             }
285              
286             =head2 getRecord()
287              
288             getRecord() is used to retrieve a single record from a repository. You must pass
289             in the C and an optional C parameters to identify
290             the record, and the flavor of metadata you would like. Net::OAI::Harvester
291             includes a parser for OAI DublinCore, so if you do not specifiy a
292             metadataPrefix 'oai_dc' will be assumed. If you would like to drop in your own
293             XML::Handler for another type of metadata use either the C
294             or the C parameter, either the name of the class as string
295             or an already instantiated object of that class.
296              
297             my $result = $harvester->getRecord(
298             identifier => 'abc123',
299             );
300              
301             ## did something go wrong?
302             if ( my $oops = $result->errorCode() ) { ... };
303              
304             ## get the result as Net::OAI::Record object
305             my $record = $result->record(); # undef if error
306              
307             ## directly get the Net::OAI::Record::Header object
308             my $header = $result->header(); # undef if error
309             ## same as
310             my $header = $result->record()->header(); # undef if error
311              
312             ## get the metadata object
313             my $metadata = $result->metadata(); # undef if error or harvested with recordHandler
314              
315             ## or if you would rather use your own XML::Handler
316             ## pass in the package name for the object you would like to create
317             my $result = $harvester->getRecord(
318             identifier => 'abc123',
319             metadataHandler => 'MyHandler'
320             );
321             my $metadata = $result->metadata();
322            
323             my $result = $harvester->getRecord(
324             identifier => 'abc123',
325             recordHandler => 'MyCompleteHandler'
326             );
327             my $complete_record = $result->recorddata(); # undef if error or harvested with metadataHandler
328            
329             =cut
330              
331             sub getRecord {
332 8     8 1 18984 my ( $self, %opts ) = @_;
333              
334             croak( "getRecord(): the 'identifier' parameter is required" )
335 8 50       40 unless defined $opts{ 'identifier' };
336             croak( "getRecord(): the 'metadataPrefix' parameter is required" )
337 8 50       30 unless exists $opts{ 'metadataPrefix' };
338             croak( "getRecord(): recordHandler and metadataHandler are mutually exclusive" )
339 8 50 66     34 if $opts{ recordHandler } and $opts{ metadataHandler };
340              
341 8         81 my $uri = $self->{ baseURL }->clone();
342              
343             $uri->query_form( verb => 'GetRecord',
344 8 50       291 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( identifier metadataPrefix )
  16         104  
345             );
346              
347             my $record = Net::OAI::GetRecord->new( $self->_get( $uri ),
348             recordHandler => $opts{ recordHandler },
349             metadataHandler => $opts{ metadataHandler },
350 8         1107 );
351 8 50       59 return $record if $record->{ error };
352              
353 8         110 my $error = Net::OAI::Error->new( Handler => $record );
354 8         48 my $parser = _parser( $error );
355 8         90 debug( "parsing GetRecord response " . $record->file() );
356 8         22 eval { $parser->parse_uri( $record->file() ) };
  8         152  
357 8 50       2214 if ( $@ ) { _xmlError( $error ); }
  0         0  
358              
359 8         39 $error->set_handler( undef );
360 8         198 $record->{ error } = $error;
361 8         172 return( $record );
362              
363             }
364              
365              
366             =head2 listRecords()
367              
368             listRecords() allows you to retrieve all the records in a data repository.
369             You must supply the C parameter to tell your Net::OAI::Harvester
370             which type of records you are interested in. listRecords() returns an
371             Net::OAI::ListRecords object. There are four other optional parameters C,
372             C, C, and C which are better described in the
373             OAI-PMH spec.
374              
375             my $records = $harvester->listRecords(
376             metadataPrefix => 'oai_dc'
377             );
378              
379             ## iterate through the results with next()
380             while ( my $record = $records->next() ) {
381             my $metadata = $record->metadata();
382             ...
383             }
384              
385             If you would like to use your own metadata handler then you can specify
386             the package name of the handler as the C (will be exposed
387             to events below the C element) or C (will be
388             exposed to the C element and all its children) parameter, passing
389             either the name of the class as string or an already instantiated object of
390             that class which will be reused for all records.
391              
392             my $records = $harvester->listRecords(
393             metadataPrefix => 'mods',
394             metadataHandler => 'MODS::Handler'
395             );
396              
397             while ( my $record = $records->next() ) {
398             my $metadata = $record->metadata();
399             # $metadata will be a MODS::Handler object
400             }
401              
402             If you want to automatically handle resumption tokens you can with
403             the listAllRecords() method.
404              
405             If you prefer you can handle resumption tokens yourself with a
406             loop, and the resumptionToken() method. You might want to do this
407             if you are working with a repository that wants you to wait between
408             requests.
409              
410             my $records = $harvester->listRecords( metadataPrefix => 'oai_dc' );
411             my $responseDate = $records->responseDate();
412             my $finished = 0;
413              
414             while ( ! $finished ) {
415              
416             while ( my $record = $records->next() ) { # a Net::OAI::Record object
417             my $metadata = $record->metadata();
418             # do interesting stuff here
419             }
420              
421             my $rToken = $records->resumptionToken();
422             if ( $rToken ) {
423             $records = $harvester->listRecords(
424             resumptionToken => $rToken->token()
425             );
426             } else {
427             $finished = 1;
428             }
429              
430             }
431              
432             Please note: Since C stashes away the individual
433             records it encounters with C, special care has to
434             be taken if the handlers you provided make use of XS modules
435             since these objects cannot be reliably handled. Therefore you will
436             have to provide the special serializing and deserializing methods
437             C and C for the objects
438             used by your filter(s).
439              
440              
441             =cut
442              
443             sub listRecords {
444 9     9 1 6398 my ( $self, %opts ) = @_;
445              
446             croak( "listRecords(): the 'metadataPrefix' parameter is required" )
447             unless ( exists $opts{ 'metadataPrefix' }
448 9 50 66     61 or defined $opts{ 'resumptionToken' } );
449             croak( "listRecords(): recordHandler and metadataHandler are mutually exclusive" )
450 9 50 66     51 if $opts{ recordHandler } and $opts{ metadataHandler };
451              
452 9         89 my $uri = $self->{ baseURL }->clone();
453              
454             $uri->query_form( verb => 'ListRecords',
455 9 100       331 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( metadataPrefix from until set resumptionToken )
  45         198  
456             );
457              
458             my $list = Net::OAI::ListRecords->new( $self->_get( $uri ),
459             metadataHandler => $opts{ metadataHandler },
460             recordHandler => $opts{ recordHandler },
461 9         923 );
462 9 50       81 return $list if $list->{ error };
463              
464 9         135 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
465 9         116 my $error = Net::OAI::Error->new( Handler => $token );
466 9         53 my $parser = _parser( $error );
467 9         121 debug( "parsing ListRecords response " . $list->file() );
468 9         25 eval { $parser->parse_uri( $list->file() ) };
  9         43  
469 9 50       2620 if ( $@ ) { _xmlError( $error ); }
  0         0  
470              
471 9         62 $token->set_handler( undef );
472 9 100       280 $list->{ token } = $token->token() ? $token : undef;
473              
474 9         46 $error->set_handler( undef );
475 9         198 $list->{ error } = $error;
476              
477 9         253 return( $list );
478             }
479              
480             =head2 listAllRecords()
481              
482             Does exactly what listRecords() does except it will automatically
483             submit resumption tokens as needed.
484              
485             =cut
486              
487             sub listAllRecords {
488 1     1 1 435 my $self = shift;
489 1         13 debug( "calling listRecords() as part of listAllRecords request" );
490 1         3 my $list = listRecords( $self, @_ );
491 1         3 $list->{ harvester } = $self;
492 1         4 return( $list );
493             }
494              
495             =head2 listIdentifiers()
496              
497             listIdentifiers() takes the same parameters that listRecords() takes, but it
498             returns only the record headers, allowing you to quickly retrieve all the
499             record identifiers for a particular repository. The object returned is a
500             L object.
501              
502             my $headers = $harvester->listIdentifiers(
503             metadataPrefix => 'oai_dc'
504             );
505              
506             ## iterate through the results with next()
507             while ( my $header = $identifiers->next() ) { # a Net::OAI::Record::Header object
508             print "identifier: ", $header->identifier(), "\n";
509             }
510              
511             If you want to automtically handle resumption tokens use listAllIdentifiers().
512             If you are working with a repository that encourages pauses between requests
513             you can handle the tokens yourself using the technique described above
514             in listRecords().
515              
516             =cut
517              
518             sub listIdentifiers {
519 6     6 1 11450 my ( $self, %opts ) = @_;
520             croak( "listIdentifiers(): the 'metadataPrefix' parameter is required" )
521             unless ( exists $opts{ 'metadataPrefix' }
522 6 50 66     46 or defined $opts{ 'resumptionToken' } );
523 6         56 my $uri = $self->{ baseURL }->clone();
524              
525             $uri->query_form( verb => 'ListIdentifiers',
526 6 100       310 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( metadataPrefix from until set resumptionToken )
  30         117  
527             );
528              
529 6         626 my $list = Net::OAI::ListIdentifiers->new( $self->_get( $uri ) );
530 6 50       50 return( $list ) if $list->{ error };
531              
532 6         84 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
533 6         80 my $error = Net::OAI::Error->new( Handler => $token );
534 6         37 my $parser = _parser( $error );
535 6         71 debug( "parsing ListIdentifiers response " . $list->file() );
536 6         19 eval { $parser->parse_uri( $list->file() ) };
  6         30  
537 6 50       2288 if ( $@ ) { _xmlError( $error ); }
  0         0  
538              
539 6         47 $token->set_handler( undef );
540 6 100       265 $list->{ token } = $token->token() ? $token : undef;
541              
542 6         52 $error->set_handler( undef );
543 6         128 $list->{ error } = $error;
544              
545 6         169 return( $list );
546             }
547              
548             =head2 listAllIdentifiers()
549              
550             Does exactly what listIdentifiers() does except it will automatically
551             submit resumption tokens as needed.
552              
553             =cut
554              
555             sub listAllIdentifiers {
556 1     1 1 398 my $self = shift;
557 1         4 debug( "calling listIdentifiers() as part of listAllIdentifiers() call" );
558 1         3 my $list = listIdentifiers( $self, @_ );
559 1         3 $list->{ harvester } = $self;
560 1         5 return( $list );
561             }
562              
563             =head2 listSets()
564              
565             listSets() takes an optional C parameter, and returns a
566             Net::OAI::ListSets object. listSets() allows you to harvest a subset of a
567             particular repository with listRecords(). For more information see the OAI-PMH
568             spec and the Net::OAI::ListSets docs.
569              
570             my $sets = $harvester->listSets();
571             foreach ( $sets->setSpecs() ) {
572             print "set spec: $_ ; set name: ", $sets->setName( $_ ), "\n";
573             }
574              
575             =cut
576              
577             sub listSets {
578 1     1 1 626 my ( $self, %opts ) = @_;
579              
580 1         13 my $uri = $self->{ baseURL }->clone();
581              
582             $uri->query_form( verb => 'ListSets',
583 1 50       86 map { (defined $opts{$_}) ? ($_ => $opts{$_}) : () } qw( resumptionToken )
  1         11  
584             );
585              
586 1         94 my $list = Net::OAI::ListSets->new( $self->_get( $uri ) );
587 1 50       8 return( $list ) if $list->{ error };
588              
589 1         15 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
590 1         15 my $error = Net::OAI::Error->new( Handler => $token );
591 1         8 my $parser = _parser( $error );
592 1         17 debug( "parsing ListSets response " . $list->file() );
593 1         2 eval { $parser->parse_uri( $list->file() ) };
  1         5  
594 1 50       355 if ( $@ ) { _xmlError( $error ); }
  0         0  
595              
596 1         10 $token->set_handler( undef );
597 1 50       36 $list->{ token } = $token->token() ? $token : undef;
598 1         7 $error->set_handler( undef );
599 1         21 $list->{ error } = $error;
600 1         36 return( $list );
601             }
602              
603             =head2 baseURL()
604              
605             Gets or sets the base URL for the repository being harvested (as L).
606              
607             $harvester->baseURL( 'http://memory.loc.gov/cgi-bin/oai2_0' );
608              
609             Or if you want to know what the current baseURL is
610              
611             $baseURL = $harvester->baseURL();
612              
613             =cut
614              
615             sub baseURL {
616 0     0 1 0 my ( $self, $url ) = @_;
617 0 0       0 if ( $url ) { $self->{ baseURL } = URI->new( $url ); }
  0         0  
618             ## The HTTP UserAgent modifies its URI object upon execution,
619             ## therefore we'll always provide a clone have to reconstruct: trim the query part ...
620             # my $c = $self->{ baseURL }; # ->canonical();
621             # if ( $c && ($c =~ /^([^\?]*)\?/) ) { # $c might be undefined
622             # return $1};
623             # return $c;
624 0         0 return $self->{ baseURL };
625             }
626              
627             =head2 userAgent()
628              
629             Gets or sets the LWP::UserAgent object being used to perform the HTTP
630             transactions. This method could be useful if you wanted to change the
631             agent string, timeout, or some other feature.
632              
633             =cut
634              
635             sub userAgent {
636 23     23 1 450 my ( $self, $ua ) = @_;
637 23 100       127 if ( $ua ) {
638 22 50       210 $ua->isa('LWP::UserAgent') or croak( "userAgent() needs a valid LWP::UserAgent" );
639 22         95 $self->{ userAgent } = $ua;
640             }
641 23         67 return( $self->{ userAgent } );
642             }
643              
644             ## internal stuff
645              
646             sub _get {
647 32     32   69 my ($self,$uri) = @_;
648 32         85 my $ua = $self->{ userAgent };
649              
650 32         52 my ($fh, $file);
651 32 100       136 if ( $self->{ dumpDir } ) {
652 2         5 my $filePrefix = $self->{lastDump}++;
653 2         12 $file = sprintf("%s/%08d.xml", $self->{dumpDir}, $filePrefix);
654 2         18 $fh = IO::File->new($file, 'w');
655             } else {
656 30         186 ( $fh, $file ) = tempfile(UNLINK => 1);
657             }
658              
659 32         358489 debug( "fetching ".$uri->as_string() );
660 32         127 debug( "writing to file: $file" );
661 32         128 my $request = HTTP::Request->new( GET => $uri->as_string() );
662 32     1154   5147 my $response = $ua->request( $request, sub { print $fh shift; }, 8192 );
  1154         93138238  
663 32         637355649 close( $fh );
664              
665 32 100       349 if ( $response->is_error() ) {
666             # HTTP::Request does not provide a file in case of HTTP level errors,
667             # therefore we do not return the name of the non-existant file but
668             # rather the original HTTP::Response object
669 5         96 debug( "caught HTTP level error" . $response->message() );
670 5   50     30 my $error = Net::OAI::Error->new(
671             errorString => 'HTTP Level Error: ' . $response->message(),
672             errorCode => $response->code(),
673             HTTPError => $response,
674             HTTPRetryAfter => $response->header("Retry-After") || "",
675             );
676             return(
677             # file => $file,
678 5         72 error => $error
679             );
680             }
681 27 50       625 if ( my $ct = $response->header("Content-Type") ) {
682 27         2106 debug( "Content-type $ct in HTTP response" );
683 27 50       265 unless ( $ct =~ /^text\/xml(;|$)/ ) {
684 0   0     0 return (error => Net::OAI::Error->new(errorCode => 'xmlContentError',
685             errorString => "Content-Type: text/xml is mandatory (got: $ct)!"),
686             HTTPError => $response,
687             HTTPRetryAfter => $response->header("Retry-After") || "",
688             )
689             };
690 27 50       285 if ( $ct =~ /; charset=(\S+)/ ) {
691 27         150 my $cs = $1;
692 27 50 0     230 return (error => Net::OAI::Error->new(errorCode => 'xmlContentError',
693             errorString => "charset=UTF-8 is mandatory (got: $cs)!"),
694             HTTPError => $response,
695             HTTPRetryAfter => $response->header("Retry-After") || "",
696             ) unless $cs =~ /^utf-8/i;
697             };
698             }
699              
700             return(
701 27         1129 file => $file,
702             );
703              
704             }
705              
706             sub _parser {
707 29     29   884 my $handler = shift;
708 29         389 my $factory = XML::SAX::ParserFactory->new();
709 29         6452 my $parser;
710 29         197 $factory->require_feature(Namespaces);
711 29         358 eval { $parser = $factory->parser( Handler => $handler ) };
  29         181  
712 29 50       593629 carp ref($factory)." threw an exception:\n\t$@" if $@;
713              
714 29 50 33     298 if ( $parser && ref($parser) ) {
715 29         606 debug( "using SAX parser " . ref($parser) . " " . $parser->VERSION );
716 29         334 return $parser;
717             };
718              
719 0         0 carp "!!! Please check your setup of XML::SAX, especially ParserDetails.ini !!!\n";
720 0         0 local($XML::SAX::ParserPackage) = "XML::SAX::PurePerl";
721 0         0 eval { $parser = $factory->parser( Handler => $handler ) };
  0         0  
722 0 0       0 carp ref($factory)." threw an exception again:\n\t$@" if $@;
723 0 0 0     0 if ( $parser && ref($parser) ) {
724 0         0 carp "Successfuly forced assignment of a parser: " . ref($parser) . " " . $parser->VERSION ."\n";
725 0         0 return $parser;
726             };
727              
728 0         0 croak( ref($factory)." on request did not even give us the default XML::SAX::PurePerl parser.\nGiving up." );
729             }
730              
731             sub _xmlError {
732 0     0   0 my $e = shift;
733 0         0 carp "caught xml parsing error: $@";
734 0         0 $e->errorString( "XML parsing error: $@" );
735 0         0 $e->errorCode( 'xmlParseError' );
736             }
737              
738              
739             sub _verifyHandler {
740 217     217   529 my $package_or_instance = shift;
741 217 100       840 if ( ref($package_or_instance) ) {
742 205 50       1262 $package_or_instance->isa('XML::SAX::Base')
743             or _fatal( "Handler $package_or_instance must inherit from XML::SAX::Base\n" )
744             }
745             else {
746 12     5   1812 eval( "use $package_or_instance" );
  5     4   73  
  5         17  
  5         213  
  4         1025  
  4         1227  
  4         116  
747 12 50       76 _fatal( "unable to locate Handler $package_or_instance in: " .
748             join( "\n\t", @INC ) ) if $@;
749             _fatal( "Handler $package_or_instance must inherit from XML::SAX::Base\n" )
750 12 50       1081 if ( ! grep { 'XML::SAX::Base' } eval( '@' . $package_or_instance . '::ISA' ) );
  12         100  
751             }
752 217         588 return( 1 );
753             }
754              
755              
756             sub debug {
757 6632 50   6632 0 22526 return unless $Net::OAI::Harvester::DEBUG;
758 0         0 my $msg = shift;
759 0         0 carp "oai-harvester: " . localtime() . ": $msg\n";
760             }
761              
762             sub _fatal {
763 0     0   0 my $msg = shift;
764 0         0 croak "fatal: $msg";
765             }
766              
767             =head1 DIAGNOSTICS
768              
769             If you would like to see diagnostic information when harvesting is running
770             then set Net::OAI::Harvester::DEBUG to a true value.
771              
772             $Net::OAI::Harvester::DEBUG = 1;
773              
774              
775              
776              
777             =head1 PERFORMANCE
778              
779             XML::SAX is used for parsing, but it presents a generalized interface to many
780             parsers. It comes with XML::Parser::PurePerl by default, which is nice since
781             you don't have to worry about getting the right libraries installed. However
782             XML::Parser::PurePerl is rather slow compared to XML::LibXML. If you
783             are a speed freak install XML::LibXML from CPAN today.
784              
785             If you have a particular parser you want to use you can set the
786             $XML::SAX::ParserPackage variable appropriately. See XML::SAX::ParserFactory
787             documentation for details.
788              
789              
790              
791             =head1 ENVIRONMENT
792              
793             The modules use LWP for HTTP operations, thus C controls
794             wether the "_proxy" environment settings shall be honored.
795              
796              
797             =head1 TODO
798              
799             =over 4
800              
801             =item *
802              
803             Allow Net::OAI::ListMetadataFormats to store more than just the metadata
804             prefixes.
805              
806             =item *
807              
808             Implement Net::OAI::Set for iterator access to Net::OAI::ListSets.
809              
810             =item *
811              
812             Implement Net::OAI::Harvester::listAllSets().
813              
814             =item *
815              
816             More documentation of other classes.
817              
818             =item *
819              
820             Document custom XML::Handler creation.
821              
822             =item *
823              
824             Handle optional compression.
825              
826             =item *
827              
828             Create common handlers for other metadata formats (MARC, qualified DC, etc).
829              
830             =item *
831              
832             Selectively load Net::OAI::* classes as needed, rather than getting all of them
833             at once at the beginning of Net::OAI::Harvester.
834              
835             =back
836              
837             =head1 SEE ALSO
838              
839             =over 4
840              
841             =item *
842              
843             OAI-PMH Specification at L
844              
845             =item *
846              
847             L
848              
849             =item *
850              
851             L
852              
853             =item *
854              
855             L
856              
857             =item *
858              
859             L
860              
861             =item *
862              
863             L
864              
865             =item *
866              
867             L
868              
869             =item *
870              
871             L
872              
873             =item *
874              
875             L
876              
877             =item *
878              
879             L
880              
881             =item *
882              
883             L
884              
885             =item *
886              
887             L
888              
889             =item *
890              
891             L
892              
893             =item *
894              
895             L
896              
897             =item *
898              
899             L
900              
901             =back
902              
903              
904             =head1 AUTHORS
905              
906             Ed Summers
907              
908             Martin Emmerich
909              
910             Thomas Berger
911              
912             =cut
913              
914             1;