File Coverage

blib/lib/Net/OAI/Harvester.pm
Criterion Covered Total %
statement 247 270 91.4
branch 55 86 63.9
condition 8 18 44.4
subroutine 34 37 91.8
pod 11 12 91.6
total 355 423 83.9


line stmt bran cond sub pod time code
1             package Net::OAI::Harvester;
2              
3 16     16   80606 use strict;
  16         731  
  16         668  
4 14     14   80 use warnings;
  14         32  
  14         473  
5              
6 14     14   35489 use URI;
  14         97317  
  14         631  
7 14     14   17969 use LWP::UserAgent;
  14         1914621  
  14         561  
8 14     14   14272 use XML::SAX qw( Namespaces Validation );
  14         101093  
  14         987  
9 14     14   18828 use File::Temp qw( tempfile );
  14         307495  
  14         1163  
10 14     14   142 use Carp qw( croak );
  14         31  
  14         665  
11              
12 14     14   15418 use Net::OAI::Error;
  14         57  
  14         926  
13 14     14   10561 use Net::OAI::ResumptionToken;
  14         37  
  14         347  
14 14     14   9833 use Net::OAI::Identify;
  14         48  
  14         694  
15 14     14   8799 use Net::OAI::ListMetadataFormats;
  14         57  
  14         424  
16 14     14   8558 use Net::OAI::ListIdentifiers;
  14         41  
  14         504  
17 14     14   8798 use Net::OAI::ListRecords;
  14         45  
  14         407  
18 14     14   8042 use Net::OAI::GetRecord;
  14         77  
  14         433  
19 14     14   92 use Net::OAI::ListRecords;
  14         27  
  14         270  
20 14     14   8226 use Net::OAI::ListSets;
  14         37  
  14         383  
21 14     14   90 use Net::OAI::Record::Header;
  14         30  
  14         365  
22 14     14   8928 use Net::OAI::Record::OAI_DC;
  14         459  
  14         42652  
23              
24             our $VERSION = '1.15';
25             our $DEBUG = 0;
26              
27             =head1 NAME
28              
29             Net::OAI::Harvester - A package for harvesting metadata using OAI-PMH
30              
31             =head1 SYNOPSIS
32              
33             ## create a harvester for the Library of Congress
34             my $harvester = Net::OAI::Harvester->new(
35             'baseURL' => 'http://memory.loc.gov/cgi-bin/oai2_0'
36             );
37              
38             ## list all the records in a repository
39             my $records = $harvester->listRecords(
40             'metadataPrefix' => 'oai_dc'
41             );
42             while ( my $record = $records->next() ) {
43             my $header = $record->header();
44             my $metadata = $record->metadata();
45             print "identifier: ", $header->identifier(), "\n";
46             print "title: ", $metadata->title(), "\n";
47             }
48              
49             ## find out the name for a repository
50             my $identity = $harvester->identify();
51             print "name: ",$identity->repositoryName(),"\n";
52              
53             ## get a list of identifiers
54             my $identifiers = $harvester->listIdentifiers(
55             'metadataPrefix' => 'oai_dc'
56             );
57             while ( my $header = $identifiers->next() ) {
58             print "identifier: ",$header->identifier(), "\n";
59             }
60              
61             ## list all the records in a repository
62             my $records = $harvester->listRecords(
63             'metadataPrefix' => 'oai_dc'
64             );
65             while ( my $record = $records->next() ) {
66             my $header = $record->header();
67             my $metadata = $record->metadata();
68             print "identifier: ", $header->identifier(), "\n";
69             print "title: ", $metadata->title(), "\n";
70             }
71              
72             ## GetRecord, ListSets, ListMetadataFormats also supported
73              
74             =head1 DESCRIPTION
75              
76             Net::OAI::Harvester is a Perl extension for easily querying OAI-PMH
77             repositories. OAI-PMH is the Open Archives Initiative Protocol for Metadata
78             Harvesting. OAI-PMH allows data repositories to share metadata about their
79             digital assets. Net::OAI::Harvester is a OAI-PMH client, so it does for
80             OAI-PMH what LWP::UserAgent does for HTTP.
81              
82             You create a Net::OAI::Harvester object which you can then use to
83             retrieve metadata from a selected repository. Net::OAI::Harvester tries to keep
84             things simple by providing an API to get at the data you want; but it also has
85             a framework which is easy to extend should you need to get more fancy.
86              
87             The guiding principle behind OAI-PMH is to allow metadata about online
88             resources to be shared by data providers, so that the metadata can be harvested
89             by interested parties. The protocol is essentially XML over HTTP (much like
90             XMLRPC or SOAP). Net::OAI::Harvester does XML parsing for you
91             (using XML::SAX internally), but you can get at the raw XML if you want to do
92             your own XML processing, and you can drop in your own XML::SAX handler if you
93             would like to do your own parsing of metadata elements.
94              
95             A OAI-PMH repository supports 6 verbs: GetRecord, Identify, ListIdentifiers,
96             ListMetadataFormats, ListRecords, and ListSets. The verbs translate directly
97             into methods that you can call on a Net::OAI::Harvester object. More details
98             about these methods are supplied below, however for the real story please
99             consult the spec at http://www.openarchives.org.
100              
101             Net::OAI::Harvester has a few features that are worth mentioning:
102              
103             =over 4
104              
105             =item 1
106              
107             Since the OAI-PMH results can be arbitrarily large, a stream based (XML::SAX)
108             parser is used. As the document is parsed corresponding Perl objects are
109             created (records, headers, etc), which are then serialized on disk (using
110             Storable if you are curious). The serialized objects on disk can then be
111             iterated over one at a time. The benefit of this is a lower memory footprint
112             when (for example) a ListRecords verb is exercised on a repository that
113             returns 100,000 records.
114              
115             =item 2
116              
117             XML::SAX filters are used which will allow interested developers to write
118             their own metadata parsing packages, and drop them into place. This is useful
119             because OAI-PMH is itself metadata schema agnostic, so you can use OAI-PMH
120             to distribute all kinds of metadata (Dublin Core, MARC, EAD, or your favorite
121             metadata schema). OAI-PMH does require that a repository at least provides
122             Dublin Core metadata as a baseline. Net::OAI::Harvester has built in support for
123             unqualified Dublin Core, and has a framework for dropping in your own parser
124             for other kinds of metadata. If you create a XML::Handler that you would like
125             to contribute back into the Net::OAI::Harvester project please get in touch!
126              
127             =back
128              
129             =head1 METHODS
130              
131             All the Net::OAI::Harvester methods return other objects. As you would expect
132             new() returns an Net::OAI::Harvester object; similarly getRecord() returns an
133             Net::OAI::Record object, listIdentifiers() returns a Net::OAI::ListIdentifiers
134             object, identify() returns an Net::OAI::Identify object, and so on. So when
135             you use one of these methods you'll probably want to check out the docs for
136             the object that gets returned so you can see what to do with it. Many
137             of these classes inherit from Net::OAI::Base which provides some base
138             functionality for retrieving errors, getting the raw XML, and the
139             temporary file where the XML is stored (see Net::OAI::Base documentation for
140             more details).
141              
142             =head2 new()
143              
144             The constructor which returns an Net::OAI::Harvester object. You must supply the
145             baseURL parameter, to tell Net::OAI::Harvester what data repository you are
146             going to be harvesting. For a list of data providers check out the directory
147             available on the Open Archives Initiative homepage.
148              
149             my $harvester = Net::OAI::Harvester->new(
150             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0'
151             );
152              
153             If you want to pull down all the XML files and keep them in a directory, rather
154             than having the stored as transient temp files pass in the dumpDir parameter.
155              
156             my $harvester = Net::OAI::Harvester->new(
157             baseUrl => 'http://memory.loc.gov/cgi-bin/oai2_0',
158             dumpDir => 'american-memory'
159             );
160              
161             Also if you would like to fine tune the HTTP client used by Net::OAI::Harvester
162             you can pass in a configured object. For example this can be handy if you
163             want to adjust the client timeout:
164              
165             my $ua = LWP::UserAgent->new();
166             $ua->timeout(20); ## set timeout to 20 seconds
167             my $harvester = Net::OAI::Harvester->new(
168             baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0',
169             userAgent => $ua
170             );
171              
172             =cut
173              
174             sub new {
175 17     17 1 17125 my ( $class, %opts ) = @_;
176              
177             ## uppercase options
178 17         67 my %normalOpts = map { ( uc($_), $opts{$_} ) } keys( %opts );
  19         129  
179            
180             ## we must be told a baseURL
181 17 50       101 croak( "new() needs the baseUrl parameter" ) if !$normalOpts{ BASEURL };
182 17         156 my $baseURL = URI->new( $normalOpts{ BASEURL } );
183              
184 17   33     5082610 my $self = bless( { baseURL => $baseURL }, ref( $class ) || $class );
185              
186             ## set the user agent
187 17 100       99 if ( $normalOpts{ USERAGENT } ) {
188 1         6 $self->userAgent( $normalOpts{ USERAGENT } );
189             } else {
190 16         253 my $ua = LWP::UserAgent->new();
191 16         47969 $ua->agent( $class );
192 16         1109 $self->userAgent( $ua );
193             }
194              
195             # set up some stuff if we are dumping xml to a directory
196 17 100       114 if ($normalOpts{ DUMPDIR }) {
197 1         2 my $dir = $normalOpts{ DUMPDIR };
198 1 50       18 croak "no such directory '$dir'" unless -d $dir;
199 1         3 $self->{ dumpDir } = $dir;
200 1         3 $self->{ lastDump } = 0;
201             }
202              
203 17         105 return( $self );
204             }
205              
206             =head2 identify()
207              
208             identify() is the OAI verb that tells a metadata repository to provide a
209             description of itself. A call to identify() returns a Net::OAI::Identify object
210             which you can then call methods on to retrieve the information you are
211             intersted in. For example:
212              
213             my $identity = $harvester->identify();
214             print "repository name: ",$identity->repositoryName(),"\n";
215             print "protocol version: ",$identity->protocolVersion(),"\n";
216             print "earliest date stamp: ",$identity->earliestDatestamp(),"\n";
217             print "admin email(s): ", join( ", ", $identity->adminEmail() ), "\n";
218             ...
219              
220             For more details see the Net::OAI::Identify documentation.
221              
222             =cut
223              
224             sub identify {
225 4     4 1 1589 my $self = shift;
226 4         12 my $uri = $self->{ baseURL };
227 4         43 $uri->query_form( 'verb' => 'Identify' );
228              
229 4         677 my $identity = Net::OAI::Identify->new( $self->_get( $uri ) );
230 4 100       35 return $identity if $identity->errorCode();
231              
232 2         23 my $token = Net::OAI::ResumptionToken->new( Handler => $identity );
233 2         19 my $error = Net::OAI::Error->new( Handler => $token );
234 2         8 my $parser = _parser( $error );
235 2         20 debug( "parsing Identify response " . $identity->file() );
236 2         4 eval { $parser->parse_uri( $identity->file() ) };
  2         7  
237 2 50       664 if ( $@ ) {_xmlError( $error ); }
  0         0  
238 2 50       11 $identity->{ token } = $token->token() ? $token : undef;
239 2         9 $identity->{ error } = $error;
240 2         69 return( $identity );
241             }
242              
243             =head2 listMetadataFormats()
244              
245             listMetadataFormats() asks the repository to return a list of metadata formats
246             that it supports. A call to listMetadataFormats() returns an
247             Net::OAI::ListMetadataFormats object.
248              
249             my $list = $harvester->listMetadataFormats();
250             print "archive supports metadata prefixes: ",
251             join( ',', $list->prefixes() ),"\n";
252              
253             If you are interested in the metadata formats available for
254             a particular resource identifier then you can pass in that identifier.
255            
256             my $list = $harvester->listMetadataFormats( identifier => '1234567' );
257             print "record identifier 1234567 can be retrieved as ",
258             join( ',', $list->prefixes() ),"\n";
259              
260             See documentation for Net::OAI::ListMetadataFormats for more details.
261              
262             =cut
263              
264             sub listMetadataFormats {
265 2     2 1 452 my ( $self, %opts ) = @_;
266 2         8 my $uri = $self->{ baseURL };
267 2         8 my %pairs = ( verb => 'ListMetadataFormats' );
268 2 100       8 if ( $opts{ identifier } ) {
269 1         4 $pairs{ identifier } = $opts{ identifier };
270             }
271 2         26 $uri->query_form( %pairs );
272              
273 2         348 my $list = Net::OAI::ListMetadataFormats->new( $self->_get( $uri ) );
274 2 50       21 return $list if $list->errorCode();
275              
276 2         24 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
277 2         23 my $error = Net::OAI::Error->new( Handler => $token );
278 2         9 my $parser = _parser( $error );
279 2         24 debug( "parsing ListMetadataFormats response: ".$list->file() );
280 2         6 eval{ $parser->parse_uri( $list->file() ) };
  2         11  
281 2 50       9530 if ( $@ ) { _xmlError( $error ); }
  0         0  
282 2 50       16 $list->{ token } = $token->token() ? $token : undef;
283 2         7 $list->{ error } = $error;
284 2         100 return( $list );
285             }
286              
287             =head2 getRecord()
288              
289             getRecord() is used to retrieve a single record from a repository. You must pass
290             in the C and an optional C parameters to identify
291             the record, and the flavor of metadata you would like. Net::OAI::Harvester
292             includes a parser for OAI DublinCore, so if you do not specifiy a
293             metadataPrefix 'oai_dc' will be assumed. If you would like to drop in you own
294             XML::Handler for another type of metadata use the C parameter.
295              
296             my $record = $harvester->getRecord(
297             identifier => 'abc123',
298             );
299              
300             ## get the Net::OAI::Record::Header object
301             my $header = $record->header();
302              
303             ## get the metadata object
304             my $metadata = $record->metadata();
305              
306             ## or if you would rather use your own XML::Handler
307             ## pass in the package name for the object you would like to create
308             my $record = $harvester->getRecord(
309             identifier => 'abc123',
310             metadataHandler => 'MyHandler'
311             );
312             my $metadata = $record->metadata();
313            
314             =cut
315              
316             sub getRecord {
317 3     3 1 1323 my ( $self, %opts ) = @_;
318              
319 3 50       15 croak( "you must pass the identifier parameter to getRecord()" )
320             if ( ! exists( $opts{ 'identifier' } ) );
321 3 50       59 croak( "you must pass the metadataPrefix parameter to getRecord()" )
322             if ( ! exists( $opts{ 'metadataPrefix' } ) );
323              
324 3         6 my $metadataHandler;
325 3 100       13 if ( exists( $opts{ metadataHandler } ) ) {
326 1         4 my $package = $opts{ metadataHandler };
327 1         5 _verifyMetadataHandler( $package );
328 1         9 $metadataHandler = $package->new();
329             } else {
330 2         30 $metadataHandler = Net::OAI::Record::OAI_DC->new();
331             }
332              
333 3         91 my $uri = $self->{ baseURL };
334 3         41 $uri->query_form(
335             verb => 'GetRecord',
336             identifier => $opts{ 'identifier' },
337             metadataPrefix => $opts{ 'metadataPrefix' }
338             );
339              
340 3         644 my $record = Net::OAI::GetRecord->new( $self->_get( $uri ) );
341 3 50       31 return $record if $record->errorCode();
342              
343 3         37 my $header = Net::OAI::Record::Header->new( Handler => $metadataHandler );
344 3         36 my $error = Net::OAI::Error->new( Handler => $header );
345 3         12 my $parser = _parser( $error );
346 3         69 debug( "parsing GetRecord response " . $record->file() );
347 3         13 $parser->parse_uri( $record->file() );
348 3 50       1262 if ( $@ ) { _xmlError( $error ); }
  0         0  
349              
350 3         16 $record->{ error } = $error;
351 3         10 $record->{ metadata } = $metadataHandler;
352 3         11 $record->{ header } = $header;
353 3         126 return( $record );
354              
355             }
356              
357              
358             =head2 listRecords()
359              
360             listRecords() allows you to retrieve all the records in a data repository.
361             You must supply the C parameter to tell your Net::OAI::Harvester
362             which type of records you are interested in. listRecords() returns an
363             Net::OAI::ListRecords object. There are four other optional parameters C,
364             C, C, and C which are better described in the
365             OAI-PMH spec.
366              
367             my $records = $harvester->listRecords(
368             metadataPrefix => 'oai_dc'
369             );
370              
371             ## iterate through the results with next()
372             while ( my $record = $records->next() ) {
373             my $metadata = $record->metadata();
374             ...
375             }
376              
377             If you would like to use your own metadata handler then you can specify
378             the package name of the handler.
379              
380             my $records = $harvester->listRecords(
381             metadataPrefix => 'mods',
382             metadataHandler => 'MODS::Handler'
383             );
384              
385             while ( my $record = $records->next() ) {
386             my $metadata = $record->metadata();
387             # $metadata will be a MODS::Handler object
388             }
389              
390             If you want to automatically handle resumption tokens you can with
391             the listAllRecords() method.
392              
393             If you prefer you can handle resumption tokens yourself with a
394             loop, and the resumptionToken() method. You might want to do this
395             if you are working with a repository that wants you to wait between
396             requests.
397              
398             my $records = $harvester->listRecords( metadataPrefix => 'oai_dc' );
399             my $finished = 0;
400              
401             while ( ! $finished ) {
402              
403             while ( my $record = $records->next() ) {
404             my $metadata = $record->metadata();
405             # do interesting stuff here
406             }
407              
408             my $rToken = $records->resumptionToken();
409             if ( $rToken ) {
410             $records = $harvester->listRecords(
411             resumptionToken => $rToken->token()
412             );
413             } else {
414             $finished = 1;
415             }
416              
417             }
418              
419             =cut
420              
421             sub listRecords {
422 5     5 1 1121 my ( $self, %opts ) = @_;
423              
424 5 50 66     34 croak( "you must pass the metadataPrefix parameter to listRecords()" )
425             if ( ! exists( $opts{ 'metadataPrefix' } )
426             and ! exists( $opts{ 'resumptionToken' } ) );
427 5         17 my %pairs = (
428             verb => 'ListRecords',
429             );
430              
431 5         16 foreach ( qw( metadataPrefix from until set resumptionToken ) ) {
432 25 100       68 if ( exists( $opts{ $_ } ) ) {
433 8         22 $pairs{ $_ } = $opts{ $_ };
434             }
435             }
436 5         26 my $uri = $self->{ baseURL };
437 5         62 $uri->query_form( %pairs );
438              
439             my $list = Net::OAI::ListRecords->new( $self->_get( $uri ),
440 5         903 metadataHandler => $opts{ metadataHandler } );
441 5 50       64 return $list if $list->errorCode();
442              
443 5         53 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
444 5         53 my $error = Net::OAI::Error->new( Handler => $token );
445 5         24 my $parser = _parser( $error );
446 5         64 debug( "parsing ListRecords response " . $list->file() );
447 5         13 eval { $parser->parse_uri( $list->file() ) };
  5         19  
448 5 50       1724 if ( $@ ) { _xmlError( $error ); }
  0         0  
449              
450 5         19 $list->{ error } = $error;
451 5 100       28 $list->{ token } = $token->token() ? $token : undef;
452 5         196 return( $list );
453             }
454              
455             =head2 listAllRecords()
456              
457             Does exactly what listRecords() does except it will automatically
458             submit resumption tokens as needed.
459              
460             =cut
461              
462             sub listAllRecords {
463 1     1 1 426 my $self = shift;
464 1         6 debug( "calling listRecords() as part of listAllRecords request" );
465 1         4 my $list = listRecords( $self, @_ );
466 1         4 $list->{ harvester } = $self;
467 1         5 return( $list );
468             }
469              
470             =head2 listIdentifiers()
471              
472             listIdentifiers() takes the same parameters that listRecords() takes, but it
473             returns only the record headers, allowing you to quickly retrieve all the
474             record identifiers for a particular repository. The object returned is a
475             Net::OAI::ListIdentifiers object.
476              
477             my $headers = $harvester->listIdentifiers(
478             metadataPrefix => 'oai_dc'
479             );
480              
481             ## iterate through the results with next()
482             while ( my $header = $identifiers->next() ) {
483             print "identifier: ", $header->identifier(), "\n";
484             }
485              
486             If you want to automtically handle resumption tokens use listAllIdentifiers().
487             If you are working with a repository that encourages pauses between requests
488             you can handle the tokens yourself using the technique described above
489             in listRecords().
490              
491             =cut
492              
493             sub listIdentifiers {
494 7     7 1 12425 my ( $self, %opts ) = @_;
495             croak( "listIdentifiers(): metadataPrefix is a required parameter" )
496             if ( ! exists( $opts{ metadataPrefix } )
497 7 50 66     44 and ! exists( $opts{ resumptionToken } ) );
498 7         24 my $uri = $self->{ baseURL };
499 7         31 my %pairs = (
500             verb => 'ListIdentifiers',
501             );
502 7         29 foreach ( qw( metadataPrefix from until set resumptionToken ) ) {
503 35 100       100 if ( exists( $opts{ $_ } ) ) {
504 10         30 $pairs{ $_ } = $opts{ $_ };
505             }
506             }
507 7         128 $uri->query_form( %pairs );
508              
509 7         1506 my $list = Net::OAI::ListIdentifiers->new( $self->_get( $uri ) );
510 7 50       264 return( $list ) if $list->errorCode();
511              
512 7         83 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
513 7         84 my $error = Net::OAI::Error->new( Handler => $token );
514 7         34 my $parser = _parser( $error );
515 7         85 debug( "parsing ListIdentifiers response " . $list->file() );
516 7         16 eval { $parser->parse_uri( $list->file() ) };
  7         32  
517 7 50       2693 if ( $@ ) { _xmlError( $error ); }
  0         0  
518 7 100       41 $list->{ token } = $token->token() ? $token : undef;
519 7         25 $list->{ error } = $error;
520 7         320 return( $list );
521             }
522              
523             =head2 listAllIdentifiers()
524              
525             Does exactly what listIdentifiers() does except it will automatically
526             submit resumption tokens as needed.
527              
528             =cut
529              
530             sub listAllIdentifiers {
531 1     1 1 735 my $self = shift;
532 1         10 debug( "calling listIdentifiers() as part of listAllIdentifiers() call" );
533 1         7 my $list = listIdentifiers( $self, @_ );
534 1         4 $list->{ harvester } = $self;
535 1         4 return( $list );
536             }
537              
538             =head2 listSets()
539              
540             listSets() takes an optional C parameter, and returns a
541             Net::OAI::ListSets object. listSets() allows you to harvest a subset of a
542             particular repository with listRecords(). For more information see the OAI-PMH
543             spec and the Net::OAI::ListSets docs.
544              
545             my $sets = $harvester->listSets();
546             foreach ( $sets->setSpecs() ) {
547             print "set spec: $_ ; set name: ", $sets->setName( $_ ), "\n";
548             }
549              
550             =cut
551              
552             sub listSets {
553 1     1 1 880 my ( $self, %opts ) = @_;
554 1         7 my %pairs = ( verb => 'ListSets' );
555 1 50       6 if ( exists( $opts{ resumptionToken } ) ) {
556 0         0 $pairs{ resumptionToken } = $opts{ resumptionToken };
557             }
558 1         4 my $uri = $self->{ baseURL };
559 1         21 $uri->query_form( %pairs );
560              
561 1         264 my $list = Net::OAI::ListSets->new( $self->_get( $uri ) );
562 1 50       11 return( $list ) if $list->errorCode();
563              
564 1         8 my $token = Net::OAI::ResumptionToken->new( Handler => $list );
565 1         10 my $error = Net::OAI::Error->new( Handler => $token );
566 1         6 my $parser = _parser( $error );
567 1         18 debug( "parsing ListSets response " . $list->file() );
568 1         2 eval{ $parser->parse_uri( $list->file() ) };
  1         6  
569 1 50       368 if ( $@ ) { _xmlError( $error ); }
  0         0  
570 1         6 $list->{ error } = $error;
571 1 50       7 $list->{ token } = $token->token() ? $token : undef;
572 1         55 return( $list );
573             }
574              
575             =head2 baseURL()
576              
577             Gets or sets the base URL for the repository being harvested.
578              
579             $harvester->baseURL( 'http://memory.loc.gov/cgi-bin/oai2_0' );
580              
581             Or if you want to know what the current baseURL is
582              
583             $baseURL = $harvester->baseURL();
584              
585             =cut
586              
587             sub baseURL {
588 0     0 1 0 my ( $self, $url ) = @_;
589 0 0       0 if ( $url ) { $self->{ baseURL } = URI->new( $url ); }
  0         0  
590             # The HTTP UserAgent modifies its URI object upon execution,
591             # therefore we have to reconstruct: trim the query part ...
592 0         0 my $c = $self->{ baseURL }->canonical();
593 0 0 0     0 if ( $c && ($c =~ /^([^\?]*)\?/) ) { # $c might be undefined
594 0         0 return $1};
595 0         0 return $c;
596             }
597              
598             =head2 userAgent()
599              
600             Gets or sets the LWP::UserAgent object being used to perform the HTTP
601             transactions. This method could be useful if you wanted to change the
602             agent string, timeout, or some other feature.
603              
604             =cut
605              
606             sub userAgent {
607 18     18 1 49 my ( $self, $ua ) = @_;
608 18 100       120 if ( $ua ) {
609 17 50       105 croak( "userAgent() needs a valid LWP::UserAgent" )
610             if ref( $ua ) ne 'LWP::UserAgent';
611 17         110 $self->{ userAgent } = $ua;
612             }
613 18         56 return( $self->{ userAgent } );
614             }
615              
616             ## internal stuff
617              
618             sub _get {
619 22     22   55 my ($self,$uri) = @_;
620 22         67 my $ua = $self->{ userAgent };
621              
622 22         64 my ($fh, $file);
623              
624 22 100       89 if ( $self->{ dumpDir } ) {
625 2         6 my $filePrefix = $self->{lastDump}++;
626 2         14 $file = sprintf("%s/%08d.xml", $self->{dumpDir}, $filePrefix);
627 2         23 $fh = IO::File->new($file, 'w');
628             }
629              
630             else {
631 20         162 ( $fh, $file ) = tempfile(UNLINK => 1);
632             }
633              
634 22         15741 debug( "fetching ".$uri->as_string() );
635 22         94 debug( "writing to file: $file" );
636 22         89 my $request = HTTP::Request->new( GET => $uri->as_string() );
637 22     1588   4023 my $response = $ua->request( $request, sub { print $fh shift; }, 4096 );
  1588         117661344  
638 22         1020756 close( $fh );
639              
640 22 100       172 if ( $response->is_error() ) {
641             # HTTP::Request does not provide a file in case of HTTP level errors,
642             # therefore we do not return the name of the non-existant file but
643             # rather the original HTTP::Response object
644 2         33 debug( "caught HTTP level error" . $response->message() );
645 2         13 my $error = Net::OAI::Error->new(
646             errorString => 'HTTP Level Error: ' . $response->message(),
647             errorCode => $response->code(),
648             HTTPError => $response,
649             );
650             return(
651             # file => $file,
652 2         26 error => $error
653             );
654             }
655              
656             return(
657 20         1452 file => $file,
658             );
659              
660             }
661              
662             sub _parser {
663 22     22   1328 my $handler = shift;
664 22         247 my $factory = XML::SAX::ParserFactory->new();
665 22         4000 my $parser;
666 22         125 $factory->require_feature(Namespaces);
667 22         231 eval { $parser = $factory->parser( Handler => $handler ) };
  22         115  
668 22 100       1937 warn ref($factory)." threw an exception:\n\t$@" if $@;
669              
670 22 100 66     3807 if ( $parser && ref($parser) ) {
671 1         12 debug( "using SAX parser " . ref($parser) . " " . $parser->VERSION );
672 1         5 return $parser;
673             };
674              
675 21         1603 warn "!!! Please check your setup of XML::SAX, especially ParserDetails.ini !!!\n";
676 21         93 local($XML::SAX::ParserPackage) = "XML::SAX::PurePerl";
677 21         62 eval { $parser = $factory->parser( Handler => $handler ) };
  21         210  
678 21 50       10575526 warn ref($factory)." threw an exception again:\n\t$@" if $@;
679 21 50 33     209 if ( $parser && ref($parser) ) {
680 21         2431 warn "Successfuly forced assignment of a parser: " . ref($parser) . " " . $parser->VERSION ."\n";
681 21         322 return $parser;
682             };
683              
684 0         0 croak( ref($factory)." on request did not even give us the default XML::SAX::PurePerl parser.\nGiving up." );
685             }
686              
687             sub _xmlError {
688 0     0   0 my $e = shift;
689 0         0 warn "caught xml parsing error: $@";
690 0         0 $e->errorString( "XML parsing error: $@" );
691 0         0 $e->errorCode( 'xmlParseError' );
692             }
693              
694              
695             sub _verifyMetadataHandler {
696 6     6   85 my $package = shift;
697 6     4   991 eval( "use $package" );
  4         409  
  4         291  
  4         72  
698 6 50       36 _fatal( "unable to locate metadataHandler $package in: " .
699             join( "\n\t", @INC ) ) if $@;
700 6         33 _fatal( "metadataHandler $package must inherit from XML::SAX::Base\n" )
701 6 50       296 if ( ! grep { 'XML::SAX::Base' } eval( '@' . $package . '::ISA' ) );
702 6         26 return( 1 );
703             }
704              
705              
706             sub debug {
707 6683     6683 0 14369 my $msg = shift;
708 6683 50       28477 if ( $Net::OAI::Harvester::DEBUG ) {
709 0         0 print STDERR "oai-harvester: " . localtime() . ": $msg\n";
710             }
711             }
712              
713             sub _fatal {
714 0     0   0 my $msg = shift;
715 0         0 print STDERR "fatal: $msg\n\n";
716 0         0 exit( 1 );
717             }
718              
719             =head1 DIAGNOSTICS
720              
721             If you would like to see diagnostic information when harvesting is running
722             then set Net::OAI::Harvester::DEBUG to a true value.
723              
724             $Net::OAI::Harvester::DEBUG = 1;
725              
726             =head1 PERFORMANCE
727              
728             XML::SAX is used for parsing, but it presents a generalized interface to many
729             parsers. It comes with XML::Parser::PurePerl by default, which is nice since
730             you don't have to worry about getting the right libraries installed. However
731             XML::Parser::PurePerl is rather slow compared to XML::LibXML. If you
732             are a speed freak install XML::LibXML from CPAN today.
733              
734             If you have a particular parser you want to use you can set the
735             $XML::SAX::ParserPackage variable appropriately. See XML::SAX::ParserFactory
736             documentation for details.
737              
738             =head1 TODO
739              
740             =over 4
741              
742             =item *
743              
744             Allow Net::OAI::ListMetadataFormats to store more than just the metadata
745             prefixes.
746              
747             =item *
748              
749             Implement Net::OAI::Set for iterator access to Net::OAI::ListSets.
750              
751             =item *
752              
753             Implement Net::OAI::Harvester::listAllSets().
754              
755             =item *
756              
757             More documentation of other classes.
758              
759             =item *
760              
761             Document custom XML::Handler creation.
762              
763             =item *
764              
765             Handle optional compression.
766              
767             =item *
768              
769             Create common handlers for other metadata formats (MARC, qualified DC, etc).
770              
771             =item *
772              
773             Selectively load Net::OAI::* classes as needed, rather than getting all of them
774             at once at the beginning of Net::OAI::Harvester.
775              
776             =back
777              
778             =head1 SEE ALSO
779              
780             =over 4
781              
782             =item *
783              
784             OAI-PMH Specification at L
785              
786             =item *
787              
788             L
789              
790             =item *
791              
792             L
793              
794             =item *
795              
796             L
797              
798             =item *
799              
800             L
801              
802             =item *
803              
804             L
805              
806             =item *
807              
808             L
809              
810             =item *
811              
812             L
813              
814             =item *
815              
816             L
817              
818             =item *
819              
820             L
821              
822             =item *
823              
824             L
825              
826             =item *
827              
828             L
829              
830             =item *
831              
832             L
833              
834             =item *
835              
836             L
837              
838             =back
839              
840             =head1 AUTHORS
841              
842             =over 4
843              
844             =item *
845              
846             Ed Summers
847              
848             =item *
849              
850             Martin Emmerich
851              
852             =back
853              
854             =cut
855              
856             1;