File Coverage

blib/lib/HTTP/OAIPMH/Validator.pm
Criterion Covered Total %
statement 34 36 94.4
branch n/a
condition n/a
subroutine 12 12 100.0
pod n/a
total 46 48 95.8


line stmt bran cond sub pod time code
1             package HTTP::OAIPMH::Validator;
2              
3             =head1 NAME
4              
5             HTTP::OAIPMH::Validator - OAI-PMH validator class
6              
7             =head1 SYNOPSIS
8              
9             Validation suite for OAI-PMH data providers that checks for responses
10             in accord with OAI-PMH v2
11             L.
12              
13             Typical use:
14              
15             use HTTP::OAIPMH::Validator;
16             use Try::Tiny;
17             my $val = HTTP::OAIPMH::Validator->new( base_url=>'http://example.com/oai' );
18             try {
19             $val->run_complete_validation;
20             } catch {
21             warn "oops, validation didn't run to completion: $!\n";
22             };
23             print "Validation status of data provider ".$val->base_url." is ".$val->status."\n";
24              
25             =cut
26              
27 1     1   37934 use strict;
  1         1  
  1         47  
28              
29             our $VERSION = '1.05';
30              
31 1     1   3 use base qw(Class::Accessor::Fast);
  1         2  
  1         476  
32 1     1   2335 use Data::UUID;
  1         566  
  1         49  
33 1     1   361 use Date::Manip;
  1         100517  
  1         119  
34 1     1   397 use HTTP::Request; # for rendering http queries
  1         732  
  1         20  
35 1     1   5 use HTTP::Headers;
  1         1  
  1         18  
36 1     1   431 use HTTP::Request::Common; # makes POST easier
  1         1597  
  1         48  
37 1     1   5 use HTTP::Status; # for checking error codes
  1         1  
  1         218  
38 1     1   667 use LWP::UserAgent; # send http requests
  1         9853  
  1         27  
39 1     1   494 use LWP::Protocol::https; # explicit include so we fail without https support
  1         67356  
  1         43  
40 1     1   7 use URI::Escape; # excape special characters
  1         1  
  1         53  
41 1     1   764 use XML::DOM;
  0            
  0            
42             use HTTP::OAIPMH::Log;
43              
44             =head2 METHODS
45              
46             =head3 new(%args)
47              
48             Create new HTTP::OAIPMH::Validator object and initialize counters.
49              
50             The following instance variables may be set via %args and have read-write
51             accessors (via L):
52              
53             base_url - base URL of the data provdier being validated
54             run_id - UUID identifying the run (will be generated if none supplied)
55             protocol_version - protocol version supported
56             admin_email - admin email extracted from Identify response
57             granularity - datestamp granularity (defaults to 'days', else 'seconds')
58             uses_https - set true if the validator sees an https URL at any stage
59              
60             debug - set true to add extra debugging output
61             log - logging object (usually L)
62             parser - XML DOM parser instance
63              
64             identify_response - string of identify response (used for registration record)
65             earliest_datestamp - value extracted from earliestDatestamp in Identify response
66             namespace_id - if the oai-identifier is used then this records the namespace identifier extracted
67             set_names - array of all the set names reported in listSets
68              
69             example_record_id - example id used for tests that require a specific identifier
70             example_set_spec - example setSpec ("&set=name") used for tests that require a set
71             example_metadata_prefix - example metadataPrefix which defaults to 'oai_dc'
72              
73             =cut
74              
75             HTTP::OAIPMH::Validator->mk_accessors( qw( base_url protocol_version
76             admin_email granularity uses_503 uses_https
77             debug parser run_id ua allow_https doc save_all_responses
78             response_number http_timeout max_retries max_size
79             protocol guidelines
80             identify_response earliest_datestamp namespace_id set_names
81             example_record_id example_set_spec example_metadata_prefix
82             log status
83             ) );
84              
85             sub new {
86             my $this=shift;
87             my $class=ref($this) || $this;
88             my $self={
89             'base_url' => undef,
90             'protocol_version' => undef,
91             # Repository features extracted
92             'granularity' => 'days', # can also be "seconds"
93             'uses_503' => 0, # set true if 503 responses ever used
94             'uses_https' => 0, # set to true if https is ever used
95             # Control
96             'debug' => 0,
97             'parser' => XML::DOM::Parser->new(),
98             'run_id' => undef,
99             'ua' => undef,
100             'allow_https' => 0, # allow https URIs
101             'doc' => undef, # current parsed xml document
102             'save_all_responses' => 0, # set True to save all HTTP responses
103             'response_number' => 1, # initial response number
104             'http_timeout' => 600,
105             'max_retries' => 5, # number of 503's in a row that we will accept
106             'max_size' => 100000000, # max response size in bytes (100MB)
107             'protocol' => 'http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm', #URL of protocol spec
108             'guidelines' => 'http://www.openarchives.org/OAI/2.0/guidelines-repository.htm', #URL of repository guidelines doc
109             # Results
110             'namespace_id' => undef,
111             'set_names' => [],
112             'example_record_id' => undef,
113             'example_set_spec' => undef,
114             'example_metadata_prefix' => 'oai_dc',
115             'log' => HTTP::OAIPMH::Log->new(),
116             'status' => 'unknown',
117             @_};
118             bless($self, $class);
119             $self->setup_run_id if (not $self->run_id);
120             $self->setup_user_agent if (not $self->ua);
121             return($self);
122             }
123              
124             =head3 setup_run_id()
125              
126             Set a UUID for the run_id.
127              
128             =cut
129              
130             sub setup_run_id {
131             my $self=shift;
132             my $ug=Data::UUID->new;
133             $self->run_id(lc($ug->to_string($ug->create)));
134             }
135              
136             =head3 setup_user_agent()
137              
138             Setup L for the validator.
139              
140             =cut
141              
142             sub setup_user_agent {
143             my $self=shift;
144             my $ua = LWP::UserAgent->new(); # User agent, to render http requests
145             $ua->timeout($self->http_timeout); # give responses 10 minutes
146             $ua->max_size($self->max_size); # size limit ##seems to break http://eprints.soton.ac.uk/perl/oai2 [Simeon/2005-06-06]
147             $ua->requests_redirectable([]); # we will do redirects manually
148             $ua->agent('OAIPMH_Validator'); # set user agent
149             $ua->from('https://groups.google.com/d/forum/oai-pmh'); # set a default From: address -> direct to google group for dicussion
150             $self->ua($ua);
151             }
152              
153              
154             =head3 abort($msg)
155              
156             Special purpose "die" routine because tests cannot continue. Logs
157             failure and then dies.
158              
159             =cut
160              
161             sub abort {
162             my $self=shift;
163             my ($msg)=@_;
164             $self->log->fail('ABORT: '.$msg);
165             $self->status('FAILED');
166             die('ABORT: '.$msg."\n");
167             }
168              
169              
170             =head3 run_complete_validation($skip_test_identify)
171              
172             Run all tests for a complete validation and return true is the data provider passes,
173             false otherwise. All actions are logged and may be accessed to provide a report
174             (including warnings that do not indicate failure) after the run.
175              
176             Arguments:
177             $skip_identify - set true to skip the text_identify() step
178              
179             =cut
180              
181             sub run_complete_validation {
182             my $self=shift;
183             my ($skip_identify)=@_;
184              
185             $self->response_number(1);
186             $self->test_identify unless ($skip_identify);
187             $self->test_list_sets;
188             $self->test_list_identifiers;
189              
190             my $baseURL = $self->base_url;
191             my ($formats, $gotDC) = $self->test_list_metadata_formats;
192              
193             # If the repository doesn't support oai_dc then this is a failure (because
194             # the standard demands it) but see whether we can find another metadataPrefix
195             # in order to continue the tests
196             if ( $gotDC ) {
197             $self->log->pass("Data provider supports oai_dc metadataPrefix");
198             } else {
199             if ($formats and $formats->getLength()>0) {
200             $self->example_metadata_prefix( $formats->item(0)->getFirstChild->getData );
201             $self->log->fail("Data provider does not support the simple Dublin Core metadata ".
202             "format with metadataPrefix oai_dc. Tests that require a ".
203             "metadataPrefix to be specified will use '".
204             $self->example_metadata_prefix."'");
205             } else {
206             $self->log->fail("There are no metadata formats available to use with the GetRecord ".
207             "request. The metadataPrefix ".
208             $self->example_metadata_prefix.
209             " will be used for later tests even though it seems unsupported.");
210             }
211             }
212              
213             my ($dateStamp)=$self->test_get_record($self->example_record_id,$self->example_metadata_prefix);
214             $self->test_list_records($dateStamp,$self->example_metadata_prefix);
215              
216             # Check responses to erroneous queries
217             $self->test_expected_errors($self->example_record_id);
218              
219             if ($self->protocol_version eq '2.0') {
220             $self->test_expected_v2_errors($self->earliest_datestamp,$self->example_metadata_prefix);
221             # As of version 2.0, data providers must support HTTP POST requests
222             $self->test_post_requests($self->example_metadata_prefix);
223             }
224             $self->test_resumption_tokens;
225              
226             # Getting here with no failures means that the data provider is compliant
227             # (there may be warnings which are not counted in num_fail)
228             $self->status( $self->log->num_fail==0 ? 'COMPLIANT' : 'FAILED' );
229             return($self->log->num_fail==0);
230             }
231              
232              
233             =head3 failures()
234              
235             Return Markdown summary of failure log entries, along with the appropriate
236             titles and request details. Will return empty string if there are no
237             failures in the log.
238              
239             =cut
240              
241             sub failures {
242             my $self=shift;
243             return($self->log->failures());
244             }
245              
246              
247             =head3 summary()
248              
249             Return summary statistics for the validation in Markdown (designed to agree
250             with conversion to HTML by L).
251              
252             =cut
253              
254             sub summary {
255             my $self=shift;
256              
257             my $sf=($self->log->num_fail>0?'failure':'success');
258              
259             my $str="\n## Summary - *$sf*\n\n";
260             my $namespace_id = $self->namespace_id;
261             if ($namespace_id) {
262             if ($namespace_id=~/\./) { #v2.0
263             $str.=" * Namespace declared for v2.0 oai-identifiers is $namespace_id\n";
264             } else { #v1.1
265             $str.=" * Namespace declared for v1.1 oai-identifiers (the repositoryIdentifier) is $namespace_id\n";
266             }
267             }
268             $str.=" * Uses 503 for flow control\n" if ($self->uses_503);
269             $str.=" * Uses https URIs (not specified in protocol)\n" if ($self->uses_https);
270             $str.=" * Total tests passed: ".$self->log->num_pass."\n";
271             $str.=" * Total warnings: ".$self->log->num_warn."\n";
272             $str.=" * Total error count: ".$self->log->num_fail."\n";
273             $str.=" * Validation status: ".($self->status || 'unknown')."\n";
274             return($str);
275             }
276              
277              
278             =head2 METHODS TESTING SPECIFIC OAI-PMH VERBS
279              
280             =head3 test_identify()
281              
282             Check response to an Identify request. Returns false if tests cannot
283             continue, true otherwise.
284              
285             Side effects based on values extracted:
286              
287             - $self->admin_email set to email extracted from adminEmail element
288             - $self->granularity set to 'days' or 'seconds'
289              
290             =cut
291              
292             sub test_identify {
293             my $self=shift;
294              
295             my $cantContinue=0;
296             $self->log->start("Checking Identify response");
297              
298             # Send the verb request to the base URL - vet extracts the email address
299             my $burl=$self->base_url;
300             my $req = $burl."?verb=Identify";
301              
302             my $response = $self->make_request($req); #don't use make_request_and_validate() just do simplest thing here
303             unless ($response->is_success) {
304             my $r="Server at base URL '$burl' failed to respond to Identify. The HTTP GET request with URL $req received response code '".$response->code()."'.";
305             if ($response->code() == 301) {
306             $self->log->fail("$r HTTP code 301 'Moved Permanently' is not widely supported by ".
307             "harvesters and is anyway inappropriate for registration of a ".
308             "service. If requests must be redirected then an HTTP response 302 ".
309             "may be used as outlined in the guidelines [".
310             $self->guidelines."#LoadBalancing].");
311             } else {
312             $self->log->fail($r);
313             }
314             $self->abort("Failed to get Identify response from server at base URL '$burl'.\n");
315             return;
316             }
317              
318             # Parse the XML response
319             unless ($self->parse_response($req,$response)) {
320             $self->log->fail("Failed to parse Identify response");
321             $self->abort("Failed to parse Identify response from server at base URL '$burl'.\n");
322             }
323              
324             # Check that this really is a Identify response
325             my $oaipmhNode=$self->doc->getFirstChild();
326             # skip over and processing instructions such as XML stylesheets
327             while ($oaipmhNode->getNodeType==PROCESSING_INSTRUCTION_NODE or
328             $oaipmhNode->getNodeType==COMMENT_NODE) {
329             $oaipmhNode=$oaipmhNode->getNextSibling();
330             }
331             unless (defined $oaipmhNode and $oaipmhNode->getNodeName eq 'OAI-PMH') {
332             $self->log->fail("Identify response does not have OAI-PMH as root element! ".
333             "Found node named '".$oaipmhNode->getNodeName."' instead");
334             $self->abort("Identify response from server at base URL '$burl' does not have ".
335             "OAI-PMH as root element!\n");
336             }
337             my $identifyNode=$oaipmhNode->getElementsByTagName('Identify',0);
338             unless ($identifyNode->getLength()>0) {
339             my $errorNode=$oaipmhNode->getElementsByTagName('error',0);
340             if ($errorNode->getLength()>0) {
341             # give specific message if response is and error
342             $self->log->fail("Error response to Identify request!\n");
343             $self->abort("Error response to Identify request from server at base URL '$burl'.\n");
344             return;
345             } else {
346             $self->log->fail("Identify response does not contain <Identify> block.\n");
347             $self->abort("Identify response does not contain Identify block from server at base URL '$burl'.\n");
348             return;
349             }
350             }
351              
352             # Extract admin email and protocol version numbers, check
353             my ($admin_email,$email_error)=$self->get_admin_email;
354             if (not $admin_email or $email_error) {
355             $self->abort(($email_error || "Failed to extract adminEmail").", aborting.\n");
356             return;
357             }
358             $self->admin_email($admin_email);
359             $self->check_protocol_version; # bails if not Version 2.0
360              
361             # URL is valid, Identify response was provided, extract content as string
362             $self->identify_response( $response->content );
363              
364             my $baseURL = $self->doc->getElementsByTagName('baseURL');
365              
366             # BUG FOUND ON AUGUST 26, 2002: empty baseURL still returns length > 0
367             # So it is necessary to explicity check for an empty element.
368             if ( $baseURL->getLength() > 0 ) {
369             $baseURL = $baseURL->item(0)->getFirstChild;
370             if ( $baseURL ) { $baseURL = $baseURL->getData; }
371              
372             # $burl is the one given on the form; $baseURL is the one in the XML doc.
373             if ($burl eq $baseURL) {
374             $self->log->pass("baseURL supplied matches the Identify response");
375             } else {
376             # report the error, but keep the form URL
377             # (at least it answered Identify!)
378             $self->log->fail("baseURL supplied '$burl' does not match the baseURL in the ".
379             "Identify response '$baseURL'. The baseURL you enter must EXACTLY ".
380             "match the baseURL returned in the Identify response. It must ".
381             "match in case (http://Wibble.org/ does not match http://wibble.org/) ".
382             "and include any trailing slashes etc.");
383             $cantContinue++;
384             }
385             }
386              
387             # For Version 2.0, Check for seconds granularity
388             if ($self->protocol_version eq '2.0') {
389             my $gran_el = $self->doc->getElementsByTagName('granularity');
390             if ($self->parse_granularity($gran_el)) {
391             $self->log->pass("Datestamp granularity is '".$self->granularity."'");
392             } else {
393             $cantContinue++;
394             }
395             }
396              
397             # For an exception check new to Version 2.0, extract the earliest date
398             # and also check that its granularity is right
399             if (my $err=$self->get_earliest_datestamp) {
400             $self->log->fail("Bad earliestDatestamp: $err");
401             $cantContinue++;
402             } else {
403             $self->log->pass("Extracted earliestDatestamp ".$self->earliest_datestamp);
404             }
405              
406             # Check for OAI-identifier. If already in use by another base URL, bump
407             # the error count to avoid having this URL register.
408             #
409             my $oaiIds = $self->doc->getElementsByTagName('oai-identifier');
410             if ($oaiIds and $oaiIds->getLength()>0) {
411             if ($oaiIds->getLength()>1) {
412             $self->log->fail("Found more than one oai-identifier element. The intention ".
413             "is that this declaration only be used by repositories ".
414             "declaring the use of a single identifier namespace.");
415             $cantContinue++;
416             } else {
417             $oaiIds=$oaiIds->item(0);
418              
419             # Now find out if this is v1.1 or v2.0
420             my $oai_id_version='2.0';
421             if (my $xmlns=$oaiIds->getAttribute('xmlns')) { #FIXME this requires default namespace to be set to oai-id
422             if ($xmlns eq 'http://www.openarchives.org/OAI/2.0/oai-identifier') {
423             $oai_id_version='2.0';
424             $self->log->pass("oai-identifier description for version $oai_id_version is being used");
425             } elsif ($xmlns eq 'http://www.openarchives.org/OAI/1.1/oai-identifier') {
426             $oai_id_version='1.1';
427             $self->log->pass("oai-identifier description for version $oai_id_version is being used");
428             } elsif ($xmlns) {
429             $self->log->fail("Unrecognized namespace declaration '$xmlns' for ".
430             "oai-identifier, expected ".
431             "http://www.openarchives.org/OAI/2.0/oai-identifier ".
432             "(for v2.0) or ".
433             "http://www.openarchives.org/OAI/1.1/oai-identifier ".
434             "(for v1.1). Assuming version $oai_id_version.");
435             } else {
436             $self->log->fail("No namespace declaration found for oai-identifier, expected ".
437             "http://www.openarchives.org/OAI/2.0/oai-identifier ".
438             "(for v2.0) or ".
439             "http://www.openarchives.org/OAI/1.1/oai-identifier ".
440             "(for v1.1). Assuming version $oai_id_version/");
441             }
442             } else {
443             $self->log->fail("Can't find namespace declaration for the oai-identifier description. ".
444             "This must be added as ".
445             "(or 1.1), there will likely also be schema validation weeors. Will ".
446             "assume that the oai-identifier is version $oai_id_version for ".
447             "later tests");
448             }
449             my $repoIds = $oaiIds->getElementsByTagName('repositoryIdentifier');
450             if ($repoIds) {
451             my $temp = $repoIds->item(0);
452             if (!defined($temp)) {
453             $self->log->fail("No namespace-identifier (repositoryIdentifier element) in ".
454             "the oai-identifier block of the Identify description");
455             return;
456             }
457             my $nsel = $temp->getFirstChild;
458             unless ( $nsel ) {
459             # Empty repositoryIdentifier element, squawk loudly
460             $self->log->fail("Empty namespace-identifier (repositoryIdentifier element) in ".
461             "the oai-identifier block of the Identify description");
462             return;
463             }
464             my $namespace_id = $nsel->getData;
465             # Having validated the value of namespace-identifier, we can now tell if it is v1.1 or v2.0 based
466             # on whether is has a . in it (i.e. if /\./)
467             if ($oai_id_version eq '2.0') {
468             #schema:
469             unless ($namespace_id=~/^[a-z][a-z0-9\-]*(\.[a-z][a-z0-9\-]+)+$/i) {
470             $self->log->fail("Bad namespace-identifier (repositoryIdentifier element) ".
471             "'$namespace_id' in oai-identifier declaration. See section ".
472             "2.1 of the OAI Identifier specification for details ".
473             "(http://www.openarchives.org/OAI/2.0/guidelines-oai-identifier.htm).");
474             $cantContinue++;
475             } else {
476             $self->log->pass("namespace-identifier (repositoryIdentifier element) in oai-identifier ".
477             "declaration is $namespace_id");
478             $self->namespace_id( $namespace_id );
479             }
480             } else { #v1.1 schema:
481             unless ($namespace_id=~/^[a-z0-9]+$/i) {
482             $self->log->fail("Bad namespace-identifier (repositoryIdentifier element) ".
483             "'$namespace_id' in oai-identifier declaration. See section ".
484             "2.1 of the OAI Identifier specification for details ".
485             "(http://www.openarchives.org/OAI/1.1/guidelines-oai-identifier.htm).");
486             $cantContinue++;
487             } else {
488             $self->log->pass("namespace-identifier (repositoryIdentifier element) in oai-identifier ".
489             "declaration is $namespace_id");
490             $self->namespace_id( $namespace_id );
491             }
492             }
493             }
494             }
495             }
496             return(not $cantContinue);
497             }
498              
499              
500             =head3 test_list_sets()
501              
502             Check response to the ListSets verb.
503              
504             Save the setSpecs for later use.
505              
506             Note that the any set might be empty. So if test_list_identifiers doesn't
507             get a match, we need to try the second set identifier, and so on.
508             So keep a list of the setSpec elements.
509              
510             =cut
511              
512             sub test_list_sets {
513             my $self=shift;
514              
515             $self->log->start("Checking ListSets response");
516             my $req=$self->base_url."?verb=ListSets";
517             my $response = $self->make_request_and_validate("ListSets", $req);
518             unless ($response) {
519             $self->log->fail("Can't check set names");
520             return;
521             }
522              
523             unless ($self->parse_response($req,$response)) {
524             $self->log->fail("Can't parse response");
525             $self->abort("failed to parse response to ListSets");
526             }
527              
528             $self->set_names( [] );
529             $self->example_set_spec( '' );
530             my $set_elements=$self->doc->getElementsByTagName('setSpec');
531             if (not defined($set_elements) or ($set_elements->getLength<1)) {
532             # No setSpec elements, so there should be an element
533             my $details={};
534             if ($self->is_error_response($details)) {
535             if ($details->{'noSetHierarchy'}) {
536             $self->log->pass("Repository does not support sets and the is correctly reported with a ".
537             "noSetHierarchy exception in the ListSets response");
538             } else {
539             $self->log->fail("Failed to extract any setSpec elements from ListSets ".
540             "but did not find a noSetHierarchy exception. Found instead a '".
541             join(', ',keys %{$details})."' exception(s). See <".
542             $self->protocol."#ListSets>.");
543             }
544             } else {
545             $self->log->fail("Failed to extract any setSpec elements from ListSets but did not ".
546             "find an exception message. If sets are not supported by the ".
547             "repository then the ListSets response must be the noSetHierarchy ".
548             "error. See <".$self->protocol."#ListSets>.");
549             }
550             } else {
551             # Have setSpec elements, record all set names and pick an example set spec
552             for (my $j=0; $j<$set_elements->getLength; $j++) {
553             my $set_name=$set_elements->item($j)->getFirstChild->getData;
554             ##FIXME - should validate each set name
555             push(@{$self->set_names},$set_name);
556             }
557             # Sanity check, did we get the number we expected?
558             my $num_sets=scalar(@{$self->set_names});
559             if ($num_sets!=$set_elements->getLength) {
560             $self->log->fail("Failed to extract the expected number of set names (got ".
561             "$num_sets, expected ".$set_elements->getLength.")");
562             }
563             if ($num_sets>0) {
564             $self->example_set_spec( "&set=".$self->set_names->[0] );
565             }
566             my $msg='';
567             for (my $j=0; $j<$num_sets and $j<3; $j++) { $msg.=" ".$self->set_names->[$j]; }
568             $msg.=" ..." if ($num_sets>3);
569             $self->log->pass("Extracted $num_sets set names: {$msg }, will use setSpec ".
570             $self->example_set_spec." in tests");
571             }
572             }
573              
574              
575             =head3 test_list_identifiers()
576              
577             Check response to ListIdentifiers and record an example record id in
578             $self->example_record_id to be used in other tests.
579              
580             If there are no identifiers, but the response is legal, stop the test with
581             errors=0, number of verbs checked is three.
582              
583             As of version 2.0, a metadataPrefix argument is required. Unfortunately
584             we need to call test_list_identifiers first in order to get an id for
585             GetRecord, so we simply use oai_dc.
586              
587             =cut
588              
589             sub test_list_identifiers {
590             my $self=shift;
591              
592             $self->log->start("Checking ListIdentifiers response");
593              
594             ### FIXME -- skip the set= restriction because this code doesn't
595             ### FIXME work right for set hierarchies - 2002-10-17
596             ### FIXME 2015-01-02 - put/left in, is it OK?
597             my $set_spec = $self->example_set_spec;
598             my $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc".$set_spec;
599             my $response = $self->make_request_and_validate("ListIdentifiers", $req);
600              
601             # Note: $response will come back null if an error code was returned
602             # An error code of "noRecordsMatch" comes back if specified set is
603             # empty. In that case we should drop the set and try again.
604             if ( $set_spec and (! $response or $self->is_no_records_match ) ) {
605             $self->log->note("Empty set made ListIdentifiers fail - trying other sets...");
606             my $i=1;
607             my $m = scalar(@{$self->set_names});
608             while ($i<$m and not $response ) {
609             $set_spec = "&set=".$self->set_names->[$i];
610             $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc".$set_spec;
611             $response = $self->make_request_and_validate("ListIdentifiers", $req);
612             $self->log->note("Trying set ".$set_spec);
613             }
614             # If we were successful then set the example_set_spec for any future tests
615             if ($response) {
616             $self->example_set_spec( $set_spec );
617             }
618             }
619              
620             # None of the sets had any identifiers in them. Try the whole entire
621             # list of identifiers.
622             if ( $set_spec and !$response ) {
623             $self->log->note("Last attempt is without any sets...");
624             $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc";
625             $response = $self->make_request_and_validate("ListIdentifiers",$req);
626             }
627              
628             # Now we are for real in trouble if $response is null
629             unless ($response) {
630             $self->log->fail("No ListIdentifiers response with content");
631             $self->log->note("The base URL did not respond to the ListIdentifiers verb.".
632             "Without that, we cannot proceed with the validation test. Exiting.");
633             $self->abort("The base URL did not respond to the ListIdentifiers verb. Without that, we cannot proceed with the validation test. Exiting.");
634             }
635              
636             # Grab the first identifier for later use
637             unless ($self->parse_response($req,$response)) {
638             $self->log->fail("Can't parse ListIdentifiers response");
639             $self->abort("unable to parse response");
640             }
641             #
642             # Now look for the identifier of a non-deleted record
643             # If there are no identifiers to be harvested, we cannot complete the
644             # validation test.
645             #
646             # FIXME - this still doesn't solve the problem that there may be no
647             # non-deleted items listed in the particular response or partial
648             # response that we are looking at [Simeon/2005-07-20]
649             #
650             my $headers = $self->doc->getElementsByTagName('header');
651             my $h;
652             my $record_id;
653             for ($h=0; $h<$headers->getLength(); $h++) {
654             my $hdnode=$headers->item($h);
655             my $idnode=$hdnode->getElementsByTagName('identifier',0);
656             next unless ($idnode and $idnode->getLength()==1);
657             $record_id=$idnode->item(0)->getFirstChild->getData;
658             last unless ($hdnode->getAttribute('status') and $hdnode->getAttribute('status') eq 'deleted');
659             $self->log->warn("Identifier ".($h+1).", '$record_id', is for a deleted record, skipping");
660             }
661             if ($h==$headers->getLength()) {
662             # No identifiers were in the ListIdentifiers response. Further testing
663             # is not possible.
664             $self->log->fail("The response to the ListIdentifiers verb with metadataPrefix oai_dc ".
665             "contained no identifiers. Without at least one identifier, we cannot ".
666             "proceed with the validation tests.");
667             $self->abort("No identifiers in response to ListIdentifiers. Without an identifier ".
668             "we cannot proceed with validation tests.");
669             }
670             $self->log->pass("Good ListIdentifiers response, extracted id '$record_id' for use in future tests.");
671             $self->example_record_id( $record_id );
672             }
673              
674              
675             =head3 test_list_metadata_formats()
676              
677             Vet the verb as usual, and then make sure that Dublin Core in included
678             In particular, we will check the metadata formats available for "record_id",
679             obtained from checking the ListIdentifier verb.
680             Side effect: save available formats for later use (global "formats").
681             NOTE:if there are no formats, error will be picked up by getRecord
682              
683             =cut
684              
685             sub test_list_metadata_formats {
686             my $self=shift;
687              
688             $self->log->start("Checking ListMetadataFormats response");
689              
690             # Do we have an example record id to check with?
691             my $record_id = $self->example_record_id;
692             unless ($record_id) {
693             $self->log->fail("Cannot check ListMetadataFormats as we do not have an example id");
694             return;
695             }
696              
697             my $req = $self->base_url."?verb=ListMetadataFormats&identifier=".url_encode($record_id);
698             my $response = $self->make_request_and_validate("ListMetadataFormats",$req);
699             unless ($response) {
700             $self->log->fail("Can't check metadataFormats available for item $record_id, no ".
701             "response to ListMetadataFormats request.");
702             return;
703             }
704              
705             # Check for Dublin Core
706             unless ($self->parse_response($req,$response)) {
707             $self->log->fail("Can't parse response to ListMetadataFormats request for item $record_id.");
708             return;
709             }
710              
711             my $formats = $self->doc->getElementsByTagName('metadataPrefix');
712             unless ($formats->getLength() > 0) {
713             $self->log->fail("No metadata formats are listed in the response to a ListMetadataFormats ".
714             "request for item $record_id.");
715             return;
716             }
717              
718             if ($self->debug) {
719             $self->log->note("debug: ".$formats->getLength()." formats supported for identifier '$record_id'");
720             }
721             my $gotDC=0;
722             for my $i (0..$formats->getLength()-1) {
723             my $format = $formats->item($i);
724             #assume this node has only one child, and its data for a format
725             if ( $format->getFirstChild->getData =~ /^\s*oai_dc\s*$/ ) {
726             $gotDC = 1;
727             last;
728             }
729             }
730             if ($gotDC) {
731             $self->log->pass("Good ListMetadataFormats response, includes oai_dc");
732             } else {
733             $self->log->pass("Good ListMetadataFormats response, BUT DID NOT FIND oai_dc");
734             }
735             return($formats, $gotDC);
736             }
737              
738              
739             =head3 test_get_record($record_id, $format)
740              
741             Try to get record $record_id in $format.
742              
743             If either $record_id or $format are undef then we have an error
744             right off the bat. Else make the request and return the
745             datestamp of the record.
746              
747             =cut
748              
749             sub test_get_record {
750             my $self=shift;
751             my ($record_id, $format)=@_;
752              
753             $self->log->start("Checking GetRecord response");
754              
755             unless (defined $format) {
756             $self->log->fail("Skipping GetRecord test as no metadata format is listed as being available.");
757             return;
758             }
759             unless (defined $record_id) {
760             $self->log->fail("Skipping GetRecord test as no items are listed as having metadata available.");
761             return;
762             }
763              
764             my $numerr=0; #count up non-fatal errors
765              
766             my $req = $self->base_url."?verb=GetRecord&identifier=".url_encode($record_id)."&metadataPrefix=".url_encode($format);
767             my $response = $self->make_request_and_validate("GetRecord", $req);
768             unless ($response) {
769             $self->log->fail("Can't complete datestamp check for GetRecord");
770             $self->abort("Can't complete datestamp check for GetRecord");
771             }
772              
773             # Save the datestamp for later use by ListRecords
774             # As of version 2.0, Identify response can have a granularity and the
775             # datestamp MUST be in the finest granularity supported by the repository
776             unless ($self->parse_response($req,$response)) {
777             $self->log->fail("Can't parse response");
778             $self->abort("Unable to parse response from GetRecord");
779             }
780              
781             if (my $msg=$self->is_error_response) {
782             $self->log->fail("The response to the GetRecord verb was the OAI exception $msg. ".
783             "It is this not possible to extract a valid datestamp for remaining tests");
784             $self->abort("Unexpected OAI exception response");
785             }
786              
787             my $datestamps = $self->doc->getElementsByTagName('datestamp');
788             # If there is no there is no datestamp ... but there should be a record
789             unless ( $datestamps->getLength() > 0 ) {
790             $self->log->fail("The response to the GetRecord verb did not have a datestamp, which is ".
791             "needed to continue checking verbs.");
792             $self->abort("No datestamp in the response for GetRecord");
793             }
794              
795             my $datestamp=undef;
796             eval {
797             $datestamp = $datestamps->item(0)->getFirstChild->getData;
798             };
799             if (not defined($datestamp)) {
800             $self->log->fail("Failed to extract datestamp from the GetRecord response. See <".
801             $self->protocol."#Dates>.");
802             $numerr++;
803             } elsif ( my $granularity=$self->get_datestamp_granularity($datestamp) ) {
804             $self->log->pass("Datestamp in GetRecord response ($datestamp) has the correct form for ".
805             "$granularity granularity.");
806             if ( $granularity eq $self->granularity ) {
807             # The granularity in v2.0 must match the finest granularity supported (see sec3.3.2)
808             $self->log->pass("Datestamp in GetRecord response ($datestamp) matched the ".
809             $self->granularity." granularity specified in the Identify response. ");
810             } else {
811             $self->log->fail("Datestamp in GetRecord response ($datestamp) is not consistent ".
812             "with the ".$self->granularity." granularity specified in the ".
813             "Identify response");
814             $numerr++;
815             }
816             } else {
817             $self->log->fail("Datestamp in GetRecord response ($datestamp) is not valid. See <".
818             $self->protocol."#Dates>.");
819             $numerr++;
820             }
821              
822             # As of OAI-PMH Version 2.0, GetRecord must return a set spec if the
823             # repository supports sets and the item is in a set
824             if (not $self->example_set_spec) {
825             $self->log->pass("Valid GetRecord response") unless ($numerr>0);
826             return($datestamp);
827             }
828              
829             my $set_list = $self->doc->getElementsByTagName('setSpec');
830             my $set_value = $self->example_set_spec;
831             $set_value =~ s/&set=//;
832             $self->log->note("Looking for set '".$set_value."' or a descendant set.") if $self->debug;
833             my $i;
834             my $subset_str = '';
835             for ($i=0; $i<$set_list->getLength; $i++) {
836             my $s = $set_list->item($i)->getFirstChild->getData;
837             last if ($s eq $set_value);
838             if ($s =~ m/^${set_value}:/) {
839             $subset_str = " (implied by a descendant setSpec)";
840             last;
841             }
842             }
843             if ($i==$set_list->getLength) { # error
844             $self->log->fail("Expected setSpec was missing from the response. The GetRecord ".
845             "response for identifier $record_id did not contain a set ".
846             "specification for $set_value");
847             } else {
848             $self->log->pass("Expected setSpec was returned in the response".$subset_str);
849             }
850             return($datestamp);
851             }
852              
853              
854             =head3 test_list_records($datestamp,$metadata_prefix)
855              
856             Test the response for the ListRecords verb. In addition, if there is
857             no Dublin Core available for this repository, this is an error.
858             (And the error has already been counted in test_get_record)
859             We can still test the verb, however, with one of the available
860             formats found by testGetMetadataFormats. Since the output of
861             ListRecords is likely to be large, use the datestamp of the one
862             record we did retrieve to limit the output.
863              
864             =cut
865              
866             sub test_list_records {
867             my $self=shift;
868             my ($datestamp,$metadata_prefix)=@_;
869              
870             $self->log->start("Checking ListRecords response");
871              
872             my $req = $self->base_url."?verb=ListRecords";
873             if ($datestamp) {
874             $req.="&from=".$datestamp."&until=".$datestamp;
875             } else {
876             $self->log->warn("Omitting datestamp parameter as none was obtained from earlier test");
877             }
878             $req.="&metadataPrefix=".$metadata_prefix;
879             my $list_not_complete=1;
880              
881             while ($list_not_complete) {
882             $list_not_complete=0;
883             my $response = $self->make_request_and_validate("ListRecords", $req);
884             unless ($response) {
885             #Nothing else to say since we don't do other tests
886             return;
887             }
888              
889             if ($self->parse_response($req,$response)) {
890             $self->log->pass("Response is well formed");
891             } else {
892             $self->log->fail("The ListRecords response was not well formed XML");
893             }
894              
895             # Now check to make sure that we got back the record for the identifier
896             # $self->example_record_id if there is one specified, else fail that
897             # test.
898             my $record_id=$self->example_record_id;
899             unless ($record_id) {
900             $self->log->fail("Cannot check for correct record inclusion without an example record id");
901             return;
902             }
903             my $details={};
904             if ($self->is_error_response($details)) {
905             if ($details->{'noRecordsMatch'}) {
906             $self->log->fail("ListRecords response gave a noRecordsMatch error when it should ".
907             "have included at least the record with identifier $record_id. ".
908             "The from and until parameters of the request were set to the ".
909             "datestamp of this record ($datestamp). The from and until parameters ".
910             "are inclusive, see protocol spec section 2.7.1. The message ".
911             "included in the error response was: '".
912             $details->{'noRecordsMatch'}."'");
913             } else {
914             my @txt=();
915             foreach my $k (keys %$details) {
916             push(@txt,"$k (".$details->{$k}.")");
917             }
918             $self->log->fail("ListRecords gave an unexpected error response to a request using ".
919             "from and until datestamps taken from the previous GetRecord response: ".
920             join(', ',@txt));
921             }
922             } else {
923             my $id_list = $self->doc->getElementsByTagName('identifier');
924             my $i;
925             my $badly_formed=0;
926             for ($i=0; $i<$id_list->getLength; $i++) {
927             if (my $child=$id_list->item($i)->getFirstChild()) {
928             last if ($id_list->item($i)->getFirstChild->getData eq $record_id);
929             } else {
930             $badly_formed++;
931             last;
932             }
933             }
934             if ($badly_formed) {
935             $self->log->fail("ListRecords response badly formed, identifier element for record ".
936             ($i+1)." is empty");
937             } elsif ($i<$id_list->getLength) {
938             $self->log->pass("ListRecords response correctly included record with identifier $record_id");
939             } elsif (my $token=$self->get_resumption_token) {
940             # More responses to come, may just not have got to the
941             # record yet... roll around for more:
942             $self->log->pass("ListRecords response includes resumptionToken. Haven't found ".
943             "record with identifier $record_id yet, will continue with resumptionToken...");
944             $list_not_complete=1;
945             $req = $self->base_url."?verb=ListRecords&resumptionToken=".url_encode($token);
946             } else {
947             $self->log->fail("ListRecords response did not include the identifier $record_id ".
948             "which should have been included because both the from and until ".
949             "parameters were set to the datestamp of this record ($datestamp). ".
950             "The from and until parameters are inclusive, see protocol spec ".
951             "section 2.7.1");
952             }
953             }
954             }
955             }
956              
957              
958             =head3 test_resumption_tokens()
959              
960             Request an unlimited ListRecords. If there is a resumption token, see
961             if it works. It is an error if resumption is there but doesn't work.
962             Empty resumption tokens are OK -- this ends the list.
963              
964             CGI takes care of URL-encoding the resumption token.
965              
966             =cut
967              
968             sub test_resumption_tokens {
969             my $self=shift;
970              
971             $self->log->start("Checking for correct use of resumptionToken (if used)");
972              
973             my $req = $self->base_url."?verb=ListRecords&metadataPrefix=oai_dc";
974             my $response = $self->make_request($req);
975              
976             # was there a resumption token?
977             unless ($self->parse_response($req,$response)) {
978             $self->log->fail("Can't parse malformed XML in response to ListRecords request. ".
979             "Cannot complete test for correct use of resumptionToken (if used)");
980             return;
981             }
982              
983             my $tokenList = $self->doc->getElementsByTagName('resumptionToken');
984             if ( !$tokenList or $tokenList->getLength()==0 ) {
985             $self->log->pass("resumptionToken not used");
986             return;
987             }
988             if ( $tokenList->getLength()>1 ) {
989             $self->log->fail("More than one resumptionToken in response!");
990             return;
991             }
992              
993             # Dig out the resumption token from the document
994             my $tokenElement = $tokenList->item(0);
995              
996             # Try getting the resumption token, $token will be will be undefined
997             # unless the element has content
998             my $token = $tokenElement->getFirstChild;
999             my $tokenString;
1000             if ($token) {
1001             $tokenString = $token->getData;
1002             }
1003             unless ($tokenString) {
1004             $self->log->fail("Empty resumption token in response to $req There should never ".
1005             "be an empty resumptionToken in response to a request without a ".
1006             "resumptionToken argument");
1007             return;
1008             }
1009              
1010             # If there us a 'cursor' value given then check that it is
1011             # correct. It must have the value 0 in the first response
1012             my $usingCursor=0;
1013             if (my $cursor=$tokenElement->getAttribute('cursor')) {
1014             $usingCursor=1;
1015             if ($cursor==0) {
1016             $self->log->pass("A cursor value was supplied with the resumptionToken and it ".
1017             "correctly had the value zero in the first response");
1018             } else {
1019             $self->log->fail("A cursor value was supplied with the resumptionToken but it ".
1020             "did not have the correct value zero for the first response. ".
1021             "The value was '$cursor'.");
1022             }
1023             }
1024              
1025             $self->log->note("Got resumptionToken ".$tokenString);
1026              
1027             # Try using the resumption token. Before including a resumptionToken in
1028             # the URL of a subsequent request, we must encode all special characters
1029             # getData in this version of XML::DOM expands entitities
1030             $req = $self->base_url."?verb=ListRecords&resumptionToken=".url_encode($tokenString);
1031             $response = $self->make_request($req);
1032             unless ( $response ) {
1033             $self->log->fail("Site failed to respond to request using resumptionToken: $req");
1034             return;
1035             }
1036             unless ( $self->parse_response($req,$response)) {
1037             $self->log->fail("Response to request is using resumptionToken not valid XML: $req");
1038             return;
1039             }
1040              
1041             my $errorList = $self->doc->getElementsByTagName('error');
1042             if ( $errorList and $errorList->getLength() > 0 ) {
1043             $self->log->fail("Response to request using resumptionToken was an error code: $req");
1044             return;
1045             }
1046              
1047             ###FIXME: put in test for cursor again, should be number of items returned in the
1048             ###FIXME: first response [Simeon/2005-10-11]
1049              
1050             $self->log->pass("Resumption tokens appear to work");
1051             }
1052              
1053              
1054             =head2 METHODS CHECKING ERRORS AND EXCEPTIONS
1055              
1056             =head3 test_expected_errors($record_id)
1057              
1058             Each one of these requests should get a 400 response in OAI-PHM v1.1,
1059             or a 200 response in 2.0, along with a Reason_Phrase. Bump error_count
1060             if this does not hold. Return the number of errorneous responses.
1061              
1062             $record_id is a valid record identifier to be used in tests that require
1063             one.
1064              
1065             =cut
1066              
1067             sub test_expected_errors {
1068             my $self=shift;
1069             my ($record_id)=@_;
1070              
1071             $self->log->start("Checking exception handling (errors)");
1072              
1073             my @request_list = (
1074             [ 'junk', [ 'badVerb' ], '', '' ],
1075             [ 'verb=junk', [ 'badVerb' ], '', '' ],
1076             [ 'verb=GetRecord&metadataPrefix=oai_dc', [ 'badArgument' ], '', '' ],
1077             [ 'verb=GetRecord&identifier='.$record_id, [ 'badArgument' ], '', '' ],
1078             [ 'verb=GetRecord&identifier=invalid"id&metadataPrefix=oai_dc', [ 'badArgument','idDoesNotExist' ], 'An XML parsing error may be due to incorrectly including the invalid identifier in the element of your XML error response; only valid arguments should be included. A response that includes ..baseURL.. is not well-formed XML because of the quotation mark (") in the identifier.', 'Either the badArgument or idDoesNotExist error codes would be appropriate to report this case.' ],
1079             [ 'verb=ListIdentifiers&until=junk', [ 'badArgument' ], '', '' ],
1080             [ 'verb=ListIdentifiers&from=junk', [ 'badArgument' ], '', '' ],
1081             [ 'verb=ListIdentifiers&resumptionToken=junk&until=2000-02-05', [ 'badArgument','badResumptionToken' ], '', 'Either the badArgument and/or badResumptionToken error codes may be reported in this case. If only one is reported then the badArgument error is to be preferred because the resumptionToken and until parameters are exclusive.' ],
1082             [ 'verb=ListRecords&metadataPrefix=oai_dc&from=junk', [ 'badArgument' ], '', '' ],
1083             [ 'verb=ListRecords&resumptionToken=junk', [ 'badResumptionToken' ], '', '' ],
1084             [ 'verb=ListRecords&metadataPrefix=oai_dc&resumptionToken=junk&until=1990-01-10', [ 'badArgument','badResumptionToken' ] , '', 'Either the badArgument and/or badResumptionToken error codes may be reported in this case. If only one is reported then the badArgument error is to be preferred because the resumptionToken and until parameters are exclusive.' ],
1085             [ 'verb=ListRecords&metadataPrefix=oai_dc&until=junk', [ 'badArgument' ], '', '' ],
1086             [ 'verb=ListRecords', [ 'badArgument' ], '', '' ]
1087             );
1088              
1089             my $n=0;
1090             foreach my $rrr ( @request_list ) {
1091             my ($request_string, $error_codes, $xml_reason, $reason)=@$rrr;
1092             my $req = $self->base_url.'?'.$request_string;
1093             my $ok_errors=join(' or ',@$error_codes);
1094              
1095             my $response=$self->make_request($req);
1096              
1097             # TBD: $response->status_line should also be checked? see output from
1098             # physnet.uni-oldenburg.de/oai/oai.php
1099             if ($self->protocol_version eq "1.1") {
1100             if ($response->code ne "400") {
1101             $self->log->note("Invalid requests which failed to return 400:") if $n == 0;
1102             $n++;
1103             $self->log->fail("Expected 400 from: $request_string");
1104             }
1105             } elsif ($self->protocol_version eq "2.0") {
1106             # The document must contain the proper error code
1107             unless ($self->parse_response($req,$response,$xml_reason)) {
1108             $self->log->fail("Can't parse malformed response. ".html_escape($xml_reason));
1109             $n++;
1110             next;
1111             }
1112             # check that the error code is in the error_list
1113             my $error_elements = $self->doc->getElementsByTagName('error');
1114             if (my $matching_code=$self->error_elements_include($error_elements, $error_codes)) {
1115             $self->log->pass("Error response correctly includes error code '$matching_code'");
1116             } else {
1117             $self->log->fail("Exception/error response did not contain error code ".
1118             "'$ok_errors' ".html_escape($reason));
1119             $n++;
1120             next;
1121             }
1122             } else {
1123             $self->log->fail("Invalid protocol version returned");
1124             $self->abort("test_expected_errors - invalid protocol version");
1125             }
1126             }
1127             my $total = scalar @request_list;
1128             if ($n==0) {
1129             $self->log->pass("All $total error requests properly handled");
1130             } else {
1131             $self->log->warn("Only ".($total-$n)." out of $total error requests properly handled");
1132             }
1133             return($n);
1134             }
1135              
1136              
1137             =head3 test_expected_v2_errors($earliest_datestamp,$metadata_prefix)
1138              
1139             There are some additional exception tests for OAI-PMH version 2.0.
1140              
1141             =cut
1142              
1143             sub test_expected_v2_errors {
1144             my $self=shift;
1145             my ($earliest_datestamp,$metadata_prefix)=@_;
1146              
1147             $self->log->start("Checking for version 2.0 specific exceptions");
1148              
1149             my $too_early_date=one_year_before($earliest_datestamp);
1150              
1151             # format of entries: [ request_string, [error_codes_accepable], resaon ]
1152             my @request_list = (
1153             [ "verb=ListRecords&metadataPrefix=".url_encode($metadata_prefix)."&from=2002-02-05&until=2002-02-06T05:35:00Z", ['badArgument'],
1154             'The request has different granularities for the from and until parameters.' ],
1155             [ "verb=ListRecords&metadataPrefix=".url_encode($metadata_prefix)."&until=$too_early_date" , ['noRecordsMatch'],
1156             'The request specified a date one year before the earliestDatestamp given in the Identify response. '.
1157             'There should therefore not be any records with datestamps on or before this date and a noRecordsMatch '.
1158             'error code should be returned.' ]
1159             );
1160              
1161             foreach my $rrr ( @request_list ) {
1162             my ($request_string,$error_codes,$reason)=@$rrr;
1163              
1164             my $req=$self->base_url."?$request_string";
1165             my $response = $self->make_request($req);
1166             # parse the response content for the desired error code
1167             unless ( $self->parse_response($req,$response) ) {
1168             $self->log->fail("Error in parsing XML response to exception request: $request_string");
1169             next;
1170             }
1171             # check that there is at least the desired error code
1172             my $ok_errors=join(' or ',@$error_codes);
1173             my $error_elements = $self->doc->getElementsByTagName('error');
1174             if ( !$error_elements or $error_elements->getLength==0 ) {
1175             $self->log->fail("Failed to extract error code from the response to request: ".
1176             "$request_string $reason");
1177             } elsif (my $matching_code=$self->error_elements_include($error_elements, $error_codes) ) {
1178             $self->log->pass("Error response correctly includes error code '$matching_code'");
1179             } else {
1180             $self->log->fail("Error code $ok_errors not found in response but should be given ".
1181             "to the request: $request_string $reason");
1182             }
1183             }
1184             return;
1185             }
1186              
1187              
1188             =head2 METHODS TO TEST USE OF HTTP POST
1189              
1190             =head3 test_post_requests()
1191              
1192             Test responses to POST requests. Do both the simplest possible -- the Identify
1193             verb -- and a GetRecord request which uses two additional parameters.
1194              
1195             =cut
1196              
1197             sub test_post_requests {
1198             my $self=shift;
1199             my ($metadata_prefix)=@_;
1200              
1201             $self->log->start("Checking that HTTP POST requests are handled correctly");
1202              
1203             $self->test_post_request(1,{verb => "Identify"});
1204              
1205             my $record_id=$self->example_record_id;
1206             if ($record_id) {
1207             $self->test_post_request(2,{verb => "GetRecord", 'identifier' => $record_id, 'metadataPrefix' => $metadata_prefix});
1208             } else {
1209             $self->log->fail("Cannot test GetRecord via POST without and example record identifier");
1210             }
1211             }
1212              
1213              
1214             # Called just by test_post_requests to actually run the test
1215             #
1216             sub test_post_request {
1217             my $self=shift;
1218             my ($num, $post_data) = @_;
1219             my $response = $self->make_request($self->base_url, $post_data);
1220             if ($response->is_success) {
1221             my $verb = $post_data->{verb};
1222             if ( $self->is_verb_response($response,$verb) ) {
1223             $self->log->pass("POST test $num for $verb was successful");
1224             } elsif ( $self->check_error_response($response) ) {
1225             $self->log->fail("POST test $num for $verb was unsuccessful, an OAI error ".
1226             "response was received");
1227             } else {
1228             $self->log->fail("POST test $num for $verb was unsuccessful, got neither a ".
1229             "valid response nor an error");
1230             }
1231             } else {
1232             $self->log->fail("POST test $num was unsuccessful. Server returned HTTP Status: '".
1233             $response->status_line."'");
1234             }
1235             }
1236              
1237              
1238             =head2 METHODS CHECKING ELEMENTS WITHIN VERB AND ERROR RESPONSES
1239              
1240             =head3 check_response_date($req, $doc)
1241              
1242             Check responseDate for being in UTC format
1243             (should perhaps also check that it is at least the current day?)
1244              
1245             =cut
1246              
1247             sub check_response_date {
1248             my $self=shift;
1249             my ($req, $doc) = @_;
1250              
1251             my $elements = $self->doc->getElementsByTagName('responseDate');
1252             # assume rest of validity already checked, just take first
1253             my $item;
1254             my $child;
1255             if ($elements and $item=$elements->item(0) and $child=$item->getFirstChild()) {
1256             my $date = $child->getData();
1257             if ($date=~/\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ/) {
1258             $self->log->pass("responseDate has correct format: $date");
1259             } else {
1260             $self->log->fail("Bad responseDate of $date, this is not in UTC DateTime ".
1261             "(YYYY-MM-DDThh:mm:ssZ) format");
1262             }
1263             } else {
1264             $self->log->fail("Failed to extract responseDate");
1265             }
1266             }
1267              
1268              
1269             =head3 check_schema_name($req, $doc)
1270              
1271             Given the response to one of the OAI verbs, make sure that it it
1272             going to be validated against the "official" OAI schema, and not
1273             one that the repository made up for itself. If the response can't
1274             be parsed, or if there is no OAI-PMH element, or if the schema is
1275             incorrect, print an error message and bump the error_count.
1276              
1277             Return true if the schema name and date check out, else return undef
1278              
1279             =cut
1280              
1281             sub check_schema_name {
1282             my $self=shift;
1283             my ($req, $doc) = @_;
1284              
1285             my $namespace = 'http://www.openarchives.org/OAI/2.0/';
1286             my $location = 'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd';
1287              
1288             my $elements = $self->doc->getElementsByTagName('OAI-PMH'); #NodeList
1289             unless ( $elements->getLength() > 0 ) {
1290             $self->log->fail("Response to $req did not contain a OAI-PMH element");
1291             return(0);
1292             }
1293             my $attributes = $elements->item(0)->getAttributes; #Node->NamedNodeMap
1294             my $attr = $attributes->getNamedItem('xsi:schemaLocation'); #Node
1295             unless ( $attr ) {
1296             $self->log->fail("No xsi:schemaLocation attribute for the OAI-PMH element was ".
1297             "found, expected xsi:schemaLocation=\"$namespace $location\"");
1298             return(0);
1299             }
1300             $attr = $attributes->getNamedItem('xsi:schemaLocation'); #Node
1301             my $pair = $attr->getNodeValue(); # must pair OAI namespace with schema
1302             unless ( $pair =~ /^\s?$namespace\s*$location/ ) {
1303             $self->log->fail("Error in pairing OAI namespace with schema location, expected: ".
1304             "xsi:schemaLocation=\"$namespace $location\" but got $pair");
1305             return(0);
1306             }
1307             return(1);
1308             }
1309              
1310              
1311             =head3 check_protocol_version
1312              
1313             Extract the protocol version being used from the Identify response, check that it is
1314             valid and then abort unless 2.0.
1315              
1316             =cut
1317              
1318             sub check_protocol_version {
1319             my $self=shift;
1320             my $doc;
1321             # Extract the version number of the validator to run
1322             my $x = $self->doc->getElementsByTagName('protocolVersion');
1323             if (not $x) {
1324             $self->abort("Unknown protocol version, failed to extract protocolVersion element from Identify response");
1325             }
1326             my $protocol_version = $x->item(0)->getFirstChild->getData;
1327             if ($protocol_version ne '2.0' and
1328             $protocol_version ne '1.1' and
1329             $protocol_version ne '1.0') {
1330             $self->abort("Invalid protocol version ($protocol_version)");
1331             }
1332             $self->protocol_version( $protocol_version );
1333             if ($protocol_version ne '2.0') {
1334             $self->abort("Repository reports OAI-PMH protocol version $protocol_version and will not be validated. Guidelines for upgrading to 2.0 can be found at http://www.openarchives.org/OAI/2.0/migration.htm\n\n");
1335             }
1336             $self->log->pass("Correctly reports OAI-PMH protocol version 2.0");
1337             }
1338              
1339              
1340             =head2 is_verb_response($reponse,$verb)
1341              
1342             Return true if $response is a response for the specified $verb.
1343              
1344             FIXME -- need better checks!
1345              
1346             =cut
1347              
1348             sub is_verb_response {
1349             my $self=shift;
1350             my ($response,$verb) = @_;
1351             my $doc;
1352             eval { $doc=$self->parser->parse($response->content); };
1353             return unless $doc; # We can't parse it so it isn't a valid doc
1354             my $verb_elements = $doc->getElementsByTagName($verb);
1355             return(1) if ( $verb_elements and $verb_elements->getLength==1 );
1356             return; # not the one element we expected
1357             }
1358              
1359              
1360             =head3 error_elements_include($error_elements,$error_codes)
1361              
1362             Determine whether the list of error elements ($error_elements) includes at least
1363             one of the desired codes. Return string with first matching error code, else
1364             return false/nothing.
1365              
1366             Does a sanity check on $error_list to check that it is set and has length>0
1367             before trying to match, so cose calling it can simply do a
1368             getElementsByTagName or similar before caling.
1369              
1370             =cut
1371              
1372             sub error_elements_include {
1373             my $self=shift;
1374             my ($error_elements, $error_codes) = @_;
1375             # sanity check
1376             return if (!$error_elements or $error_elements->getLength==0);
1377             for (my $i=0; $i<$error_elements->getLength; $i++) {
1378             foreach my $ec (@$error_codes) {
1379             my $code = $error_elements->item($i)->getAttribute('code') || 'no-code-attribute';
1380             $self->log->note("$code =? $ec") if ($self->debug);
1381             return($ec) if ($code eq $ec);
1382             }
1383             }
1384             return;
1385             }
1386              
1387              
1388              
1389             =head3 check_error_response($response)
1390              
1391             Given the response to an HTTP request, make sure it is not an
1392             OAI-PMH error message. The $response is a success. If it is an
1393             OAI error message, return 2; if the response cannot be parsed, return
1394             -1; otherwise return undef (it must be a real Identify response).
1395              
1396             FIXME -- need better checks!
1397              
1398             FIXME -- need to merge thic functionality in with is_error_response
1399              
1400             =cut
1401              
1402             sub check_error_response {
1403             my $self=shift;
1404             my ($response) = @_;
1405             my $doc;
1406             eval { $doc=$self->parser->parse($response->content); };
1407             return unless $doc; # We can't parse it so it isn't a valid error
1408             my $error_elements = $doc->getElementsByTagName('error');
1409             return(1) if ( $error_elements and $error_elements->getLength() > 0 );
1410             return; # no error codes so not an error response
1411             }
1412              
1413              
1414             =head3 get_earliest_datestamp()
1415              
1416             A new exception check for Version 2.0 raises noRecordsMatch errorcode
1417             if the set of records returned by ListRecords is empty. This requires
1418             that we know the earliest date in the repository. Also check that the
1419             earliest date matches the specified granularity.
1420              
1421             Called only for version 2.0 or greater.
1422              
1423             Since the Identify response has already been validated, we know
1424             there is exactly one earliestDatestamp element in the current document.
1425             Extract this value, check it, and if it looks good then set
1426             $self->earliest_datestamp and return false.
1427              
1428             If there is an error then return string explaining that.
1429              
1430             =cut
1431              
1432             sub get_earliest_datestamp {
1433             my $self=shift;
1434              
1435             my $earliest = $self->doc->getElementsByTagName('earliestDatestamp');
1436             my $el = $earliest->item(0);
1437             return("Can't get earliestDatestamp element from Identify response.") unless ($el);
1438             return("earliestDatestamp element is empty in Identify response.") unless ($el->getFirstChild);
1439              
1440             my $error='';
1441             my $earliest_datestamp = $el->getFirstChild->getData;
1442             $self->log->note("Earliest datestamp in repository is $earliest_datestamp") if $self->debug;
1443              
1444             $earliest_datestamp =~ /^([0-9]{4})-([0-9][0-9])-([0-9][0-9])(.*)$/;
1445             if ($1 eq '' || $2 eq '' || $3 eq '') {
1446             $error="is not in ISO8601 format";
1447             } elsif ( $4 eq '' and $self->granularity eq 'seconds') {
1448             $error="must have seconds granularity (format YYYY-MM-DDThh:mm:ssZ) to match ".
1449             "the granularity for the repository. The granularity has been set to seconds ".
1450             "by the granularity element of the Identify response.\n";
1451             } elsif ( $4 ne '' and $self->granularity eq 'days') {
1452             $error="must have days granularity (format YYYY-MM-DD) to match the granularity for ".
1453             "the repository. The granularity has been set to days by the granularity ".
1454             "element of the Identify response (or that element is bad/missing).\n";
1455             } elsif ( $self->granularity eq 'seconds' and $4 !~ /^T\d\d:\d\d:\d\d(\.\d+)?Z$/ ) {
1456             $error="does not have the correct format for the time part of the UTCdatetime. The ".
1457             "overall format must be YYYY-MM-DDThh:mm:ssZ.\n";
1458             }
1459             if ($error) {
1460             # Sanitize for error message
1461             return("The earliestDatestamp in the identify response (".
1462             sanitize($earliest_datestamp).") $error");
1463             } else {
1464             $self->earliest_datestamp($earliest_datestamp);
1465             return;
1466             }
1467             }
1468              
1469              
1470             =head3 parse_granularity($granularity_element)
1471              
1472             Parse contents of the granularity element of the Identify response. Returns either
1473             'days', 'seconds' or nothing on failure. Sets $self->granularity if valid, otherwise
1474             does not change setting.
1475              
1476             As of v2.0 the granularity element is mandatory, see:
1477             http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
1478              
1479             =cut
1480              
1481             sub parse_granularity {
1482             my $self=shift;
1483             my ($gran) = @_;
1484             if (!$gran or $gran->getLength==0) {
1485             $self->log->fail("Missing granularity element");
1486             return;
1487             } elsif ($gran->getLength>1) {
1488             $self->log->fail("Multiple granularity elements");
1489             return;
1490             }
1491             #schema validation guarantees that there is a spec here
1492             my $el=$gran->item(0)->getFirstChild->getData;
1493             if ($el eq 'YYYY-MM-DD') {
1494             $self->granularity('days');
1495             return($self->granularity);
1496             } elsif ($el eq 'YYYY-MM-DDThh:mm:ssZ') {
1497             $self->granularity('seconds');
1498             return($self->granularity);
1499             } else {
1500             $self->log->fail("Bad value for the granularity element '$el', must be either ".
1501             "YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ");
1502             return;
1503             }
1504             }
1505              
1506              
1507             =head3 get_datestamp_granularity($datestamp)
1508              
1509             Parse the datestamp supplied and return 'days' if it is valid with granularity
1510             of days, 'seconds' if it is valid for seconds granularity, and nothing if it is not
1511             valid.
1512              
1513             # FIXME - should add more validation
1514              
1515             =cut
1516              
1517             sub get_datestamp_granularity {
1518             my $self=shift;
1519             my ($datestamp)=@_;
1520             if ($datestamp=~/^(\d\d\d\d)-(\d\d)-(\d\d)$/) {
1521             return 'days' if ($2>=1 and $2<=12 and $3>=1 and $3<=31);
1522             } elsif ($datestamp=~/^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)(\.\d+)?Z$/) {
1523             return 'seconds' if ($2>=1 and $2<=12 and $3>=1 and $3<=31 and $4<24 and $5<60);
1524             }
1525             return;
1526             }
1527              
1528              
1529             =head3 is_no_records_match
1530              
1531             Returns true if the current document contains and error code element with the code "noRecordsMatch"
1532              
1533             ### FIXME - should be merged into an extended is_error_response
1534              
1535             =cut
1536              
1537             sub is_no_records_match {
1538             my $self=shift;
1539             my $error_elements = $self->doc->getElementsByTagName('error');
1540             return( $self->error_elements_include($error_elements, ['noRecordsMatch']) );
1541             }
1542              
1543              
1544             =head3 get_resumption_token()
1545              
1546             See if there is a resumptionToken with this response, return
1547             value if present, empty if not or if there is some other error.
1548              
1549             =cut
1550              
1551             sub get_resumption_token {
1552             my $self=shift;
1553              
1554             my $tokenList = $self->doc->getElementsByTagName('resumptionToken');
1555             if ( !$tokenList or $tokenList->getLength()==0 ) {
1556             return; #no resumptionToken
1557             }
1558              
1559             # Dig out the resumption token from the document
1560             my $token = $tokenList->item(0)->getFirstChild();
1561              
1562             # Try getting the resumption token, $token will be will be undefined
1563             # unless the element has content
1564             if ($token) {
1565             return($token->getData());
1566             }
1567             return;
1568             }
1569              
1570              
1571             =head3 is_error_response($details)
1572              
1573             Look at the parsed response in $self->doc to see if it is an error response,
1574             parse data and return true if it is.
1575              
1576             Returns true (a printable string containing the error messages) if response was a valid
1577             OAI_PMH error response, codes in %$details if a hash reference is passed in.
1578              
1579             =cut
1580              
1581             sub is_error_response {
1582             my $self=shift;
1583             my ($details)=@_;
1584             $details={} unless (ref($details) eq 'HASH'); #dummy hash unless one supplied
1585             #
1586             my $error_elements = $self->doc->getElementsByTagName('error');
1587             if ($error_elements and $error_elements->getLength()>=1) {
1588             my $msg='';
1589             for (my $i=0; $i<$error_elements->getLength; $i++) {
1590             my $code=$error_elements->item($i)->getAttribute("code");
1591             my $child=$error_elements->item($i)->getFirstChild();
1592             unless ($child) {
1593             # Warn about no content unless it is the special case of noSetHierarchy
1594             # where the error code really is sufficient
1595             unless ($code eq 'noSetHierarchy') {
1596             $self->log->warn("No human readable message included in error element for ".
1597             "$code error, this is discouraged");
1598             }
1599             $details->{$code}='[NO MESSAGE RETURNED]';
1600             $msg.="[$code] ";
1601             } else {
1602             $details->{$code}=$child->getData();
1603             $msg.="[$code: $details->{$code}] ";
1604             }
1605             }
1606             return($msg);
1607             } else {
1608             return;
1609             }
1610             }
1611              
1612              
1613             =head3 get_admin_email()
1614              
1615             Extract admin email from a parsed Identify response in $self->doc).
1616             Also note that the email target may have been set via form option
1617              
1618             Returns the pair of ($email,$error) where $email is the combined
1619             set of email addresses (comma separated). $error will be undef
1620             or a string with error message to users.
1621              
1622             =cut
1623              
1624             sub get_admin_email {
1625             my $self=shift;
1626              
1627             my $adminEmailElements = $self->doc->getElementsByTagName('adminEmail');
1628             my @emails=();
1629             my $n = $adminEmailElements->getLength;
1630             if ($n > 0) {
1631             my $name_node = $adminEmailElements->item(0)->getFirstChild;
1632             if ($name_node) {
1633             for (my $i=0; $i<$n; $i++) {
1634             my $e=$adminEmailElements->item($i)->getFirstChild->getData;
1635             if ($e=~s/mailto://g) {
1636             $self->log->warn("Stripped mailto: prefix from adminEmail address, this ".
1637             "should not be included.");
1638             }
1639             if (my $msg=$self->bad_admin_email($e)) {
1640             return(undef,$msg);
1641             }
1642             push(@emails,$e);
1643             }
1644             } else {
1645             $self->log->fail("adminEmail element is empty!");
1646             return(undef);
1647             }
1648             } else {
1649             $self->log->fail("No adminEmail element!");
1650             return(undef);
1651             }
1652             my $email=join(',',@emails);
1653             $self->log->pass("Administrator email address is '$email'");
1654             return($email);
1655             }
1656              
1657              
1658             =head3 bad_admin_email($admin_email)
1659              
1660             Check for some stupid email addresses to avoid so much bounced email.
1661             Returns a string (True) if bad, else nothing.
1662              
1663             =cut
1664              
1665             sub bad_admin_email {
1666             my $self=shift;
1667             my ($admin_email)=@_;
1668             if ($admin_email=~/\@localhost$/) {
1669             $self->log->fail("adminEmail '$admin_email' is local. This must be corrected to a ".
1670             "valid globally resolvable email address before tests can continue");
1671             return("local adminEmail");
1672             } elsif ($admin_email!~/^\w[\w\-\.]+\@[a-zA-Z0-9\-\.]+\.[a-z]{2,}$/) {
1673             $self->log->fail("adminEmail '$admin_email' looks bogus. This must be corrected to ".
1674             "a valid email address before tests can continue");
1675             return("looks like bogus adminEmail");
1676             }
1677             return;
1678             }
1679              
1680              
1681             =head2 METHODS FOR MAKING REQUESTS AND PARSING RESPONSES
1682              
1683             =head3 make_request_and_validate($verb, $req)
1684              
1685             Given the base URL that we are validating, the Verb that we are checking
1686             and the complete query to be sent to the OAI server, get the response to
1687             the verb. Validation has already been done, so we need only do some
1688             special checks here. Return the response to the OAI verb,
1689             or undef if the OAI server failed to respond to that verb.
1690              
1691             Side effects: errors may be printed and error_count bumped.
1692             If the verb involved is "Identify" then set the version number and the
1693             email address, assuming that some response has been obtained.
1694              
1695             Simple well-formedness is checked by this routine. An undef exit means
1696             that any calling code should fail the test but need not report 'no response'.
1697              
1698             If the response is true then $self->doc contains a parsed XML
1699             document.
1700              
1701             This is the usual way we make requests with integrated parsing and error
1702             checking. This method is built around calls to L and
1703             L.
1704              
1705             =cut
1706              
1707             sub make_request_and_validate {
1708             my $self=shift;
1709             my ($verb, $req) = @_;
1710              
1711             my $response = $self->make_request($req);
1712              
1713             unless ( $response->is_success ) {
1714             my $status = $response->status_line;
1715             my $age = $response->current_age;
1716             my $lifetime = $response->freshness_lifetime;
1717             my $is_fresh = $response->is_fresh;
1718             $self->log->fail("Server failed to respond to the $verb request (HTTP header ".
1719             "values: status=$status, age=$age, lifetime=$lifetime, ".
1720             "is fresh:=$is_fresh)");
1721             return;
1722             }
1723              
1724             unless ($self->parse_response($req, $response)) {
1725             $self->log->fail("Failed to parse response");
1726             return;
1727             }
1728              
1729             # Check that the responseDate is in UTC format
1730             $self->check_response_date($req,$self->doc);
1731             # Check that the response refers to the "official" OAI schema
1732             $self->check_schema_name($req,$self->doc);
1733              
1734             return($response);
1735             }
1736              
1737              
1738             =head3 make_request($url,$post_data)
1739              
1740             Routine to GET or POST a request, handle 503's, and return the response
1741              
1742             Second parameter, $post_data, must be hasfref to POST data to indicate that
1743             the request should be an HTTP POST request instead of a GET.
1744              
1745             =cut
1746              
1747             sub make_request {
1748             my $self=shift;
1749             my ($url,$post_data) = @_;
1750              
1751             # Is this https and do we allow that?
1752             if (is_https_uri($url)) {
1753             $self->uses_https(1);
1754             if (not $self->allow_https) {
1755             $self->abort("URI $url is https. Use of https URIs is not allowed ".
1756             "by the OAI-PMH v2.0 specification");
1757             }
1758             }
1759              
1760             my $request;
1761             if ($post_data) {
1762             my $content_msg=''; #nice string to report
1763             # Sort keys in alpha order for consistent behavior
1764             foreach my $k (sort keys(%$post_data)) {
1765             my $v=$post_data->{$k};
1766             $content_msg.="$k:$v ";
1767             }
1768             $self->log->request($url,'POST',$content_msg);
1769             $request = POST($url,'Content'=>$post_data);
1770             } else {
1771             $self->log->request($url,'GET');
1772             $request = GET($url);
1773             }
1774             my $response;
1775             my $tries=0;
1776             my $try_again = 1;
1777             while ( $try_again ) {
1778             #$ua->max_redirect(0);
1779             $response = $self->ua->request($request);
1780             #
1781             # Write response if requested
1782             if ($self->save_all_responses) {
1783             my $response_file="/tmp/".$self->run_id.".".$self->response_number;
1784             open(my $fh,'>',$response_file) || $self->abort("Can't write response $response_file: $!");
1785             print {$fh} $response->content();
1786             $self->log->note("Response saved as $response_file") if ($self->debug);
1787             close($fh);
1788             $self->{response_number}++;
1789             }
1790             $tries++;
1791             if ($tries > $self->max_retries) {
1792             $self->abort("Too many 503 Retry-After or 302 Redirect responses received in a row");
1793             }
1794             #
1795             # Check response for 503 and 302
1796             if ($response->code eq '503') {
1797             # 503 (Retry-After), expect to get a time too
1798             $self->uses_503(1);
1799             if (defined $response->header("Retry-After")) {
1800             my $retryAfter=$response->header("Retry-After");
1801             if ($retryAfter=~/^\d+$/) {
1802             if ($retryAfter<=3600) {
1803             ###FIXME: Should check the Retry-After value carefully and barf if bad
1804             my $sleep_time = 1 + $response->header("Retry-After");
1805             $self->log->note("Status: ".$response->code().
1806             " -- going to sleep for $sleep_time seconds.");
1807             sleep $sleep_time;
1808             } else {
1809             $self->abort("503 response with Retry-After > 1hour (3600s), aborting");
1810             }
1811             } else {
1812             $self->log->fail("503 response with bad (non-numeric) Retry-After time, ".
1813             "will wait 10s");
1814             sleep 10;
1815             }
1816             } else {
1817             $self->log->warn("503 response without Retry-After time, will wait 10s");
1818             sleep 10;
1819             }
1820             } elsif ($response->code eq '302') {
1821             # 302 (Found) redirect
1822             my $loc=$response->header('Location');
1823             if ($loc!~m%^http://([^\?&]+)%) {
1824             if (is_https_uri($loc)) {
1825             $self->uses_https(1);
1826             if (not $self->allow_https) {
1827             $self->abort("Redirect URI specified in 302 response is https. Use of ".
1828             "https URIs is not allowed by the OAI-PMH v2.0 specification");
1829             }
1830             } else {
1831             $self->abort("Bad redirect URI specified in 302 response");
1832             }
1833             }
1834             # Make new request
1835             if ($post_data and $loc!~/\?/) { #don't do POST if new Location includes ?
1836             $request = POST($loc,'Content'=>$post_data);
1837             } else {
1838             $request = GET($loc);
1839             }
1840             } elsif ($response->code eq '501') {
1841             $self->abort("Got 501 Not Implemented response which may either have come from ".
1842             "the server or have been generated within the validator because the ".
1843             "request type (perhaps https) is not supported.");
1844             } else {
1845             $try_again=0;
1846             }
1847             }
1848             # Check for oversize limit (indicated by X-Content-Range header)
1849             if (defined $response->header('X-Content-Range')) {
1850             $self->log->fail("Response to <$url> exceeds maximum size limit (".$self->max_size." bytes), discarded. ".
1851             "While this limit is set only in this validation program you should not use excessively ".
1852             "large responses as service providers will likely have problems parsing the XML. You ".
1853             "should split the responses using the resumptionToken mechanism. (X-Content-Range: '".
1854             $response->header('X-Content-Range')."' Content-Length: '".$response->content_length."')\n");
1855             $response->content('');
1856             }
1857             return $response;
1858             }
1859              
1860              
1861             =head3 parse_response($request_url,$response,$xml_reason)
1862              
1863             Attempt to parse the HTTP response $response, examining both the response code
1864             and then attempting to parse the content as XML.
1865              
1866             If $xml_reason is specified then this is added to the failure message, if
1867             nothing is specified then a standard message about UTF-8 issues is
1868             added.
1869              
1870             Returns true on success and sets $self->doc with the parsed XML document.
1871             If unsuccessful, log an error message, bump the error count, and
1872             return false.
1873              
1874             =cut
1875              
1876             sub parse_response {
1877             my $self=shift;
1878             my ($request_url,$response,$xml_reason) = @_;
1879             $xml_reason='' unless (defined $xml_reason);
1880             #
1881             # Fail if reponse=undef, else check to see if response is ref to
1882             # response object or is string
1883             if (!defined($response) or not ref($response)) {
1884             $self->log->warn("Bad response from server");
1885             return;
1886             }
1887             # Unpack the bits we want from response object
1888             my $code=$response->code;
1889             my $content=$response->content;
1890             # Check return code (if given)
1891             if ($code and $code=~/^[45]/) {
1892             $self->log->warn("Bad HTTP status code from server: $code");
1893             return;
1894             }
1895             #
1896             # Check content
1897             my $doc;
1898             eval { $doc=$self->parser->parse($content); };
1899             unless ( $doc ) {
1900             my $err=$@;
1901             $err=~s/^\s+//;
1902             $err=~s%at\s+/usr/lib/perl.*%%i; #trim stuff about our perl installation
1903             if ($request_url) {
1904             unless ($xml_reason) {
1905             $xml_reason="The most common reason for malformed responses is illegal bytes in ".
1906             "UTF-8 streams (e.g. the inclusion of Latin1 characters with codes>127 ".
1907             "without creating proper UTF-8 mutli-byte sequences). You might find ".
1908             "the utf8conditioner, found on the OAI tools page helpful for debugging.";
1909             }
1910             $self->log->warn("Malformed response: $err. $xml_reason");
1911             }
1912             return;
1913             }
1914             # Set parsed document
1915             $self->doc( $doc );
1916             return(1);
1917             }
1918              
1919              
1920             =head2 UTILITY FUNCTIONS
1921              
1922             =head3 html_escape($str)
1923              
1924             Escapes characters which have special meanings in HTML
1925              
1926             =cut
1927              
1928             sub html_escape {
1929             my $string = shift;
1930             $string =~ s/&/&/g; #must be first!
1931             $string =~ s/
1932             $string =~ s/>/>/g;
1933             $string =~ s/"/"/g;
1934             $string =~ s/'/'/g;
1935             return $string;
1936             }
1937              
1938             =head3 one_year_before($date)
1939              
1940             Assumes properly formatted date, decrements year by one
1941             via string manipulation and returns date.
1942              
1943             =cut
1944              
1945             sub one_year_before {
1946             my ($date)=@_;
1947             my ($year) = $date =~ /^([0-9]{4})/;
1948             my $year_minus_one = sprintf('%04d',($year - 1)); #make sure we get leading zeros
1949             $date =~ s/^$year/$year_minus_one/;
1950             return($date);
1951             }
1952              
1953             =head3 url_encode($str)
1954              
1955             Escape/encode any characters that aren't in the small safe set for URLs
1956              
1957             =cut
1958              
1959             sub url_encode {
1960             my $str=shift;
1961             $str =~ s/([^\w\/\,\- ])/sprintf("%%%02X",ord($1))/eg;
1962             $str =~ tr/ /+/;
1963             return($str);
1964             }
1965              
1966              
1967             =head3 is_https_uri($uri)
1968              
1969             Return true if the URI is an https URI, false otherwise.
1970              
1971             =cut
1972              
1973             sub is_https_uri {
1974             my $uri=shift;
1975             return($uri=~m%^https:%);
1976             }
1977              
1978              
1979             =head3 sanitize($str)
1980              
1981             Return a sanitized version of $str that doesn't contain odd
1982             characters and it not over 80 chars long. Will have the
1983             string '(sanitized)' appended if changed.
1984              
1985             =cut
1986              
1987             sub sanitize {
1988             my ($str)=@_;
1989             my $out=$str;
1990             $out=~s/[^\w\-:;.!@#%^*\(\) ]/_/g;
1991             $out=substr($out,0,80);
1992             if ($out ne $str) {
1993             $out.='(sanitized)';
1994             }
1995             return($out);
1996             }
1997              
1998              
1999             =head1 SUPPORT
2000              
2001             Please report any bugs of questions about validation via the
2002             OAI-PMH discussion list at L.
2003             Be sure to make it clear that you are talking about the
2004             HTTP::OAIPMH::Validator module.
2005              
2006             =head1 AUTHORS
2007              
2008             Simeon Warner, Donna Bergmark
2009              
2010             =head1 HISTORY
2011              
2012             This module is based on an OAI-PMH validator first written by Donna Bergmark
2013             (Cornell University) in 2001-01 for the OAI-PMH validation and registration
2014             service (L).
2015             Simeon Warner (Cornell University) took over the validator and operation of
2016             the registration service in 2004-01, and then did a significant tidy/rework
2017             of the code. That code ran the validation and registration service with
2018             few changes through 2015-01. Some of the early work on the OAI-PMH validation
2019             service was supported through NSF award number 0127308.
2020              
2021             Code was abstracted into this module 2015-01 by Simeon Warner and is
2022             used for the OAI-PMH validation and registration service on
2023             L.
2024              
2025             =head1 COPYRIGHT
2026              
2027             Copyright 2001..2017 by Simeon Warner, Donna Bergmark.
2028              
2029             This library is free software; you can redistribute it and/or modify it under
2030             the same terms as Perl itself.
2031              
2032             =cut
2033              
2034             1;