File Coverage

blib/lib/SeeAlso/Source/BeaconAggregator/Maintenance.pm
Criterion Covered Total %
statement 20 20 100.0
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 26 26 100.0


line stmt bran cond sub pod time code
1             package SeeAlso::Source::BeaconAggregator::Maintenance;
2 10     10   165616 use strict;
  10         16  
  10         274  
3 10     10   30 use warnings;
  10         9  
  10         212  
4              
5             BEGIN {
6 10     10   27 use Exporter ();
  10         9  
  10         174  
7 10     10   32 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         11  
  10         862  
8 10     10   16 $VERSION = '0.2_90';
9 10         62 @ISA = qw(Exporter);
10             #Give a hoot don't pollute, do not export more than needed by default
11 10         13 @EXPORT = qw();
12 10         7 @EXPORT_OK = qw();
13 10         133 %EXPORT_TAGS = ();
14             }
15              
16 10     10   38 use base ("SeeAlso::Source::BeaconAggregator");
  10         17  
  10         3570  
17             use Carp;
18             use HTTP::Date; # not perfect, but the module is commonly installed...
19             use HTTP::Request;
20             use LWP::UserAgent;
21             use File::Temp;
22              
23             =head1 NAME
24              
25             sasbactrl.pl - command line interface to SeeAlso::Source::BeaconAggregator and
26             auxiliary classes
27              
28             =head1 SYNOPSIS
29              
30              
31             =head1 DESCRIPTION
32              
33             This Module allows a collection of BEACON files (cf. http://de.wikipedia.org/wiki/Wikipedia:BEACON)
34             to be used as SeeAlso::Source (probably in the context of an SeeAlso::Server application).
35             Therefore it implements the four methods documented in SeeAlso::Source
36              
37             The BEACON files (lists of non-local identifiers of a certain type documenting the coverage of a given
38             online database plus means for access) are imported by the methods provided by
39             SeeAlso::Source::BeaconAggregator::Maintenance.pm, usually by employing the script sasbactrl.pl
40             as command line client.
41              
42             Serving other formats than SeeAlso or providing a BEACON file with respect to this
43             SeeAlso service is achieved by using SeeAlso::Source::BeaconAggregator::Publisher.
44              
45              
46             =head1 USAGE
47              
48             Use the C method inherited from C to
49             access an existing database or create a new one.
50              
51              
52             =head2 Database Methods
53              
54             =head3 init( [ %options] )
55              
56             Sets up and initializes the database structure for the object.
57             This has to be done once after creating a new database and after
58             upgrading this module.
59              
60             Valid options include:
61              
62             =over 8
63              
64             =item verbose
65              
66             =item prepareRedirs
67              
68             =item identifierClass
69              
70             =back
71              
72              
73             The I table contains as columns all valid beacon fields plus
74             the following administrative fields which have to be prefixed with
75             "_" in the interface:
76              
77             =over 8
78              
79             =item seqno
80              
81             Sequence number: Is incremented on any successfull load
82              
83             =item alias
84              
85             Unique key: On update older seqences with the same alias are
86             automatically discarded. Most methods take an alias as
87             argument thus obliterating the need to determine the sequence
88             number.
89              
90             =item sort
91              
92             optional sort key
93              
94              
95             =item uri
96              
97             Overrides the #FEED header for updates
98              
99             =item ruri
100              
101             Real uri from which the last instance was loaded
102              
103              
104             =item ftime
105              
106             Fetch time: Timestamp as to when this instance was loaded
107              
108             Clear this or mtime to force automatic reload.
109              
110             =item fstat
111              
112             Short statistics line of last successful reload on update.
113              
114              
115             =item mtime
116              
117             Modification time: Timestamp of the file / HTTP object from which this instance was loaded.
118             Identical to ftime if no timestamp is provided
119              
120             Clear this or ftime to force automatic reload on update.
121              
122              
123             =item utime
124              
125             Timestamp of last update attempt
126              
127             =item ustat
128              
129             Short status line of last update attempt.
130              
131              
132             =item counti
133              
134             Identifier count
135              
136             =item countu
137              
138             Unique identifier count
139              
140              
141             =item admin
142              
143             Just to store some remarks.
144              
145             =back
146              
147             The I table stores the individual beacon entries from the input files.
148             Its columns are:
149              
150             =over 8
151              
152             =item hash
153              
154             Identifier. If a (subclass of) C instance is provided,
155             this will be transformed by the C method.
156              
157             =item seqno
158              
159             Sequence number of the beacon file in the database
160              
161             =item altid
162              
163             optional identifier from an alternative identifier system for use
164             with ALTTARGET templates.
165              
166             =item hits
167              
168             optional number of hits for this identifier in the given resource
169              
170             =item info
171              
172             optional information text
173              
174             =item link
175              
176             optional explicit URL
177              
178             =back
179              
180              
181             The I table contains C, C pairs for various metadata
182             concerning the collection as such, notably the values needed for
183             the Open Search Description and the Header fields needed in case
184             of publishing a beacon file for this collection.
185              
186             The I table stores (unique) C, C pairs for
187             general persistent data. Currently the following keys are defined:
188              
189             =over 8
190              
191             =item DATA_VERSION
192              
193             Integer version number to migrate database layout.
194              
195             =item IDENTIFIER_CLASS
196              
197             Name of the Identifier class to be used.
198              
199             =item REDIRECTION_INDEX
200              
201             Control creation of an additional index for the I column
202             (facialiates reverse lookups as needed for clustering).
203              
204             =back
205              
206              
207             =cut
208              
209             sub init {
210             my ($self, %options) = @_;
211             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
212              
213             my @fieldlist = SeeAlso::Source::BeaconAggregator->beaconfields();
214             my @bf = map{ join(" ", @{[SeeAlso::Source::BeaconAggregator->beaconfields($_)]}[0..1]) } @fieldlist;
215             my $hdl = $self->{dbh} or croak("no database handle?");
216              
217             local($") = ",\n";
218             $hdl->do(<<"XxX"
219             CREATE TABLE IF NOT EXISTS repos (
220             seqno INTEGER PRIMARY KEY AUTOINCREMENT,
221             alias TEXT,
222             sort TEXT,
223             uri VARCHAR(512),
224             ruri VARCHAR(512),
225             mtime INTEGER,
226             utime INTEGER,
227             ftime INTEGER,
228             counti INTEGER DEFAULT 0,
229             countu INTEGER DEFAULT 0,
230             fstat TEXT,
231             ustat TEXT,
232             admin VARCHAR(512),
233             @bf
234             );
235             XxX
236             ) or croak("Setup error: ".$hdl->errstr);
237              
238             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS seqnos ON repos(seqno);") or croak("Setup error: ".$hdl->errstr);
239             $hdl->do("CREATE INDEX IF NOT EXISTS aliases ON repos(alias);") or croak("Setup error: ".$hdl->errstr);
240              
241             $hdl->do(<<"XxX"
242             CREATE TABLE IF NOT EXISTS beacons (
243             hash CHARACTER(64) NOT NULL,
244             seqno INTEGER REFERENCES repos(seqno) ON DELETE CASCADE,
245             altid TEXT,
246             hits INTEGER,
247             info VARCHAR(255),
248             link VARCHAR(1024)
249             );
250             XxX
251             ) or croak("Setup error: ".$hdl->errstr);
252              
253              
254             # Faciliate lookups
255             $hdl->do("CREATE INDEX IF NOT EXISTS lookup ON beacons(hash);") or croak("Setup error: ".$hdl->errstr);
256             # maintenance and enforce constraints
257             # (Problem: Dupes w/o altid but differing in link *and* info fields should be legitimate, too)
258             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS mntnce ON beacons(seqno, hash, altid);") or croak("Setup error: ".$hdl->errstr);
259              
260             # foreign key on cascade does not work?
261              
262             $hdl->do(<<"XxX"
263             CREATE TRIGGER IF NOT EXISTS on_delete_seqno BEFORE DELETE ON repos FOR EACH ROW
264             BEGIN
265             DELETE FROM beacons WHERE seqno=OLD.seqno;
266             END;
267             XxX
268             ) or croak("Setup error: ".$hdl->errstr);
269              
270             # OpenSearchDescription
271             $hdl->do(<<"XxX"
272             CREATE TABLE IF NOT EXISTS osd (
273             key CHAR(20) NOT NULL,
274             val VARCHAR(1024)
275             );
276             XxX
277             ) or croak("Setup error: ".$hdl->errstr);
278             $hdl->do("CREATE INDEX IF NOT EXISTS OSDKeys ON osd(key);") or croak("Setup error: ".$hdl->errstr);
279              
280             # Admin Stuff
281             $hdl->do(<<"XxX"
282             CREATE TABLE IF NOT EXISTS admin (
283             key CHAR(20) PRIMARY KEY NOT NULL,
284             val VARCHAR(1024)
285             );
286             XxX
287             ) or croak("Setup error: ".$hdl->errstr);
288              
289             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS ADMKeys ON admin(key);") or croak("Setup error: ".$hdl->errstr);
290              
291             my $admref = $self->admhash();
292              
293             my $verkey = "DATA_VERSION";
294             my $goalver = $SeeAlso::Source::BeaconAggregator::DATA_VERSION;
295             my $dbver = $admref->{$verkey} || 0;
296             if ( $dbver != $goalver ) {
297             print "NOTICE: Database version $dbver: Upgrading to $goalver\n";
298             # alter tables here
299             if ( $dbver < 2 ) {
300             # my ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("COUNT");
301             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
302             # ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("REMARK");
303             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
304             };
305             }
306             elsif ( $options{'verbose'} ) {
307             print "INFO: Database version $dbver is current\n"};
308              
309             unless ( $dbver == $goalver) {
310             my $verh = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "update version statement");
311             $verh->execute($verkey, $goalver)
312             or croak("Could not execute update version statement: ".$verh->errstr);
313             };
314              
315             unless ( exists $options{'identifierClass'} ) {
316             $options{'identifierClass'} = $self->{'identifierClass'} if exists $self->{'identifierClass'};
317             };
318              
319             my $ickey = "IDENTIFIER_CLASS";
320             if ( (exists $options{identifierClass}) and (my $wanttype = ref($options{'identifierClass'})) ) {
321             if ( (exists $self->{identifierClass}) && (ref($self->{identifierClass}) ne $wanttype) ) {
322             croak("Cannot override identifierClass set on new()")};
323             if ( my $oldtype = $admref->{$ickey} ) {
324             croak ("Identifier mismatch: Cannot set to $wanttype since database already branded to $oldtype")
325             unless($oldtype eq $wanttype);
326             }
327             else {
328             print "fixing identifierClass as $wanttype\n" if $options{'verbose'};
329             my $ichdl = $self->stmtHdl("INSERT INTO admin VALUES (?, ?);", "fix identifier class statement");
330             $ichdl->execute($ickey, $wanttype)
331             or croak("Could not execute fix identifier class statement: ".$ichdl->errstr);
332             $self->{identifierClass} = $options{identifierClass};
333             };
334             }
335             elsif ( (exists $options{identifierClass}) and (not $options{identifierClass}) ) {
336             print "removing fixed identifierClass from admin table\n" if $options{'verbose'};
337             my $ichdl = $self->stmtHdl("DELETE FROM admin WHERE key=?;", "identifier class statement");
338             $ichdl->execute($ickey)
339             or croak("Could not execute remove identifier class statement: ".$ichdl->errstr);
340             delete $self->{identifierClass};
341             };
342              
343             my $rikey = "REDIRECTION_INDEX";
344             if ( exists $options{prepareRedirs} or exists $admref->{$rikey} ) {
345             my $rihdl = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "fix redirection index statement");
346             if ( $options{prepareRedirs} or ( $admref->{$rikey} and not exists $options{prepareRedirs} ) ) {
347             print "creating redirection index\n" if $options{prepareRedirs} and $options{'verbose'};
348             $hdl->do("CREATE INDEX IF NOT EXISTS redir ON beacons(altid,seqno);") or croak("Setup error: ".$hdl->errstr);
349             $rihdl->execute($rikey, 1)
350             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
351             }
352             elsif ( not( $admref->{$rikey} and ($options{prepareRedirs} or (not exists $options{prepareRedirs})) ) ) {
353             print "dropping redirection index\n" if $options{'verbose'};
354             $hdl->do("DROP INDEX IF EXISTS redir;") or croak("Setup error: ".$hdl->errstr);
355             $rihdl->execute($rikey, 0)
356             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
357             };
358             # $admref = $self->admhash();
359             }
360              
361             print "[ANALYZE ..." if $options{'verbose'};
362             $hdl->do("ANALYZE;");
363             print "]\n" if $options{'verbose'};
364             return 1; # o.k.
365             };
366              
367              
368             =head3 deflate()
369              
370             Maintenance action: performs VACCUUM, REINDEX and ANALYZE on the database
371              
372             =cut
373              
374             sub deflate {
375             my ($self) = @_;
376             my $hdl = $self->{dbh} or croak("no handle?");
377             local $hdl->{AutoCommit} = 1;
378             print "VACUUM\n";
379             $hdl->do("VACUUM") or carp("could not VACUUM: Skipping...");
380             print "REINDEX\n";
381             $hdl->do("REINDEX") or croak("could not REINDEX: Abort");
382             print "ANALYZE\n";
383             $hdl->do("ANALYZE;") or croak("could not ANALYZE: Abort");
384             return 1;
385             }
386              
387              
388             =head2 Handling of beacon files
389              
390             =head3 loadFile ( $file, $fields, %options )
391              
392             Reads a physical beacon file and stores it with a new Sequence number in the
393             database.
394              
395             Returns a triple:
396              
397             my ($seqno, $rec_ok, $message) = loadFile ( $file, $fields, %options )
398              
399             $seqno is undef on error
400              
401             $seqno and $rec_ok are zero with $message containing an explanation in case
402             of no action taken.
403              
404             $seqno is an positive integer if something was loaded: The L
405             (internal unique identifier) for the representation of the beacon file in
406             the database.
407              
408             =over 8
409              
410             =item $file
411              
412             File to read: Must be a beacon file
413              
414             =item $fields
415              
416             Hashref with additional meta and admin fields to store
417              
418             =item Supported options:
419              
420             verbose => (0|1)
421             force => (0|1) process unconditionally without timestamp comparison
422             nostat => (0|1) don't refresh global identifier counters
423              
424             =back
425              
426             If the file does not contain a minimal correct header (eg. is an empty file
427             or an HTML error page accidentaly caught) no action is performed.
428              
429             Otherwise, a fresh SeqNo (sequence number) is generated and meta and
430             BEACON-Lines are stored in the appropriate tables in the database.
431              
432             If the _alias field is provided, existing database entries for this
433             Alias are updated, identifiers not accounted for any more are
434             eventually discarded.
435              
436             =cut
437              
438             sub loadFile {
439             my ($self, $file, $fields, %options) = @_;
440             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
441             $options{'verbose'} ||= 0;
442              
443             if ( ! $file ) {
444             croak("Missing file argument")}
445             elsif ( ! -e $file ) {
446             print "ERROR: no such file $file\n" && return undef}
447             elsif ( ! -r _ ) {
448             print "ERROR: no read permissions for $file\n" && return undef}
449             elsif ( -z _ ) {
450             print "WARNING: empty file $file\n";
451             return (0,0, "empty file: Will not process");
452             }
453             my $mtime = (stat(_))[9];
454             open(BKN, "<:utf8", $file) or (print "ERROR: cannot read $file\n", return undef);
455             local($.) = 0;
456              
457             unless ( defined $self->{identifierClass} ) {
458             my $package = $self->autoIdentifier();
459             $options{'verbose'} && ref($package) && print "Assuming identifiers of type ".ref($package)."\n";
460             };
461              
462             $fields = {} unless $fields;
463             $fields->{'_ftime'} ||= time();
464             $fields->{'_mtime'} ||= $mtime;
465             delete $fields->{_uri} unless $fields->{_uri};
466             delete $fields->{_alias} unless $fields->{_alias};
467             my $autopurge = $fields->{_alias} || "";
468             my $showme = $fields->{_alias} || $fields->{_uri} || $file;
469              
470             if ( $options{'verbose'} ) {
471             printf("* Loading %s from URI %s\n", $fields->{_alias} || "", $fields->{_uri} || "");
472             printf("* local input %s (%s)\n", $file, SeeAlso::Source::BeaconAggregator::tToISO($mtime));
473             };
474              
475             my ($collno, $inserthandle, $replacehandle, $err, $format);
476             my ($linecount, $headerseen, $oseq) = (0, 0, 0);
477             my ($reccount, $recill, $recign, $recnil, $recupd, $recnew, $recdupl, $recdel) = (0, 0, 0, 0, 0, 0, 0, 0);
478             local($_);
479             lines:
480             while ( ) {
481             s/[ \x0d\x0a]+$//;
482             unless ( $linecount++ ) {
483             if ( s/^\x{FEFF}// ) { # BOM-Character
484             }
485             elsif ( s/^\xef\xbb\xbf// ) { # BOM-Bytes
486             print "ERROR: cannot cope with doubly UTF-8 encoded $file\n";
487             return (undef, undef, "encoding trouble")};
488             if ( /^\s*$/ ) {
489             print "WARNING: Discarding blank line before beacon header [$showme l.$.]\n";
490             next;
491             };
492             };
493             if ( not defined $collno ) { # $collno used as flag: "still in header"
494             if ( /^#\s*([A-Z][\w-]*):\s*(.*)$/ ) {
495             $headerseen++;
496             my ($field, $data) = ($1, $2);
497             $field =~ s/^DATE$/TIMESTAMP/ && print "WARNING: corrected DATE to TIMESTAMP in Beacon-Header [$showme l.$.]\n";
498             $data =~ s/\s+$//;
499             next if $data =~ /^\s*$/;
500             if ( SeeAlso::Source::BeaconAggregator->beaconfields($field) ) {
501             if ( $fields->{$field} ) {
502             print "WARNING: Skipping already set $field [$showme l.$.]\n"}
503             else {
504             $fields->{$field} = $data}
505             }
506             else {
507             print "WARNING: Ignoring unknown $field [$data] [$showme l.$.]\n";
508             };
509             }
510             elsif ( /^(#[^:\s]+)/ ) {
511             print "WARNING: Discarding unparseable line >$1...< in beacon header context [$showme l.$.]\n"}
512             elsif ( /^\s*$/ ) {
513             print "NOTICE: Discarding blank line in beacon header context [$showme l.$.]\n" if $options{'verbose'}}
514             elsif ( ! $headerseen ) {
515             print "ERROR: no header fields [$showme l.$.]\n";
516             return (0, 0, "no header fields: Will not proceed");
517             }
518             else {
519             ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
520             unless ( $collno ) {
521             print "ERROR: metadata error [$showme l.$.]\n";
522             return (0, 0, "metadata error: $err");
523             };
524             $self->{dbh}->{AutoCommit} = 0;
525             $linecount --;
526             redo lines;
527             }
528             }
529             else {
530             s/^\s+//; s/\s+$//;
531             my ($id, $altid, @rest);
532             ($id, @rest) = split(/\s*\|\s*/, $_, 4);
533             ($id, $altid) = split(/\s*=\s*/, $id, 2) if $id;
534             $id || ($recnil++, next);
535              
536             if ( $options{'filter'} ) {
537             ($id, $altid) = &{$options{'filter'}}($id, $altid, @rest);
538             unless ( $id ) {
539             $recign ++;
540             unless ( ++$reccount % 10000 ) {
541             $self->{dbh}->{AutoCommit} = 1;
542             print "$reccount\n" if $options{'verbose'};
543             $self->{dbh}->{AutoCommit} = 0;
544             };
545             next lines;
546             };
547             };
548             $altid ||= "";
549              
550             my($hits, $info, $link);
551             if ( @rest && ($rest[$#rest] =~ m!^\S+://\S+$!) ) {
552             $link = pop @rest}
553             elsif ( defined $rest[2] ) {
554             print "WARNING: unparseable link content >$rest[2]< [$showme l.$.]"};
555              
556             if ( @rest && ($rest[0] =~ /^\d*$/) ) {
557             $hits = shift @rest;
558             # really disregard hits with explicit 0?
559             if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
560             $altid = shift @rest || "";
561             }
562             else {
563             $info = shift @rest || "";
564             };
565             }
566             elsif ( defined $rest[1] ) {
567             $hits = "";
568             if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
569             $info = shift @rest;
570             $altid = shift @rest;
571             }
572             else {
573             shift @rest;
574             $info = shift @rest;
575             };
576             }
577             elsif ( defined $rest[0] ) {
578             $hits = "";
579             $info = shift @rest;
580             };
581             if ( @rest ) {
582             print "WARNING: unparseable content >$_< [$showme l.$.]"};
583              
584             unless ( $link ) {
585             if ( ($format =~ /\bhasTARGET\b/) ) { # ok
586             }
587             elsif ( $altid && ($format =~ /\baltTARGET\b/) ) { # also ok
588             }
589             elsif ( $format =~ /\bnoTARGET\b/ ) {
590             print "NOTICE: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.]\n" if $options{'verbose'} > 1;
591             $recill++;
592             next lines;
593             }
594             else {
595             print "WARNING: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.] (assertion failed)\n";
596             $recill++;
597             next lines;
598             }
599             };
600              
601             if ( $format !~ /\baltTARGET\b/ ) { # Allow certain duplicates (force disambiguisation)
602             $altid ||= $info || $link}
603              
604             $hits = "" unless defined $hits;
605             ($hits =~ /^0+/) && ($recnil++, next); # Explizit "0" => raus
606             $hits = 0 if $hits eq "";
607             $altid ||= "";
608             my $hash;
609             if ( defined $self->{identifierClass} ) {
610             $self->{identifierClass}->value($id);
611             unless ( $self->{identifierClass}->valid ) {
612             print "NOTICE: invalid identifier >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'};
613             $recill++;
614             next lines;
615             };
616             $hash = $self->{identifierClass}->hash();
617             }
618             else {
619             $hash = $id};
620             my $did;
621             if ( $replacehandle && ($did = $replacehandle->execute($hits, $info, $link, $hash, $altid)) ) { # UPDATE OR FAIL old record
622             if ( $replacehandle->err ) {
623             carp("update in trouble: $replacehandle->errstring [$showme l.$.]");
624             $recdupl++;
625             }
626             elsif ( $did eq "0E0" ) { # not found, try insert
627             $did = $inserthandle->execute($hash, $altid, $hits, $info, $link);
628             if ( $did eq "0E0" ) {
629             $recdupl++;
630             if ( $altid ) {
631             print "INFO: did not insert duplicate Id >$id< = >$altid< ($hits) [$showme l.$.]\n" if $options{'verbose'}}
632             else {
633             print "INFO: did not insert duplicate Id >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1};
634             }
635             else {
636             $recnew++};
637             }
638             else {
639             $recupd++};
640             }
641             elsif ( $did = $inserthandle->execute($hash, $altid, $hits, $info, $link) ) { # INSERT OR IGNORE new record
642             if ( $did eq "0E0" ) {
643             $recdupl++;
644             print "INFO: did not insert duplicate Id $id ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1;
645             }
646             else {
647             $recnew++};
648             }
649             elsif ( $inserthandle->errstr =~ /constraint/ ) {
650             $recdupl++;
651             print "INFO: duplicate Id $id ($hits): not inserting [$showme l.$.]\n" if $options{'verbose'} > 1;
652             }
653             else {
654             croak("Could not insert: ($id, $hits, $info, $link): ".$inserthandle->errstr)};
655              
656             unless ( ++$reccount % 10000 ) {
657             $self->{dbh}->{AutoCommit} = 1;
658             print "$reccount\n" if $options{'verbose'};
659             $self->{dbh}->{AutoCommit} = 0;
660             };
661             }
662             };
663             if ( not defined $collno ) {
664             if ( $headerseen ) {
665             ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
666             if ( $collno ) {
667             print "WARNING: no idn content in file [$showme l.$.]\n"}
668             else {
669             print "ERROR: metadata error [$showme l.$.]\n";
670             return (0,0, "metadata error: $err");
671             };
672             }
673             elsif ( $. ) {
674             print "ERROR: no header fields [$showme l.$.]\n";
675             return (0, 0, "no header fields: Will not proceed");
676             }
677             else {
678             print "WARNING: empty file [$showme]\n";
679             return (0,0, "empty file");
680             };
681             }
682             $self->{dbh}->{AutoCommit} = 1;
683              
684             if ( $autopurge ) {
685             $self->{dbh}->{AutoCommit} = 0;
686             if ( $oseq ) {
687             my ($bcdelh, $bcdelexpl) = $self->stmtHdl("DELETE FROM beacons WHERE seqno==?");
688             $self->stmtExplain($bcdelexpl, $oseq) if $ENV{'DBI_PROFILE'};
689             my $rows = $bcdelh->execute($oseq) or croak("Could not execute >".$bcdelh->{Statement}."<: ".$bcdelh->errstr);
690             $self->{dbh}->{AutoCommit} = 1;
691             printf("INFO: Purged %s surplus identifiers from old sequence %u\n", $rows, $oseq) if $options{'verbose'};
692             $rows = "0" if $rows eq "0E0";
693             $recdel += $rows;
694             };
695              
696             $self->{dbh}->{AutoCommit} = 0;
697             my ($rpdelh, $rpdelexpl) = $self->stmtHdl("DELETE FROM repos WHERE (alias=?) AND (seqno
698             $self->stmtExplain($rpdelexpl, $autopurge, $collno) if $ENV{'DBI_PROFILE'};
699             my $rows = $rpdelh->execute($autopurge, $collno) or croak("Could not execute >".$rpdelh->{Statement}."<: ".$rpdelh->errstr);
700             $self->{dbh}->{AutoCommit} = 1;
701             $rows = "0" if $rows eq "0E0";
702             printf("INFO: %u old sequences discarded\n", $rows) if $options{'verbose'};
703             }
704              
705             printf "NOTICE: New sequence %u for %s: processed %u Records from %u lines\n",
706             $collno, $autopurge || "???", $reccount, $linecount;
707             my $statline = sprintf "%u replaced, %u new, %u deleted, %u duplicate, %u nil, %u invalid, %u ignored",
708             $recupd, $recnew, $recdel, $recdupl, $recnil, $recill, $recign;
709             print " ($statline)\n";
710              
711             my $recok = $recupd + $recnew;
712             my $numchg = ($recnew or $recdel) ? 1 : 0;
713              
714             # my $ct1hdl = $self->stmtHdl("SELECT COUNT(*) FROM beacons WHERE seqno==? LIMIT 1;");
715             # $ct1hdl->execute($collno) or croak("could not execute live count: ".$ct1hdl->errstr);
716             # my $ct1ref = $ct1hdl->fetchrow_arrayref();
717             # my $counti = $ct1ref->[0] || 0;
718              
719             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(DISTINCT hash) FROM beacons WHERE seqno==?");
720             # using subquery to trick SQLite into using indices
721             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(*) FROM (SELECT DISTINCT hash FROM beacons WHERE seqno==?) LIMIT 1;");
722             # $ct2hdl->execute($collno) or croak("could not execute live count: ".$ct2hdl->errstr);
723             # my $ct2ref = $ct2hdl->fetchrow_arrayref();
724             # my $countu = $ct2ref->[0] || 0;
725              
726             # combined query turned out as not as efficient
727             # my $ct0hdl = $self->stmtHdl("SELECT COUNT(*), COUNT(DISTINCT hash) FROM beacons WHERE seqno==? LIMIT 1;");
728             # $ct0hdl->execute($collno) or croak("could not execute live count: ".$ct0hdl->errstr);
729             # my $ct0ref = $ct0hdl->fetchrow_arrayref();
730             # my ($counti, $countu) = ($ct0ref->[0] || 0, $ct0ref->[1] || 0);
731              
732             my ($updh, $updexpl) = $self->stmtHdl(<<"XxX");
733             UPDATE OR FAIL repos SET counti=?,countu=?,fstat=?,utime=?,ustat=?,sort=? WHERE seqno==?;
734             XxX
735              
736             my $counti = $self->idStat($collno, 'distinct' => 0) || 0;
737             printf("WARNING: expected %u valid records, counted %u\n", $recok, $counti) if $recok != $counti;
738             unless ( $numchg ) {
739             $fields->{'_counti'} ||= 0;
740             printf("WARNING: expected unchanged number %u valid records, counted %u\n", $fields->{'_counti'}, $counti) if $fields->{'_counti'} != $counti;
741             };
742              
743             my $sort = $fields->{'_sort'} || "";
744             my $countu = $numchg ? ( $self->idStat($collno, 'distinct' => 1) || 0 )
745             : ( $fields->{'_countu'} || $self->idStat($collno, 'distinct' => 1) || 0 );
746             $self->stmtExplain($updexpl, $counti, $countu, $statline, time(), "successfully loaded", $sort, $collno) if $ENV{'DBI_PROFILE'};
747             $updh->execute($counti, $countu, $statline, time(), "successfully loaded", $sort, $collno)
748             or croak("Could not execute >".$updh->{Statement}."<: ".$updh->errstr);
749             close(BKN);
750              
751             if ( $numchg or $options{'force'} ) {
752             # if ( $options{'force'} ) {
753             # print "[ANALYZE ..." if $options{'verbose'};
754             # $self->{dbh}->do("ANALYZE;");
755             # print "]\n" if $options{'verbose'};
756             # };
757              
758             if ( $options{'nostat'} ) { # invalidate since they might have changed
759             $self->admin('gcounti', undef);
760             $self->admin('gcountu', undef);
761             }
762             else {
763             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
764             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
765             }
766             };
767              
768             return ($collno, $recok, undef);
769             }
770              
771              
772             =head4 processbeaconheader($self, $fieldref, [ %options] )
773              
774             Internal subroutine used by C.
775              
776             =over 8
777              
778             =item $fieldref
779              
780             Hash with raw fields.
781              
782             =item Supported options:
783              
784             verbose => (0|1)
785              
786             Show seqnos of old instances which are met by the alias
787              
788             =back
789              
790              
791             =cut
792              
793             sub processbeaconheader {
794             my ($self, $fieldref, %options) = @_;
795             my $osq = 0;
796             my @carp;
797              
798             if ( my $alias = $fieldref->{_alias} ) {
799             my $stampfield = SeeAlso::Source::BeaconAggregator->beaconfields("TIMESTAMP");
800             my ($listh, $listexpl) = $self->stmtHdl("SELECT seqno, $stampfield, mtime, counti, countu FROM repos WHERE alias=?;");
801             $self->stmtExplain($listexpl, $alias) if $ENV{'DBI_PROFILE'};
802             $listh->execute($alias) or croak("Could not execute >".$listh->{Statement}."<: ".$listh->errstr);
803             my ($rowcnt, $ocounti, $ocountu);
804             while ( my($row) = $listh->fetchrow_arrayref ) {
805             last unless defined $row;
806             $rowcnt ++;
807             ($ocounti, $ocountu) = ($row->[3], $row->[4]);
808             if ( $options{'verbose'} ) {
809             print "* Old Instances for $alias:\n" unless $osq;
810             $osq = $row->[0];
811             print "+\t#$osq ", SeeAlso::Source::BeaconAggregator::tToISO($row->[1] || $row->[2]), " (", $row->[3] || "???", ")\n";
812             }
813             else {
814             $osq = $row->[0]};
815             }
816             if ( $rowcnt && ($rowcnt == 1) ) {
817             $fieldref->{_counti} ||= $ocounti if $ocounti;
818             $fieldref->{_countu} ||= $ocountu if $ocountu;
819             }
820             };
821              
822             my $format = "";
823             if ( $fieldref->{'FORMAT'} && $self->{accept}->{'FORMAT'} ) {
824             if ( $fieldref->{'FORMAT'} =~ $self->{accept}->{'FORMAT'} ) {
825             $format = $fieldref->{'FORMAT'}}
826             else {
827             push(@carp, "ERROR: only FORMAT '".$self->{accept}->{'FORMAT'}."' are supported, this is ".$fieldref->{'FORMAT'})}
828             }
829             elsif ( $fieldref->{'FORMAT'} ) {
830             $format = $fieldref->{'FORMAT'}}
831             elsif ( $fieldref->{'VERSION'} or $fieldref->{'TARGET'} or $fieldref->{'PREFIX'} or $fieldref->{'MESSAGE'} ) {
832             push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")}
833             elsif ( $self->{accept}->{'FORMAT'} ) {
834             push(@carp, "ERROR: header line #FORMAT is missing")}
835             else {
836             push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")};
837              
838             if ( $fieldref->{'FORMAT'} && ($fieldref->{'FORMAT'} =~ /v(?:ersion)?\s*(\d+(?:\.\d*)?)/i) ) {
839             $fieldref->{'VERSION'} ||= $1};
840             unless ( $fieldref->{'VERSION'} ) {
841             $fieldref->{'VERSION'} = $fieldref->{'FORMAT'} ? "0.1" : "1.0";
842             push(@carp, "NOTICE: added header field #VERSION as '".$fieldref->{'VERSION'}."'");
843             };
844             if ( $self->{accept}->{'VERSION'} ) {
845             ($fieldref->{'VERSION'} =~ $self->{accept}->{'VERSION'})
846             || push(@carp, "ERROR: only VERSION '".$self->{accept}->{'VERSION'}."' is supported, this is ".$fieldref->{'VERSION'});
847             };
848              
849             if ( $fieldref->{'ALTTARGET'} ) {
850             $fieldref->{'ALTTARGET'} = "" unless defined $fieldref->{'ALTTARGET'};
851             my $parsed = hDecode($fieldref, 'ALTTARGET');
852             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
853             $fieldref->{'ALTTARGET'} = $parsed;
854             $format =~ s/\s*-altTARGET//;
855             $format .= " -altTARGET";
856             ($parsed =~ /(^|[^%])(%.)*%2\$s/) or
857             push(@carp, "WARNING: header field #ALTTARGET should contain placeholder {ALTID}");
858             }
859             elsif ( $parsed ) {
860             push(@carp, "ERROR: header field #ALTTARGET must contain placeholder {ALTID} (or {ID})");
861             delete $fieldref->{'ALTTARGET'};
862             }
863             else {
864             push(@carp, "ERROR: could not parse header field #ALTTARGET: '".$fieldref->{'ALTTARGET'}."'");
865             delete $fieldref->{'ALTTARGET'};
866             }
867             };
868              
869             if ( $fieldref->{'IMGTARGET'} ) {
870             $fieldref->{'IMGTARGET'} = "" unless defined $fieldref->{'IMGTARGET'};
871             my $parsed = hDecode($fieldref, 'IMGTARGET');
872             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
873             $fieldref->{'IMGTARGET'} = $parsed;
874             $format =~ s/\s*-imgTARGET//;
875             $format .= " -imgTARGET";
876             }
877             elsif ( $parsed ) {
878             push(@carp, "WARNING: header field #IMGTARGET should contain placeholders {ID} or {ALTID}")}
879             else {
880             push(@carp, "ERROR: could not parse header field #IMGTARGET: '".$fieldref->{'IMGTARGET'}."'");
881             delete $fieldref->{'IMGTARGET'};
882             }
883             };
884              
885             if ( exists $fieldref->{'TARGET'} ) {
886             $fieldref->{'TARGET'} = "" unless defined $fieldref->{'TARGET'};
887             my $parsed = hDecode($fieldref, 'TARGET');
888             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%1\$s/) && ($parsed !~ /(^|[^%])(%.)*%[2-9]\$s/) ) {
889             $fieldref->{'TARGET'} = $parsed;
890             $format .= " -hasTARGET";
891             }
892             elsif ( $parsed ) {
893             if ( exists $fieldref->{'ALTTARGET'} ) {
894             push(@carp, "ERROR: header field #TARGET must contain placeholder {ID} only");
895             }
896             else {
897             push(@carp, "WARNING: Adding implicit {ID} to #TARGET as #ALTTARGET");
898             $fieldref->{'ALTTARGET'} = $parsed."%2\$s";
899             $format .= " -altTARGET";
900             }
901             delete $fieldref->{'TARGET'};
902             }
903             else {
904             push(@carp, "ERROR: could not parse header field #TARGET: '".$fieldref->{'TARGET'}."'");
905             delete $fieldref->{'TARGET'};
906             }
907             }
908             elsif ( $format =~ /^BEACON/ ) {
909             push(@carp, "WARNING: header field #TARGET not set: ALL beacon lines will have to provide their link by other means!");
910             $format =~ s/\s*-noTARGET//;
911             $format .= " -noTARGET";
912             }
913             else {
914             push(@carp, "ERROR: header field #TARGET is mandatory")};
915              
916              
917             $fieldref->{'MESSAGE'} = hDecode($fieldref, 'MESSAGE') if $fieldref->{'MESSAGE'};
918              
919             if ( $fieldref->{'TIMESTAMP'} ) {
920             if ( my $parsed = hDecode($fieldref, 'TIMESTAMP') ) {
921             printf("* %-30s %s\n", "Beacon Timestamp:", hEncode($parsed, 'TIMESTAMP')) if $options{'verbose'};
922             $fieldref->{'TIMESTAMP'} = $parsed;
923             }
924             else { # unparseable => use current
925             push(@carp, "WARNING: cannot parse TIMESTAMP '".$fieldref->{'TIMESTAMP'}."', using current time");
926             $fieldref->{'TIMESTAMP'} = $^T;
927             };
928             }
929             else {
930             # $fieldref->{'TIMESTAMP'} = $fieldref->{'_mtime'} || $^T;
931             push(@carp, "NOTICE: no header field #TIMESTAMP detected");
932             };
933              
934             if ( $fieldref->{'REVISIT'} ) {
935             if ( my $parsed = hDecode($fieldref, 'REVISIT') ) {
936             if ( $parsed < $^T ) {
937             printf("* %-30s %s [%s]\n", "STALE Revisit hint parsed as", hEncode($parsed, 'REVISIT'), $fieldref->{'REVISIT'})} # if $options{'verbose'}
938             else {
939             printf("* %-30s %s\n", "Revisit hint parsed as", hEncode($parsed, 'REVISIT')) if $options{'verbose'}};
940             $fieldref->{'REVISIT'} = $parsed;
941             }
942             else { # unparseable => discard
943             push(@carp, "WARNING: cannot parse #REVISIT '".$fieldref->{'REVISIT'}."', discarding");
944             delete $fieldref->{'REVISIT'};
945             };
946             }
947             else {
948             push(@carp, "INFO: no header field #REVISIT detected");
949             };
950              
951             my $cancontinue = 1;
952             my $err = "";
953             foreach ( @carp ) {
954             print "$_\n";
955             if ( s/^ERROR: // ) {
956             $cancontinue = 0;
957             $err .= " | " if $err;
958             $err .= $_;
959             };
960             }
961             unless ( $cancontinue or $options{'ignore-header-errors'} ) {
962             print "CRITICAL: Aborting because of Header Errors\n";
963             return (undef, $err, $format);
964             };
965              
966             $fieldref->{'_uri'} ||= $fieldref->{'FEED'};
967             delete $fieldref->{'_uri'} unless $fieldref->{'_uri'};
968              
969             $fieldref->{'_alias'} ||= $fieldref->{'FEED'} || $fieldref->{'TARGET'};
970              
971             my (@fn, @fd);
972             while ( my ($key, $val) = each %$fieldref ) {
973             next unless defined $val;
974             my $dbkey = "";
975             if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
976             push(@fn, $dbkey)}
977             elsif ( $key =~ /_(\w+)$/ ) {
978             push(@fn, $1)}
979             else {
980             next};
981             my $myval = $val;
982             unless ( $myval =~ /^\d+$/ ) {
983             $myval =~ s/'/''/g;
984             $myval = "'".$myval."'";
985             };
986             push(@fd, $myval);
987             };
988             local($") = ",\n";
989             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
990             INSERT INTO repos ( seqno, @fn ) VALUES ( NULL, @fd );
991             XxX
992             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
993             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<:".$sth->errstr);
994             my $collno = $self->{dbh}->last_insert_id("", "", "", "");
995              
996             my $rhandle;
997             if ( $osq ) {
998             $rhandle = $self->stmtHdl(<<"XxX");
999             UPDATE OR FAIL beacons SET seqno=$collno, hits=?, info=?, link=? WHERE hash=? AND seqno==$osq AND altid=?;
1000             XxX
1001             };
1002             my $ihandle = $self->stmtHdl(<<"XxX");
1003             INSERT OR IGNORE INTO beacons ( hash, seqno, altid, hits, info, link ) VALUES (?, $collno, ?, ?, ?, ?);
1004             XxX
1005             return ($collno, "", $format, $ihandle, $rhandle, $osq);
1006             }
1007              
1008              
1009              
1010              
1011             my ($lwpcarp817, $lwpcarp827);
1012              
1013             =head3 update ($sq_or_alias, $params, %options)
1014              
1015             Loads a beacon file into the database, possibly replacing a previous instance.
1016              
1017             Some magic is employed to autoconvert ISO-8859-1 or doubly UTF-8 encoded files
1018             back to UTF-8.
1019              
1020             Returns undef, if something goes wrong, or the file was not modified since,
1021             otherwise returns a pair (new seqence number, number of lines imported).
1022              
1023              
1024             =over 8
1025              
1026              
1027             =item $sq_or_alias
1028              
1029             Sequence number or alias: Used to determine an existing instance.
1030              
1031              
1032             =item $params
1033              
1034             Hashref, containing
1035              
1036             agent => LWP::UserAgent to use
1037             _uri => Feed URL to load from
1038              
1039             =item %options
1040              
1041             Hash, propagated to C
1042              
1043             verbose => (0|1)
1044             force => (0|1) process unconditionally without timestamp comparison
1045             nostat => (0|1) don't refresh global identifier counters
1046              
1047             =back
1048              
1049             Incorporates a new beacon source from a URI in the database or updates an existing one.
1050             For HTTP URIs care is taken not to reload an unmodified BEACON feed (unless the 'force'
1051             option is provided).
1052              
1053             If the feed appears to be newer than the previously loaded version it is fetched,
1054             some UTF-8 adjustments are performed if necessary, then it is stored to a temporary file
1055             and from there finally processed by the C method above.
1056              
1057             The URI to load is determined by the following order of precedence:
1058              
1059             =over 8
1060              
1061             =item 1
1062              
1063             _uri Option
1064              
1065             =item 2
1066              
1067             admin field uri stored in the database
1068              
1069             =item 3
1070              
1071             meta field #FEED taken from the database
1072              
1073             =back
1074              
1075             Typical use is with an alias, not with a sequence number:
1076              
1077             $db->update('whatever');
1078              
1079             Can be used to initially load beacon files from URIs:
1080              
1081             $db->update("new_alias", {_uri => $file_uri} );
1082              
1083             =cut
1084              
1085             sub update {
1086             my ($self, $sq_or_alias, $params, %options) = @_;
1087             $params = {} unless $params;
1088             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1089              
1090             my $ua = $params->{'agent'};
1091             unless ( $ua ) {
1092             require LWP::UserAgent;
1093             $ua = LWP::UserAgent->new(agent => "SA-S-BeaconAggregator ", # end with space to get default agent appended
1094             env_proxy => 1,
1095             timeout => 300,
1096             ) or croak("cannot create UserAgent");
1097             };
1098              
1099             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1100             my $alias = ($sq_or_alias =~ /^\d+$/) ? "" : $sq_or_alias;
1101             my $feedname = SeeAlso::Source::BeaconAggregator->beaconfields("FEED");
1102             my ($ssth, $ssthexpl) = $self->stmtHdl(<<"XxX");
1103             SELECT seqno, uri, alias, $feedname, ftime, mtime, sort FROM repos $cond;
1104             XxX
1105             $self->stmtExplain($ssthexpl, @cval) if $ENV{'DBI_PROFILE'};
1106             $ssth->execute(@cval) or croak("Could not execute >".$ssth->{Statement}."<: ".$ssth->errstr);
1107             croak("Select old instance error: ".$ssth->errstr) if $ssth->err;
1108             my $aryref = $ssth->fetchrow_arrayref;
1109             my ($osq, $ouri, $oalias, $feed, $fetchtime, $modtime, $osort) = $aryref ? @$aryref : ();
1110              
1111             my $uri = $params->{'_uri'} || $ouri || $feed;
1112             croak("Cannot update $sq_or_alias: URI not given nor determinable from previous content") unless $uri;
1113             $uri =~ s/\s$//;
1114             $alias ||= $oalias || "";
1115              
1116             print "Requesting $uri\n" if $options{'verbose'};
1117             my $rq = HTTP::Request->new('GET', $uri, ['Accept' => 'text/*']) or croak("could not construct request from $uri");
1118             if ( $fetchtime && $modtime && !$options{'force'} ) { # allow force-reload by deleting _ftime or _mtime
1119             printf(" %-30s %s\n", "Old instance stamped", scalar localtime($modtime)) if $options{'verbose'};
1120             $rq->header('If-Modified-Since', HTTP::Date::time2str($modtime));
1121             };
1122             if ( $rq->can("accept_decodable") ) { # LWP 5.817 and newer
1123             $rq->accept_decodable}
1124             else {
1125             carp("please upgrade to LWP >= 5.817 for compression negotiation") if $options{'verbose'} && (!$lwpcarp817++)};
1126              
1127             my $response = $ua->request($rq); # Well, we hoggishly slurp everything into memory,
1128             # however explicit decompression of an already dumped result would be PITA
1129             my $nuri = ($response->request)->uri;
1130             print "NOTICE: Differing result URI: $nuri\n" if $uri ne $nuri;
1131             if ( $response->is_success ) {
1132             print $osq ? "INFO: refreshing $alias sq $osq from $uri\n"
1133             : "INFO: importing previously unseen $alias from $uri\n";
1134             my $charset;
1135             if ( $response->can("content_charset") ) { # LWP 5.827 and above
1136             $charset = $response->content_charset;
1137             print "DEBUG: Content charset is $charset\n" if $charset && $options{'verbose'};
1138             }
1139             else {
1140             carp("please upgrade to LWP >= 5.827 for better detection of content_charset") if $options{'verbose'} && (!$lwpcarp827++)};
1141             $charset ||= "UTF-8";
1142              
1143             my $lm = $response->last_modified;
1144             printf(" %-30s %s\n", "Last modified", scalar localtime($lm)) if $lm && $options{'verbose'};
1145             $lm ||= $^T;
1146              
1147             my $vt = $response->fresh_until(h_min => 1800, h_max => 30 * 86400);
1148             printf(" %-30s %s\n", "Should be valid until", scalar localtime($vt)) if $vt && $options{'verbose'};
1149             $vt ||= 0;
1150              
1151             # temporary file for dumped contents
1152             my ($tmpfh, $tmpfile) = File::Temp::tempfile("BeaconAggregator-XXXXXXXX", SUFFIX => ".txt", TMPDIR => 1) or croak("Could not acquire temporary file for storage");
1153             my $contref; # reference to content buffer
1154             if ( ! $response->content_is_text ) {
1155             my $ct = $response->content_type;
1156             print "WARNING: Response content is $ct, not text/*\n";
1157             if ( my $ce = $response->content_encoding ) {
1158             print "NOTICE: Response is also Content-encoded: $ce\n"}
1159             my $ctt = join("|", $response->decodable());
1160             if ( $ct =~ s!^(.+\/)?($ctt)$!$2! ) {
1161             # yes: decode anyway since it could be a gzip-encoded .txt.gz file!
1162             my $cr = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1163             $response->remove_content_headers;
1164             my $newresp = HTTP::Response->new($response->code, $response->message, $response->headers);
1165             $newresp->content_type("text/plain; charset: $charset");
1166             $newresp->content_encoding($ct);
1167             $newresp->content_ref($cr);
1168             $response = $newresp;
1169             }
1170             };
1171             $contref = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1172              
1173             if ( $$contref =~ /^\x{FFEF}/ ) { # properly encoded BOM => put Characters to file
1174             binmode($tmpfh, ":utf8");
1175             print "INFO: properly encoded BOM detected: Groked UTF8\n"; # if $options{'verbose'};
1176             }
1177             elsif ( $$contref =~ s/^\xef\xbb\xbf// ) { # BOM Bytes => put Bytes to file, re-read as UTF-8
1178             print "INFO: Byte coded BOM detected: trying to restitute character semantics\n"; # if $options{'verbose'};
1179             print "INFO: Length is ", length($$contref), " ", (utf8::is_utf8($$contref) ? "characters" : "bytes"), "\n";
1180             binmode($tmpfh, ":bytes");
1181             utf_deduplicate($contref) && binmode($tmpfh, ":utf8");
1182             }
1183             elsif ( utf8::is_utf8($$contref) ) { # already Upgraded strings should be written as utf-8
1184             print "INFO: UTF8-ness already established\n" if $options{'verbose'};
1185             binmode($tmpfh, ":utf8");
1186             utf_deduplicate($contref); # but don't trust it (older LWP with file URLs, ...)
1187             }
1188             elsif ( utf8::decode($$contref) ) { # everything in character semantics now
1189             print "INFO: Could decode bytes to UTF8-characters\n" if $options{'verbose'};
1190             binmode($tmpfh, ":utf8");
1191             }
1192             else { # leave it alone
1193             print "WARNING: No clue about character encoding: Assume ISO 8859-1\n"; # if $options{'verbose'};
1194             binmode($tmpfh, ":utf8");
1195             };
1196             print $tmpfh $$contref;
1197             close($tmpfh);
1198             # early cleanup since everything might be huge....
1199             $contref = $response = undef;
1200              
1201             my ($collno, $count, $statref) = $self->loadFile($tmpfile, {_alias => $alias, _uri => $uri, _ruri => $nuri, _mtime => $lm, _sort => $osort}, %options);
1202             if ( ! $collno && $osq ) {
1203             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1204             UPDATE OR FAIL repos SET utime=?,ustat=? WHERE seqno==?;
1205             XxX
1206             $self->stmtExplain($usthexpl, time(), ($statref ? "load error: $statref" : "internal error"), $osq) if $ENV{'DBI_PROFILE'};
1207             $usth->execute(time(), ($statref ? "load error: $statref" : "internal error"), $osq)
1208             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1209             };
1210              
1211             unlink($tmpfile) if -f $tmpfile;
1212             return $collno ? ($collno, $count) : undef;
1213             }
1214             elsif ( $response->code == 304 ) {
1215             print "INFO: $alias not modified since ".HTTP::Date::time2str($modtime)."\n" if $options{'verbose'};
1216             my $vt = $response->fresh_until(h_min => 1800, h_max => 6 * 86400);
1217             printf(" %-30s %s\n", "Will not try again before", scalar localtime($vt)) if $options{'verbose'};
1218             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1219             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1220             XxX
1221             $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1222             $usth->execute(time(), $response->status_line, $nuri, $osq)
1223             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1224             return undef;
1225             }
1226             else {
1227             print "WARNING: No access to $uri for $alias [".$response->status_line."]\n";
1228             print $response->headers_as_string, "\n";
1229             return undef unless $osq;
1230             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1231             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1232             XxX
1233             $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1234             $usth->execute(time(), $response->status_line, $nuri, $osq)
1235             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1236             return undef;
1237             };
1238             }
1239              
1240              
1241              
1242             sub utf_deduplicate {
1243             my ($success, $stringref) = (0, @_);
1244             if ( utf8::downgrade($$stringref, 1) ) { # 1 = FAIL_OK
1245             my $prevlength = length($$stringref);
1246             print "INFO: Downgrade was possible, length now $prevlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1247             while ( utf8::decode($$stringref) ) {
1248             $success ++;
1249             my $newlength = length($$stringref);
1250             print "DEBUG: Reassembling as UTF-8 succeeded, length now $newlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1251             last if $newlength == $prevlength;
1252             $prevlength = $newlength;
1253             # last unless utf8::downgrade($$stringref, 1);
1254             }
1255             }
1256             else {
1257             print "WARNING: no downgrade possible, proceed with byte semantics";
1258             };
1259             return $success;
1260             }
1261              
1262             =head3 unload ( [ $seqno_or_alias, %options ] )
1263              
1264             Deletes the sequence(s).
1265              
1266             =over 8
1267              
1268             =item $seqno_or_alias
1269              
1270             numeric sequence number, Alias or SQL pattern.
1271              
1272             =item Supported options:
1273              
1274             force => (0|1)
1275              
1276             Needed to purge the complete database ($seqno_or_alias empty) or to purge
1277             more than one sequence ($seqno_or_alias yields more than one seqno).
1278              
1279             =back
1280              
1281              
1282             =cut
1283              
1284             sub unload {
1285             my ($self, $seqno_or_alias, %options) = @_;
1286             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1287              
1288             my @seqnos = ();
1289             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1290             @seqnos = ($seqno_or_alias)}
1291             elsif ( $seqno_or_alias || $options{'force'} ) {
1292             @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1293             unless ( @seqnos ) {
1294             carp("no Seqnos selected by $seqno_or_alias");
1295             return 0;
1296             };
1297             unless ( $options{'force'} or (@seqnos == 1) ) {
1298             carp("Use --force to purge more than one sequence (@seqnos)");
1299             return 0;
1300             };
1301             }
1302             else {
1303             carp("Use --force to purge the complete database");
1304             return 0;
1305             };
1306              
1307             if ( $options{'force'} ) {
1308             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1309             DELETE FROM beacons WHERE seqno==?;
1310             XxX
1311             foreach my $seqno ( @seqnos ) {
1312             $self->stmtExplain($sthexpl, $seqno_or_alias) if $ENV{'DBI_PROFILE'};
1313             my $rows = $sth->execute($seqno_or_alias) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1314             print "INFO: $rows forced for $seqno\n" if $options{'verbose'};
1315             };
1316             };
1317              
1318             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1319             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1320             DELETE FROM repos $cond;
1321             XxX
1322             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1323             my $rows = $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1324             $rows = 0 if $rows eq "0E0";
1325              
1326             if ( $rows or $options{'force'} ) {
1327             # if ( $options{'force'} ) {
1328             # print "[ANALYZE ..." if $options{'verbose'};
1329             # $self->{dbh}->do("ANALYZE;");
1330             # print "]\n" if $options{'verbose'};
1331             # };
1332              
1333             if ( $options{'nostat'} ) { # invalidate since they might have changed
1334             $self->admin('gcounti', undef);
1335             $self->admin('gcountu', undef);
1336             }
1337             else {
1338             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1339             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1340             }
1341             };
1342              
1343             return $rows;
1344             }
1345              
1346              
1347             =head3 purge ( $seqno_or_alias[, %options ] )
1348              
1349             Deletes all identifiers from the database to the given pattern,
1350             but leaves the stored header information intact, such that it
1351             can be updated automatically.
1352              
1353             =over 8
1354              
1355             =item $seqno_or_alias
1356              
1357             Pattern
1358              
1359             =item Supported options:
1360              
1361             force => (0|1)
1362              
1363             Allow purging of more than one sequence.
1364              
1365             =back
1366              
1367              
1368             =cut
1369              
1370             sub purge {
1371             my ($self, $seqno_or_alias, %options) = @_;
1372             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1373             my @seqnos;
1374             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1375             @seqnos = ($seqno_or_alias)}
1376             elsif ( $seqno_or_alias || $options{'force'} ) {
1377             @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1378             unless ( @seqnos ) {
1379             carp("no Seqnos selected by $seqno_or_alias");
1380             return 0;
1381             };
1382             unless ( $options{'force'} or (@seqnos == 1) ) {
1383             carp("Use --force to purge more than one sequence (@seqnos)");
1384             return 0;
1385             };
1386             }
1387             else {
1388             carp("Use --force to purge the complete database");
1389             return 0;
1390             };
1391             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1392             DELETE FROM beacons WHERE seqno==?;
1393             XxX
1394             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1395             UPDATE OR FAIL repos SET counti=?,countu=?,utime=?,ustat=? WHERE seqno==?;
1396             XxX
1397             my $trows = 0;
1398             foreach my $seqno ( @seqnos ) {
1399             $self->stmtExplain($sthexpl, $seqno) if $ENV{'DBI_PROFILE'};
1400             my $rows = $sth->execute($seqno) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1401             $rows = "0" if $rows eq "0E0";
1402             print "INFO: $rows purged for $seqno\n" if $options{'verbose'};
1403             $trows += $rows;
1404             $self->stmtExplain($usthexpl, 0, 0, time, "purged", $seqno) if $ENV{'DBI_PROFILE'};
1405             $usth->execute(0, 0, time, "purged", $seqno)
1406             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1407             };
1408              
1409             if ( $trows or $options{'force'} ) {
1410             # if ( $options{'force'} ) {
1411             # print "[ANALYZE ..." if $options{'verbose'};
1412             # $self->{dbh}->do("ANALYZE;");
1413             # print "]\n" if $options{'verbose'};
1414             # };
1415              
1416             if ( $options{'nostat'} ) { # invalidate since they might have changed
1417             $self->admin('gcounti', undef);
1418             $self->admin('gcountu', undef);
1419             }
1420             else {
1421             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1422             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1423             }
1424             };
1425              
1426             return $trows;
1427             }
1428              
1429              
1430             =head2 Methods for headers
1431              
1432             =head3 ($rows, @oldvalues) = headerfield ( $sq_or_alias, $key [, $value] )
1433              
1434             Gets or sets an meta or admin Entry for the constituent file indicated by $sq_or_alias
1435              
1436             =cut
1437              
1438             sub headerfield {
1439             my ($self, $sq_or_alias, $key, $value) = @_;
1440              
1441             my $dbkey = "";
1442             if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
1443             }
1444             elsif ( $key =~ /_(\w+)$/ ) {
1445             $dbkey = $1}
1446             else {
1447             carp "Field $key not known";
1448             return undef;
1449             };
1450              
1451             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1452              
1453             my ($osth, $osthexpl) = $self->stmtHdl(<<"XxX");
1454             SELECT $dbkey FROM repos $cond;
1455             XxX
1456             $self->stmtExplain($osthexpl, @cval) if $ENV{'DBI_PROFILE'};
1457             $osth->execute(@cval) or croak("Could not execute >".$osth->{Statement}."<:".$osth->errstr);
1458             my $tmpval = $osth->fetchall_arrayref();
1459             my @oval = map { hEncode($_, $key) } map { (defined $_->[0]) ? ($_->[0]) : () } @$tmpval;
1460             my $rows = scalar @oval;
1461              
1462             if ( (defined $value) and ($value ne "") ) { # set
1463             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1464             UPDATE OR FAIL repos SET $dbkey=? $cond;
1465             XxX
1466             $value = hDecode($value, $key) || "";
1467             $self->stmtExplain($usthexpl, $value, @cval) if $ENV{'DBI_PROFILE'};
1468             $rows = $usth->execute($value, @cval) or croak("Could not execute >".$usth->{Statement}."<:".$usth->errstr);
1469             }
1470             elsif ( defined $value ) { # clear
1471             my ($dsth, $dsthexpl) = $self->stmtHdl(<<"XxX");
1472             UPDATE OR FAIL repos SET $dbkey=? $cond;
1473             XxX
1474             $self->stmtExplain($dsthexpl, undef, @cval) if $ENV{'DBI_PROFILE'};
1475             $rows = $dsth->execute(undef, @cval) or croak("Could not execute >".$dsth->{Statement}."<:".$dsth->errstr);
1476             }
1477             else { # read
1478             }
1479              
1480             return ($rows, @oval);
1481             }
1482              
1483             =head3 ($resultref, $metaref) = headers ( [ $seqno_or_alias ] )
1484              
1485             Iterates over all
1486              
1487             For each iteration returns two hash references:
1488              
1489             =over 8
1490              
1491             =item 1
1492             all official beacon fields
1493              
1494             =item 2
1495             all administrative fields (_alias, ...)
1496              
1497             =back
1498              
1499             =cut
1500              
1501             sub headers {
1502             my ($self, $seqno_or_alias) = @_;
1503              
1504             unless ( $self->{_iterator_info} ) {
1505             my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1506             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1507             SELECT * FROM repos $constraint;
1508             XxX
1509             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1510             $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1511             $self->{_iterator_info} = $sth;
1512             };
1513              
1514             my $info = $self->{_iterator_info}->fetchrow_hashref;
1515             unless ( defined $info ) {
1516             croak("Error listing Collections: $self->{_iterator_info}->errstr") if $self->{_iterator_info}->err;
1517             delete $self->{_iterator_info};
1518             return undef;
1519             }
1520              
1521             my $collno = $info->{seqno} || $seqno_or_alias;
1522             my %meta = (_seqno => $collno);
1523             my %result = ();
1524             while ( my($key, $val) = each %$info ) {
1525             next unless defined $val;
1526             my $pval = hEncode($val, $key);
1527              
1528             if ( $key =~ /^bc(\w+)$/ ) {
1529             $result{$1} = $pval}
1530             else {
1531             $meta{"_$key"} = $pval};
1532             }
1533             return \%result, \%meta;
1534             }
1535              
1536             =head3 listCollections ( [ $seqno_or_alias ] )
1537              
1538             Iterates over all Sequences and returns on each call an array of
1539              
1540             Seqno, Alias, Uri, Modification time, Identifier Count and Unique identifier count
1541              
1542             Returns undef if done.
1543              
1544             =cut
1545              
1546             sub listCollections {
1547             my ($self, $seqno_or_alias) = @_;
1548              
1549             unless ( $self->{_iterator_listCollections} ) {
1550             my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1551             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1552             SELECT seqno, alias, uri, mtime, counti, countu FROM repos $constraint;
1553             XxX
1554             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1555             $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1556             $self->{_iterator_listCollections} = $sth;
1557             };
1558             my $onerow = $self->{_iterator_listCollections}->fetchrow_arrayref;
1559             unless ( $onerow ) {
1560             croak("Error listing Collections: $self->{_iterator_listCollections}->errstr") if $self->{_iterator_listCollections}->err;
1561             delete $self->{_iterator_listCollections};
1562             return ();
1563             };
1564             return @$onerow;
1565             }
1566              
1567             =head2 Statistics
1568              
1569             =head3 idStat ( [ $seqno_or_alias, %options ] )
1570              
1571             Count identifiers for the given pattern.
1572              
1573             =over 8
1574              
1575             =item Supported options:
1576              
1577             distinct => (0|1)
1578              
1579             Count multiple occurences only once
1580              
1581             verbose => (0|1)
1582              
1583             =back
1584              
1585              
1586             =cut
1587              
1588             sub idStat {
1589             my ($self, $seqno_or_alias, %options) = @_;
1590             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1591             my $cond = "";
1592             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1593             $cond = "WHERE seqno==$seqno_or_alias"}
1594             elsif ( $seqno_or_alias ) {
1595             my @seqnos = $self->Seqnos('_alias', $seqno_or_alias);
1596             if ( @seqnos ) {
1597             $cond = "WHERE seqno IN (".join(",", @seqnos).")"}
1598             else {
1599             carp("no Seqnos selected by $seqno_or_alias");
1600             return 0;
1601             };
1602             };
1603             # my $count_what = $options{'distinct'} ? "DISTINCT hash" : "*";
1604             # will not be optimized by SQLite or mySQL: SELECT COUNT($count_what) FROM beacons $cond;
1605             # my $sth= $self->stmtHdl("SELECT COUNT($count_what) FROM beacons $cond LIMIT 1;");
1606             my $from = $options{'distinct'} ? "(SELECT DISTINCT hash FROM beacons $cond)"
1607             : "beacons $cond";
1608             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1609             SELECT COUNT(*) FROM $from LIMIT 1;
1610             XxX
1611             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1612             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1613             my $hits = $sth->fetchrow_arrayref;
1614              
1615             return $hits->[0] || 0;
1616             };
1617              
1618              
1619             =head3 idCounts ( [ $pattern, %options ] )
1620              
1621             Iterates through the entries according to the optional id filter expression.
1622              
1623             For each iteration the call returns a triple consisting of (identifier,
1624             number of rows, and sum of all individual counts).
1625              
1626             =over 8
1627              
1628             =item Supported options:
1629              
1630             distinct => (0|1)
1631              
1632             Count multiple occurences in one beacon file only once.
1633              
1634             =back
1635              
1636             =cut
1637              
1638             sub idCounts {
1639             my ($self, $pattern, %options) = @_;
1640             my $cond = $pattern ? qq!WHERE hash LIKE "$pattern"! : "";
1641             my $count_what = $options{'distinct'} ? "DISTINCT seqno" : "seqno";
1642             unless ( $self->{_iterator_idCounts} ) {
1643             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1644             SELECT hash, COUNT($count_what), SUM(hits) FROM beacons $cond GROUP BY hash ORDER BY hash;
1645             XxX
1646             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1647             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1648             $self->{_iterator_idCounts} = $sth;
1649             unless ( defined $self->{identifierClass} ) {
1650             my $package = $self->autoIdentifier();
1651             $options{'verbose'} && ref($package) && carp "Assuming identifiers of type ".ref($package)."\n";
1652             }
1653             };
1654             my $onerow = $self->{_iterator_idCounts}->fetchrow_arrayref;
1655             unless ( $onerow ) {
1656             croak("Error listing Collections: $self->{_iterator_idCounts}->errstr") if $self->{_iterator_idCounts}->err;
1657             delete $self->{_iterator_idCounts};
1658             return ();
1659             };
1660             if ( defined $self->{identifierClass} ) {
1661             my $c = $self->{identifierClass};
1662             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1663             $c->value("");
1664             my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1665             $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1666             };
1667             return @$onerow;
1668             };
1669              
1670              
1671             =head3 idList ( [ $pattern ] )
1672              
1673             Iterates through the entries according to the optional selection.
1674              
1675             For each iteration the call returns a tuple consisting of identifier and an
1676             list of array references (Seqno, Hits, Info, explicit Link, AltId) or the emtpy list
1677             if finished.
1678              
1679             Hits, Info, Link and AltId are normalized to the empty string if undefined (or < 2 for hits).
1680              
1681             It is important to finish all iterations before calling this method for "new" arguments:
1682              
1683             1 while $db->idList(); # flush pending results
1684              
1685             =cut
1686              
1687             sub idList {
1688             my ($self, $pattern) = @_;
1689             my $cond = $pattern ? ($pattern =~ /%/ ? "WHERE hash LIKE ?" : qq"WHERE hash=?")
1690             : "";
1691             unless ( $self->{_iterator_idList_handle} ) {
1692             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1693             SELECT hash, seqno, hits, info, link, altid FROM beacons $cond ORDER BY hash, seqno, altid;
1694             XxX
1695             $self->stmtExplain($sthexpl, ($pattern ? ($pattern) : () )) if $ENV{'DBI_PROFILE'};
1696             $sth->execute(($pattern ? ($pattern) : () )) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1697             $self->{_iterator_idList_handle} = $sth;
1698             $self->{_iterator_idList_crosscheck} = $self->RepoCols("ALTTARGET");
1699             $self->{_iterator_idList_prefetch} = undef;
1700             $self->autoIdentifier() unless defined $self->{identifierClass};
1701             };
1702             unless ( exists $self->{_iterator_idList_prefetch} ) { # deferred exit
1703             delete $self->{_iterator_idList_handle};
1704             delete $self->{_iterator_idList_crosscheck};
1705             return ();
1706             };
1707             my $pf = $self->{_iterator_idList_prefetch};
1708             while ( my $onerow = $self->{_iterator_idList_handle}->fetchrow_arrayref ) {
1709             # $onerow->[2] = "" unless $self->{_iterator_idList_crosscheck}->{$onerow->[1]}; # kill artefacts
1710             $onerow->[2] = "" unless $onerow->[2]; # kill artefacts
1711             $onerow->[3] = "" unless defined $onerow->[3]; # kill artefacts
1712             $onerow->[4] = "" unless defined $onerow->[4]; # kill artefacts
1713             $onerow->[5] = "" unless defined $onerow->[5]; # kill artefacts
1714             if ( defined $self->{identifierClass} ) {
1715             my $c = $self->{identifierClass};
1716             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1717             $c->value("");
1718             my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1719             $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1720             };
1721             if ( $pf ) {
1722             if ( $pf->[0] eq $onerow->[0] ) {
1723             push(@$pf, [@$onerow[1..@$onerow-1]]);
1724             next;
1725             }
1726             else {
1727             $self->{_iterator_idList_prefetch} = [$onerow->[0], [@$onerow[1..@$onerow-1]]];
1728             return @$pf;
1729             }
1730             }
1731             else {
1732             $pf = [$onerow->[0], [@$onerow[1..@$onerow-1]]]};
1733             };
1734            
1735             if ( $self->{_iterator_idList_handle}->err ) {
1736             croak("Error listing Collections: $self->{_iterator_idList_handle}->errstr");
1737             };
1738             delete $self->{_iterator_idList_prefetch};
1739             return $pf ? @$pf : ();
1740             };
1741              
1742              
1743             =head2 Manipulation of global metadata: Open Search Description
1744              
1745             =head3 setOSD ( $field, @values }
1746              
1747             Sets the field $field of the OpenSearchDescription to @value(s).
1748              
1749             =cut
1750              
1751             sub setOSD {
1752             my ($self) = shift;
1753             $self->clearOSD($_[0]) or return undef;
1754             return (defined $_[1]) ? $self->addOSD(@_) : 0; # value(s) to set
1755             };
1756              
1757             =head3 clearOSD ( $field }
1758              
1759             Clears the field $field of the OpenSearchDescription.
1760              
1761             =cut
1762              
1763             sub clearOSD {
1764             my ($self, $field) = @_;
1765             $field || (carp("no OSD field name provided"), return undef);
1766             defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1767             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1768             DELETE FROM osd WHERE key=?;
1769             XxX
1770             $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1771             $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1772             return 1;
1773             }
1774              
1775             =head3 addOSD ( $field, @values }
1776              
1777             Adds more @value(s) as (repeatable) field $field of the OpenSearchDescription.
1778              
1779             =cut
1780              
1781             sub addOSD {
1782             my ($self, $field, @values) = @_;
1783             $field || (carp("no OSD field name provided"), return undef);
1784             return 0 unless @values;
1785             defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1786             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1787             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1788             XxX
1789             $self->stmtExplain($sthexpl, $field, $values[0]) if $ENV{'DBI_PROFILE'};
1790             my $tstatus = [];
1791             my $tuples = $sth->execute_array({ArrayTupleStatus => $tstatus}, $field, \@values) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1792             return $tuples;
1793             }
1794              
1795             =head2 Manipulation of global metadata: Beacon Metadata
1796              
1797             These headers are used when you will be publishing a beacon file for the collection.
1798              
1799             =head3 setBeaconMeta ( $field, $value )
1800              
1801             Sets the field $field of the Beacon meta table (used to generate a BEACON file for this
1802             service) to $value.
1803              
1804             =cut
1805              
1806             sub setBeaconMeta {
1807             my ($self) = shift;
1808             $self->clearBeaconMeta(@_) or return undef;
1809             return (defined $_[1]) ? $self->addBeaconMeta(@_) : 0; # value to set
1810             };
1811              
1812             =head3 clearBeaconMeta ( $field }
1813              
1814             Deletes the field $field of the Beacon meta table.
1815              
1816             =cut
1817              
1818             sub clearBeaconMeta {
1819             my ($self, $rfield) = @_;
1820             $rfield || (carp("no Beacon field name provided"), return undef);
1821             my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1822             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1823             DELETE FROM osd WHERE key=?;
1824             XxX
1825             $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1826             $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1827             return 1;
1828             }
1829              
1830             =head3 addBeaconMeta ( $field, $value )
1831              
1832             Appends $value to the field $field of the BEACON meta table
1833              
1834             =cut
1835             sub addBeaconMeta {
1836             my ($self, $rfield, $value) = @_;
1837             $rfield || (carp("no Beacon field name provided"), return undef);
1838             my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1839             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1840             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1841             XxX
1842             $self->stmtExplain($sthexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1843             $sth->execute($field, $value) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1844             return 1;
1845             }
1846              
1847             =head3 admin ( [$field, [$value]] )
1848              
1849             Manipulates the admin table.
1850              
1851             Yields a hashref to the admin table if called without arguments.
1852              
1853             If called with $field, returns the current value, and sets the
1854             table entry to $value if defined.
1855              
1856              
1857             =cut
1858              
1859             sub admin {
1860             my ($self, $field, $value) = @_;
1861             my $admref = $self->admhash();
1862             return $admref unless $field;
1863             my $retval = $admref->{$field};
1864             return $retval unless defined $value;
1865              
1866             my ($admh, $admexpl) = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);");
1867             $self->stmtExplain($admexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1868             $admh->execute($field, $value)
1869             or croak("Could not execute update to admin table: ".$admh->errstr);
1870             return defined($retval) ? $retval : "";
1871             }
1872              
1873              
1874             # on-the-fly conversions
1875              
1876             sub hDecode { # external time to numeric timestamp, printf placeholders
1877             my ($val, $fnam) = @_;
1878             return $val unless $fnam;
1879             local($_) = (ref $val) ? $val->{$fnam} : $val;
1880             return undef unless defined $_;
1881              
1882             if ( $fnam =~ /target$/i ) { s/%/%%/g; s/(\{id\}|\$PND)/%1\$s/gi; s/(\{altid\}|\$PND)/%2\$s/gi; }
1883             elsif ( $fnam =~ /message$/i ) { s/%/%%/g; s/\{hits?\}/%s/gi; }
1884             elsif ( $fnam =~ /time|revisit/i ) {
1885             if ( /^\d+$/ ) { # legacy UNIX timestamp
1886             }
1887             elsif ( my $p = HTTP::Date::str2time($_, "GMT") ) { # all unqualified times are GMT
1888             $_ = $p}
1889             else {
1890             carp("could not parse value '$_' as time in field $fnam");
1891             return undef;
1892             };
1893             }
1894             return $_;
1895             }
1896              
1897             sub hEncode { # timestamp to beacon format
1898             my ($val, $fnam) = @_;
1899             local($_) = (ref $val) ? $val->{$fnam} : $val;
1900             return undef unless defined $_;
1901             if ( $fnam =~ /time|revisit/i ) { $_ = SeeAlso::Source::BeaconAggregator::tToISO($_) }
1902             elsif ( $fnam =~ /message/i ) { s/%s/{hits}/; s/%%/%/g; }
1903             elsif ( $fnam =~ /target/i ) { s/%s/{ID}/; s/%1\$s/{ID}/; s/%2\$s/{ALTID}/; s/%%/%/g; };
1904             return $_;
1905             }
1906              
1907             =head1 AUTHOR
1908              
1909             Thomas Berger
1910             CPAN ID: THB
1911             gymel.com
1912             THB@cpan.org
1913              
1914             =head1 COPYRIGHT
1915              
1916             This program is free software; you can redistribute
1917             it and/or modify it under the same terms as Perl itself.
1918              
1919             The full text of the license can be found in the
1920             LICENSE file included with this module.
1921              
1922             =cut
1923              
1924             1;
1925