File Coverage

blib/lib/SeeAlso/Source/BeaconAggregator/Maintenance.pm
Criterion Covered Total %
statement 631 818 77.1
branch 436 750 58.1
condition 121 256 47.2
subroutine 34 34 100.0
pod 20 23 86.9
total 1242 1881 66.0


line stmt bran cond sub pod time code
1             package SeeAlso::Source::BeaconAggregator::Maintenance;
2 10     10   431688 use strict;
  10         12  
  10         224  
3 10     10   30 use warnings;
  10         10  
  10         212  
4              
5             BEGIN {
6 10     10   27 use Exporter ();
  10         10  
  10         245  
7 10     10   28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         9  
  10         826  
8 10     10   18 $VERSION = '0.2_92';
9 10         108 @ISA = qw(Exporter);
10             #Give a hoot don't pollute, do not export more than needed by default
11 10         23 @EXPORT = qw();
12 10         16 @EXPORT_OK = qw();
13 10         209 %EXPORT_TAGS = ();
14             }
15              
16 10     10   33 use base ("SeeAlso::Source::BeaconAggregator");
  10         10  
  10         3802  
17 10     10   48 use Carp;
  10         11  
  10         424  
18 10     10   36 use HTTP::Date; # not perfect, but the module is commonly installed...
  10         12  
  10         372  
19 10     10   3263 use HTTP::Request;
  10         103705  
  10         226  
20 10     10   4865 use LWP::UserAgent;
  10         100967  
  10         251  
21 10     10   5446 use File::Temp;
  10         119697  
  10         78709  
22              
23             =head1 NAME
24              
25             sasbactrl.pl - command line interface to SeeAlso::Source::BeaconAggregator and
26             auxiliary classes
27              
28             =head1 SYNOPSIS
29              
30              
31             =head1 DESCRIPTION
32              
33             This Module allows a collection of BEACON files (cf. http://de.wikipedia.org/wiki/Wikipedia:BEACON)
34             to be used as SeeAlso::Source (probably in the context of an SeeAlso::Server application).
35             Therefore it implements the four methods documented in SeeAlso::Source
36              
37             The BEACON files (lists of non-local identifiers of a certain type documenting the coverage of a given
38             online database plus means for access) are imported by the methods provided by
39             SeeAlso::Source::BeaconAggregator::Maintenance.pm, usually by employing the script sasbactrl.pl
40             as command line client.
41              
42             Serving other formats than SeeAlso or providing a BEACON file with respect to this
43             SeeAlso service is achieved by using SeeAlso::Source::BeaconAggregator::Publisher.
44              
45              
46             =head1 USAGE
47              
48             Use the C method inherited from C to
49             access an existing database or create a new one.
50              
51              
52             =head2 Database Methods
53              
54             =head3 init( [ %options] )
55              
56             Sets up and initializes the database structure for the object.
57             This has to be done once after creating a new database and after
58             upgrading this module.
59              
60             Valid options include:
61              
62             =over 8
63              
64             =item verbose
65              
66             =item prepareRedirs
67              
68             =item identifierClass
69              
70             =back
71              
72              
73             The I table contains as columns all valid beacon fields plus
74             the following administrative fields which have to be prefixed with
75             "_" in the interface:
76              
77             =over 8
78              
79             =item seqno
80              
81             Sequence number: Is incremented on any successfull load
82              
83             =item alias
84              
85             Unique key: On update older seqences with the same alias are
86             automatically discarded. Most methods take an alias as
87             argument thus obliterating the need to determine the sequence
88             number.
89              
90             =item sort
91              
92             optional sort key
93              
94              
95             =item uri
96              
97             Overrides the #FEED header for updates
98              
99             =item ruri
100              
101             Real uri from which the last instance was loaded
102              
103              
104             =item ftime
105              
106             Fetch time: Timestamp as to when this instance was loaded
107              
108             Clear this or mtime to force automatic reload.
109              
110             =item fstat
111              
112             Short statistics line of last successful reload on update.
113              
114              
115             =item mtime
116              
117             Modification time: Timestamp of the file / HTTP object from which this instance was loaded.
118             Identical to ftime if no timestamp is provided
119              
120             Clear this or ftime to force automatic reload on update.
121              
122              
123             =item utime
124              
125             Timestamp of last update attempt
126              
127             =item ustat
128              
129             Short status line of last update attempt.
130              
131              
132             =item counti
133              
134             Identifier count
135              
136             =item countu
137              
138             Unique identifier count
139              
140              
141             =item admin
142              
143             Just to store some remarks.
144              
145             =back
146              
147             The I table stores the individual beacon entries from the input files.
148             Its columns are:
149              
150             =over 8
151              
152             =item hash
153              
154             Identifier. If a (subclass of) C instance is provided,
155             this will be transformed by the C method.
156              
157             =item seqno
158              
159             Sequence number of the beacon file in the database
160              
161             =item altid
162              
163             optional identifier from an alternative identifier system for use
164             with ALTTARGET templates.
165              
166             =item hits
167              
168             optional number of hits for this identifier in the given resource
169              
170             =item info
171              
172             optional information text
173              
174             =item link
175              
176             optional explicit URL
177              
178             =back
179              
180              
181             The I table contains C, C pairs for various metadata
182             concerning the collection as such, notably the values needed for
183             the Open Search Description and the Header fields needed in case
184             of publishing a beacon file for this collection.
185              
186             The I table stores (unique) C, C pairs for
187             general persistent data. Currently the following keys are defined:
188              
189             =over 8
190              
191             =item DATA_VERSION
192              
193             Integer version number to migrate database layout.
194              
195             =item IDENTIFIER_CLASS
196              
197             Name of the Identifier class to be used.
198              
199             =item REDIRECTION_INDEX
200              
201             Control creation of an additional index for the I column
202             (facialiates reverse lookups as needed for clustering).
203              
204             =back
205              
206              
207             =cut
208              
209             sub init {
210 9     9 1 5076 my ($self, %options) = @_;
211 9 50       40 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
212              
213 9         53 my @fieldlist = SeeAlso::Source::BeaconAggregator->beaconfields();
214 9         25 my @bf = map{ join(" ", @{[SeeAlso::Source::BeaconAggregator->beaconfields($_)]}[0..1]) } @fieldlist;
  189         145  
  189         273  
215 9 50       33 my $hdl = $self->{dbh} or croak("no database handle?");
216              
217 9         18 local($") = ",\n";
218 9 50       102 $hdl->do(<<"XxX"
219             CREATE TABLE IF NOT EXISTS repos (
220             seqno INTEGER PRIMARY KEY AUTOINCREMENT,
221             alias TEXT,
222             sort TEXT,
223             uri VARCHAR(512),
224             ruri VARCHAR(512),
225             mtime INTEGER,
226             utime INTEGER,
227             ftime INTEGER,
228             counti INTEGER DEFAULT 0,
229             countu INTEGER DEFAULT 0,
230             fstat TEXT,
231             ustat TEXT,
232             admin VARCHAR(512),
233             @bf
234             );
235             XxX
236             ) or croak("Setup error: ".$hdl->errstr);
237              
238 9 50       52167 $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS seqnos ON repos(seqno);") or croak("Setup error: ".$hdl->errstr);
239 9 50       29641 $hdl->do("CREATE INDEX IF NOT EXISTS aliases ON repos(alias);") or croak("Setup error: ".$hdl->errstr);
240              
241 9 50       28997 $hdl->do(<<"XxX"
242             CREATE TABLE IF NOT EXISTS beacons (
243             hash CHARACTER(64) NOT NULL,
244             seqno INTEGER REFERENCES repos(seqno) ON DELETE CASCADE,
245             altid TEXT,
246             hits INTEGER,
247             info VARCHAR(255),
248             link VARCHAR(1024)
249             );
250             XxX
251             ) or croak("Setup error: ".$hdl->errstr);
252              
253              
254             # Faciliate lookups
255 9 50       18354 $hdl->do("CREATE INDEX IF NOT EXISTS lookup ON beacons(hash);") or croak("Setup error: ".$hdl->errstr);
256             # maintenance and enforce constraints
257             # (Problem: Dupes w/o altid but differing in link *and* info fields should be legitimate, too)
258 9 50       16722 $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS mntnce ON beacons(seqno, hash, altid);") or croak("Setup error: ".$hdl->errstr);
259              
260             # foreign key on cascade does not work?
261              
262 9 50       22416 $hdl->do(<<"XxX"
263             CREATE TRIGGER IF NOT EXISTS on_delete_seqno BEFORE DELETE ON repos FOR EACH ROW
264             BEGIN
265             DELETE FROM beacons WHERE seqno=OLD.seqno;
266             END;
267             XxX
268             ) or croak("Setup error: ".$hdl->errstr);
269              
270             # OpenSearchDescription
271 9 50       19410 $hdl->do(<<"XxX"
272             CREATE TABLE IF NOT EXISTS osd (
273             key CHAR(20) NOT NULL,
274             val VARCHAR(1024)
275             );
276             XxX
277             ) or croak("Setup error: ".$hdl->errstr);
278 9 50       9072 $hdl->do("CREATE INDEX IF NOT EXISTS OSDKeys ON osd(key);") or croak("Setup error: ".$hdl->errstr);
279              
280             # Admin Stuff
281 9 50       8124 $hdl->do(<<"XxX"
282             CREATE TABLE IF NOT EXISTS admin (
283             key CHAR(20) PRIMARY KEY NOT NULL,
284             val VARCHAR(1024)
285             );
286             XxX
287             ) or croak("Setup error: ".$hdl->errstr);
288              
289 9 50       13068 $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS ADMKeys ON admin(key);") or croak("Setup error: ".$hdl->errstr);
290              
291 9         23470 my $admref = $self->admhash();
292              
293 9         13 my $verkey = "DATA_VERSION";
294 9         23 my $goalver = $SeeAlso::Source::BeaconAggregator::DATA_VERSION;
295 9   100     26 my $dbver = $admref->{$verkey} || 0;
296 9 100       46 if ( $dbver != $goalver ) {
    50          
297 1         36 print "NOTICE: Database version $dbver: Upgrading to $goalver\n";
298             # alter tables here
299 1 50       5 if ( $dbver < 2 ) {
300             # my ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("COUNT");
301             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
302             # ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("REMARK");
303             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
304             };
305             }
306             elsif ( $options{'verbose'} ) {
307 0         0 print "INFO: Database version $dbver is current\n"};
308              
309 9 100       28 unless ( $dbver == $goalver) {
310 1         6 my $verh = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "update version statement");
311 1 50       5069 $verh->execute($verkey, $goalver)
312             or croak("Could not execute update version statement: ".$verh->errstr);
313             };
314              
315 9 100       28 unless ( exists $options{'identifierClass'} ) {
316 3 100       10 $options{'identifierClass'} = $self->{'identifierClass'} if exists $self->{'identifierClass'};
317             };
318              
319 9         18 my $ickey = "IDENTIFIER_CLASS";
320 9 100 100     114 if ( (exists $options{identifierClass}) and (my $wanttype = ref($options{'identifierClass'})) ) {
    100 66        
321 5 100 100     62 if ( (exists $self->{identifierClass}) && (ref($self->{identifierClass}) ne $wanttype) ) {
322 1         219 croak("Cannot override identifierClass set on new()")};
323 4 100       14 if ( my $oldtype = $admref->{$ickey} ) {
324 2 50       8 croak ("Identifier mismatch: Cannot set to $wanttype since database already branded to $oldtype")
325             unless($oldtype eq $wanttype);
326             }
327             else {
328 2 50       6 print "fixing identifierClass as $wanttype\n" if $options{'verbose'};
329 2         7 my $ichdl = $self->stmtHdl("INSERT INTO admin VALUES (?, ?);", "fix identifier class statement");
330 2 50       24558 $ichdl->execute($ickey, $wanttype)
331             or croak("Could not execute fix identifier class statement: ".$ichdl->errstr);
332 2         19 $self->{identifierClass} = $options{identifierClass};
333             };
334             }
335             elsif ( (exists $options{identifierClass}) and (not $options{identifierClass}) ) {
336 2 50       7 print "removing fixed identifierClass from admin table\n" if $options{'verbose'};
337 2         7 my $ichdl = $self->stmtHdl("DELETE FROM admin WHERE key=?;", "identifier class statement");
338 2 50       68931 $ichdl->execute($ickey)
339             or croak("Could not execute remove identifier class statement: ".$ichdl->errstr);
340 2         34 delete $self->{identifierClass};
341             };
342              
343 8         22 my $rikey = "REDIRECTION_INDEX";
344 8 50 33     61 if ( exists $options{prepareRedirs} or exists $admref->{$rikey} ) {
345 0         0 my $rihdl = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "fix redirection index statement");
346 0 0 0     0 if ( $options{prepareRedirs} or ( $admref->{$rikey} and not exists $options{prepareRedirs} ) ) {
    0 0        
      0        
347 0 0 0     0 print "creating redirection index\n" if $options{prepareRedirs} and $options{'verbose'};
348 0 0       0 $hdl->do("CREATE INDEX IF NOT EXISTS redir ON beacons(altid,seqno);") or croak("Setup error: ".$hdl->errstr);
349 0 0       0 $rihdl->execute($rikey, 1)
350             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
351             }
352             elsif ( not( $admref->{$rikey} and ($options{prepareRedirs} or (not exists $options{prepareRedirs})) ) ) {
353 0 0       0 print "dropping redirection index\n" if $options{'verbose'};
354 0 0       0 $hdl->do("DROP INDEX IF EXISTS redir;") or croak("Setup error: ".$hdl->errstr);
355 0 0       0 $rihdl->execute($rikey, 0)
356             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
357             };
358             # $admref = $self->admhash();
359             }
360              
361 8 50       27 print "[ANALYZE ..." if $options{'verbose'};
362 8         96 $hdl->do("ANALYZE;");
363 8 50       243467 print "]\n" if $options{'verbose'};
364 8         182 return 1; # o.k.
365             };
366              
367              
368             =head3 deflate()
369              
370             Maintenance action: performs VACCUUM, REINDEX and ANALYZE on the database
371              
372             =cut
373              
374             sub deflate {
375 1     1 1 1548 my ($self) = @_;
376 1 50       4 my $hdl = $self->{dbh} or croak("no handle?");
377 1         13 local $hdl->{AutoCommit} = 1;
378 1         29 print "VACUUM\n";
379 1 50       5 $hdl->do("VACUUM") or carp("could not VACUUM: Skipping...");
380 1         36787 print "REINDEX\n";
381 1 50       14 $hdl->do("REINDEX") or croak("could not REINDEX: Abort");
382 1         11040 print "ANALYZE\n";
383 1 50       12 $hdl->do("ANALYZE;") or croak("could not ANALYZE: Abort");
384 1         13792 return 1;
385             }
386              
387              
388             =head2 Handling of beacon files
389              
390             =head3 loadFile ( $file, $fields, %options )
391              
392             Reads a physical beacon file and stores it with a new Sequence number in the
393             database.
394              
395             Returns a triple:
396              
397             my ($seqno, $rec_ok, $message) = loadFile ( $file, $fields, %options )
398              
399             $seqno is undef on error
400              
401             $seqno and $rec_ok are zero with $message containing an explanation in case
402             of no action taken.
403              
404             $seqno is an positive integer if something was loaded: The L
405             (internal unique identifier) for the representation of the beacon file in
406             the database.
407              
408             =over 8
409              
410             =item $file
411              
412             File to read: Must be a beacon file
413              
414             =item $fields
415              
416             Hashref with additional meta and admin fields to store
417              
418             =item Supported options:
419              
420             verbose => (0|1)
421             force => (0|1) process unconditionally without timestamp comparison
422             nostat => (0|1) don't refresh global identifier counters
423              
424             =back
425              
426             If the file does not contain a minimal correct header (eg. is an empty file
427             or an HTML error page accidentaly caught) no action is performed.
428              
429             Otherwise, a fresh SeqNo (sequence number) is generated and meta and
430             BEACON-Lines are stored in the appropriate tables in the database.
431              
432             If the _alias field is provided, existing database entries for this
433             Alias are updated, identifiers not accounted for any more are
434             eventually discarded.
435              
436             =cut
437              
438             sub loadFile {
439 9     9 1 1910 my ($self, $file, $fields, %options) = @_;
440 9 100       31 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
441 9   100     52 $options{'verbose'} ||= 0;
442              
443 9 50       163 if ( ! $file ) {
    50          
    50          
    50          
444 0         0 croak("Missing file argument")}
445             elsif ( ! -e $file ) {
446 0   0     0 print "ERROR: no such file $file\n" && return undef}
447             elsif ( ! -r _ ) {
448 0   0     0 print "ERROR: no read permissions for $file\n" && return undef}
449             elsif ( -z _ ) {
450 0         0 print "WARNING: empty file $file\n";
451 0         0 return (0,0, "empty file: Will not process");
452             }
453 9         28 my $mtime = (stat(_))[9];
454 9 50       241 open(BKN, "<:utf8", $file) or (print "ERROR: cannot read $file\n", return undef);
455 9         50 local($.) = 0;
456              
457 9 50       31 unless ( defined $self->{identifierClass} ) {
458 9         60 my $package = $self->autoIdentifier();
459 9 50 66     47 $options{'verbose'} && ref($package) && print "Assuming identifiers of type ".ref($package)."\n";
460             };
461              
462 9 50       17 $fields = {} unless $fields;
463 9   33     54 $fields->{'_ftime'} ||= time();
464 9   66     25 $fields->{'_mtime'} ||= $mtime;
465 9 100       26 delete $fields->{_uri} unless $fields->{_uri};
466 9 50       45 delete $fields->{_alias} unless $fields->{_alias};
467 9   50     23 my $autopurge = $fields->{_alias} || "";
468 9   0     25 my $showme = $fields->{_alias} || $fields->{_uri} || $file;
469              
470 9 100       19 if ( $options{'verbose'} ) {
471 4   50     16 printf("* Loading %s from URI %s\n", $fields->{_alias} || "", $fields->{_uri} || "");
      50        
472 4         123 printf("* local input %s (%s)\n", $file, SeeAlso::Source::BeaconAggregator::tToISO($mtime));
473             };
474              
475 9         13 my ($collno, $inserthandle, $replacehandle, $err, $format);
476 9         14 my ($linecount, $headerseen, $oseq) = (0, 0, 0);
477 9         21 my ($reccount, $recill, $recign, $recnil, $recupd, $recnew, $recdupl, $recdel) = (0, 0, 0, 0, 0, 0, 0, 0);
478 9         10 local($_);
479             lines:
480 9         115 while ( ) {
481 120         595 s/[ \x0d\x0a]+$//;
482 120 100       243 unless ( $linecount++ ) {
483 9 50       37 if ( s/^\x{FEFF}// ) { # BOM-Character
    50          
484             }
485             elsif ( s/^\xef\xbb\xbf// ) { # BOM-Bytes
486 0         0 print "ERROR: cannot cope with doubly UTF-8 encoded $file\n";
487 0         0 return (undef, undef, "encoding trouble")};
488 9 50       37 if ( /^\s*$/ ) {
489 0         0 print "WARNING: Discarding blank line before beacon header [$showme l.$.]\n";
490 0         0 next;
491             };
492             };
493 120 100       166 if ( not defined $collno ) { # $collno used as flag: "still in header"
494 65 100       296 if ( /^#\s*([A-Z][\w-]*):\s*(.*)$/ ) {
    50          
    100          
    50          
495 55         38 $headerseen++;
496 55         94 my ($field, $data) = ($1, $2);
497 55 50       100 $field =~ s/^DATE$/TIMESTAMP/ && print "WARNING: corrected DATE to TIMESTAMP in Beacon-Header [$showme l.$.]\n";
498 55         90 $data =~ s/\s+$//;
499 55 50       105 next if $data =~ /^\s*$/;
500 55 100       178 if ( SeeAlso::Source::BeaconAggregator->beaconfields($field) ) {
501 52 50       69 if ( $fields->{$field} ) {
502 0         0 print "WARNING: Skipping already set $field [$showme l.$.]\n"}
503             else {
504 52         297 $fields->{$field} = $data}
505             }
506             else {
507 3         43 print "WARNING: Ignoring unknown $field [$data] [$showme l.$.]\n";
508             };
509             }
510             elsif ( /^(#[^:\s]+)/ ) {
511 0         0 print "WARNING: Discarding unparseable line >$1...< in beacon header context [$showme l.$.]\n"}
512             elsif ( /^\s*$/ ) {
513 1 50       20 print "NOTICE: Discarding blank line in beacon header context [$showme l.$.]\n" if $options{'verbose'}}
514             elsif ( ! $headerseen ) {
515 0         0 print "ERROR: no header fields [$showme l.$.]\n";
516 0         0 return (0, 0, "no header fields: Will not proceed");
517             }
518             else {
519 9         40 ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
520 9 50       32 unless ( $collno ) {
521 0         0 print "ERROR: metadata error [$showme l.$.]\n";
522 0         0 return (0, 0, "metadata error: $err");
523             };
524 9         99 $self->{dbh}->{AutoCommit} = 0;
525 9         19 $linecount --;
526 9         24 redo lines;
527             }
528             }
529             else {
530 55         116 s/^\s+//; s/\s+$//;
  55         122  
531 55         42 my ($id, $altid, @rest);
532 55         369 ($id, @rest) = split(/\s*\|\s*/, $_, 4);
533 55 50       190 ($id, $altid) = split(/\s*=\s*/, $id, 2) if $id;
534 55 50       89 $id || ($recnil++, next);
535              
536 55 100       106 if ( $options{'filter'} ) {
537 14         17 ($id, $altid) = &{$options{'filter'}}($id, $altid, @rest);
  14         37  
538 14 100       112 unless ( $id ) {
539 7         6 $recign ++;
540 7 50       17 unless ( ++$reccount % 10000 ) {
541 0         0 $self->{dbh}->{AutoCommit} = 1;
542 0 0       0 print "$reccount\n" if $options{'verbose'};
543 0         0 $self->{dbh}->{AutoCommit} = 0;
544             };
545 7         26 next lines;
546             };
547             };
548 48   100     129 $altid ||= "";
549              
550 48         36 my($hits, $info, $link);
551 48 100 100     273 if ( @rest && ($rest[$#rest] =~ m!^\S+://\S+$!) ) {
    50          
552 18         26 $link = pop @rest}
553             elsif ( defined $rest[2] ) {
554 0         0 print "WARNING: unparseable link content >$rest[2]< [$showme l.$.]"};
555              
556 48 100 100     347 if ( @rest && ($rest[0] =~ /^\d*$/) ) {
    100          
    100          
557 9         14 $hits = shift @rest;
558             # really disregard hits with explicit 0?
559 9 100 66     58 if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
      100        
560 1   50     6 $altid = shift @rest || "";
561             }
562             else {
563 8   100     20 $info = shift @rest || "";
564             };
565             }
566             elsif ( defined $rest[1] ) {
567 1         2 $hits = "";
568 1 50 33     13 if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
      33        
569 1         3 $info = shift @rest;
570 1         2 $altid = shift @rest;
571             }
572             else {
573 0         0 shift @rest;
574 0         0 $info = shift @rest;
575             };
576             }
577             elsif ( defined $rest[0] ) {
578 30         30 $hits = "";
579 30         36 $info = shift @rest;
580             };
581 48 50       80 if ( @rest ) {
582 0         0 print "WARNING: unparseable content >$_< [$showme l.$.]"};
583              
584 48 100       65 unless ( $link ) {
585 30 100       118 if ( ($format =~ /\bhasTARGET\b/) ) { # ok
    50          
    0          
586             }
587             elsif ( $format =~ /\baltTARGET\b/ ) {
588 2 50       4 if ( $altid ) { # also ok
    0          
589             }
590             elsif ( $info ) { # too strange
591 0 0       0 print "WARNING: discarding >$id<".(defined $hits ? " ($hits)" : "")." for $info without link [$showme l.$.] (assertion failed)\n";
592 0         0 $recill++;
593 0         0 next lines;
594             }
595             else { # feed id as altid
596 0         0 $altid = $id}
597             }
598             elsif ( $format =~ /\bnoTARGET\b/ ) {
599 0 0       0 print "NOTICE: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.]\n" if $options{'verbose'} > 1;
    0          
600 0         0 $recill++;
601 0         0 next lines;
602             }
603             else {
604 0 0       0 print "WARNING: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.] (assertion failed)\n";
605 0         0 $recill++;
606 0         0 next lines;
607             }
608             };
609              
610 48 100       136 if ( $format !~ /\baltTARGET\b/ ) { # Allow certain duplicates (force disambiguisation)
611 4   33     20 $altid ||= $info || $link}
      33        
612              
613 48 100       81 $hits = "" unless defined $hits;
614 48 50       73 ($hits =~ /^0+/) && ($recnil++, next); # Explizit "0" => raus
615 48 100       88 $hits = 0 if $hits eq "";
616 48   100     104 $altid ||= "";
617 48         38 my $hash;
618 48 50       79 if ( defined $self->{identifierClass} ) {
619 0         0 $self->{identifierClass}->value($id);
620 0 0       0 unless ( $self->{identifierClass}->valid ) {
621 0 0       0 print "NOTICE: invalid identifier >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'};
622 0         0 $recill++;
623 0         0 next lines;
624             };
625 0         0 $hash = $self->{identifierClass}->hash();
626             }
627             else {
628 48         45 $hash = $id};
629 48         35 my $did;
630 48 100 66     2309 if ( $replacehandle && ($did = $replacehandle->execute($hits, $info, $link, $hash, $altid)) ) { # UPDATE OR FAIL old record
    50          
    0          
631 24 50       98 if ( $replacehandle->err ) {
    100          
632 0         0 carp("update in trouble: $replacehandle->errstring [$showme l.$.]");
633 0         0 $recdupl++;
634             }
635             elsif ( $did eq "0E0" ) { # not found, try insert
636 19         586 $did = $inserthandle->execute($hash, $altid, $hits, $info, $link);
637 19 100       40 if ( $did eq "0E0" ) {
638 6         7 $recdupl++;
639 6 50       9 if ( $altid ) {
640 0 0       0 print "INFO: did not insert duplicate Id >$id< = >$altid< ($hits) [$showme l.$.]\n" if $options{'verbose'}}
641             else {
642 6 50       14 print "INFO: did not insert duplicate Id >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1};
643             }
644             else {
645 13         13 $recnew++};
646             }
647             else {
648 5         6 $recupd++};
649             }
650             elsif ( $did = $inserthandle->execute($hash, $altid, $hits, $info, $link) ) { # INSERT OR IGNORE new record
651 24 100       48 if ( $did eq "0E0" ) {
652 7         6 $recdupl++;
653 7 50       17 print "INFO: did not insert duplicate Id $id ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1;
654             }
655             else {
656 17         21 $recnew++};
657             }
658             elsif ( $inserthandle->errstr =~ /constraint/ ) {
659 0         0 $recdupl++;
660 0 0       0 print "INFO: duplicate Id $id ($hits): not inserting [$showme l.$.]\n" if $options{'verbose'} > 1;
661             }
662             else {
663 0         0 croak("Could not insert: ($id, $hits, $info, $link): ".$inserthandle->errstr)};
664              
665 48 50       324 unless ( ++$reccount % 10000 ) {
666 0         0 $self->{dbh}->{AutoCommit} = 1;
667 0 0       0 print "$reccount\n" if $options{'verbose'};
668 0         0 $self->{dbh}->{AutoCommit} = 0;
669             };
670             }
671             };
672 9 50       62 if ( not defined $collno ) {
673 0 0       0 if ( $headerseen ) {
    0          
674 0         0 ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
675 0 0       0 if ( $collno ) {
676 0         0 print "WARNING: no idn content in file [$showme l.$.]\n"}
677             else {
678 0         0 print "ERROR: metadata error [$showme l.$.]\n";
679 0         0 return (0,0, "metadata error: $err");
680             };
681             }
682             elsif ( $. ) {
683 0         0 print "ERROR: no header fields [$showme l.$.]\n";
684 0         0 return (0, 0, "no header fields: Will not proceed");
685             }
686             else {
687 0         0 print "WARNING: empty file [$showme]\n";
688 0         0 return (0,0, "empty file");
689             };
690             }
691 9         85672 $self->{dbh}->{AutoCommit} = 1;
692              
693 9 50       80 if ( $autopurge ) {
694 9         166 $self->{dbh}->{AutoCommit} = 0;
695 9 100       34 if ( $oseq ) {
696 4         30 my ($bcdelh, $bcdelexpl) = $self->stmtHdl("DELETE FROM beacons WHERE seqno==?");
697 4 50       15 $self->stmtExplain($bcdelexpl, $oseq) if $ENV{'DBI_PROFILE'};
698 4 50       441 my $rows = $bcdelh->execute($oseq) or croak("Could not execute >".$bcdelh->{Statement}."<: ".$bcdelh->errstr);
699 4         4414 $self->{dbh}->{AutoCommit} = 1;
700 4 50       20 printf("INFO: Purged %s surplus identifiers from old sequence %u\n", $rows, $oseq) if $options{'verbose'};
701 4 100       16 $rows = "0" if $rows eq "0E0";
702 4         14 $recdel += $rows;
703             };
704              
705 9         41 $self->{dbh}->{AutoCommit} = 0;
706 9         62 my ($rpdelh, $rpdelexpl) = $self->stmtHdl("DELETE FROM repos WHERE (alias=?) AND (seqno
707 9 50       34 $self->stmtExplain($rpdelexpl, $autopurge, $collno) if $ENV{'DBI_PROFILE'};
708 9 50       1308 my $rows = $rpdelh->execute($autopurge, $collno) or croak("Could not execute >".$rpdelh->{Statement}."<: ".$rpdelh->errstr);
709 9         26032 $self->{dbh}->{AutoCommit} = 1;
710 9 100       60 $rows = "0" if $rows eq "0E0";
711 9 100       175 printf("INFO: %u old sequences discarded\n", $rows) if $options{'verbose'};
712             }
713              
714 9   50     490 printf "NOTICE: New sequence %u for %s: processed %u Records from %u lines\n",
715             $collno, $autopurge || "???", $reccount, $linecount;
716 9         60 my $statline = sprintf "%u replaced, %u new, %u deleted, %u duplicate, %u nil, %u invalid, %u ignored",
717             $recupd, $recnew, $recdel, $recdupl, $recnil, $recill, $recign;
718 9         157 print " ($statline)\n";
719              
720 9         21 my $recok = $recupd + $recnew;
721 9 100 66     55 my $numchg = ($recnew or $recdel) ? 1 : 0;
722              
723             # my $ct1hdl = $self->stmtHdl("SELECT COUNT(*) FROM beacons WHERE seqno==? LIMIT 1;");
724             # $ct1hdl->execute($collno) or croak("could not execute live count: ".$ct1hdl->errstr);
725             # my $ct1ref = $ct1hdl->fetchrow_arrayref();
726             # my $counti = $ct1ref->[0] || 0;
727              
728             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(DISTINCT hash) FROM beacons WHERE seqno==?");
729             # using subquery to trick SQLite into using indices
730             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(*) FROM (SELECT DISTINCT hash FROM beacons WHERE seqno==?) LIMIT 1;");
731             # $ct2hdl->execute($collno) or croak("could not execute live count: ".$ct2hdl->errstr);
732             # my $ct2ref = $ct2hdl->fetchrow_arrayref();
733             # my $countu = $ct2ref->[0] || 0;
734              
735             # combined query turned out as not as efficient
736             # my $ct0hdl = $self->stmtHdl("SELECT COUNT(*), COUNT(DISTINCT hash) FROM beacons WHERE seqno==? LIMIT 1;");
737             # $ct0hdl->execute($collno) or croak("could not execute live count: ".$ct0hdl->errstr);
738             # my $ct0ref = $ct0hdl->fetchrow_arrayref();
739             # my ($counti, $countu) = ($ct0ref->[0] || 0, $ct0ref->[1] || 0);
740              
741 9         48 my ($updh, $updexpl) = $self->stmtHdl(<<"XxX");
742             UPDATE OR FAIL repos SET counti=?,countu=?,fstat=?,utime=?,ustat=?,sort=? WHERE seqno==?;
743             XxX
744              
745 9   50     57 my $counti = $self->idStat($collno, 'distinct' => 0) || 0;
746 9 50       30 printf("WARNING: expected %u valid records, counted %u\n", $recok, $counti) if $recok != $counti;
747 9 100       26 unless ( $numchg ) {
748 1   50     7 $fields->{'_counti'} ||= 0;
749 1 50       5 printf("WARNING: expected unchanged number %u valid records, counted %u\n", $fields->{'_counti'}, $counti) if $fields->{'_counti'} != $counti;
750             };
751              
752 9   50     49 my $sort = $fields->{'_sort'} || "";
753             my $countu = $numchg ? ( $self->idStat($collno, 'distinct' => 1) || 0 )
754 9 100 50     39 : ( $fields->{'_countu'} || $self->idStat($collno, 'distinct' => 1) || 0 );
      50        
755 9 50       36 $self->stmtExplain($updexpl, $counti, $countu, $statline, time(), "successfully loaded", $sort, $collno) if $ENV{'DBI_PROFILE'};
756             $updh->execute($counti, $countu, $statline, time(), "successfully loaded", $sort, $collno)
757 9 50       84305 or croak("Could not execute >".$updh->{Statement}."<: ".$updh->errstr);
758 9         124 close(BKN);
759              
760 9 50 66     46 if ( $numchg or $options{'force'} ) {
761             # if ( $options{'force'} ) {
762             # print "[ANALYZE ..." if $options{'verbose'};
763             # $self->{dbh}->do("ANALYZE;");
764             # print "]\n" if $options{'verbose'};
765             # };
766              
767 9 50       27 if ( $options{'nostat'} ) { # invalidate since they might have changed
768 0         0 $self->admin('gcounti', undef);
769 0         0 $self->admin('gcountu', undef);
770             }
771             else {
772 9   50     57 $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
773 9   50     91 $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
774             }
775             };
776              
777 9         158 return ($collno, $recok, undef);
778             }
779              
780              
781             =head4 processbeaconheader($self, $fieldref, [ %options] )
782              
783             Internal subroutine used by C.
784              
785             =over 8
786              
787             =item $fieldref
788              
789             Hash with raw fields.
790              
791             =item Supported options:
792              
793             verbose => (0|1)
794              
795             Show seqnos of old instances which are met by the alias
796              
797             =back
798              
799              
800             =cut
801              
802             sub processbeaconheader {
803 9     9 1 25 my ($self, $fieldref, %options) = @_;
804 9         10 my $osq = 0;
805 9         9 my @carp;
806              
807 9 50       27 if ( my $alias = $fieldref->{_alias} ) {
808 9         25 my $stampfield = SeeAlso::Source::BeaconAggregator->beaconfields("TIMESTAMP");
809 9         33 my ($listh, $listexpl) = $self->stmtHdl("SELECT seqno, $stampfield, mtime, counti, countu FROM repos WHERE alias=?;");
810 9 50       22 $self->stmtExplain($listexpl, $alias) if $ENV{'DBI_PROFILE'};
811 9 50       465 $listh->execute($alias) or croak("Could not execute >".$listh->{Statement}."<: ".$listh->errstr);
812 9         18 my ($rowcnt, $ocounti, $ocountu);
813 9         82 while ( my($row) = $listh->fetchrow_arrayref ) {
814 13 100       31 last unless defined $row;
815 4         6 $rowcnt ++;
816 4         14 ($ocounti, $ocountu) = ($row->[3], $row->[4]);
817 4 50       11 if ( $options{'verbose'} ) {
818 0 0       0 print "* Old Instances for $alias:\n" unless $osq;
819 0         0 $osq = $row->[0];
820 0   0     0 print "+\t#$osq ", SeeAlso::Source::BeaconAggregator::tToISO($row->[1] || $row->[2]), " (", $row->[3] || "???", ")\n";
      0        
821             }
822             else {
823 4         18 $osq = $row->[0]};
824             }
825 9 100 66     40 if ( $rowcnt && ($rowcnt == 1) ) {
826 4 100 33     17 $fieldref->{_counti} ||= $ocounti if $ocounti;
827 4 100 33     16 $fieldref->{_countu} ||= $ocountu if $ocountu;
828             }
829             };
830              
831 9         18 my $format = "";
832 9 50 66     69 if ( $fieldref->{'FORMAT'} && $self->{accept}->{'FORMAT'} ) {
    100 33        
    50 33        
    0 0        
833 0 0       0 if ( $fieldref->{'FORMAT'} =~ $self->{accept}->{'FORMAT'} ) {
834 0         0 $format = $fieldref->{'FORMAT'}}
835             else {
836 0         0 push(@carp, "ERROR: only FORMAT '".$self->{accept}->{'FORMAT'}."' are supported, this is ".$fieldref->{'FORMAT'})}
837             }
838             elsif ( $fieldref->{'FORMAT'} ) {
839 8         17 $format = $fieldref->{'FORMAT'}}
840             elsif ( $fieldref->{'VERSION'} or $fieldref->{'TARGET'} or $fieldref->{'PREFIX'} or $fieldref->{'MESSAGE'} ) {
841 1         3 push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")}
842             elsif ( $self->{accept}->{'FORMAT'} ) {
843 0         0 push(@carp, "ERROR: header line #FORMAT is missing")}
844             else {
845 0         0 push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")};
846              
847 9 50 66     66 if ( $fieldref->{'FORMAT'} && ($fieldref->{'FORMAT'} =~ /v(?:ersion)?\s*(\d+(?:\.\d*)?)/i) ) {
848 0   0     0 $fieldref->{'VERSION'} ||= $1};
849 9 100       26 unless ( $fieldref->{'VERSION'} ) {
850 2 100       5 $fieldref->{'VERSION'} = $fieldref->{'FORMAT'} ? "0.1" : "1.0";
851 2         7 push(@carp, "NOTICE: added header field #VERSION as '".$fieldref->{'VERSION'}."'");
852             };
853 9 50       24 if ( $self->{accept}->{'VERSION'} ) {
854             ($fieldref->{'VERSION'} =~ $self->{accept}->{'VERSION'})
855 0 0       0 || push(@carp, "ERROR: only VERSION '".$self->{accept}->{'VERSION'}."' is supported, this is ".$fieldref->{'VERSION'});
856             };
857              
858 9 100       21 if ( $fieldref->{'ALTTARGET'} ) {
859 7 50       22 $fieldref->{'ALTTARGET'} = "" unless defined $fieldref->{'ALTTARGET'};
860 7         28 my $parsed = hDecode($fieldref, 'ALTTARGET');
861 7 50 33     108 if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
    0          
862 7         19 $fieldref->{'ALTTARGET'} = $parsed;
863 7         9 $format =~ s/\s*-altTARGET//;
864 7         18 $format .= " -altTARGET";
865 7 50       83 ($parsed =~ /(^|[^%])(%.)*%2\$s/) or
866             push(@carp, "WARNING: header field #ALTTARGET should contain placeholder {ALTID}");
867             }
868             elsif ( $parsed ) {
869 0         0 push(@carp, "ERROR: header field #ALTTARGET must contain placeholder {ALTID} (or {ID})");
870 0         0 delete $fieldref->{'ALTTARGET'};
871             }
872             else {
873 0         0 push(@carp, "ERROR: could not parse header field #ALTTARGET: '".$fieldref->{'ALTTARGET'}."'");
874 0         0 delete $fieldref->{'ALTTARGET'};
875             }
876             };
877              
878 9 50       27 if ( $fieldref->{'IMGTARGET'} ) {
879 0 0       0 $fieldref->{'IMGTARGET'} = "" unless defined $fieldref->{'IMGTARGET'};
880 0         0 my $parsed = hDecode($fieldref, 'IMGTARGET');
881 0 0 0     0 if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
    0          
882 0         0 $fieldref->{'IMGTARGET'} = $parsed;
883 0         0 $format =~ s/\s*-imgTARGET//;
884 0         0 $format .= " -imgTARGET";
885             }
886             elsif ( $parsed ) {
887 0         0 push(@carp, "WARNING: header field #IMGTARGET should contain placeholders {ID} or {ALTID}")}
888             else {
889 0         0 push(@carp, "ERROR: could not parse header field #IMGTARGET: '".$fieldref->{'IMGTARGET'}."'");
890 0         0 delete $fieldref->{'IMGTARGET'};
891             }
892             };
893              
894 9 50       21 if ( exists $fieldref->{'TARGET'} ) {
    0          
895 9 50       20 $fieldref->{'TARGET'} = "" unless defined $fieldref->{'TARGET'};
896 9         22 my $parsed = hDecode($fieldref, 'TARGET');
897 9 100 66     180 if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%1\$s/) && ($parsed !~ /(^|[^%])(%.)*%[2-9]\$s/) ) {
    50 66        
898 8         19 $fieldref->{'TARGET'} = $parsed;
899 8         16 $format .= " -hasTARGET";
900             }
901             elsif ( $parsed ) {
902 1 50       4 if ( exists $fieldref->{'ALTTARGET'} ) {
903 0         0 push(@carp, "ERROR: header field #TARGET must contain placeholder {ID} only");
904             }
905             else {
906 1         3 push(@carp, "WARNING: Adding implicit {ID} to #TARGET as #ALTTARGET");
907 1         5 $fieldref->{'ALTTARGET'} = $parsed."%2\$s";
908 1         2 $format .= " -altTARGET";
909             }
910 1         4 delete $fieldref->{'TARGET'};
911             }
912             else {
913 0         0 push(@carp, "ERROR: could not parse header field #TARGET: '".$fieldref->{'TARGET'}."'");
914 0         0 delete $fieldref->{'TARGET'};
915             }
916             }
917             elsif ( $format =~ /^BEACON/ ) {
918 0         0 push(@carp, "WARNING: header field #TARGET not set: ALL beacon lines will have to provide their link by other means!");
919 0         0 $format =~ s/\s*-noTARGET//;
920 0         0 $format .= " -noTARGET";
921             }
922             else {
923 0         0 push(@carp, "ERROR: header field #TARGET is mandatory")};
924              
925              
926 9 50       24 $fieldref->{'MESSAGE'} = hDecode($fieldref, 'MESSAGE') if $fieldref->{'MESSAGE'};
927              
928 9 100       18 if ( $fieldref->{'TIMESTAMP'} ) {
929 1 50       4 if ( my $parsed = hDecode($fieldref, 'TIMESTAMP') ) {
930 1 50       6 printf("* %-30s %s\n", "Beacon Timestamp:", hEncode($parsed, 'TIMESTAMP')) if $options{'verbose'};
931 1         5 $fieldref->{'TIMESTAMP'} = $parsed;
932             }
933             else { # unparseable => use current
934 0         0 push(@carp, "WARNING: cannot parse TIMESTAMP '".$fieldref->{'TIMESTAMP'}."', using current time");
935 0         0 $fieldref->{'TIMESTAMP'} = $^T;
936             };
937             }
938             else {
939             # $fieldref->{'TIMESTAMP'} = $fieldref->{'_mtime'} || $^T;
940 8         14 push(@carp, "NOTICE: no header field #TIMESTAMP detected");
941             };
942              
943 9 50       22 if ( $fieldref->{'REVISIT'} ) {
944 0 0       0 if ( my $parsed = hDecode($fieldref, 'REVISIT') ) {
945 0 0       0 if ( $parsed < $^T ) {
946 0         0 printf("* %-30s %s [%s]\n", "STALE Revisit hint parsed as", hEncode($parsed, 'REVISIT'), $fieldref->{'REVISIT'})} # if $options{'verbose'}
947             else {
948 0 0       0 printf("* %-30s %s\n", "Revisit hint parsed as", hEncode($parsed, 'REVISIT')) if $options{'verbose'}};
949 0         0 $fieldref->{'REVISIT'} = $parsed;
950             }
951             else { # unparseable => discard
952 0         0 push(@carp, "WARNING: cannot parse #REVISIT '".$fieldref->{'REVISIT'}."', discarding");
953 0         0 delete $fieldref->{'REVISIT'};
954             };
955             }
956             else {
957 9         15 push(@carp, "INFO: no header field #REVISIT detected");
958             };
959              
960 9         13 my $cancontinue = 1;
961 9         9 my $err = "";
962 9         24 foreach ( @carp ) {
963 21         567 print "$_\n";
964 21 50       70 if ( s/^ERROR: // ) {
965 0         0 $cancontinue = 0;
966 0 0       0 $err .= " | " if $err;
967 0         0 $err .= $_;
968             };
969             }
970 9 0 33     25 unless ( $cancontinue or $options{'ignore-header-errors'} ) {
971 0         0 print "CRITICAL: Aborting because of Header Errors\n";
972 0         0 return (undef, $err, $format);
973             };
974              
975 9   66     29 $fieldref->{'_uri'} ||= $fieldref->{'FEED'};
976 9 100       38 delete $fieldref->{'_uri'} unless $fieldref->{'_uri'};
977              
978 9   0     41 $fieldref->{'_alias'} ||= $fieldref->{'FEED'} || $fieldref->{'TARGET'};
      33        
979              
980 9         13 my (@fn, @fd);
981 9         49 while ( my ($key, $val) = each %$fieldref ) {
982 109 100       148 next unless defined $val;
983 105         68 my $dbkey = "";
984 105 100       182 if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
    50          
985 54         55 push(@fn, $dbkey)}
986             elsif ( $key =~ /_(\w+)$/ ) {
987 51         91 push(@fn, $1)}
988             else {
989 0         0 next};
990 105         96 my $myval = $val;
991 105 100       279 unless ( $myval =~ /^\d+$/ ) {
992 82         163 $myval =~ s/'/''/g;
993 82         150 $myval = "'".$myval."'";
994             };
995 105         300 push(@fd, $myval);
996             };
997 9         16 local($") = ",\n";
998 9         86 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
999             INSERT INTO repos ( seqno, @fn ) VALUES ( NULL, @fd );
1000             XxX
1001 9 50       28 $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1002 9 50       122228 $sth->execute() or croak("Could not execute >".$sth->{Statement}."<:".$sth->errstr);
1003 9         154 my $collno = $self->{dbh}->last_insert_id("", "", "", "");
1004              
1005 9         13 my $rhandle;
1006 9 100       37 if ( $osq ) {
1007 4         67 $rhandle = $self->stmtHdl(<<"XxX");
1008             UPDATE OR FAIL beacons SET seqno=$collno, hits=?, info=?, link=? WHERE hash=? AND seqno==$osq AND altid=?;
1009             XxX
1010             };
1011 9         95 my $ihandle = $self->stmtHdl(<<"XxX");
1012             INSERT OR IGNORE INTO beacons ( hash, seqno, altid, hits, info, link ) VALUES (?, $collno, ?, ?, ?, ?);
1013             XxX
1014 9         123 return ($collno, "", $format, $ihandle, $rhandle, $osq);
1015             }
1016              
1017              
1018              
1019              
1020             my ($lwpcarp817, $lwpcarp827);
1021              
1022             =head3 update ($sq_or_alias, $params, %options)
1023              
1024             Loads a beacon file into the database, possibly replacing a previous instance.
1025              
1026             Some magic is employed to autoconvert ISO-8859-1 or doubly UTF-8 encoded files
1027             back to UTF-8.
1028              
1029             Returns undef, if something goes wrong, or the file was not modified since,
1030             otherwise returns a pair (new seqence number, number of lines imported).
1031              
1032              
1033             =over 8
1034              
1035              
1036             =item $sq_or_alias
1037              
1038             Sequence number or alias: Used to determine an existing instance.
1039              
1040              
1041             =item $params
1042              
1043             Hashref, containing
1044              
1045             agent => LWP::UserAgent to use
1046             _uri => Feed URL to load from
1047              
1048             =item %options
1049              
1050             Hash, propagated to C
1051              
1052             verbose => (0|1)
1053             force => (0|1) process unconditionally without timestamp comparison
1054             nostat => (0|1) don't refresh global identifier counters
1055              
1056             =back
1057              
1058             Incorporates a new beacon source from a URI in the database or updates an existing one.
1059             For HTTP URIs care is taken not to reload an unmodified BEACON feed (unless the 'force'
1060             option is provided).
1061              
1062             If the feed appears to be newer than the previously loaded version it is fetched,
1063             some UTF-8 adjustments are performed if necessary, then it is stored to a temporary file
1064             and from there finally processed by the C method above.
1065              
1066             The URI to load is determined by the following order of precedence:
1067              
1068             =over 8
1069              
1070             =item 1
1071              
1072             _uri Option
1073              
1074             =item 2
1075              
1076             admin field uri stored in the database
1077              
1078             =item 3
1079              
1080             meta field #FEED taken from the database
1081              
1082             =back
1083              
1084             Typical use is with an alias, not with a sequence number:
1085              
1086             $db->update('whatever');
1087              
1088             Can be used to initially load beacon files from URIs:
1089              
1090             $db->update("new_alias", {_uri => $file_uri} );
1091              
1092             =cut
1093              
1094             sub update {
1095 9     9 1 61348 my ($self, $sq_or_alias, $params, %options) = @_;
1096 9 100       37 $params = {} unless $params;
1097 9 100       37 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1098              
1099 9         13 my $ua = $params->{'agent'};
1100 9 50       25 unless ( $ua ) {
1101 9         70 require LWP::UserAgent;
1102 9 50       85 $ua = LWP::UserAgent->new(agent => "SA-S-BeaconAggregator ", # end with space to get default agent appended
1103             env_proxy => 1,
1104             timeout => 300,
1105             ) or croak("cannot create UserAgent");
1106             };
1107              
1108 9         14585 my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1109 9 100       40 my $alias = ($sq_or_alias =~ /^\d+$/) ? "" : $sq_or_alias;
1110 9         61 my $feedname = SeeAlso::Source::BeaconAggregator->beaconfields("FEED");
1111 9         64 my ($ssth, $ssthexpl) = $self->stmtHdl(<<"XxX");
1112             SELECT seqno, uri, alias, $feedname, ftime, mtime, sort FROM repos $cond;
1113             XxX
1114 9 50       33 $self->stmtExplain($ssthexpl, @cval) if $ENV{'DBI_PROFILE'};
1115 9 50       807 $ssth->execute(@cval) or croak("Could not execute >".$ssth->{Statement}."<: ".$ssth->errstr);
1116 9 50       55 croak("Select old instance error: ".$ssth->errstr) if $ssth->err;
1117 9         96 my $aryref = $ssth->fetchrow_arrayref;
1118 9 100       35 my ($osq, $ouri, $oalias, $feed, $fetchtime, $modtime, $osort) = $aryref ? @$aryref : ();
1119              
1120 9   33     58 my $uri = $params->{'_uri'} || $ouri || $feed;
1121 9 50       49 croak("Cannot update $sq_or_alias: URI not given nor determinable from previous content") unless $uri;
1122 9         57 $uri =~ s/\s$//;
1123 9   50     42 $alias ||= $oalias || "";
      66        
1124              
1125 9 100       29 print "Requesting $uri\n" if $options{'verbose'};
1126 9 50       186 my $rq = HTTP::Request->new('GET', $uri, ['Accept' => 'text/*']) or croak("could not construct request from $uri");
1127 9 100 100     5284 if ( $fetchtime && $modtime && !$options{'force'} ) { # allow force-reload by deleting _ftime or _mtime
      100        
1128 1 50       5 printf(" %-30s %s\n", "Old instance stamped", scalar localtime($modtime)) if $options{'verbose'};
1129 1         6 $rq->header('If-Modified-Since', HTTP::Date::time2str($modtime));
1130             };
1131 9 50       105 if ( $rq->can("accept_decodable") ) { # LWP 5.817 and newer
1132 9         26 $rq->accept_decodable}
1133             else {
1134 0 0 0     0 carp("please upgrade to LWP >= 5.817 for compression negotiation") if $options{'verbose'} && (!$lwpcarp817++)};
1135              
1136 9         66435 my $response = $ua->request($rq); # Well, we hoggishly slurp everything into memory,
1137             # however explicit decompression of an already dumped result would be PITA
1138 9         21374 my $nuri = ($response->request)->uri;
1139 9 50       108 print "NOTICE: Differing result URI: $nuri\n" if $uri ne $nuri;
1140 9 100       76 if ( $response->is_success ) {
    50          
1141 8 100       390 print $osq ? "INFO: refreshing $alias sq $osq from $uri\n"
1142             : "INFO: importing previously unseen $alias from $uri\n";
1143 8         102 my $charset;
1144 8 50       42 if ( $response->can("content_charset") ) { # LWP 5.827 and above
1145 8         27 $charset = $response->content_charset;
1146 8 100 33     9263 print "DEBUG: Content charset is $charset\n" if $charset && $options{'verbose'};
1147             }
1148             else {
1149 0 0 0     0 carp("please upgrade to LWP >= 5.827 for better detection of content_charset") if $options{'verbose'} && (!$lwpcarp827++)};
1150 8   50     24 $charset ||= "UTF-8";
1151              
1152 8         35 my $lm = $response->last_modified;
1153 8 100 33     944 printf(" %-30s %s\n", "Last modified", scalar localtime($lm)) if $lm && $options{'verbose'};
1154 8   33     24 $lm ||= $^T;
1155              
1156 8         43 my $vt = $response->fresh_until(h_min => 1800, h_max => 30 * 86400);
1157 8 100 33     2820 printf(" %-30s %s\n", "Should be valid until", scalar localtime($vt)) if $vt && $options{'verbose'};
1158 8   50     35 $vt ||= 0;
1159              
1160             # temporary file for dumped contents
1161 8 50       45 my ($tmpfh, $tmpfile) = File::Temp::tempfile("BeaconAggregator-XXXXXXXX", SUFFIX => ".txt", TMPDIR => 1) or croak("Could not acquire temporary file for storage");
1162 8         3004 my $contref; # reference to content buffer
1163 8 50       35 if ( ! $response->content_is_text ) {
1164 0         0 my $ct = $response->content_type;
1165 0         0 print "WARNING: Response content is $ct, not text/*\n";
1166 0 0       0 if ( my $ce = $response->content_encoding ) {
1167 0         0 print "NOTICE: Response is also Content-encoded: $ce\n"}
1168 0         0 my $ctt = join("|", $response->decodable());
1169 0 0       0 if ( $ct =~ s!^(.+\/)?($ctt)$!$2! ) {
1170             # yes: decode anyway since it could be a gzip-encoded .txt.gz file!
1171 0         0 my $cr = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1172 0         0 $response->remove_content_headers;
1173 0         0 my $newresp = HTTP::Response->new($response->code, $response->message, $response->headers);
1174 0         0 $newresp->content_type("text/plain; charset: $charset");
1175 0         0 $newresp->content_encoding($ct);
1176 0         0 $newresp->content_ref($cr);
1177 0         0 $response = $newresp;
1178             }
1179             };
1180 8         219 $contref = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1181              
1182 8 50       9138 if ( $$contref =~ /^\x{FFEF}/ ) { # properly encoded BOM => put Characters to file
    50          
    100          
    50          
1183 0         0 binmode($tmpfh, ":utf8");
1184 0         0 print "INFO: properly encoded BOM detected: Groked UTF8\n"; # if $options{'verbose'};
1185             }
1186             elsif ( $$contref =~ s/^\xef\xbb\xbf// ) { # BOM Bytes => put Bytes to file, re-read as UTF-8
1187 0         0 print "INFO: Byte coded BOM detected: trying to restitute character semantics\n"; # if $options{'verbose'};
1188 0 0       0 print "INFO: Length is ", length($$contref), " ", (utf8::is_utf8($$contref) ? "characters" : "bytes"), "\n";
1189 0         0 binmode($tmpfh, ":bytes");
1190 0 0       0 utf_deduplicate($contref) && binmode($tmpfh, ":utf8");
1191             }
1192             elsif ( utf8::is_utf8($$contref) ) { # already Upgraded strings should be written as utf-8
1193 7 100       88 print "INFO: UTF8-ness already established\n" if $options{'verbose'};
1194 7         44 binmode($tmpfh, ":utf8");
1195 7         41 utf_deduplicate($contref); # but don't trust it (older LWP with file URLs, ...)
1196             }
1197             elsif ( utf8::decode($$contref) ) { # everything in character semantics now
1198 1 50       26 print "INFO: Could decode bytes to UTF8-characters\n" if $options{'verbose'};
1199 1         6 binmode($tmpfh, ":utf8");
1200             }
1201             else { # leave it alone
1202 0         0 print "WARNING: No clue about character encoding: Assume ISO 8859-1\n"; # if $options{'verbose'};
1203 0         0 binmode($tmpfh, ":utf8");
1204             };
1205 8         74 print $tmpfh $$contref;
1206 8         320 close($tmpfh);
1207             # early cleanup since everything might be huge....
1208 8         15 $contref = $response = undef;
1209              
1210 8         142 my ($collno, $count, $statref) = $self->loadFile($tmpfile, {_alias => $alias, _uri => $uri, _ruri => $nuri, _mtime => $lm, _sort => $osort}, %options);
1211 8 50 33     106 if ( ! $collno && $osq ) {
1212 0         0 my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1213             UPDATE OR FAIL repos SET utime=?,ustat=? WHERE seqno==?;
1214             XxX
1215 0 0       0 $self->stmtExplain($usthexpl, time(), ($statref ? "load error: $statref" : "internal error"), $osq) if $ENV{'DBI_PROFILE'};
    0          
1216             $usth->execute(time(), ($statref ? "load error: $statref" : "internal error"), $osq)
1217 0 0       0 or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
    0          
1218             };
1219              
1220 8 50       606 unlink($tmpfile) if -f $tmpfile;
1221 8 50       416 return $collno ? ($collno, $count) : undef;
1222             }
1223             elsif ( $response->code == 304 ) {
1224 1 50       16 print "INFO: $alias not modified since ".HTTP::Date::time2str($modtime)."\n" if $options{'verbose'};
1225 1         5 my $vt = $response->fresh_until(h_min => 1800, h_max => 6 * 86400);
1226 1 50       276 printf(" %-30s %s\n", "Will not try again before", scalar localtime($vt)) if $options{'verbose'};
1227 1         4 my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1228             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1229             XxX
1230 1 50       5 $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1231             $usth->execute(time(), $response->status_line, $nuri, $osq)
1232 1 50       7 or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1233 1         6594 return undef;
1234             }
1235             else {
1236 0         0 print "WARNING: No access to $uri for $alias [".$response->status_line."]\n";
1237 0         0 print $response->headers_as_string, "\n";
1238 0 0       0 return undef unless $osq;
1239 0         0 my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1240             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1241             XxX
1242 0 0       0 $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1243             $usth->execute(time(), $response->status_line, $nuri, $osq)
1244 0 0       0 or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1245 0         0 return undef;
1246             };
1247             }
1248              
1249              
1250              
1251             sub utf_deduplicate {
1252 7     7 0 16 my ($success, $stringref) = (0, @_);
1253 7 50       33 if ( utf8::downgrade($$stringref, 1) ) { # 1 = FAIL_OK
1254 7         14 my $prevlength = length($$stringref);
1255 7 50       316 print "INFO: Downgrade was possible, length now $prevlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1256 7         42 while ( utf8::decode($$stringref) ) {
1257 0         0 $success ++;
1258 0         0 my $newlength = length($$stringref);
1259 0 0       0 print "DEBUG: Reassembling as UTF-8 succeeded, length now $newlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1260 0 0       0 last if $newlength == $prevlength;
1261 0         0 $prevlength = $newlength;
1262             # last unless utf8::downgrade($$stringref, 1);
1263             }
1264             }
1265             else {
1266 0         0 print "WARNING: no downgrade possible, proceed with byte semantics";
1267             };
1268 7         11 return $success;
1269             }
1270              
1271             =head3 unload ( [ $seqno_or_alias, %options ] )
1272              
1273             Deletes the sequence(s).
1274              
1275             =over 8
1276              
1277             =item $seqno_or_alias
1278              
1279             numeric sequence number, Alias or SQL pattern.
1280              
1281             =item Supported options:
1282              
1283             force => (0|1)
1284              
1285             Needed to purge the complete database ($seqno_or_alias empty) or to purge
1286             more than one sequence ($seqno_or_alias yields more than one seqno).
1287              
1288             =back
1289              
1290              
1291             =cut
1292              
1293             sub unload {
1294 3     3 1 942 my ($self, $seqno_or_alias, %options) = @_;
1295 3 50       20 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1296              
1297 3         7 my @seqnos = ();
1298 3 50 33     43 if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
    50 33        
1299 0         0 @seqnos = ($seqno_or_alias)}
1300             elsif ( $seqno_or_alias || $options{'force'} ) {
1301 3 50       22 @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1302 3 50       12 unless ( @seqnos ) {
1303 0         0 carp("no Seqnos selected by $seqno_or_alias");
1304 0         0 return 0;
1305             };
1306 3 50 33     25 unless ( $options{'force'} or (@seqnos == 1) ) {
1307 0         0 carp("Use --force to purge more than one sequence (@seqnos)");
1308 0         0 return 0;
1309             };
1310             }
1311             else {
1312 0         0 carp("Use --force to purge the complete database");
1313 0         0 return 0;
1314             };
1315              
1316 3 50       11 if ( $options{'force'} ) {
1317 0         0 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1318             DELETE FROM beacons WHERE seqno==?;
1319             XxX
1320 0         0 foreach my $seqno ( @seqnos ) {
1321 0 0       0 $self->stmtExplain($sthexpl, $seqno_or_alias) if $ENV{'DBI_PROFILE'};
1322 0 0       0 my $rows = $sth->execute($seqno_or_alias) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1323 0 0       0 print "INFO: $rows forced for $seqno\n" if $options{'verbose'};
1324             };
1325             };
1326              
1327 3         17 my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1328 3         16 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1329             DELETE FROM repos $cond;
1330             XxX
1331 3 50       12 $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1332 3 50       82491 my $rows = $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1333 3 50       30 $rows = 0 if $rows eq "0E0";
1334              
1335 3 50 33     18 if ( $rows or $options{'force'} ) {
1336             # if ( $options{'force'} ) {
1337             # print "[ANALYZE ..." if $options{'verbose'};
1338             # $self->{dbh}->do("ANALYZE;");
1339             # print "]\n" if $options{'verbose'};
1340             # };
1341              
1342 3 50       20 if ( $options{'nostat'} ) { # invalidate since they might have changed
1343 0         0 $self->admin('gcounti', undef);
1344 0         0 $self->admin('gcountu', undef);
1345             }
1346             else {
1347 3   50     39 $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1348 3   50     28 $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1349             }
1350             };
1351              
1352 3         49 return $rows;
1353             }
1354              
1355              
1356             =head3 purge ( $seqno_or_alias[, %options ] )
1357              
1358             Deletes all identifiers from the database to the given pattern,
1359             but leaves the stored header information intact, such that it
1360             can be updated automatically.
1361              
1362             =over 8
1363              
1364             =item $seqno_or_alias
1365              
1366             Pattern
1367              
1368             =item Supported options:
1369              
1370             force => (0|1)
1371              
1372             Allow purging of more than one sequence.
1373              
1374             =back
1375              
1376              
1377             =cut
1378              
1379             sub purge {
1380 2     2 1 4405 my ($self, $seqno_or_alias, %options) = @_;
1381 2 50       11 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1382 2         101 my @seqnos;
1383 2 50 33     33 if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
    50 33        
1384 0         0 @seqnos = ($seqno_or_alias)}
1385             elsif ( $seqno_or_alias || $options{'force'} ) {
1386 2 50       32 @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1387 2 50       7 unless ( @seqnos ) {
1388 0         0 carp("no Seqnos selected by $seqno_or_alias");
1389 0         0 return 0;
1390             };
1391 2 50 33     16 unless ( $options{'force'} or (@seqnos == 1) ) {
1392 0         0 carp("Use --force to purge more than one sequence (@seqnos)");
1393 0         0 return 0;
1394             };
1395             }
1396             else {
1397 0         0 carp("Use --force to purge the complete database");
1398 0         0 return 0;
1399             };
1400 2         10 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1401             DELETE FROM beacons WHERE seqno==?;
1402             XxX
1403 2         8 my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1404             UPDATE OR FAIL repos SET counti=?,countu=?,utime=?,ustat=? WHERE seqno==?;
1405             XxX
1406 2         5 my $trows = 0;
1407 2         4 foreach my $seqno ( @seqnos ) {
1408 2 50       6 $self->stmtExplain($sthexpl, $seqno) if $ENV{'DBI_PROFILE'};
1409 2 50       42422 my $rows = $sth->execute($seqno) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1410 2 50       17 $rows = "0" if $rows eq "0E0";
1411 2 50       12 print "INFO: $rows purged for $seqno\n" if $options{'verbose'};
1412 2         6 $trows += $rows;
1413 2 50       12 $self->stmtExplain($usthexpl, 0, 0, time, "purged", $seqno) if $ENV{'DBI_PROFILE'};
1414             $usth->execute(0, 0, time, "purged", $seqno)
1415 2 50       20066 or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1416             };
1417              
1418 2 50 33     32 if ( $trows or $options{'force'} ) {
1419             # if ( $options{'force'} ) {
1420             # print "[ANALYZE ..." if $options{'verbose'};
1421             # $self->{dbh}->do("ANALYZE;");
1422             # print "]\n" if $options{'verbose'};
1423             # };
1424              
1425 2 50       14 if ( $options{'nostat'} ) { # invalidate since they might have changed
1426 0         0 $self->admin('gcounti', undef);
1427 0         0 $self->admin('gcountu', undef);
1428             }
1429             else {
1430 2   50     21 $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1431 2   50     16 $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1432             }
1433             };
1434              
1435 2         38 return $trows;
1436             }
1437              
1438              
1439             =head2 Methods for headers
1440              
1441             =head3 ($rows, @oldvalues) = headerfield ( $sq_or_alias, $key [, $value] )
1442              
1443             Gets or sets an meta or admin Entry for the constituent file indicated by $sq_or_alias
1444              
1445             =cut
1446              
1447             sub headerfield {
1448 9     9 1 9295 my ($self, $sq_or_alias, $key, $value) = @_;
1449              
1450 9         13 my $dbkey = "";
1451 9 100       40 if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
    50          
1452             }
1453             elsif ( $key =~ /_(\w+)$/ ) {
1454 5         14 $dbkey = $1}
1455             else {
1456 0         0 carp "Field $key not known";
1457 0         0 return undef;
1458             };
1459              
1460 9         33 my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1461              
1462 9         53 my ($osth, $osthexpl) = $self->stmtHdl(<<"XxX");
1463             SELECT $dbkey FROM repos $cond;
1464             XxX
1465 9 50       24 $self->stmtExplain($osthexpl, @cval) if $ENV{'DBI_PROFILE'};
1466 9 50       606 $osth->execute(@cval) or croak("Could not execute >".$osth->{Statement}."<:".$osth->errstr);
1467 9         132 my $tmpval = $osth->fetchall_arrayref();
1468 9 100       22 my @oval = map { hEncode($_, $key) } map { (defined $_->[0]) ? ($_->[0]) : () } @$tmpval;
  7         20  
  9         34  
1469 9         12 my $rows = scalar @oval;
1470              
1471 9 100 66     48 if ( (defined $value) and ($value ne "") ) { # set
    50          
1472 3         18 my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1473             UPDATE OR FAIL repos SET $dbkey=? $cond;
1474             XxX
1475 3   100     12 $value = hDecode($value, $key) || "";
1476 3 50       8 $self->stmtExplain($usthexpl, $value, @cval) if $ENV{'DBI_PROFILE'};
1477 3 50       32698 $rows = $usth->execute($value, @cval) or croak("Could not execute >".$usth->{Statement}."<:".$usth->errstr);
1478             }
1479             elsif ( defined $value ) { # clear
1480 0         0 my ($dsth, $dsthexpl) = $self->stmtHdl(<<"XxX");
1481             UPDATE OR FAIL repos SET $dbkey=? $cond;
1482             XxX
1483 0 0       0 $self->stmtExplain($dsthexpl, undef, @cval) if $ENV{'DBI_PROFILE'};
1484 0 0       0 $rows = $dsth->execute(undef, @cval) or croak("Could not execute >".$dsth->{Statement}."<:".$dsth->errstr);
1485             }
1486             else { # read
1487             }
1488              
1489 9         75 return ($rows, @oval);
1490             }
1491              
1492             =head3 ($resultref, $metaref) = headers ( [ $seqno_or_alias ] )
1493              
1494             Iterates over all
1495              
1496             For each iteration returns two hash references:
1497              
1498             =over 8
1499              
1500             =item 1
1501             all official beacon fields
1502              
1503             =item 2
1504             all administrative fields (_alias, ...)
1505              
1506             =back
1507              
1508             =cut
1509              
1510             sub headers {
1511 5     5 1 7167 my ($self, $seqno_or_alias) = @_;
1512              
1513 5 100       12 unless ( $self->{_iterator_info} ) {
1514 1         4 my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1515 1         5 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1516             SELECT * FROM repos $constraint;
1517             XxX
1518 1 50       5 $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1519 1 50       47 $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1520 1         6 $self->{_iterator_info} = $sth;
1521             };
1522              
1523 5         209 my $info = $self->{_iterator_info}->fetchrow_hashref;
1524 5 100       21 unless ( defined $info ) {
1525 1 50       6 croak("Error listing Collections: $self->{_iterator_info}->errstr") if $self->{_iterator_info}->err;
1526 1         1 delete $self->{_iterator_info};
1527 1         3 return undef;
1528             }
1529              
1530 4   33     9 my $collno = $info->{seqno} || $seqno_or_alias;
1531 4         9 my %meta = (_seqno => $collno);
1532 4         5 my %result = ();
1533 4         13 while ( my($key, $val) = each %$info ) {
1534 136 100       218 next unless defined $val;
1535 71         68 my $pval = hEncode($val, $key);
1536              
1537 71 100       109 if ( $key =~ /^bc(\w+)$/ ) {
1538 25         85 $result{$1} = $pval}
1539             else {
1540 46         135 $meta{"_$key"} = $pval};
1541             }
1542 4         24 return \%result, \%meta;
1543             }
1544              
1545             =head3 listCollections ( [ $seqno_or_alias ] )
1546              
1547             Iterates over all Sequences and returns on each call an array of
1548              
1549             Seqno, Alias, Uri, Modification time, Identifier Count and Unique identifier count
1550              
1551             Returns undef if done.
1552              
1553             =cut
1554              
1555             sub listCollections {
1556 28     28 1 14751 my ($self, $seqno_or_alias) = @_;
1557              
1558 28 100       73 unless ( $self->{_iterator_listCollections} ) {
1559 7         26 my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1560 7         33 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1561             SELECT seqno, alias, uri, mtime, counti, countu FROM repos $constraint;
1562             XxX
1563 7 50       32 $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1564 7 50       433 $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1565 7         24 $self->{_iterator_listCollections} = $sth;
1566             };
1567 28         268 my $onerow = $self->{_iterator_listCollections}->fetchrow_arrayref;
1568 28 100       63 unless ( $onerow ) {
1569 7 50       57 croak("Error listing Collections: $self->{_iterator_listCollections}->errstr") if $self->{_iterator_listCollections}->err;
1570 7         11 delete $self->{_iterator_listCollections};
1571 7         19 return ();
1572             };
1573 21         81 return @$onerow;
1574             }
1575              
1576             =head2 Statistics
1577              
1578             =head3 idStat ( [ $seqno_or_alias, %options ] )
1579              
1580             Count identifiers for the given pattern.
1581              
1582             =over 8
1583              
1584             =item Supported options:
1585              
1586             distinct => (0|1)
1587              
1588             Count multiple occurences only once
1589              
1590             verbose => (0|1)
1591              
1592             =back
1593              
1594              
1595             =cut
1596              
1597             sub idStat {
1598 61     61 1 9076 my ($self, $seqno_or_alias, %options) = @_;
1599 61 50       248 $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1600 61         96 my $cond = "";
1601 61 100 100     369 if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
    100          
1602 17         32 $cond = "WHERE seqno==$seqno_or_alias"}
1603             elsif ( $seqno_or_alias ) {
1604 4         30 my @seqnos = $self->Seqnos('_alias', $seqno_or_alias);
1605 4 50       10 if ( @seqnos ) {
1606 4         19 $cond = "WHERE seqno IN (".join(",", @seqnos).")"}
1607             else {
1608 0         0 carp("no Seqnos selected by $seqno_or_alias");
1609 0         0 return 0;
1610             };
1611             };
1612             # my $count_what = $options{'distinct'} ? "DISTINCT hash" : "*";
1613             # will not be optimized by SQLite or mySQL: SELECT COUNT($count_what) FROM beacons $cond;
1614             # my $sth= $self->stmtHdl("SELECT COUNT($count_what) FROM beacons $cond LIMIT 1;");
1615 61 100       213 my $from = $options{'distinct'} ? "(SELECT DISTINCT hash FROM beacons $cond)"
1616             : "beacons $cond";
1617 61         389 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1618             SELECT COUNT(*) FROM $from LIMIT 1;
1619             XxX
1620 61 50       168 $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1621 61 50       3456 $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1622 61         652 my $hits = $sth->fetchrow_arrayref;
1623              
1624 61   50     418 return $hits->[0] || 0;
1625             };
1626              
1627              
1628             =head3 idCounts ( [ $pattern, %options ] )
1629              
1630             Iterates through the entries according to the optional id filter expression.
1631              
1632             For each iteration the call returns a triple consisting of (identifier,
1633             number of rows, and sum of all individual counts).
1634              
1635             =over 8
1636              
1637             =item Supported options:
1638              
1639             distinct => (0|1)
1640              
1641             Count multiple occurences in one beacon file only once.
1642              
1643             =back
1644              
1645             =cut
1646              
1647             sub idCounts {
1648 64     64 1 48054 my ($self, $pattern, %options) = @_;
1649 64 50       112 my $cond = $pattern ? qq!WHERE hash LIKE "$pattern"! : "";
1650 64 100       96 my $count_what = $options{'distinct'} ? "DISTINCT seqno" : "seqno";
1651 64 100       118 unless ( $self->{_iterator_idCounts} ) {
1652 8         37 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1653             SELECT hash, COUNT($count_what), SUM(hits) FROM beacons $cond GROUP BY hash ORDER BY hash;
1654             XxX
1655 8 50       23 $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1656 8 50       743 $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1657 8         23 $self->{_iterator_idCounts} = $sth;
1658 8 100       25 unless ( defined $self->{identifierClass} ) {
1659 2         10 my $package = $self->autoIdentifier();
1660 2 50 33     7 $options{'verbose'} && ref($package) && carp "Assuming identifiers of type ".ref($package)."\n";
1661             }
1662             };
1663 64         1012 my $onerow = $self->{_iterator_idCounts}->fetchrow_arrayref;
1664 64 100       114 unless ( $onerow ) {
1665 8 50       51 croak("Error listing Collections: $self->{_iterator_idCounts}->errstr") if $self->{_iterator_idCounts}->err;
1666 8         12 delete $self->{_iterator_idCounts};
1667 8         24 return ();
1668             };
1669 56 100       105 if ( defined $self->{identifierClass} ) {
1670 42         39 my $c = $self->{identifierClass};
1671             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1672 42         103 $c->value("");
1673 42   100     1230 my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1674 42 100       2011 $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1675             };
1676 56         465 return @$onerow;
1677             };
1678              
1679              
1680             =head3 idList ( [ $pattern ] )
1681              
1682             Iterates through the entries according to the optional selection.
1683              
1684             For each iteration the call returns a tuple consisting of identifier and an
1685             list of array references (Seqno, Hits, Info, explicit Link, AltId) or the emtpy list
1686             if finished.
1687              
1688             Hits, Info, Link and AltId are normalized to the empty string if undefined (or < 2 for hits).
1689              
1690             It is important to finish all iterations before calling this method for "new" arguments:
1691              
1692             1 while $db->idList(); # flush pending results
1693              
1694             =cut
1695              
1696             sub idList {
1697 44     44 1 87111 my ($self, $pattern) = @_;
1698 44 50       110 my $cond = $pattern ? ($pattern =~ /%/ ? "WHERE hash LIKE ?" : qq"WHERE hash=?")
    100          
1699             : "";
1700 44 100       95 unless ( $self->{_iterator_idList_handle} ) {
1701 8         41 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1702             SELECT hash, seqno, hits, info, link, altid FROM beacons $cond ORDER BY hash, seqno, altid;
1703             XxX
1704 8 0       24 $self->stmtExplain($sthexpl, ($pattern ? ($pattern) : () )) if $ENV{'DBI_PROFILE'};
    50          
1705 8 100       691 $sth->execute(($pattern ? ($pattern) : () )) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
    50          
1706 8         22 $self->{_iterator_idList_handle} = $sth;
1707 8         49 $self->{_iterator_idList_crosscheck} = $self->RepoCols("ALTTARGET");
1708 8         12 $self->{_iterator_idList_prefetch} = undef;
1709 8 100       28 $self->autoIdentifier() unless defined $self->{identifierClass};
1710             };
1711 44 100       79 unless ( exists $self->{_iterator_idList_prefetch} ) { # deferred exit
1712 8         13 delete $self->{_iterator_idList_handle};
1713 8         18 delete $self->{_iterator_idList_crosscheck};
1714 8         17 return ();
1715             };
1716 36         36 my $pf = $self->{_iterator_idList_prefetch};
1717 36         379 while ( my $onerow = $self->{_iterator_idList_handle}->fetchrow_arrayref ) {
1718             # $onerow->[2] = "" unless $self->{_iterator_idList_crosscheck}->{$onerow->[1]}; # kill artefacts
1719 48 100       96 $onerow->[2] = "" unless $onerow->[2]; # kill artefacts
1720 48 100       66 $onerow->[3] = "" unless defined $onerow->[3]; # kill artefacts
1721 48 100       74 $onerow->[4] = "" unless defined $onerow->[4]; # kill artefacts
1722 48 50       62 $onerow->[5] = "" unless defined $onerow->[5]; # kill artefacts
1723 48 100       113 if ( defined $self->{identifierClass} ) {
1724 36         32 my $c = $self->{identifierClass};
1725             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1726 36         72 $c->value("");
1727 36   100     921 my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1728 36 100       1382 $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1729             };
1730 48 100       318 if ( $pf ) {
1731 40 100       95 if ( $pf->[0] eq $onerow->[0] ) {
1732 12         47 push(@$pf, [@$onerow[1..@$onerow-1]]);
1733 12         190 next;
1734             }
1735             else {
1736 28         89 $self->{_iterator_idList_prefetch} = [$onerow->[0], [@$onerow[1..@$onerow-1]]];
1737 28         106 return @$pf;
1738             }
1739             }
1740             else {
1741 8         148 $pf = [$onerow->[0], [@$onerow[1..@$onerow-1]]]};
1742             };
1743            
1744 8 50       45 if ( $self->{_iterator_idList_handle}->err ) {
1745 0         0 croak("Error listing Collections: $self->{_iterator_idList_handle}->errstr");
1746             };
1747 8         12 delete $self->{_iterator_idList_prefetch};
1748 8 50       30 return $pf ? @$pf : ();
1749             };
1750              
1751              
1752             =head2 Manipulation of global metadata: Open Search Description
1753              
1754             =head3 setOSD ( $field, @values }
1755              
1756             Sets the field $field of the OpenSearchDescription to @value(s).
1757              
1758             =cut
1759              
1760             sub setOSD {
1761 5     5 1 1816 my ($self) = shift;
1762 5 100       9 $self->clearOSD($_[0]) or return undef;
1763 3 100       12 return (defined $_[1]) ? $self->addOSD(@_) : 0; # value(s) to set
1764             };
1765              
1766             =head3 clearOSD ( $field }
1767              
1768             Clears the field $field of the OpenSearchDescription.
1769              
1770             =cut
1771              
1772             sub clearOSD {
1773 8     8 1 2604 my ($self, $field) = @_;
1774 8 100       201 $field || (carp("no OSD field name provided"), return undef);
1775 6 100       24 defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1776 4         11 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1777             DELETE FROM osd WHERE key=?;
1778             XxX
1779 4 50       9 $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1780 4 50       28343 $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1781 4         18 return 1;
1782             }
1783              
1784             =head3 addOSD ( $field, @values }
1785              
1786             Adds more @value(s) as (repeatable) field $field of the OpenSearchDescription.
1787              
1788             =cut
1789              
1790             sub addOSD {
1791 10     10 1 2540 my ($self, $field, @values) = @_;
1792 10 100       107 $field || (carp("no OSD field name provided"), return undef);
1793 9 100       19 return 0 unless @values;
1794 8 100       24 defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1795 7         16 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1796             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1797             XxX
1798 7 50       13 $self->stmtExplain($sthexpl, $field, $values[0]) if $ENV{'DBI_PROFILE'};
1799 7         8 my $tstatus = [];
1800 7 50       31 my $tuples = $sth->execute_array({ArrayTupleStatus => $tstatus}, $field, \@values) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1801 7         269458 return $tuples;
1802             }
1803              
1804             =head2 Manipulation of global metadata: Beacon Metadata
1805              
1806             These headers are used when you will be publishing a beacon file for the collection.
1807              
1808             =head3 setBeaconMeta ( $field, $value )
1809              
1810             Sets the field $field of the Beacon meta table (used to generate a BEACON file for this
1811             service) to $value.
1812              
1813             =cut
1814              
1815             sub setBeaconMeta {
1816 4     4 1 531 my ($self) = shift;
1817 4 100       10 $self->clearBeaconMeta(@_) or return undef;
1818 2 100       11 return (defined $_[1]) ? $self->addBeaconMeta(@_) : 0; # value to set
1819             };
1820              
1821             =head3 clearBeaconMeta ( $field }
1822              
1823             Deletes the field $field of the Beacon meta table.
1824              
1825             =cut
1826              
1827             sub clearBeaconMeta {
1828 7     7 1 669 my ($self, $rfield) = @_;
1829 7 100       227 $rfield || (carp("no Beacon field name provided"), return undef);
1830 5 100       26 my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1831 3         17 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1832             DELETE FROM osd WHERE key=?;
1833             XxX
1834 3 50       8 $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1835 3 50       35953 $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1836 3         55 return 1;
1837             }
1838              
1839             =head3 addBeaconMeta ( $field, $value )
1840              
1841             Appends $value to the field $field of the BEACON meta table
1842              
1843             =cut
1844             sub addBeaconMeta {
1845 7     7 1 359 my ($self, $rfield, $value) = @_;
1846 7 100       107 $rfield || (carp("no Beacon field name provided"), return undef);
1847 6 100       32 my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1848 5         20 my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1849             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1850             XxX
1851 5 50       24 $self->stmtExplain($sthexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1852 5 50       66570 $sth->execute($field, $value) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1853 5         127 return 1;
1854             }
1855              
1856             =head3 admin ( [$field, [$value]] )
1857              
1858             Manipulates the admin table.
1859              
1860             Yields a hashref to the admin table if called without arguments.
1861              
1862             If called with $field, returns the current value, and sets the
1863             table entry to $value if defined.
1864              
1865              
1866             =cut
1867              
1868             sub admin {
1869 33     33 1 2001 my ($self, $field, $value) = @_;
1870 33         141 my $admref = $self->admhash();
1871 33 100       67 return $admref unless $field;
1872 31         60 my $retval = $admref->{$field};
1873 31 100       108 return $retval unless defined $value;
1874              
1875 30         87 my ($admh, $admexpl) = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);");
1876 30 50       66 $self->stmtExplain($admexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1877 30 50       407940 $admh->execute($field, $value)
1878             or croak("Could not execute update to admin table: ".$admh->errstr);
1879 30 100       398 return defined($retval) ? $retval : "";
1880             }
1881              
1882              
1883             # on-the-fly conversions
1884              
1885             sub hDecode { # external time to numeric timestamp, printf placeholders
1886 20     20 0 28 my ($val, $fnam) = @_;
1887 20 50       36 return $val unless $fnam;
1888 20 100       58 local($_) = (ref $val) ? $val->{$fnam} : $val;
1889 20 50       37 return undef unless defined $_;
1890              
1891 20 100       99 if ( $fnam =~ /target$/i ) { s/%/%%/g; s/(\{id\}|\$PND)/%1\$s/gi; s/(\{altid\}|\$PND)/%2\$s/gi; }
  16 50       46  
  16 100       89  
  16         92  
1892 0         0 elsif ( $fnam =~ /message$/i ) { s/%/%%/g; s/\{hits?\}/%s/gi; }
  0         0  
1893             elsif ( $fnam =~ /time|revisit/i ) {
1894 2 100       15 if ( /^\d+$/ ) { # legacy UNIX timestamp
    50          
1895             }
1896             elsif ( my $p = HTTP::Date::str2time($_, "GMT") ) { # all unqualified times are GMT
1897 1         135 $_ = $p}
1898             else {
1899 0         0 carp("could not parse value '$_' as time in field $fnam");
1900 0         0 return undef;
1901             };
1902             }
1903 20         48 return $_;
1904             }
1905              
1906             sub hEncode { # timestamp to beacon format
1907 79     79 0 69 my ($val, $fnam) = @_;
1908 79 50       109 local($_) = (ref $val) ? $val->{$fnam} : $val;
1909 79 50       96 return undef unless defined $_;
1910 79 100       360 if ( $fnam =~ /time|revisit/i ) { $_ = SeeAlso::Source::BeaconAggregator::tToISO($_) }
  15 100       56  
    100          
1911 2         6 elsif ( $fnam =~ /message/i ) { s/%s/{hits}/; s/%%/%/g; }
  2         4  
1912 6         12 elsif ( $fnam =~ /target/i ) { s/%s/{ID}/; s/%1\$s/{ID}/; s/%2\$s/{ALTID}/; s/%%/%/g; };
  6         15  
  6         13  
  6         10  
1913 79         118 return $_;
1914             }
1915              
1916             =head1 AUTHOR
1917              
1918             Thomas Berger
1919             CPAN ID: THB
1920             gymel.com
1921             THB@cpan.org
1922              
1923             =head1 COPYRIGHT
1924              
1925             This program is free software; you can redistribute
1926             it and/or modify it under the same terms as Perl itself.
1927              
1928             The full text of the license can be found in the
1929             LICENSE file included with this module.
1930              
1931             =cut
1932              
1933             1;
1934