File Coverage

blib/lib/NNexus/Index/Dispatcher.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Indexing Driver, | #
4             # | Dispatcher for Crawl, Store, Invalidation tasks | #
5             # |=====================================================================| #
6             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
7             # | Research software, produced as part of work done by: | #
8             # | the KWARC group at Jacobs University | #
9             # | Copyright (c) 2012 | #
10             # | Released under the MIT License (MIT) | #
11             # |---------------------------------------------------------------------| #
12             # | Adapted from the original NNexus code by | #
13             # | James Gardner and Aaron Krowne | #
14             # |---------------------------------------------------------------------| #
15             # | Deyan Ginev #_# | #
16             # | http://kwarc.info/people/dginev (o o) | #
17             # \=========================================================ooo==U==ooo=/ #
18             package NNexus::Index::Dispatcher;
19 1     1   531 use warnings;
  1         2  
  1         35  
20 1     1   3 use strict;
  1         1  
  1         28  
21 1     1   584 use Data::Dumper;
  1         9699  
  1         109  
22 1     1   653 use NNexus::Concepts qw(flatten_concept_harvest diff_concept_harvests);
  0            
  0            
23             use NNexus::Morphology qw(admissible_name);
24              
25             # Dispatch to the right NNexus::Index::Domain class
26             sub new {
27             my ($class,%options) = @_;
28             my $domain = $options{domain};
29             my $db = $options{db};
30             $domain = $domain ? ucfirst(lc($domain)) : '';
31             die ("Bad domain name: $domain; Must contain only alphanumeric characters!") if $domain =~ /\W/;
32             my $index_template;
33             my $should_update = $options{should_update} // 1;
34             my $eval_return = eval {require "NNexus/Index/$domain.pm"; 1; };
35             if ($eval_return && (!$@)) {
36             $index_template = eval {
37             "NNexus::Index::$domain"->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);
38             };
39             } else {
40             print STDERR "NNexus::Index::$domain not available, fallback to generic indexer.\n";
41             print STDERR "Reason: $@\n" if $@;
42             require NNexus::Index::Template;
43             # The generic template will always fail...
44             # TODO: Should we fallback to Planetmath instead?
45             $index_template = NNexus::Index::Template->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);
46             }
47              
48             bless {index_template=>$index_template,domain=>$domain,db=>$db,
49             verbosity=>$options{verbosity}||0,should_update=>$should_update}, $class;
50             }
51              
52             sub index_step {
53             my ($self,%options) = @_;
54             my $template = $self->{index_template};
55             my $db = $self->{db};
56             my $domain = $self->{domain};
57             my $verbosity = $options{verbosity} ? $options{verbosity} : $self->{verbosity};
58             # 1. Check if object has already been indexed:
59             my $next_step = $template->next_step;
60             return unless ref $next_step; # Done if nothing left.
61             unshift @{$template->{queue}}, $next_step; # Just peaking, keep it in the queue
62             my $url = $next_step->{url}; # Grab the next canonical URL
63             my $object = $db->select_object_by(url=>$url);
64             my $objectid = $object->{objectid};
65             my $old_concepts = [];
66             if (! $objectid) {
67             # 1.1. If not present, add it:
68             $objectid = $db->add_object_by(url=>$url,domain=>$domain);
69             } else {
70             # 1.2. If already indexed, grab all concepts defined by the object.
71             $old_concepts = $db->select_concepts_by(objectid=>$objectid);
72             # 1.3. Skip if we don't want to update and the URL is a leaf with some already known concepts
73             if ((!$self->{should_update}) && $template->leaf_test($url) && scalar(@$old_concepts)) {
74             # Skip leaves, when we don't want to update!
75             print STDERR "Skipping over $url\n";
76             my $indexed_concepts = $template->index_step(skip=>1);
77             return []; } }
78             # 2. Relay the indexing request to the template, gather concepts
79             my $indexed_concepts = $template->index_step(%options);
80             return unless defined $indexed_concepts; # Last step.
81              
82             # Idea: If a page can no longer be accessed, we will never delete it from the object table,
83             # we will only empty its payload (= no concepts defined by it) from the concept table.
84              
85             # 3.0.1 Flatten out incoming synonyms and categories to individual concepts:
86             my $new_concepts = flatten_concept_harvest($indexed_concepts);
87             # 3.0.2 Make sure they're admissible names;
88             @$new_concepts = grep {admissible_name($_->{concept})} @$new_concepts;
89             if ($verbosity > 0) {
90             print STDERR "FlatConcepts: ".scalar(@$new_concepts)."|URL: $url\n";
91             print STDERR Dumper($new_concepts);
92             }
93             # 3.1 Compute diff between previous and new concepts
94             my ($delete_concepts,$add_concepts) = diff_concept_harvests($old_concepts,$new_concepts);
95             # 4. Delete no longer present concepts
96             my $invalidated_URLs = [];
97             foreach my $delc(@$delete_concepts) {
98             my $concepts = $db->select_concepts_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);
99             my $delc_id = $concepts->[0]->{conceptid};
100             $db->delete_concept_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);
101             push @$invalidated_URLs,
102             $db->invalidate_by(conceptid=>$delc_id);
103             }
104             # 5. Add newly introduced concepts
105             foreach my $addc(@$add_concepts) {
106             my $addc_id =
107             $db->add_concept_by(concept=>$addc->{concept},category=>$addc->{category},objectid=>$objectid,
108             domain=>$domain,link=>($addc->{url}||$url),scheme=>$addc->{scheme});
109             push @$invalidated_URLs,
110             $db->invalidate_by(conceptid=>$addc_id);
111             }
112             # Add the http:// prefix before returning:
113             @$invalidated_URLs = map {'http://'.$_} @$invalidated_URLs;
114             # 6. Return URLs to be invalidated as effect:
115             return $invalidated_URLs;
116             }
117              
118             1;
119             __END__