File Coverage

blib/lib/NNexus/Index/Dispatcher.pm
Criterion Covered Total %
statement 65 74 87.8
branch 10 20 50.0
condition 7 15 46.6
subroutine 7 7 100.0
pod 2 2 100.0
total 91 118 77.1


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Indexing Driver, | #
4             # | Dispatcher for Crawl, Store, Invalidation tasks | #
5             # |=====================================================================| #
6             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
7             # | Research software, produced as part of work done by: | #
8             # | the KWARC group at Jacobs University | #
9             # | Copyright (c) 2012 | #
10             # | Released under the MIT License (MIT) | #
11             # |---------------------------------------------------------------------| #
12             # | Adapted from the original NNexus code by | #
13             # | James Gardner and Aaron Krowne | #
14             # |---------------------------------------------------------------------| #
15             # | Deyan Ginev #_# | #
16             # | http://kwarc.info/people/dginev (o o) | #
17             # \=========================================================ooo==U==ooo=/ #
18             package NNexus::Index::Dispatcher;
19 5     5   1413 use warnings;
  5         8  
  5         213  
20 5     5   25 use strict;
  5         10  
  5         176  
21 5     5   772 use Data::Dumper;
  5         9359  
  5         376  
22 5     5   538 use NNexus::Concepts qw(flatten_concept_harvest diff_concept_harvests);
  5         9  
  5         299  
23 5     5   34 use NNexus::Morphology qw(admissible_name);
  5         6  
  5         3388  
24              
25             # Dispatch to the right NNexus::Index::Domain class
26             sub new {
27 8     8 1 2966 my ($class,%options) = @_;
28 8         19 my $domain = $options{domain};
29 8         18 my $db = $options{db};
30 8 50       38 $domain = $domain ? ucfirst(lc($domain)) : '';
31 8 50       41 die ("Bad domain name: $domain; Must contain only alphanumeric characters!") if $domain =~ /\W/;
32 8         12 my $index_template;
33 8   100     40 my $should_update = $options{should_update} // 1;
34 8         19 my $eval_return = eval {require "NNexus/Index/$domain.pm"; 1; };
  8         3106  
  8         27  
35 8 50 33     83 if ($eval_return && (!$@)) {
36 8         12 $index_template = eval {
37 8         106 "NNexus::Index::$domain"->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);
38             };
39             } else {
40 0         0 print STDERR "NNexus::Index::$domain not available, fallback to generic indexer.\n";
41 0 0       0 print STDERR "Reason: $@\n" if $@;
42 0         0 require NNexus::Index::Template;
43             # The generic template will always fail...
44             # TODO: Should we fallback to Planetmath instead?
45 0         0 $index_template = NNexus::Index::Template->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);
46             }
47              
48 8   50     98 bless {index_template=>$index_template,domain=>$domain,db=>$db,
49             verbosity=>$options{verbosity}||0,should_update=>$should_update}, $class;
50             }
51              
52             sub index_step {
53 13     13 1 214 my ($self,%options) = @_;
54 13         33 my $template = $self->{index_template};
55 13         27 my $db = $self->{db};
56 13         23 my $domain = $self->{domain};
57 13 50       41 my $verbosity = $options{verbosity} ? $options{verbosity} : $self->{verbosity};
58             # 1. Check if object has already been indexed:
59 13         111 my $next_step = $template->next_step;
60 13 100       47 return unless ref $next_step; # Done if nothing left.
61 8         14 unshift @{$template->{queue}}, $next_step; # Just peaking, keep it in the queue
  8         20  
62 8         26 my $url = $next_step->{url}; # Grab the next canonical URL
63 8         52 my $object = $db->select_object_by(url=>$url);
64 8         15 my $objectid = $object->{objectid};
65 8         11 my $old_concepts = [];
66 8 100       27 if (! $objectid) {
67             # 1.1. If not present, add it:
68 3         17 $objectid = $db->add_object_by(url=>$url,domain=>$domain);
69             } else {
70             # 1.2. If already indexed, grab all concepts defined by the object.
71 5         15 $old_concepts = $db->select_concepts_by(objectid=>$objectid);
72             # 1.3. Skip if we don't want to update and the URL is a leaf with some already known concepts
73 5 0 33     17 if ((!$self->{should_update}) && $template->leaf_test($url) && scalar(@$old_concepts)) {
      50        
74             # Skip leaves, when we don't want to update!
75 0         0 print STDERR "Skipping over $url\n";
76 0         0 my $indexed_concepts = $template->index_step(skip=>1);
77 0         0 return []; } }
78             # 2. Relay the indexing request to the template, gather concepts
79 8         62 my $indexed_concepts = $template->index_step(%options);
80 8 50       24 return unless defined $indexed_concepts; # Last step.
81              
82             # Idea: If a page can no longer be accessed, we will never delete it from the object table,
83             # we will only empty its payload (= no concepts defined by it) from the concept table.
84              
85             # 3.0.1 Flatten out incoming synonyms and categories to individual concepts:
86 8         52 my $new_concepts = flatten_concept_harvest($indexed_concepts);
87             # 3.0.2 Make sure they're admissible names;
88 8         21 @$new_concepts = grep {admissible_name($_->{concept})} @$new_concepts;
  18         50  
89 8 50       30 if ($verbosity > 0) {
90 0         0 print STDERR "FlatConcepts: ".scalar(@$new_concepts)."|URL: $url\n";
91 0         0 print STDERR Dumper($new_concepts);
92             }
93             # 3.1 Compute diff between previous and new concepts
94 8         34 my ($delete_concepts,$add_concepts) = diff_concept_harvests($old_concepts,$new_concepts);
95             # 4. Delete no longer present concepts
96 8         14 my $invalidated_URLs = [];
97 8         21 foreach my $delc(@$delete_concepts) {
98 2         15 my $concepts = $db->select_concepts_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);
99 2         6 my $delc_id = $concepts->[0]->{conceptid};
100 2         13 $db->delete_concept_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);
101 2         8 push @$invalidated_URLs,
102             $db->invalidate_by(conceptid=>$delc_id);
103             }
104             # 5. Add newly introduced concepts
105 8         19 foreach my $addc(@$add_concepts) {
106 12   33     93 my $addc_id =
107             $db->add_concept_by(concept=>$addc->{concept},category=>$addc->{category},objectid=>$objectid,
108             domain=>$domain,link=>($addc->{url}||$url),scheme=>$addc->{scheme});
109 12         47 push @$invalidated_URLs,
110             $db->invalidate_by(conceptid=>$addc_id);
111             }
112             # Add the http:// prefix before returning:
113 8         24 @$invalidated_URLs = map {'http://'.$_} @$invalidated_URLs;
  2         7  
114             # 6. Return URLs to be invalidated as effect:
115 8         84 return $invalidated_URLs;
116             }
117              
118             1;
119             __END__