|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # /=====================================================================\ #  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | NNexus Autolinker                                                   | #  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Indexing Driver,                                                    | #  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |   Dispatcher for Crawl, Store, Invalidation tasks                   | #  | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |=====================================================================| #  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Part of the Planetary project: http://trac.mathweb.org/planetary    | #  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |  Research software, produced as part of work done by:               | #  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |  the KWARC group at Jacobs University                               | #  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Copyright (c) 2012                                                  | #  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Released under the MIT License (MIT)                                | #  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |---------------------------------------------------------------------| #  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Adapted from the original NNexus code by                            | #  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |                                  James Gardner and Aaron Krowne     | #  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # |---------------------------------------------------------------------| #  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | Deyan Ginev                   #_#     | #  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # | http://kwarc.info/people/dginev                            (o o)    | #  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # \=========================================================ooo==U==ooo=/ #  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package NNexus::Index::Dispatcher;  | 
| 
19
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
1413
 | 
 use warnings;  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
    | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
213
 | 
    | 
| 
20
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
25
 | 
 use strict;  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
176
 | 
    | 
| 
21
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
772
 | 
 use Data::Dumper;  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9359
 | 
    | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
376
 | 
    | 
| 
22
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
538
 | 
 use NNexus::Concepts qw(flatten_concept_harvest diff_concept_harvests);  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
299
 | 
    | 
| 
23
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
34
 | 
 use NNexus::Morphology qw(admissible_name);  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3388
 | 
    | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Dispatch to the right NNexus::Index::Domain class  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
27
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
  
1
  
 | 
2966
 | 
   my ($class,%options) = @_;  | 
| 
28
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   my $domain = $options{domain};  | 
| 
29
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
   my $db = $options{db};  | 
| 
30
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
38
 | 
   $domain = $domain ? ucfirst(lc($domain)) : '';  | 
| 
31
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
41
 | 
   die ("Bad domain name: $domain; Must contain only alphanumeric characters!") if $domain =~ /\W/;  | 
| 
32
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
   my $index_template;  | 
| 
33
 | 
8
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
40
 | 
   my $should_update = $options{should_update} // 1;  | 
| 
34
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   my $eval_return = eval {require "NNexus/Index/$domain.pm"; 1; };  | 
| 
 
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3106
 | 
    | 
| 
 
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
    | 
| 
35
 | 
8
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
83
 | 
   if ($eval_return && (!$@)) {  | 
| 
36
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     $index_template = eval {  | 
| 
37
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106
 | 
       "NNexus::Index::$domain"->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     };  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
40
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print STDERR "NNexus::Index::$domain not available, fallback to generic indexer.\n";  | 
| 
41
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print STDERR "Reason: $@\n" if $@;  | 
| 
42
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     require NNexus::Index::Template;  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # The generic template will always fail...  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # TODO: Should we fallback to Planetmath instead?  | 
| 
45
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $index_template = NNexus::Index::Template->new(start=>$options{start},dom=>$options{dom},should_update=>$should_update);  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
48
 | 
8
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
98
 | 
   bless {index_template=>$index_template,domain=>$domain,db=>$db,  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	verbosity=>$options{verbosity}||0,should_update=>$should_update}, $class;  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub index_step {  | 
| 
53
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
  
1
  
 | 
214
 | 
   my ($self,%options) = @_;  | 
| 
54
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
   my $template = $self->{index_template};  | 
| 
55
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
   my $db = $self->{db};  | 
| 
56
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
   my $domain = $self->{domain};  | 
| 
57
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
41
 | 
   my $verbosity = $options{verbosity} ? $options{verbosity} : $self->{verbosity};  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 1. Check if object has already been indexed:  | 
| 
59
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
111
 | 
   my $next_step = $template->next_step;  | 
| 
60
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
47
 | 
   return unless ref $next_step; # Done if nothing left.  | 
| 
61
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
   unshift @{$template->{queue}}, $next_step; # Just peaking, keep it in the queue  | 
| 
 
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
    | 
| 
62
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
26
 | 
   my $url = $next_step->{url}; # Grab the next canonical URL  | 
| 
63
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
   my $object = $db->select_object_by(url=>$url);  | 
| 
64
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
   my $objectid = $object->{objectid};  | 
| 
65
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
   my $old_concepts = [];  | 
| 
66
 | 
8
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
27
 | 
   if (! $objectid) {  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # 1.1. If not present, add it:  | 
| 
68
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
     $objectid = $db->add_object_by(url=>$url,domain=>$domain);  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # 1.2. If already indexed, grab all concepts defined by the object.  | 
| 
71
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
     $old_concepts = $db->select_concepts_by(objectid=>$objectid);  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # 1.3. Skip if we don't want to update and the URL is a leaf with some already known concepts  | 
| 
73
 | 
5
 | 
  
  0
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
17
 | 
     if ((!$self->{should_update}) && $template->leaf_test($url) && scalar(@$old_concepts)) {  | 
| 
 
 | 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # Skip leaves, when we don't want to update!  | 
| 
75
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         print STDERR "Skipping over $url\n";  | 
| 
76
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         my $indexed_concepts = $template->index_step(skip=>1);  | 
| 
77
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         return []; } }  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 2. Relay the indexing request to the template, gather concepts  | 
| 
79
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
   my $indexed_concepts = $template->index_step(%options);  | 
| 
80
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
24
 | 
   return unless defined $indexed_concepts; # Last step.  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Idea: If a page can no longer be accessed, we will never delete it from the object table,  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #       we will only empty its payload (= no concepts defined by it) from the concept table.  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 3.0.1 Flatten out incoming synonyms and categories to individual concepts:  | 
| 
86
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
   my $new_concepts = flatten_concept_harvest($indexed_concepts);  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 3.0.2 Make sure they're admissible names;  | 
| 
88
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
   @$new_concepts = grep {admissible_name($_->{concept})} @$new_concepts;  | 
| 
 
 | 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
50
 | 
    | 
| 
89
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
30
 | 
   if ($verbosity > 0) {  | 
| 
90
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print STDERR "FlatConcepts: ".scalar(@$new_concepts)."|URL: $url\n";  | 
| 
91
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print STDERR Dumper($new_concepts);  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 3.1 Compute diff between previous and new concepts  | 
| 
94
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
   my ($delete_concepts,$add_concepts) = diff_concept_harvests($old_concepts,$new_concepts);  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 4. Delete no longer present concepts  | 
| 
96
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
   my $invalidated_URLs = [];  | 
| 
97
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
   foreach my $delc(@$delete_concepts) {  | 
| 
98
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
     my $concepts = $db->select_concepts_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);  | 
| 
99
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my $delc_id = $concepts->[0]->{conceptid};  | 
| 
100
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
     $db->delete_concept_by(concept=>$delc->{concept},category=>$delc->{category},scheme=>$delc->{scheme},objectid=>$objectid);  | 
| 
101
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     push @$invalidated_URLs,  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $db->invalidate_by(conceptid=>$delc_id);  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 5. Add newly introduced concepts  | 
| 
105
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   foreach my $addc(@$add_concepts) {  | 
| 
106
 | 
12
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
93
 | 
     my $addc_id =   | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $db->add_concept_by(concept=>$addc->{concept},category=>$addc->{category},objectid=>$objectid,  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                        domain=>$domain,link=>($addc->{url}||$url),scheme=>$addc->{scheme});  | 
| 
109
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
47
 | 
     push @$invalidated_URLs,   | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $db->invalidate_by(conceptid=>$addc_id);  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Add the http:// prefix before returning:  | 
| 
113
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
   @$invalidated_URLs = map {'http://'.$_} @$invalidated_URLs;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # 6. Return URLs to be invalidated as effect:  | 
| 
115
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
84
 | 
   return $invalidated_URLs;  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |