File Coverage

blib/lib/NNexus/Index/Template.pm
Criterion Covered Total %
statement 69 83 83.1
branch 14 24 58.3
condition 5 9 55.5
subroutine 14 21 66.6
pod 11 14 78.5
total 113 151 74.8


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Template for Indexing Plug-ins, PULL API | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Index::Template;
18 6     6   671 use warnings;
  6         9  
  6         175  
19 6     6   21 use strict;
  6         8  
  6         154  
20              
21 6     6   431 use Mojo::DOM;
  6         64290  
  6         122  
22 6     6   2338 use Mojo::UserAgent;
  6         573890  
  6         66  
23 6     6   263 use Mojo::UserAgent::CookieJar;
  6         11  
  6         44  
24 6     6   175 use Time::HiRes qw(sleep);
  6         10  
  6         52  
25 6     6   1207 use NNexus::Morphology qw(canonicalize_url);
  6         11  
  6         3599  
26              
27             ### EXTERNAL API
28             sub new {
29 12     12 1 382737 my ($class,%options) = @_;
30 12         95 my $ua = Mojo::UserAgent->new;
31 12         111 $ua->max_redirects(2)->connect_timeout(10)->request_timeout(20);
32 12         224 $ua->cookie_jar(Mojo::UserAgent::CookieJar->new);
33 12   50     155 my $visited = $options{visited}||{};
34 12   50     58 my $queue = $options{queue}||[];
35              
36 12         57 my $self = bless {ua=>$ua,visited=>$visited,queue=>$queue}, $class;
37              
38             # Set current if we're starting up.
39 12         13 my $first_url;
40 12 50       38 if (defined $options{start}) {
41 12 50       30 if ($options{start} eq 'default') {
42 0         0 $first_url = $self->domain_root;
43             } else {
44 12         26 $first_url = $options{start};
45             }}
46             else {
47 0         0 $first_url = $self->domain_root; }
48              
49 12 50       15 push (@{$self->{queue}}, {
  12         100  
50             url=>canonicalize_url($first_url),
51             ($options{dom} ? (dom=>$options{dom}) : ()),
52             depth=>0});
53 12         126 return $self;
54             }
55 0     0 0 0 sub ua {$_[0]->{ua};}
56              
57             # index: Traverse a page, obtain candidate concepts and candidate further links
58             sub index_step {
59 12     12 1 4415 my ($self,%options) = @_;
60 12         24 my $visited = $self->{visited};
61 12         19 my $depth;
62              
63             # Grab the next job from the queue
64 12         36 my $next_step = $self->next_step;
65 12 50       101 if (ref $next_step) {
66 12         73 $self->current_url($next_step->{url});
67 12         72 $self->current_categories($next_step->{categories});
68 12   50     58 $depth = $next_step->{depth} || 0;
69             } else {
70             # We're out of urls, last step.
71 0         0 delete $self->{current_url};
72             }
73             # If we've visited, or we're out of urls, terminate.
74 12         26 my $current_url = $self->current_url;
75 12 50       30 return unless $current_url; # Empty return for last job
76 12         25 $visited->{$current_url} = 1; # Mark visited
77             # Also skip if we're over the depth limit.
78 12 50       45 return $self->index_step if $depth > $self->depth_limit;
79 12 50       30 return [] if $options{skip}; # We are skipping over this URL, return
80             # 2.1. Prepare (or just accept) a Mojo::DOM to be analyzed
81 12 50       76 if ($next_step->{dom}) {
82 12         102 $self->current_dom($next_step->{dom});
83 12         76 delete $next_step->{dom};
84             } else {
85 0         0 sleep($self->request_interval()); # Don't overload the server
86 0         0 $self->current_dom($self->ua->get($current_url)->res->dom);
87             }
88             # Obtain the indexer payload
89 12         37 my $payload = $self->index_page;
90             # What are the candidate categories for follow-up jobs?
91 12         98 my $categories = $self->candidate_categories;
92             # Push all following candidate jobs to queue
93 12 50       41 if ($depth <= $self->depth_limit) { # Don't add pointless nodes
94 12         41 my $candidate_links = $self->candidate_links;
95 12         40 foreach (@$candidate_links) {
96             # push and shift give us breadth-first search.
97 28         15 push (@{$self->{queue}}, {
  28         45  
98             url=>canonicalize_url($_),
99             categories=>$categories,
100             depth=>$depth+1});
101             }
102             }
103             # Return final list of concepts for this page
104 12         58 return $payload;
105             }
106              
107             sub next_step {
108 25     25 0 34 my ($self) = @_;
109 25         36 my $visited = $self->{visited};
110             # Grab the next job from the queue, skipping any URLs that were already visited
111 25         27 my $next_step = shift @{$self->{queue}};
  25         61  
112 25   66     131 while ((ref $next_step) && ($visited->{$next_step->{url}})) {
113 0         0 $next_step = shift @{$self->{queue}};
  0         0  
114             }
115 25         57 return $next_step;
116             }
117              
118             ### PULL API
119             # To be overloaded by concrete classes
120 0     0 1 0 sub depth_limit {4;}
121 0     0 1 0 sub domain_root {q{};} # To be overridden in the concrete classes
122             # TODO: Rename index_page to candidate_concepts ? Or index_links / index_categories instead?
123 0     0 1 0 sub index_page {[];} # To be overridden in the concrete classes
124             sub candidate_links {
125 0     0 1 0 [];
126             # TODO: Generic implementation should simply retrieve ALL <a> elements as candidate links.
127             }
128 10     10 1 19 sub candidate_categories {}
129 0     0 1 0 sub request_interval { 2; }
130             # Tests if the page is a leaf, in which case we want to skip it when should_update is 0
131 0     0 0 0 sub leaf_test {0;}
132             ### SHARED METHODS
133             # To be directly inherited and used by concrete classes
134              
135             # Getter or Setter for the current URL/DOM/Categories
136 50 100   50 1 179 sub current_url { $_[1] ? $_[0]->{current_url} = $_[1] : $_[0]->{current_url}; }
137 27 100   27 1 107 sub current_dom { $_[1] ? $_[0]->{current_dom} = $_[1] : $_[0]->{current_dom}; }
138 16 50   16 1 72 sub current_categories {$_[1] ? $_[0]->{current_categories} = $_[1] : $_[0]->{current_categories};}
139              
140             1;
141             __END__