File Coverage

blib/lib/NNexus/Index/Template.pm
Criterion Covered Total %
statement 21 83 25.3
branch 0 24 0.0
condition 0 9 0.0
subroutine 7 21 33.3
pod 11 14 78.5
total 39 151 25.8


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Template for Indexing Plug-ins, PULL API | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Index::Template;
18 1     1   5 use warnings;
  1         1  
  1         25  
19 1     1   4 use strict;
  1         0  
  1         21  
20              
21 1     1   524 use Mojo::DOM;
  1         61569  
  1         34  
22 1     1   546 use Mojo::UserAgent;
  1         191836  
  1         11  
23 1     1   32 use Mojo::UserAgent::CookieJar;
  1         2  
  1         6  
24 1     1   18 use Time::HiRes qw(sleep);
  1         2  
  1         8  
25 1     1   556 use NNexus::Morphology qw(canonicalize_url);
  1         2  
  1         637  
26              
27             ### EXTERNAL API
28             sub new {
29 0     0 1   my ($class,%options) = @_;
30 0           my $ua = Mojo::UserAgent->new;
31 0           $ua->max_redirects(2)->connect_timeout(10)->request_timeout(20);
32 0           $ua->cookie_jar(Mojo::UserAgent::CookieJar->new);
33 0   0       my $visited = $options{visited}||{};
34 0   0       my $queue = $options{queue}||[];
35              
36 0           my $self = bless {ua=>$ua,visited=>$visited,queue=>$queue}, $class;
37              
38             # Set current if we're starting up.
39 0           my $first_url;
40 0 0         if (defined $options{start}) {
41 0 0         if ($options{start} eq 'default') {
42 0           $first_url = $self->domain_root;
43             } else {
44 0           $first_url = $options{start};
45             }}
46             else {
47 0           $first_url = $self->domain_root; }
48              
49 0 0         push (@{$self->{queue}}, {
  0            
50             url=>canonicalize_url($first_url),
51             ($options{dom} ? (dom=>$options{dom}) : ()),
52             depth=>0});
53 0           return $self;
54             }
55 0     0 0   sub ua {$_[0]->{ua};}
56              
57             # index_step: Traverse a page, obtain candidate concepts and candidate further links
58             sub index_step {
59 0     0 1   my ($self,%options) = @_;
60 0           my $visited = $self->{visited};
61 0           my $depth;
62              
63             # Grab the next job from the queue
64 0           my $next_step = $self->next_step;
65 0 0         if (ref $next_step) {
66 0           $self->current_url($next_step->{url});
67 0           $self->current_categories($next_step->{categories});
68 0   0       $depth = $next_step->{depth} || 0;
69             } else {
70             # We're out of urls, last step.
71 0           delete $self->{current_url};
72             }
73             # If we've visited, or we're out of urls, terminate.
74 0           my $current_url = $self->current_url;
75 0 0         return unless $current_url; # Empty return for last job
76 0           $visited->{$current_url} = 1; # Mark visited
77             # Also skip if we're over the depth limit.
78 0 0         return $self->index_step if $depth > $self->depth_limit;
79 0 0         return [] if $options{skip}; # We are skipping over this URL, return
80             # 2.1. Prepare (or just accept) a Mojo::DOM to be analyzed
81 0 0         if ($next_step->{dom}) {
82 0           $self->current_dom($next_step->{dom});
83 0           delete $next_step->{dom};
84             } else {
85 0           sleep($self->request_interval()); # Don't overload the server
86 0           $self->current_dom($self->ua->get($current_url)->res->dom);
87             }
88             # Obtain the indexer payload
89 0           my $payload = $self->index_page;
90             # What are the candidate categories for follow-up jobs?
91 0           my $categories = $self->candidate_categories;
92             # Push all following candidate jobs to queue
93 0 0         if ($depth <= $self->depth_limit) { # Don't add pointless nodes
94 0           my $candidate_links = $self->candidate_links;
95 0           foreach (@$candidate_links) {
96             # push and shift give us breadth-first search.
97 0           push (@{$self->{queue}}, {
  0            
98             url=>canonicalize_url($_),
99             categories=>$categories,
100             depth=>$depth+1});
101             }
102             }
103             # Return final list of concepts for this page
104 0           return $payload;
105             }
106              
107             sub next_step {
108 0     0 0   my ($self) = @_;
109 0           my $visited = $self->{visited};
110             # Grab the next job from the queue, skipping any URL already marked as visited
111 0           my $next_step = shift @{$self->{queue}};
  0            
112 0   0       while ((ref $next_step) && ($visited->{$next_step->{url}})) {
113 0           $next_step = shift @{$self->{queue}};
  0            
114             }
115 0           return $next_step;
116             }
117              
118             ### PULL API
119             # To be overridden by concrete classes
120 0     0 1   sub depth_limit {4;}
121 0     0 1   sub domain_root {q{};} # To be overridden in the concrete classes
122             # TODO: Rename index_page to candidate_concepts ? Or index_links / index_categories instead?
123 0     0 1   sub index_page {[];} # To be overridden in the concrete classes
124             sub candidate_links {
125 0     0 1   [];
126             # TODO: Generic implementation should simply retrieve ALL <a> elements as candidate links.
127             }
128 0     0 1   sub candidate_categories {}
129 0     0 1   sub request_interval { 2; }
130             # Tests if the page is a leaf, in which case we want to skip it when should_update is 0
131 0     0 0   sub leaf_test {0;}
132             ### SHARED METHODS
133             # To be directly inherited and used by concrete classes
134              
135             # Getter or Setter for the current URL/DOM/Categories
136 0 0   0 1   sub current_url { $_[1] ? $_[0]->{current_url} = $_[1] : $_[0]->{current_url}; }
137 0 0   0 1   sub current_dom { $_[1] ? $_[0]->{current_dom} = $_[1] : $_[0]->{current_dom}; }
138 0 0   0 1   sub current_categories {$_[1] ? $_[0]->{current_categories} = $_[1] : $_[0]->{current_categories};}
139              
140             1;
141             __END__