File Coverage

blib/lib/NNexus/Index/Wikipedia.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Indexing Plug-in, Wikipedia.org domain | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Index::Wikipedia;
18 1     1   444 use warnings;
  1         1  
  1         34  
19 1     1   3 use strict;
  1         2  
  1         28  
20 1     1   4 use base qw(NNexus::Index::Template);
  1         1  
  1         68  
21             # Special Blacklist for Wikipedia categories:
22 1     1   588 use NNexus::Index::Wikipedia::Lists;
  1         4  
  1         312  
23              
24 1     1   9 use feature 'say';
  1         2  
  1         79  
25 1     1   1274 use List::MoreUtils qw(uniq);
  0            
  0            
26              
27              
28             # EN.Wikipedia.org indexing template
29             # 1. We want to start from the top-level math category
30             sub domain_root { "http://en.wikipedia.org/wiki/Category:Mathematics"; }
31             our $category_test = qr/\/wiki\/Category:(.+)$/;
32             our $english_category_test = qr/^\/wiki\/Category:/;
33             our $english_concept_test = qr/^\/wiki\/[^\/\:]+$/;
34             our $wiki_base = 'http://en.wikipedia.org';
35             # 2. Candidate links to subcategories and concept pages
36             sub candidate_links {
37             my ($self)=@_;
38             my $url = $self->current_url;
39             # Add links from subcategory pages
40             if ($url =~ /$category_test/ ) {
41             my $category_name = $1;
42             return [] if $wiki_category_blacklist->{$category_name};
43             my $dom = $self->current_dom;
44             my $subcategories = $dom->find('#mw-subcategories')->[0];
45             my @category_links = ();
46             if( defined $subcategories ) {
47             @category_links = $subcategories->find('a')->each;
48             @category_links = grep {defined && /$english_category_test/} map {$_->{href}} @category_links; }
49             # Also add terminal links:
50             my $concepts = $dom->find('#mw-pages')->[0];
51             my @concept_links = $concepts->find('a')->each if defined $concepts;
52             @concept_links = grep {defined && /$english_concept_test/} map {$_->{href}} @concept_links;
53              
54             my $candidates = [ map {$wiki_base . $_ } (@category_links, @concept_links) ];
55             return $candidates;
56             } else {return [];} # skip leaves
57             }
58              
59             # Index a concept page, ignore category pages
60             sub index_page {
61             my ($self) = @_;
62             my $url = $self->current_url;
63             # Nothing to do in category pages
64             return [] unless $self->leaf_test($url);
65             my $dom = $self->current_dom;
66             # We might want to index a leaf page when descending from different categories, so keep them marked as "not visited"
67             delete $self->{visited}->{$url};
68             my ($concept) = map {/([^\(]+)/; lc(rtrim($1));} $dom->find('span[dir="auto"]')->pluck('all_text')->each;
69             my @synonyms;
70             # Bold entries in the first paragraph are typically synonyms.
71             my $first_p = $dom->find('p')->[0];
72             @synonyms = (grep {(length($_)>4) && ($_ ne $concept)} map {lc $_} $first_p->children('b')->pluck('all_text')->each) if $first_p;
73             my $categories = $self->current_categories || ['XX-XX'];
74              
75             return [{ url => $url,
76             concept => $concept,
77             scheme => 'wiki',
78             categories => $categories,
79             @synonyms ? (synonyms => \@synonyms) : ()
80             }];
81             }
82              
83             sub candidate_categories {
84             my ($self) = @_;
85             if ($self->current_url =~ /$category_test/ ) {
86             return [$1];
87             } else {
88             return $self->current_categories;
89             }
90             }
91              
92             # The subcategories trail into unrelated topics after the 4th level...
93             sub depth_limit {20;} # But let's bite the bullet and manually strip away the ones that are pointless
94             sub leaf_test { $_[1] !~ /$category_test/ }
95             # Utility:
96             # Right trim function to remove trailing whitespace
97             sub rtrim {
98             my $string = shift;
99             $string =~ s/\s+$//;
100             return $string; }
101              
102             1;
103             __END__