File Coverage

blib/lib/NNexus/Index/Wikipedia.pm
Criterion Covered Total %
statement 45 65 69.2
branch 6 22 27.2
condition 1 2 50.0
subroutine 12 13 92.3
pod 5 7 71.4
total 69 109 63.3


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Indexing Plug-in, Wikipedia.org domain | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Index::Wikipedia;
18 3     3   978 use warnings;
  3         4  
  3         98  
19 3     3   14 use strict;
  3         3  
  3         98  
20 3     3   12 use base qw(NNexus::Index::Template);
  3         4  
  3         495  
21             # Special Blacklist for Wikipedia categories:
22 3     3   2384 use NNexus::Index::Wikipedia::Lists;
  3         10  
  3         1468  
23              
24 3     3   33 use feature 'say';
  3         3  
  3         283  
25 3     3   606 use List::MoreUtils qw(uniq);
  3         8355  
  3         36  
26              
27              
28             # EN.Wikipedia.org indexing template
29             # 1. We want to start from the top-level math category
30 0     0 1 0 sub domain_root { "http://en.wikipedia.org/wiki/Category:Mathematics"; }
31             our $category_test = qr/\/wiki\/Category:(.+)$/;
32             our $english_category_test = qr/^\/wiki\/Category:/;
33             our $english_concept_test = qr/^\/wiki\/[^\/\:]+$/;
34             our $wiki_base = 'http://en.wikipedia.org';
35             # 2. Candidate links to subcategories and concept pages
36             sub candidate_links {
37 2     2 1 3 my ($self)=@_;
38 2         5 my $url = $self->current_url;
39             # Add links from subcategory pages
40 2 50       9 if ($url =~ /$category_test/ ) {
  2         5  
41 0         0 my $category_name = $1;
42 0 0       0 return [] if $wiki_category_blacklist->{$category_name};
43 0         0 my $dom = $self->current_dom;
44 0         0 my $subcategories = $dom->find('#mw-subcategories')->[0];
45 0         0 my @category_links = ();
46 0 0       0 if( defined $subcategories ) {
47 0         0 @category_links = $subcategories->find('a')->each;
48 0 0       0 @category_links = grep {defined && /$english_category_test/} map {$_->{href}} @category_links; }
  0         0  
  0         0  
49             # Also add terminal links:
50 0         0 my $concepts = $dom->find('#mw-pages')->[0];
51 0 0       0 my @concept_links = $concepts->find('a')->each if defined $concepts;
52 0 0       0 @concept_links = grep {defined && /$english_concept_test/} map {$_->{href}} @concept_links;
  0         0  
  0         0  
53              
54 0         0 my $candidates = [ map {$wiki_base . $_ } (@category_links, @concept_links) ];
  0         0  
55 0         0 return $candidates;
56             } else {return [];} # skip leaves
57             }
58              
59             # Index a concept page, ignore category pages
60             sub index_page {
61 2     2 1 2 my ($self) = @_;
62 2         6 my $url = $self->current_url;
63             # Nothing to do in category pages
64 2 50       6 return [] unless $self->leaf_test($url);
65 2         5 my $dom = $self->current_dom;
66             # We might want to index a leaf page when descending from different categories, so keep them marked as "not visited"
67 2         5 delete $self->{visited}->{$url};
68 2         7 my ($concept) = map {/([^\(]+)/; lc(rtrim($1));} $dom->find('span[dir="auto"]')->map('all_text')->each;
  2         85122  
  2         11  
69 2         71 my @synonyms;
70             # Bold entries in the first paragraph are typically synonyms.
71 2         9 my $first_p = $dom->find('p')->[0];
72 2 50       83997 @synonyms = (grep {(length($_)>4) && ($_ ne $concept)} map {lc $_} $first_p->children('b')->map('all_text')->each) if $first_p;
  4 50       30  
  4         2394  
73 2   50     64 my $categories = $self->current_categories || ['XX-XX'];
74              
75 2 50       24 return [{ url => $url,
76             concept => $concept,
77             scheme => 'wiki',
78             categories => $categories,
79             @synonyms ? (synonyms => \@synonyms) : ()
80             }];
81             }
82              
83             sub candidate_categories {
84 2     2 1 4 my ($self) = @_;
85 2 50       8 if ($self->current_url =~ /$category_test/ ) {
86 0         0 return [$1];
87             } else {
88 2         9 return $self->current_categories;
89             }
90             }
91              
92             # The subcategories trail into unrelated topics after the 4th level...
93 4     4 1 16 sub depth_limit {20;} # But let's bite the bullet and manually strip away the ones that are pointless
94 2     2 0 16 sub leaf_test { $_[1] !~ /$category_test/ }
95             # Utility:
96             # Right trim function to remove trailing whitespace
97             sub rtrim {
98 2     2 0 7 my $string = shift;
99 2         6 $string =~ s/\s+$//;
100 2         8 return $string; }
101              
102             1;
103             __END__