File Coverage

blib/lib/NNexus/Index/Planetmath.pm
Criterion Covered Total %
statement 39 55 70.9
branch 6 18 33.3
condition n/a
subroutine 7 9 77.7
pod 5 6 83.3
total 57 88 64.7


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Indexing Plug-in, PlanetMath.org domain | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Index::Planetmath;
18 6     6   1464 use warnings;
  6         9  
  6         220  
19 6     6   29 use strict;
  6         10  
  6         321  
20 6     6   28 use base qw(NNexus::Index::Template);
  6         71  
  6         3899  
21              
22 0     0 1 0 sub domain_root { "http://planetmath.org/articles"; } # Returns the article listing URL; presumably the crawl's starting point (confirm in NNexus::Index::Template)
23             our $pm_base="http://planetmath.org"; # Site base URL; candidate_links prepends it to the relative hrefs it collects
24             sub candidate_links {
               # Gather the URLs to visit next from the current page.
               # Returns an array reference of absolute URLs (each href prefixed with $pm_base),
               # or an empty array reference when leaf_test() is true for the current URL.
25 7     7 1 10 my ($self) = @_;
26 7         22 my $url = $self->current_url;
27 7 50       22 return [] if $self->leaf_test($url);
28 0         0 my $dom = $self->current_dom;
29             # Encyclopedia entries are root-level links of the form "/entry"; only the first "view-content" div is searched
30 0         0 my $content = $dom->find('div[class="view-content"]')->[0];
31 0 0       0 my @encyclopedia_links = $content ? $content->find('a')->each : ();
               # Keep only defined hrefs that consist of a single "/word" path segment
32 0 0       0 @encyclopedia_links = grep {defined && /^\/(\w+)$/} map {$_->{href}} @encyclopedia_links;
  0         0  
  0         0  
33             # Further listing pages are linked as "/articles?section=All&page=NUMBER"; they are read from the second "item-list" div
34 0         0 my $navigation = $dom->find('div[class="item-list"]')->[1];
35 0 0       0 my @nav_links = $navigation ? $navigation->find('a')->each : ();
               # Keep only defined hrefs that start with "/articles?section=All"
36 0 0       0 @nav_links = grep {defined && /^\/articles\?section=All/} map {$_->{href}} @nav_links;
  0         0  
  0         0  
               # Listing pages come first in the result, followed by the encyclopedia entries
37 0         0 my $candidates = [ map { $pm_base . $_ } (@nav_links, @encyclopedia_links ) ];
  0         0  
38 0         0 return $candidates; }
39              
40             sub index_page {
               # Harvest concept records from the current page.
               # Returns an array reference of hashes with the keys url, concept and categories;
               # the final entry (the page title) additionally carries synonyms.
               # Returns an empty array reference when leaf_test() is false for the current URL,
               # when no div[property="dct:title"] is found, or when no section[class="ltx_document"] is found.
41 7     7 1 12 my ($self) = @_;
42 7         15 my $url = $self->current_url;
43 7 50       22 return [] unless $self->leaf_test($url);
               # NOTE(review): xml(1) presumably switches the DOM to XML mode (Mojo::DOM-style API) - confirm
44 7         19 my $dom = $self->current_dom->xml(1);
45 7         132 my $title = $dom->find('div[property="dct:title"]')->[0];
46 7 100       56627 return [] unless $title;
47 5         51 $title = $title->attr('content');
48             # Only concepts have titles; pages without one were already rejected above with an empty harvest.
49             # Next, record the concepts defined inside the document body:
50 5         122 my $content_div = $dom->find('section[class="ltx_document"]')->[0];
51 5 50       50329 return [] unless $content_div;
52 5         42 my @defined_concepts = $content_div->find('div[property="pm:defines"]')->each;
               # Categories: the 'resource' attribute of each dct:subject div, with a leading "msc:" removed and empty strings dropped
53 7         34 my @categories = grep {length($_)>0} map {s/^msc\://; $_;}
  7         94  
  7         18  
  7         42141  
54 5         41141 map {$_->attr('resource')} $content_div->find('div[class="ltx_rdf"][property="dct:subject"]')->each;
55 5         58 my @synonyms = map {$_->attr('content')} $content_div->find('div[class="ltx_rdf"][property="pm:synonym"]')->each;
  2         41799  
56              
57 5         1114 my @harvest;
               # Fall back to the placeholder category 'XX-XX' when the page declares none
58 5 50       33 @categories = ('XX-XX') unless @categories;
59 5         11 foreach my $defined(@defined_concepts) {
               # The concept name is the 'content' attribute with a leading "pmconcept:" removed
60 7         20 my $name = $defined->attr('content');
61 7         105 $name =~ s/^pmconcept\://;
62             # TODO: No special chars
63             # Wild chars in synonyms - people use TeX math syntax, e.g. ^, $, + ... should we LaTeXML-convert?
64             # Right now we just skip over...
               # Every harvest entry holds a reference to the same @categories array
65 7         38 push @harvest, {
66             url=>$url,
67             concept=>$name,
68             categories=>\@categories,
69             }; }
70             # The page title is added last and is the only entry that carries the synonyms:
71 5         21 push @harvest, {
72             url=>$url,
73             concept=>$title,
74             categories=>\@categories,
75             synonyms=>\@synonyms
76             };
77 5         28 return \@harvest; }
78              
79 14     14 1 48 sub depth_limit {10000;} # Returns 10000: we are just traversing down the list of pages, nothing dangerous here
80 0     0 1 0 sub request_interval {0.5;} # Returns 0.5; presumably the delay between requests in seconds (confirm in NNexus::Index::Template)
81             # Leaf test: true when the URL (second argument) does not contain "/articles"; index_page harvests only such pages, candidate_links only the others
82 14     14 0 69 sub leaf_test { $_[1] !~ /\/articles/; }
83              
84             1;
85             __END__