File Coverage

blib/lib/NNexus/Discover.pm
Criterion Covered Total %
statement 28 30 93.3
branch n/a
condition n/a
subroutine 10 10 100.0
pod n/a
total 38 40 95.0


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Concept Discovery Module | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Discover;
18 5     5   22 use strict;
  5         6  
  5         158  
19 5     5   25 use warnings;
  5         5  
  5         110  
20              
21 5     5   19 use Exporter;
  5         29  
  5         346  
22             our @ISA = qw(Exporter);
23             our @EXPORT_OK = qw(mine_candidates);
24 5     5   21 use Encode qw/is_utf8/;
  5         5  
  5         206  
25 5     5   20 use utf8;
  5         24  
  5         46  
26 5     5   89 use Data::Dumper;
  5         15  
  5         409  
27 5     5   31 use Time::HiRes qw ( time alarm sleep );
  5         7  
  5         44  
28              
29 5     5   4432 use NNexus::StopWordList qw(stop_words_ref);
  5         14  
  5         273  
30 5     5   1379 use NNexus::Morphology qw(normalize_word);
  5         10  
  5         302  
31 5     5   2119 use NNexus::Concepts qw(clone_concepts);
  0            
  0            
32              
33             use HTML::Parser;
34              
35             # Reusable parser object (TODO: What happens if we thread/fork ?)
36             our $HTML_Parser =
37             HTML::Parser->new(
38             'api_version' => 3,
39             'start_h' => [sub {
40             my ($self,$tagname,$attr)=@_;
41             if ($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/ ||
42             (($tagname eq 'span') && $attr->{class} && ($attr->{class} =~ 'nolink'))) {
43             $self->{fresh_skip}=1;
44             $self->{noparse}++;
45             } else {
46             $self->{fresh_skip}=0;
47             }
48             } , 'self, tagname, attr'],
49             'end_h' => [sub {
50             my ($self,$tagname)=@_;
51             if (($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/) ||
52             (((length($tagname)==0)||($tagname eq 'span')) && ($self->{fresh_skip} == 1))) {
53             $self->{noparse}--;
54             $self->{fresh_skip}=0;
55             }
56             }, 'self,tagname'],
57             'text_h' => [\&_text_event_handler, 'self,text,offset']
58             );
59             $HTML_Parser->unbroken_text;
60             $HTML_Parser->xml_mode;
61             $HTML_Parser->attr_encoded(1);
62             $HTML_Parser->empty_element_tags(1);
63              
64             # Prepare cache for first-word concept lookup
65             our $first_word_cache_template = {map { ($_,[]) } @{stop_words_ref()}};
66             sub mine_candidates {
67             my (%options) = @_;
68             # State: We need a db object with a properly set database
69             # Input: We need a string representing the (HTML) body of the chunk we're
70             # mining on, as well as its URL.
71             # Optional: Deprecated details such as 'domain' or 'format'.
72             # Interesting: allow 'nolink' again?
73             my ($db,$format,$body,$nolink,$url,$domain) =
74             map {$options{$_}} qw(db format body nolink url domain);
75             die "The db key is a mandatory parameter for mine_candidates!\n" unless ref $db; # TODO: Maybe raise a better error?
76             $format = 'html' unless defined $format;
77             return ([],0) unless $body;
78             # Prepare data, if we have a URL:
79             my $objectid; # undefined if we don't have a URL, we only do MoC for named resources
80             if ($url) {
81             my $object = $db->select_object_by(url=>$url) || {};
82             $objectid = $object->{objectid} || -1;
83             $domain = $object->{domain} unless defined $domain;
84             # If objectid is -1 , we will also need to add_object on the url
85             if ($objectid == -1) {
86             # TODO: Extract the domain from the URL, this is unreliable
87             $objectid = $db->add_object_by(url=>$options{'url'},domain=>$domain);
88             } else {
89             # If already known, flush the links_cache for this object
90             $db->delete_linkscache_by(objectid=>$objectid);
91             }
92             }
93             # Keep a cache of first words, that will simultaneously act as a blacklist.
94             # TODO: Incorporate the single words from 'nolink'
95             # Experiment: Use a global first_word cache
96             $options{first_word_cache} = $first_word_cache_template; # Default are stopwords, keep a global cache
97             # $options{first_word_cache} = { %$first_word_cache_template }; # Default are stopwords
98             # Always return an embedded annotation with links, as well as a stand-off mined_canidates hash, containing the individual concepts with pointers.
99             my $time;
100             if ($options{verbosity}) {
101             $time = time();
102             }
103             my $mined_candidates=[];
104             my $text_length=0;
105             if ($format eq 'html') {
106             ($mined_candidates,$text_length) = mine_candidates_html(\%options);
107             } elsif ($format eq 'text') {
108             ($mined_candidates,$text_length) = mine_candidates_text(\%options);
109             } else {
110             print STDERR "Error: Unrecognized input format for auto-linking.\n";
111             }
112             # Only mark-up first found candidate, unless requested otherwise
113             my @uniq_candidates;
114             while (@$mined_candidates) {
115             my $candidate = shift @$mined_candidates;
116             my $concept = $candidate->{concept};
117             my $link = $candidate->{link};
118             my $category = $candidate->{category};
119             @$mined_candidates = grep {($_->{concept} ne $concept) || ($_->{link} ne $link) || ($_->{category} ne $category)} @$mined_candidates;
120             push @uniq_candidates, $candidate;
121             }
122             # Also, don't add self-links, coming from $url
123             @uniq_candidates = grep {$_->{link} ne $url} @uniq_candidates if $url;
124             @$mined_candidates = @uniq_candidates;
125              
126             #TODO: When do we deal with the nolink settings?
127             # next if (inset($concept,@$nolink));
128             if ($options{verbosity}) {
129             printf STDERR " Discovered %d concepts in %.3f seconds.\n",scalar(@uniq_candidates),time()-$time;
130             }
131              
132             # Update linkscache:
133             if ($objectid) {
134             $db->add_linkscache_by(objectid=>$objectid,conceptid=>$_->{conceptid})
135             foreach (@$mined_candidates);
136             }
137             return ($mined_candidates,$text_length);
138             }
139              
140             sub mine_candidates_html {
141             my ($options) = @_;
142             my ($db,$domain,$body,$syns,$targetid,$class) = map {$options->{$_}} qw(db domain body nolink targetid class);
143             # Current HTML Parsing strategy - fire events for all HTML tags and explicitly skip over tags that
144             # won't be of interest. We need to autolink in all textual elements.
145             # TODO: Handle MathML better
146             return ([],0) unless $body;
147              
148             $HTML_Parser->{mined_candidates} = [];
149             $HTML_Parser->{text_length} = 0;
150             $HTML_Parser->{state_information}=$options; # Not pretty, but TODO: improve
151             $HTML_Parser->parse($body);
152             $HTML_Parser->eof();
153             return ($HTML_Parser->{mined_candidates},$HTML_Parser->{text_length});
154             }
155              
156             sub _text_event_handler {
157             my ($self,$body,$offset) = @_;
158             my $state = $self->{state_information};
159             # Skip if in a silly element:
160             if (($self->{noparse} && ($self->{noparse}>0)) || ($body !~ /\w/)) {
161             return;
162             }
163             # Otherwise - discover concepts and annotate!
164             my $time = time();
165             my ($mined_candidates,$chunk_length) =
166             mine_candidates_text({db=>$state->{db},
167             nolink=>$state->{nolink},
168             body=>$body,
169             domain=>$state->{domain},
170             first_word_cache=>$state->{first_word_cache},
171             class=>$state->{class}});
172             #printf STDERR " --processed textual chunk in %.3f seconds\n",time()-$time;
173             foreach my $candidate(@$mined_candidates) {
174             $candidate->{offset_begin}+=$offset;
175             $candidate->{offset_end}+=$offset;
176             }
177             push @{$self->{mined_candidates}}, @$mined_candidates;
178             $self->{text_length} += $chunk_length;
179             }
180              
181             # Core Data Mining routine - inspects plain-text strings
182             # returns back the matches and position of disambiguated links of the supplied text.
183             sub mine_candidates_text {
184             my ($options) = @_;
185             my ($db,$domain,$body,$syns,$targetid,$nolink,$class,$first_word_cache) =
186             map {$options->{$_}} qw(db domain body nolink targetid nolink class first_word_cache);
187              
188             # TODO: We have to make a distinction between "defined concepts" and "candidate concepts" here.
189             # Probably just based on whether we find a URL or not?
190             my @matches;
191             my %termlist = ();
192             my $offset=0;
193             my $text_length = length($body);
194             # Read one (2+ letter) word at a time
195             my $concept_word_rex = $NNexus::Morphology::concept_word_rex;
196             CONCEPT_TRAVERSAL:
197             while ($body =~ s/^(.*?)($concept_word_rex)//s) {
198             $offset += length($1);
199             my $offset_begin = $offset;
200             $offset += length($2);
201             my $offset_end = $offset;
202             my $word = lc($2); # lower-case to match stopwords
203             # Use a cache for first-word lookups, with the dual-role of a blacklist.
204             my $cached = $first_word_cache->{$word};
205             my @candidates=();
206             if (! (ref $cached )) {
207             # Normalize word
208             my $norm_word = normalize_word($word);
209             # get all possible candidates for both posessive and plural forms of $word
210             @candidates = $db->select_firstword_matches($norm_word);
211             # Cache the candidates:
212             my $saved_candidates = clone_concepts(\@candidates); # Clone the candidates
213             $first_word_cache->{$word} = $saved_candidates;
214             $first_word_cache->{$norm_word} = $saved_candidates;
215             } else {
216             #Cached, clone into a new array
217             @candidates = @{ clone_concepts($cached)};
218             }
219             next CONCEPT_TRAVERSAL unless @candidates; # if there are no candidates skip the word
220             # Split tailwords into an array
221             foreach my $c(@candidates) {
222             $c->{tailwords} = [split(/\s+/,$c->{tailwords}||'')]; }
223             my $inner_offset = 0;
224             my $match_offset = 0; # Record the offset of the current longest match, add to end_position when finalized
225             my $inner_body = $body; # A copy of the text to munge around while searching.
226             my @inner_matches = grep {@{$_->{tailwords}} == 0} @candidates; # Record the current longest matches here
227             # Longest-match:
228             # as long as:
229             # - there is leftover tail in some candidate(s)
230             @candidates = grep {@{$_->{tailwords}} > 0} @candidates;
231             CANDIDATE_LOOP:
232             while (@candidates) {
233             # - AND there are leftover words in current phrase
234             if ($inner_body =~ /^(\s+)($concept_word_rex)/s) {
235             # then: pull and compare next word, reduce text and tailwords
236             # 1. Pull next.
237             my $step_offset = length($1) + length($2);
238             $inner_offset += $step_offset;
239             my $next_word = normalize_word($2);
240             # 2. Filter for applicable candidates
241             my @inner_candidates = grep { $_->{tailwords}->[0] eq $next_word } @candidates;
242             if (@inner_candidates) {
243             # We have indeed a longer match, remove the first tailword
244             shift @{$_->{tailwords}} foreach @inner_candidates;
245             # candidates for next iteration must have leftover tail words
246             @candidates = grep {@{$_->{tailwords}} > 0} @inner_candidates;
247             # record intermediate longest matches - the current empty tailwords
248             my @step_matches = grep {@{$_->{tailwords}} == 0} @inner_candidates;
249             if (@step_matches) {
250             @inner_matches = @step_matches;
251             $match_offset = $inner_offset;
252             }
253             # Move $step_offset right the text
254             substr($inner_body,0,$step_offset)='';
255             } else {last CANDIDATE_LOOP;} # Last here as well.
256             } else {last CANDIDATE_LOOP;} # Otherwise we are done
257             }
258             # In the end, do we have one or more matches?
259             if (@inner_matches > 0) {
260             # Yes!
261             # merge multi-links into single match entry
262             # multi-link = same concept, category and domain, different URLs
263             # CARE: careful not to confuse with cases of different categories, which need disambiguation
264             my @merged_matches;
265             #print STDERR Dumper(\@inner_matches);
266             while (@inner_matches) {
267             my $match = shift @inner_matches;
268             my $category = $match->{category};
269             my $domain = $match->{domain};
270             my @multilinks = map {$_->{link}}
271             grep {($_->{category} eq $category) && ($_->{domain} eq $domain)} @inner_matches;
272             @inner_matches = grep {($_->{category} ne $category) || ($_->{domain} ne $domain)} @inner_matches;
273             if (@multilinks>0) {
274             unshift @multilinks, $match->{link};
275             $match->{multilinks} = \@multilinks;
276             }
277             push @merged_matches, $match;
278             }
279             @inner_matches = @merged_matches;
280             # Record offsets:
281             $offset += $match_offset;
282             $offset_end += $match_offset;
283             foreach my $match(@inner_matches) {
284             $match->{offset_begin} = $offset_begin;
285             $match->{offset_end} = $offset_end;
286             delete $match->{tailwords};
287             }
288             # And push to main matches array
289             push @matches, @inner_matches;
290             # And move the text forward with the match_offset
291             substr($body,0,$match_offset)='' if $match_offset;
292             } else { next CONCEPT_TRAVERSAL; } # If not, we just move on to the next word
293             }
294             return (\@matches,$text_length);
295             }
296              
297             1;
298              
299             __END__