File Coverage

blib/lib/NNexus/Discover.pm
Criterion Covered Total %
statement 173 180 96.1
branch 39 50 78.0
condition 12 15 80.0
subroutine 15 15 100.0
pod 0 3 0.0
total 239 263 90.8


line stmt bran cond sub pod time code
1             # /=====================================================================\ #
2             # | NNexus Autolinker | #
3             # | Concept Discovery Module | #
4             # |=====================================================================| #
5             # | Part of the Planetary project: http://trac.mathweb.org/planetary | #
6             # | Research software, produced as part of work done by: | #
7             # | the KWARC group at Jacobs University | #
8             # | Copyright (c) 2012 | #
9             # | Released under the MIT License (MIT) | #
10             # |---------------------------------------------------------------------| #
11             # | Adapted from the original NNexus code by | #
12             # | James Gardner and Aaron Krowne | #
13             # |---------------------------------------------------------------------| #
14             # | Deyan Ginev #_# | #
15             # | http://kwarc.info/people/dginev (o o) | #
16             # \=========================================================ooo==U==ooo=/ #
17             package NNexus::Discover;
18 5     5   21 use strict;
  5         8  
  5         174  
19 5     5   20 use warnings;
  5         5  
  5         115  
20              
21 5     5   19 use Exporter;
  5         23  
  5         359  
22             our @ISA = qw(Exporter);
23             our @EXPORT_OK = qw(mine_candidates);
24 5     5   21 use Encode qw/is_utf8/;
  5         10  
  5         233  
25 5     5   21 use utf8;
  5         24  
  5         41  
26 5     5   97 use Data::Dumper;
  5         5  
  5         236  
27 5     5   19 use Time::HiRes qw ( time alarm sleep );
  5         7  
  5         42  
28              
29 5     5   2777 use NNexus::StopWordList qw(stop_words_ref);
  5         6  
  5         261  
30 5     5   1322 use NNexus::Morphology qw(normalize_word);
  5         10  
  5         386  
31 5     5   2200 use NNexus::Concepts qw(clone_concepts);
  5         13  
  5         413  
32              
33 5     5   3080 use HTML::Parser;
  5         32068  
  5         8442  
34              
35             # Reusable parser object (TODO: What happens if we thread/fork ?)
36             our $HTML_Parser =
37             HTML::Parser->new(
38             'api_version' => 3,
39             'start_h' => [sub {
40             my ($self,$tagname,$attr)=@_;
41             if ($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/ ||
42             (($tagname eq 'span') && $attr->{class} && ($attr->{class} =~ 'nolink'))) {
43             $self->{fresh_skip}=1;
44             $self->{noparse}++;
45             } else {
46             $self->{fresh_skip}=0;
47             }
48             } , 'self, tagname, attr'],
49             'end_h' => [sub {
50             my ($self,$tagname)=@_;
51             if (($tagname=~/^(head|style|title|script|xmp|iframe|code|math|svg|sup|a|(h\d+))$/) ||
52             (((length($tagname)==0)||($tagname eq 'span')) && ($self->{fresh_skip} == 1))) {
53             $self->{noparse}--;
54             $self->{fresh_skip}=0;
55             }
56             }, 'self,tagname'],
57             'text_h' => [\&_text_event_handler, 'self,text,offset']
58             );
59             $HTML_Parser->unbroken_text;
60             $HTML_Parser->xml_mode;
61             $HTML_Parser->attr_encoded(1);
62             $HTML_Parser->empty_element_tags(1);
63              
64             # Prepare cache for first-word concept lookup
65             our $first_word_cache_template = {map { ($_,[]) } @{stop_words_ref()}};
66             sub mine_candidates {
67 8     8 0 45 my (%options) = @_;
68             # State: We need a db object with a properly set database
69             # Input: We need a string representing the (HTML) body of the chunk we're
70             # mining on, as well as its URL.
71             # Optional: Deprecated details such as 'domain' or 'format'.
72             # Interesting: allow 'nolink' again?
73 48         73 my ($db,$format,$body,$nolink,$url,$domain) =
74 8         20 map {$options{$_}} qw(db format body nolink url domain);
75 8 50       35 die "The db key is a mandatory parameter for mine_candidates!\n" unless ref $db; # TODO: Maybe raise a better error?
76 8 50       22 $format = 'html' unless defined $format;
77 8 50       19 return ([],0) unless $body;
78             # Prepare data, if we have a URL:
79 8         11 my $objectid; # undefined if we don't have a URL, we only do MoC for named resources
80 8 100       20 if ($url) {
81 1   50     7 my $object = $db->select_object_by(url=>$url) || {};
82 1   50     5 $objectid = $object->{objectid} || -1;
83 1 50       3 $domain = $object->{domain} unless defined $domain;
84             # If objectid is -1 , we will also need to add_object on the url
85 1 50       3 if ($objectid == -1) {
86             # TODO: Extract the domain from the URL, this is unreliable
87 1         6 $objectid = $db->add_object_by(url=>$options{'url'},domain=>$domain);
88             } else {
89             # If already known, flush the links_cache for this object
90 0         0 $db->delete_linkscache_by(objectid=>$objectid);
91             }
92             }
93             # Keep a cache of first words, that will simultaneously act as a blacklist.
94             # TODO: Incorporate the single words from 'nolink'
95             # Experiment: Use a global first_word cache
96 8         18 $options{first_word_cache} = $first_word_cache_template; # Default are stopwords, keep a global cache
97             # $options{first_word_cache} = { %$first_word_cache_template }; # Default are stopwords
98             # Always return an embedded annotation with links, as well as a stand-off mined_canidates hash, containing the individual concepts with pointers.
99 8         11 my $time;
100 8 50       20 if ($options{verbosity}) {
101 0         0 $time = time();
102             }
103 8         15 my $mined_candidates=[];
104 8         10 my $text_length=0;
105 8 100       27 if ($format eq 'html') {
    50          
106 2         8 ($mined_candidates,$text_length) = mine_candidates_html(\%options);
107             } elsif ($format eq 'text') {
108 6         21 ($mined_candidates,$text_length) = mine_candidates_text(\%options);
109             } else {
110 0         0 print STDERR "Error: Unrecognized input format for auto-linking.\n";
111             }
112             # Only mark-up first found candidate, unless requested otherwise
113 8         13 my @uniq_candidates;
114 8         30 while (@$mined_candidates) {
115 19         19 my $candidate = shift @$mined_candidates;
116 19         25 my $concept = $candidate->{concept};
117 19         21 my $link = $candidate->{link};
118 19         21 my $category = $candidate->{category};
119 19 100 100     28 @$mined_candidates = grep {($_->{concept} ne $concept) || ($_->{link} ne $link) || ($_->{category} ne $category)} @$mined_candidates;
  43         111  
120 19         40 push @uniq_candidates, $candidate;
121             }
122             # Also, don't add self-links, coming from $url
123 8 100       21 @uniq_candidates = grep {$_->{link} ne $url} @uniq_candidates if $url;
  1         4  
124 8         16 @$mined_candidates = @uniq_candidates;
125              
126             #TODO: When do we deal with the nolink settings?
127             # next if (inset($concept,@$nolink));
128 8 50       23 if ($options{verbosity}) {
129 0         0 printf STDERR " Discovered %d concepts in %.3f seconds.\n",scalar(@uniq_candidates),time()-$time;
130             }
131              
132             # Update linkscache:
133 8 100       20 if ($objectid) {
134             $db->add_linkscache_by(objectid=>$objectid,conceptid=>$_->{conceptid})
135 1         6 foreach (@$mined_candidates);
136             }
137 8         56 return ($mined_candidates,$text_length);
138             }
139              
140             sub mine_candidates_html {
141 2     2 0 5 my ($options) = @_;
142 2         4 my ($db,$domain,$body,$syns,$targetid,$class) = map {$options->{$_}} qw(db domain body nolink targetid class);
  12         20  
143             # Current HTML Parsing strategy - fire events for all HTML tags and explicitly skip over tags that
144             # won't be of interest. We need to autolink in all textual elements.
145             # TODO: Handle MathML better
146 2 50       7 return ([],0) unless $body;
147              
148 2         12 $HTML_Parser->{mined_candidates} = [];
149 2         7 $HTML_Parser->{text_length} = 0;
150 2         3 $HTML_Parser->{state_information}=$options; # Not pretty, but TODO: improve
151 2         33 $HTML_Parser->parse($body);
152 2         10 $HTML_Parser->eof();
153 2         11 return ($HTML_Parser->{mined_candidates},$HTML_Parser->{text_length});
154             }
155              
156             sub _text_event_handler {
157 665     665   783 my ($self,$body,$offset) = @_;
158 665         730 my $state = $self->{state_information};
159             # Skip if in a silly element:
160 665 100 66     2898 if (($self->{noparse} && ($self->{noparse}>0)) || ($body !~ /\w/)) {
      100        
161 580         2301 return;
162             }
163             # Otherwise - discover concepts and annotate!
164 85         188 my $time = time();
165 85         538 my ($mined_candidates,$chunk_length) =
166             mine_candidates_text({db=>$state->{db},
167             nolink=>$state->{nolink},
168             body=>$body,
169             domain=>$state->{domain},
170             first_word_cache=>$state->{first_word_cache},
171             class=>$state->{class}});
172             #printf STDERR " --processed textual chunk in %.3f seconds\n",time()-$time;
173 85         270 foreach my $candidate(@$mined_candidates) {
174 2         6 $candidate->{offset_begin}+=$offset;
175 2         6 $candidate->{offset_end}+=$offset;
176             }
177 85         68 push @{$self->{mined_candidates}}, @$mined_candidates;
  85         153  
178 85         674 $self->{text_length} += $chunk_length;
179             }
180              
181             # Core Data Mining routine - inspects plain-text strings
182             # returns back the matches and position of disambiguated links of the supplied text.
183             sub mine_candidates_text {
184 91     91 0 103 my ($options) = @_;
185 728         942 my ($db,$domain,$body,$syns,$targetid,$nolink,$class,$first_word_cache) =
186 91         143 map {$options->{$_}} qw(db domain body nolink targetid nolink class first_word_cache);
187              
188             # TODO: We have to make a distinction between "defined concepts" and "candidate concepts" here.
189             # Probably just based on whether we find a URL or not?
190 91         132 my @matches;
191 91         160 my %termlist = ();
192 91         94 my $offset=0;
193 91         100 my $text_length = length($body);
194             # Read one (2+ letter) word at a time
195 91         96 my $concept_word_rex = $NNexus::Morphology::concept_word_rex;
196             CONCEPT_TRAVERSAL:
197 91         961 while ($body =~ s/^(.*?)($concept_word_rex)//s) {
198 401         812 $offset += length($1);
199 401         389 my $offset_begin = $offset;
200 401         453 $offset += length($2);
201 401         299 my $offset_end = $offset;
202 401         577 my $word = lc($2); # lower-case to match stopwords
203             # Use a cache for first-word lookups, with the dual-role of a blacklist.
204 401         622 my $cached = $first_word_cache->{$word};
205 401         515 my @candidates=();
206 401 100       708 if (! (ref $cached )) {
207             # Normalize word
208 134         344 my $norm_word = normalize_word($word);
209             # get all possible candidates for both posessive and plural forms of $word
210 134         454 @candidates = $db->select_firstword_matches($norm_word);
211             # Cache the candidates:
212 134         398 my $saved_candidates = clone_concepts(\@candidates); # Clone the candidates
213 134         271 $first_word_cache->{$word} = $saved_candidates;
214 134         206 $first_word_cache->{$norm_word} = $saved_candidates;
215             } else {
216             #Cached, clone into a new array
217 267         220 @candidates = @{ clone_concepts($cached)};
  267         577  
218             }
219 401 100       3645 next CONCEPT_TRAVERSAL unless @candidates; # if there are no candidates skip the word
220             # Split tailwords into an array
221 14         25 foreach my $c(@candidates) {
222 182   100     477 $c->{tailwords} = [split(/\s+/,$c->{tailwords}||'')]; }
223 14         20 my $inner_offset = 0;
224 14         17 my $match_offset = 0; # Record the offset of the current longest match, add to end_position when finalized
225 14         24 my $inner_body = $body; # A copy of the text to munge around while searching.
226 14         25 my @inner_matches = grep {@{$_->{tailwords}} == 0} @candidates; # Record the current longest matches here
  182         90  
  182         204  
227             # Longest-match:
228             # as long as:
229             # - there is leftover tail in some candidate(s)
230 14         19 @candidates = grep {@{$_->{tailwords}} > 0} @candidates;
  182         99  
  182         217  
231             CANDIDATE_LOOP:
232 14         36 while (@candidates) {
233             # - AND there are leftover words in current phrase
234 14 100       252 if ($inner_body =~ /^(\s+)($concept_word_rex)/s) {
  4         9  
235             # then: pull and compare next word, reduce text and tailwords
236             # 1. Pull next.
237 10         35 my $step_offset = length($1) + length($2);
238 10         15 $inner_offset += $step_offset;
239 10         40 my $next_word = normalize_word($2);
240             # 2. Filter for applicable candidates
241 10         20 my @inner_candidates = grep { $_->{tailwords}->[0] eq $next_word } @candidates;
  174         207  
242 10 100       24 if (@inner_candidates) {
  4         10  
243             # We have indeed a longer match, remove the first tailword
244 6         14 shift @{$_->{tailwords}} foreach @inner_candidates;
  16         36  
245             # candidates for next iteration must have leftover tail words
246 6         14 @candidates = grep {@{$_->{tailwords}} > 0} @inner_candidates;
  16         13  
  16         125  
247             # record intermediate longest matches - the current empty tailwords
248 6         12 my @step_matches = grep {@{$_->{tailwords}} == 0} @inner_candidates;
  16         14  
  16         32  
249 6 50       17 if (@step_matches) {
250 6         13 @inner_matches = @step_matches;
251 6         12 $match_offset = $inner_offset;
252             }
253             # Move $step_offset right the text
254 6         35 substr($inner_body,0,$step_offset)='';
255             } else {last CANDIDATE_LOOP;} # Last here as well.
256             } else {last CANDIDATE_LOOP;} # Otherwise we are done
257             }
258             # In the end, do we have one or more matches?
259 14 100       32 if (@inner_matches > 0) {
260             # Yes!
261             # merge multi-links into single match entry
262             # multi-link = same concept, category and domain, different URLs
263             # CARE: careful not to confuse with cases of different categories, which need disambiguation
264 10         12 my @merged_matches;
265             #print STDERR Dumper(\@inner_matches);
266 10         23 while (@inner_matches) {
267 20         29 my $match = shift @inner_matches;
268 20         26 my $category = $match->{category};
269 20         21 my $domain = $match->{domain};
270 0 100       0 my @multilinks = map {$_->{link}}
  22         42  
271 20         25 grep {($_->{category} eq $category) && ($_->{domain} eq $domain)} @inner_matches;
272 20 100       23 @inner_matches = grep {($_->{category} ne $category) || ($_->{domain} ne $domain)} @inner_matches;
  22         46  
273 20 50       32 if (@multilinks>0) {
274 0         0 unshift @multilinks, $match->{link};
275 0         0 $match->{multilinks} = \@multilinks;
276             }
277 20         42 push @merged_matches, $match;
278             }
279 10         15 @inner_matches = @merged_matches;
280             # Record offsets:
281 10         23 $offset += $match_offset;
282 10         9 $offset_end += $match_offset;
283 10         17 foreach my $match(@inner_matches) {
284 20         32 $match->{offset_begin} = $offset_begin;
285 20         27 $match->{offset_end} = $offset_end;
286 20         37 delete $match->{tailwords};
287             }
288             # And push to main matches array
289 10         21 push @matches, @inner_matches;
290             # And move the text forward with the match_offset
291 10 100       150 substr($body,0,$match_offset)='' if $match_offset;
292 4         42 } else { next CONCEPT_TRAVERSAL; } # If not, we just move on to the next word
293             }
294 91         300 return (\@matches,$text_length);
295             }
296              
297             1;
298              
299             __END__