File Coverage

blib/lib/Lingua/NL/FactoidExtractor.pm
Criterion Covered Total %
statement 6 210 2.8
branch 0 100 0.0
condition 0 72 0.0
subroutine 2 8 25.0
pod 0 1 0.0
total 8 391 2.0


line stmt bran cond sub pod time code
1             package Lingua::NL::FactoidExtractor;
2              
3 1     1   20744 use 5.008007;
  1         3  
  1         30  
4 1     1   5 use strict;
  1         1  
  1         2886  
5             require Exporter;
6              
7             our @ISA = qw(Exporter);
8             our @EXPORT = qw(extract);
9             our $VERSION = '1.4';
10              
11             #Declare global variables
12             our @factoids;
13              
14             my @functionwords=("alle","alles","andere","anderen","beide","dat","deze","dezelfde","die","dingen","dit","een","geen","hem","hen","het","hij","ieder","iedereen","iemand","iets","ik","je","jij","meer","men","mensen","niemand","niets","ons","sommige","sommigen","u","veel","vele","velen","waaraan","waaronder","wat","we","weinig","welke","wie","wij","ze","zich","zichzelf","zij","zijn","zo","zoveel");
15             my %functionwords = map { $_ => 1 } @functionwords;
16             # We do not save factoids of which the subject is only a pronoun
17              
18             # Package variables needed for reading the xml input
19             my($level, %rel, %word, %root, %level, %frame, %cat, %lcat, %head, %sc, %index, %begin, %wh, %ids_for_index, @ids, %clauses_done);
20             my $sentence_initial; # boolean
21             my $doctitle;
22              
23             sub extract ($$) {
24 0     0 0   my ($inputfile,$verbose) = @_;
25              
26 0           undef @factoids;
27 0           undef %rel; undef %word; undef %root; undef %level; undef %frame; undef %cat; undef %lcat; undef %head; undef %sc; undef %index; undef %begin; undef %wh;
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
28 0           undef %ids_for_index; undef @ids; undef %clauses_done;
  0            
  0            
29            
30 0           $level = 0;
31 0           $sentence_initial = 0;
32 0           $doctitle = "";
33 0           print STDERR "Parsing $inputfile...\n";
34            
35 0 0         open (ALP,"< $inputfile") or die "$! $inputfile\n";
36              
37 0           while (my $line=) {
38 0 0         if ($line =~ /
39 0           $level++;
40             }
41            
42 0           my $id="";
43 0 0         if ($line =~ / id=\"([0-9]+)\" /) {
44 0           $id=$1;
45             }
46 0 0         if ($line =~ / begin=\"([0-9]+)\" /) {
47 0           my $begin = $1;
48 0           $begin{$id} = $begin;
49             }
50 0 0         if ($line =~ / rel=\"([^\"]+)\"/) {
51 0           my $rel=$1;
52 0           $rel{$id} = $rel;
53             }
54 0 0         if ($line =~ / frame=\"([^\"]+)\"/) {
55 0           my $frame=$1;
56 0           $frame{$id} = $frame;
57             }
58              
59 0 0         if ($line =~ / cat=\"([^\"]+)\"/) {
60 0           my $cat=$1;
61 0           $cat{$id} = $cat;
62             }
63 0 0         if ($line =~ / lcat=\"([^\"]+)\"/) {
64 0           my $lcat=$1;
65 0           $lcat{$id} = $lcat;
66             }
67 0 0         if ($line =~ / sc=\"([^\"]+)\"/) {
68 0           my $sc=$1;
69 0           $sc{$id} = $sc;
70             # can be copula, passive, etc.
71             }
72 0 0         if ($line =~ / word=\"([^\"]+?)[\.\,]?\"/) {
73 0           my $word=$1;
74 0           $word{$id} = $word;
75             }
76 0 0         if ($line =~ / root=\"([^\"]+?)[\.\,]?\"/) {
77 0           my $root=$1;
78 0           $root{$id} = $root;
79             }
80 0 0         if ($line =~ / wh=\"([^\"]+)\"/) {
81 0           my $wh=$1;
82 0           $wh{$id} = $wh;
83             }
84 0 0         if ($line =~ / index=\"([^\"]+)\"/) {
85 0           my $index=$1;
86 0           $index{$id} = $index;
87 0           push(@{$ids_for_index{$index}},$id);
  0            
88             }
89              
90 0           $level{$id} = $level;
91 0 0 0       if ($line =~ /<\/node>/ or $line =~ /\/>/) {
92 0           $level--;
93             }
94            
95 0 0         if ($line =~ /(.*)<\/sentence>/) {
96 0 0         print "\# $1\n" if ($verbose);
97             }
98             }
99 0           close(ALP);
100              
101 0           @ids = sort {$a <=> $b} keys %rel;
  0            
102              
103 0           my $sentence_initial = 0; # boolean
104              
105 0           foreach my $id (sort {$a <=> $b} keys %rel) {
  0            
106              
107 0 0 0       if (defined $cat{$id} && $cat{$id} =~ /^(smain|ssub|sv1)$/) {
108 0 0         if (not defined $clauses_done{$id}) {
109 0           my ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,$1,"","","");
110             # return subject (and its id) of main clause because we need it in embedded clause vc
111             # (either as subject or as object in the case of a passive main clause)
112 0           while (defined $new_head_id) {
113 0 0         if (not defined $clauses_done{$new_head_id}) {
114 0           $id = $new_head_id;
115 0 0         if ($voice eq "passive") {
116 0           ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,"",$subject);
117             # if passive, then store the subject of the main clause in the object slot of the vc
118             } else {
119 0           ($new_head_id,$subj_id,$subject,$voice) = &_generate_factoid($id,"vc/body",$subj_id,$subject,"");
120             }
121             }
122             }
123             }
124             }
125             }
126              
127 0           my $factoids = join("\n",@factoids);
128 0           return $factoids;
129             };
130              
131             sub _generate_factoid($$) {
132 0     0     my ($clause_id,$clausetype,$subj_id,$subject,$object) = @_;
133            
134 0           $clauses_done{$clause_id} = 1;
135              
136 0           my $verb="";
137 0           my @modifiers;
138            
139             my $new_head_id;
140             # if there is a vc or body in the clause then this is an embedded factoid
141             # with the same subject as the main clause
142              
143 0           my @headed_ids = _get_headed_ids($clause_id);
144             #print STDERR "headed ids for clause $clause_id: @headed_ids\n";
145 0           my $voice="active";
146 0           my $verb_type="";
147 0           my $tuple_type="factoid";
148 0           my $obj_id;
149            
150 0           my $info = "";
151              
152 0           foreach my $id (@headed_ids) {
153 0 0 0       if (defined $sc{$id} && $sc{$id} eq "passive") {
154 0           $voice = "passive";
155             }
156 0           my $rel = $rel{$id};
157 0           my $frame="";
158 0 0         if (defined $frame{$id}) {
159 0           $frame = $frame{$id};
160             }
161            
162 0 0 0       if ($rel eq "hd" && $verb eq "" && $frame =~ /verb/) {
    0 0        
    0 0        
    0 0        
    0          
163             # if the verb slot was not yet filled with a main verb
164             #$verb = "hd:".$word{$id};
165             #print STDERR "hd: $id\n";
166 0           $verb = "hd:".$root{$id};
167 0           $verb_type = $sc{$id};
168             # use root (lemma) of verb
169             } elsif ($rel eq "vc" or $rel eq "body") {
170             # get the underlying factoid recursively by returning the current id as new head id
171 0           $new_head_id = $id;
172             } elsif ($rel eq "su" && $subject eq "") {
173             # if the subject slot was not yet filled with the subject of the main clause
174 0           $subject = "su:".&_get_constituent($id);
175 0           $subj_id = $id;
176 0 0         if ($begin{$id} eq "0") {
177 0           $sentence_initial = 1;
178             }
179             } elsif ($rel =~ /^(obj1|obj2|predc)$/) {
180 0           my $rel = $1;
181 0 0 0       if ($object =~ /su:/ && $rel eq "obj1") {
182             # if the object slot already contains the subject of the main clause (in case of passive voice) don't add it again
183             } else {
184 0           $object .= "$rel:".&_get_constituent($id);
185 0           $obj_id = $id;
186             }
187             } elsif ($rel =~ /^(mod|pc|predm|ld)$/) {
188 0           my $modifier = "$1:".&_get_constituent($id);
189             #print STDERR "Mod: $modifier\n";
190 0           push (@modifiers,$modifier);
191             }
192            
193             }
194            
195             # transform passive clauses
196 0 0 0       if ($subject eq "" && $object =~ /su:/) {
197 0           my $m=0;
198 0           $info .= "passive-to-active ";
199 0           foreach my $modifier (@modifiers) {
200 0 0         if ($modifier =~ /door (.+)$/) {
201 0           $subject = $1;
202 0           splice(@modifiers,$m,1);
203 0           $info .= "modifier-to-subject ";
204             }
205 0           $m++;
206             }
207 0 0         if ($subject eq "") {
208             # if none of the modifiers starts with 'door'
209 0           $subject = "MEN";
210             }
211             }
212            
213             # transform double object constructions to a factoid and a definition
214 0 0         if ($object =~ s/([a-z0-9]+):(.+) ([a-z0-9]+):(.+)/$1:$2|$3:$4/) {
215             # double object construction, e.g. "het wordt het Silicon Valley van India genoemd"
216 0           $info .= "double-object-to-definition ";
217 0           $tuple_type = "definition";
218 0           my $definition = "<$tuple_type id='$clause_id' subj='$1:$2' verb='IS' obj='$3:$4' mods='' topic='$doctitle'> # $info";
219 0           $definition = &_clean_up($definition);
220 0           push (@factoids,$definition);
221 0           $tuple_type = "factoid";
222             }
223            
224             # transform copular constructions without modifiers to definitions
225 0 0 0       if ($verb_type eq "copula" && $subject =~ /\S/ && $object =~ /\S/){
      0        
226 0           $verb = "IS";
227 0           $tuple_type = "definition";
228 0           $info .= "copula-to-definition ";
229             }
230            
231             # resolve relative pronouns: replace die/dat/wat by the most recent NP.
232 0 0 0       if ($subject =~ /:(die|dat|wat) *$/i && defined($subj_id)) {
233             #print STDERR "Get recent cat id for subject id $subj_id ($subject)\n";
234 0           my $head_id = &_get_recent_cat_id($subj_id,"np");
235 0           $subject = "su:".&_get_constituent($head_id);
236 0           $info .= "pron-to-np ";
237             }
238 0 0 0       if ($object =~ /:(die|dat|wat) *$/i && defined($obj_id)) {
239 0           my $head_id = &_get_recent_cat_id($obj_id,"np");
240 0           $object = "obj:".&_get_constituent($head_id);
241 0           $info .= "pron-to-np ";
242             }
243            
244              
245 0           $subject = &_clean_up($subject);
246 0           $object = &_clean_up($object);
247            
248 0 0         if ($object =~ s/ ([0-9]{4})$//) {
249 0           push(@modifiers,$1);
250             # if the object ends in a year then move it to the modifiers
251             }
252 0           my $modifiers = join("|",@modifiers);
253              
254 0 0 0       if ($sentence_initial && $subject =~ /^(de|het|een|die|dat|deze|dit|alle|andere|dezelfde|geen|ieder|meer|veel|vele|weinig|welke|zoveel) /i) {
255 0           $subject = lcfirst($subject);
256             # at the beginning of a sentence, lowercase determiners
257             }
258 0           my $factoid = "<$tuple_type id='$clause_id' subj='$subject' verb='$verb' obj='$object' mods='$modifiers' topic='$doctitle'> # $info";
259 0           $factoid = &_clean_up($factoid);
260            
261             #print STDERR "Verb type: $verb_type\n";
262 0 0 0       if ($verb eq "" or (($object eq "") && (not defined $modifiers[0]) && ($verb_type =~ /(aux|passive)/))) {
      0        
      0        
263             # throw away empty passives for which the sub clause has been raised, e.g. ("Dit rijk wordt")
264             } else {
265 0           push(@factoids,$factoid);
266             }
267 0           return ($new_head_id,$subj_id,"su:$subject",$voice);
268             }
269              
270             sub _get_constituent($) {
271 0     0     my ($start_id) = @_;
272 0           my $constituent = "";
273 0 0 0       if (not defined $cat{$start_id} && not defined $lcat{$start_id} && defined $index{$start_id}) {
      0        
274            
275             # find the constituent that has the same index
276 0 0         if (defined $index{$start_id}) {
277 0           my $index = $index{$start_id};
278 0           foreach my $index_id (@{$ids_for_index{$index}}) {
  0            
279 0 0 0       if (defined $cat{$index_id} or defined $lcat{$index_id}) {
280 0           $start_id = $index_id;
281 0           last;
282             }
283             }
284             }
285             }
286 0           my $rellevel = $level{$start_id};
287 0 0         $constituent .= "$word{$start_id} " if (defined $word{$start_id});
288 0           my $id=$start_id;
289 0           $id++;
290 0   0       while ($id <= $ids[-1] && $level{$id} > $rellevel) {
291 0 0         last if ($rel{$id} eq "rhd");
292 0 0         $constituent .= "$word{$id} " if (defined $word{$id});
293 0           $id++;
294             }
295 0           return $constituent;
296             }
297              
298             sub _get_recent_cat_id($$) {
299 0     0     my ($id,$search_cat) = @_;
300 0           my $head_id = $id;
301 0           my $cat_of_head_id="";
302 0 0         if (defined $cat{$head_id}) {
    0          
303 0           $cat_of_head_id = $cat{$head_id};
304             } elsif (defined $lcat{$head_id}) {
305 0           $cat_of_head_id = $lcat{$head_id};
306             }
307             #print STDERR "Head id: $head_id, Search cat: $search_cat\n";
308 0   0       while ($cat_of_head_id ne $search_cat && $head_id > 0) {
309 0           $head_id--;
310             #print STDERR "Head id: $head_id\n";
311             }
312 0 0 0       if ($head_id == 0 && $cat{$head_id} ne $search_cat){
313             # if no note of type search_cat was found before then the original id is returned
314             # (for example, when a sentence starts with a pronoun, there is no preceding NP)
315 0           return $id;
316             }
317 0           return $head_id;
318             }
319              
320             sub _get_headed_ids($) {
321 0     0     my ($head_id) = @_;
322 0           my $headlevel = $level{$head_id};
323 0           my @headed_ids;
324 0           my $id = $head_id;
325 0           $id++;
326 0 0         push(@headed_ids,$id) if ($level{$id} == $headlevel+1);
327 0   0       while ($id < $ids[-1] && $level{$id} > $headlevel) {
328 0           $id++;
329 0 0         push(@headed_ids,$id) if ($level{$id} == $headlevel+1);
330             }
331 0           return @headed_ids;
332             }
333              
334             sub _clean_up($) {
335 0     0     my ($string) = @_;
336 0           $string =~ s/[a-z0-9]+: *//g;
337 0           $string =~ s/ +/ /g;
338 0           $string =~ s/^ //;
339 0           $string =~ s/[,.] *$//;
340 0           $string =~ s/=\' /=\'/g;
341 0           $string =~ s/ \' /\' /g;
342 0           $string =~ s/ \'>/\'>/g;
343 0           return $string;
344             }
345              
346              
347             1;
348             __END__