File Coverage

blib/lib/Lingua/YaTeA/TestifiedTermSet.pm

Criterion	Covered	Total	%
statement	88	139	63.3
branch	10	44	22.7
condition	4	12	33.3
subroutine	17	19	89.4
pod	10	12	83.3
total	129	226	57.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::YaTeA::TestifiedTermSet;
2	5			5		30	use strict;
	5					10
	5					128
3	5			5		22	use warnings;
	5					10
	5					116
4	5			5		1887	use Lingua::YaTeA::MultiWordTestifiedTerm;
	5					12
	5					46
5	5			5		1833	use Lingua::YaTeA::MonolexicalTestifiedTerm;
	5					11
	5					45
6	5			5		1901	use Lingua::YaTeA::TestifiedTermParser;
	5					12
	5					52
7	5			5		136	use UNIVERSAL;
	5					10
	5					101
8	5			5		119	use Scalar::Util qw(blessed);
	5					9
	5					6790
9
10							our $VERSION=$Lingua::YaTeA::VERSION;
11
12							sub new
13							{
14	2			2	1	5	my ($class) = @_;
15	2					4	my $this = {};
16	2					5	bless ($this,$class);
17	2					14	$this->{TESTIFIED_TERMS} = {};
18	2					11	$this->{LEXICON} = Lingua::YaTeA::Lexicon->new;
19	2					11	$this->{SOURCES} = [];
20	2					7	return $this;
21							}
22
23							sub addSubset
24							{
25	1			1	1	3	my ($this,$file_path,$filtering_lexicon_h,$sentence_boundary,$match_type,$tag_set) = @_;
26	1					1	my $line;
27
28
29	1					4	my $format = $this->testTerminologyFormat($file_path);
30	1					30	print STDERR "\ -". $file_path . " (format: ". $format . ")\n";
31
32	1	50				6	if($format eq "TTG")
33							{
34	1					4	$this->loadTTGformatTerminology($file_path,$filtering_lexicon_h,$sentence_boundary,$match_type,$tag_set);
35							}
36							else
37							{
38	0	0				0	if($format eq "PARSED")
39							{
40	0					0	$this->loadParsedTerminology($file_path,$filtering_lexicon_h,$match_type,$tag_set);
41							}
42							}
43
44							}
45
46							sub testTerminologyFormat
47							{
48	1			1	1	2	my ($this,$file_path) = @_;
49	1					32	warn "check $file_path\n";
50	1	50				10	my $fh = FileHandle->new("<$file_path") or die "\n******\nNo such file: $file_path\n******\n";
51	1					69	my $line;
52	1					22	while ($line= $fh->getline)
53							{
54	1	50				47	if($line =~ /^[^\t]+\t[^\t]+\t[^\t]+$/)
55							{
56	1					15	return "TTG";
57							}
58							else{ # format analyse parenthesee : a abandonner
59	0	0				0	if ($line =~ /^\( .+<=[HM]> /){
60	0					0	return "PARSED";
61							}
62							}
63
64							}
65	0					0	die "undefined terminology input format";
66
67							}
68
69							sub loadTTGformatTerminology
70							{
71	1			1	1	4	my ($this,$file_path,$filtering_lexicon_h,$sentence_boundary,$match_type,$tag_set) = @_;
72
73	1					6	my $fh = FileHandle->new("<$file_path");
74	1					61	my $word;
75							my $block;
76
77	1					7	local $/ = "\.\t". $sentence_boundary ."\t\.\n";
78
79	1					3	while (! $fh->eof)
80							{
81	1					40	$block = $fh->getline;
82	1					22	$this->buildTestifiedTerm($block,$sentence_boundary,$match_type,$filtering_lexicon_h,$file_path,$tag_set);
83							}
84
85							}
86
87							sub loadParsedTerminology
88							{
89	0			0	0	0	my ($this,$file_path,$filtering_lexicon_h,$match_type,$tag_set) = @_;
90
91	0					0	my $fh = FileHandle->new("<".$file_path);
92	0					0	print "open " . $file_path . "\n";
93	0					0	my $word;
94							my $line;
95
96	0					0	my $parser = Lingua::YaTeA::TestifiedTermParser->new();
97	0					0	$parser->YYData->{TTS} = $this;
98	0					0	$parser->YYData->{WORD} = '([^ <\t]+)';
99	0					0	$parser->YYData->{TAGSET} = $tag_set;
100	0					0	$parser->YYData->{MATCH} = $match_type;
101	0					0	$parser->YYData->{FH} = $fh;
102	0					0	$parser->YYData->{FILTERING_LEXICON} = $filtering_lexicon_h;
103	0					0	$parser->YYParse(yylex => \&Lingua::YaTeA::TestifiedTermParser::_Lexer, yyerror => \&Lingua::YaTeA::TestifiedTermParser::_Error #,yydebug=>1);
104							);
105							# while (! $fh->eof)
106							# {
107							# $line = $fh->getline;
108							# if (($line !~ /^\#/)&&($line !~ /^\s*$/)){ # if line not commented nor empty
109							# $this->parseTestifiedTerm($line,$match_type,$filtering_lexicon_h,$tag_set);
110							# }
111							# }
112							}
113
114
115							# sub parseTestifiedTerm
116							# {
117							# my ($this,$line,$match_type,$filtering_lexicon_h,$tag_set) = @_;
118							# my $testified_infos;
119							# my $testified;
120							# my $i;
121							# if($this->getTestifiedInfos(\$testified_infos,$line,$match_type,$filtering_lexicon_h,$tag_set) == 1)
122							# {
123							# if(scalar @{$testified_infos->{"WORDS"}} > 1)
124							# {
125							# $testified = Lingua::YaTeA::MultiWordTestifiedTerm->new($testified_infos->{"NUM_CONTENT_WORDS"},$testified_infos->{"WORDS"},$tag_set,$testified_infos->{"SOURCE"},$match_type);
126							# $testified->setForest($testified_infos->{"PARSE"});
127							# }
128							# else
129							# {
130							# if(scalar @{$testified_infos->{"WORDS"}} == 1)
131							# {
132							# $testified = Lingua::YaTeA::MonolexicalTestifiedTerm->new($testified_infos->{"NUM_CONTENT_WORDS"},$testified_infos->{"WORDS"},$tag_set,$testified_infos->{"SOURCE"},$match_type);
133							# }
134							# }
135							# }
136
137							# if((blessed($testified)) && ($testified->isa('Lingua::YaTeA::TestifiedTerm')))
138							# {
139							# $this->addTestified($testified);
140							# }
141
142							# }
143
144
145							sub getTestifiedInfos
146							{
147	0			0	0	0	my ($this,$testified_infos_r,$IF_a,$POS_a,$LF_a,$src,$lex_items_a,$match_type,$filtering_lexicon_h,$tag_set) = @_;
148	0					0	my @infos;
149							my $word;
150	0					0	my $item;
151	0					0	my $i;
152
153							# print STDERR "GTI: " . join(" ", @$IF_a) . "\n";
154	0					0	for ($i=0; $i < scalar @$IF_a; $i++)
155							{
156	0	0				0	if($match_type eq "loose") # look at IF or LF
157							{
158	0	0	0			0	if(
159							(!exists $filtering_lexicon_h->{lc($IF_a->[$i])})
160							&&
161							(!exists $filtering_lexicon_h->{lc($LF_a->[$i])})
162							)
163							{
164							# current word does not appear in the corpus : testified term won't be loaded
165	0					0	return 0;
166							}
167							}
168							else
169							{
170	0	0				0	if($match_type eq "strict") # look at IF and POS
171							{
172	0	0				0	if (!exists $filtering_lexicon_h->{lc($IF_a->[$i])."~".$POS_a->[$i]})
173							{
174							# current word does not appear in the corpus : testified term won't be loaded
175	0					0	return 0;
176							}
177
178							}
179							else
180							{
181							# default match: look at IF
182	0	0				0	if(!exists $filtering_lexicon_h->{lc($IF_a->[$i])})
183							{
184							# current word does not appear in the corpus : testified term won't be loaded
185	0					0	return 0;
186							}
187							}
188
189							}
190							}
191							# $$testified_infos_r->{"PARSE"} = $infos[0];
192							# print STDERR $$testified_infos_r->{"PARSE"} . "\n";
193	0					0	$$testified_infos_r->{"SOURCE"} = $infos[4];
194	0					0	for ($i=0; $i < scalar @$IF_a; $i++)
195							{
196	0					0	$word = $IF_a->[$i] . "\t" . $POS_a->[$i] . "\t" . $LF_a->[$i];
197	0					0	$item = $this->getLexicon->addOccurrence($word);
198	0					0	push @$lex_items_a, $item;
199	0	0				0	if ($tag_set->existTag('CANDIDATES',$item->getPOS))
200							{
201	0					0	$$testified_infos_r->{"NUM_CONTENT_WORDS"}++;
202							}
203	0					0	push @{$$testified_infos_r->{"WORDS"}}, $item;
	0					0
204							}
205
206	0					0	return 1;
207							}
208							# sub getTestifiedInfos
209							# {
210							# my ($this,$testified_infos_r,$IF_a,$POS_a,$LF_a,$src,$lex_items_a,$match_type,$filtering_lexicon_h,$tag_set) = @_;
211							# # my ($this,$testified_infos_r,$line,$match_type,$filtering_lexicon_h,$tag_set) = @_;
212							# my @infos;
213							# my @IF;
214							# my @LF;
215							# my @POS;
216							# my $word;
217							# my $item;
218							# my $i;
219							# chomp $line;
220							# @infos = split /\t/, $line;
221							# @IF = split / /,$infos[1];
222							# @LF = split / /,$infos[3];
223							# @POS = split / /,$infos[2];
224
225							# for ($i=0; $i < scalar @IF; $i++)
226							# {
227							# if($match_type eq "loose") # look at IF or LF
228							# {
229							# if(
230							# (!exists $filtering_lexicon_h->{lc($IF[$i])})
231							# &&
232							# (!exists $filtering_lexicon_h->{lc($LF[$i])})
233							# )
234							# {
235							# # current word does not appear in the corpus : testified term won't be loaded
236							# return;
237							# }
238							# }
239							# else
240							# {
241							# if($match_type eq "strict") # look at IF and POS
242							# {
243							# if (!exists $filtering_lexicon_h->{lc($IF[$i])."~".$POS[$i]})
244							# {
245							# # current word does not appear in the corpus : testified term won't be loaded
246							# return;
247							# }
248
249							# }
250							# else
251							# {
252							# # default match: look at IF
253							# if(!exists $filtering_lexicon_h->{lc($IF[$i])})
254							# {
255							# # current word does not appear in the corpus : testified term won't be loaded
256							# return;
257							# }
258							# }
259
260							# }
261							# }
262							# $$testified_infos_r->{"PARSE"} = $infos[0];
263							# $$testified_infos_r->{"SOURCE"} = $infos[4];
264							# for ($i=0; $i < scalar @IF; $i++)
265							# {
266							# $word = $IF[$i] . "\t" . $POS[$i] . "\t" . $LF[$i];
267							# $item = $this->getLexicon->addOccurrence($word);
268							# if ($tag_set->existTag('CANDIDATES',$item->getPOS))
269							# {
270							# $$testified_infos_r->{"NUM_CONTENT_WORDS"}++;
271							# }
272							# push @{$$testified_infos_r->{"WORDS"}}, $item;
273							# }
274
275							# return 1;
276							# }
277
278
279							sub buildTestifiedTerm
280							{
281	1			1	1	4	my ($this,$block,$sentence_boundary,$match_type,$filtering_lexicon_h,$source,$tag_set) = @_;
282	1					3	my $word;
283							my $testified;
284	1					0	my $item;
285	1					4	my @words = split /\n/,$block;
286	1					2	my @clean_words;
287	1					2	my $num_content_words = 0;
288	1					2	my @lex_items;
289	1					2	foreach $word (@words)
290							{
291	3	100	66			27	if (
292							($word =~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/)
293							&&
294							($2 ne $sentence_boundary)
295							)
296							{
297
298	2	50				5	if($match_type eq "loose") # look at IF or LF
299							{
300	2	0	33			7	if(
301							(!exists $filtering_lexicon_h->{lc($1)})
302							&&
303							(!exists $filtering_lexicon_h->{lc($3)})
304							)
305							{
306							# current word does not appear in the corpus : testified term won't be loaded
307	0					0	return;
308							}
309							}
310							else
311							{
312	0	0				0	if($match_type eq "strict") # look at IF and POS
313							{
314	0	0				0	if
315							(!exists $filtering_lexicon_h->{lc($1)."~".$2})
316							{
317							# current word does not appear in the corpus : testified term won't be loaded
318	0					0	return;
319							}
320
321							}
322							else
323							{
324							# default match: look at IF
325	0	0				0	if
326							(!exists $filtering_lexicon_h->{lc($1)})
327							{
328							# current word does not appear in the corpus : testified term won't be loaded
329	0					0	return;
330							}
331							}
332
333							}
334	2					5	push @clean_words, $word;
335							}
336							}
337	1					2	foreach $word (@clean_words)
338							{
339
340	2					6	$item = $this->getLexicon->addOccurrence($word);
341	2	50				5	if ($tag_set->existTag('CANDIDATES',$item->getPOS))
342							{
343	2					8	$num_content_words++;
344							}
345	2					6	push @lex_items, $item;
346							}
347	1	50				3	if(scalar @lex_items > 1)
348							{
349	1					7	$testified = Lingua::YaTeA::MultiWordTestifiedTerm->new($num_content_words,\@lex_items,$tag_set,$source,$match_type);
350							}
351							else
352							{
353	0	0				0	if(scalar @lex_items == 1)
354							{
355	0					0	$testified = Lingua::YaTeA::MonolexicalTestifiedTerm->new($num_content_words,\@lex_items,$tag_set,$source,$match_type);
356							}
357							}
358	1	50	33			11	if ((blessed($testified)) && ($testified->isa('Lingua::YaTeA::TestifiedTerm')))
359							{
360	1					3	$this->addTestified($testified);
361							}
362							}
363
364							sub addTestified
365							{
366	1			1	1	3	my ($this,$testified) = @_;
367	1					5	my $key = $testified->buildKey;
368	1	50				3	if(!exists $this->getTestifiedTerms->{$key})
369							{
370	1					2	$Lingua::YaTeA::TestifiedTerm::id++;
371	1					2	$this->getTestifiedTerms->{$key} = $testified;
372							}
373							else
374							{
375	0					0	push @{$this->getTestifiedTerms->{$key}->getSource}, @{$testified->getSource};
	0					0
	0					0
376							}
377							}
378
379
380							sub getLexicon
381							{
382	2			2	1	5	my ($this) = @_;
383	2					7	return $this->{LEXICON};
384							}
385
386							sub getTestifiedTerms
387							{
388	37			37	1	57	my ($this) = @_;
389	37					168	return $this->{TESTIFIED_TERMS};
390							}
391
392							sub size
393							{
394	20			20	1	32	my ($this) = @_;
395	20					30	return scalar (keys %{$this->getTestifiedTerms});
	20					41
396							}
397
398
399							sub changeKeyToID
400							{
401	1			1	1	2	my ($this) = @_;
402	1					2	my $original_h = $this->getTestifiedTerms;
403	1					2	my %new;
404							my $testified;
405
406	1					4	foreach $testified (values %$original_h)
407							{
408	1					6	$new{$testified->getID} = $testified;
409							}
410	1					2	%{$this->getTestifiedTerms} = %new;
	1					3
411							}
412
413							1;
414
415							__END__