File Coverage

blib/lib/Lingua/YaTeA/Corpus.pm

Criterion	Covered	Total	%
statement	405	766	52.8
branch	100	236	42.3
condition	45	117	38.4
subroutine	52	73	71.2
pod	49	56	87.5
total	651	1248	52.1

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::YaTeA::Corpus;
2	3			3		71078	use strict;
	3					17
	3					95
3	3			3		17	use warnings;
	3					8
	3					88
4	3			3		706	use Data::Dumper;
	3					7020
	3					159
5	3			3		602	use UNIVERSAL;
	3					22
	3					15
6	3			3		121	use Scalar::Util qw(blessed);
	3					6
	3					150
7	3			3		28	use File::Path;
	3					7
	3					175
8	3			3		1515	use POSIX qw(log10);
	3					19114
	3					17
9
10	3			3		5607	use Lingua::YaTeA::ForbiddenStructureMark;
	3					9
	3					35
11	3			3		1292	use Lingua::YaTeA::TestifiedTermMark;
	3					11
	3					78
12	3			3		1338	use Lingua::YaTeA::Sentence;
	3					8
	3					31
13	3			3		1272	use Lingua::YaTeA::Lexicon;
	3					9
	3					30
14	3			3		1267	use Lingua::YaTeA::DocumentSet;
	3					7
	3					37
15	3			3		1304	use Lingua::YaTeA::SentenceSet;
	3					10
	3					1318
16	3			3		255	use Lingua::YaTeA::WordFromCorpus;
	3					6
	3					1238
17	3			3		486	use Lingua::YaTeA::XMLEntities;
	3					6
	3					22
18
19	3			3		673	use Encode qw(:fallbacks);;
	3					9880
	3					25929
20
21
22							our $VERSION=$Lingua::YaTeA::VERSION;
23
24							our $forbidden_counter = 0;
25							our $tt_counter = 0;
26							our $split_counter = 0;
27
28
29							sub new
30							{
31	3			3	1	13	my ($class,$path,$option_set,$message_set) = @_;
32	3					11	my $this = {};
33	3					9	bless ($this,$class);
34	3					21	$this->{PATH} = $path;
35	3					9	$this->{NAME} = ();
36	3					29	$this->{LEXICON} = Lingua::YaTeA::Lexicon->new;
37	3					27	$this->{DOCUMENTS} = Lingua::YaTeA::DocumentSet->new;
38	3					64	$this->{SENTENCES} = Lingua::YaTeA::SentenceSet->new;
39	3					31	$this->{WORDS} = [];
40	3					8	$this->{OUTPUTS} = ();
41	3					16	$this->setName;
42
43
44	3					16	$this->setOutputFiles($option_set,$message_set);
45	3					13	return $this;
46							}
47
48							sub preLoadLexicon
49							{
50	1			1	1	6	my ($this,$sentence_boundary,$document_boundary,$match_type) = @_;
51	1					5	my $fh = $this->getFileHandle;
52	1					3	my $word;
53							my %lexicon;
54	1					7	while (! $fh->eof)
55							{
56	299					6237	$word = $fh->getline;
57	299	100	100			7700	if(
			66
58							($word=~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/)
59							&&
60							($2 ne $sentence_boundary)
61							&&
62							($2 ne $document_boundary)
63							)
64							{
65	288	50				538	if($match_type ne "strict")
66							{
67	288					702	$lexicon{lc($1)}++; # record IF
68	288	50				491	if($match_type eq "loose")
69							{
70	288					843	$lexicon{lc($3)}++; # record LF
71							}
72							}
73							else
74							{
75	0					0	$lexicon{lc($1)."~".$2}++; # record IF + POS
76							}
77							}
78							}
79	1					32	return \%lexicon;
80							}
81
82
83							sub _normalizeInputCorpusLine {
84	20			20		1681	my ($this, $block, $language) = @_;
85
86	20					57	my $line;
87							my @elems;
88	20					0	my @elems_out;
89	20					34	my $new_block = "";
90
91	20					37	my @septags;
92
93	20	50	33			101	if ((defined $language) && ($language eq "FR-Flemm")) {
94	0					0	foreach $line (split /\n/, $block) {
95	0					0	$line =~ s/
96	0					0	$line =~ s/>/SUP/go;
97	0					0	$line =~ s/\t:\t/\tCOLUMN\t/go;
98
99	0					0	@elems = split /\t/, $line;
100
101							# warn "-> $line" . scalar(@elems) . "\n";
102
103
104	0	0				0	if (scalar(@elems) > 3) {
105							# ambiguity in the pos tagging
106	0					0	my @tmp = split /\s\\|\\|\s/, $elems[2];
107	0					0	$elems[2] = shift @tmp;
108	0					0	$#elems = 2;
109							}
110	0	0				0	if (scalar(@elems) == 3) {
111	0					0	@septags = split /:/, $elems[1];
112	0					0	my $tag;
113	0	0				0	if (scalar(@septags) == 2) {
		0
114	0					0	$tag = $septags[1];
115							} elsif (scalar(@septags) == 3) {
116	0					0	$tag = $septags[2];
117							} else {
118	0					0	$tag = $septags[0];
119							}
120							# if the tag is PUN(cit)
121	0	0				0	if ($tag eq "PUN(CIT)") {
122	0					0	$tag = "PUN";
123							}
124							# if the tag is Sp+Da, it is transformed as SpDa
125	0					0	$tag =~ s/\+D/D/;
126							# if the word is 'une', the postag is corrected
127	0	0				0	if ($elems[0] eq "une") {
128	0					0	$tag = "Da3fs---";
129							}
130							# if it's a present participle, the postag is then Vmpp-----
131	0	0				0	if ($tag =~ /Vmpp/) {
132	0					0	$tag = "Vmpp-----";
133							}
134							# if the word is 'l', the postag is corrected
135	0	0				0	if (lc($elems[0]) eq "l") {
136	0					0	$tag = "Da3-s---";
137							}
138	0					0	$elems[1] = $tag;
139							}
140	0	0				0	if (scalar(@elems) == 3) {
141	0					0	$new_block .= join("\t", @elems) . "\n";
142							}
143							}
144	0					0	return($new_block);
145							} else {
146							# warn "Language is $language, so nothing to do\n";
147
148							# foreach $line (split /\n/, $block) {
149							# $line =~ s/
150							# $line =~ s/>/SUP/go;
151							# $line =~ s/\t:\t/\tCOLUMN\t/go;
152							# $line =~ s/\t\t+/\t/go;
153							# if($line !~ /^[^\t]\t[^\t]+\t[^\t]$/o){
154							# # warn "***********************************\n";
155							# # warn "Start correction of the line: $line\n";
156							# @elems = split /\t/, $line;
157							# if (scalar(@elems) > 3) {
158							# # ambiguity in the pos tagging
159							# # my @tmp = split /\s\\|\\|\s/, $elems[2];
160							# # $elems[2] = shift @tmp;
161							# $#elems = 2;
162							# } else {
163							# if (defined $elems[0]) {
164							# $elems[2] = $elems[0];
165							# if (!defined $elems[1]) {
166							# # $elems[1] = 'SYM';
167							# $elems[1] = $elems[0];
168							# }
169							# } else {
170							# @elems =();
171							# }
172
173							# }
174							# if (scalar(@elems) == 3) {
175							# $new_block .= join("\t", @elems) . "\n";
176							# }
177							# } else {
178							# $new_block .= $line . "\n";
179							# }
180							# }
181							# $new_block .= join("\t", @elems) . "\n";
182							# return($new_block);
183	20					68	return($block);
184							}
185							}
186
187
188							sub read
189							{
190	2			2	1	9	my ($this,$sentence_boundary,$document_boundary,$FS_set,$testified_set,$match_type,$message_set,$display_language, $language,$debug_fh) = @_;
191	2					6	my $num_line = 0;
192	2					12	my $fh = $this->getFileHandle;
193	2					7	my $block;
194
195							# local $/ = "\.\t". $sentence_boundary ."\t\.\n";
196	2					11	$this->getSentenceSet->addSentence($this->getDocumentSet);
197	2					10	while (! $fh->eof)
198							{
199	20					257	$block = $this->_normalizeInputCorpusLine(Encode::decode("UTF-8", $this->readSentence($fh,$sentence_boundary)), $language);
200	20					85	$this->WrapBlock(\$block);
201	20					77	$this->MarkForbiddenStructures(\$block,$FS_set);
202	20					99	$this->MarkTestifiedTerms(\$block,$testified_set,$match_type,$debug_fh);
203	20					88	$this->UnwrapBlock(\$block);
204
205	20					72	$this->recordWords($block,$sentence_boundary,$document_boundary,\$num_line,$message_set,$display_language);
206							}
207							}
208
209							sub readSentence {
210	20			20	0	65	my ($this, $fh, $sentence_boundary) = @_;
211
212							# warn "in readsentence\n";
213	20					44	my $line;
214							my $sentence;
215	20	50				62	if (! $fh->eof) {
216	20		100			116	do {
217	598					14408	$line = $fh->getline;
218							# warn "line: $line;\n";
219	598					13600	$sentence .= $this->correctInputLine($line);
220							# $sentence .= $line;
221							} while ((!$fh->eof) && (index($line, "\t$sentence_boundary\t") == -1));
222							# warn "sentence: $sentence\n";
223							}
224	20					268	return($sentence);
225							}
226
227							sub correctInputLine {
228	598			598	0	1200	my ($line, $this) = reverse(@_);
229
230	598					785	my @elems;
231	598					825	my $tail = "";
232	598	50				1976	if ($line =~ /(\n+)$/) {
233	598					1300	$tail = $1;
234							}
235	598					1044	chomp $line;
236
237	598					945	$line =~ s/
238	598					880	$line =~ s/>/SUP/go;
239	598					864	$line =~ s/\t:\t/\tCOLUMN\t/go;
240	598					1194	$line =~ s/\t\t+/\t/go;
241	598	100				1826	if($line !~ /^[^\t]\t[^\t]+\t[^\t]$/o){
242							# warn "***********************************\n";
243							# my $line2 = $line;
244							# $line2 =~ s/\t/\\t/go;
245							# warn "Start correction of the line: " . $line2 . "\n";
246	4					21	@elems = split /\t/, $line;
247	4	50				14	if (scalar(@elems) > 3) {
248							# ambiguity in the pos tagging
249							# my @tmp = split /\s\\|\\|\s/, $elems[2];
250							# $elems[2] = shift @tmp;
251	0					0	$#elems = 2;
252							} else {
253	4	50	33			37	if ((defined $elems[0]) && (length($elems[0])>0)) {
254	0					0	$elems[2] = $elems[0];
255	0	0				0	if (!defined $elems[1]) {
256							# $elems[1] = 'SYM';
257	0					0	$elems[1] = $elems[0];
258							}
259							} else {
260	4					9	@elems =();
261							}
262
263							}
264	4	50				10	if (scalar(@elems) == 3) {
265							# warn "Corrected line: " . join('\t', @elems). "\n";
266							# warn "***********************************\n";
267	0					0	return(join("\t", @elems) . $tail);
268							} else {
269							# warn "Removing line\n";
270							# warn "***********************************\n";
271	4					17	return("");
272							}
273							} else {
274	594					2537	return($line . $tail);
275							}
276
277	0					0	return($line . $tail);
278							}
279
280							sub recordWords
281							{
282	20			20	1	72	my ($this,$block,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language) = @_;
283	20					33	my $word;
284	20					239	my @words = split /\n/,$block;
285
286	20					52	foreach $word (@words)
287							{ # record each word of the sentence
288	612					948	$$num_line++;
289	612	50				2005	if ($word !~ /^\s*$/)
290							{
291	612					1147	$this->addWordFromCorpus($word,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language);
292							}
293							}
294							}
295
296							sub addWordFromCorpus
297							{
298	612			612	1	1301	my ($this,$form,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language) = @_;
299	612					885	my $word;
300	612					1116	chomp $form;
301
302	612	100				2152	if($form =~ /^[^\t]\t[^\t]+\t[^\t]$/o){
303	594					1260	$word = Lingua::YaTeA::WordFromCorpus->new($form,$this->getLexicon,$this->getSentenceSet);
304							}
305							else{
306	18	100				93	if($form =~ /\<\/?FORBIDDEN/)
307							{
308	12					60	$word = Lingua::YaTeA::ForbiddenStructureMark->new($form);
309							}
310							else
311							{
312	6	50				32	if($form =~ /\<\/?FRONTIER/)
313							{
314	6					26	$word = Lingua::YaTeA::TestifiedTermMark->new($form);
315							}
316							else
317							{
318	0					0	warn $message_set->getMessage('INVALID_TOKEN')->getContent($display_language) . $$num_line . $message_set->getMessage('IN_FILE')->getContent($display_language) . $this->getPath . " ($form)";
319	0					0	die "\n";
320							}
321							}
322							}
323	612					1015	push @{$this->{WORDS}}, $word;
	612					1253
324	612					1493	$this->incrementCounters($word,$sentence_boundary,$document_boundary);
325	612					1564	return $word;
326							}
327
328							sub incrementCounters
329							{
330	612			612	1	1126	my ($this,$word,$sentence_boundary,$document_boundary) = @_;
331
332	612	100	66			3402	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
333							{
334	594					864	$Lingua::YaTeA::WordFromCorpus::counter++;
335	594	100				1306	if ($word->isSentenceBoundary($sentence_boundary))
336							{
337	18					37	$Lingua::YaTeA::Sentence::counter++;
338	18					27	$Lingua::YaTeA::Sentence::in_doc_counter++;
339	18					65	Lingua::YaTeA::Sentence::resetStartChar;
340	18					44	$this->getSentenceSet->addSentence($this->getDocumentSet);
341							}
342							else{
343	576	50				1216	if ($word->isDocumentBoundary($document_boundary))
344							{
345	0					0	$this->getDocumentSet->addDocument($word);
346	0					0	Lingua::YaTeA::Sentence->resetStartChar;
347	0					0	$this->getSentenceSet->addSentence($this->getDocumentSet);
348	0					0	Lingua::YaTeA::Sentence->resetInDocCounter;
349	0					0	$word->updateSentence($this->getSentenceSet);
350	0					0	$word->updateStartChar;
351
352	0					0	$Lingua::YaTeA::Sentence::counter++;
353	0					0	$Lingua::YaTeA::Sentence::in_doc_counter++;
354	0					0	$this->getSentenceSet->addSentence($this->getDocumentSet);
355
356							}
357							else{
358	576					1379	Lingua::YaTeA::Sentence->updateStartChar($word);
359							}
360							}
361							}
362
363							}
364
365							sub print
366							{
367	0			0	1	0	my ($this,$sentence_boundary,$document_boundary) = @_;
368	0					0	my $word;
369	0					0	foreach $word (@{$this->{WORDS}} )
	0					0
370							{
371	0	0	0			0	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::WordFromCorpus")))
372							{
373	0	0				0	if ($word->isSentenceBoundary($sentence_boundary))
374							{
375	0					0	print $word->getLexItem->getIF . "\n";
376							}
377							else
378							{
379	0	0				0	if($word->isDocumentBoundary($document_boundary))
380							{
381	0					0	print "\n" . $word->getLexItem->getIF . "\n";
382							}
383							else
384							{
385	0					0	print $word->getLexItem->getIF . " ";
386							}
387							}
388							}
389							else{
390	0					0	print $word->getForm . "\n";
391							}
392							}
393							}
394
395
396							sub selectTestifiedTerms {
397	20			20	1	46	my ($this,$block_r,$testified_set,$match_type) = @_;
398	20					281	my @block_lines = split ("\n", $$block_r);
399	20					83	my %block_lexicon;
400							my $word;
401	20					0	my $testified;
402	20					0	my %block_testified_set;
403
404	20	100	66			120	if((defined $testified_set) && ($testified_set->size > 0)) {
405	10					23	foreach $word (@block_lines) {
406	313	100				965	if ($word=~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/) {
407	297	50				479	if($match_type ne "strict") {
408	297					819	$block_lexicon{lc($1)}++; # record IF
409	297	50				527	if($match_type eq "loose") {
410	297					632	$block_lexicon{lc($3)}++; # record LF
411							}
412							}
413							else {
414	0					0	$block_lexicon{lc($1)."~".$2}++; # record IF + POS
415							}
416							}
417							}
418	10					17	foreach $testified (values %{$testified_set->getTestifiedTerms}) {
	10					26
419	10	100				55	if($testified->isInLexicon(\%block_lexicon,$match_type) == 1) {
420	3					13	$block_testified_set{$testified->getID} = $testified;
421							}
422							}
423							}
424	20					130	return \%block_testified_set;
425							}
426
427
428
429
430							sub MarkTestifiedTerms
431							{
432	20			20	1	74	my ($this,$block_r,$testified_set,$match_type,$debug_fh) = @_;
433	20					40	my $testified;
434							my $reg_exp;
435	20					37	my $id = 0;
436							# print $debug_fh $$block_r . "\n";
437	20					62	my $selected_TTs_h = $this->selectTestifiedTerms($block_r,$testified_set,$match_type);
438
439	20	50				52	if (defined $selected_TTs_h)
440							{
441	20					84	foreach $testified (values %$selected_TTs_h)
442							{
443							# print $debug_fh $testified->getIF . "\n";
444	3					13	$reg_exp = $testified->getRegExp;
445	3					483	$$block_r =~ s/($reg_exp)/$this->createAnnotation($1,\$id,$testified)/gei;
	3					16
446	3					43	$$block_r =~ s/\n\n/\n/g;
447							}
448							}
449							}
450
451							sub createAnnotation
452							{
453	3			3	1	15	my ($this,$match,$id_r,$testified) = @_;
454	3					7	my $type;
455
456	3					17	my $annotation = "\n\\n" . $match . "\n<\/FRONTIER ID=" . $$id_r . " TT=" . $testified->getID ."\>\n";
457	3					8	$$id_r++;
458
459	3					202	return $annotation;
460							}
461
462
463							sub MarkForbiddenStructures
464							{
465	20			20	1	46	my ($this,$block_r,$FS_set) = @_;
466	20					77	my $FS_any_a = $FS_set->getSubset("ANY");
467	20					36	my $FS;
468	20					29	my $ID = 0;
469	20					51	my $reg_exp;
470							my $action;
471	20					0	my $split_after;
472
473	20					52	foreach $FS (@$FS_any_a)
474							{
475	860					2346	$action = $FS->getAction;
476	860					1808	$reg_exp = $FS->getRegExp;
477	860	100				1773	if ($action eq "delete"){
478	580					49838	$$block_r =~ s/($reg_exp)/\n\$1\<\/FORBIDDEN ID=$ID ACTION=$action\>\n/ig;
479							}
480							else{
481	280	50				542	if ($action eq "split"){
482	280					571	$split_after = $FS->getSplitAfter;
483	280					33630	$$block_r =~ s/($reg_exp)/\n\$1\<\/FORBIDDEN ID=$ID ACTION=$action SPLIT\_AFTER=$split_after\>\n/ig;
484							}
485							}
486	860					2920	$ID++;
487							}
488							}
489
490							sub WrapBlock
491							{
492	20			20	1	46	my ($this,$block_r) = @_;
493	20					104	$$block_r =~ s/\r//g;
494	20					148	$$block_r =~ s/^/\n/;
495	20					174	$$block_r =~ s/$/\n/;
496							}
497
498							sub UnwrapBlock
499							{
500	20			20	1	52	my ($this,$block_r) = @_;
501	20					185	$$block_r =~ s/^\n//;
502	20					175	$$block_r =~ s/\n$//;
503							}
504
505
506
507
508
509							sub chunk
510							{
511	2			2	1	10	my ($this,$phrase_set,$sentence_boundary,$document_boundary,$chunking_data,$FS_set,$tag_set,$parsing_pattern_set,$testified_term_set,$option_set,$fh) = @_;
512	2					206	my $word;
513							my $i;
514	2					0	my @words;
515	2					0	my $action;
516	2					9	my $split_after = -1;
517	2					26	my $valid;
518							my $num_content_words;
519	2					0	my @clean_corpus;
520	2					0	my $term_frontiers_h;
521	2					11	my $compulsory = $option_set->getCompulsory;
522	2					10	my $max_length = $option_set->getMaxLength;
523
524	2					95	print STDERR "MAX_LENGTH: " . $max_length . "\n";
525	2					20	for ($i = 0; $i <= $this->size; $i++){
526	614					1189	$word = $this->getWord($i);
527	614	100	66			1922	if ((defined $fh) && (defined $word))
528							{
529	612					1533	$word->print($fh);
530							}
531							# if (defined $word) {
532							# print STDERR "> ($word)";
533							# $word->print(\*STDERR);
534							# }
535	614	100	100			1344	if(
536							($i == $this->size) # last word of the corpus
537							\|\|
538							($word->isChunkEnd(\$action,\$split_after,$sentence_boundary,$document_boundary,$chunking_data) == 1)
539							)
540							{
541	248					771	($valid,$num_content_words,$term_frontiers_h) = $this->cleanChunk(\@words,$chunking_data,$FS_set,$option_set->getCompulsory,$tag_set,$fh);
542							# print STDERR "====$valid\n";
543							# foreach my $w (@words) {
544							# $w->print(\*STDERR);
545							# }
546							# print STDERR "====\n";
547
548	248	100				599	if ($valid == 1)
549							{
550	122					304	$phrase_set->recordOccurrence(\@words,$num_content_words,$tag_set,$parsing_pattern_set,$option_set,$term_frontiers_h,$testified_term_set,$this->getLexicon,$this->getSentenceSet,$fh);
551	122					248	$Lingua::YaTeA::Corpus::tt_counter = 0;
552							}
553
554	248					450	@words = ();
555
556							}
557							else{
558	366					641	push @words, $word;
559							}
560
561
562							# warn "ref= " . ref($word) . "\n";
563	614	100	66			3755	if((defined $word) && ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus'))))
			100
564							{
565	594					1750	push @clean_corpus,$word;
566							}
567							}
568
569	2					127	$this->{WORDS} = \@clean_corpus;
570
571							}
572
573
574							sub cleanChunk
575							{
576	248			248	1	555	my ($this,$words_a,$chunking_data,$FS_set,$compulsory,$tag_set,$fh) = @_;
577	248					371	my $num_content_words;
578							my $term_frontiers_h;
579
580	248	100				559	if ($this->pruneFromStart($words_a,$chunking_data,$FS_set,$fh) == 1)
581							{
582
583	124	100				349	if($this->pruneFromEnd($words_a,$chunking_data,$FS_set,$fh) == 1)
584							{
585	122	50				287	if($this->checkCompulsory($words_a,$compulsory,$fh) == 1)
586							{
587	122					306	($num_content_words,$term_frontiers_h) = $this->deleteAnnotationMarks($words_a,$tag_set,$fh);
588	122					430	return (1,$num_content_words,$term_frontiers_h);
589							}
590	0					0	return (0,0);
591							}
592	2					11	return (0,0); # no words left
593							}
594	124					337	return (0,0); # no words left
595							}
596
597
598							sub deleteAnnotationMarks
599							{
600	122			122	1	239	my ($class,$words_a,$tag_set,$fh) = @_;
601	122					196	my $word;
602							my @tmp;
603	122					189	my $content_words = 0;
604	122					161	my $index = 0;
605	122					200	my %term_frontiers;
606							my $frontier;
607
608	122					231	foreach $word (@$words_a){
609	292	100	66			1240	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::WordFromCorpus")))
610							{
611	286	100				612	if ($tag_set->existTag('CANDIDATES',$word->getPOS))
612							{
613	250					365	$content_words++;
614							}
615	286					512	push @tmp, $word;
616	286					515	$index++;
617							}
618							else
619							{
620	6	50	33			32	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::TestifiedTermMark")))
621							{
622	6	100				18	if($word->isOpener)
623							{
624	3					14	$term_frontiers{$word->getID} = $word;
625	3					11	$word->{START} = $index; # should use setStart
626							}
627							else
628							{
629	3	50				9	if($word->isCloser)
630							{
631	3					10	$frontier = $term_frontiers{$word->getID};
632	3					9	$frontier->{END} = $index; # should use setEnd
633							}
634							}
635							}
636							}
637							}
638	122					365	@$words_a = @tmp;
639
640	122					422	return ($content_words,\%term_frontiers);
641							}
642
643
644
645
646							sub pruneFromStart
647							{
648	248			248	1	455	my ($this,$words_a,$chunking_data,$FS_set,$fh) = @_;
649	248					362	my $i =0;
650	248					353	my $word;
651							my $potential_FS_a;
652	248					326	my $inside_testified = 0;
653	248					382	my %testified_frontiers;
654
655
656	248					585	while ($i < scalar @$words_a)
657							{
658	188					301	$word = $words_a->[$i];
659
660	188	100	66			1147	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
661							{
662	2					9	return 1;
663							}
664							else
665							{
666	186	100	66			816	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
667							{
668
669	182	100				459	if ($word->isCleaningFrontier($chunking_data))
670							{
671	122	100				399	if(
672							# ($inside_testified == 0)
673							# &&
674							# ($word->isa('Lingua::YaTeA::WordFromCorpus'))
675							# &&
676							($potential_FS_a = $word->isStartTrigger($FS_set->getTriggerSet("START")))
677							)
678							{
679	25	50				71	if(!$this->expandStartTriggers($potential_FS_a,$words_a,$fh))
680							{
681	25					47	last;
682							}
683							}
684							else
685							{
686	97					181	last;
687							}
688							}
689
690
691							}
692	64					177	shift @$words_a; # delete element
693							}
694							}
695	246	100				578	if(scalar @$words_a > 0)
696							{
697	122					369	return 1;
698							}
699	124					347	return 0;
700							}
701
702							sub pruneFromEnd
703							{
704	124			124	1	360	my ($this,$words_a,$chunking_data,$FS_set,$fh) = @_;
705	124					225	my $i = $#$words_a;
706	124					203	my $word;
707							my $potential_FS_a;
708	124					173	my $inside_testified = 0;
709	124					159	my %testified_frontiers;
710	124					174	my $deleted = 0;
711
712	124					292	while ($i >= 0)
713							{
714	132					213	$word = $words_a->[$i];
715
716	132	50	33			894	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
717							{
718	0					0	return 1;
719							}
720							else
721							{
722	132	50	33			635	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
723							{
724	132	100				331	if ($word->isCleaningFrontier($chunking_data))
725
726							{
727	126	100	33			923	if(
			33
			66
728							($inside_testified == 0)
729							&&
730							((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
731							&&
732							($potential_FS_a = $word->isEndTrigger($FS_set->getTriggerSet("END")))
733							)
734							{
735	4	50				21	if(!$this->expandEndTriggers($potential_FS_a,$words_a,$fh))
736							{
737	0					0	last;
738							}
739							else
740							{
741	4					9	$deleted = 1;
742	4	100				15	if(scalar @$words_a == 0)
743							{
744	2					11	return 0;
745							}
746							else
747							{
748	2					12	$i = $#$words_a;
749							}
750							}
751							}
752							else
753							{
754	122					249	last;
755							}
756							}
757							}
758	8	50	33			81	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::ForbiddenStructureMark')))
759							{
760	0					0	my $del = pop @$words_a; # delete element
761	0					0	$i--;
762							}
763							else
764							{
765	8	100				36	if($deleted == 0)
766							{
767	6					14	my $del = pop @$words_a; # delete element
768	6					16	$i--;
769							}
770							}
771	8					24	$deleted = 0;
772
773
774							}
775							}
776	122	50				324	if(scalar @$words_a > 0)
777							{
778	122					351	return 1;
779							}
780	0					0	return 0;
781							}
782
783
784
785
786							sub checkCompulsory
787							{
788	122			122	1	260	my ($this,$words_a,$compulsory,$fh) = @_;
789	122					159	my $word;
790	122					223	foreach $word (@$words_a)
791							{
792	147	50	33			843	if (!((blessed($word)) && ($word->isa('Lingua::YaTeA::ForbiddenStructureMark'))))
793							{
794
795	147	100	66			843	if (
			100
796							((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
797							\|\|
798							($word->isCompulsory($compulsory))
799							)
800							{
801	122					381	return 1;
802							}
803							}
804							}
805	0					0	return 0;
806							}
807
808
809
810							sub getWord
811							{
812	614			614	1	1007	my ($this,$i) = @_;
813	614					1362	return $this->{WORDS}->[$i];
814							}
815
816							sub size
817							{
818	1230			1230	1	1981	my ($this) = @_;
819	1230					1532	return scalar @{$this->{WORDS}};
	1230					3848
820							}
821
822
823							sub expandStartTriggers
824							{
825	25			25	1	73	my ($this,$potential_FS_a,$words_a,$fh) = @_;
826	25					133	my $FS;
827							my $i;
828	25					0	my $j;
829	25					0	my $to_find;
830	25					0	my $to_delete;
831
832	25					57	foreach $FS (@$potential_FS_a)
833							{
834
835	50	100				151	if($FS->getLength <= scalar @$words_a)
836							{
837	46					136	$to_delete = $FS->apply($words_a);
838	46	50				129	if(defined $to_delete)
839							{
840	0					0	last;
841							}
842
843							}
844							}
845	25	50				66	if(defined $to_delete)
846							{
847	0					0	while($to_delete != 1)
848							{
849	0	0	0			0	if ((blessed($words_a->[0])) && ($words_a->[0]->isa('Lingua::YaTeA::TestifiedTermMark')))
850							{
851	0					0	return 1;
852							}
853							else
854							{
855	0					0	my $del = shift @$words_a;
856	0					0	$to_delete--;
857							}
858							}
859	0					0	return 1;
860							}
861	25					68	return 0;
862							}
863
864
865							sub expandEndTriggers
866							{
867	4			4	1	15	my ($this,$potential_FS_a,$words_a,$fh) = @_;
868	4					26	my $FS;
869							my $i;
870	4					0	my $j;
871	4					0	my $to_find;
872	4					0	my $to_delete;
873
874	4					12	foreach $FS (@$potential_FS_a)
875							{
876
877	4	50				16	if($FS->getLength <= scalar @$words_a)
878							{
879	4					16	$to_delete = $FS->apply($words_a);
880	4	50				14	if(defined $to_delete)
881							{
882	4					9	last;
883							}
884							}
885							}
886	4	50				14	if(defined $to_delete)
887							{
888	4					13	while($to_delete != 0)
889							{
890	4	50	33			58	if ((blessed($words_a->[$#$words_a])) && ($words_a->[$#$words_a]->isa('Lingua::YaTeA::TestifiedTermMark')))
891							{
892	0					0	return 1;
893							}
894							else
895							{
896	4					13	my $w = pop @$words_a;
897	4					13	$to_delete--;
898							}
899							}
900	4					17	return 1;
901							}
902	0					0	return 0;
903							}
904
905							sub getSentenceSet
906							{
907	738			738	1	1410	my ($this) = @_;
908	738					2214	return $this->{SENTENCES};
909							}
910
911							sub getDocumentSet
912							{
913	22			22	1	45	my ($this) = @_;
914	22					89	return $this->{DOCUMENTS};
915							}
916
917							sub getFileHandle
918							{
919	3			3	1	9	my ($this) = @_;
920	3					16	my $path = $this->getPath;
921							# print STDERR "corpus :" . $path . "\n";
922	3					22	my $fh = FileHandle->new("<$path");
923							# binmode($fh, ":utf8");
924	3					236	return $fh;
925							}
926
927							sub getPath
928							{
929	6			6	1	14	my ($this) = @_;
930	6					46	return $this->{PATH};
931							}
932
933							sub getName
934							{
935	6			6	1	16	my ($this) = @_;
936	6					44	return $this->{NAME};
937							}
938
939							sub getOutputFileSet
940							{
941	16			16	1	50	my ($this) = @_;
942	16					101	return $this->{OUTPUT};
943							}
944
945							sub getLexicon
946							{
947	718			718	1	1224	my ($this) = @_;
948	718					1722	return $this->{LEXICON};
949							}
950
951							# the name of the file is what appears after the last "/" and before the last "." if any
952							sub setName
953							{
954	3			3	1	9	my ($this) = @_;
955
956	3	50				14	if($this->getPath =~ /\/?([^\/]+)\.[^\.]+$/)
957							{
958	3					12	$this->{NAME} = $1;
959							}
960							else
961							{
962	0					0	$this->getPath =~ /\/?([^\/]+)$/;
963	0					0	$this->{NAME} = $1;
964							}
965							}
966
967							sub setOutputFiles
968							{
969	3			3	1	9	my ($this,$option_set,$message_set) = @_;
970	3					23	my $sub_dir;
971							my $option;
972	3					0	my $file;
973	3					0	my @files;
974	3					0	my $file_info;
975	3					0	my $output_path;
976	3					4	my $no_output_defined = 1;
977	3					47	my %match_to_option= (
978							'xmlout'=>'xml:candidates.xml',
979							'TT-for-BioLG'=>'xml:TTforBioLG.xml',
980							'TC-for-BioLG'=>'xml:TCforBioLG.xml',
981							'termList'=>'raw:termList.txt',
982							'termAndHeadList'=>'raw:termAndHeadList.txt',
983							'printChunking'=>'html:candidatesAndUnparsedInCorpus.html',
984							'debug'=>'raw:debug,unparsable,unparsed',
985							'TTG-style-term-candidates' => 'raw:termCandidates.ttg',
986							'XML-corpus-for-BioLG' => 'xml:corpusForBioLG.xml',
987							'bootstrap' => 'raw:parsedTerms.txt',
988							'XML-corpus-raw' => 'xml:corpusRaw.xml',
989							);
990
991	3					22	$output_path = $option_set->getOutputPath ."/". $this->getName . "/" . $option_set->getSuffix;
992
993	3	100				92	if(-d $output_path)
994							{
995	1					7	print STDERR $message_set->getMessage('OVER_WRITE_REP')->getContent($option_set->getDisplayLanguage) . $output_path . "/\n";
996	1					858	rmtree $output_path;
997							}
998							# else
999							# {
1000	3					809	mkpath $output_path;
1001	3					30	print STDERR $message_set->getMessage('CREATE_REP')->getContent($option_set->getDisplayLanguage) . $output_path . "/\n";
1002							# }
1003	3					24	$this->{OUTPUT} = Lingua::YaTeA::FileSet->new($this->getName);
1004
1005	3					24	while (($option,$file_info) = each (%match_to_option))
1006							{
1007	33	100				100	if($option_set->optionExists($option))
1008							{
1009	24					65	$this->setFilesForOption($file_info,$output_path);
1010	24					88	$no_output_defined = 0;
1011							}
1012							}
1013	3	50				21	if($no_output_defined == 1)
1014							{
1015	0					0	$this->setFilesForOption($match_to_option{'xmlout'},$output_path);
1016							}
1017	3					17	$option_set->addOption('default_output',$no_output_defined);
1018							}
1019
1020							sub setFilesForOption
1021							{
1022	24			24	1	45	my ($this,$file_info,$sub_dir) = @_;
1023	24					54	my @files;
1024							my $file;
1025	24					0	my $sub_sub_dir;
1026	24					87	$file_info =~ /^([^:]+):(.+)$/;
1027	24					92	@files = split (/,/,$2);
1028	24					66	$sub_sub_dir = $sub_dir . "/" . $1;
1029	24	100				387	if(! -d $sub_sub_dir)
1030							{
1031	9					335	mkdir $sub_sub_dir;
1032							}
1033	24					84	foreach $file (@files)
1034							{
1035	30					117	$this->{OUTPUT}->addFile($sub_sub_dir,$file);
1036							}
1037							}
1038
1039							sub printCorpusForLGPwithTCs
1040							{
1041	0			0	1	0	my ($this,$term_candidates_h,$output_file,$sentence_boundary,$document_boundary,$lgp_mapping_file,$chained_links,$tag_set) = @_;
1042
1043	0					0	my ($occurrences_h,$mapping_to_TCs_h) = $this->orderOccurrencesForXML($term_candidates_h);
1044	0					0	my $LGPmapping_h = $this->loadLGPmappingFile($lgp_mapping_file->getPath);
1045	0					0	$this->printXMLcorpus($occurrences_h,$output_file->getPath,$sentence_boundary,$document_boundary,$mapping_to_TCs_h,$LGPmapping_h,$chained_links,$tag_set);
1046							}
1047
1048
1049							sub printCorpusForLGPwithTTs
1050							{
1051	0			0	1	0	my ($this,$testified_terms_h,$output_file,$sentence_boundary,$document_boundary,$lgp_mapping_file,$parsing_direction,$chained_links,$tag_set) = @_;
1052
1053	0					0	my ($occurrences_h,$mapping_to_TTs_h) = $this->orderOccurrencesForXML($testified_terms_h);
1054	0					0	$this->getBestOccurrences($occurrences_h,$parsing_direction);
1055	0					0	my $LGPmapping_h = $this->loadLGPmappingFile($lgp_mapping_file->getPath);
1056	0					0	$this->printXMLcorpus($occurrences_h,$output_file->getPath,$sentence_boundary,$document_boundary,$mapping_to_TTs_h,$LGPmapping_h,$chained_links,$tag_set);
1057							}
1058
1059
1060							sub getBestOccurrences
1061							{
1062	0			0	1	0	my ($this,$occurrences_h,$parsing_direction) = @_;
1063	0					0	my $doc;
1064							my $sentence;
1065	0					0	my @occurrences;
1066	0					0	my $occurrence;
1067	0					0	my $occurrence_set;
1068	0					0	my $same_start;
1069	0					0	foreach $doc (values %$occurrences_h)
1070							{
1071	0					0	foreach $sentence (values %$doc)
1072							{
1073	0					0	@occurrences = ();
1074	0					0	foreach $same_start (values %$sentence)
1075							{
1076	0					0	foreach $occurrence (@$same_start){
1077	0					0	push @occurrences, $occurrence;
1078							}
1079
1080							}
1081	0					0	foreach $occurrence (@occurrences)
1082							{
1083	0	0				0	if($occurrence->isNotBest(\@occurrences,$parsing_direction))
1084							{
1085	0					0	$this->removeOccurrence($occurrences_h,$occurrence->getDocument->getID,$occurrence->getSentence->getID,$occurrence->getStartChar,$occurrence->getID);
1086
1087							}
1088							}
1089
1090							}
1091							}
1092
1093							}
1094
1095
1096							sub removeOccurrence
1097							{
1098	0			0	1	0	my ($this,$occurrences_h,$doc,$sentence,$start_char,$occ_id) = @_;
1099	0					0	my @tmp;
1100							my $occurrence;
1101	0					0	my $occurrences_set = $occurrences_h->{$doc}{$sentence}{$start_char};
1102	0	0				0	if(scalar @$occurrences_set == 1)
1103							{
1104	0					0	delete $occurrences_h->{$doc}{$sentence}{$start_char};
1105							}
1106							else
1107							{
1108	0					0	foreach $occurrence (@$occurrences_set)
1109							{
1110	0	0				0	if($occurrence->getID != $occ_id)
1111							{
1112	0					0	push @tmp, $occurrence;
1113							}
1114							}
1115	0					0	@{$occurrences_h->{$doc}{$sentence}{$start_char}} = @tmp;
	0					0
1116							}
1117							}
1118
1119							sub printCorpusForBioLG{
1120	0			0	0	0	my ($this,$output_file,$sentence_boundary,$document_boundary,$chained_links) = @_;
1121
1122	0					0	my %occurrences;
1123							my %BioLGmapping;
1124	0					0	my %mapping_to_TTs;
1125
1126	0					0	$this->printXMLcorpus(\%occurrences,$output_file->getPath,$sentence_boundary,$document_boundary,\%mapping_to_TTs,\%BioLGmapping,$chained_links);
1127							}
1128
1129
1130
1131							sub printXMLcorpus
1132							{
1133	0			0	1	0	my ($this,$occurrences_h,$output_file_path,$sentence_boundary,$document_boundary,$mapping_to_TCs_h,$LGPmapping_h,$chained_links,$tag_set) = @_;
1134	0					0	my $word;
1135							my $sentence;
1136	0					0	my $sentence_id;
1137	0					0	my $document_id;
1138	0					0	my $occurrence;
1139	0					0	my $tc;
1140	0					0	my $head_word;
1141	0					0	my $head_index;
1142	0					0	my $local_occurrences_a;
1143	0					0	my $links_a;
1144	0					0	my $counter = 0;
1145	0					0	my $pos;
1146							my $if;
1147	0					0	my $i;
1148	0					0	my $tc_length;
1149	0					0	my $occurrence_set;
1150
1151	0					0	my $fh = FileHandle->new(">" . $output_file_path);
1152	0					0	binmode($fh, ":utf8");
1153	0					0	$this->printXMLheader($fh);
1154
1155	0					0	for ($i=0; $i < scalar @{$this->getWords}; $i++)
	0					0
1156							{
1157	0					0	$word = $this->getWords->[$i];
1158	0	0				0	if(! $word->isDocumentBoundary($document_boundary))
1159							{
1160
1161	0	0				0	if(exists $occurrences_h->{$word->getDocumentID}{$word->getSentenceID}{$word->getStartChar})
1162							{
1163	0					0	$occurrence_set = $occurrences_h->{$word->getDocumentID}{$word->getSentenceID}{$word->getStartChar};
1164	0					0	$occurrence = $occurrence_set->[0];
1165
1166	0					0	$tc = $mapping_to_TCs_h->{$occurrence->getID};
1167	0					0	$tc_length = $tc->getLength;
1168
1169							# previous and next words are not coordinations: to be removed when coordination will be handled ?
1170	0	0	0			0	if(
			0
			0
1171							(
1172							($i == 0)
1173							\|\|
1174							(!($tag_set->existTag('COORDINATIONS',$this->getWords->[$i-1]->getPOS)))
1175							)
1176							&&
1177							(
1178							($i == (scalar @{$this->getWords} -1))
1179							\|\|
1180							(!($tag_set->existTag('COORDINATIONS',$this->getWords->[$i + $tc_length]->getPOS)))
1181							)
1182							)
1183							{
1184
1185	0					0	($head_word,$head_index,$links_a) = $tc->getHeadAndLinks($LGPmapping_h,$chained_links);
1186	0					0	$counter++;
1187	0					0	$pos = $head_word->getPOS;
1188	0					0	$if = $head_word->getIF;
1189	0					0	Lingua::YaTeA::XMLEntities::encode($pos);
1190	0					0	Lingua::YaTeA::XMLEntities::encode($if);
1191	0					0	$sentence .= "
1192	0	0				0	if(scalar $links_a > 0)
1193							{
1194	0					0	$sentence .= " internal=\"" . join ("",@$links_a) . "\"";
1195							}
1196	0					0	$sentence .= " head=\"" . $head_index . "\">";
1197
1198							}
1199							else
1200							{
1201	0					0	undef $occurrence;
1202							}
1203							}
1204	0					0	$pos = $word->getPOS;
1205	0					0	$if = $word->getIF;
1206	0					0	Lingua::YaTeA::XMLEntities::encode($pos);
1207	0					0	Lingua::YaTeA::XMLEntities::encode($if);
1208	0					0	$sentence .= "" . $if . " ";
1209
1210	0	0	0			0	if (
1211							(defined $occurrence)
1212							&&
1213							($occurrence->getEndChar == $word->getStartChar + $word->getLexItem->getLength)
1214							)
1215							{
1216	0					0	undef $occurrence;
1217	0					0	$sentence .= "";
1218							}
1219	0	0				0	if($word->isSentenceBoundary($sentence_boundary))
1220							{
1221	0					0	$sentence =~ s/\s*$//;
1222	0					0	$sentence .= "";
1223	0					0	$sentence_id = $word->getSentence->getID;
1224	0					0	$document_id = $word->getDocument->getID;
1225	0					0	$local_occurrences_a = $occurrences_h->{$document_id}{$sentence_id};
1226
1227	0					0	print $fh $sentence . "\n";
1228	0					0	$sentence = "";
1229							}
1230							}
1231
1232							}
1233							# corpus ended without a final dot
1234	0	0				0	if($sentence ne "")
1235							{
1236	0					0	$sentence .= "";
1237	0					0	print $fh $sentence;
1238							}
1239	0					0	print STDERR $counter . " terms marked\n";
1240	0					0	$this->printXMLtrailer($fh);
1241							}
1242
1243							sub printXMLheader
1244							{
1245	0			0	1	0	my ($this,$fh) = @_;
1246	0					0	print $fh "\n";
1247							}
1248
1249							sub printXMLtrailer
1250							{
1251	0			0	1	0	my ($this,$fh) = @_;
1252	0					0	print $fh "\n\n";
1253							}
1254
1255							sub loadLGPmappingFile
1256							{
1257	0			0	1	0	my ($this,$file_path) = @_;
1258
1259	0					0	my $fh = FileHandle->new("<$file_path");
1260	0					0	my %mapping;
1261							my $line;
1262	0					0	while ($line= $fh->getline)
1263							{
1264	0	0	0			0	if(($line !~ /^\s$/)&&($line !~ /^\s#/)) # line is not empty nor commented
1265							{
1266	0					0	$line =~ /^([^\t]+)\t([^\t]+)\s*\n$/;
1267	0					0	$mapping{$1} = $2;
1268							}
1269							}
1270	0					0	return \%mapping;
1271							}
1272
1273
1274							sub printCandidatesAndUnparsedInCorpus
1275							{
1276	0			0	1	0	my ($this,$term_candidates_h,$unparsable_a,$file,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1277	0					0	my %ids_for_parsed;
1278
1279							# my $fh = FileHandle->new(">".$file->getPath);
1280
1281							my $fh;
1282	0	0				0	if ($file eq "stdout") {
1283	0					0	$fh = \*STDOUT;
1284							} else {
1285	0	0				0	if ($file eq "stderr") {
1286	0					0	$fh = \*STDERR;
1287							} else {
1288	0					0	$fh = FileHandle->new(">".$file->getPath);
1289							}
1290							}
1291	0					0	binmode($fh, ":utf8");
1292
1293
1294	0					0	$this->printHTMLheader($fh);
1295	0					0	my $occurrences_h = $this->orderOccurrences($term_candidates_h,$unparsable_a,\%ids_for_parsed);
1296
1297	0					0	$this->printHTMLCorpus($occurrences_h,\%ids_for_parsed,$fh,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color);
1298	0					0	$this->printHTMLtrailer($fh);
1299							}
1300
1301
1302							sub printHTMLheader
1303							{
1304	0			0	1	0	my ($this,$fh) = @_;
1305	0					0	print $fh
1306							"\n\nTerm Candidates and unparsed phrases in Corpus\n";
1307							}
1308
1309							sub printHTMLtrailer
1310							{
1311	0			0	1	0	my ($this,$fh) = @_;
1312	0					0	print $fh
1313							"";
1314							}
1315
1316							sub printXMLRawCorpus
1317							{
1318	0			0	0	0	my ($this,$file,$sentence_boundary,$document_boundary) = @_;
1319	0					0	my $sentence_id;
1320							my $document_id;
1321	0					0	my $document_name;
1322	0					0	my $word;
1323	0					0	my $first_sentence = 1;
1324	0					0	my $in_doc = 0;
1325	0					0	my $string;
1326							my $last_word;
1327	0					0	my $fh = FileHandle->new(">".$file->getPath);
1328	0					0	binmode($fh,":utf8");
1329	0					0	$this->printXMLheader($fh);
1330	0					0	print $fh " \n";
1331	0					0	foreach $word (@{$this->getWords})
	0					0
1332							{
1333	0	0				0	if($word->isDocumentBoundary($document_boundary)) # new document is started
1334							{
1335	0	0				0	if($in_doc == 1)
1336							{
1337	0	0				0	if(!$last_word->isSentenceBoundary($sentence_boundary)) # last word of document is not a sentence boundary
1338							{
1339	0					0	$string =~ s/ $//;
1340	0					0	Lingua::YaTeA::XMLEntities::encode($string);
1341	0					0	print $fh " getSentence->getID . "\" inDocID=\"" . $word->getSentence->getInDocID . "\">" . $string . "\n";
1342	0					0	$string = "";
1343							}
1344	0					0	print $fh " \n";
1345							}
1346	0					0	print $fh " getDocument->getID . "\"";
1347	0	0				0	if($word->getDocument->getName ne 'no_name')
1348							{
1349	0					0	print $fh " name=\"". $word->getDocument->getName . "\"";
1350							}
1351	0					0	print $fh ">\n";
1352	0					0	$in_doc = 1 ;
1353							}
1354							else
1355							{
1356							# rebuild the sentence from occurrences of words from the corpus
1357	0					0	$string .= $word->getIF . " ";
1358	0					0	$last_word = $word;
1359	0	0				0	if($in_doc == 0) # if no explicit marker of document boundary in the input document
1360							{
1361	0					0	print $fh " getDocument->getID . "\"";
1362	0	0				0	if($word->getDocument->getName ne 'no_name')
1363							{
1364	0					0	print $fh " name=\"". $word->getDocument->getName . "\"";
1365							}
1366	0					0	print $fh ">\n";
1367	0					0	$in_doc = 1;
1368							}
1369	0	0	0			0	if (
1370							($word->isSentenceBoundary($sentence_boundary)) # new sentence is started
1371							\|\|
1372	0					0	($word == $this->getWords->[$#{$this->getWords}]) # last word of the corpus (no final dot)
1373							)
1374							{
1375	0					0	$string =~ s/ $//;
1376	0					0	Lingua::YaTeA::XMLEntities::encode($string);
1377	0					0	print $fh " getSentence->getID . "\" inDocID=\"" . $word->getSentence->getInDocID . "\">" . $string . "\n";
1378	0					0	$string = "";
1379	0	0				0	if($word == $this->getWords->[$#{$this->getWords}])
	0					0
1380							{
1381	0					0	print $fh " \n";
1382							}
1383							}
1384							}
1385
1386							}
1387	0					0	print $fh " \n";
1388							}
1389
1390
1391							sub printHTMLCorpus
1392							{
1393	0			0	1	0	my ($this,$parsed_occurrences_h,$ids_for_parsed_h,$fh,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1394	0					0	my $sentence_id;
1395							my $document_id;
1396	0					0	my $document_name;
1397	0					0	my $word;
1398	0					0	my $occurrence;
1399	0					0	my $local_occurrences_a;
1400	0					0	my $string;
1401	0					0	my $offset = 0;
1402	0					0	my $string_copy;
1403							my $color;
1404
1405
1406	0					0	foreach $word (@{$this->getWords})
	0					0
1407							{
1408	0	0				0	if($word->isDocumentBoundary($document_boundary)) # new sentence is started
1409							{
1410	0					0	print $fh " Document " . $word->getDocument->getID;
1411	0	0				0	if($word->getDocument->getName ne 'no_name')
1412							{
1413	0					0	print $fh " - ". $word->getDocument->getName;
1414							}
1415	0					0	print $fh " ";
1416							}
1417							else
1418							{
1419							# rebuild the sentence from occurrences of words from the corpus
1420	0					0	$string .= $word->getIF . " ";
1421	0	0	0			0	if (
1422							($word->isSentenceBoundary($sentence_boundary)) # new sentence is started
1423							\|\|
1424	0					0	($word == $this->getWords->[$#{$this->getWords}]) # last word of the corpus (no final dot)
1425							)
1426							{
1427	0					0	$string =~ s/ $//;
1428							# get the term candidates occurrences for the next sentence
1429	0					0	$sentence_id = $word->getSentence->getID;
1430	0					0	$document_id = $word->getDocument->getID;
1431	0					0	$document_name = $word->getDocument->getName;
1432
1433	0					0	$local_occurrences_a = $parsed_occurrences_h->{$document_id}{$sentence_id};
1434							# mark term candidates on the rebuilt sentence
1435	0					0	foreach $occurrence (@$local_occurrences_a)
1436							{
1437	0					0	$color = $this->setColor($occurrence->getID,$ids_for_parsed_h,$color_blind_option, $parsed_color, $unparsed_color);
1438	0	0				0	if(!defined $offset)
1439							{
1440	0					0	die;
1441							}
1442	0					0	$string_copy .= substr($string,$offset,$occurrence->getStartChar - $offset). "";
1443	0					0	$string_copy .= substr($string,$occurrence->getStartChar,$occurrence->getEndChar - $occurrence->getStartChar) . "";
1444	0					0	$offset = $occurrence->getEndChar;
1445
1446	0	0				0	if(! substr($string,$offset-1))
1447							{
1448	0					0	print STDERR "problem d'offset pour la phrase DOC:" . $document_id . " - SENT: " . $sentence_id . "\n";
1449	0					0	print STDERR $string . "\n";
1450							}
1451							}
1452
1453	0					0	$string_copy .= substr($string,$offset);
1454	0					0	print $fh $word->getSentence->getInDocID . ":" . $string_copy . " \n";
1455	0					0	$string = "";
1456	0					0	$string_copy = "";
1457	0					0	$offset = 0;
1458							}
1459							}
1460							}
1461							}
1462
1463
1464							sub setColor
1465							{
1466	0			0	1	0	my ($this,$occurrence_id,$ids_for_parsed_h,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1467	0					0	my $color;
1468
1469	0	0				0	if(exists $ids_for_parsed_h->{$occurrence_id})
1470							{
1471	0	0				0	if($color_blind_option->getValue eq 'yes')
1472							{
1473	0	0				0	if (defined $parsed_color) {
1474	0					0	$color = $parsed_color->getValue;
1475							} else {
1476	0					0	$color = "FF0099";
1477							}
1478							}
1479							else
1480							{
1481	0	0				0	if (defined $parsed_color) {
1482	0					0	$color = $parsed_color->getValue;
1483							} else {
1484	0					0	$color = "CC0066";
1485							}
1486							}
1487							}
1488							else
1489							{
1490	0	0				0	if($color_blind_option->getValue eq 'yes')
1491							{
1492	0	0				0	if (defined $unparsed_color) {
1493	0					0	$color = $unparsed_color->getValue;
1494							} else {
1495	0					0	$color = "0000CC";
1496							}
1497							}
1498							else
1499							{
1500	0	0				0	if (defined $unparsed_color) {
1501	0					0	$color = $unparsed_color->getValue;
1502							} else {
1503	0					0	$color = "3366CC";
1504							}
1505							}
1506							}
1507	0					0	return $color;
1508							}
1509
1510							sub orderOccurrencesForXML
1511							{
1512	0			0	1	0	my ($this,$term_h) = @_;
1513	0					0	my %occurrences;
1514							my %mapping_to_TCs;
1515	0					0	my $document;
1516	0					0	my $sentence;
1517	0					0	my $occurrence;
1518	0					0	my $term;
1519	0					0	my $unparsable;
1520	0					0	my $sent_hash;
1521	0					0	my $occurrences_a;
1522
1523	0					0	foreach $term (values (%$term_h))
1524							{
1525
1526	0					0	foreach $occurrence (@{$term->getOccurrences})
	0					0
1527							{
1528							# only the occurrences covering an entire phrase are selected
1529	0	0	0			0	if(
			0
1530							((blessed($term)) && ($term->isa('Lingua::YaTeA::TestifiedTerm')))
1531							\|\|
1532							($occurrence->isMaximal)
1533							)
1534							{
1535	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}{$occurrence->getStartChar}}, $occurrence;
	0					0
1536	0					0	$mapping_to_TCs{$occurrence->getID} = $term;
1537							}
1538
1539							}
1540							}
1541	0					0	return (\%occurrences,\%mapping_to_TCs);
1542							}
1543
1544
1545
1546							sub orderOccurrences
1547							{
1548	0			0	1	0	my ($this,$term_candidates_h,$unparsable_a,$ids_for_parsed_h) = @_;
1549	0					0	my %occurrences;
1550							my $document;
1551	0					0	my $sentence;
1552	0					0	my $occurrence;
1553	0					0	my $term_candidate;
1554	0					0	my $unparsable;
1555	0					0	my $sent_hash;
1556	0					0	my $occurrences_a;
1557
1558
1559	0					0	foreach $term_candidate (values (%$term_candidates_h))
1560							{
1561	0					0	foreach $occurrence (@{$term_candidate->getOccurrences})
	0					0
1562							{
1563	0	0				0	if($occurrence->isMaximal)
1564							{
1565	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}}, $occurrence;
	0					0
1566	0					0	$ids_for_parsed_h->{$occurrence->getID}++;
1567							}
1568							}
1569							}
1570	0					0	foreach $unparsable (@$unparsable_a)
1571							{
1572	0					0	foreach $occurrence (@{$unparsable->getOccurrences})
	0					0
1573							{
1574	0	0				0	if($occurrence->isMaximal)
1575							{
1576	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}}, $occurrence;
	0					0
1577							}
1578							}
1579							}
1580	0					0	while (($document,$sent_hash) = each (%occurrences))
1581							{
1582	0					0	while (($sentence,$occurrences_a) = each (%$sent_hash))
1583							{
1584	0					0	@$occurrences_a = sort ({$a->getStartChar <=> $b->getStartChar} @$occurrences_a);
	0					0
1585							}
1586
1587							}
1588	0					0	return \%occurrences;
1589							}
1590
1591							sub getWords
1592							{
1593	0			0	1	0	my ($this) = @_;
1594	0					0	return $this->{WORDS};
1595							}
1596
1597							sub selectOnTermListStyle {
1598	0			0	0	0	my ($this, $term_candidates_h,$term_list_style,$debug_fh) = @_;
1599
1600	0					0	my $tc;
1601							# warn "selectOnTermListStyle ($term_list_style)\n";
1602	0					0	foreach $tc (values (%$term_candidates_h))
1603							{
1604							# warn $tc->getIF . "\n";
1605							# warn ($tc->isa('Lingua::YaTeA::MultiWordTermCandidate') * 1) . "\n";
1606	0	0	0			0	if (($term_list_style ne "") && ($term_list_style ne "all") &&
			0
			0
1607							(($term_list_style ne "multi") \|\| ((blessed($tc)) && ($tc->isa('Lingua::YaTeA::MultiWordTermCandidate') != 1)))) {
1608	0					0	$tc->setTermStatus(0);
1609							}
1610							# warn "" . (1 * $tc->getTermStatus) . "\n";
1611							# warn " " . ($tc->isTerm * 1) . "\n";
1612							}
1613							}
1614
1615							sub makeDDW
1616							{
1617	2			2	0	10	my ($this,$term_candidates_h,$fh) = @_;
1618	2					7	my $tc_weight;
1619							my $mean_occ;
1620	2					14	my $total_occ = 0;
1621	2					16	my $total_doc = $this->getDocumentSet->getDocumentNumber;
1622	2					14	my %doc_by_tc;
1623							my %docs_for_this_tc;
1624	2					0	my $tc;
1625	2					0	my $occ;
1626
1627
1628	2					16	foreach $tc (values (%$term_candidates_h))
1629							{
1630	240					392	%docs_for_this_tc = ();
1631	240					314	foreach $occ (@{$tc->getOccurrences})
	240					487
1632							{
1633	330					680	$docs_for_this_tc{$occ->getDocument->getID}++;
1634	330					561	$total_occ++;
1635							}
1636	240					488	$doc_by_tc{$tc->getKey} = scalar keys(%docs_for_this_tc);
1637							}
1638	2	50				25	if (scalar(keys(%$term_candidates_h)) > 0) {
1639	2					16	$mean_occ = $total_occ / scalar keys %$term_candidates_h;
1640							} else {
1641	0					0	$mean_occ = 0;
1642							}
1643	2					25	foreach $tc (values (%$term_candidates_h))
1644							{
1645							##### measure 'descriptor discriminating weight' described in 'Building back-of-the-book indexes', Nazarenko, Ait El Mekki (2005)
1646							##### PROBLEM: each time there is only one document in the corpus, the value is 0 (log10(1/1) = 0)
1647	240					486	$tc_weight = ($tc->getFrequency/$mean_occ) * log10 ($total_doc/$doc_by_tc{$tc->getKey});
1648	240					1133	$tc->setWeight($tc_weight);
1649							}
1650							}
1651
1652							sub processTotalDocOccurrences
1653							{
1654	0			0	0		my ($this,$occurrences_h) = @_;
1655	0						my $total;
1656							my $occ;
1657							# print STDERR $occurrences_h. "\n";
1658	0						foreach $occ (values (%$occurrences_h))
1659							{
1660	0						$total += $occ;
1661							}
1662	0						return $total;
1663							}
1664
1665
1666							1;
1667
1668							__END__