File Coverage

blib/lib/Lingua/YaTeA/Corpus.pm

Criterion	Covered	Total	%
statement	405	766	52.8
branch	100	236	42.3
condition	45	117	38.4
subroutine	52	73	71.2
pod	49	56	87.5
total	651	1248	52.1

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::YaTeA::Corpus;
2	3			3		70501	use strict;
	3					15
	3					95
3	3			3		18	use warnings;
	3					6
	3					87
4	3			3		643	use Data::Dumper;
	3					6938
	3					156
5	3			3		571	use UNIVERSAL;
	3					18
	3					25
6	3			3		149	use Scalar::Util qw(blessed);
	3					10
	3					158
7	3			3		19	use File::Path;
	3					6
	3					165
8	3			3		1522	use POSIX qw(log10);
	3					19468
	3					17
9
10	3			3		5663	use Lingua::YaTeA::ForbiddenStructureMark;
	3					8
	3					33
11	3			3		1313	use Lingua::YaTeA::TestifiedTermMark;
	3					7
	3					89
12	3			3		1296	use Lingua::YaTeA::Sentence;
	3					10
	3					32
13	3			3		1273	use Lingua::YaTeA::Lexicon;
	3					10
	3					30
14	3			3		1252	use Lingua::YaTeA::DocumentSet;
	3					8
	3					39
15	3			3		1262	use Lingua::YaTeA::SentenceSet;
	3					11
	3					1139
16	3			3		264	use Lingua::YaTeA::WordFromCorpus;
	3					1415
	3					20
17	3			3		480	use Lingua::YaTeA::XMLEntities;
	3					7
	3					21
18
19	3			3		674	use Encode qw(:fallbacks);;
	3					9858
	3					25623
20
21
22							our $VERSION=$Lingua::YaTeA::VERSION;
23
24							our $forbidden_counter = 0;
25							our $tt_counter = 0;
26							our $split_counter = 0;
27
28
29							sub new
30							{
31	3			3	1	13	my ($class,$path,$option_set,$message_set) = @_;
32	3					10	my $this = {};
33	3					9	bless ($this,$class);
34	3					21	$this->{PATH} = $path;
35	3					9	$this->{NAME} = ();
36	3					30	$this->{LEXICON} = Lingua::YaTeA::Lexicon->new;
37	3					30	$this->{DOCUMENTS} = Lingua::YaTeA::DocumentSet->new;
38	3					37	$this->{SENTENCES} = Lingua::YaTeA::SentenceSet->new;
39	3					9	$this->{WORDS} = [];
40	3					9	$this->{OUTPUTS} = ();
41	3					16	$this->setName;
42
43
44	3					16	$this->setOutputFiles($option_set,$message_set);
45	3					14	return $this;
46							}
47
48							sub preLoadLexicon
49							{
50	1			1	1	5	my ($this,$sentence_boundary,$document_boundary,$match_type) = @_;
51	1					5	my $fh = $this->getFileHandle;
52	1					3	my $word;
53							my %lexicon;
54	1					5	while (! $fh->eof)
55							{
56	299					6283	$word = $fh->getline;
57	299	100	100			7907	if(
			66
58							($word=~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/)
59							&&
60							($2 ne $sentence_boundary)
61							&&
62							($2 ne $document_boundary)
63							)
64							{
65	288	50				564	if($match_type ne "strict")
66							{
67	288					672	$lexicon{lc($1)}++; # record IF
68	288	50				531	if($match_type eq "loose")
69							{
70	288					869	$lexicon{lc($3)}++; # record LF
71							}
72							}
73							else
74							{
75	0					0	$lexicon{lc($1)."~".$2}++; # record IF + POS
76							}
77							}
78							}
79	1					40	return \%lexicon;
80							}
81
82
83							sub _normalizeInputCorpusLine {
84	20			20		1826	my ($this, $block, $language) = @_;
85
86	20					67	my $line;
87							my @elems;
88	20					0	my @elems_out;
89	20					41	my $new_block = "";
90
91	20					36	my @septags;
92
93	20	50	33			97	if ((defined $language) && ($language eq "FR-Flemm")) {
94	0					0	foreach $line (split /\n/, $block) {
95	0					0	$line =~ s/
96	0					0	$line =~ s/>/SUP/go;
97	0					0	$line =~ s/\t:\t/\tCOLUMN\t/go;
98
99	0					0	@elems = split /\t/, $line;
100
101							# warn "-> $line" . scalar(@elems) . "\n";
102
103
104	0	0				0	if (scalar(@elems) > 3) {
105							# ambiguity in the pos tagging
106	0					0	my @tmp = split /\s\\|\\|\s/, $elems[2];
107	0					0	$elems[2] = shift @tmp;
108	0					0	$#elems = 2;
109							}
110	0	0				0	if (scalar(@elems) == 3) {
111	0					0	@septags = split /:/, $elems[1];
112	0					0	my $tag;
113	0	0				0	if (scalar(@septags) == 2) {
		0
114	0					0	$tag = $septags[1];
115							} elsif (scalar(@septags) == 3) {
116	0					0	$tag = $septags[2];
117							} else {
118	0					0	$tag = $septags[0];
119							}
120							# if the tag is PUN(cit)
121	0	0				0	if ($tag eq "PUN(CIT)") {
122	0					0	$tag = "PUN";
123							}
124							# if the tag is Sp+Da, it is transformed as SpDa
125	0					0	$tag =~ s/\+D/D/;
126							# if the word is 'une', the postag is corrected
127	0	0				0	if ($elems[0] eq "une") {
128	0					0	$tag = "Da3fs---";
129							}
130							# if it's a present participle, the postag is then Vmpp-----
131	0	0				0	if ($tag =~ /Vmpp/) {
132	0					0	$tag = "Vmpp-----";
133							}
134							# if the word is 'l', the postag is corrected
135	0	0				0	if (lc($elems[0]) eq "l") {
136	0					0	$tag = "Da3-s---";
137							}
138	0					0	$elems[1] = $tag;
139							}
140	0	0				0	if (scalar(@elems) == 3) {
141	0					0	$new_block .= join("\t", @elems) . "\n";
142							}
143							}
144	0					0	return($new_block);
145							} else {
146							# warn "Language is $language, so nothing to do\n";
147
148							# foreach $line (split /\n/, $block) {
149							# $line =~ s/
150							# $line =~ s/>/SUP/go;
151							# $line =~ s/\t:\t/\tCOLUMN\t/go;
152							# $line =~ s/\t\t+/\t/go;
153							# if($line !~ /^[^\t]\t[^\t]+\t[^\t]$/o){
154							# # warn "***********************************\n";
155							# # warn "Start correction of the line: $line\n";
156							# @elems = split /\t/, $line;
157							# if (scalar(@elems) > 3) {
158							# # ambiguity in the pos tagging
159							# # my @tmp = split /\s\\|\\|\s/, $elems[2];
160							# # $elems[2] = shift @tmp;
161							# $#elems = 2;
162							# } else {
163							# if (defined $elems[0]) {
164							# $elems[2] = $elems[0];
165							# if (!defined $elems[1]) {
166							# # $elems[1] = 'SYM';
167							# $elems[1] = $elems[0];
168							# }
169							# } else {
170							# @elems =();
171							# }
172
173							# }
174							# if (scalar(@elems) == 3) {
175							# $new_block .= join("\t", @elems) . "\n";
176							# }
177							# } else {
178							# $new_block .= $line . "\n";
179							# }
180							# }
181							# $new_block .= join("\t", @elems) . "\n";
182							# return($new_block);
183	20					70	return($block);
184							}
185							}
186
187
188							sub read
189							{
190	2			2	1	10	my ($this,$sentence_boundary,$document_boundary,$FS_set,$testified_set,$match_type,$message_set,$display_language, $language,$debug_fh) = @_;
191	2					5	my $num_line = 0;
192	2					11	my $fh = $this->getFileHandle;
193	2					6	my $block;
194
195							# local $/ = "\.\t". $sentence_boundary ."\t\.\n";
196	2					12	$this->getSentenceSet->addSentence($this->getDocumentSet);
197	2					12	while (! $fh->eof)
198							{
199	20					266	$block = $this->_normalizeInputCorpusLine(Encode::decode("UTF-8", $this->readSentence($fh,$sentence_boundary)), $language);
200	20					91	$this->WrapBlock(\$block);
201	20					80	$this->MarkForbiddenStructures(\$block,$FS_set);
202	20					102	$this->MarkTestifiedTerms(\$block,$testified_set,$match_type,$debug_fh);
203	20					82	$this->UnwrapBlock(\$block);
204
205	20					81	$this->recordWords($block,$sentence_boundary,$document_boundary,\$num_line,$message_set,$display_language);
206							}
207							}
208
209							sub readSentence {
210	20			20	0	53	my ($this, $fh, $sentence_boundary) = @_;
211
212							# warn "in readsentence\n";
213	20					44	my $line;
214							my $sentence;
215	20	50				45	if (! $fh->eof) {
216	20		100			118	do {
217	598					13774	$line = $fh->getline;
218							# warn "line: $line;\n";
219	598					13077	$sentence .= $this->correctInputLine($line);
220							# $sentence .= $line;
221							} while ((!$fh->eof) && (index($line, "\t$sentence_boundary\t") == -1));
222							# warn "sentence: $sentence\n";
223							}
224	20					291	return($sentence);
225							}
226
227							sub correctInputLine {
228	598			598	0	1122	my ($line, $this) = reverse(@_);
229
230	598					785	my @elems;
231	598					796	my $tail = "";
232	598	50				1926	if ($line =~ /(\n+)$/) {
233	598					1161	$tail = $1;
234							}
235	598					960	chomp $line;
236
237	598					965	$line =~ s/
238	598					782	$line =~ s/>/SUP/go;
239	598					792	$line =~ s/\t:\t/\tCOLUMN\t/go;
240	598					888	$line =~ s/\t\t+/\t/go;
241	598	100				1791	if($line !~ /^[^\t]\t[^\t]+\t[^\t]$/o){
242							# warn "***********************************\n";
243							# my $line2 = $line;
244							# $line2 =~ s/\t/\\t/go;
245							# warn "Start correction of the line: " . $line2 . "\n";
246	4					14	@elems = split /\t/, $line;
247	4	50				13	if (scalar(@elems) > 3) {
248							# ambiguity in the pos tagging
249							# my @tmp = split /\s\\|\\|\s/, $elems[2];
250							# $elems[2] = shift @tmp;
251	0					0	$#elems = 2;
252							} else {
253	4	50	33			31	if ((defined $elems[0]) && (length($elems[0])>0)) {
254	0					0	$elems[2] = $elems[0];
255	0	0				0	if (!defined $elems[1]) {
256							# $elems[1] = 'SYM';
257	0					0	$elems[1] = $elems[0];
258							}
259							} else {
260	4					9	@elems =();
261							}
262
263							}
264	4	50				10	if (scalar(@elems) == 3) {
265							# warn "Corrected line: " . join('\t', @elems). "\n";
266							# warn "***********************************\n";
267	0					0	return(join("\t", @elems) . $tail);
268							} else {
269							# warn "Removing line\n";
270							# warn "***********************************\n";
271	4					17	return("");
272							}
273							} else {
274	594					2309	return($line . $tail);
275							}
276
277	0					0	return($line . $tail);
278							}
279
280							sub recordWords
281							{
282	20			20	1	76	my ($this,$block,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language) = @_;
283	20					35	my $word;
284	20					262	my @words = split /\n/,$block;
285
286	20					53	foreach $word (@words)
287							{ # record each word of the sentence
288	612					868	$$num_line++;
289	612	50				2046	if ($word !~ /^\s*$/)
290							{
291	612					1176	$this->addWordFromCorpus($word,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language);
292							}
293							}
294							}
295
296							sub addWordFromCorpus
297							{
298	612			612	1	1242	my ($this,$form,$sentence_boundary,$document_boundary,$num_line,$message_set,$display_language) = @_;
299	612					792	my $word;
300	612					1002	chomp $form;
301
302	612	100				2144	if($form =~ /^[^\t]\t[^\t]+\t[^\t]$/o){
303	594					1298	$word = Lingua::YaTeA::WordFromCorpus->new($form,$this->getLexicon,$this->getSentenceSet);
304							}
305							else{
306	18	100				82	if($form =~ /\<\/?FORBIDDEN/)
307							{
308	12					64	$word = Lingua::YaTeA::ForbiddenStructureMark->new($form);
309							}
310							else
311							{
312	6	50				28	if($form =~ /\<\/?FRONTIER/)
313							{
314	6					26	$word = Lingua::YaTeA::TestifiedTermMark->new($form);
315							}
316							else
317							{
318	0					0	warn $message_set->getMessage('INVALID_TOKEN')->getContent($display_language) . $$num_line . $message_set->getMessage('IN_FILE')->getContent($display_language) . $this->getPath . " ($form)";
319	0					0	die "\n";
320							}
321							}
322							}
323	612					928	push @{$this->{WORDS}}, $word;
	612					1209
324	612					1398	$this->incrementCounters($word,$sentence_boundary,$document_boundary);
325	612					1499	return $word;
326							}
327
328							sub incrementCounters
329							{
330	612			612	1	1105	my ($this,$word,$sentence_boundary,$document_boundary) = @_;
331
332	612	100	66			3284	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
333							{
334	594					928	$Lingua::YaTeA::WordFromCorpus::counter++;
335	594	100				1269	if ($word->isSentenceBoundary($sentence_boundary))
336							{
337	18					30	$Lingua::YaTeA::Sentence::counter++;
338	18					36	$Lingua::YaTeA::Sentence::in_doc_counter++;
339	18					67	Lingua::YaTeA::Sentence::resetStartChar;
340	18					54	$this->getSentenceSet->addSentence($this->getDocumentSet);
341							}
342							else{
343	576	50				1196	if ($word->isDocumentBoundary($document_boundary))
344							{
345	0					0	$this->getDocumentSet->addDocument($word);
346	0					0	Lingua::YaTeA::Sentence->resetStartChar;
347	0					0	$this->getSentenceSet->addSentence($this->getDocumentSet);
348	0					0	Lingua::YaTeA::Sentence->resetInDocCounter;
349	0					0	$word->updateSentence($this->getSentenceSet);
350	0					0	$word->updateStartChar;
351
352	0					0	$Lingua::YaTeA::Sentence::counter++;
353	0					0	$Lingua::YaTeA::Sentence::in_doc_counter++;
354	0					0	$this->getSentenceSet->addSentence($this->getDocumentSet);
355
356							}
357							else{
358	576					1406	Lingua::YaTeA::Sentence->updateStartChar($word);
359							}
360							}
361							}
362
363							}
364
365							sub print
366							{
367	0			0	1	0	my ($this,$sentence_boundary,$document_boundary) = @_;
368	0					0	my $word;
369	0					0	foreach $word (@{$this->{WORDS}} )
	0					0
370							{
371	0	0	0			0	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::WordFromCorpus")))
372							{
373	0	0				0	if ($word->isSentenceBoundary($sentence_boundary))
374							{
375	0					0	print $word->getLexItem->getIF . "\n";
376							}
377							else
378							{
379	0	0				0	if($word->isDocumentBoundary($document_boundary))
380							{
381	0					0	print "\n" . $word->getLexItem->getIF . "\n";
382							}
383							else
384							{
385	0					0	print $word->getLexItem->getIF . " ";
386							}
387							}
388							}
389							else{
390	0					0	print $word->getForm . "\n";
391							}
392							}
393							}
394
395
396							sub selectTestifiedTerms {
397	20			20	1	49	my ($this,$block_r,$testified_set,$match_type) = @_;
398	20					309	my @block_lines = split ("\n", $$block_r);
399	20					85	my %block_lexicon;
400							my $word;
401	20					0	my $testified;
402	20					0	my %block_testified_set;
403
404	20	100	66			168	if((defined $testified_set) && ($testified_set->size > 0)) {
405	10					25	foreach $word (@block_lines) {
406	313	100				974	if ($word=~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/) {
407	297	50				487	if($match_type ne "strict") {
408	297					791	$block_lexicon{lc($1)}++; # record IF
409	297	50				521	if($match_type eq "loose") {
410	297					633	$block_lexicon{lc($3)}++; # record LF
411							}
412							}
413							else {
414	0					0	$block_lexicon{lc($1)."~".$2}++; # record IF + POS
415							}
416							}
417							}
418	10					19	foreach $testified (values %{$testified_set->getTestifiedTerms}) {
	10					29
419	10	100				51	if($testified->isInLexicon(\%block_lexicon,$match_type) == 1) {
420	3					14	$block_testified_set{$testified->getID} = $testified;
421							}
422							}
423							}
424	20					129	return \%block_testified_set;
425							}
426
427
428
429
430							sub MarkTestifiedTerms
431							{
432	20			20	1	71	my ($this,$block_r,$testified_set,$match_type,$debug_fh) = @_;
433	20					37	my $testified;
434							my $reg_exp;
435	20					44	my $id = 0;
436							# print $debug_fh $$block_r . "\n";
437	20					71	my $selected_TTs_h = $this->selectTestifiedTerms($block_r,$testified_set,$match_type);
438
439	20	50				63	if (defined $selected_TTs_h)
440							{
441	20					85	foreach $testified (values %$selected_TTs_h)
442							{
443							# print $debug_fh $testified->getIF . "\n";
444	3					15	$reg_exp = $testified->getRegExp;
445	3					481	$$block_r =~ s/($reg_exp)/$this->createAnnotation($1,\$id,$testified)/gei;
	3					16
446	3					42	$$block_r =~ s/\n\n/\n/g;
447							}
448							}
449							}
450
451							sub createAnnotation
452							{
453	3			3	1	16	my ($this,$match,$id_r,$testified) = @_;
454	3					5	my $type;
455
456	3					15	my $annotation = "\n\\n" . $match . "\n<\/FRONTIER ID=" . $$id_r . " TT=" . $testified->getID ."\>\n";
457	3					8	$$id_r++;
458
459	3					223	return $annotation;
460							}
461
462
463							sub MarkForbiddenStructures
464							{
465	20			20	1	48	my ($this,$block_r,$FS_set) = @_;
466	20					93	my $FS_any_a = $FS_set->getSubset("ANY");
467	20					37	my $FS;
468	20					33	my $ID = 0;
469	20					57	my $reg_exp;
470							my $action;
471	20					0	my $split_after;
472
473	20					52	foreach $FS (@$FS_any_a)
474							{
475	860					2402	$action = $FS->getAction;
476	860					1751	$reg_exp = $FS->getRegExp;
477	860	100				1726	if ($action eq "delete"){
478	580					49791	$$block_r =~ s/($reg_exp)/\n\$1\<\/FORBIDDEN ID=$ID ACTION=$action\>\n/ig;
479							}
480							else{
481	280	50				565	if ($action eq "split"){
482	280					538	$split_after = $FS->getSplitAfter;
483	280					34253	$$block_r =~ s/($reg_exp)/\n\$1\<\/FORBIDDEN ID=$ID ACTION=$action SPLIT\_AFTER=$split_after\>\n/ig;
484							}
485							}
486	860					2896	$ID++;
487							}
488							}
489
490							sub WrapBlock
491							{
492	20			20	1	51	my ($this,$block_r) = @_;
493	20					112	$$block_r =~ s/\r//g;
494	20					146	$$block_r =~ s/^/\n/;
495	20					174	$$block_r =~ s/$/\n/;
496							}
497
498							sub UnwrapBlock
499							{
500	20			20	1	50	my ($this,$block_r) = @_;
501	20					176	$$block_r =~ s/^\n//;
502	20					190	$$block_r =~ s/\n$//;
503							}
504
505
506
507
508
509							sub chunk
510							{
511	2			2	1	10	my ($this,$phrase_set,$sentence_boundary,$document_boundary,$chunking_data,$FS_set,$tag_set,$parsing_pattern_set,$testified_term_set,$option_set,$fh) = @_;
512	2					265	my $word;
513							my $i;
514	2					0	my @words;
515	2					0	my $action;
516	2					8	my $split_after = -1;
517	2					11	my $valid;
518							my $num_content_words;
519	2					0	my @clean_corpus;
520	2					0	my $term_frontiers_h;
521	2					11	my $compulsory = $option_set->getCompulsory;
522	2					9	my $max_length = $option_set->getMaxLength;
523
524	2					116	print STDERR "MAX_LENGTH: " . $max_length . "\n";
525	2					18	for ($i = 0; $i <= $this->size; $i++){
526	614					1114	$word = $this->getWord($i);
527	614	100	66			1807	if ((defined $fh) && (defined $word))
528							{
529	612					1517	$word->print($fh);
530							}
531							# if (defined $word) {
532							# print STDERR "> ($word)";
533							# $word->print(\*STDERR);
534							# }
535	614	100	100			1297	if(
536							($i == $this->size) # last word of the corpus
537							\|\|
538							($word->isChunkEnd(\$action,\$split_after,$sentence_boundary,$document_boundary,$chunking_data) == 1)
539							)
540							{
541	248					720	($valid,$num_content_words,$term_frontiers_h) = $this->cleanChunk(\@words,$chunking_data,$FS_set,$option_set->getCompulsory,$tag_set,$fh);
542							# print STDERR "====$valid\n";
543							# foreach my $w (@words) {
544							# $w->print(\*STDERR);
545							# }
546							# print STDERR "====\n";
547
548	248	100				673	if ($valid == 1)
549							{
550	122					299	$phrase_set->recordOccurrence(\@words,$num_content_words,$tag_set,$parsing_pattern_set,$option_set,$term_frontiers_h,$testified_term_set,$this->getLexicon,$this->getSentenceSet,$fh);
551	122					226	$Lingua::YaTeA::Corpus::tt_counter = 0;
552							}
553
554	248					429	@words = ();
555
556							}
557							else{
558	366					596	push @words, $word;
559							}
560
561
562							# warn "ref= " . ref($word) . "\n";
563	614	100	66			3602	if((defined $word) && ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus'))))
			100
564							{
565	594					1701	push @clean_corpus,$word;
566							}
567							}
568
569	2					99	$this->{WORDS} = \@clean_corpus;
570
571							}
572
573
574							sub cleanChunk
575							{
576	248			248	1	517	my ($this,$words_a,$chunking_data,$FS_set,$compulsory,$tag_set,$fh) = @_;
577	248					339	my $num_content_words;
578							my $term_frontiers_h;
579
580	248	100				516	if ($this->pruneFromStart($words_a,$chunking_data,$FS_set,$fh) == 1)
581							{
582
583	124	100				276	if($this->pruneFromEnd($words_a,$chunking_data,$FS_set,$fh) == 1)
584							{
585	122	50				255	if($this->checkCompulsory($words_a,$compulsory,$fh) == 1)
586							{
587	122					307	($num_content_words,$term_frontiers_h) = $this->deleteAnnotationMarks($words_a,$tag_set,$fh);
588	122					383	return (1,$num_content_words,$term_frontiers_h);
589							}
590	0					0	return (0,0);
591							}
592	2					8	return (0,0); # no words left
593							}
594	124					330	return (0,0); # no words left
595							}
596
597
598							sub deleteAnnotationMarks
599							{
600	122			122	1	225	my ($class,$words_a,$tag_set,$fh) = @_;
601	122					191	my $word;
602							my @tmp;
603	122					158	my $content_words = 0;
604	122					169	my $index = 0;
605	122					180	my %term_frontiers;
606							my $frontier;
607
608	122					205	foreach $word (@$words_a){
609	292	100	66			1260	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::WordFromCorpus")))
610							{
611	286	100				614	if ($tag_set->existTag('CANDIDATES',$word->getPOS))
612							{
613	250					362	$content_words++;
614							}
615	286					484	push @tmp, $word;
616	286					491	$index++;
617							}
618							else
619							{
620	6	50	33			30	if ((blessed($word)) && ($word->isa("Lingua::YaTeA::TestifiedTermMark")))
621							{
622	6	100				17	if($word->isOpener)
623							{
624	3					13	$term_frontiers{$word->getID} = $word;
625	3					9	$word->{START} = $index; # should use setStart
626							}
627							else
628							{
629	3	50				9	if($word->isCloser)
630							{
631	3					8	$frontier = $term_frontiers{$word->getID};
632	3					8	$frontier->{END} = $index; # should use setEnd
633							}
634							}
635							}
636							}
637							}
638	122					269	@$words_a = @tmp;
639
640	122					400	return ($content_words,\%term_frontiers);
641							}
642
643
644
645
646							sub pruneFromStart
647							{
648	248			248	1	457	my ($this,$words_a,$chunking_data,$FS_set,$fh) = @_;
649	248					350	my $i =0;
650	248					351	my $word;
651							my $potential_FS_a;
652	248					368	my $inside_testified = 0;
653	248					378	my %testified_frontiers;
654
655
656	248					519	while ($i < scalar @$words_a)
657							{
658	188					282	$word = $words_a->[$i];
659
660	188	100	66			1118	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
661							{
662	2					9	return 1;
663							}
664							else
665							{
666	186	100	66			797	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
667							{
668
669	182	100				410	if ($word->isCleaningFrontier($chunking_data))
670							{
671	122	100				385	if(
672							# ($inside_testified == 0)
673							# &&
674							# ($word->isa('Lingua::YaTeA::WordFromCorpus'))
675							# &&
676							($potential_FS_a = $word->isStartTrigger($FS_set->getTriggerSet("START")))
677							)
678							{
679	25	50				65	if(!$this->expandStartTriggers($potential_FS_a,$words_a,$fh))
680							{
681	25					46	last;
682							}
683							}
684							else
685							{
686	97					169	last;
687							}
688							}
689
690
691							}
692	64					169	shift @$words_a; # delete element
693							}
694							}
695	246	100				590	if(scalar @$words_a > 0)
696							{
697	122					400	return 1;
698							}
699	124					326	return 0;
700							}
701
702							sub pruneFromEnd
703							{
704	124			124	1	229	my ($this,$words_a,$chunking_data,$FS_set,$fh) = @_;
705	124					230	my $i = $#$words_a;
706	124					207	my $word;
707							my $potential_FS_a;
708	124					173	my $inside_testified = 0;
709	124					179	my %testified_frontiers;
710	124					180	my $deleted = 0;
711
712	124					249	while ($i >= 0)
713							{
714	132					212	$word = $words_a->[$i];
715
716	132	50	33			745	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
717							{
718	0					0	return 1;
719							}
720							else
721							{
722	132	50	33			552	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
723							{
724	132	100				300	if ($word->isCleaningFrontier($chunking_data))
725
726							{
727	126	100	33			864	if(
			33
			66
728							($inside_testified == 0)
729							&&
730							((blessed($word)) && ($word->isa('Lingua::YaTeA::WordFromCorpus')))
731							&&
732							($potential_FS_a = $word->isEndTrigger($FS_set->getTriggerSet("END")))
733							)
734							{
735	4	50				19	if(!$this->expandEndTriggers($potential_FS_a,$words_a,$fh))
736							{
737	0					0	last;
738							}
739							else
740							{
741	4					9	$deleted = 1;
742	4	100				14	if(scalar @$words_a == 0)
743							{
744	2					9	return 0;
745							}
746							else
747							{
748	2					4	$i = $#$words_a;
749							}
750							}
751							}
752							else
753							{
754	122					200	last;
755							}
756							}
757							}
758	8	50	33			66	if ((blessed($word)) && ($word->isa('Lingua::YaTeA::ForbiddenStructureMark')))
759							{
760	0					0	my $del = pop @$words_a; # delete element
761	0					0	$i--;
762							}
763							else
764							{
765	8	100				40	if($deleted == 0)
766							{
767	6					16	my $del = pop @$words_a; # delete element
768	6					10	$i--;
769							}
770							}
771	8					22	$deleted = 0;
772
773
774							}
775							}
776	122	50				321	if(scalar @$words_a > 0)
777							{
778	122					341	return 1;
779							}
780	0					0	return 0;
781							}
782
783
784
785
786							sub checkCompulsory
787							{
788	122			122	1	276	my ($this,$words_a,$compulsory,$fh) = @_;
789	122					166	my $word;
790	122					211	foreach $word (@$words_a)
791							{
792	147	50	33			821	if (!((blessed($word)) && ($word->isa('Lingua::YaTeA::ForbiddenStructureMark'))))
793							{
794
795	147	100	66			784	if (
			100
796							((blessed($word)) && ($word->isa('Lingua::YaTeA::TestifiedTermMark')))
797							\|\|
798							($word->isCompulsory($compulsory))
799							)
800							{
801	122					323	return 1;
802							}
803							}
804							}
805	0					0	return 0;
806							}
807
808
809
810							sub getWord
811							{
812	614			614	1	981	my ($this,$i) = @_;
813	614					1148	return $this->{WORDS}->[$i];
814							}
815
816							sub size
817							{
818	1230			1230	1	1842	my ($this) = @_;
819	1230					1514	return scalar @{$this->{WORDS}};
	1230					3575
820							}
821
822
823							sub expandStartTriggers
824							{
825	25			25	1	61	my ($this,$potential_FS_a,$words_a,$fh) = @_;
826	25					98	my $FS;
827							my $i;
828	25					0	my $j;
829	25					0	my $to_find;
830	25					0	my $to_delete;
831
832	25					44	foreach $FS (@$potential_FS_a)
833							{
834
835	50	100				159	if($FS->getLength <= scalar @$words_a)
836							{
837	46					126	$to_delete = $FS->apply($words_a);
838	46	50				135	if(defined $to_delete)
839							{
840	0					0	last;
841							}
842
843							}
844							}
845	25	50				63	if(defined $to_delete)
846							{
847	0					0	while($to_delete != 1)
848							{
849	0	0	0			0	if ((blessed($words_a->[0])) && ($words_a->[0]->isa('Lingua::YaTeA::TestifiedTermMark')))
850							{
851	0					0	return 1;
852							}
853							else
854							{
855	0					0	my $del = shift @$words_a;
856	0					0	$to_delete--;
857							}
858							}
859	0					0	return 1;
860							}
861	25					59	return 0;
862							}
863
864
865							sub expandEndTriggers
866							{
867	4			4	1	13	my ($this,$potential_FS_a,$words_a,$fh) = @_;
868	4					24	my $FS;
869							my $i;
870	4					0	my $j;
871	4					0	my $to_find;
872	4					0	my $to_delete;
873
874	4					11	foreach $FS (@$potential_FS_a)
875							{
876
877	4	50				13	if($FS->getLength <= scalar @$words_a)
878							{
879	4					17	$to_delete = $FS->apply($words_a);
880	4	50				13	if(defined $to_delete)
881							{
882	4					10	last;
883							}
884							}
885							}
886	4	50				21	if(defined $to_delete)
887							{
888	4					14	while($to_delete != 0)
889							{
890	4	50	33			50	if ((blessed($words_a->[$#$words_a])) && ($words_a->[$#$words_a]->isa('Lingua::YaTeA::TestifiedTermMark')))
891							{
892	0					0	return 1;
893							}
894							else
895							{
896	4					13	my $w = pop @$words_a;
897	4					13	$to_delete--;
898							}
899							}
900	4					14	return 1;
901							}
902	0					0	return 0;
903							}
904
905							sub getSentenceSet
906							{
907	738			738	1	1268	my ($this) = @_;
908	738					1964	return $this->{SENTENCES};
909							}
910
911							sub getDocumentSet
912							{
913	22			22	1	49	my ($this) = @_;
914	22					85	return $this->{DOCUMENTS};
915							}
916
917							sub getFileHandle
918							{
919	3			3	1	8	my ($this) = @_;
920	3					19	my $path = $this->getPath;
921							# print STDERR "corpus :" . $path . "\n";
922	3					26	my $fh = FileHandle->new("<$path");
923							# binmode($fh, ":utf8");
924	3					235	return $fh;
925							}
926
927							sub getPath
928							{
929	6			6	1	14	my ($this) = @_;
930	6					46	return $this->{PATH};
931							}
932
933							sub getName
934							{
935	6			6	1	22	my ($this) = @_;
936	6					55	return $this->{NAME};
937							}
938
939							sub getOutputFileSet
940							{
941	16			16	1	44	my ($this) = @_;
942	16					87	return $this->{OUTPUT};
943							}
944
945							sub getLexicon
946							{
947	718			718	1	1160	my ($this) = @_;
948	718					1620	return $this->{LEXICON};
949							}
950
951							# the name of the file is what appears after the last "/" and before the last "." if any
952							sub setName
953							{
954	3			3	1	13	my ($this) = @_;
955
956	3	50				15	if($this->getPath =~ /\/?([^\/]+)\.[^\.]+$/)
957							{
958	3					14	$this->{NAME} = $1;
959							}
960							else
961							{
962	0					0	$this->getPath =~ /\/?([^\/]+)$/;
963	0					0	$this->{NAME} = $1;
964							}
965							}
966
967							sub setOutputFiles
968							{
969	3			3	1	10	my ($this,$option_set,$message_set) = @_;
970	3					22	my $sub_dir;
971							my $option;
972	3					0	my $file;
973	3					0	my @files;
974	3					0	my $file_info;
975	3					0	my $output_path;
976	3					7	my $no_output_defined = 1;
977	3					41	my %match_to_option= (
978							'xmlout'=>'xml:candidates.xml',
979							'TT-for-BioLG'=>'xml:TTforBioLG.xml',
980							'TC-for-BioLG'=>'xml:TCforBioLG.xml',
981							'termList'=>'raw:termList.txt',
982							'termAndHeadList'=>'raw:termAndHeadList.txt',
983							'printChunking'=>'html:candidatesAndUnparsedInCorpus.html',
984							'debug'=>'raw:debug,unparsable,unparsed',
985							'TTG-style-term-candidates' => 'raw:termCandidates.ttg',
986							'XML-corpus-for-BioLG' => 'xml:corpusForBioLG.xml',
987							'bootstrap' => 'raw:parsedTerms.txt',
988							'XML-corpus-raw' => 'xml:corpusRaw.xml',
989							);
990
991	3					31	$output_path = $option_set->getOutputPath ."/". $this->getName . "/" . $option_set->getSuffix;
992
993	3	100				136	if(-d $output_path)
994							{
995	1					12	print STDERR $message_set->getMessage('OVER_WRITE_REP')->getContent($option_set->getDisplayLanguage) . $output_path . "/\n";
996	1					930	rmtree $output_path;
997							}
998							# else
999							# {
1000	3					864	mkpath $output_path;
1001	3					29	print STDERR $message_set->getMessage('CREATE_REP')->getContent($option_set->getDisplayLanguage) . $output_path . "/\n";
1002							# }
1003	3					24	$this->{OUTPUT} = Lingua::YaTeA::FileSet->new($this->getName);
1004
1005	3					33	while (($option,$file_info) = each (%match_to_option))
1006							{
1007	33	100				101	if($option_set->optionExists($option))
1008							{
1009	24					64	$this->setFilesForOption($file_info,$output_path);
1010	24					83	$no_output_defined = 0;
1011							}
1012							}
1013	3	50				23	if($no_output_defined == 1)
1014							{
1015	0					0	$this->setFilesForOption($match_to_option{'xmlout'},$output_path);
1016							}
1017	3					19	$option_set->addOption('default_output',$no_output_defined);
1018							}
1019
1020							sub setFilesForOption
1021							{
1022	24			24	1	48	my ($this,$file_info,$sub_dir) = @_;
1023	24					54	my @files;
1024							my $file;
1025	24					0	my $sub_sub_dir;
1026	24					91	$file_info =~ /^([^:]+):(.+)$/;
1027	24					84	@files = split (/,/,$2);
1028	24					63	$sub_sub_dir = $sub_dir . "/" . $1;
1029	24	100				426	if(! -d $sub_sub_dir)
1030							{
1031	9					354	mkdir $sub_sub_dir;
1032							}
1033	24					77	foreach $file (@files)
1034							{
1035	30					118	$this->{OUTPUT}->addFile($sub_sub_dir,$file);
1036							}
1037							}
1038
1039							sub printCorpusForLGPwithTCs
1040							{
1041	0			0	1	0	my ($this,$term_candidates_h,$output_file,$sentence_boundary,$document_boundary,$lgp_mapping_file,$chained_links,$tag_set) = @_;
1042
1043	0					0	my ($occurrences_h,$mapping_to_TCs_h) = $this->orderOccurrencesForXML($term_candidates_h);
1044	0					0	my $LGPmapping_h = $this->loadLGPmappingFile($lgp_mapping_file->getPath);
1045	0					0	$this->printXMLcorpus($occurrences_h,$output_file->getPath,$sentence_boundary,$document_boundary,$mapping_to_TCs_h,$LGPmapping_h,$chained_links,$tag_set);
1046							}
1047
1048
1049							sub printCorpusForLGPwithTTs
1050							{
1051	0			0	1	0	my ($this,$testified_terms_h,$output_file,$sentence_boundary,$document_boundary,$lgp_mapping_file,$parsing_direction,$chained_links,$tag_set) = @_;
1052
1053	0					0	my ($occurrences_h,$mapping_to_TTs_h) = $this->orderOccurrencesForXML($testified_terms_h);
1054	0					0	$this->getBestOccurrences($occurrences_h,$parsing_direction);
1055	0					0	my $LGPmapping_h = $this->loadLGPmappingFile($lgp_mapping_file->getPath);
1056	0					0	$this->printXMLcorpus($occurrences_h,$output_file->getPath,$sentence_boundary,$document_boundary,$mapping_to_TTs_h,$LGPmapping_h,$chained_links,$tag_set);
1057							}
1058
1059
1060							sub getBestOccurrences
1061							{
1062	0			0	1	0	my ($this,$occurrences_h,$parsing_direction) = @_;
1063	0					0	my $doc;
1064							my $sentence;
1065	0					0	my @occurrences;
1066	0					0	my $occurrence;
1067	0					0	my $occurrence_set;
1068	0					0	my $same_start;
1069	0					0	foreach $doc (values %$occurrences_h)
1070							{
1071	0					0	foreach $sentence (values %$doc)
1072							{
1073	0					0	@occurrences = ();
1074	0					0	foreach $same_start (values %$sentence)
1075							{
1076	0					0	foreach $occurrence (@$same_start){
1077	0					0	push @occurrences, $occurrence;
1078							}
1079
1080							}
1081	0					0	foreach $occurrence (@occurrences)
1082							{
1083	0	0				0	if($occurrence->isNotBest(\@occurrences,$parsing_direction))
1084							{
1085	0					0	$this->removeOccurrence($occurrences_h,$occurrence->getDocument->getID,$occurrence->getSentence->getID,$occurrence->getStartChar,$occurrence->getID);
1086
1087							}
1088							}
1089
1090							}
1091							}
1092
1093							}
1094
1095
1096							sub removeOccurrence
1097							{
1098	0			0	1	0	my ($this,$occurrences_h,$doc,$sentence,$start_char,$occ_id) = @_;
1099	0					0	my @tmp;
1100							my $occurrence;
1101	0					0	my $occurrences_set = $occurrences_h->{$doc}{$sentence}{$start_char};
1102	0	0				0	if(scalar @$occurrences_set == 1)
1103							{
1104	0					0	delete $occurrences_h->{$doc}{$sentence}{$start_char};
1105							}
1106							else
1107							{
1108	0					0	foreach $occurrence (@$occurrences_set)
1109							{
1110	0	0				0	if($occurrence->getID != $occ_id)
1111							{
1112	0					0	push @tmp, $occurrence;
1113							}
1114							}
1115	0					0	@{$occurrences_h->{$doc}{$sentence}{$start_char}} = @tmp;
	0					0
1116							}
1117							}
1118
1119							sub printCorpusForBioLG{
1120	0			0	0	0	my ($this,$output_file,$sentence_boundary,$document_boundary,$chained_links) = @_;
1121
1122	0					0	my %occurrences;
1123							my %BioLGmapping;
1124	0					0	my %mapping_to_TTs;
1125
1126	0					0	$this->printXMLcorpus(\%occurrences,$output_file->getPath,$sentence_boundary,$document_boundary,\%mapping_to_TTs,\%BioLGmapping,$chained_links);
1127							}
1128
1129
1130
1131							sub printXMLcorpus
1132							{
1133	0			0	1	0	my ($this,$occurrences_h,$output_file_path,$sentence_boundary,$document_boundary,$mapping_to_TCs_h,$LGPmapping_h,$chained_links,$tag_set) = @_;
1134	0					0	my $word;
1135							my $sentence;
1136	0					0	my $sentence_id;
1137	0					0	my $document_id;
1138	0					0	my $occurrence;
1139	0					0	my $tc;
1140	0					0	my $head_word;
1141	0					0	my $head_index;
1142	0					0	my $local_occurrences_a;
1143	0					0	my $links_a;
1144	0					0	my $counter = 0;
1145	0					0	my $pos;
1146							my $if;
1147	0					0	my $i;
1148	0					0	my $tc_length;
1149	0					0	my $occurrence_set;
1150
1151	0					0	my $fh = FileHandle->new(">" . $output_file_path);
1152	0					0	binmode($fh, ":utf8");
1153	0					0	$this->printXMLheader($fh);
1154
1155	0					0	for ($i=0; $i < scalar @{$this->getWords}; $i++)
	0					0
1156							{
1157	0					0	$word = $this->getWords->[$i];
1158	0	0				0	if(! $word->isDocumentBoundary($document_boundary))
1159							{
1160
1161	0	0				0	if(exists $occurrences_h->{$word->getDocumentID}{$word->getSentenceID}{$word->getStartChar})
1162							{
1163	0					0	$occurrence_set = $occurrences_h->{$word->getDocumentID}{$word->getSentenceID}{$word->getStartChar};
1164	0					0	$occurrence = $occurrence_set->[0];
1165
1166	0					0	$tc = $mapping_to_TCs_h->{$occurrence->getID};
1167	0					0	$tc_length = $tc->getLength;
1168
1169							# previous and next words are not coordinations: to be removed when coordination will be handled ?
1170	0	0	0			0	if(
			0
			0
1171							(
1172							($i == 0)
1173							\|\|
1174							(!($tag_set->existTag('COORDINATIONS',$this->getWords->[$i-1]->getPOS)))
1175							)
1176							&&
1177							(
1178							($i == (scalar @{$this->getWords} -1))
1179							\|\|
1180							(!($tag_set->existTag('COORDINATIONS',$this->getWords->[$i + $tc_length]->getPOS)))
1181							)
1182							)
1183							{
1184
1185	0					0	($head_word,$head_index,$links_a) = $tc->getHeadAndLinks($LGPmapping_h,$chained_links);
1186	0					0	$counter++;
1187	0					0	$pos = $head_word->getPOS;
1188	0					0	$if = $head_word->getIF;
1189	0					0	Lingua::YaTeA::XMLEntities::encode($pos);
1190	0					0	Lingua::YaTeA::XMLEntities::encode($if);
1191	0					0	$sentence .= "
1192	0	0				0	if(scalar $links_a > 0)
1193							{
1194	0					0	$sentence .= " internal=\"" . join ("",@$links_a) . "\"";
1195							}
1196	0					0	$sentence .= " head=\"" . $head_index . "\">";
1197
1198							}
1199							else
1200							{
1201	0					0	undef $occurrence;
1202							}
1203							}
1204	0					0	$pos = $word->getPOS;
1205	0					0	$if = $word->getIF;
1206	0					0	Lingua::YaTeA::XMLEntities::encode($pos);
1207	0					0	Lingua::YaTeA::XMLEntities::encode($if);
1208	0					0	$sentence .= "" . $if . " ";
1209
1210	0	0	0			0	if (
1211							(defined $occurrence)
1212							&&
1213							($occurrence->getEndChar == $word->getStartChar + $word->getLexItem->getLength)
1214							)
1215							{
1216	0					0	undef $occurrence;
1217	0					0	$sentence .= "";
1218							}
1219	0	0				0	if($word->isSentenceBoundary($sentence_boundary))
1220							{
1221	0					0	$sentence =~ s/\s*$//;
1222	0					0	$sentence .= "";
1223	0					0	$sentence_id = $word->getSentence->getID;
1224	0					0	$document_id = $word->getDocument->getID;
1225	0					0	$local_occurrences_a = $occurrences_h->{$document_id}{$sentence_id};
1226
1227	0					0	print $fh $sentence . "\n";
1228	0					0	$sentence = "";
1229							}
1230							}
1231
1232							}
1233							# corpus ended without a final dot
1234	0	0				0	if($sentence ne "")
1235							{
1236	0					0	$sentence .= "";
1237	0					0	print $fh $sentence;
1238							}
1239	0					0	print STDERR $counter . " terms marked\n";
1240	0					0	$this->printXMLtrailer($fh);
1241							}
1242
1243							sub printXMLheader
1244							{
1245	0			0	1	0	my ($this,$fh) = @_;
1246	0					0	print $fh "\n";
1247							}
1248
1249							sub printXMLtrailer
1250							{
1251	0			0	1	0	my ($this,$fh) = @_;
1252	0					0	print $fh "\n\n";
1253							}
1254
1255							sub loadLGPmappingFile
1256							{
1257	0			0	1	0	my ($this,$file_path) = @_;
1258
1259	0					0	my $fh = FileHandle->new("<$file_path");
1260	0					0	my %mapping;
1261							my $line;
1262	0					0	while ($line= $fh->getline)
1263							{
1264	0	0	0			0	if(($line !~ /^\s$/)&&($line !~ /^\s#/)) # line is not empty nor commented
1265							{
1266	0					0	$line =~ /^([^\t]+)\t([^\t]+)\s*\n$/;
1267	0					0	$mapping{$1} = $2;
1268							}
1269							}
1270	0					0	return \%mapping;
1271							}
1272
1273
1274							sub printCandidatesAndUnparsedInCorpus
1275							{
1276	0			0	1	0	my ($this,$term_candidates_h,$unparsable_a,$file,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1277	0					0	my %ids_for_parsed;
1278
1279							# my $fh = FileHandle->new(">".$file->getPath);
1280
1281							my $fh;
1282	0	0				0	if ($file eq "stdout") {
1283	0					0	$fh = \*STDOUT;
1284							} else {
1285	0	0				0	if ($file eq "stderr") {
1286	0					0	$fh = \*STDERR;
1287							} else {
1288	0					0	$fh = FileHandle->new(">".$file->getPath);
1289							}
1290							}
1291	0					0	binmode($fh, ":utf8");
1292
1293
1294	0					0	$this->printHTMLheader($fh);
1295	0					0	my $occurrences_h = $this->orderOccurrences($term_candidates_h,$unparsable_a,\%ids_for_parsed);
1296
1297	0					0	$this->printHTMLCorpus($occurrences_h,\%ids_for_parsed,$fh,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color);
1298	0					0	$this->printHTMLtrailer($fh);
1299							}
1300
1301
1302							sub printHTMLheader
1303							{
1304	0			0	1	0	my ($this,$fh) = @_;
1305	0					0	print $fh
1306							"\n\nTerm Candidates and unparsed phrases in Corpus\n";
1307							}
1308
1309							sub printHTMLtrailer
1310							{
1311	0			0	1	0	my ($this,$fh) = @_;
1312	0					0	print $fh
1313							"";
1314							}
1315
1316							sub printXMLRawCorpus
1317							{
1318	0			0	0	0	my ($this,$file,$sentence_boundary,$document_boundary) = @_;
1319	0					0	my $sentence_id;
1320							my $document_id;
1321	0					0	my $document_name;
1322	0					0	my $word;
1323	0					0	my $first_sentence = 1;
1324	0					0	my $in_doc = 0;
1325	0					0	my $string;
1326							my $last_word;
1327	0					0	my $fh = FileHandle->new(">".$file->getPath);
1328	0					0	binmode($fh,":utf8");
1329	0					0	$this->printXMLheader($fh);
1330	0					0	print $fh " \n";
1331	0					0	foreach $word (@{$this->getWords})
	0					0
1332							{
1333	0	0				0	if($word->isDocumentBoundary($document_boundary)) # new document is started
1334							{
1335	0	0				0	if($in_doc == 1)
1336							{
1337	0	0				0	if(!$last_word->isSentenceBoundary($sentence_boundary)) # last word of document is not a sentence boundary
1338							{
1339	0					0	$string =~ s/ $//;
1340	0					0	Lingua::YaTeA::XMLEntities::encode($string);
1341	0					0	print $fh " getSentence->getID . "\" inDocID=\"" . $word->getSentence->getInDocID . "\">" . $string . "\n";
1342	0					0	$string = "";
1343							}
1344	0					0	print $fh " \n";
1345							}
1346	0					0	print $fh " getDocument->getID . "\"";
1347	0	0				0	if($word->getDocument->getName ne 'no_name')
1348							{
1349	0					0	print $fh " name=\"". $word->getDocument->getName . "\"";
1350							}
1351	0					0	print $fh ">\n";
1352	0					0	$in_doc = 1 ;
1353							}
1354							else
1355							{
1356							# rebuild the sentence from occurrences of words from the corpus
1357	0					0	$string .= $word->getIF . " ";
1358	0					0	$last_word = $word;
1359	0	0				0	if($in_doc == 0) # if no explicit marker of document boundary in the input document
1360							{
1361	0					0	print $fh " getDocument->getID . "\"";
1362	0	0				0	if($word->getDocument->getName ne 'no_name')
1363							{
1364	0					0	print $fh " name=\"". $word->getDocument->getName . "\"";
1365							}
1366	0					0	print $fh ">\n";
1367	0					0	$in_doc = 1;
1368							}
1369	0	0	0			0	if (
1370							($word->isSentenceBoundary($sentence_boundary)) # new sentence is started
1371							\|\|
1372	0					0	($word == $this->getWords->[$#{$this->getWords}]) # last word of the corpus (no final dot)
1373							)
1374							{
1375	0					0	$string =~ s/ $//;
1376	0					0	Lingua::YaTeA::XMLEntities::encode($string);
1377	0					0	print $fh " getSentence->getID . "\" inDocID=\"" . $word->getSentence->getInDocID . "\">" . $string . "\n";
1378	0					0	$string = "";
1379	0	0				0	if($word == $this->getWords->[$#{$this->getWords}])
	0					0
1380							{
1381	0					0	print $fh " \n";
1382							}
1383							}
1384							}
1385
1386							}
1387	0					0	print $fh " \n";
1388							}
1389
1390
1391							sub printHTMLCorpus
1392							{
1393	0			0	1	0	my ($this,$parsed_occurrences_h,$ids_for_parsed_h,$fh,$sentence_boundary,$document_boundary,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1394	0					0	my $sentence_id;
1395							my $document_id;
1396	0					0	my $document_name;
1397	0					0	my $word;
1398	0					0	my $occurrence;
1399	0					0	my $local_occurrences_a;
1400	0					0	my $string;
1401	0					0	my $offset = 0;
1402	0					0	my $string_copy;
1403							my $color;
1404
1405
1406	0					0	foreach $word (@{$this->getWords})
	0					0
1407							{
1408	0	0				0	if($word->isDocumentBoundary($document_boundary)) # new sentence is started
1409							{
1410	0					0	print $fh " Document " . $word->getDocument->getID;
1411	0	0				0	if($word->getDocument->getName ne 'no_name')
1412							{
1413	0					0	print $fh " - ". $word->getDocument->getName;
1414							}
1415	0					0	print $fh " ";
1416							}
1417							else
1418							{
1419							# rebuild the sentence from occurrences of words from the corpus
1420	0					0	$string .= $word->getIF . " ";
1421	0	0	0			0	if (
1422							($word->isSentenceBoundary($sentence_boundary)) # new sentence is started
1423							\|\|
1424	0					0	($word == $this->getWords->[$#{$this->getWords}]) # last word of the corpus (no final dot)
1425							)
1426							{
1427	0					0	$string =~ s/ $//;
1428							# get the term candidates occurrences for the next sentence
1429	0					0	$sentence_id = $word->getSentence->getID;
1430	0					0	$document_id = $word->getDocument->getID;
1431	0					0	$document_name = $word->getDocument->getName;
1432
1433	0					0	$local_occurrences_a = $parsed_occurrences_h->{$document_id}{$sentence_id};
1434							# mark term candidates on the rebuilt sentence
1435	0					0	foreach $occurrence (@$local_occurrences_a)
1436							{
1437	0					0	$color = $this->setColor($occurrence->getID,$ids_for_parsed_h,$color_blind_option, $parsed_color, $unparsed_color);
1438	0	0				0	if(!defined $offset)
1439							{
1440	0					0	die;
1441							}
1442	0					0	$string_copy .= substr($string,$offset,$occurrence->getStartChar - $offset). "";
1443	0					0	$string_copy .= substr($string,$occurrence->getStartChar,$occurrence->getEndChar - $occurrence->getStartChar) . "";
1444	0					0	$offset = $occurrence->getEndChar;
1445
1446	0	0				0	if(! substr($string,$offset-1))
1447							{
1448	0					0	print STDERR "problem d'offset pour la phrase DOC:" . $document_id . " - SENT: " . $sentence_id . "\n";
1449	0					0	print STDERR $string . "\n";
1450							}
1451							}
1452
1453	0					0	$string_copy .= substr($string,$offset);
1454	0					0	print $fh $word->getSentence->getInDocID . ":" . $string_copy . " \n";
1455	0					0	$string = "";
1456	0					0	$string_copy = "";
1457	0					0	$offset = 0;
1458							}
1459							}
1460							}
1461							}
1462
1463
1464							sub setColor
1465							{
1466	0			0	1	0	my ($this,$occurrence_id,$ids_for_parsed_h,$color_blind_option, $parsed_color, $unparsed_color) = @_;
1467	0					0	my $color;
1468
1469	0	0				0	if(exists $ids_for_parsed_h->{$occurrence_id})
1470							{
1471	0	0				0	if($color_blind_option->getValue eq 'yes')
1472							{
1473	0	0				0	if (defined $parsed_color) {
1474	0					0	$color = $parsed_color->getValue;
1475							} else {
1476	0					0	$color = "FF0099";
1477							}
1478							}
1479							else
1480							{
1481	0	0				0	if (defined $parsed_color) {
1482	0					0	$color = $parsed_color->getValue;
1483							} else {
1484	0					0	$color = "CC0066";
1485							}
1486							}
1487							}
1488							else
1489							{
1490	0	0				0	if($color_blind_option->getValue eq 'yes')
1491							{
1492	0	0				0	if (defined $unparsed_color) {
1493	0					0	$color = $unparsed_color->getValue;
1494							} else {
1495	0					0	$color = "0000CC";
1496							}
1497							}
1498							else
1499							{
1500	0	0				0	if (defined $unparsed_color) {
1501	0					0	$color = $unparsed_color->getValue;
1502							} else {
1503	0					0	$color = "3366CC";
1504							}
1505							}
1506							}
1507	0					0	return $color;
1508							}
1509
1510							sub orderOccurrencesForXML
1511							{
1512	0			0	1	0	my ($this,$term_h) = @_;
1513	0					0	my %occurrences;
1514							my %mapping_to_TCs;
1515	0					0	my $document;
1516	0					0	my $sentence;
1517	0					0	my $occurrence;
1518	0					0	my $term;
1519	0					0	my $unparsable;
1520	0					0	my $sent_hash;
1521	0					0	my $occurrences_a;
1522
1523	0					0	foreach $term (values (%$term_h))
1524							{
1525
1526	0					0	foreach $occurrence (@{$term->getOccurrences})
	0					0
1527							{
1528							# only the occurrences covering an entire phrase are selected
1529	0	0	0			0	if(
			0
1530							((blessed($term)) && ($term->isa('Lingua::YaTeA::TestifiedTerm')))
1531							\|\|
1532							($occurrence->isMaximal)
1533							)
1534							{
1535	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}{$occurrence->getStartChar}}, $occurrence;
	0					0
1536	0					0	$mapping_to_TCs{$occurrence->getID} = $term;
1537							}
1538
1539							}
1540							}
1541	0					0	return (\%occurrences,\%mapping_to_TCs);
1542							}
1543
1544
1545
1546							sub orderOccurrences
1547							{
1548	0			0	1	0	my ($this,$term_candidates_h,$unparsable_a,$ids_for_parsed_h) = @_;
1549	0					0	my %occurrences;
1550							my $document;
1551	0					0	my $sentence;
1552	0					0	my $occurrence;
1553	0					0	my $term_candidate;
1554	0					0	my $unparsable;
1555	0					0	my $sent_hash;
1556	0					0	my $occurrences_a;
1557
1558
1559	0					0	foreach $term_candidate (values (%$term_candidates_h))
1560							{
1561	0					0	foreach $occurrence (@{$term_candidate->getOccurrences})
	0					0
1562							{
1563	0	0				0	if($occurrence->isMaximal)
1564							{
1565	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}}, $occurrence;
	0					0
1566	0					0	$ids_for_parsed_h->{$occurrence->getID}++;
1567							}
1568							}
1569							}
1570	0					0	foreach $unparsable (@$unparsable_a)
1571							{
1572	0					0	foreach $occurrence (@{$unparsable->getOccurrences})
	0					0
1573							{
1574	0	0				0	if($occurrence->isMaximal)
1575							{
1576	0					0	push @{$occurrences{$occurrence->getDocument->getID}{$occurrence->getSentence->getID}}, $occurrence;
	0					0
1577							}
1578							}
1579							}
1580	0					0	while (($document,$sent_hash) = each (%occurrences))
1581							{
1582	0					0	while (($sentence,$occurrences_a) = each (%$sent_hash))
1583							{
1584	0					0	@$occurrences_a = sort ({$a->getStartChar <=> $b->getStartChar} @$occurrences_a);
	0					0
1585							}
1586
1587							}
1588	0					0	return \%occurrences;
1589							}
1590
1591							sub getWords
1592							{
1593	0			0	1	0	my ($this) = @_;
1594	0					0	return $this->{WORDS};
1595							}
1596
1597							sub selectOnTermListStyle {
1598	0			0	0	0	my ($this, $term_candidates_h,$term_list_style,$debug_fh) = @_;
1599
1600	0					0	my $tc;
1601							# warn "selectOnTermListStyle ($term_list_style)\n";
1602	0					0	foreach $tc (values (%$term_candidates_h))
1603							{
1604							# warn $tc->getIF . "\n";
1605							# warn ($tc->isa('Lingua::YaTeA::MultiWordTermCandidate') * 1) . "\n";
1606	0	0	0			0	if (($term_list_style ne "") && ($term_list_style ne "all") &&
			0
			0
1607							(($term_list_style ne "multi") \|\| ((blessed($tc)) && ($tc->isa('Lingua::YaTeA::MultiWordTermCandidate') != 1)))) {
1608	0					0	$tc->setTermStatus(0);
1609							}
1610							# warn "" . (1 * $tc->getTermStatus) . "\n";
1611							# warn " " . ($tc->isTerm * 1) . "\n";
1612							}
1613							}
1614
1615							sub makeDDW
1616							{
1617	2			2	0	8	my ($this,$term_candidates_h,$fh) = @_;
1618	2					6	my $tc_weight;
1619							my $mean_occ;
1620	2					6	my $total_occ = 0;
1621	2					12	my $total_doc = $this->getDocumentSet->getDocumentNumber;
1622	2					12	my %doc_by_tc;
1623							my %docs_for_this_tc;
1624	2					0	my $tc;
1625	2					0	my $occ;
1626
1627
1628	2					15	foreach $tc (values (%$term_candidates_h))
1629							{
1630	240					397	%docs_for_this_tc = ();
1631	240					299	foreach $occ (@{$tc->getOccurrences})
	240					451
1632							{
1633	330					628	$docs_for_this_tc{$occ->getDocument->getID}++;
1634	330					548	$total_occ++;
1635							}
1636	240					462	$doc_by_tc{$tc->getKey} = scalar keys(%docs_for_this_tc);
1637							}
1638	2	50				30	if (scalar(keys(%$term_candidates_h)) > 0) {
1639	2					14	$mean_occ = $total_occ / scalar keys %$term_candidates_h;
1640							} else {
1641	0					0	$mean_occ = 0;
1642							}
1643	2					19	foreach $tc (values (%$term_candidates_h))
1644							{
1645							##### measure 'descriptor discriminating weight' described in 'Building back-of-the-book indexes', Nazarenko, Ait El Mekki (2005)
1646							##### PROBLEM: each time there is only one document in the corpus, the value is 0 (log10(1/1) = 0)
1647	240					497	$tc_weight = ($tc->getFrequency/$mean_occ) * log10 ($total_doc/$doc_by_tc{$tc->getKey});
1648	240					494	$tc->setWeight($tc_weight);
1649							}
1650							}
1651
1652							sub processTotalDocOccurrences
1653							{
1654	0			0	0		my ($this,$occurrences_h) = @_;
1655	0						my $total;
1656							my $occ;
1657							# print STDERR $occurrences_h. "\n";
1658	0						foreach $occ (values (%$occurrences_h))
1659							{
1660	0						$total += $occ;
1661							}
1662	0						return $total;
1663							}
1664
1665
1666							1;
1667
1668							__END__