File Coverage

blib/lib/Lingua/AlignmentSlice.pm

Criterion	Covered	Total	%
statement	9	565	1.5
branch	0	160	0.0
condition	0	117	0.0
subroutine	3	18	16.6
pod	0	15	0.0
total	12	875	1.3

line	stmt	bran	cond	sub	pod	time	code
1
2							########################################################################
3							# Author: Patrik Lambert (lambert@talp.ucp.es)
4							# Description: Provides method to cut and process a part of an alignment
5							# in a sentence pair.
6							#
7							#-----------------------------------------------------------------------
8							#
9							# Copyright 2004 by Patrik Lambert
10							#
11							# This program is free software; you can redistribute it and/or modify
12							# it under the terms of the GNU General Public License as published by
13							# the Free Software Foundation; either version 2 of the License, or
14							# (at your option) any later version.
15							#
16							# This program is distributed in the hope that it will be useful,
17							# but WITHOUT ANY WARRANTY; without even the implied warranty of
18							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19							# GNU General Public License for more details.
20							#
21							# You should have received a copy of the GNU General Public License
22							# along with this program; if not, write to the Free Software
23							# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24							########################################################################
25
26							package Lingua::AlignmentSlice;
27							@ISA = ("Lingua::Alignment"); # Inherits from Alignment
28
29	1			1		6	use strict;
	1					2
	1					36
30	1			1		6	use Lingua::AlSetLib;
	1					3
	1					20
31	1			1		4	use Dumpvalue;
	1					2
	1					6295
32
33							#a slice or fraction of an Alignment object:
34							#inherits from an Alignment object
35							#zero is the point where to insert the slice in the father alignment
36							#indices are the token indices that really contain information in the slice. Only paste those ones.
37							#sub new2 {
38							# my ($pkg,$al) = @_;
39							# my $this = $pkg->Lingua::Alignment::new();
40
41							# $this->{father}=$al;
42							# $this->{zeroSource}=0;
43							# $this->{zeroTarget}=0;
44							# $this->{sourceIndices}={};
45							# $this->{targetIndices}={};
46							# return $this;
47							#}
48
49							sub new {
50	0			0	0		my ($pkg,$al) = @_;
51	0						my $this = Lingua::Alignment->new();
52
53	0						$this->{father}=$al;
54	0						$this->{zeroSource}=0;
55	0						$this->{zeroTarget}=0;
56	0						$this->{sourceIndices}={};
57	0						$this->{targetIndices}={};
58	0						return bless $this,$pkg;
59							}
60
61							sub setZero {
62	0			0	0		my ($this,$zeroSource,$zeroTarget)=@_;
63	0						$this->{zeroSource}=$zeroSource;
64	0						$this->{zeroTarget}=$zeroTarget;
65							}
66
67							sub paste {
68	0			0	0		my ($this,$al)=@_;
69	0	0					if (!defined($al)){$al=$this->{father}}
	0
70	0						my %side=("source"=>"target","target"=>"source");
71	0						my ($j,$i,$idx);
72	0						my ($al_j,$al_i);
73	0						my ($length,$side,$Side,$reverseSide,$ReverseSide);
74	0						my @nullNotIn;
75	0						my @nullIn;
76
77							# print "slice to paste:\n";
78							# main::dumpValue($this);
79	0						while (($side,$reverseSide)= each(%side)){
80							# print "PASTE:$side\n";
81							# print "ANTES DE PROCESAR NULL:\n";
82							# main::dumpValue($al->{$side."Al"});
83							# print "PROCESS NULL...\n";
84	0						$Side=ucfirst($side);
85	0						$ReverseSide=ucfirst($reverseSide);
86							# 1 process NULL links
87	0	0					if ($this->{$side."Indices"}{0}){
88	0						$length=@{$al->{$side."Al"}[0]};
	0
89							#list links to null of the indices not included in perturbation
90	0						@nullNotIn=();
91	0						@nullIn=();
92	0						foreach $i (@{$al->{$side."Al"}[0]}){
	0
93							# print "zero $ReverseSide:".$this->{"zero".$ReverseSide}." i:$i\n";
94	0	0					if (!$this->{$reverseSide."Indices"}{$i-$this->{"zero".$ReverseSide}}){
95	0						push @nullNotIn,$i;
96							}
97							}
98							#replace the other ones (those included in the perturbation) by their alignemnent in the perturbation
99	0						@{$al->{$side."Al"}[0]}=@nullNotIn;
	0
100	0						foreach $i (@{$this->{$side."Al"}[0]}){
	0
101	0						push @{$al->{$side."Al"}[0]},$i+$this->{"zero".$ReverseSide};
	0
102							}
103							# print "null not in:",join(" ",@nullNotIn),"\n";
104							}
105							# print "DESPU�S DE PROCESAR NULL:\n";
106							# main::dumpValue($al->{$side."Al"});
107							# print "PROCESS NO NULL...\n";
108
109							# 2 process NO-null links
110	0						foreach $j (keys %{$this->{$side."Indices"}}){
	0
111	0	0					if ($j>0){
112	0						$al_j = $j+$this->{"zero".$Side};
113							# if $j is in the perturbation all indices linked to it are also,
114							# so we can replace the entire array by that of the perturation:
115	0						$al->{$side."Al"}[$al_j]=[];
116	0						foreach $i (@{$this->{$side."Al"}[$j]}){
	0
117	0	0					if ($i>0){
118	0						$al_i = $i+$this->{"zero".$ReverseSide};
119							}else{
120	0						$al_i = 0;
121							}
122							# print "j i:$j $i->push $side.Al[$al_j],$al_i\n";
123	0						push @{$al->{$side."Al"}[$al_j]},$al_i;
	0
124							}
125							}
126							} #foreach $j...
127							# print "DESPU�S DE PROCESAR NO NULL:\n";
128							# main::dumpValue($al->{$side."Al"});
129							} #while
130							}
131
132							sub clone {
133	0			0	0		my $this = shift;
134	0						my $clone = $this->Lingua::Alignment::clone();
135
136	0						$clone->{father}=$this->{father};
137	0						$clone->{zeroSource}=$this->{zeroSource};
138	0						$clone->{zeroTarget}=$this->{zeroTarget};
139	0						$clone->{sourceIndices}=$this->{sourceIndices};
140	0						$clone->{targetIndices}=$this->{targetIndices};
141
142	0						return bless $clone,"Lingua::AlignmentSlice";
143							}
144							##############################################################################
145							### SYMMETRIZATION SUBS (make sense only for GIZA type alignments) ###
146							##############################################################################
147
148							sub sparse{
149	0			0	0		my ($alSlice,$side) = @_;
150
151	0						return ( (@{$alSlice->{$side."Words"}}-scalar(keys %{$alSlice->{$side."Indices"}}))>3 );
	0
	0
152							}
153							# SYMMETRIZATION RULE FUNCTION:
154							# input: AlignmentSlice object (modAl)
155							# action: modAl is corrected by application of the rule, to symAl are added new symmetric links and al remains unchanged
156							# returns the number of applications of the rule in alignment
157							sub applyOneToMany_1 {
158	0			0	0		my ($alSlice) = @_;
159	0						my %side=("source","target","target","source");
160	0						my ($source,$target);
161	0						my ($j,$i,$k);
162	0						my $ruleApplicationNb=0;
163	0						my %candidate;
164	0						my $failed = 0;
165
166	0						for (($source,$target)= each (%side)){
167	0						for ($j=0; $j<@{$alSlice->{$source."Al"}};$j++){
	0
168							#select the $j's linked to various $i's
169	0	0					if (defined($j)){
170	0	0					if (@{$alSlice->{$source."Al"}[$j]}>1){
	0
171	0						($failed,$k) = (0,0);
172	0						%candidate=();
173							# for each $i linked to $j we look if the reverse link exists in targetAl and if $i is not linked to another source word
174	0		0				while (!$failed && $k<@{$alSlice->{$source."Al"}[$j]}){
	0
175	0						$i = $alSlice->{$source."Al"}->[$j][$k];
176	0	0	0				if (@{$alSlice->{$target."Al"}[$i]}==1 && $alSlice->{$target."Al"}->[$i][0]==$j){
	0	0
	0
177	0						$candidate{$i} = 1;
178							}elsif (@{$alSlice->{$target."Al"}[$i]}>=1){ #$i is linked to some $j', $j' different from $j
179	0						$failed = 1;
180							}
181	0						$k++;
182							}
183							# if the conditions are present, we apply the rule
184	0	0	0				if (!$failed && %candidate>0){
185	0						foreach $i (@{$alSlice->{$source."Al"}[$j]}){
	0
186							#indicate modified word:
187							# ${$modAl->{$target."Words"}}[$i]='#'.${$modAl->{$target."Words"}}[$i].'#';
188							# ${$symAl->{$target."Words"}}[$i]='#'.${$symAl->{$target."Words"}}[$i].'#';
189	0	0					if (! $candidate{$i}){
190							# push @{$modAl->{$target."Al"}[$i]},$j;
191							# push @{$symAl->{$target."Al"}[$i]},$j;
192							# push @{$symAl->{$source."Al"}[$j]},$i;
193							}else{
194	0						$ruleApplicationNb++;
195							#indicate modified word:
196							# ${$modAl->{$source."Words"}}[$j]='#'.${$modAl->{$source."Words"}}[$j].'#';
197							# ${$symAl->{$source."Words"}}[$j]='#'.${$symAl->{$source."Words"}}[$j].'#';
198							}
199							}
200							}
201							} #if
202							}
203							} #for
204							} #for source,target
205	0						return $ruleApplicationNb;
206							}
207
208							#look for source\|target tokens aligned with every target\|source index in the slice
209							sub applyOneToMany_2 {
210	0			0	0		my ($alSlice) = @_;
211	0						my %side=("source"=>"target","target"=>"source");
212	0						my ($source,$target);
213	0						my ($j,$i,$k);
214	0						my $ruleApplicationNb=0;
215	0						my @candidate;
216							my $failed;
217	0						my @toModify = ();
218
219	0						while (($source,$target)= each %side){
220							# print "\n$source $target :\n";
221	0	0					if (@toModify==0){
222							#1 select the $j's linked to all $i's
223							# we want only one $j in this situation so sourceAl==2
224							# we eliminate situations where $j is linked to two $i's situated very far apart (ie: indices<
225	0	0	0				if (@{$alSlice->{$source."Al"}}==2 && @{$alSlice->{$source."Al"}[0]}==0 && (@{$alSlice->{$target."Words"}}-scalar(keys %{$alSlice->{$target."Indices"}}))<4){
	0		0
	0
	0
	0
226	0						$j=1;
227							# print "al:",join(" ",@{$alSlice->{$source."Al"}[$j]})." indices:",join(" ",keys %{$alSlice->{$target."Indices"}}),"\n";
228	0	0					if ( @{$alSlice->{$source."Al"}[$j]}==scalar(keys %{$alSlice->{$target."Indices"}}) ){
	0
	0
229							#2 check there is one $i with reverse link to $j (from Giza constraint there can be only one) and no $i linked to other word
230	0						@candidate=();
231	0						$failed=0;
232	0		0				while (!$failed && $k<@{$alSlice->{$source."Al"}[$j]}){
	0
233	0						$i = $alSlice->{$source."Al"}->[$j][$k];
234							# $i can only be aligned to $j or not linked
235	0	0	0				if (@{$alSlice->{$target."Al"}[$i]}==1 && $alSlice->{$target."Al"}->[$i][0]==$j){
	0
236							# print " 1st class\n";
237	0						$candidate[0]=$i;
238							}else{ #$i is linked to nothing
239							# print " modify $i\n";
240	0						push @toModify,$i;
241							}
242	0						$k++;
243							}
244							} # if
245							} #for $j...
246	0	0					if (@candidate>0){
247	0						foreach $i (@toModify){
248	0						push @{$alSlice->{$target."Al"}[$i]},$j;
	0
249	0						$ruleApplicationNb=1;
250							}
251							}
252							} #if @toModify==0
253							} #while source,target
254	0						return $ruleApplicationNb;
255							}
256
257							sub selectSubgroups {
258	0			0	0		my ($alSlice,$groups,$subGroups,$globals) = @_;
259	0						my $sourceSize = @{$alSlice->{sourceWords}}-1;
	0
260	0						my $targetSize = @{$alSlice->{targetWords}}-1;
	0
261	0						my ($side,$k,$minK,$j,$i,$group,$subGroup,$count,$numWords);
262	0						my ($sourceCandidate,$targetCandidate);
263	0						my %candidates = ("source",[],"target",[]);
264	0						my %subCandidates = ("source",[],"target",[]);
265	0						my @allWords;
266							my @words;
267	0						my $dumper=new Dumpvalue;
268	0	0	0				if ($sourceSize >0 && $targetSize >0 && ($sourceSize>1 \|\| $targetSize>1) && !$alSlice->sparse("source") && !$alSlice->sparse("target")){
			0
			0
			0
			0
269							#select frases contains in the group
270	0						foreach $side (("source","target")){
271							# push group candidates
272	0						@allWords = @{$alSlice->{$side."Words"}};
	0
273	0						shift @allWords;
274							#prepare group to be splitted between punctuation, for subGroups hash
275	0						$group=join(" ",@allWords)." ";
276	0						$group =~ s/([��!\?\.,;:] )+/$1/g;
277	0						@allWords = split /[��!\?\.,;:] /g,$group;
278							# remove puntuation to put it in group hash:
279	0						$group =~ s/[��!\?\.,;:] //g;
280	0						$group =~ s/\s+$//g;
281
282	0						push @{$candidates{$side}},$group;
	0
283
284	0	0					if ($globals->{onlyGroups}==0){
285	0						foreach $group (@allWords){
286	0						$group =~ s/\s+$//;
287							# print "group:$group:\n";
288	0						@words = split / /,$group;
289	0						$numWords = scalar(@words);
290							#push subgroup candidates
291	0	0					if ($numWords<3){$minK=1}
	0	0
	0
292	0						elsif($numWords<5){$minK=2}
293							else{$minK=2};
294							# print "minK:$minK numWords:$numWords\n";
295	0						for ($k=$minK;$k<=$numWords;$k++){
296	0						for ($j=0;$j<$numWords-$k+1;$j++){
297	0						@words = split / /,$group;
298	0						$subGroup = join(" ",splice @words,$j,$k);
299	0						push @{$subCandidates{$side}},$subGroup;
	0
300							# print " subGroup:$subGroup\n";
301							}
302							}
303							} #foreach $candidate (@allWords)
304	0						foreach $sourceCandidate (@{$subCandidates{source}}){
	0
305	0						foreach $targetCandidate (@{$subCandidates{target}}){
	0
306							# print "$sourceCandidate -- $targetCandidate\n";
307	0						$subGroups->{"$sourceCandidate \| $targetCandidate"}=1;
308							}
309							}
310							}
311	0						foreach $sourceCandidate (@{$candidates{source}}){
	0
312	0						foreach $targetCandidate (@{$candidates{target}}){
	0
313							# number of words may change after filtering punctuation:
314	0						$sourceSize=scalar(split / /,$sourceCandidate);
315	0						$targetSize=scalar(split / /,$targetCandidate);
316	0	0	0				if ($sourceSize >0 && $targetSize >0 && ($sourceSize>1 \|\| $targetSize>1)){
			0
			0
317	0						$groups->{"$sourceCandidate \| $targetCandidate"}=1;
318							}
319							}
320							}
321							}
322							}
323	0						return ($groups,$subGroups);
324							}
325
326							sub applyGrouping {
327	0			0	0		my ($alSlice,$groupKeys,$subGroupKeys,$globals) = @_;
328	0						my $myGroupKeys=$groupKeys;
329	0						my $sourceSize = @{$alSlice->{sourceWords}};
	0
330	0						my $targetSize = @{$alSlice->{targetWords}};
	0
331	0						my ($side,$reverseSide,$k,$minK,$j,$i,$idx,$interPunctuationIdx,$pushed,$candidate,$refToCand,$word);
332	0						my %side=("source"=>"target","target"=>"source");
333	0						my ($refToSourceCand,$sourceCandidate,$refToTargetCand,$targetCandidate,$bestMatch);
334	0						my %candidates;
335	0						my %cands;
336	0						my %targetCands;
337	0						my @allWords;
338	0						my @words;
339	0						my ($regExp,$num,$match,$numMatches);
340	0						my @matches;
341	0						my ($nscan,$modifications,$modified)=(0,1,0);
342	0						my %toProcess;
343	0						my ($numSourceIndicesToProcess,$numTargetIndicesToProcess);
344	0						my ($first_j,$first_i);
345	0						my @sourceCandidateTokens;
346	0						my @targetCandidateTokens;
347	0						my %crossLinksPatterns;
348	0						my $clone=$alSlice->clone();
349	0						my $lastChance=0;
350	0						my @grepMatch;
351							my %grepMatches;
352	0						my $dumper = new Dumpvalue;
353	0						my $verbose = $globals->{verbose};
354
355	0	0	0				if ($sourceSize >1 && $targetSize >1 && !$alSlice->sparse("source") && !$alSlice->sparse("target")){
			0
			0
356	0						my $defaultActionGrouping=$globals->{defaultActionGrouping};
357	0						$alSlice->$defaultActionGrouping();
358							#once you have the intersection, it's easy to list the reciprocal links
359	0						$clone->intersect();
360	0						for ($j=0;$j<@{$clone->{sourceAl}};$j++){
	0
361	0	0					if (defined($clone->{sourceAl}[$j])){
362	0						foreach $i (@{$clone->{sourceAl}[$j]}){
	0
363	0						$crossLinksPatterns{$clone->{sourceWords}[$j].'.\\|.'.$clone->{targetWords}[$i]}=1;
364							}
365							}
366							}
367	0	0					if ($verbose > 1){
368	0						print $alSlice->sourceSentence." \| ".$alSlice->targetSentence."\n";
369	0	0					if ($verbose > 2){
370	0						print "recip links:\n";
371	0						print $dumper->dumpValue(\%crossLinksPatterns);
372							}
373							}
374	0						$toProcess{sourceIndices}={%{$alSlice->{sourceIndices}}};
	0
375	0						$toProcess{targetIndices}={%{$alSlice->{targetIndices}}};
	0
376	0						delete($toProcess{sourceIndices}{0});
377	0						delete($toProcess{targetIndices}{0});
378	0						$toProcess{sourceWords}=[@{$alSlice->{sourceWords}}];
	0
379	0						$toProcess{targetWords}=[@{$alSlice->{targetWords}}];
	0
380	0						shift @{$toProcess{sourceWords}}; #remove NULL word
	0
381	0						shift @{$toProcess{targetWords}};
	0
382	0						$toProcess{sourceWordPos}=[1..scalar(@{$toProcess{sourceWords}})];
	0
383	0						$toProcess{targetWordPos}=[1..scalar(@{$toProcess{targetWords}})];
	0
384	0						$numSourceIndicesToProcess = scalar(keys %{$toProcess{sourceIndices}});
	0
385	0						$numTargetIndicesToProcess = scalar(keys %{$toProcess{targetIndices}});
	0
386
387	0		0				while ( $numSourceIndicesToProcess>0 && $numTargetIndicesToProcess>0 ){
388							# print "s words:",join(" ",@{$toProcess{sourceWords}})," - t words:",join(" ",@{$toProcess{targetWords}}),"\n";
389							# print "ind to process s:",join(" ",keys %{$toProcess{sourceIndices}})," - target:",join(" ",keys %{$toProcess{targetIndices}}),"\n";
390							# IF FAILED RETURN
391	0	0	0				if (!$lastChance && !$modifications){
392	0						$lastChance=1;
393	0	0					if ($globals->{onlyGroups}==1){
394	0						return -1;
395							}
396							}
397	0						%grepMatches=();
398							# print "$nscan: myGroupKeys:$myGroupKeys lastChance:$lastChance\n";
399	0						%candidates = ("source",[],"target",[]);
400
401							# SELECT FRASES CONTAINED IN THE GROUP (if the number of indices is different (ie modifications==1)
402	0	0	0				if ($nscan==0 \|\| $modifications){
403	0						while (($side,$reverseSide)=each %side){
404							#target words
405	0						$targetCands{$side} = " \|".join("\|",@{$toProcess{$reverseSide."Words"}});
	0
406	0						$targetCands{$side} =~ s/\\|[\?�!�\.,]//g; #remove punctuation marks
407	0						$targetCands{$side} =~ s/([\\\[\{\^\$\*\+\?\.])/\\$1/g; #escape special characters
408	0	0					if ($verbose > 2){print "targetCands $side:$targetCands{$side}\n";}
	0
409							#source group candidates
410	0						@allWords=();
411	0						@{$allWords[0]}=();
	0
412	0						$interPunctuationIdx = 0;
413	0						for ($idx=0;$idx<@{$toProcess{$side."Words"}};$idx++){
	0
414	0	0	0				if ($toProcess{$side."Words"}[$idx]=~/[\?�!�\.,]/ \|\| !$alSlice->{$side."Indices"}{$toProcess{$side."WordPos"}[$idx]}){
415							# if ($toProcess{$side."Words"}[$idx]=~/[\?�!�\.,]/){
416	0	0	0				unless ($idx==0 \|\|$idx==@{$toProcess{$side."Words"}}-1 \|\| $pushed==0){
	0		0
417	0						$interPunctuationIdx++;
418	0						$pushed=0;
419							}
420							}else{
421	0						push @{$allWords[$interPunctuationIdx]},{"pos" => $toProcess{$side."WordPos"}[$idx],"txt" => $toProcess{$side."Words"}[$idx]};
	0
422	0						$pushed=1;
423							}
424							}
425	0						@{$cands{$side}}=();
	0
426	0						for ($interPunctuationIdx=0;$interPunctuationIdx<@allWords;$interPunctuationIdx++){
427	0	0					if (scalar(keys %{$toProcess{$side."Indices"}})<4){$minK=1}
	0	0
	0
	0
428	0						elsif(scalar(keys %{$toProcess{$side."Indices"}})<5){$minK=2}
	0
429							else{$minK=3};
430							#select subgroups for grouping candidates
431	0		0				for ($k=$minK;$k<=@{$allWords[$interPunctuationIdx]} && $k<7;$k++){
	0
432	0						for ($j=0;$j<=@{$allWords[$interPunctuationIdx]}-$k;$j++){
	0
433	0						@words=@{$allWords[$interPunctuationIdx]};
	0
434							# substitute words not in slice indices by "blank" (ie '[^ ]?')
435							# for ($idx=0;$idx<@words;$idx++){
436							# if (!$alSlice->{$side."Indices"}{$words[$idx]->{pos}}){
437							# $words[$idx]->{txt} ='[^ ]+';
438							# }
439							# }
440	0						push @{$cands{$side}},[splice @words,$j,$k];
	0
441							# print " -j:$j -k:$k -s words to process:",@{$toProcess{$side."Words"}}-$k,"\n";
442							#if split 3 in 1-2 or 5 in 2-3 for instance put left-over as candidate:
443	0	0	0				if (@words>0 && @words<$minK && ( $j==0 \|\| $j==@{$allWords[$interPunctuationIdx]}-$k)){
			0
			0
444	0						push @{$cands{$side}},[@words];
	0
445							}
446							}
447							}
448							}
449							}#foreach side
450							}#if modifications
451
452							# print "targetCands source:",$targetCands{source},"\n";
453							# print "targetCands target:",$targetCands{target},"\n";
454
455							# FILTER POSSIBLE CANDIDATES
456	0						foreach $side (("source","target")){
457	0						foreach $refToCand (@{$cands{$side}}){
	0
458	0						$candidate = printGroup($refToCand);
459	0	0					if ($candidate !~ /^(\[\^ \]\+ ?)+$/){
460	0						$candidate =~ s/(\[\^ \]\+ )/$$1${0,1}/g;
461	0						$candidate =~ s/( \[\^ \]\+)/$$1${0,1}/g;
462	0						$candidate =~ s/([\\\[\{\^\$\*\+\?\.])/\\$1/g; #escape special characters
463
464	0	0					if ($verbose>2){
465	0						print "cand: $candidate\n";
466							}
467	0	0					if ($side eq "source"){
468	0	0					if (!$lastChance){
469	0						$regExp = '^\d+ \\| '.$candidate.' \\| ('.$targetCands{$side}.')+$';
470							}else{
471	0						$regExp = $candidate.'.\\|.('.$targetCands{$side}.')+';
472							}
473							}else{
474	0	0					if (!$lastChance){
475	0						$regExp = '^\d+ \\| ('.$targetCands{$side}.')+ \\| '.$candidate.'$';
476							}else{
477	0						$regExp = '('.$targetCands{$side}.')+.\\|.'.$candidate;
478							}
479							}
480	0						$regExp =~ s/\?/\\\?/g;
481	0	0					if ($verbose > 2){print "$regExp\n";}
	0
482	0						@grepMatch = grep(/$regExp/,@$myGroupKeys);
483	0	0					if (@grepMatch>0){
484	0						push @{$candidates{$side}},$refToCand;
	0
485	0						foreach $match (@grepMatch){
486	0						$grepMatches{$match}=1;
487							}
488							}
489							}
490							} #foreach @cands
491
492	0						foreach $refToCand (@{$candidates{$side}}){
	0
493	0						$candidate= printGroup($refToCand);
494							# print " ",$candidate."\n";
495							}
496							} #foreach $side
497	0						@matches=();
498
499							# CROSS POSSIBLE CANDIDATES AND IF THEY MATCH PUSH INTO @MATCHES
500	0						foreach $refToSourceCand (@{$candidates{source}}){
	0
501	0						foreach $refToTargetCand (@{$candidates{target}}){
	0
502	0						$sourceCandidate=printGroup($refToSourceCand);
503	0						$targetCandidate=printGroup($refToTargetCand);
504	0	0					if ($verbose > 2){
505	0						print "s t:$sourceCandidate - $targetCandidate indices: s $numSourceIndicesToProcess -t $numTargetIndicesToProcess\n";
506							}
507							# for subGroupKeys, eliminate candidates with one word each side, except if they are the only words to process
508	0	0					if (!$lastChance){
509	0						$regExp = '^\d+ \\| '.$sourceCandidate.' \\| '.$targetCandidate.'$';
510							}else{
511	0						$regExp = ' '.$sourceCandidate.' .\\|. '.$targetCandidate;
512							}
513							# print "$regExp\n";
514	0						@grepMatch= grep(/$regExp/,keys %grepMatches);
515	0	0					if (@grepMatch>0){
516	0						$numMatches=0;
517	0						foreach $match (@grepMatch){
518							# print " $match\n";
519	0						($num)=split(" \\\| ",$match);
520	0						$numMatches+=$num;
521							}
522							# print "numMatches:$numMatches\n";
523	0						push @matches, [$numMatches,[@$refToSourceCand],[@$refToTargetCand]];
524							}
525							}
526							}
527	0	0					if ($verbose>2){
528	0						print "MATCHES:\n";
529	0						print $dumper->dumpValue(\@matches);
530							}
531							# ANALYSE AND COMBINES MATCHES TO SELECT THE BEST MATCH
532	0	0					if (@matches ==0){
533	0						$modifications=0;
534	0	0					if ($lastChance){
535	0	0					if ($modified){return $modified}
	0
	0
536							else {return -1};
537							}
538							}else{
539	0						$modifications=1;
540	0	0					if (@matches ==1){
541	0						$bestMatch=@matches[0];
542							}else {
543	0	0					if ($lastChance){
544	0						$bestMatch=searchBestSubGroupMatch(\@matches,\%crossLinksPatterns);
545							}else{
546	0						$bestMatch=searchBestGroupMatch(\@matches,$verbose);
547							}
548							} #if @matches==1
549	0						($num,$refToSourceCand,$refToTargetCand)=@$bestMatch;
550	0	0					if ($verbose>0){
551	0						print "***bestMatch:",printGroup($refToSourceCand),"--",printGroup($refToTargetCand),"\n";
552							}
553							#apply grouping:
554							#see if there is something left to process for the next step:
555	0						for ($j=0;$j<@{$toProcess{sourceWordPos}};$j++){
	0
556	0	0					if ($toProcess{sourceWordPos}[$j]==$refToSourceCand->[0]{pos}){
557	0						$first_j=$j;
558	0						last;
559							}
560							}
561	0						for ($i=0;$i<@{$toProcess{targetWordPos}};$i++){
	0
562	0	0					if ($toProcess{targetWordPos}[$i]==$refToTargetCand->[0]{pos}){
563	0						$first_i=$i;
564	0						last;
565							}
566							}
567
568							# print "sourceWords:",join(" ",@{$toProcess{sourceWords}})," -splice: ",$first_j,",",scalar(@$refToSourceCand),"\n";
569							# print "targetWords:",join(" ",@{$toProcess{targetWords}})," -splice: ",$first_i,",",scalar(@$refToTargetCand),"\n";
570	0						splice @{$toProcess{sourceWords}},$first_j,scalar(@$refToSourceCand);
	0
571	0						splice @{$toProcess{sourceWordPos}},$first_j,scalar(@$refToSourceCand);
	0
572	0						splice @{$toProcess{targetWords}},$first_i,scalar(@$refToTargetCand);
	0
573	0						splice @{$toProcess{targetWordPos}},$first_i,scalar(@$refToTargetCand);
	0
574							# print "s wordPos:",join(" ",@{$toProcess{sourceWordPos}}),"\n";
575							# print "t wordPos:",join(" ",@{$toProcess{targetWordPos}}),"\n";
576	0						for ($j=$refToSourceCand->[0]{pos};$j<=$refToSourceCand->[@$refToSourceCand-1]{pos};$j++){
577	0						delete $toProcess{sourceIndices}{$j};
578							}
579							# print "\nsourceWords 2:",join(" ",@{$toProcess{sourceWords}}),"\n";
580	0						for ($i=$refToTargetCand->[0]{pos};$i<=$refToTargetCand->[@$refToTargetCand-1]{pos};$i++){
581	0						delete $toProcess{targetIndices}{$i};
582							}
583							# print "targetWords 2:",join(" ",@{$toProcess{targetWords}}),"\n";
584	0						$numSourceIndicesToProcess = scalar(keys %{$toProcess{sourceIndices}});
	0
585	0						$numTargetIndicesToProcess = scalar(keys %{$toProcess{targetIndices}});
	0
586
587	0						$alSlice->group($refToSourceCand,$refToTargetCand,$globals->{extendGroups});
588							# print "modifs:".$modifications."\n";
589	0	0					if ($modifications>0){
590	0						$modified=1;
591							}
592							} #if @matches==0 else
593	0						$nscan++;
594							# print "modif:$modifications\n";
595							# print "sourceInd:",scalar(keys %{$toProcess{sourceIndices}}),"\n";
596							# print "targetInd:",scalar(keys %{$toProcess{targetIndices}}),"\n";
597
598							} #while no changes and areas to cover
599	0						return 1;
600							} #general if
601	0						return 0;
602							}
603
604							sub printGroup{
605	0			0	0		my $refToGroup = shift;
606	0						my $candidate="";
607	0						my $word;
608	0						foreach $word (@$refToGroup){
609	0						$candidate=$candidate." ".$word->{txt};
610							}
611	0						$candidate =~ s/^ //;
612	0						return $candidate;
613							}
614
615							sub searchBestGroupMatch{
616	0			0	0		my $matches = shift;
617	0						my $verbose = shift;
618	0						my @matchNums = ();
619	0						my ($match,$idx,$length);
620	0						my ($minMatch,$maxMatch);
621	0						my @bestCandidates;
622	0						my %maxLength;
623
624	0						foreach $match (@$matches){
625	0	0					if ($verbose>1){
626	0						print $match->[0];
627							}
628	0						push @matchNums,$match->[0];
629							}
630	0						($minMatch,$maxMatch) = Lingua::AlSetLib::minmax( \@matchNums );
631	0						@bestCandidates = ();
632	0						foreach $match (@$matches){
633	0	0					if ($match->[0]==$maxMatch){
634	0						push @bestCandidates,$match;
635							}
636							}
637	0	0					if (@bestCandidates>1){
638	0						$maxLength{"length"}=0;
639	0						for ($idx=0;$idx<@bestCandidates;$idx++){
640	0						$length = @{$bestCandidates[$idx]->[1]}+@{$bestCandidates[$idx]->[2]};
	0
	0
641							# print "len:$length\n";
642	0	0					if ($length>$maxLength{"length"}){
643	0						$maxLength{"idx"}=$idx;
644	0						$maxLength{"length"}=$length;
645							}
646							}
647	0						$bestCandidates[0]=$bestCandidates[$maxLength{"idx"}];
648							}
649	0						return $bestCandidates[0];
650							}
651
652							sub searchBestSubGroupMatch{
653	0			0	0		my ($matches,$crossLinkedWords) = @_;
654	0						my %matchNums = ();
655	0						my @sortedMatchNums;
656	0						my ($match,$idx,$k,$length,$maxMatch);
657	0						my $thisOneWithCross=0;
658
659	0						my @bestCandidates;
660							my @finalBest;
661	0						my @candsWithoutCrossLink;
662	0						my @candsWithCrossLink;
663	0						my @currentWithoutCrossLink;
664	0						my $regExp;
665
666							# sort all matchcounts retrieved:
667	0						foreach $match (@$matches){
668	0						$matchNums{$match->[0]}=1;
669							}
670	0						@sortedMatchNums=reverse (sort { $a <=> $b; } keys %matchNums);
	0
671							# print "\nMATCHNUMS: ",join(" ",@sortedMatchNums),"\n\n";
672
673	0		0				for ($idx=0;$idx<@sortedMatchNums && $idx<2;$idx++){
674							#list matches of current matchcount
675	0						@bestCandidates = ();
676	0						@currentWithoutCrossLink=();
677	0						foreach $match (@$matches){
678	0	0					if ($match->[0]==$sortedMatchNums[$idx]){
679	0						push @bestCandidates,$match;
680							}
681							}
682	0						foreach $match (@bestCandidates){
683							# print " $match:",printGroup($match->[1]),"--",printGroup($match->[2]),"\n";
684							#look if contains cross link
685	0						$thisOneWithCross=0;
686	0						foreach $regExp (keys %$crossLinkedWords){
687	0	0					if ((printGroup($match->[1]).' \| '.printGroup($match->[2]))=~/$regExp/){
688							# print "$regExp --> $match\n";
689	0						$thisOneWithCross++;
690	0						last;
691							}
692							}
693	0	0					if ($thisOneWithCross){
694	0						push @candsWithCrossLink,$match;
695							}else{
696	0						push @currentWithoutCrossLink,$match;
697							}
698							}
699	0	0					if (@candsWithCrossLink>0){
700	0						last;
701							}else{
702	0						push @{$candsWithoutCrossLink[$idx]},@currentWithoutCrossLink;
	0
703							}
704							}
705	0						for ($idx=0;$idx<@candsWithoutCrossLink;$idx++){
706	0						foreach $match (@{$candsWithoutCrossLink[$idx]}){
	0
707							# print "$idx without match:",printGroup($match->[1]),"--",printGroup($match->[2]),"\n";
708							}
709							}
710	0						foreach $match (@candsWithCrossLink){
711							# print "with match:",printGroup($match->[1]),"--",printGroup($match->[2]),"\n";
712							}
713	0	0					if (@candsWithCrossLink>0){
714	0						push @finalBest,clusterGroups(\@candsWithCrossLink);
715	0	0	0				if ($idx>0 && @candsWithoutCrossLink>0){
716	0						push @finalBest,clusterGroups($candsWithoutCrossLink[0]);
717							}
718							}else{
719	0						for ($k=0;$k<@candsWithoutCrossLink;$k++){
720	0						push @finalBest,clusterGroups($candsWithoutCrossLink[$k]);
721							}
722							}
723							# print "num finals:",scalar(@finalBest),"\n";
724	0						return clusterGroups(\@finalBest);
725							}
726
727							sub clusterGroups{
728	0			0	0		my $groups = shift;
729	0						my ($match,$group,$word,$k,$idx,$l);
730	0						my %clusterPositions;
731	0						my %isInCluster;
732	0						my %cluster;
733	0						my @right;
734
735							#1.Search positions included in cluster, in each side. Start with first match in group.
736	0						foreach $k (1,2){
737	0						foreach $word (@{$groups->[0][$k]}){
	0
738	0						$clusterPositions{$k}{$word->{pos}}=1;
739							}
740							}
741	0						for ($idx=1;$idx<@$groups;$idx++){
742	0						%isInCluster=();
743	0						foreach $k (1,2){
744	0						foreach $word (@{$groups->[$idx][$k]}){
	0
745	0	0					if ($clusterPositions{$k}{$word->{pos}}){
746	0						$isInCluster{$k}=1;
747	0						last;
748							}
749							}
750							}
751	0	0	0				if ( $isInCluster{1} \|\| $isInCluster{2} ){
752	0						foreach $k (1,2){
753	0						foreach $word (@{$groups->[$idx][$k]}){
	0
754	0						$clusterPositions{$k}{$word->{pos}}=1;
755							}
756							}
757							}
758							} #for $idx
759
760							#2. Build cluster
761	0						foreach $k (1,2){
762	0						foreach $word (@{$groups->[0][$k]}){ #we started with first match in group.
	0
763	0						push @{$cluster{$k}},$word;
	0
764	0						delete($clusterPositions{$k}->{$word->{pos}});
765							}
766	0		0				for ($idx=1;$idx<@$groups && scalar(keys %{$clusterPositions{$k}})>0;$idx++){
	0
767	0						$group=$groups->[$idx][$k];
768	0						foreach $word (@$group){
769	0	0					if ($clusterPositions{$k}{$word->{pos}}){
770							#insert in cluster
771	0						for ($l=0;$l<@{$cluster{$k}};$l++){
	0
772	0	0					if ($cluster{$k}->[$l]{pos}>$word->{pos}){
773	0						last;
774							}
775							}
776	0						@right=splice @{$cluster{$k}},$l;
	0
777	0						@{$cluster{$k}}=(@{$cluster{$k}},$word,@right);
	0
	0
778							#delete key
779	0						delete($clusterPositions{$k}->{$word->{pos}});
780							}
781							}
782							} #for each group to cluster
783							} #foreach $k (source,target)
784	0						return [$groups->[0][0],$cluster{1},$cluster{2}];
785							}
786
787							sub group{
788	0			0	0		my ($alSlice,$refToSourceCand,$refToTargetCand,$extendGroup) = @_;
789	0						my ($j,$i);
790	0						my $nLinks=0;
791	0						my $first_j=$refToSourceCand->[0]{pos};
792	0						my $last_j=$first_j+@$refToSourceCand-1;
793	0						my $first_i=$refToTargetCand->[0]{pos};
794	0						my $last_i=$first_i+@$refToTargetCand-1;
795	0						my @sourceExtensions;
796							my @targetExtensions;
797	0						my ($minMatch,$maxMatch);
798
799	0	0					if ($extendGroup==1){
800							#first we extend the group to cross links aligned with some member of the group
801	0						for ($j=1;$j<@{$alSlice->{sourceAl}};$j++){
	0
802	0	0	0				if (defined($alSlice->{sourceAl}[$j]) && @{$alSlice->{sourceAl}[$j]}>0 && ($j<$first_j \|\| $j>$last_j)){
	0		0
			0
803	0						for ($i=$first_i;$i<=$last_i;$i++){
804	0	0					if ($alSlice->isCrossLink($j,$i)){
805	0	0					if ($j<$first_j){$first_j=$j}
	0
806	0	0					if ($j>$last_j){$last_j=$j}
	0
807							}
808							}
809							}
810							}
811	0						for ($i=1;$i<@{$alSlice->{targetAl}};$i++){
	0
812	0	0	0				if (defined($alSlice->{targetAl}[$i]) && @{$alSlice->{targetAl}[$i]}>0 && ($i<$first_i \|\| $i>$last_i)){
	0		0
			0
813	0						for ($j=$first_j;$j<=$last_j;$j++){
814	0	0					if ($alSlice->isCrossLink($j,$i)){
815	0	0					if ($i<$first_i){$first_i=$i}
	0
816	0	0					if ($i>$last_i){$last_i=$i}
	0
817							}
818							}
819							}
820							}
821							}
822							#then we group
823	0						for ($j=$first_j;$j<=$last_j;$j++){
824	0						for ($i=$first_i;$i<=$last_i;$i++){
825	0	0	0				if ($alSlice->{sourceIndices}->{$j} && $alSlice->{targetIndices}->{$i}){
826	0	0					if (!$alSlice->isIn("sourceAl",$j,$i)){
827	0						push @{$alSlice->{sourceAl}[$j]},$i;
	0
828	0						$nLinks++;
829							}
830	0	0					if (!$alSlice->isIn("targetAl",$i,$j)){
831	0						push @{$alSlice->{targetAl}[$i]},$j;
	0
832	0						$nLinks++;
833							}
834							}
835							} #for
836							} #for
837	0						return $nLinks;
838							}
839
840							sub processNull {
841	0			0	0		my $this=shift;
842
843
844							}
845							1;