File Coverage

blib/lib/Lingua/AlignmentSet.pm
Criterion Covered Total %
statement 230 1038 22.1
branch 94 476 19.7
condition 23 159 14.4
subroutine 20 36 55.5
pod 0 27 0.0
total 367 1736 21.1


line stmt bran cond sub pod time code
1             ########################################################################
2             # Author: Patrik Lambert (lambert@talp.ucp.es)
3             # Description: Tools library to manage an Alignment Sets, i.e. a set of
4             # sentences aligned at the word (or phrase) level.
5             #-----------------------------------------------------------------------
6             # Copyright 2004 by Patrik Lambert
7             #
8             # This program is free software; you can redistribute it and/or modify
9             # it under the terms of the GNU General Public License as published by
10             # the Free Software Foundation; either version 2 of the License, or
11             # (at your option) any later version.
12             #
13             # This program is distributed in the hope that it will be useful,
14             # but WITHOUT ANY WARRANTY; without even the implied warranty of
15             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16             # GNU General Public License for more details.
17             #
18             # You should have received a copy of the GNU General Public License
19             # along with this program; if not, write to the Free Software
20             # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21             ########################################################################
22              
23             package Lingua::AlignmentSet;
24 1     1   25122 use 5.005;
  1         4  
  1         68  
25 1     1   7 use vars qw($VERSION);
  1         2  
  1         58  
26 1     1   5 use strict;
  1         7  
  1         62  
27             $VERSION = 1.1;
28 1     1   661 use Lingua::AlSetLib 1.1;
  1         37  
  1         40  
29 1     1   927 use Lingua::Alignment 1.1;
  1         40  
  1         48  
30 1     1   706 use Lingua::WriteLatexFile;
  1         3  
  1         30  
31 1     1   552 use Lingua::AlignmentEval;
  1         3  
  1         34  
32              
33 1     1   7 use Dumpvalue;
  1         3  
  1         23  
34 1     1   1128 use IO::File;
  1         12986  
  1         14449  
35              
36             my $dumper=new Dumpvalue;
37             my $true = 1;
38             my $false = 0;
39              
40             sub new {
41 2     2 0 33 my ($pkg,$refToFileSets) = @_;
42 2         9 my $refToLocation = readLocation($refToFileSets->[0][0]);
43 2         5 my $format = $refToFileSets->[0][1];
44 2         3 my $range = $refToFileSets->[0][2];
45 2         4 my $alSet = {};
46             #default values:
47 2 50       5 if (!defined($format)){$format="TALP"}
  0         0  
  2         4  
48             else {$format = uc $format};
49 2 100       7 if (!defined($range)){$range="1-"};
  1         2  
50 2 100       6 if ($format eq "BLINKER"){
51             #for future ease we save detailed infos contained in the source sample path
52 1         4 completeBlinkerLocation($refToLocation);
53             }
54            
55 2         5 $alSet->{location}=$refToLocation;
56 2         5 $alSet->{format}=$format;
57            
58 2         6 setRange($alSet,$range);
59            
60             #checking the data:
61 2 50       16 if ($format eq "GIZA"){
    100          
    50          
    0          
62             # if ($ambiguity || $confidence){die "GIZA format not compatible with ambiguity or confidence features"}
63             } elsif ($format eq "BLINKER"){
64             } elsif ($format eq "NAACL"){
65             } elsif ($format eq "TALP"){
66             } else {
67 0         0 die "Unknown format $format. Can't create alignment set object";
68             }
69 2         9 return bless $alSet,$pkg;
70             }
71              
72             # create a new AlignmentSet that contains the same data of an already existing alignment set (without copying the addresses)
73             sub copy {
74 0     0 0 0 my $alSet = shift;
75              
76 0         0 my $cloneLocation={};
77 0         0 my ($field,$value);
78 0         0 while (($field,$value)=each (%{$alSet->{location}})){
  0         0  
79 0         0 $cloneLocation->{$field}=$value;
80             }
81 0         0 return Lingua::AlignmentSet->new([[$cloneLocation,$alSet->{format},$alSet->{firstSentPair}."-".$alSet->{lastSentPair}]]);
82             }
83              
84             sub setWordFiles{
85 1     1 0 10 my ($alSet,$sourcePath,$targetPath) = @_;
86            
87 1         7 $alSet->{location}->{source}=$sourcePath;
88 1         4 $alSet->{location}->{target}=$targetPath;
89             }
90             sub setSourceFile{
91 0     0 0 0 my ($alSet,$sourcePath) = @_;
92            
93 0         0 $alSet->{location}->{source}=$sourcePath;
94             }
95             sub setTargetFile{
96 0     0 0 0 my ($alSet,$targetPath) = @_;
97            
98 0         0 $alSet->{location}->{target}=$targetPath;
99             }
100              
101             sub setTargetToSourceFile{
102 0     0 0 0 my ($alSet,$targetToSourcePath) = @_;
103            
104 0         0 $alSet->{location}->{targetToSource}=$targetToSourcePath;
105             }
106              
107             sub chFormat {
108 0     0 0 0 my ($alSet,$newLocation,$newFormat,$alignMode)=@_;
109              
110 0         0 $alSet->convert($newLocation,$newFormat,$alignMode);
111             }
112              
113              
114             # Won't work if the sentence files are not specified
115             sub visualise {
116 0     0 0 0 my ($alSet,$representation,$format,$outputFH,$mark,$alignMode,$maxRows,$maxCols)=@_;
117 0         0 $representation = lc $representation;
118 0         0 $format = lc $format;
119 0 0       0 if (!defined($outputFH)){$outputFH=*STDOUT}
  0         0  
120 0 0       0 if ($representation eq "matrix"){
121 0 0       0 if (!defined($mark)){$mark = "cross"}
  0         0  
122 0 0       0 if (!defined($maxRows)){$maxRows = 53} #default maxRows value
  0         0  
123 0 0       0 if (!defined($maxCols)){$maxCols = 35} #default maxRows value
  0         0  
124 0         0 $format="latex";
125             }
126 0         0 my $latex = Lingua::Latex->new;
127 0 0       0 if ($format eq "latex"){
128 0         0 print $outputFH $latex->startFile;
129 0         0 print $outputFH $latex->setTabcolsep("0.5mm");
130             }
131 0         0 my $output = "";
132 0         0 my $inputSentPairNum = $alSet->{firstSentPair};
133 0         0 my $i;
134 0         0 my ($al,$alSetChunk);
135 0         0 my $FH = $alSet->openFiles();
136 0 0 0     0 if (($alSet->{format} ne "GIZA") && (!$FH->{source} || !$FH->{target})){
      0        
137 0         0 die "To use the 'visualise' function, you must specify the sentence (words) files.\n";
138             }
139 0         0 while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
140 0         0 $output = "";
141 0         0 for ($i=0;$i<@$alSetChunk;$i++){
142 0         0 $al = $$alSetChunk[$i];
143             # print main::Dumper($al);
144 0 0       0 if ($representation eq "matrix"){
    0          
145 0         0 $output.= "\n$inputSentPairNum\n".$al->displayAsMatrix($latex,$mark,$maxRows,$maxCols);
146             }elsif ($representation eq "enumlinks"){
147 0         0 $output.= "\n$inputSentPairNum\n".$al->displayAsLinkEnumeration($format,$latex);
148             } #elsif
149             }#for
150 0         0 print $outputFH $output;
151 0         0 $inputSentPairNum++;
152             }
153 0 0       0 if ($format eq "latex"){print $outputFH $latex->endFile};
  0         0  
154             }
155              
156             #only work if the text files are given (not only the alignment files).
157             sub getSize {
158 0     0 0 0 my $alSet = shift;
159 0         0 my ($file,$factor);
160 0         0 my $size;
161            
162 0 0 0     0 if ($alSet->{format} eq "GIZA"){
    0 0        
163 0         0 $file = $alSet->{location}->{sourceToTarget};
164 0         0 $factor = 3;
165             }elsif ($alSet->{format} eq "NAACL" || $alSet->{format} eq "BLINKER" || $alSet->{format} eq "TALP"){
166 0 0       0 if (!$alSet->{location}->{source}){
167 0         0 die "One of the functions your are using requires you specify the sentence files (source and target)\n";
168             }
169 0         0 $file = $alSet->{location}->{source};
170 0         0 $factor = 1;
171             }
172 0         0 open (FILE,"<$file");
173 0         0 $size += tr/\n/\n/ while sysread(FILE, $_, 2 ** 16);
174 0         0 close(FILE);
175 0         0 $size = $size / $factor;
176 0         0 return $size;
177             }
178              
179             # returns a list (in random order) of lineNumbers
180             # to sort this list, do: my @sortedSelection = sort { $a <=> $b; } @selection;
181              
182             sub chooseSubsets {
183             #TO DO: possibility of percentage input for the size
184 0     0 0 0 my ($alSet,$seed,$size) = @_;
185 0         0 my $alSetSize = $alSet->getSize();
186 0         0 my $count;
187 0         0 my @selected=();
188 0         0 my @notSelected = ();
189 0         0 my ($ind,$elt);
190              
191 0         0 for ($count=1;$count<$alSetSize;$count++){
192 0         0 push @notSelected,$count;
193             }
194 0         0 srand $seed;
195 0         0 for ($count=0;$count<$size;$count++){
196 0         0 $ind = rand @notSelected;
197 0         0 $elt = $notSelected[$ind];
198 0         0 splice @notSelected, $ind, 1;
199 0         0 push @selected,$elt;
200             }
201 0         0 return \@selected;
202             }
203             ###################################################################
204             ### EVALUATION ###
205             ###################################################################
206              
207             #code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
208             # Evaluation is performed using:
209             # - Standard Precision, Recall, F-measure, separate for S (Sure) and P (Possible) cases
210             # - AER measure, defined as
211             # AER = 1 - ( |A & S| + |A & P| ) / ( |A| + |S| )
212             # [where A represents the alignment, S and P represent the S (Sure) and P (Possible) gold standard alignments]
213              
214             sub evaluate {
215 1     1 0 8 my ($submissionAlSet,$answerAlSet,$alignMode,$weighted)=@_;
216 1 50       4 if (!defined($weighted)){$weighted=0}
  1         1  
217 1         2 my ($line,$alignment);
218 0         0 my ($FH,$alSetChunk,$i,$al,$fhPos);
219 0         0 my ($inputSentPairNum,$internalSentPairNum,$sentPairNum);
220 0         0 my ($sureMatch,$possibleMatch,$possibleMatchSure);
221 0         0 my ($surePrecision,$sureRecall,$possiblePrecision,$possibleRecall,$sureFMeasure,$possibleFMeasure,$AER);
222             # 1 READ ANSWER AND SUBMISSION FILES
223             # (in the case of NAACL format file it's more efficient to treat it directly, otherwise load to internal structure)
224             # answer file
225 0         0 my %sureAnswer;
226 0         0 my %possibleAnswer;
227 1         2 my $INFINITY = 9999999999;
228 1         2 $inputSentPairNum = $answerAlSet->{firstSentPair};
229 1         2 $internalSentPairNum = 1;
230 1 50 33     7 if ( $answerAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $answerAlSet->{firstSentPair} == 1){
      33        
231 0 0       0 my $answerFH = IO::File->new("<".$answerAlSet->{location}{sourceToTarget}) or die "Answer alignment file opening error:$!";
232             #go to first sentence pair:
233 0         0 $fhPos = $answerFH->getpos;
234 0   0     0 while ($answerFH->getline() !~ m/^0*$inputSentPairNum .*/o && !$answerFH->eof()) {
235 0         0 $fhPos = $answerFH->getpos;
236             }
237 0 0       0 if ($answerFH->eof()){
238 0         0 die "First sentence pair of range not found in ".$answerAlSet->{location}{sourceToTarget};
239             }
240 0         0 $answerFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
241            
242             #read file:
243 0 0       0 if ($answerAlSet->{lastSentPair} eq "eof"){
244 0         0 $inputSentPairNum = $INFINITY;
245             }else{
246 0         0 $inputSentPairNum = $answerAlSet->{lastSentPair}+1;
247             }
248 0   0     0 while(!$answerFH->eof() && ( ($line=$answerFH->getline()) !~ m/^0*$inputSentPairNum .*/o )) {
249 0         0 chomp $line;
250 0         0 $line =~ s/^\s+|\s+$//g;
251 0         0 identifySurePossible($line,\%sureAnswer,\%possibleAnswer);
252             }
253 0         0 $answerFH->close();
254             }else{
255 1         7 $FH = $answerAlSet->openFiles();
256 1         12 while ($alSetChunk = $answerAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
257 10         52 for ($i=0;$i<@$alSetChunk;$i++){
258 10         15 $al = $$alSetChunk[$i];
259             # print "EVALUATE: answer al:\n";
260             # print main::Dumper($al);
261 10         52 foreach $line (@{$al->writeToBlinker()}){
  10         32  
262 90         142 $line = "$internalSentPairNum ".$line;
263 90         179 identifySurePossible($line,\%sureAnswer,\%possibleAnswer);
264             }
265             }
266 10         12 $inputSentPairNum++;
267 10         27 $internalSentPairNum++;
268             }
269 1         4 closeFiles($FH,$answerAlSet->{format});
270             }# if format - else
271            
272             # submission file
273 1         2 my %sureSubmission;
274             my %possibleSubmission;
275 1         2 $inputSentPairNum = $submissionAlSet->{firstSentPair};
276 1         2 $internalSentPairNum = 1;
277              
278 1 50 33     8 if ($submissionAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $submissionAlSet->{firstSentPair}==1){
      33        
279 0 0       0 my $submissionFH = IO::File->new("<".$submissionAlSet->{location}{sourceToTarget}) or die "Submission alignment file opening error:$!";
280              
281             #go to first sentence pair:
282 0         0 $fhPos = $submissionFH->getpos;
283 0   0     0 while ($submissionFH->getline() !~ m/^0*$inputSentPairNum .*/o && !$submissionFH->eof()) {
284 0         0 $fhPos = $submissionFH->getpos;
285             }
286 0 0       0 if ($submissionFH->eof()){
287 0         0 die "First sentence pair of range not found in ".$submissionAlSet->{location}{sourceToTarget};
288             }
289 0         0 $submissionFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
290            
291             #read file:
292 0 0       0 if ($submissionAlSet->{lastSentPair} eq "eof"){
293 0         0 $inputSentPairNum = $INFINITY;
294             }else{
295 0         0 $inputSentPairNum = $submissionAlSet->{lastSentPair}+1;
296             }
297 0   0     0 while(!$submissionFH->eof() && (($line = $submissionFH->getline()) !~ m/^0*$inputSentPairNum .*/o )) {
298 0         0 chomp $line;
299 0         0 $line =~ s/^\s+|\s+$//g;
300 0         0 identifySurePossible($line,\%sureSubmission,\%possibleSubmission);
301             }
302 0         0 $submissionFH->close();
303             }else{
304 1         4 $FH = $submissionAlSet->openFiles();
305 1         4 while ($alSetChunk = $submissionAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
306 10         27 for ($i=0;$i<@$alSetChunk;$i++){
307 10         12 $al = $$alSetChunk[$i];
308             # print "submission al:\n";
309             # $dumper->dumpValue($al);
310 10         56 foreach $line (@{$al->writeToBlinker()}){
  10         38  
311 59         100 $line = "$internalSentPairNum ".$line;
312 59         120 identifySurePossible($line,\%sureSubmission,\%possibleSubmission);
313             }
314             }
315 10         11 $inputSentPairNum++;
316 10         29 $internalSentPairNum++;
317             }
318 1         6 closeFiles($FH,$submissionAlSet->{format});
319             }# if format=NAACL else
320            
321            
322             # print "weighted:$weighted\n";
323             # print "SA:".join("-",keys %sureAnswer),"\nSS:".join(" - ",keys %sureSubmission),"\nPA:".join(" - ",keys %possibleAnswer),"\nPS:".join(" - ",keys %possibleSubmission)."\n";
324              
325             # 2 WEIGHT LINKS
326             # It is a kind of "normalization" of multiple links: each link (j i) is weighted according to
327             # the number of links in which j and i are involved: weight(j,i)=0.5*(1/numLinks(j)+1/numLinks(i)).
328              
329 1         3 my ($link,$j,$hash,$value);
330 0         0 my %weightsSure;
331 0         0 my %linksSure;
332 0         0 my @linksSureInSentence;
333 0         0 my %linksPossible;
334 0         0 my @linksPossibleInSentence;
335 0         0 my %weightsPossible;
336              
337 1 50       6 if ($weighted){
338             # When only sure links are considered (calculation of Ps and Rs), they are weighted with respect to the union of both sure sets
339             # take union
340 0         0 foreach $hash ( \%sureSubmission, \%sureAnswer ) {
341 0         0 while (($link, $value) = each %$hash) {
342 0         0 ($sentPairNum,$j,$i)=split(" ",$link);
343 0         0 $linksSure{$sentPairNum}{"$j $i"} = $value;
344             }
345             }
346             # calculate weight of each link
347 0         0 foreach $sentPairNum (keys %linksSure){
348 0         0 @linksSureInSentence =keys %{$linksSure{$sentPairNum}};
  0         0  
349 0         0 foreach $link (@linksSureInSentence){
350 0         0 ($j,$i)=split(" ",$link);
351 0         0 $weightsSure{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksSureInSentence)+1/grep(/ $i$/,@linksSureInSentence) );
352             }
353             }
354              
355             # When all links are considered (calculation of Pp and Rp, AER), possible AND sure links are weighted with respect to the union of all sets.
356 0         0 %linksPossible=%linksSure;
357             # add union of possible links
358 0         0 foreach $hash (\%possibleSubmission, \%possibleAnswer ) {
359 0         0 while (($link, $value) = each %$hash) {
360 0         0 ($sentPairNum,$j,$i)=split(" ",$link);
361 0         0 $linksPossible{$sentPairNum}{"$j $i"} = $value;
362             }
363             }
364             # calculate weight of each link
365 0         0 foreach $sentPairNum (keys %linksPossible){
366 0         0 @linksPossibleInSentence =keys %{$linksPossible{$sentPairNum}};
  0         0  
367 0         0 foreach $link (@linksPossibleInSentence){
368 0         0 ($j,$i)=split(" ",$link);
369 0         0 $weightsPossible{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksPossibleInSentence)+1/grep(/ $i$/,@linksPossibleInSentence) );
370             }
371             }
372             }
373            
374             # 3 SUM UP LINKS
375             # in case of weights distinct from 1: sum of %possibleAnswer and %possibleSubmission is always with %weightsPossible.
376             # however the sum of %sureAnswer and %sureSubmission is with %weightsSure to calculate Ps and Rs, %weightsPossible for Pp, Rp and AER.
377 1         4 my ($totalPossibleAnswer,$totalPossibleSubmission)=(0,0);
378 1         2 my ($totalSureAnswer_weightsSure,$totalSureSubmission_weightsSure,$totalSureAnswer_weightsPossible,$totalSureSubmission_weightsPossible)=(0,0,0,0);
379 1 50       9 if ($weighted){
380 0         0 foreach $link (keys %sureAnswer){
381 0         0 $totalSureAnswer_weightsSure+=$weightsSure{$link};
382 0         0 $totalSureAnswer_weightsPossible+=$weightsPossible{$link};
383             }
384 0         0 foreach $link (keys %sureSubmission){
385 0         0 $totalSureSubmission_weightsSure+=$weightsSure{$link};
386 0         0 $totalSureSubmission_weightsPossible+=$weightsPossible{$link};
387             }
388 0         0 foreach $link (keys %possibleAnswer){
389 0         0 $totalPossibleAnswer+=$weightsPossible{$link};
390             }
391 0         0 foreach $link (keys %possibleSubmission){
392 0         0 $totalPossibleSubmission+=$weightsPossible{$link};
393             }
394             }else{ #every link has a weight 1
395 1         8 $totalSureAnswer_weightsSure=scalar(keys %sureAnswer);
396 1         2 $totalSureAnswer_weightsPossible= $totalSureAnswer_weightsSure;
397 1         3 $totalSureSubmission_weightsSure=scalar(keys %sureSubmission);
398 1         3 $totalSureSubmission_weightsPossible=$totalSureSubmission_weightsSure;
399 1         2 $totalPossibleAnswer=scalar(keys %possibleAnswer);
400 1         3 $totalPossibleSubmission=scalar(keys %possibleSubmission);
401             }
402            
403             # 4 COUNT MATCHES
404              
405             # print "sureSubmission:",join("|",keys %sureSubmission),"\n";
406             # print "possibleSubmission:",join("|",keys %possibleSubmission),"\n";
407             # print "sureAnswer:",join("|",keys %sureAnswer),"\n";
408             # print "possibleAnswer:",join("|",keys %possibleAnswer),"\n";
409             # print "\n";
410             # now determine the S[ure] matches
411 1         2 $sureMatch = 0;
412 1         13 foreach $alignment (keys %sureSubmission) {
413 59 100       122 if(defined($sureAnswer{$alignment})) {
414 54 50       80 if (!$weighted){$sureMatch++}
  54         68  
  0         0  
415             else {$sureMatch += $weightsSure{$alignment}}
416             }
417             }
418             # and the [P]robable matches
419             # these are checked against both S[ure] and P[robable] correct alignments
420 1         6 $possibleMatch = 0;
421 1         10 foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
422 59 100 66     153 if(defined($sureAnswer{$alignment}) || defined($possibleAnswer{$alignment})) {
423 54 50       79 if (!$weighted){$possibleMatch++}
  54         66  
  0         0  
424             else{$possibleMatch += $weightsPossible{$alignment}}
425             }
426             }
427             # and also the intersection between all submitted alignments
428             # and the S [Sure] correct alignments -- as needed by AER
429 1         8 $possibleMatchSure = 0;
430 1         9 foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
431 59 100       120 if(defined($sureAnswer{$alignment})) {
432 54 50       77 if (!$weighted){$possibleMatchSure++}
  54         67  
  0         0  
433             else{$possibleMatchSure+= $weightsPossible{$alignment}}
434             }
435             }
436             # print "sureMatch:$sureMatch possibleMatch:$possibleMatch possibleMatchSure:$possibleMatchSure\n";
437              
438             # 5 COMPUTE EVALUATION MEASURES
439             # now determine the precision, recall, and F-measure for [S]ure alignments
440 1 50       8 if(scalar(keys %sureSubmission) != 0) {
441 1         3 $surePrecision = $sureMatch / $totalSureSubmission_weightsSure;
442             }else {
443 0         0 $surePrecision = 0;
444             }
445 1 50       5 if(scalar(keys %sureAnswer) != 0) {
446 1         4 $sureRecall = $sureMatch / $totalSureAnswer_weightsSure;
447             }else {
448 0         0 $sureRecall = 0;
449             }
450 1 50 33     9 if($sureRecall != 0 && $surePrecision != 0) {
451 1         4 $sureFMeasure = 2 * $sureRecall * $surePrecision / ($sureRecall + $surePrecision);
452             }else {
453 0         0 $sureFMeasure = 0;
454             }
455              
456              
457             # and now determine the precision, recall, and F-measure for [P]robable alignments
458 1 50       6 if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
459 1         2 $possiblePrecision = $possibleMatch / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission);
460             }else {
461 0         0 $possiblePrecision = 0;
462             }
463 1 50       7 if(scalar(keys %sureAnswer) + scalar(keys %possibleAnswer)!= 0) {
464 1         3 $possibleRecall = $possibleMatch / ($totalSureAnswer_weightsPossible+$totalPossibleAnswer);
465             }else {
466 0         0 $possibleRecall = 0;
467             }
468 1 50 33     8 if($possibleRecall != 0 && $possiblePrecision != 0) {
469 1         2 $possibleFMeasure = 2 * $possibleRecall * $possiblePrecision / ($possibleRecall + $possiblePrecision);
470             }else {
471 0         0 $possibleFMeasure = 0;
472             }
473              
474             # and determine the AER
475 1 50       3 if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
476 1         3 $AER = 1 - ($possibleMatchSure + $possibleMatch) / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission+$totalSureAnswer_weightsPossible);
477             }else {
478 0         0 $AER = 0;
479             }
480 1         11 return Lingua::AlignmentEval->new($surePrecision,$sureRecall,$sureFMeasure,$possiblePrecision,$possibleRecall,$possibleFMeasure,$AER);
481             }
482             ###################################################################
483             ### PROCESSING ###
484             ###################################################################
485              
486             sub processAlignment{
487 0     0 0 0 my ($alSet,$AlignmentSub,$newLocation,$newFormat,$alignMode)=@_;
488 0         0 my $newAlSet = $alSet->copy;
489 0 0       0 if (ref($AlignmentSub) eq 'ARRAY'){
490 0 0       0 if ($AlignmentSub->[0] eq "Lingua::Alignment::eliminateWord"){
491 0 0       0 if (@$AlignmentSub<3){die "Missing parameters for Lingua::Alignment::eliminateWord\n"}
  0         0  
492             else{
493 0         0 my $side = lc $AlignmentSub->[2];
494 0 0 0     0 if (!$alSet->{location}{$side} || !$newLocation->{$side}){die "Missing $side file for Lingua::Alignment::eliminateWord\n"}
  0         0  
495             }
496             }
497             }
498 0         0 $newAlSet->convert($newLocation,$newFormat,$alignMode,$AlignmentSub);
499 0         0 return $newAlSet;
500             }
501              
502             sub symmetrize {
503 0     0 0 0 my ($alSet,$newLocation,$newFormat,$ENV,$selectSubgroups,$alignMode,$globals)=@_;
504             #defaults
505 0 0       0 if (!defined($selectSubgroups)){$selectSubgroups=0}
  0         0  
506 0 0       0 if (!defined($alignMode)){$alignMode="no-null-align"}
  0         0  
507 0 0       0 if (!defined($globals->{"minPhraseFrequency"})){$globals->{"minPhraseFrequency"}=2};
  0         0  
508 0 0       0 if (!defined($globals->{"extendGroups"})){$globals->{"extendGroups"}=0};
  0         0  
509 0 0       0 if (!defined($globals->{"onlyGroups"})){$globals->{"onlyGroups"}=1};
  0         0  
510 0 0       0 if (!defined($globals->{"defaultActionGrouping"})){$globals->{"defaultActionGrouping"}="Lingua::Alignment::getUnion"};
  0         0  
511 0 0       0 if (!defined($globals->{"defaultActionGeneral"})){$globals->{"defaultActionGeneral"}="Lingua::Alignment::intersect"};
  0         0  
512 0 0       0 if (!defined($globals->{"verbose"})){$globals->{"verbose"}="0"};
  0         0  
513 0         0 my $verbose = $globals->{"verbose"};
514 0         0 my $al; # reference alignment- remains unchanged
515             my $modAl; #reference alignment modified with the successive aplication of symRules
516             #load in memory a chunk of the alignment set as a list
517             #of references to (internal representation) alignment objects:
518 0         0 my ($k,$alSetChunk);
519 0         0 my $FH = $alSet->openFiles();
520 0         0 my $newFH;
521 0 0       0 if ($selectSubgroups==0){
522 0         0 $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
523             }
524 0         0 my $internalSentPairNum = 1;
525 0         0 my ($sentenceNum,$ruleApplied) = ($alSet->{firstSentPair},0);
526              
527 0         0 my $grSentPairNum=1;
528              
529 0         0 my ($j,$i);
530 0         0 my ($lines,$line);
531 0         0 my $groups = {};
532 0         0 my $groupsCurrentSentence = {};
533 0         0 my $groupKeys = [];
534 0         0 my $subGroups={};
535 0         0 my $subGroupsCurrentSentence = {};
536 0         0 my $subGroupKeys=[];
537 0         0 my ($candidate,$count);
538              
539 0 0       0 if (!$selectSubgroups){ #load subgroup hash and array:
540 0         0 open(GROUPS,"<$ENV/groups");
541 0         0 while (){
542 0         0 push @$groupKeys,$_;
543 0         0 @$line = split " | ",$_,2;
544 0         0 $groups->{$line->[1]}=$line->[0];
545             }
546 0 0       0 if ($globals->{onlyGroups}==0){
547 0         0 open(SUBGROUPS,"<$ENV/subGroups");
548 0         0 while (){
549 0         0 push @$subGroupKeys,$_;
550 0         0 @$line = split " | ",$_,2;
551 0         0 $subGroups->{$line->[1]}=$line->[0];
552             }
553             }
554             }
555            
556 0         0 my %anchors;
557 0         0 my %sourcePerturbed={};
558 0         0 my %targetPerturbed={}; # Perturbations must be distinct so we keep track of the already detected "Perturbed" $j's
559 0         0 my ($perturbation,$perturbationNoMod);
560 0         0 my ($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
561 0         0 my ($ind,$newPerturbationDetected,$anchorsInTarget);
562 0         0 my ($countPertubs,$countGrouping,$countOneToMany,$countElse,$countNoGroup)=(0,0,0,0,0);
563 0         0 while ($alSetChunk = $alSet->loadChunk($FH,$sentenceNum,$alignMode)){ # returns 0 if eof or last sentence pair
564 0         0 for ($k=0;$k<@$alSetChunk;$k++){
565             # print "\nsentence pair $sentenceNum\n";
566 0         0 $ruleApplied=0;
567 0         0 $al = $$alSetChunk[$k];
568 0 0       0 if ($verbose >0){
569 0         0 print $sentenceNum."\n";
570 0         0 print $al->sourceSentence."\n";
571 0         0 print $al->targetSentence."\n";
572             }
573 0         0 $modAl = $al->clone();
574 0         0 ($lastAnchorSource,$lastAnchorTarget)=(0,0);
575 0         0 %sourcePerturbed=();
576 0         0 %targetPerturbed=();
577 0         0 $j = 1;
578             #detect "perturbations" in the anchor diagonal looping only over $j (to have less repeated zones). We can only miss those where $i is aligned only to NULL
579 0         0 while ($j<@{$al->{sourceAl}}){
  0         0  
580 0   0     0 while ( !$al->isAnchor($j,"source") && $j<(@{$al->{sourceAl}})){
  0         0  
581 0         0 $j++;
582             }
583 0 0       0 if ($j<=@{$al->{sourceAl}}){
  0         0  
584 0 0       0 if ($j==@{$al->{sourceAl}}){
  0         0  
585 0         0 ($newAnchorSource,$newAnchorTarget) = ($j,scalar(@{$al->{targetAl}}));
  0         0  
586             }else{
587 0         0 ($newAnchorSource,$newAnchorTarget) = ($j,$al->{sourceAl}[$j][0]);
588             }
589 0         0 $newPerturbationDetected=0;
590 0 0 0     0 if (($newAnchorSource-$lastAnchorSource)!=1 && !$sourcePerturbed{$lastAnchorSource+1}){
    0 0        
591 0         0 $newPerturbationDetected = 1;
592             } elsif (($newAnchorTarget-$lastAnchorTarget)!=1 && !$targetPerturbed{$lastAnchorTarget+1}){
593 0         0 $anchorsInTarget=1;
594 0         0 for ($i=$lastAnchorTarget+1;$i<$newAnchorTarget;$i++){
595 0 0       0 if (!$al->isAnchor($i,"target")){$anchorsInTarget=0}
  0         0  
596             }
597 0 0       0 if (!$anchorsInTarget){$newPerturbationDetected=1};
  0         0  
598             }
599 0 0       0 if ( $newPerturbationDetected ){
600 0         0 $countPertubs++;
601             # print "\n($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget)\n";
602            
603 0         0 $perturbation = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
604 0         0 $perturbationNoMod = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
605              
606             # PRINT PERTURBATION TO FILE
607             #if (exists($newFH->{source})){
608             # $newFH->{source}->print(" ".join(" ",@{$perturbation->{sourceWords}})." \n");
609             #}
610             #if (exists($newFH->{target})){
611             # $newFH->{target}->print(" ".join(" ",@{$perturbation->{targetWords}})." \n");
612             #}
613             #$perturbation->getUnion;
614             #$lines = $perturbation->writeToBlinker;
615             #foreach $line (@$lines){
616             # $newFH->{sourceToTarget}->print("$grSentPairNum $line\n");
617             #}
618             #$grSentPairNum++;
619              
620             #END PRINT PERTURBATION TO FILE
621 0 0       0 if ($selectSubgroups){
622 0         0 $perturbation->selectSubgroups($groupsCurrentSentence,$subGroupsCurrentSentence,$globals);
623             }else{
624 0 0       0 if (1==0){
625             # if ($ruleApplied=$perturbation->applyOneToMany_2()){
626             $countOneToMany++;
627 0         0 }elsif (($ruleApplied=$perturbation->applyGrouping($groupKeys,$subGroupKeys,$globals))>0){
628 0         0 $countGrouping++;
629             }else{
630 0         0 my $defaultActionGen = $globals->{defaultActionGeneral};
631 0         0 $perturbation->$defaultActionGen();
632 0 0       0 if ($ruleApplied==-1){
633 0         0 $countNoGroup++;
634             }else{
635 0         0 $perturbation->processNull();
636 0         0 $countElse++;
637             }
638             }
639 0         0 $perturbation->paste($modAl);
640             }
641             # print "\ns indices:",join (" ",keys %{$perturbation->{sourceIndices}}),"\n";
642             # print "t indices:",join (" ",keys %{$perturbation->{targetIndices}}),"\n";
643 0         0 foreach $ind (keys %{$perturbation->{sourceIndices}}){
  0         0  
644 0 0       0 if ($ind>0){
645 0         0 $sourcePerturbed{$ind+$perturbation->{zeroSource}}=1;
646             }
647             }
648 0         0 foreach $ind (keys %{$perturbation->{targetIndices}}){
  0         0  
649 0 0       0 if ($ind>0){
650 0         0 $targetPerturbed{$ind+$perturbation->{zeroTarget}}=1;
651             }
652             }
653             # print "s perturbed:",join (" ",keys %sourcePerturbed),"\n";
654             # print "t perturbed:",join (" ",keys %targetPerturbed),"\n";
655             } #if perturbation
656            
657 0         0 $anchors{"$newAnchorSource $newAnchorTarget"}=1;
658 0         0 ($lastAnchorSource,$lastAnchorTarget) = ($newAnchorSource,$newAnchorTarget);
659 0         0 $j++;
660             }
661             } #while j...
662 0 0 0     0 if ($newFormat eq "NAACL" && !$selectSubgroups){
663 0 0       0 if (exists($newFH->{source})){
664 0         0 $newFH->{source}->print(" ".join(" ",@{$modAl->{sourceWords}})." \n");
  0         0  
665             }
666 0 0       0 if (exists($newFH->{target})){
667 0         0 $newFH->{target}->print(" ".join(" ",@{$modAl->{targetWords}})." \n");
  0         0  
668             }
669 0         0 $al->intersect();
670 0         0 $lines = $modAl->writeToBlinker;
671 0         0 foreach $line (@$lines){
672 0         0 $newFH->{sourceToTarget}->print("$internalSentPairNum $line\n");
673             }
674             }
675 0 0       0 if (($internalSentPairNum % 1000)==0){print STDERR $internalSentPairNum}
  0 0       0  
  0         0  
676             elsif (($internalSentPairNum % 100)==0){print STDERR "."}
677 0         0 $sentenceNum++;
678 0         0 $internalSentPairNum++;
679 0 0       0 if ($verbose > 0){print "Candidates:\n";}
  0         0  
680 0 0       0 if ($selectSubgroups){
681 0         0 foreach $candidate (keys %$groupsCurrentSentence){
682 0 0       0 if ($verbose > 0){print "$candidate\n";}
  0         0  
683 0         0 $groups->{$candidate}=$groups->{$candidate}+1;
684             }
685 0         0 %$groupsCurrentSentence=();
686 0 0       0 if ($globals->{onlyGroups}==0){
687 0         0 foreach $candidate (keys %$subGroupsCurrentSentence){
688 0         0 $subGroups->{$candidate}=$subGroups->{$candidate}+1;
689             }
690 0         0 %$subGroupsCurrentSentence=();
691             }
692             }
693             }#for k<@alSetChunk
694             } #while alsetchunk
695 0         0 print STDERR "\n";
696 0 0       0 if ($selectSubgroups==0){
697 0         0 closeFiles($newFH,$newFormat);
698             }
699 0         0 closeFiles($FH,$alSet->{format});
700 0 0       0 if ($selectSubgroups){
701 0 0       0 if ($verbose>0){print "\ngroups:",scalar(keys(%$groups))," - subgroups:",scalar(keys(%$subGroups)),"\n";}
  0         0  
702 0 0       0 open(GROUPS, ">$ENV/groups") or die "File opening error:$!";;
703 0         0 while (($candidate,$count)=each(%$groups)){
704             # print "groups $count | $candidate\n";
705 0 0       0 if ($count >= $globals->{minPhraseFrequency}){
706 0         0 print GROUPS "$count | $candidate\n";
707             }
708             }
709 0 0       0 if ($globals->{onlyGroups}==0){
710 0 0       0 open(SUBGROUPS, ">$ENV/subGroups") or die "File opening error:$!";;
711 0         0 while (($candidate,$count)=each(%$subGroups)){
712             # print "SUBGROUPS $count | $candidate\n";
713 0 0       0 if ($count >= $globals->{minPhraseFrequency}){
714 0         0 print SUBGROUPS "$count | $candidate\n";
715             }
716             }
717             }
718             }else{
719 0         0 print STDERR "perturbations:$countPertubs (oneToMany:$countOneToMany grouped:$countGrouping not grouped:$countNoGroup others:$countElse)\n";
720 0         0 $alSet->{location}=$newLocation;
721 0         0 $alSet->{format}=$newFormat;
722 0         0 $alSet->{firstSentPair}=1;
723 0         0 $alSet->{lastSentPair}="eof";
724             }
725             }
726              
727             sub orderAsBilCorpus {
728 0     0 0 0 my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$verbose)=@_;
729 0 0       0 if (!defined($newFormat)){$newFormat="TALP"}
  0         0  
  0         0  
730             else {$newFormat = uc $newFormat}
731 0         0 $newLocation = readLocation($newLocation);
732 0         0 my $FH = $alSet->openFiles();
733 0         0 my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
734 0         0 my $inputSentPairNum=$alSet->{firstSentPair};
735 0         0 my $internalSentPairNum = 1;
736 0 0       0 if ($verbose >0){
737 0         0 select STDOUT; $| = 1; # enable autoflush (desactivate buffering)
  0         0  
738             }
739 0 0       0 open(CS,"<$corpSrc") || die "$corpSrc file opening error !";
740 0 0       0 open(CT,"<$corpTrg") || die "$corpTrg file opening error !";
741              
742             # LOAD BILINGUAL CORPUS IN HASH
743 0         0 my %newcorp;
744 0         0 my $cntCorp=0;
745 0         0 while (my $s=) {
746 0         0 chomp $s;
747 0         0 my $t=;
748 0         0 chomp $t;
749 0         0 $newcorp{"$s ||| $t"}=1;
750 0         0 $cntCorp++;
751             }
752 0 0       0 if ($verbose > 0){
753 0         0 print "Number of different sentence pairs in new corpus:".scalar(keys %newcorp)."\n";
754             }
755            
756             # PARSE AL SET
757 0         0 my $cntToFind=$cntCorp;
758 0         0 my %found;
759 0         0 while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"as-is")){ # returns 0 if eof or last sentence pair
760 0         0 for (my $i=0;$i<@$alSetChunk;$i++){
761 0         0 my $al = $$alSetChunk[$i];
762 0         0 my $pair=$al->sourceSentence." ||| ".$al->targetSentence;
763 0 0       0 if ($newcorp{$pair}){
764 0         0 $found{$pair}=$al;
765 0         0 $cntToFind--;
766             }
767             } #for
768 0 0       0 if ($cntToFind==0){
769 0 0       0 if ($verbose>0){print STDERR "Leaving loop of giza file at line:$inputSentPairNum\n";}
  0         0  
770 0         0 last;
771             }
772 0         0 $inputSentPairNum++;
773 0 0       0 if ($verbose>0){
774 0 0       0 if ($inputSentPairNum % 100000 ==0){print $inputSentPairNum;}
  0         0  
775 0 0       0 if ($inputSentPairNum % 10000 ==0){print ".";}
  0         0  
776             }
777             } #while
778 0 0       0 if ($verbose>0){print "\n";}
  0         0  
779             # REORDER AL SET
780 0         0 print "reordering Alignment set...\n";
781 0         0 seek CS,0,0; #go back to beginning of file
782 0         0 seek CT,0,0;
783 0         0 while (my $s=) {
784 0         0 chomp $s;
785 0         0 my $t=;
786 0         0 chomp $t;
787 0 0       0 if (exists($found{"$s ||| $t"})){
788 0         0 my $al= $found{"$s ||| $t"};
789 0         0 $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
790             }else{
791 0         0 die "ERROR: not found sentence pair $internalSentPairNum in Alignment Set\n";
792             }
793 0         0 $internalSentPairNum++;
794             }
795 0         0 closeFiles($newFH,$newFormat);
796 0         0 closeFiles($FH,$alSet->{format});
797 0         0 $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
798             }
799              
800             sub adaptToBilCorpus {
801 0     0 0 0 my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$restrictions,$verbose)=@_;
802 0         0 my $pdiff=$restrictions->{allowedPercentWordDiff};
803 0         0 my $mindiff=$restrictions->{minAllowedNumWordDiff};
804 0         0 my $maxdiff=$restrictions->{maxAllowedNumWordDiff};
805 0         0 my $nfirst=$restrictions->{numWordsConsideredFirst};
806 0         0 my $dumper = new Dumpvalue;
807              
808 0 0       0 if (!defined($newFormat)){$newFormat="TALP"}
  0         0  
  0         0  
809             else {$newFormat = uc $newFormat}
810 0         0 $newLocation = readLocation($newLocation);
811 0         0 my $FH = $alSet->openFiles();
812 0         0 my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
813 0         0 my $inputSentPairNum=$alSet->{firstSentPair};
814 0         0 my $internalSentPairNum = 1;
815 0         0 select STDOUT; $| = 1; # enable autoflush (desactivate buffering)
  0         0  
816 0 0       0 open(CS,"<$corpSrc") || die "$corpSrc file opening error !";
817 0 0       0 open(CT,"<$corpTrg") || die "$corpTrg file opening error !";
818              
819             # DETECT ALSET SENTENCES THAT ARE IN THE CORPUS
820 0         0 my %newcorp;
821 0         0 while (my $s=) {
822 0         0 chomp $s;
823 0         0 my $t=;
824 0         0 chomp $t;
825 0         0 $newcorp{"$s ||| $t"}=1;
826             }
827 0 0       0 if ($verbose > 0){
828 0         0 print "Number of different sentence pairs in new corpus:".scalar(keys %newcorp)."\n";
829             }
830 0         0 my $count=0;
831 0         0 while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){
832             # print $inputSentPairNum."\n";
833 0         0 for (my $i=0;$i<@$alSetChunk;$i++){
834 0         0 my $al = $$alSetChunk[$i];
835 0         0 my $s = $al->sourceSentence;
836 0         0 my $t = $al->targetSentence;
837 0         0 my @ws=split / /,$s;
838 0         0 my @wt=split / /,$t;
839 0         0 my ($nums,$numt)=(scalar(@ws),scalar(@wt));
840 0 0       0 if (!$newcorp{"$s ||| $t"}){
841             # DETECT CLOSEST SENTENCES IN NEW CORPUS AND MODIFY ALIGNMENT SET
842             # calculate values for length test (to, later, skip lcs calculation)
843 0         0 my $sAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($nums*$pdiff/100,$maxdiff),$mindiff);
844 0         0 my $tAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($numt*$pdiff/100,$maxdiff),$mindiff);
845 0         0 my $sMin=$nums-$sAllowedDiff;
846 0         0 my $sMax=$nums+$sAllowedDiff;
847 0         0 my $tMin=$numt-$tAllowedDiff;
848 0         0 my $tMax=$numt+$tAllowedDiff;
849 0 0       0 if ($verbose > 0){
850 0         0 print $s."\n";
851 0         0 print $t."\n";
852 0         0 print "ns:$nums nt:$numt allowed length diff: s:$sAllowedDiff t:$tAllowedDiff;\n\n";
853             }
854 0         0 my $bestLcs = 0;
855 0         0 my @bestSrc;
856             my @bestTrg;
857 0         0 my ($cntGoodLength,$cntPassedFirstLCS,$cnt2)=(0,0,0);
858             # parse new corpus
859 0         0 foreach my $pair (keys %newcorp){
860 0         0 my ($cs,$ct)=split / \|\|\| /,$pair;
861 0         0 my @wcs = split / /,$cs;
862 0         0 my @wct = split / /,$ct;
863 0         0 my ($numcs,$numct)=(scalar(@wcs),scalar(@wct));
864              
865 0 0       0 if ($verbose >2){print "ncs:$numcs nct:$numct\n";}
  0         0  
866 0 0       0 if ($verbose >2){print "CORPUS:\n$cs\n$ct\n";}
  0         0  
867             #length test:
868 0 0 0     0 if ($numcs<$sMin || $numcs>$sMax || $numct<$tMin || $numct>$tMax){
      0        
      0        
869             # skip lcs ratio calculation
870             }else{
871 0         0 $cntGoodLength++;
872             #calculate LCS, but first looking at the nfirst first words
873 0         0 my @fws;
874             my @fwt;
875 0         0 my @fwcs;
876 0         0 my @fwct;
877 0         0 for (my $i=0;$i<$nfirst;$i++){
878 0         0 push @fws,$ws[$i];
879 0         0 push @fwt,$wt[$i];
880 0         0 push @fwcs,$wcs[$i];
881 0         0 push @fwct,$wct[$i];
882             }
883 0         0 my $sLcs = Lingua::AlSetLib::LCS_ratio(\@fws,\@fwcs);
884 0         0 my $tLcs = Lingua::AlSetLib::LCS_ratio(\@fwt,\@fwct);
885             # first words LCS test:
886 0 0 0     0 if ($sLcs == 0 || $tLcs == 0){
887             # skip lcs ratio calculation
888             }else{
889 0         0 $cntPassedFirstLCS++;
890 0         0 my $sLcs = Lingua::AlSetLib::LCS_ratio(\@ws,\@wcs);
891 0         0 my $tLcs = Lingua::AlSetLib::LCS_ratio(\@wt,\@wct);
892 0 0       0 if ($verbose >1){
893 0         0 print "ncs:$numcs nct:$numct\n";
894 0         0 print "CORPUS:\n$cs\n$ct\n";
895 0         0 print "chars src lcsr: $sLcs\t trg lcsr:$tLcs\n";
896             }
897 0         0 my $lcs = $sLcs+$tLcs;
898 0 0       0 if ($lcs > $bestLcs){
    0          
899 0         0 $bestLcs = $lcs;
900 0         0 @bestSrc = ($cs);
901 0         0 @bestTrg = ($ct);
902             }elsif($lcs == $bestLcs){
903 0         0 push @bestSrc,$cs;
904 0         0 push @bestTrg,$ct;
905             }
906             }
907             } # if length test
908 0 0       0 if ($verbose >1){
909 0 0 0     0 if ($cnt2>0 && ($cnt2 % 100000)==0){print "$cnt2";}
  0 0 0     0  
  0         0  
910             elsif ($cnt2>0 && ($cnt2 % 1000)==0){print ".";}
911             }
912 0         0 $cnt2++;
913             } # for each sent pair in corpus
914 0         0 my $uniqBestSrc;
915             my $uniqBestTrg;
916 0         0 my $cntCharLevel=scalar(@bestSrc);
917 0 0       0 if ($cntCharLevel>1){
918 0         0 my $bestLcs = 0;
919             # calculate LCS at character level
920 0         0 for (my $i=0;$i<$cntCharLevel;$i++){
921 0         0 my $cs=$bestSrc[$i];
922 0         0 my $ct=$bestTrg[$i];
923 0         0 $cntCharLevel++;
924 0         0 my @chars = split //,$s;
925 0         0 my @charcs = split //,$cs;
926 0         0 my $sLcs = Lingua::AlSetLib::LCS_ratio(\@chars,\@charcs);
927 0         0 my @chart = split //,$t;
928 0         0 my @charct = split //,$ct;
929 0         0 my $tLcs = Lingua::AlSetLib::LCS_ratio(\@chart,\@charct);
930 0         0 my $lcs = $sLcs+$tLcs;
931 0 0       0 if ($lcs > $bestLcs){
932 0         0 $bestLcs = $lcs;
933 0         0 $uniqBestSrc = $cs;
934 0         0 $uniqBestTrg = $ct;
935             }
936             }
937             }else{
938 0         0 $uniqBestSrc = $bestSrc[0];
939 0         0 $uniqBestTrg = $bestTrg[0];
940             }
941 0 0       0 if ($cntGoodLength == 0){
942 0         0 print "WARNING: sentence pair $inputSentPairNum not found in corpus\n";
943             }else{
944 0 0       0 if ($verbose >0){
945 0         0 my $numCorp=scalar(keys %newcorp);
946 0         0 print "Passed length test: $cntGoodLength / $numCorp\n";
947 0         0 print "Passed first words LCS test: $cntPassedFirstLCS\n";
948 0 0       0 print "LCS calculated at character level: "; if ($cntCharLevel==1){print "0"}else{print $cntCharLevel}; print "\n";
  0         0  
  0         0  
  0         0  
  0         0  
949 0         0 print "\nbest lcsr: $bestLcs, best pair:\n$uniqBestSrc\n$uniqBestTrg\n";
950             }
951             # detect edits to pass from alset sent pair to corpus sent pair
952 0         0 my @bs=split / /,$uniqBestSrc;
953 0         0 my @diffs = Lingua::AlSetLib::diff( \@ws, \@bs );
954 0 0       0 if ($verbose>2){print $dumper->dumpValue(\@diffs);}
  0         0  
955              
956             # parse output of diff function
957 0         0 my @updatedPosi; #array: orig posis -> updated posis
958             my %reversePosi; #hash: updated posis -> orig posis
959 0         0 for (my $i=0;$i<=$nums;$i++){
960 0         0 $updatedPosi[$i]=$i;
961 0         0 $reversePosi{$i}=$i;
962             }
963              
964 0         0 foreach my $hunk (@diffs){
965 0         0 my @delPosi;
966             my @del;
967 0         0 my @addPosi;
968 0         0 my @add;
969 0         0 foreach my $change (@$hunk) {
970 0 0       0 if ($change->[0] eq '-'){
971 0         0 push @delPosi,$change->[1]+1;
972 0         0 push @del,$change->[2];
973             }else{
974 0         0 push @addPosi,$change->[1]+1;
975 0         0 push @add,$change->[2];
976             }
977             }
978             # del posis are relative to first array (@ws) => update posis
979             # add posis are relative to second array (@bs) => don't update posis
980 0         0 my $numDel=scalar(@delPosi);
981 0         0 my $numAdd=scalar(@addPosi);
982 0 0       0 if ($numDel==0){ #insertion
983 0         0 $al->splice("source",$addPosi[0],0,\@add);
984 0         0 print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
985             #update updatedPosi array
986 0         0 for (my $i=$reversePosi{"$addPosi[0]"};$i<=$nums;$i++){
987 0         0 $updatedPosi[$i]+=$numAdd;
988 0         0 $reversePosi{"$updatedPosi[$i]"}=$i;
989             }
990             }else{ # substitution or deletion
991 0         0 $al->splice("source",$updatedPosi[$delPosi[0]],$numDel,\@add);
992 0         0 print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
993             #update updatedPosi array
994 0         0 for (my $i=$delPosi[0]+$numDel;$i<=$nums;$i++){
995 0         0 $updatedPosi[$i]+=$numAdd-$numDel;
996 0         0 $reversePosi{"$updatedPosi[$i]"}=$i;
997             }
998             }
999             }
1000             #target
1001 0         0 my @bt=split / /,$uniqBestTrg;
1002 0         0 my @diffs = Lingua::AlSetLib::diff( \@wt, \@bt );
1003 0 0       0 if ($verbose>2){print $dumper->dumpValue(\@diffs);}
  0         0  
1004             # parse output of diff function
1005 0         0 @updatedPosi=();
1006 0         0 %reversePosi=();
1007 0         0 for (my $i=0;$i<=$numt;$i++){
1008 0         0 $updatedPosi[$i]=$i;
1009 0         0 $reversePosi{$i}=$i;
1010             }
1011 0         0 foreach my $hunk (@diffs){
1012 0         0 my @delPosi;
1013             my @del;
1014 0         0 my @add;
1015 0         0 my @addPosi;
1016 0         0 foreach my $change (@$hunk) {
1017 0 0       0 if ($change->[0] eq '-'){
1018 0         0 push @delPosi,$change->[1]+1;
1019 0         0 push @del,$change->[2];
1020             }else{
1021 0         0 push @addPosi,$change->[1]+1;
1022 0         0 push @add,$change->[2];
1023             }
1024             }
1025             #update updatedPosi array
1026 0         0 my $numDel=scalar(@delPosi);
1027 0         0 my $numAdd=scalar(@addPosi);
1028 0 0       0 if ($numDel==0){ #insertion
1029 0         0 $al->splice("target",$addPosi[0],0,\@add);
1030 0         0 print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
1031             #update updatedPosi array
1032 0         0 for (my $i=$reversePosi{"$addPosi[0]"};$i<=$numt;$i++){
1033 0         0 $updatedPosi[$i]+=$numAdd;
1034 0         0 $reversePosi{"$updatedPosi[$i]"}=$i;
1035             }
1036             }else{ # substitution or deletion
1037 0         0 $al->splice("target",$updatedPosi[$delPosi[0]],$numDel,\@add);
1038 0         0 print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
1039             #update updatedPosi array
1040 0         0 for (my $i=$delPosi[0]+$numDel;$i<=$numt;$i++){
1041 0         0 $updatedPosi[$i]+=$numAdd-$numDel;
1042 0         0 $reversePosi{"$updatedPosi[$i]"}=$i;
1043             }
1044             }
1045             }
1046 0 0       0 if ($verbose>0){print "--------------------------------------------------------------\n";}
  0         0  
1047             }
1048             }else{
1049             # sentence pair is in corpus: don't modify anything
1050             }
1051 0         0 $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1052 0         0 $count++;
1053             }
1054 0         0 $inputSentPairNum++;
1055 0         0 $internalSentPairNum++;
1056             } #while
1057 0         0 print "$count sentence pairs parsed in alignment set\n";
1058 0         0 closeFiles($newFH,$newFormat);
1059 0         0 closeFiles($FH,$alSet->{format});
1060 0         0 $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1061             }
1062              
1063             ######################################################################
1064             ### PRIVATE SUBS
1065             ######################################################################
1066              
1067             sub readLocation{
1068 2     2 0 4 my $location = shift;
1069            
1070 2 100       6 if (!ref($location)){ #if it is a path, put it in a location hash
1071 1         4 $location = {"sourceToTarget"=>$location}
1072             }
1073 2         5 return $location;
1074             }
1075              
1076             sub setRange {
1077 2     2 0 4 my ($alSet,$range) = @_;
1078            
1079 2         7 my @limits = split /-/, $range;
1080 2         3 my $numLimits = scalar(@limits);
1081 2 50 33     16 if ($numLimits == 0 || $numLimits >2){
    100          
1082 0         0 die "Invalid Range:$range\n";
1083             }elsif ($numLimits == 1){
1084 1         3 $limits[1]="";
1085             }
1086 2         8 $limits[0] =~ s/^\s+|\s+$//g;
1087 2         6 $limits[1] =~ s/^\s+|\s+$//g;
1088 2 50 33     20 if ($limits[0] !~ /\d+/ || $limits[0] == 0){
1089 0         0 $alSet->{firstSentPair}="1";
1090             }else{
1091 2         6 $alSet->{firstSentPair}=$limits[0];
1092             }
1093 2 100 66     11 if ($limits[1] !~ /\d+/ || $limits[1] == 0){
1094 1         3 $alSet->{lastSentPair}="eof";
1095             }else{
1096 1         5 $alSet->{lastSentPair}=$limits[1];
1097             }
1098             }
1099              
1100             #for future ease we save detailed infos contained in the source sample path
1101             #input: sourceToTarget dir (not optional), targetToSource dir (if exists), source path (optional) and target path (if necessary)
1102             #output: target (if not specified in input), sampleNum (sample number)
1103             sub completeBlinkerLocation{
1104 1     1 0 2 my $refToLocation = shift;
1105 1         4 my ($sourceLang,$targetLang);
1106              
1107 1 50       5 if ($refToLocation->{source}){
1108 0         0 my ($sourceDir,$sourceFileName)=split /\/([^\/]+)$/,$refToLocation->{source};
1109 0 0       0 if ($sourceFileName =~ /^(EN|FR)\.sample.\d+$/){
1110             #extract the sample number and target file:
1111 0         0 my ($sourceLang,$nothing,$sampleNum) = split /\./,$sourceFileName;
1112 0         0 $refToLocation->{sampleNum} = $sampleNum;
1113             }
1114             }
1115 1 50       4 if (!$refToLocation->{sampleNum}){
1116 1         4 $refToLocation->{sampleNum} = 1;
1117             }
1118             }
1119              
1120             # open (for read or write) the files contained in a "location" hash (ex. at the {location} key of the alignment set hash)
1121             # if opens for write needs old location hash to check you won't delete the old format files
1122             # returns a ref to a hash containing the filehandle variables (hash with same keys as "location" except for Blinker format)
1123             sub openLocation {
1124 2     2 0 4 my ($location,$format,$openMode,$oldLocation) = @_; #oldLocation: optional parameter
1125 2         4 my %FH;
1126              
1127 2 50       6 if ($openMode eq ">"){
1128 0 0       0 if ($format eq "BLINKER"){
1129 0         0 completeBlinkerLocation($location);
1130             }
1131             # check that your new files are different to prevent from deleting the old ones
1132 0         0 my %oldFiles = reverse %$oldLocation;
1133 0         0 my ($key,$newFile);
1134 0         0 while (($key, $newFile)=each %$location){
1135 0 0 0     0 if ($oldFiles{$newFile} && $key ne "sampleNum"){
1136 0         0 die "Convert function: you are opening for write one of the old format file: $newFile\n";
1137             }
1138             }
1139             #end of check
1140            
1141             # create directory structure where to create the file/directory if it doesn't exist, create it
1142 0         0 my $type;
1143 0 0       0 if ($format eq "BLINKER"){$type = "dir"}
  0         0  
  0         0  
1144             else {$type = "file"}
1145 0         0 createDirStructure($location->{sourceToTarget},$type);
1146 0 0       0 if ($location->{targetToSource}){
1147 0         0 createDirStructure($location->{targetToSource},$type);
1148             }
1149 0 0       0 if ($location->{source}){
1150 0         0 createDirStructure($location->{source},"file");
1151             }
1152 0 0       0 if ($location->{target}){
1153 0         0 createDirStructure($location->{target},"file");
1154             }
1155             #end create directory structure
1156             }
1157            
1158 2 50 66     24 if ($format eq "GIZA"){
    100          
    50          
1159 0 0       0 $FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "GIZA file (".$location->{sourceToTarget}.") opening error:$!";
1160 0 0       0 if ($location->{targetToSource}){
1161 0 0       0 $FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or die "GIZA file (".$location->{targetToSource}.") opening error:$!";
1162             }
1163             } elsif ($format eq "NAACL" || $format eq "TALP"){
1164 1 50       4 if ($location->{source}){
1165 1 50       13 $FH{source} = IO::File->new($openMode.$location->{source}) or die "Source file (".$location->{source}.") opening error:$!";
1166             }
1167 1 50       109 if ($location->{target}){
1168 1 50       6 $FH{target} = IO::File->new($openMode.$location->{target}) or die "Target file (".$location->{target}.") opening error:$!";
1169             }
1170 1 50       52 $FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "Alignment file (".$location->{sourceToTarget}.") opening error:$!";
1171 1 50       56 if ($location->{targetToSource}){
1172 0 0       0 $FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or die "Alignment file (".$location->{targetToSource}.") opening error:$!";
1173             }
1174             } elsif ($format eq "BLINKER"){
1175 1 50       3 if ($location->{source}){
1176 0 0       0 $FH{source} = IO::File->new($openMode.$location->{source}) or die "BLINKER source file (".$location->{source}.") opening error:$!";
1177             }
1178 1 50       6 if ($location->{target}){
1179 0 0       0 $FH{target} = IO::File->new($openMode.$location->{target}) or die "BLINKER source file (".$location->{target}.") opening error:$!";
1180             }
1181             }
1182 2         10 return (\%FH);
1183             }
1184              
1185              
1186             # if you want to create a file of path "directory_structure/file", makes "directory_structure" if necessary.
1187             # if you want to create a directory of path "directory_structure", makes it if it doesn't exist
1188             # type is "dir" (if you want to create a directory) or "file" (a file)
1189             sub createDirStructure {
1190 0     0 0 0 my ($path,$type)=@_;
1191            
1192 0 0       0 if ($type eq "dir"){
    0          
1193 0 0 0     0 unless(-e $path && -d _){
1194 0         0 system('mkdir -p '.$path);
1195             }
1196             }elsif ($type eq "file"){
1197 0         0 $path =~ s/\/$//;
1198 0 0       0 if ($path =~ /\//){
1199 0         0 my ($dir,$file)=split /\/[^\/]+$/,$path;
1200 0 0       0 unless (-e $dir){
1201 0         0 system('mkdir -p '.$dir);
1202             }
1203             }
1204             }
1205             }
1206              
1207             # open files of an alignment set for read and go to first sentence pair
1208             sub openFiles {
1209 2     2 0 3 my $alSet = shift;
1210 2         4 my %FH = %{openLocation($alSet->{location},$alSet->{format},"<")};
  2         9  
1211 2         5 my $fhPos;
1212             my $lineNb;
1213              
1214             # go to first Sentence pair:
1215 2 50       18 if ($alSet->{format} eq "TALP"){
    50          
    100          
    50          
1216 0         0 for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
1217 0 0       0 if ($FH{source}){
1218 0         0 $FH{source}->getline();
1219             }
1220 0 0       0 if ($FH{target}){
1221 0         0 $FH{target}->getline();
1222             }
1223 0 0       0 if ($FH{sourceToTarget}){
1224 0         0 $FH{sourceToTarget}->getline();
1225             }
1226 0 0       0 if ($FH{targetToSource}){
1227 0         0 $FH{targetToSource}->getline();
1228             }
1229             }
1230             }elsif ($alSet->{format} eq "GIZA"){
1231 0         0 for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
1232 0         0 $FH{sourceToTarget}->getline();
1233 0         0 $FH{sourceToTarget}->getline();
1234 0         0 $FH{sourceToTarget}->getline();
1235 0 0       0 if ($FH{targetToSource}){
1236 0         0 $FH{targetToSource}->getline();
1237 0         0 $FH{targetToSource}->getline();
1238 0         0 $FH{targetToSource}->getline();
1239             }
1240             }
1241             } elsif ($alSet->{format} eq "NAACL"){
1242 1         9 for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1243 0 0       0 if ($FH{source}){
1244 0         0 $FH{source}->getline();
1245             }
1246 0 0       0 if ($FH{target}){
1247 0         0 $FH{target}->getline();
1248             }
1249 0         0 $fhPos = $FH{sourceToTarget}->getpos;
1250 0   0     0 while ($FH{sourceToTarget}->getline() !~ m/^0*$alSet->{firstSentPair} .*/ && !$FH{sourceToTarget}->eof()) {
1251 0         0 $fhPos = $FH{sourceToTarget}->getpos;
1252             }
1253 0 0       0 if ($FH{sourceToTarget}->eof()){
1254 0         0 die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{sourceToTarget};
1255             }
1256 0         0 $FH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1257 0 0       0 if ($FH{targetToSource}){
1258 0         0 $fhPos = $FH{targetToSource}->getpos;
1259 0   0     0 while ($FH{targetToSource}->getline() !~ m/^0*$alSet->{firstSentPair} .*/ && !$FH{targetToSource}->eof()) {
1260 0         0 $fhPos = $FH{targetToSource}->getpos;
1261             }
1262 0 0       0 if ($FH{targetToSource}->eof()){
1263 0         0 die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{targetToSource};
1264             }
1265 0         0 $FH{targetToSource}->setpos($fhPos);
1266             }
1267             }
1268             } elsif ($alSet->{format} eq "BLINKER"){
1269 1 50       4 if ($FH{source}){
1270 0         0 for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1271 0         0 $FH{source}->getline();
1272             }
1273             }
1274 1 50       3 if ($FH{target}){
1275 0         0 for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1276 0         0 $FH{target}->getline();
1277             }
1278             }
1279             }
1280 2         5 return (\%FH);
1281             }
1282              
1283             # close the files contained in the hash at the {location} key of the alignment set hash
1284             sub closeFiles {
1285 2     2 0 5 my ($FH,$format) = @_;
1286            
1287 2 50 66     20 if ($format eq "GIZA"){
    100          
    50          
1288 0         0 $FH->{sourceToTarget}->close();
1289 0 0       0 if ($$FH{targetToSource}){
1290 0         0 $FH->{targetToSource}->close();
1291             }
1292             } elsif ($format eq "NAACL" || $format eq "TALP"){
1293 1 50       6 if ($FH->{source}){
1294 1         13 $FH->{source}->close();
1295             }
1296 1 50       24 if ($FH->{target}){
1297 1         5 $FH->{target}->close();
1298             }
1299 1 50       20 if ($FH->{sourceToTarget}){
1300 1         4 $FH->{sourceToTarget}->close();
1301             }
1302 1 50       17 if ($FH->{targetToSource}){
1303 0         0 $FH->{targetToSource}->close();
1304             }
1305             } elsif ($format eq "BLINKER"){
1306 1 50       3 if ($FH->{source}){
1307 0         0 $FH->{source}->close();
1308             }
1309 1 50       3 if ($FH->{target}){
1310 0         0 $FH->{target}->close();
1311             }
1312             }
1313             }
1314              
1315             # convert a chunk of alignment set file to an array of references to simple (1 sentence) alignment objects
1316             # returns 0 if the file is at eof
1317             sub loadChunk {
1318 22     22 0 38 my ($alSet,$alFH,$sentPairNum,$alignMode) = @_;
1319 22         25 my ($sourceString,$targetString,$alString,$reverseAlString);
1320 22         27 my $st_alignments=[];
1321 22         32 my $ts_alignments=[];
1322 22         28 my $al;
1323             my $theEnd;
1324            
1325 22 50 33     207 if (!defined($alignMode) || $alignMode =~ /^as.?is$/i){
    50          
    50          
1326 0         0 $alignMode = "as-is";
1327             }elsif ($alignMode =~ /^null.?align$/i){
1328 0         0 $alignMode = "null-align";
1329             }elsif ($alignMode =~ /^no.?null.?align$/i){
1330 22         35 $alignMode = "no-null-align";
1331             }else{
1332 0         0 die 'Incorrect alignment mode. Correct modes are "as-is","null-align" or "no-null-align".'."\n";
1333             }
1334 22 50       97 if ($alSet->{format} eq "TALP"){
    50          
    100          
    50          
1335 0 0       0 if ($alSet->{lastSentPair} eq "eof"){
1336 0         0 $theEnd = $$alFH{sourceToTarget}->eof();
1337             }else{
1338 0   0     0 $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
1339             }
1340 0 0       0 if ($theEnd){
1341 0         0 return 0;
1342             }else{
1343 0 0       0 if ($alFH->{source}){
1344 0         0 $sourceString = $alFH->{source}->getline();
1345             }
1346 0 0       0 if ($alFH->{target}){
1347 0         0 $targetString = $alFH->{target}->getline();
1348             }
1349 0 0       0 if ($alFH->{sourceToTarget}){
1350 0         0 $alString = $alFH->{sourceToTarget}->getline();
1351             }
1352 0 0       0 if ($alFH->{targetToSource}){
1353 0         0 $reverseAlString = $alFH->{targetToSource}->getline();
1354             }
1355 0         0 $al = Lingua::Alignment->new;
1356 0         0 $al->loadFromTalp($alString,$reverseAlString,$sourceString,$targetString);
1357             }
1358             }elsif ($alSet->{format} eq "GIZA"){
1359 0 0       0 if ($alSet->{lastSentPair} eq "eof"){
1360 0         0 $theEnd = $$alFH{sourceToTarget}->eof();
1361             }else{
1362 0   0     0 $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
1363             }
1364 0 0       0 if ($theEnd){
1365 0         0 return 0;
1366             }else{
1367 0         0 $$alFH{sourceToTarget}->getline();
1368 0         0 $targetString = $$alFH{sourceToTarget}->getline();
1369 0         0 $alString = $$alFH{sourceToTarget}->getline();
1370 0 0       0 if ($$alFH{targetToSource}){
1371 0         0 $$alFH{targetToSource}->getline();
1372 0         0 $$alFH{targetToSource}->getline();
1373 0         0 $reverseAlString = $$alFH{targetToSource}->getline();
1374             }
1375 0         0 $al = Lingua::Alignment->new;
1376 0         0 $al->loadFromGiza($alString,$targetString,$reverseAlString);
1377             }
1378             } elsif ($alSet->{format} eq "NAACL"){
1379 11         11 my $fhPos;
1380 11 50       21 if ($alSet->{lastSentPair} eq "eof"){
1381 0         0 $theEnd = $$alFH{sourceToTarget}->eof();
1382             }else{
1383 11   66     39 $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
1384             }
1385 11 100       174 if ($theEnd){
1386 1         7 return 0;
1387             }else{
1388 10 50       25 if ($$alFH{source}){
1389 10         281 $sourceString = $$alFH{source}->getline();
1390             #strip tags and memorize snum:
1391 10         332 $sourceString =~ s/(.*)<\/s>/$1/;
1392             }
1393 10 50       29 if ($$alFH{target}){
1394 10         218 $targetString = $$alFH{target}->getline();
1395             #strip tags and memorize snum:
1396 10         308 $targetString =~ s/(.*)<\/s>/$2/;
1397             }
1398 10         43 $fhPos = $$alFH{sourceToTarget}->getpos;
1399 10         211 $alString = $$alFH{sourceToTarget}->getline();
1400 10         264 my ($num,$theRest)=split " ",$alString,2;
1401 10 50       27 if ($num==$sentPairNum){ #skip if there is no link for this sentence pair
1402 10         30 $fhPos = $$alFH{sourceToTarget}->getpos;
1403 10         20 push @$st_alignments,$theRest;
1404 10         211 while ($$alFH{sourceToTarget}->getline() =~ m/^$sentPairNum (.*)$/) {
1405 58         1919 push @$st_alignments,$1;
1406 58         1408 $fhPos = $$alFH{sourceToTarget}->getpos;
1407             }
1408             }
1409 10         384 $$alFH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1410 10 50       32 if ($$alFH{targetToSource}){
1411 0         0 $fhPos = $$alFH{targetToSource}->getpos;
1412 0         0 $alString = $$alFH{targetToSource}->getline();
1413 0         0 my ($num,$theRest)=split " ",$alString,2;
1414 0 0       0 if ($num==$sentPairNum){ #skip if there is no link for this sentence pair
1415 0         0 $fhPos = $$alFH{targetToSource}->getpos;
1416 0         0 push @$ts_alignments,$theRest;
1417 0         0 while ($$alFH{targetToSource}->getline() =~ m/^$sentPairNum (.*)$/) {
1418 0         0 push @$ts_alignments,$1;
1419 0         0 $fhPos = $$alFH{targetToSource}->getpos;
1420             }
1421             }
1422 0         0 $$alFH{targetToSource}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1423             }
1424 10         36 $al = Lingua::Alignment->new;
1425 10         30 $al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);
1426             }
1427             } elsif ($alSet->{format} eq "BLINKER"){
1428 11 50       21 if ($alSet->{lastSentPair} eq "eof"){
1429 11         485 $theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1430             }else{
1431 0   0     0 $theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1)) || $sentPairNum > $alSet->{lastSentPair};
1432             }
1433 11 100       22 if ($theEnd){
1434 1         4 return 0;
1435             }else{
1436 10 50       24 if ($alFH->{source}){
1437 0         0 $sourceString = $alFH->{source}->getline();
1438             }
1439 10 50       21 if ($alFH->{target}){
1440 0         0 $targetString = $alFH->{target}->getline();
1441             }
1442 10         346 open(AL,"< ".$alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1443 10         241 @$st_alignments = ;
1444 10         103 close(AL);
1445 10 50       31 if ($alSet->{location}->{targetToSource}){
1446 0         0 open(AL,"< ".$alSet->{location}->{targetToSource}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1447 0         0 @$ts_alignments = ;
1448 0         0 close(AL);
1449             }
1450 10         44 $al = Lingua::Alignment->new;
1451 10         35 $al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);
1452             }
1453             }
1454 20 50       68 if ($alignMode eq "null-align"){
    50          
1455 0         0 $al->forceNullAlign();
1456             }elsif ($alignMode eq "no-null-align"){
1457 20         55 $al->forceNoNullAlign();
1458             }
1459 20         110 return [$al];
1460             }
1461              
1462              
1463             sub updateObject {
1464 0     0 0 0 my ($alSet,$newFormat,$newLocation,$lastSentPairNum)=@_;
1465 0         0 $alSet->{location}->{sourceToTarget}=$newLocation->{sourceToTarget};
1466 0         0 $alSet->{location}->{targetToSource}=$newLocation->{targetToSource};
1467 0 0       0 if ($newLocation->{source}){
1468 0         0 $alSet->{location}->{source}=$newLocation->{source};
1469             }else{
1470 0 0 0     0 if ($alSet->{firstSentPair} != 1 || $alSet->{format} ne $newFormat){
1471             # in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
1472 0         0 delete($alSet->{location}->{source});
1473             # warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the source words file",
1474             # " didn't correspond any more to that of the alignment file. So the 'source' entry has been removed from the location hash.";
1475             }
1476             }
1477 0 0       0 if ($newLocation->{target}){
1478 0         0 $alSet->{location}->{target}=$newLocation->{target};
1479             }else{
1480 0 0 0     0 if ($alSet->{firstSentPair} != 1 || $alSet->{format} ne $newFormat){
1481             # in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
1482 0         0 delete($alSet->{location}->{target});
1483             # warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the target words file ",
1484             # "didn't correspond any more to that of the alignment file. So the 'target' entry has been removed from the location hash.";
1485             }
1486             }
1487 0         0 $alSet->{format}=$newFormat;
1488 0 0       0 if ($newFormat eq "BLINKER"){
    0          
1489 0         0 $alSet->{location}->{sampleNum}=$newLocation->{sampleNum};
1490             }elsif(exists($alSet->{location}->{sampleNum})){
1491 0         0 delete($alSet->{location}->{sampleNum});
1492             }
1493 0         0 $alSet->{firstSentPair}=1;
1494 0         0 $alSet->{lastSentPair}=$lastSentPairNum;
1495             }
1496              
1497             # returns the alignment set, with a unique new file set that has the required location,format and range values.
1498             # TO DO: conversion to Giza++ format
1499             sub convert {
1500 0     0 0 0 my ($alSet,$newLocation,$newFormat,$alignMode,$AlignmentSub)=@_;
1501 0 0       0 if (!defined($newFormat)){$newFormat="TALP"}
  0         0  
  0         0  
1502             else {$newFormat = uc $newFormat}
1503 0         0 $newLocation = readLocation($newLocation);
1504 0         0 my $FH = $alSet->openFiles();
1505 0         0 my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
1506 0         0 my ($i,$al,$alSetChunk,$line,$lines);
1507 0         0 my $inputSentPairNum=$alSet->{firstSentPair};
1508 0         0 my $internalSentPairNum = 1;
1509 0         0 while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
1510             # print $inputSentPairNum."\n";
1511 0         0 for ($i=0;$i<@$alSetChunk;$i++){
1512 0         0 $al = $$alSetChunk[$i];
1513 0 0       0 if (defined($AlignmentSub)){
1514             #look if $AlignmentSub is a ref to an Array or a subroutine
1515 0 0       0 if (ref($AlignmentSub) eq "ARRAY"){
1516 0         0 my ($sub,@params) = @$AlignmentSub;
1517 0         0 $al->$sub(@params);
1518             }else{
1519 0         0 $al->$AlignmentSub();
1520             }
1521             }
1522 0         0 $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1523             } #for
1524 0         0 $inputSentPairNum++;
1525 0         0 $internalSentPairNum++;
1526             } #while
1527 0         0 closeFiles($newFH,$newFormat);
1528 0         0 closeFiles($FH,$alSet->{format});
1529 0         0 $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1530             }
1531              
1532              
1533             # returns the alignment set, with only manyToMany links (source and target words file don't change)
1534             # at the moment, only works for sourceToTarget alignment (sourceAl)
1535             sub printManyToMany {
1536 0     0 0 0 my ($alSet,$newLocation,$newFormat)=@_;
1537 0 0       0 if (!defined($newFormat)){$newFormat="TALP"}
  0         0  
  0         0  
1538             else {$newFormat = uc $newFormat}
1539 0         0 $newLocation = readLocation($newLocation);
1540 0         0 my $dumper=new Dumpvalue;
1541 0         0 my $FH = $alSet->openFiles();
1542 0         0 my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
1543 0         0 my ($i,$al,$alSetChunk,$line,$lines);
1544 0         0 my $inputSentPairNum=$alSet->{firstSentPair};
1545 0         0 my $internalSentPairNum = 1;
1546            
1547 0         0 while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"no-null-align")){ # returns 0 if eof or last sentence pair
1548 0         0 for ($i=0;$i<@$alSetChunk;$i++){
1549 0         0 $al = $$alSetChunk[$i];
1550 0         0 my $clone = $al->clone;
1551             # look for manyToMany links
1552 0         0 my $clusters = $al->getAlClusters;
1553 0         0 $al->{sourceAl}=[];
1554             # print $dumper->dumpValue($clusters);
1555 0         0 for (my $c=0;$c<@$clusters;$c++){
1556 0 0 0     0 if ( @{$clusters->[$c]{source}}>1 || @{$clusters->[$c]{target}}>1 ){
  0         0  
  0         0  
1557             # this is a many to many alignment
1558 0         0 foreach my $j (@{$clusters->[$c]{source}}){
  0         0  
1559 0         0 foreach my $k (@{$clusters->[$c]{target}}){
  0         0  
1560 0 0       0 if ($clone->isIn("sourceAl",$j,$k)){
1561 0         0 push @{$al->{sourceAl}[$j]},$k;
  0         0  
1562             }
1563             }
1564             }
1565             }
1566             }
1567 0         0 $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1568             } #for
1569 0         0 $inputSentPairNum++;
1570 0         0 $internalSentPairNum++;
1571             } #while
1572 0         0 closeFiles($newFH,$newFormat);
1573 0         0 closeFiles($FH,$alSet->{format});
1574 0         0 $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1575             }
1576              
1577              
1578             # identifies a link as sure or possible
1579             # input: a Naacl-file line containing the link and refs to sure and possible hashes
1580             # action: add to the relevant hash a key corresponding to this link
1581             sub identifySurePossible{
1582 149     149 0 241 my ($line,$sure,$possible)=@_;
1583 149         126 my @components;
1584             my $alignment;
1585            
1586             # print "line:$line\n";
1587            
1588             #code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
1589             # get all line components: format should be
1590             # sentence_no position_L1 position_L2 [S|P] [confidence]
1591 149         376 @components = split /\s+/, $line;
1592            
1593 149 50       411 if(scalar(@components) < 3) {
1594 0         0 print STDERR "Incorrect format in answer file\n";
1595 0         0 exit;
1596             }
1597            
1598 149         262 $alignment = $components[0]." ".$components[1]." ".$components[2];
1599            
1600             # identify the S[ure] alignments
1601 149 0 0     279 if( scalar (@components) == 3 || (scalar (@components) == 4 && ($components[3] =~ /^[\d\.]+$/ || $components[3] eq 'S')) ||
      0        
      33        
      0        
      0        
      0        
1602             (scalar (@components) == 5 && ($components[3] eq 'S' || $components[4] eq 'S'))) {
1603 149         312 $sure->{$alignment} = 1;
1604             }
1605            
1606             # identify the P[robable] alignments
1607 149 0 33     812 if( (scalar (@components) == 4 && $components[3] eq 'P') || (scalar (@components) == 5 &&
      0        
      33        
      33        
1608             ($components[3] eq 'P' || $components[4] eq 'P'))) {
1609 0           $possible->{$alignment} = 1;
1610             }
1611             }
1612              
1613             1;
1614             __END__