File Coverage

blib/lib/Lingua/PTD.pm

Criterion	Covered	Total	%
statement	27	29	93.1
branch			n/a
condition			n/a
subroutine	10	10	100.0
pod			n/a
total	37	39	94.8

line	stmt	sub	time	code
1				package Lingua::PTD;
2				$Lingua::PTD::VERSION = '1.15';
3	1	1	20512	use 5.010;
	1		4
4
5	1	1	803	use parent 'Exporter';
	1		292
	1		5
6				our @EXPORT = 'toentry';
7				our @EXPORT_OK = qw/bws ucts/;
8	1	1	70	use warnings;
	1		6
	1		27
9	1	1	5	use strict;
	1		2
	1		19
10
11	1	1	843	use utf8;
	1		10
	1		5
12
13	1	1	730	use Unicode::CaseFold;
	1		883
	1		62
14
15	1	1	921	use Time::HiRes;
	1		1426
	1		4
16	1	1	659	use Lingua::PTD::Dumper;
	1		2
	1		49
17	1	1	512	use Lingua::PTD::BzDmp;
	1		3
	1		51
18	1	1	536	use Lingua::PTD::XzDmp;
	0
	0
19				use Lingua::PTD::SQLite;
20				use Lingua::PTD::TSV;
21				use Lingua::PTD::StarDict;
22
23				=encoding UTF-8
24
25				=head1 NAME
26
27				Lingua::PTD - Module to handle PTD files in Dumper Format
28
29				=head1 SYNOPSIS
30
31				use Lingua::PTD;
32
33				$ptd = Lingua::PTD->new( $ptd_file );
34
35				=head1 DESCRIPTION
36
37				PTD files in Perl Dumper format are simple hashes references. But they
38				use a specific structure, and this module provides a simple interface to
39				manipulate it.
40
41				=head2 C
42
43				The C constructor returns a new Lingua::PTD object. This constructor
44				receives a PTD file in dumper format.
45
46				my $ptd = Lingua::PTD->new( $ptd_file );
47
48				If the filename matches with C<< /dmp.bz2$/ >> (that is, ends in
49				dmp.bz2) it is considered to be a bzip2 file and will be decompressed
50				in the fly.
51
52				If it ends in C<<.sqlite>>, then it is supposed to contain an SQLite
53				file with the dictionary (with Lingua::PTD standard schema!).
54
55				Extra arguments are a flatenned hash with configuration
56				variables. Following options are recognized:
57
58				=over 4
59
60				=item C
61
62				Sets verbosity.
63
64				my $ptd = Lingua::PTD->new( $ptd_file, verbose => 1 );
65
66				=back
67
68				=cut
69
70				sub new {
71				my ($class, $filename, %ops) = @_;
72				die "Can't find ptd [$filename]\n" unless -f $filename;
73
74				my $self;
75				# switch
76				$self = Lingua::PTD::Dumper->new($filename) if $filename =~ /\.dmp$/i;
77				$self = Lingua::PTD::BzDmp ->new($filename) if $filename =~ /\.dmp\.bz2$/i;
78				$self = Lingua::PTD::XzDmp ->new($filename) if $filename =~ /\.dmp\.xz$/i;
79				$self = Lingua::PTD::SQLite->new($filename) if $filename =~ /\.sqlite$/i;
80
81				# default
82				$self = Lingua::PTD::Dumper->new($filename) unless $self;
83
84				$self->_calculate_sizes() unless $self->size; # in case it is already calculated
85
86				# configuration variables
87				$self->verbose($ops{verbose}) if exists $ops{verbose};
88
89				return $self;
90				}
91
92				=head2 C
93
94				With no arguments returns if the methods are configured to use verbose
95				mode, or not. If an argument is supplied, it is interpreted as a
96				boolean value, and sets methods verbosity.
97
98				$ptd->verbose(1);
99
100				=cut
101
102				sub verbose {
103				my $self = shift;
104				if (defined($_[0])) {
105				$self->{' verbose '} = shift
106				} else {
107				$self->{' verbose '} \|\| 0
108				}
109				}
110
111				=head2 C
112
113				The C method is used to write the PTD in its own format, but
114				taking care to sort words lexicographically, and sorting translations by
115				their probability (starting with higher probabilities).
116
117				The format is Perl code, and thus, can be used independetly of this module.
118
119				$ptd->dump;
120
121				Note that the C method writes to the Standard Output stream.
122
123				=cut
124
125				sub dump {
126				my $self = shift;
127
128				binmode STDOUT, ":utf8";
129				print "use utf8;\n";
130				print "\$a = {\n";
131				$self->downtr(
132				sub {
133				my ($w,$c,%t) = @_;
134				printf " '%s' => {\n", _protect_quotes($w);
135				printf " count => %d,\n", $c;
136				printf " trans => {\n";
137				for my $t (sort { $t{$b} <=> $t{$a} } keys %t) {
138				printf " '%s' => %.6f,\n", _protect_quotes($t), $t{$t};
139				}
140				printf " }\n";
141				printf " },\n";
142				},
143				sorted => 1,
144				task => 'dump',
145				);
146				print "}\n";
147				}
148
149				=head2 C
150
151				The C method returns an array (not a reference) to the list of
152				words of the dictionary: its domain. Pass a true value as argument and
153				the list is returned sorted.
154
155				my @words = $ptd->words;
156
157				=cut
158
159				sub words {
160				my $self = shift;
161				my $sorted = shift;
162				if ($sorted) {
163				return sort grep {!/^ /} keys %$self;
164				} else {
165				return grep {!/^ /} keys %$self;
166				}
167				}
168
169				=head2 C
170
171				The C method receives a word, and returns the list of its possible
172				translations.
173
174				my @translations = $ptd->trans( "dog" );
175
176				=cut
177
178				sub trans {
179				my ($self, $word, $trans) = @_;
180				return () unless exists $self->{$word};
181				if ($trans) {
182				return (exists($self->{$word}{trans}{$trans}))?1:0;
183				} else {
184				return keys %{$self->{$word}{trans}};
185				}
186				}
187
188
189				=head2 C
190
191				Checks if a word is in a dictionary
192
193				=cut
194
195				sub exists {
196				my ($self, $word ) = @_;
197				return exists $self->{$word};
198				}
199
200				=head2 C
201
202				The C method receives a word, and returns an hash where
203				keys are the its possible translations, and values the corresponding
204				translation probabilities.
205
206				my %trans = $ptd->transHash( "dog" );
207
208				Returns the empty hash if the word does not exist.
209
210				=cut
211
212				sub transHash {
213				my ($self, $word) = @_;
214				my %h = ();
215				for my $t ($self->trans($word)) {
216				$h{$t} = $self->prob($word, $t);
217				}
218				return %h;
219				}
220
221				=head2 C
222
223				The C method receives a word and a translation, and returns the
224				probability of that word being translated that way.
225
226				my $probability = $ptd->prob("cat", "gato");
227
228				=cut
229
230				sub prob {
231				my ($self, $word, $trad) = @_;
232				return 0 unless exists $self->{$word}{trans}{$trad};
233				return $self->{$word}{trans}{$trad};
234				}
235
236				=head2 C
237
238				Returns the total number of words from the source-corpus that originated
239				the PTD. Basically, the sum of the C attribute for all words.
240
241				my $size = $ptd->size;
242
243				=cut
244
245				sub size {
246				return $_[0]->{' size '}; # space is relevant
247				}
248
249				=head2 C
250
251				The C method receives a word and returns the occurrence count for
252				that word.
253
254				my $count = $ptd->count("cat");
255
256				If no argument is supplied, returns the total dictionary count (sum of
257				all words).
258
259				=cut
260
261				sub count {
262				my ($self, $word) = @_;
263				if (defined($word)) {
264				if (exists($self->{$word})) {
265				return $self->{$word}{count}
266				} else {
267				return 0;
268				}
269				} else {
270				return $self->{" count "};
271				}
272				}
273
274				=head2 C
275
276				Computes a bunch of statistics about the PTD and returns them in an
277				hash reference.
278
279				=cut
280
281				sub stats {
282				my $self = shift;
283				my $stats = {
284				size => $self->size,
285				count => $self->count,
286				};
287
288				$self->downtr( sub {
289				my ($w, $c, %t) = @_;
290				$c \|\|= 1;
291				$stats->{avgTransNr} += scalar(keys %t);
292				$stats->{occTotal} += $c;
293				if (!$stats->{occMin} \|\| $stats->{occMin} > $c) {
294				$stats->{occMin} = $c;
295				$stats->{occMinWord} = $w;
296				}
297				if (!$stats->{occMax} \|\| $stats->{occMax} < $c) {
298				$stats->{occMax} = $c;
299				$stats->{occMaxWord} = $w;
300				}
301				if (%t) {
302				my ($bestProb) = sort { $b <=> $a } values %t;
303				if (!$stats->{probMax} \|\| $stats->{probMax} < $bestProb) {
304				$stats->{probMax} = $bestProb;
305				}
306				if (!$stats->{probMin} \|\| $stats->{probMin} > $bestProb) {
307				$stats->{probMin} = $bestProb;
308				}
309				$stats->{avgBestTrans} += $bestProb;
310				}
311				},
312				task => 'stats');
313				$stats->{avgTransNr} /= $stats->{count};
314				$stats->{avgBestTrans} /= $stats->{count};
315				$stats->{avgOcc} = $stats->{occTotal} / $stats->{count};
316				return $stats;
317
318				}
319
320				=head2 C
321
322				This method subtracts to the domain of a PTD, the elements present on
323				a set of elements. This set can be defines as another PTD (domain is
324				used), as a Perl array reference, as a Perl hash reference (domain is
325				used) or as a Perl array (not reference). Returns the dictionary after
326				domain subtraction takes place.
327
328				# removes portuguese articles from the dictionary
329				$ptd->subtractDomain( qw.o a os as. );
330
331				# removes a set of stop words from the dictionary
332				$ptd->subtractDomain( \@stopWords );
333
334				# removes the words present on other_ptd from ptd
335				$ptd->subtractDomain( $other_ptd );
336
337				=cut
338
339				sub subtractDomain {
340				my ($self, $other, @more) = @_;
341
342				my @domain;
343				if (ref($other) =~ /Lingua::PTD/ and $other->isa("Lingua::PTD")) {
344				@domain = $other->words;
345				}
346				elsif (ref($other) eq "ARRAY") {
347				@domain = @$other
348				}
349				elsif (ref($other) eq "HASH") {
350				@domain = keys %$other
351				}
352				else {
353				@domain = ($other, @more);
354				}
355				my %domain;
356				@domain{@domain} = @domain;
357
358				$self -> downtr (
359				sub {
360				my ($w,$c,%t) = @_;
361				return exists($domain{$w}) ? undef : toentry($w,$c,%t)
362				},
363				filter => 1,
364				task => 'subtractDomain',
365				);
366				$self->_calculate_sizes();
367				return $self;
368				}
369
370
371
372				=head2 C
373
374				Domain restrict function: interface is similar to subtractDomain function
375
376				This method restricts the domain of a PTD to a set of elements. This
377				set can be defines as another PTD (domain is used), as a Perl array
378				reference, as a Perl hash reference (domain is used) or as a Perl
379				array (not reference). Returns the dictionary after domain restriction
380				takes place.
381
382				# restrict the dictionary to a set of words
383				$ptd->restrictDomain( \@someWords );
384
385				=cut
386
387				sub restrictDomain {
388				my ($self, $other, @more) = @_;
389
390				my @domain;
391				if (ref($other) =~ /Lingua::PTD/ and $other->isa("Lingua::PTD")) {
392				@domain = $other->words;
393				}
394				elsif (ref($other) eq "ARRAY") {
395				@domain = @$other
396				}
397				elsif (ref($other) eq "HASH") {
398				@domain = keys %$other
399				}
400				else {
401				@domain = ($other, @more);
402				}
403				my %domain;
404				@domain{@domain} = @domain;
405
406				$self -> downtr (
407				sub {
408				my ($w,$c,%t) = @_;
409				return exists($domain{$w}) ? toentry($w,$c,%t):undef
410				},
411				filter => 1,
412				task => 'restrictDomain',
413				);
414				$self->_calculate_sizes();
415				return $self;
416				}
417
418				=head2 C
419
420				This method recalculates all probabilities accordingly with the number
421				of translations available.
422
423				For instance, if you have
424
425				home => casa => 25%
426				=> lar => 25%
427
428				The resulting dictionary will have
429
430				home => casa => 50%
431				=> lar => 50%
432
433				Note that this methods B the object.
434
435				=cut
436
437				sub reprob {
438				my $self = shift;
439				$self->downtr(
440				sub {
441				my ($w, $c, %t) = @_;
442				my $actual = 0;
443				$actual += $t{$_} for (keys %t);
444				return undef unless $actual > 0.1;
445				$t{$_} /= $actual for (keys %t);
446				return toentry($w, $c, %t);
447				},
448				filter => 1,
449				task => 'reprob'
450				);
451				return $self;
452				}
453
454				=head2 C
455
456				This method intersects the current object with the supplied PTD.
457				Note that this method B the object values.
458
459				Occurrences count in the final dictionary is the minimum occurrence
460				value of the two dictionaries.
461
462				Only translations present on both dictionary are kept. The probability
463				will be the minimum on the two dictionaries.
464
465				=cut
466
467				sub intersect {
468				my ($self, $other) = @_;
469
470				$self->downtr
471				(
472				sub {
473				my ($w, $c, %t) = @_;
474				if ($other->trans($w)) {
475				$c = _min($c, $other->count($w));
476				for my $t (keys %t) {
477				if ($other->trans($w,$t)) {
478				$t{$t} = _min($t{$t}, $other->trans($w,$t));
479				}
480				else {
481				delete($t{$t});
482				}
483				}
484				return toentry($w, $c, %t);
485				} else {
486				return undef;
487				}
488				},
489				filter => 1,
490				task => 'intersect',
491				);
492				$self->_calculate_sizes();
493				}
494
495				sub _set_word_translation {
496				my ($self, $word, $translation, $probability) = @_;
497				$self->{$word}{trans}{$translation} = $probability;
498				}
499
500				sub _delete_word_translation {
501				my ($self, $word, $translation) = @_;
502				delete($self->{$word}{trans}{$translation});
503				}
504
505				sub _set_word_count {
506				my ($self, $word, $count) = @_;
507				$self->{$word}{count} = $count;
508				}
509
510				sub _delete_word {
511				my ($self, $word) = @_;
512				delete $self->{$word};
513				}
514
515				=head2 C
516
517				This method adds the current PTD with the supplied one (first
518				argument). Note that this method B the object values.
519
520				=cut
521
522				sub add {
523				my ($self, $other, %ops) = @_;
524
525				$ops{verbose} //= $self->verbose;
526
527				my ($S1,$S2) = ($self->size, $other->size);
528
529				$other->_init_transaction;
530				$self->downtr(sub {
531				my ($w, $c, %t) = @_;
532				if ($other->trans($w)) {
533				my ($c1, $c2) = ($c, $other->count($w));
534				for my $t (_uniq(keys %t, $other->trans($w))) {
535				my ($p1, $p2) = ($t{$t} \|\| 0, $other->prob($w,$t));
536				my ($w1, $w2) = ($c1 * $S2, $c2 * $S1);
537				if ($w1 + $w2) {
538				$t{$t} = ($w1 * $p1 + $w2 * $p2)/($w1 + $w2);
539				} else {
540				delete $t{$t};
541				}
542				}
543				toentry($w, $c1+$c2, %t);
544				} else {
545				toentry($w,$c,%t);
546				}
547				},
548				filter => 1,
549				task => 'add',
550				verbose => $ops{verbose},
551				);
552				$other->_commit;
553
554				$self->_init_transaction;
555				print STDERR "\tAdding new words\n" if $ops{verbose};
556				$other->downtr(sub {
557				my ($w, $c, %t) = @_;
558				return if $self->trans($w);
559				$self->_set_word_count($w, $c);
560				for my $t (keys %t) {
561				$self->_set_word_translation($w, $t, $t{$t});
562				}
563				},
564				task => 'add',
565				verbose => $ops{verbose},
566				);
567				$self->_commit;
568				$self->_calculate_sizes();
569				return $self;
570				}
571
572				sub _uniq {
573				my %f;
574				$f{$_}++ for @_;
575				return keys %f;
576				}
577
578				=head2 C
579
580				This method iterates over a dictionary and calls the function supplied
581				as argument. This function will receive, in each call, the word in the
582				source language, the number of occurrences, and the hash of
583				translations.
584
585				$ptd->downtr( sub { my ($w,$c,%t) = @_;
586				if ($w =~ /[^A-Za-z0-9]/) {
587				return undef;
588				} else {
589				return toentry($w,$c,%t);
590				}
591				},
592				filter => 1);
593
594				Set the filter flag if your downtr function is replacing the original
595				dictionary.
596
597				=cut
598
599				sub _init_transaction { }
600				sub _commit { }
601
602				sub downtr {
603				my ($self, $sub, %opt) = @_;
604
605				$opt{verbose} //= $self->verbose;
606				$opt{task} \|\|= $self->{' task '} \|\| "downtr";
607
608				my $time = [Time::HiRes::gettimeofday];
609				my $counter = 0;
610				$self->_init_transaction;
611
612				my @keys = $opt{sorted} ? $self->words(1) : $self->words;
613				for my $word (@keys) {
614				my $res = $sub->($word,
615				$self->count($word),
616				$self->transHash($word));
617				if ($opt{filter}) {
618				if (!defined($res)) {
619				$self->_delete_word($word)
620				} else {
621				$self->_update_word($word, $res);
622				}
623				}
624
625				$counter ++;
626				print STDERR "\r[$opt{task}]\tProcessing ($counter entries)..." if $opt{verbose} && !($counter%100);
627				}
628				$self->_commit;
629				$self->_calculate_sizes() if $opt{filter};
630
631				my $elapsed = Time::HiRes::tv_interval($time);
632				printf STDERR "\r[$opt{task}]\tProcessed %d entries (%.2f seconds).\n",
633				$counter, $elapsed if $opt{verbose};
634				}
635
636				sub _update_word {
637				my ($self, $word, $res) = @_;
638				my ($k) = keys %$res;
639				$res = $res->{$k};
640				if ($k eq $word) {
641				$self->{$word} = $res;
642				} else {
643				delete $self->{$word};
644				$self->{$k} = $res;
645				}
646				}
647
648				# sub _trans_hash {
649				# my ($self, $word) = @_;
650				# return %{$self->{$word}{trans}};
651				# }
652
653				=head2 C
654
655				This function is exported by default and creates a dictionary entry
656				given the word, word count, and hash of translations. Check C
657				for an example.
658
659				=cut
660
661				sub toentry {
662				## word, count, ref(%hash)
663				if (ref($_[2]) eq "HASH") {
664				return { $_[0] => { count => $_[1], trans => $_[2] }}
665				}
666				else {
667				my ($w, $c, %t) = @_;
668				return { $w => { count => $c, trans => \%t } }
669				}
670				}
671
672				=head2 C
673
674				Method to save a PTD in another format. First argument is the name of
675				the format, second is the filename to be used. Supported formats are
676				C<> for Perl Dump format, C<> for Bzipped Perl Dump format,
677				C<>, for Lzma xz Perl Dump format and C<> for SQLite
678				database file.
679
680				Return undef if the format is not known. Returns 0 if save failed. A
681				true value in success.
682
683				=cut
684
685				sub saveAs {
686				my ($self, $type, $filename, $opts) = @_;
687
688				warn "Lingua::PTD saveAs called without all required parameteres" unless $type && $filename;
689
690				my $done = undef;
691				# switch
692				Lingua::PTD::Dumper::_save($self => $filename) and $done = 1 if $type =~ /dmp/i;
693				Lingua::PTD::BzDmp::_save( $self => $filename) and $done = 1 if $type =~ /bz2/i;
694				Lingua::PTD::XzDmp::_save( $self => $filename) and $done = 1 if $type =~ /xz/i;
695				Lingua::PTD::SQLite::_save($self => $filename) and $done = 1 if $type =~ /sqlite/i;
696				Lingua::PTD::TSV::_save($self, $filename, $opts) and $done = 1 if $type =~ /tsv/i;
697				Lingua::PTD::StarDict::_save($self, $filename, $opts) and $done = 1 if $type =~ /stardict/i;
698				# XXX - add above in the documentation.
699
700				# default
701				warn "Requested PTD filetype is not known" unless defined $done;
702
703				return $done;
704				}
705
706				=head2 C
707
708				This method replaces the dictionary, B, lowercasing all
709				entries. This is specially usefull to process transation dictionaries
710				obtained with the C<-utf8> flag that (at the moment) does case
711				sensitive alignment.
712
713				$ptd->lowercase(verbose => 1);
714
715				NOTE: we are using case folding, that might no be always what you
716				expect, but proven to be more robust than relying on the system
717				lowercase implementation.
718
719				=cut
720
721				sub lowercase {
722				my ($self, %ops) = @_;
723
724				$ops{verbose} //= $self->verbose;
725
726				$self->downtr(
727				sub {
728				my ($w, $c, %t) = @_;
729
730				for my $k (keys %t) {
731				next unless $k =~ /[[:upper:]]/;
732
733				my $lk = fc $k;
734				$t{$lk} = exists($t{$lk}) ? $t{$lk} + $t{$k} : $t{$k};
735				delete $t{$k};
736				}
737
738				if ($w =~ /[[:upper:]]/) {
739				my $lw = fc $w;
740
741				my %ot = $self->transHash($lw);
742				if (%ot) {
743				my ($c1, $c2) = ($c, $self->count($lw));
744				for my $k (_uniq(keys %t, keys %ot)) {
745				my ($p1, $p2) = ($t{$k} \|\| 0, $ot{$k} \|\| 0);
746				if ($c1 + $c2) {
747				$t{$k} = ($c1 * $p1 + $c2 * $p2)/($c1+$c2);
748				} else {
749				delete $t{$k};
750				}
751				}
752				toentry($lw, $c1+$c2, %t)
753				} else {
754				toentry($lw, $c, %t)
755				}
756				} else {
757				toentry($w, $c, %t);
758				}
759				},
760				filter => 1,
761				task => 'lowercase',
762				verbose => $ops{verbose},
763				);
764				}
765
766				=head2 C
767
768				Create unambiguous-concept traslation sets.
769
770				my $result = ucts($ptd1, $ptd2, m=>0.1, M=>0.8);
771
772				Available options are:
773
774				=over 4
775
776				=item C
777
778				Mininum number of occurences of each token. Must be an
779				integer (default: 10).
780
781				=item C
782
783				Manixum number of occurences of each token. Must be an
784				integer (default: 100).
785
786				=item C
787
788				Minimum probabilty for translation. Must be a probability
789				in the interval [0,1] (default: 0.2).
790
791				=item C
792
793				Minimum probabilty for the inverse translations. Must be a
794				probability in the interval [0,1] (default: 0.8).
795
796				=item C
797
798				Print rank (default: 0).
799
800				=item C
801
802				Pretty print output (default: 0).
803
804				=item C
805
806				Pretty print output to file C.
807
808				=back
809
810				=cut
811
812				sub ucts {
813				my ($fileA, $fileB, %my_opts) = @_;
814
815				my $min_occur = $my_opts{m} \|\| 10;
816				my $max_occur = $my_opts{M} \|\| 100;
817				my $prob = $my_opts{p} \|\| 0.2;
818				my $probi = $my_opts{P} \|\| 0.8;
819				my $rank = $my_opts{r} \|\| 0;
820				my $pp = $my_opts{pp} \|\| 0;
821				my $output = $my_opts{output} \|\| '';
822
823				# check files exist
824				unless ($fileA and $fileB) {
825				die "Error: need at least two PTDs given as argument.";
826				}
827
828				# handle output handles
829				$pp = 1 if $output;
830				open STDOUT, '>', $output if $output;
831				binmode(STDOUT, ':utf8'); # XXX
832
833				# load PTDs
834				my $ptd;
835				if (ref($fileA) =~ m/^Lingua::PTD/) {
836				$ptd = $fileA;
837				}
838				else {
839				if (-e $fileA) {
840				$ptd = Lingua::PTD->new($fileA);
841				}
842				else {
843				die "Error: file not found: $_";
844				}
845				}
846				my $ptd_inv;
847				if (ref($fileB) =~ m/^Lingua::PTD/) {
848				$ptd_inv = $fileB;
849				}
850				else {
851				if (-e $fileB) {
852				$ptd_inv = Lingua::PTD->new($fileB);
853				}
854				else {
855				die "Error: file not found: $_";
856				}
857				}
858
859				if ($pp and $fileA =~ m/.*?(\w\w)\-(\w\w)/) { # XXX
860				print "Langs: $1, $2\n" if $pp;
861				}
862
863				my (%left, %right);
864
865				# process each word in the PTD
866				my @words = $ptd->words;
867				foreach (@words) {
868				my $r = __build_ucts($ptd, $ptd_inv, $min_occur, $max_occur, $prob, $probi, $_);
869				$left{$_} = $r if $r;
870				}
871				# process each word in the inverse PTD
872				@words = $ptd_inv->words;
873				foreach (@words) {
874				my $r = __build_ucts($ptd_inv, $ptd, $min_occur, $max_occur, $prob, $probi, $_);
875				$right{$_} = $r if $r;
876				}
877
878				my @final = ();
879				foreach my $l (keys %left) {
880				my %ll = ($l=>1);
881				my %rr;
882				$rr{$_}++ for @{$left{$l}->{trans}};
883				my $rank = $left{$l}->{rank};
884
885				foreach (@{$left{$l}->{trans}}) {
886				$rr{$_}++;
887				if (exists($right{$_})) {
888				$ll{$_}++ for @{$right{$_}->{trans}};
889				delete $right{$_};
890				}
891				}
892				push @final, {l=>[keys %ll], r=>[keys %rr], rank=>$rank};
893				}
894				foreach my $r (keys %right) {
895				my %ll;
896				my %rr = ($r=>1);;
897				$ll{$_}++ for @{$right{$r}->{trans}};
898				my $rank = $right{$r}->{rank};
899
900				push @final, {l=>[keys %ll], r=>[keys %rr], rank=>$rank};
901				}
902
903				if ($pp) {
904				__pp_ucts($_,$rank) foreach (@final);
905				}
906				else {
907				return [@final];
908				}
909
910				close STDOUT if $output;
911				}
912
913				sub __build_ucts {
914				my ($ptd, $ptd_inv, $min_occur, $max_occur, $prob, $probi, $word) = @_;
915
916				my $count = $ptd->count($word); ## or print STDERR "### $word\n";
917				$count //= 0;
918				return undef unless ($min_occur <= $count and $count <= $max_occur);
919
920				my $total = 0;
921				my %trans = ();
922				my %transHash = $ptd->transHash($word);
923
924				foreach (keys %transHash) {
925				my $p = $transHash{$_};
926				next unless ($p >= $prob);
927				my $p_inv = $ptd_inv->prob($_, $word);
928				next unless ($p_inv >= $probi);
929
930				my $counti = $ptd_inv->count($_);
931				if ( ($min_occur <= $counti) and ($counti <= $max_occur) ) {
932				if ($total) { $total = ($total+$p+$p_inv)/2; }
933				else { $total = $p+$p_inv; }
934
935				$trans{$_}++;
936				}
937				}
938
939				return undef unless %trans;
940				return {trans=>[keys %trans], rank=>$total};
941				}
942
943				=head2 C
944
945				Create bi-words sets given a PTD pair.
946
947				my $result = bws($ptd1, $ptd2, m=>0.1, p=>0.4);
948
949				C<$ptd1> and C<$ptd2> can be filenames for the PTDs or already create
950				PTD objects.
951
952				The following options are available:
953
954				=over 4
955
956				=item C
957
958				Mininum number of occurences of each token. Must be an integer
959				(default: 10).
960
961				=item C
962
963				Minimum probabilty for translation. Must be a probability
964				in the interval [0,1] (default: 0.4).
965
966				=item C
967
968				Print rank (default: 0).
969
970				=item C
971
972				Pretty print output (default: 0).
973
974				=item C
975
976				Pretty print output to file C.
977
978				=back
979
980				=cut
981
982				sub bws {
983				my ($fileA, $fileB, %my_opts) = @_;
984
985				my $min_occur = $my_opts{m} \|\| 10;
986				my $prob = $my_opts{p} \|\| 0.4;
987				my $rank = $my_opts{r} \|\| 0;
988				my $pp = $my_opts{pp} \|\| 0;
989				my $output = $my_opts{output} \|\| '';
990
991				my $filter = $my_opts{filter};
992
993				#my $sorter;
994				#if ($my_opts{sorter} && ref($my_opts{sorter}) eq 'CODE') {
995				# $sorter = \&{$my_opts{sorter}};
996				#}
997
998				# check files exist
999				unless ($fileA and $fileB) {
1000				die "Error: need at least two PTDs given as argument.";
1001				}
1002
1003				# handle output handles
1004				$pp = 1 if $output;
1005				open STDOUT, '>', $output if $output;
1006				binmode(STDOUT, ':utf8'); # XXX
1007
1008				# load PTDs
1009				my $ptd;
1010				if (ref($fileA) =~ m/^Lingua::PTD/) {
1011				$ptd = $fileA;
1012				}
1013				else {
1014				if (-e $fileA) {
1015				$ptd = Lingua::PTD->new($fileA);
1016				}
1017				else {
1018				die "Error: file not found: $_";
1019				}
1020				}
1021				my $ptd_inv;
1022				if (ref($fileB) =~ m/^Lingua::PTD/) {
1023				$ptd_inv = $fileB;
1024				}
1025				else {
1026				if (-e $fileB) {
1027				$ptd_inv = Lingua::PTD->new($fileB);
1028				}
1029				else {
1030				die "Error: file not found: $_";
1031				}
1032				}
1033
1034				if ($pp and $fileA =~ m/.*?(\w\w)\-(\w\w)/) { # XXX
1035				print "Langs: $1, $2\n" if $pp;
1036				}
1037
1038				my @final;
1039
1040				my @words = $ptd->words;
1041				my $total_words_l = $ptd->size();
1042				my $total_words_r = $ptd_inv->size();
1043				foreach my $word (@words) {
1044				my $count = $ptd->count($word);
1045				next unless ($count >= $min_occur);
1046				next if ($word eq "(none)");
1047
1048				my %transHash = $ptd->transHash($word);
1049				foreach (keys %transHash) {
1050				my $p = $transHash{$_};
1051				next unless ($p >= $prob);
1052				next if ($_ eq "(none)");
1053
1054				__pp_ucts({l=>[$word],r=>[$_],rank=>$p}, $rank) if $pp;
1055				push @final, {
1056				l=>$word, cl=>$count, tl=>$total_words_l,
1057				r=>$_, cr=>$ptd_inv->count($_), tr=>$total_words_r,
1058				rank=>$p } unless $pp;
1059				}
1060				}
1061				@words = $ptd_inv->words;
1062				foreach my $word (@words) {
1063				my $count = $ptd_inv->count($word);
1064				next unless ($count >= $min_occur);
1065				next if ($word eq "(none)");
1066
1067				my %transHash = $ptd_inv->transHash($word);
1068				foreach (keys %transHash) {
1069				my $p = $transHash{$_};
1070				next unless ($p >= $prob);
1071				next if ($_ eq "(none)");
1072
1073				__pp_ucts({l=>[$_],r=>[$word],rank=>$p}, $rank) if $pp;
1074				push @final, {
1075				l=>$_, cl=>$ptd->count($_), tl=>$total_words_l,
1076				r=>$word, cr=>$count, tr=>$total_words_r,
1077				rank=>$p } unless $pp;
1078				}
1079				}
1080
1081				# if only one filter, put it in an array
1082				$filter = [$filter] if ($filter and ref($filter) eq 'CODE');
1083				# apply array of filters in order
1084				if ($filter and ref($filter) eq 'ARRAY'){
1085				while (my $f = shift(@{$filter})) {
1086				@final = grep { $f->($_) } @final ;
1087				}
1088				}
1089
1090				close STDOUT if $output;
1091				return [@final] unless $pp;
1092				}
1093
1094				sub __pp_ucts {
1095				my ($r, $rank) = @_;
1096
1097				if ($rank) {
1098				printf "[%f]%s=%s\n", $r->{rank}, (join ',', @{$r->{l}}), join ',', @{$r->{r}};
1099				}
1100				else {
1101				printf "%s=%s\n", (join ',', @{$r->{l}}), join ',', @{$r->{r}};
1102				}
1103				}
1104
1105				=head1 SEE ALSO
1106
1107				NATools(3), perl(1)
1108
1109				=head1 AUTHOR
1110
1111				Alberto Manuel Brandão Simões, Eambs@cpan.orgE
1112
1113				=head1 COPYRIGHT AND LICENSE
1114
1115				Copyright (C) 2008-2014 by Alberto Manuel Brandão Simões
1116
1117				=cut
1118
1119				sub _calculate_sizes {
1120				my $self = shift;
1121				my $total = 0;
1122				my $count = 0;
1123				$self->downtr( sub { $count++; $total += $_[1] }, verbose => 0);
1124				$self->{" size "} = $total; ## Private keys are kept with spaces.
1125				$self->{" count "} = $count;
1126				}
1127
1128				sub _min { $_[0] < $_[1] ? $_[0] : $_[1] }
1129				sub _max { $_[0] > $_[1] ? $_[0] : $_[1] }
1130
1131				sub _protect_quotes {
1132				my $f = shift;
1133				for ($f) {
1134				s/\\/\\\\/g;
1135				s/'/\\'/g;
1136				}
1137				return $f;
1138				}
1139
1140
1141				"This isn't right. This isn't even wrong.";
1142				__END__