File Coverage

blib/lib/XML/TMX/CWB.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package XML::TMX::CWB;
2				$XML::TMX::CWB::VERSION = '0.08';
3	1	1	14486	use warnings;
	1		2
	1		30
4	1	1	3	use strict;
	1		2
	1		25
5	1	1	490	use Lingua::PT::PLNbase;
	1		29422
	1		9
6	1	1	1723	use XML::TMX::Reader;
	0
	0
7				use XML::TMX::Writer;
8				use CWB::CL::Strict;
9				use File::Spec::Functions;
10				use Encode;
11
12				use POSIX qw(locale_h);
13				setlocale(&POSIX::LC_ALL, "pt_PT");
14				use locale;
15
16				=head1 NAME
17
18				XML::TMX::CWB - TMX interface with Open Corpus Workbench
19
20				=head1 SYNOPSIS
21
22				XML::TMX::CWB->toCWB( tmx => $tmxfile,
23				from => 'PT', to => 'EN',
24				corpora => "/corpora",
25				corpus_name => "foo",
26				tokenize_source => 1,
27				tokenize_target => 1,
28				verbose => 1,
29				registry => '/path/to/cwb/registry' );
30
31				XML::TMX::CWB->toTMX( source => 'sourcecorpus',
32				target => 'targetcorpus',
33				source_lang => 'PT',
34				target_lang => 'ES',
35				verbose => 1,
36				output => "foobar.tmx");
37
38
39				=head1 METHODS
40
41				=head2 toTMX
42
43				Fetches an aligned pair of corpora on CWB and exports it as a TMX
44				file.
45
46				=cut
47
48				sub toTMX {
49				shift if $_[0] eq 'XML::TMX::CWB';
50				my %ops = @_;
51
52				die "Source and target corpora names are required.\n" unless $ops{source} and $ops{target};
53
54				my $Cs = CWB::CL::Corpus->new(uc $ops{source});
55				die "Can't find corpus [$ops{source}]\n" unless $Cs;
56				my $Ct = CWB::CL::Corpus->new(uc $ops{target});
57				die "Can't find corpus [$ops{target}]\n" unless $Ct;
58
59				my $align = $Cs->attribute(lc($ops{target}), "a");
60				my $count = $align->max_alg;
61
62				my $Ws = $Cs->attribute("word", "p");
63				my $Wt = $Ct->attribute("word", "p");
64
65				my $tmx = new XML::TMX::Writer();
66				$tmx->start_tmx( $ops{output} ? (OUTPUT => $ops{output}) : (),
67				TOOL => 'XML::TMX::CWB',
68				TOOLVERSION => $XML::TMX::CWB::VERSION);
69				for my $i (0 .. $count-1) {
70				my ($s1, $s2, $t1, $t2) = $align->alg2cpos($i);
71				my $source = join(" ",$Ws->cpos2str($s1 .. $s2));
72				my $target = join(" ",$Wt->cpos2str($t1 .. $t2));
73				Encode::_utf8_on($source);
74				Encode::_utf8_on($target);
75				$tmx->add_tu($ops{source_lang} => $source,
76				$ops{target_lang} => $target);
77				}
78				$tmx->end_tmx();
79				}
80
81				=head2 toCWB
82
83				Imports a TMX file (just two languages) to a parallel corpus on CWB.
84
85				=cut
86
87				sub _get_header_prop_list {
88				my ($reader, $prop) = @_;
89
90				return undef unless exists $reader->{header}{-prop}{$prop};
91
92				my $value = (ref($reader->{header}{-prop}{$prop}) eq "ARRAY")
93				? join(",",@{$reader->{header}{-prop}{$prop}})
94				: $reader->{header}{-prop}{$prop};
95
96				return [ split /\s,\s/ => $value ];
97				}
98
99
100				sub _RUN {
101				my $command = shift;
102				print STDERR "Running [$command]\n";
103				`$command`;
104				}
105
106				sub toCWB {
107				shift if $_[0] eq 'XML::TMX::CWB';
108				my %ops = @_;
109
110				my $tmx = $ops{tmx} or die "tmx file not specified.\n";
111
112				my $corpora = $ops{corpora} \|\| "/corpora";
113				die "Need a corpora folder" unless -d $corpora;
114
115				die "Can't open [$tmx] file for reading\n" unless -f $tmx;
116
117				# Create reader object
118				my $reader = XML::TMX::Reader->new($tmx);
119
120				my %tagged_languages = ();
121				for my $language (@{ _get_header_prop_list($reader, "pos-tagged") }) {
122				$tagged_languages{$language}++;
123				}
124
125				my $has_tagged_languages = scalar(keys %tagged_languages);
126
127				my $tag_data = undef;
128				my $s_attributes = ['s'];
129				my $pos_fields = [qw'word lemma pos'];
130
131				if ($has_tagged_languages) {
132
133				if ($ops{tok} \|\| $ops{tokenize_source} \|\| $ops{tokenize_target}) {
134				warn "Can't tokenize tagged languages. Ignoring tagging request for ",
135				join(", ", keys %tagged_languages);
136				}
137
138				$s_attributes \|\|= _get_header_prop_list('pos-s-attributes');
139				$pos_fields \|\|= _get_header_prop_list('pos-fields');
140
141				$tag_data = {
142				languages => \%tagged_languages,
143				'pos-s-attributes' => $s_attributes,
144				'pos-fields' => $pos_fields
145				};
146				}
147
148				my ($source, $target);
149
150				if ($ops{mono}) {
151				# Detect what languages to use
152				($source) = _detect_language($reader,
153				($ops{mono} \|\| undef));
154				$ops{verbose} && print STDERR "Using language [$source]\n";
155				} else {
156				# Detect what languages to use
157				($source, $target) = _detect_languages($reader,
158				($ops{from} \|\| undef),
159				($ops{to} \|\| undef));
160				$ops{verbose} && print STDERR "Using languages [$source, $target]\n";
161				}
162
163				# Detect corpus registry
164				my $registry = $ops{registry} \|\| $ENV{CORPUS_REGISTRY};
165				chomp( $registry = `cwb-config -r` ) unless $registry;
166				die "Could not detect a suitable CWB registry folder.\n" unless $registry && -d $registry;
167
168				# detect corpus name
169				my $cname = $ops{corpus_name} \|\| $tmx;
170				$cname =~ s/[.-]/_/g;
171
172				if ($ops{mono}) {
173				_mtmx2cqpfiles($reader, $cname, $source,
174				$ops{tok} \|\| 0,
175				$ops{verbose} \|\| 0,
176				$tag_data
177				);
178				_mencode($cname, $corpora, $registry, $source, $tag_data);
179				} else {
180				_tmx2cqpfiles($reader, $cname, $source, $target,
181				$ops{tokenize_source} \|\| 0,
182				$ops{tokenize_target} \|\| 0,
183				$ops{verbose} \|\| 0,
184				$tag_data
185				);
186				_encode($cname, $corpora, $registry, $source, $target, $tag_data);
187				unlink "target.cqp";
188				unlink "align.txt";
189				}
190
191				unlink "source.cqp";
192				}
193
194				sub _encode {
195				my ($cname, $corpora, $registry, $l1, $l2, $tagged) = @_;
196
197				my @languages = ($l1, $l2);
198				my @files = (qw'source target');
199
200				for my $i (0, 1) {
201				my ($name, $folder, $reg);
202				my ($posatt, $sattr) = ("", "");
203
204				my $l = $languages[$i];
205				my $f = $files[$i];
206
207				if ($tagged && exists($tagged->{languages}{$l})) {
208				shift @{$tagged->{'pos-fields'}};
209				$posatt = join(" ", map { "-P $_" } @{$tagged->{'pos-fields'}});
210				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
211				}
212
213				$name = lc("${cname}_$l");
214				$folder = catfile($corpora, $name);
215				$reg = catfile($registry, $name);
216
217				mkdir $folder;
218
219				_RUN("cwb-encode -c utf8 -d $folder -f $f.cqp -R $reg -S tu+id $sattr $posatt");
220				_RUN("cwb-make -r $registry -v " . uc($name));
221				}
222
223				_RUN("cwb-align-import -r $registry -v align.txt");
224				_RUN("cwb-align-import -r $registry -v -inverse align.txt");
225				}
226
227				sub _mencode {
228				my ($cname, $corpora, $registry, $l1, $tagged) = @_;
229
230				my $name = lc("${cname}_$l1");
231				my $folder = catfile($corpora, $name);
232				my $reg = catfile($registry, $name);
233
234				my ($posatt, $sattr) = ("", "");
235				if ($tagged && $tagged->{languages}{$l1}) {
236				shift @{$tagged->{'pos-fields'}};
237				$posatt = join(" ", map { "-P $_" } @{$tagged->{'pos-fields'}});
238				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
239				}
240
241				mkdir $folder;
242				_RUN("cwb-encode -c utf8 -d $folder -f source.cqp -R $reg -S tu+id $sattr $posatt");
243				_RUN("cwb-make -r $registry -v " . uc($name));
244				}
245
246				sub _tmx2cqpfiles {
247				my ($reader, $cname, $l1, $l2, $t1, $t2, $v, $tagged) = @_;
248				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
249				open F2, ">:utf8", "target.cqp" or die "Can't create cqp outfile\n";
250				open AL, ">:utf8", "align.txt" or die "Can't create alignment file\n";
251				my $i = 1;
252
253				printf AL "%s\t%s\ttu\tid_{id}\n", uc("${cname}_$l1"), uc("${cname}_$l2");
254
255				print STDERR "Processing..." if $v;
256
257				my $proc = sub {
258				my ($langs) = @_;
259				return unless exists $langs->{$l1} && exists $langs->{$l2};
260
261				my (@S, @T);
262
263				# Language 1
264				if ($tagged && exists($tagged->{languages}{$l1})) {
265				@S = split /\n/, $langs->{$l1}{-seg};
266				}
267				else {
268				for ($langs->{$l1}{-seg}) {
269				s/&/&/g;
270				s/
271				s/
272				}
273				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
274				}
275
276				# Language 2
277				if ($tagged && exists($tagged->{languages}{$l2})) {
278				@T = split /\n/, $langs->{$l2}{-seg};
279				}
280				else {
281				for ($langs->{$l2}{-seg}) {
282				s/&/&/g;
283				s/
284				s/
285				}
286
287				@T = $t2 ? tokenize($langs->{$l2}{-seg}) : split /\s+/, $langs->{$l2}{-seg};
288				}
289
290				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
291
292				print AL "id_$i\tid_$i\n";
293				print F1 "\n", join("\n", @S), "\n\n";
294				print F2 "\n", join("\n", @T), "\n\n";
295				++$i;
296				};
297
298				$reader->for_tu( $proc );
299				print STDERR "\rProcessing... $i translation units\n" if $v;
300				close AL;
301				close F1;
302				close F2;
303				}
304
305				sub _mtmx2cqpfiles {
306				my ($reader, $cname, $l1, $t1, $v, $tagged) = @_;
307				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
308				my $i = 1;
309
310				print STDERR "Processing..." if $v;
311
312				my $proc = sub {
313				my ($langs) = @_;
314				return unless exists $langs->{$l1};
315
316				my (@S);
317				if ($tagged && exists($tagged->{languages}{$l1})) {
318				@S = split /\n/, $langs->{$l1}{-seg};
319				} else {
320				for ($langs->{$l1}{-seg}) {
321				s/&/&/g;
322				s/
323				s/>/>/g;
324				}
325
326				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
327				}
328				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
329
330				print F1 "\n", join("\n", @S), "\n\n";
331				++$i;
332				};
333
334				$reader->for_tu( $proc );
335				print STDERR "\rProcessing... $i translation units\n" if $v;
336				close F1;
337				}
338
339				sub _detect_languages {
340				my ($reader, $from, $to) = @_;
341				my @languages = $reader->languages();
342
343				die "Language $from not available\n" if $from and !grep{_ieq($_, $from)}@languages;
344				die "Language $to not available\n" if $to and !grep{_ieq($_, $to)} @languages;
345
346				($from) = grep { _ieq($_, $from) } @languages if $from;
347				($to) = grep { _ieq($_, $to ) } @languages if $to;
348
349				return ($from, $to) if $from and $to;
350
351				if (scalar(@languages) == 2) {
352				$to = grep { $_ ne $from } @languages if $from and not $to;
353				$from = grep { $_ ne $to } @languages if $to and not $from;
354				($from, $to) = @languages if not ($to or $from);
355				return ($from, $to) if $from and $to;
356				}
357				die "Can't guess what languages to use!\n";
358				}
359
360				sub _detect_language {
361				my ($reader, $mono) = @_;
362				my @languages = $reader->languages();
363
364				die "Language $mono not available\n" if $mono and !grep {_ieq($_, $mono)} @languages;
365
366				($mono) = grep { _ieq($_, $mono) } @languages if $mono;
367
368				return ($mono) if $mono;
369
370				if (scalar(@languages) == 1) {
371				($mono) = @languages;
372				return ($mono);
373				}
374				die "Can't guess what languages to use!\n";
375				}
376
377				sub _ieq {
378				uc($_[0]) eq uc($_[1])
379				}
380
381
382
383				=head1 AUTHOR
384
385				Alberto Simoes, C<< >>
386
387				=head1 BUGS
388
389				Please report any bugs or feature requests to C, or through
390				the web interface at L. I will be notified, and then you'll
391				automatically be notified of progress on your bug as I make changes.
392
393
394
395
396				=head1 SUPPORT
397
398				You can find documentation for this module with the perldoc command.
399
400				perldoc XML::TMX::CWB
401
402
403				You can also look for information at:
404
405				=over 4
406
407				=item * RT: CPAN's request tracker
408
409				L
410
411				=item * AnnoCPAN: Annotated CPAN documentation
412
413				L
414
415				=item * CPAN Ratings
416
417				L
418
419				=item * Search CPAN
420
421				L
422
423				=back
424
425
426				=head1 ACKNOWLEDGEMENTS
427
428
429				=head1 LICENSE AND COPYRIGHT
430
431				Copyright 2010-2014 Alberto Simoes.
432
433				This program is free software; you can redistribute it and/or modify it
434				under the terms of either: the GNU General Public License as published
435				by the Free Software Foundation; or the Artistic License.
436
437				See http://dev.perl.org/licenses/ for more information.
438
439
440				=cut
441
442				1; # End of XML::TMX::CWB