File Coverage

blib/lib/XML/TMX/CWB.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package XML::TMX::CWB;
2				$XML::TMX::CWB::VERSION = '0.09';
3	1	1	15194	use warnings;
	1		2
	1		38
4	1	1	6	use strict;
	1		1
	1		37
5	1	1	661	use Lingua::PT::PLNbase;
	1		28290
	1		7
6	1	1	1355	use XML::TMX::Reader;
	0
	0
7				use XML::TMX::Writer;
8				use CWB::CL::Strict;
9				use File::Spec::Functions;
10				use Encode;
11
12				use POSIX qw(locale_h);
13				setlocale(&POSIX::LC_ALL, "pt_PT");
14				use locale;
15
16				=head1 NAME
17
18				XML::TMX::CWB - TMX interface with Open Corpus Workbench
19
20				=head1 SYNOPSIS
21
22				XML::TMX::CWB->toCWB( tmx => $tmxfile,
23				from => 'PT', to => 'EN',
24				corpora => "/corpora",
25				corpus_name => "foo",
26				tokenize_source => 1,
27				tokenize_target => 1,
28				verbose => 1,
29				registry => '/path/to/cwb/registry' );
30
31				XML::TMX::CWB->toTMX( source => 'sourcecorpus',
32				target => 'targetcorpus',
33				source_lang => 'PT',
34				target_lang => 'ES',
35				verbose => 1,
36				output => "foobar.tmx");
37
38
39				=head1 METHODS
40
41				=head2 toTMX
42
43				Fetches an aligned pair of corpora on CWB and exports it as a TMX
44				file.
45
46				=cut
47
48				sub toTMX {
49				shift if $_[0] eq 'XML::TMX::CWB';
50				my %ops = @_;
51
52				die "Source and target corpora names are required.\n" unless $ops{source} and $ops{target};
53
54				my $Cs = CWB::CL::Corpus->new(uc $ops{source});
55				die "Can't find corpus [$ops{source}]\n" unless $Cs;
56				my $Ct = CWB::CL::Corpus->new(uc $ops{target});
57				die "Can't find corpus [$ops{target}]\n" unless $Ct;
58
59				my $align = $Cs->attribute(lc($ops{target}), "a");
60				my $count = $align->max_alg;
61
62				my $Ws = $Cs->attribute("word", "p");
63				my $Wt = $Ct->attribute("word", "p");
64
65				my $tmx = new XML::TMX::Writer();
66				$tmx->start_tmx( $ops{output} ? (OUTPUT => $ops{output}) : (),
67				TOOL => 'XML::TMX::CWB',
68				TOOLVERSION => $XML::TMX::CWB::VERSION);
69				for my $i (0 .. $count-1) {
70				my ($s1, $s2, $t1, $t2) = $align->alg2cpos($i);
71				my $source = join(" ",$Ws->cpos2str($s1 .. $s2));
72				my $target = join(" ",$Wt->cpos2str($t1 .. $t2));
73				Encode::_utf8_on($source);
74				Encode::_utf8_on($target);
75				$tmx->add_tu($ops{source_lang} => $source,
76				$ops{target_lang} => $target);
77				}
78				$tmx->end_tmx();
79				}
80
81				=head2 toCWB
82
83				Imports a TMX file (just two languages) to a parallel corpus on CWB.
84
85				=cut
86
87				sub _get_header_prop_list {
88				my ($reader, $prop, $default) = @_;
89
90				$default \|\|= [];
91
92				return $default unless exists $reader->{header}{-prop}{$prop};
93
94				my $value = (ref($reader->{header}{-prop}{$prop}) eq "ARRAY")
95				? join(",",@{$reader->{header}{-prop}{$prop}})
96				: $reader->{header}{-prop}{$prop};
97
98				return [ split /\s,\s/ => $value ];
99				}
100
101
102				sub _RUN {
103				my $command = shift;
104				print STDERR "Running [$command]\n";
105				`$command`;
106				}
107
108				sub toCWB {
109				shift if $_[0] eq 'XML::TMX::CWB';
110				my %ops = @_;
111
112				my $tmx = $ops{tmx} or die "tmx file not specified.\n";
113
114				my $corpora = $ops{corpora} \|\| "/corpora";
115				die "Need a corpora folder" unless -d $corpora;
116
117				die "Can't open [$tmx] file for reading\n" unless -f $tmx;
118
119				# Create reader object
120				my $reader = XML::TMX::Reader->new($tmx);
121
122				my %tagged_languages = ();
123				for my $language (@{ _get_header_prop_list($reader, "pos-tagged") }) {
124				$tagged_languages{$language}++;
125				}
126
127				my $has_tagged_languages = scalar(keys %tagged_languages);
128
129				my $tag_data = undef;
130				my $s_attributes = ['s'];
131				my $pos_fields = [qw'word lemma pos'];
132
133				if ($has_tagged_languages) {
134
135				if ($ops{tok} \|\| $ops{tokenize_source} \|\| $ops{tokenize_target}) {
136				warn "Can't tokenize tagged languages. Ignoring tagging request for ",
137				join(", ", keys %tagged_languages);
138				}
139
140				$s_attributes = _get_header_prop_list($reader, 'pos-s-attributes', $s_attributes);
141				$pos_fields = _get_header_prop_list($reader, 'pos-fields', $pos_fields);
142
143				$tag_data = {
144				languages => \%tagged_languages,
145				'pos-s-attributes' => $s_attributes,
146				'pos-fields' => $pos_fields
147				};
148				}
149
150				my ($source, $target);
151
152				if ($ops{mono}) {
153				# Detect what languages to use
154				($source) = _detect_language($reader,
155				($ops{mono} \|\| undef));
156				$ops{verbose} && print STDERR "Using language [$source]\n";
157				} else {
158				# Detect what languages to use
159				($source, $target) = _detect_languages($reader,
160				($ops{from} \|\| undef),
161				($ops{to} \|\| undef));
162				$ops{verbose} && print STDERR "Using languages [$source, $target]\n";
163				}
164
165				# Detect corpus registry
166				my $registry = $ops{registry} \|\| $ENV{CORPUS_REGISTRY};
167				chomp( $registry = `cwb-config -r` ) unless $registry;
168				die "Could not detect a suitable CWB registry folder.\n" unless $registry && -d $registry;
169
170				# detect corpus name
171				my $cname = $ops{corpus_name} \|\| $tmx;
172				$cname =~ s/[.-]/_/g;
173
174				if ($ops{mono}) {
175				_mtmx2cqpfiles($reader, $cname, $source,
176				$ops{tok} \|\| 0,
177				$ops{verbose} \|\| 0,
178				$tag_data
179				);
180				_mencode($cname, $corpora, $registry, $source, $tag_data);
181				} else {
182				_tmx2cqpfiles($reader, $cname, $source, $target,
183				$ops{tokenize_source} \|\| 0,
184				$ops{tokenize_target} \|\| 0,
185				$ops{verbose} \|\| 0,
186				$tag_data
187				);
188				_encode($cname, $corpora, $registry, $source, $target, $tag_data);
189				unlink "target.cqp";
190				unlink "align.txt";
191				}
192
193				unlink "source.cqp";
194				}
195
196				sub _encode {
197				my ($cname, $corpora, $registry, $l1, $l2, $tagged) = @_;
198
199				my @languages = ($l1, $l2);
200				my @files = (qw'source target');
201
202				for my $i (0, 1) {
203				my ($name, $folder, $reg);
204				my ($posatt, $sattr) = ("", "");
205
206				my $l = $languages[$i];
207				my $f = $files[$i];
208
209				if ($tagged && exists($tagged->{languages}{$l})) {
210				shift @{$tagged->{'pos-fields'}};
211				$posatt = join(" ", map { "-P $_" } @{$tagged->{'pos-fields'}});
212				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
213				}
214
215				$name = lc("${cname}_$l");
216				$folder = catfile($corpora, $name);
217				$reg = catfile($registry, $name);
218
219				mkdir $folder;
220
221				_RUN("cwb-encode -c utf8 -d $folder -f $f.cqp -R $reg -S tu+id $sattr $posatt");
222				_RUN("cwb-make -r $registry -v " . uc($name));
223				}
224
225				_RUN("cwb-align-import -r $registry -v align.txt");
226				_RUN("cwb-align-import -r $registry -v -inverse align.txt");
227				}
228
229				sub _mencode {
230				my ($cname, $corpora, $registry, $l1, $tagged) = @_;
231
232				my $name = lc("${cname}_$l1");
233				my $folder = catfile($corpora, $name);
234				my $reg = catfile($registry, $name);
235
236				my ($posatt, $sattr) = ("", "");
237				if ($tagged && $tagged->{languages}{$l1}) {
238				shift @{$tagged->{'pos-fields'}};
239				$posatt = join(" ", map { "-P $_" } @{$tagged->{'pos-fields'}});
240				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
241				}
242
243				mkdir $folder;
244				_RUN("cwb-encode -c utf8 -d $folder -f source.cqp -R $reg -S tu+id $sattr $posatt");
245				_RUN("cwb-make -r $registry -v " . uc($name));
246				}
247
248				sub _tmx2cqpfiles {
249				my ($reader, $cname, $l1, $l2, $t1, $t2, $v, $tagged) = @_;
250				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
251				open F2, ">:utf8", "target.cqp" or die "Can't create cqp outfile\n";
252				open AL, ">:utf8", "align.txt" or die "Can't create alignment file\n";
253				my $i = 1;
254
255				printf AL "%s\t%s\ttu\tid_{id}\n", uc("${cname}_$l1"), uc("${cname}_$l2");
256
257				print STDERR "Processing..." if $v;
258
259				my $proc = sub {
260				my ($langs) = @_;
261				return unless exists $langs->{$l1} && exists $langs->{$l2};
262
263				my (@S, @T);
264
265				# Language 1
266				if ($tagged && exists($tagged->{languages}{$l1})) {
267				@S = split /\n/, $langs->{$l1}{-seg};
268				}
269				else {
270				for ($langs->{$l1}{-seg}) {
271				s/&/&/g;
272				s/
273				s/
274				}
275				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
276				}
277
278				# Language 2
279				if ($tagged && exists($tagged->{languages}{$l2})) {
280				@T = split /\n/, $langs->{$l2}{-seg};
281				}
282				else {
283				for ($langs->{$l2}{-seg}) {
284				s/&/&/g;
285				s/
286				s/
287				}
288
289				@T = $t2 ? tokenize($langs->{$l2}{-seg}) : split /\s+/, $langs->{$l2}{-seg};
290				}
291
292				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
293
294				print AL "id_$i\tid_$i\n";
295				print F1 "\n", join("\n", @S), "\n\n";
296				print F2 "\n", join("\n", @T), "\n\n";
297				++$i;
298				};
299
300				$reader->for_tu( $proc );
301				print STDERR "\rProcessing... $i translation units\n" if $v;
302				close AL;
303				close F1;
304				close F2;
305				}
306
307				sub _mtmx2cqpfiles {
308				my ($reader, $cname, $l1, $t1, $v, $tagged) = @_;
309				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
310				my $i = 1;
311
312				print STDERR "Processing..." if $v;
313
314				my $proc = sub {
315				my ($langs) = @_;
316				return unless exists $langs->{$l1};
317
318				my (@S);
319				if ($tagged && exists($tagged->{languages}{$l1})) {
320				@S = split /\n/, $langs->{$l1}{-seg};
321				} else {
322				for ($langs->{$l1}{-seg}) {
323				s/&/&/g;
324				s/
325				s/>/>/g;
326				}
327
328				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
329				}
330				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
331
332				print F1 "\n", join("\n", @S), "\n\n";
333				++$i;
334				};
335
336				$reader->for_tu( $proc );
337				print STDERR "\rProcessing... $i translation units\n" if $v;
338				close F1;
339				}
340
341				sub _detect_languages {
342				my ($reader, $from, $to) = @_;
343				my @languages = $reader->languages();
344
345				die "Language $from not available\n" if $from and !grep{_ieq($_, $from)}@languages;
346				die "Language $to not available\n" if $to and !grep{_ieq($_, $to)} @languages;
347
348				($from) = grep { _ieq($_, $from) } @languages if $from;
349				($to) = grep { _ieq($_, $to ) } @languages if $to;
350
351				return ($from, $to) if $from and $to;
352
353				if (scalar(@languages) == 2) {
354				$to = grep { $_ ne $from } @languages if $from and not $to;
355				$from = grep { $_ ne $to } @languages if $to and not $from;
356				($from, $to) = @languages if not ($to or $from);
357				return ($from, $to) if $from and $to;
358				}
359				die "Can't guess what languages to use!\n";
360				}
361
362				sub _detect_language {
363				my ($reader, $mono) = @_;
364				my @languages = $reader->languages();
365
366				die "Language $mono not available\n" if $mono and !grep {_ieq($_, $mono)} @languages;
367
368				($mono) = grep { _ieq($_, $mono) } @languages if $mono;
369
370				return ($mono) if $mono;
371
372				if (scalar(@languages) == 1) {
373				($mono) = @languages;
374				return ($mono);
375				}
376				die "Can't guess what languages to use!\n";
377				}
378
379				sub _ieq {
380				uc($_[0]) eq uc($_[1])
381				}
382
383
384
385				=head1 AUTHOR
386
387				Alberto Simoes, C<< >>
388
389				=head1 BUGS
390
391				Please report any bugs or feature requests to C, or through
392				the web interface at L. I will be notified, and then you'll
393				automatically be notified of progress on your bug as I make changes.
394
395
396
397
398				=head1 SUPPORT
399
400				You can find documentation for this module with the perldoc command.
401
402				perldoc XML::TMX::CWB
403
404
405				You can also look for information at:
406
407				=over 4
408
409				=item * RT: CPAN's request tracker
410
411				L
412
413				=item * AnnoCPAN: Annotated CPAN documentation
414
415				L
416
417				=item * CPAN Ratings
418
419				L
420
421				=item * Search CPAN
422
423				L
424
425				=back
426
427
428				=head1 ACKNOWLEDGEMENTS
429
430
431				=head1 LICENSE AND COPYRIGHT
432
433				Copyright 2010-2014 Alberto Simoes.
434
435				This program is free software; you can redistribute it and/or modify it
436				under the terms of either: the GNU General Public License as published
437				by the Free Software Foundation; or the Artistic License.
438
439				See http://dev.perl.org/licenses/ for more information.
440
441
442				=cut
443
444				1; # End of XML::TMX::CWB