File Coverage

blib/lib/Wiktionary/Parser/Section/Translations.pm

Criterion	Covered	Total	%
statement	138	163	84.6
branch	47	62	75.8
condition	9	14	64.2
subroutine	24	26	92.3
pod	0	18	0.0
total	218	283	77.0

line	stmt	bran	cond	sub	pod	time	code
1							package Wiktionary::Parser::Section::Translations;
2
3	3			3		42761	use Wiktionary::Parser::Section;
	3					8
	3					82
4	3			3		1935	use Wiktionary::Parser::Section::Translations::WordSense;
	3					10
	3					93
5	3			3		19	use Wiktionary::Parser::Section::Translations::WordSense::Lexeme;
	3					6
	3					71
6
7	3			3		15	use base qw(Wiktionary::Parser::Section);
	3					7
	3					7857
8
9							sub new {
10	42			42	0	50966	my $class = shift;
11	42					172	my %args = @_;
12	42					227	my $self = bless Wiktionary::Parser::Section->new(%args), $class;
13	42					190	return $self;
14							}
15
16
17							sub get_templates {
18							return [
19							{
20							# e.g. {{trans-bottom}}
21							pattern => qr/{{trans-bottom}}/,
22	5			5		18	parser => sub { return { meta => 'end_word_sense' } }
23							},
24							{
25							# comma - separates different blocks of words+metadata
26							pattern => qr/[,;\.]/,
27	97			97		318	parser => sub { return {meta => 'end_segment' } },
28							},
29							{
30							# e.g. * Swedish: ...
31							pattern => qr/\\:?\s([^\:]+)\:\s/,
32							parser => \&template_language,
33							},
34							{
35							# e.g. {{trans-top\|colour}}
36							pattern => qr/{{trans-top\\|([^}]+)/,
37	24			24		90	parser => sub { return { word_sense => $_[-1] } }
38							},
39
40							{
41							# e.g. {{trans-see\|apples and pears (''Cockney rhyming slang'')\|apples and pears}}
42							pattern => qr/{{trans-see\\|([^}]+)}}/,
43							parser =>\&template_trans_see,
44							},
45
46							{
47							# e.g. {{tø\|arz\|برتقان\|tr=burtuʕaan\|sc=Arab\|xs=Egyptian Arabic}}
48							pattern => qr/{{t[^\\|]*\\|([^}]+)}}/,
49							parser => \&template_t,
50							},
51							{
52							# e.g. {{ku-Arab\|[[پرته‌قاڵ]]
53							pattern => qr/{{(\w{2}\-Arab\\|[^}]+)}/,
54							parser => \&template_t,
55							},
56							{
57							# e.g. {{qualifier\|Dominican Republic\|Puerto Rico}
58							pattern => qr/{{qualifier\\|([^\}]+)}}/,
59							parser => \&template_qualifier,
60							},
61							{
62							# e.g. [[כלבא]] (kalbā’) {{m}}
63							pattern => qr/\[\[([^\]]+)\]\]\s+$([^$]+)\)\s+{{(\w)}}/,
64							parser => \&template_4,
65							},
66							{
67							# e.g. [[oranževec]] {{m}}
68							pattern => qr/\[\[([^\]]+)\]\]\s*{{(\w)}}/,
69							parser => \&template_2
70							},
71							{
72							# e.g. (narangi) {{f}}
73							pattern => qr/$([^\]]+)$\s*{{(\w)}}/,
74							parser => \&template_2
75							},
76							{
77							# e.g. [[colore de aranzu]], [[ruggiu]], [[ruiu]], [[arrubiu]]
78							pattern => qr/\[\[([^\]]+)\]\]/,
79							parser => \&template_3
80							},
81							{
82							# e.g. ''(collective)''
83							pattern => qr/\'\'([^\']+)\'\'/,
84							parser => \&template_q,
85							},
86
87							# run this last - just chop a character off if nothing else matches;
88							{
89							pattern => qr/./,
90	345			345		1053	parser => sub { return {meta => 'unparsable'} }
91							}
92
93	1551			1551	0	54428	];
94							}
95
96							sub template_q {
97	3			3	0	7	my $self = shift;
98	3					8	my $template = shift;
99	3					15	$template =~ s/[\[\]]//g;
100	3					13	return { qualifier => [$template] };
101							}
102
103							sub template_qualifier {
104	10			10	0	23	my $self = shift;
105	10					16	my $template = shift;
106	10					36	my @parts = split(/\\|/,$template);
107	10					34	$self->clean_tokens(@parts);
108	10					41	return {qualifier => \@parts}
109							}
110
111							sub template_language {
112	310			310	0	429	my $self = shift;
113	310					396	my $language = shift;
114	310					644	$self->clean_tokens($language);
115	310					1063	return { language => $language }
116							}
117
118							# parse the trans-see template into its parts
119							# http://en.wiktionary.org/wiki/Template:trans-see
120							sub template_trans_see {
121	0			0	0	0	my $self = shift;
122	0					0	my ($title) = shift;
123
124	0					0	my @params = split(/\\|/,$title,2);
125
126							# {{trans-see\|that}}
127	0	0	0			0	if ($params[0] && scalar @params == 1) {
128	0					0	$self->clean_tokens(@params);
129							return {
130	0					0	word_sense => "wiktionary:$params[0]",
131							meta => 'wiktionary_link',
132							}
133							}
134
135							# {{trans-see\|rally\|[[rally#Etymology 2\|rally]]}}
136	0	0				0	if (my $link_meta = $params[1] =~ m/\[\[([^\]]+)\]\]/) {
137	0					0	my @link_params = split(/\\|/, $1);
138	0					0	my $link;
139							# grab the last value if there's more than one entry
140	0	0				0	if (scalar @link_params > 1) {
141	0					0	$link = $link_params[-1];
142							} else {
143	0					0	$link = shift @link_params;
144							}
145
146	0					0	$link =~ s/\#.+$//;
147
148							return {
149	0					0	word_sense => "wiktionary:$link",
150							meta => 'wiktionary_link',
151							}
152							}
153
154							# {{trans-see\|Nahuatl language\|Nahuatl}}
155	0	0				0	if ($params[1]) {
156	0					0	my $link = $params[1];
157	0					0	$link =~ s/\#.+$//;
158							return {
159	0					0	word_sense => "wiktionary:$link",
160							meta => 'wiktionary_link',
161							}
162							}
163
164	0					0	return;
165							}
166
167							sub template_t {
168	327			327	0	449	my $self = shift;
169	327					422	my $template = shift;
170	327					1116	my @parts = split(/\\|/,$template);
171	327					425	my %meta;
172	327					666	$meta{language_code} = shift @parts;
173	327					456	my $translation = shift @parts;
174	327		50			640	$translation \|\|= '';
175	327					643	my @tokens = $translation =~ m/\[\[([^\]]+)\]\]/g;
176	327					516	$translation =~ s/\[\[([^\]]+)\]\]//g;
177	327					457	push @tokens, $translation;
178
179	327					495	$meta{translations} = [ grep { length($_) }@tokens ];
	334					1190
180	327					564	for my $token (@parts) {
181	376	100				2067	if ($token =~ m/^[fmcn]$/) {
		50
		100
		100
		100
		50
182							# f: fem.
183							# m: masc.
184							# c: common
185							# n: neut.
186	106					276	$meta{gender} = $token;
187							} elsif ($token =~ m/^[sp]$/) {
188							# s: singular
189							# p: plural
190	0					0	$meta{number} = $token;
191							} elsif ($token =~ m/^tr=(.+)/) {
192	104					404	$meta{transliteration} = $1;
193							} elsif ($token =~ m/^sc=(.+)/) {
194	75					267	$meta{script_code_template} = $1;
195							} elsif ($token =~ m/^alt=(.+)/) {
196	15					54	$meta{alternate} = $1;
197							} elsif ($token =~ m/^xs=(.+)/) {
198	76					341	$meta{xs} = $1;
199							}
200
201
202							}
203
204	327					998	return \%meta;
205							}
206
207							sub template_2 {
208	25			25	0	59	my $self = shift;
209	25					54	my @parts = @_;
210	25					42	my %meta;
211	25					58	my $translation = shift @parts;
212
213	25					66	my @tokens = $translation =~ m/\[\[([^\]]+)\]\]/g;
214	25					49	$translation =~ s/\[\[([^\]]+)\]\]//g;
215	25					46	push @tokens, $translation;
216	25					66	$meta{translations} = \@tokens;
217
218	25					69	$meta{gender} = shift @parts;
219	25					85	return \%meta;
220							}
221
222							sub template_3 {
223	56			56	0	89	my $self = shift;
224	56					122	my @parts = @_;
225	56					154	$self->clean_tokens(@parts);
226	56					88	my %meta;
227	56					127	$meta{translations} = \@parts;
228	56					140	return \%meta;
229							}
230
231
232							sub template_4 {
233	4			4	0	6	my $self = shift;
234	4					12	my @parts = @_;
235	4					12	$self->clean_tokens(@parts);
236	4					6	my %meta;
237
238	4					12	$meta{translations} = [shift @parts];
239	4					9	$meta{transliteration} = shift @parts;
240	4					7	my $gp = shift @parts;
241	4	100				20	if ($gp =~ m/[mfcn]/) {
		50
242	2					7	$meta{gender} = $gp;
243							} elsif ($gp =~ m/[ps]/) {
244	2					5	$meta{number} = $gp;
245							}
246
247	4					10	return \%meta;
248							}
249
250							sub clean_tokens {
251	380			380	0	400	my $self = shift;
252	380					675	for (@_) {
253	390					1269	$_ =~ s/[\[\]]//g;
254							}
255							}
256
257
258							# add a line of content to this section and parse it into its meaningful parts
259							sub add_content {
260	345			345	0	646	my $self = shift;
261	345					468	my $line = shift;
262
263							# TODO: template parsing issues
264							# {{qualifier\|...}}
265							#
266
267	345					428	my $line_copy = $line; # #$line is getting disembowled below
268	345					409	push @{$self->{content}}, $line_copy;
	345					824
269	345					1245	my $lexeme = Wiktionary::Parser::Section::Translations::WordSense::Lexeme->new();
270
271	345					839	while (my $meta = $self->get_template_match($line)) {
272	1206	50				2690	next unless defined $meta;
273
274	1206	100				2804	if ($meta->{word_sense}) {
275							# start new lexeme
276	24	50				88	if ($self->get_current_word_sense()) {
277	0					0	$self->get_current_word_sense()->add_lexeme($lexeme);
278	0					0	$lexeme = Wiktionary::Parser::Section::Translations::WordSense::Lexeme->new();
279							}
280	24					220	my $word_sense = Wiktionary::Parser::Section::Translations::WordSense->new(
281							word_sense => $meta->{word_sense},
282							);
283	24					89	$self->add_word_sense($word_sense);
284	24					72	$self->set_current_word_sense($word_sense);
285	24					145	next;
286							}
287
288	1182	100	100			3802	if ($meta->{meta} && $meta->{meta} eq 'end_word_sense') {
289	5					19	$self->set_current_word_sense(undef);
290							}
291
292	1182	100	100			3534	if ($meta->{meta} && $meta->{meta} eq 'end_segment') {
293	97	100				296	if ($self->get_current_word_sense()) {
294	96					197	$self->get_current_word_sense()->add_lexeme($lexeme);
295							}
296	97					389	my $_lexeme = Wiktionary::Parser::Section::Translations::WordSense::Lexeme->new();
297							}
298
299
300	1182	50				2388	$lexeme->add_translations(@{$meta->{translations} \|\| []}) if ($meta->{translations});
	412	100				2032
301	1182	100				2745	$lexeme->set_gender($meta->{gender}) if ($meta->{gender});
302	1182	100				2106	$lexeme->set_number($meta->{number}) if ($meta->{number});
303	1182	100				2200	$lexeme->set_transliteration($meta->{transliteration}) if ($meta->{transliteration});
304	1182	100				6218	$lexeme->set_alternate($meta->{alternate}) if ($meta->{alternate});
305
306	1182	50				2015	$lexeme->add_qualifiers(@{$meta->{qualifier} \|\| [] }) if $meta->{qualifier};
	13	100				246
307
308	1182	100				11955	$lexeme->set_language_name($meta->{language}) if $meta->{language};
309	1182	100				6312	$lexeme->set_language_code($meta->{language_code}) if $meta->{language_code};
310
311							}
312
313	345	100	66			879	if ($self->get_current_word_sense() && $lexeme) {
314	336					637	$self->get_current_word_sense()->add_lexeme($lexeme);
315							}
316							}
317							# find the template pattern that matches the beginning of the given line
318							sub get_template_match {
319	1551			1551	0	1814	my $self = shift;
320
321	1551					2916	my $templates = $self->get_templates();
322
323	1551					5196	for my $template (@$templates) {
324
325	13935					19967	my $pattern = $template->{pattern};
326	13935					16064	my $parser = $template->{parser};
327
328	13935	100				243071	if (my @matches = $_[0] =~ m/^$pattern/) {
329							# if there's a match parse extracted string into a hash of metadata
330	1206					2567	my $meta = $parser->($self,@matches);
331
332							# remove the matched piece from the line
333	1206					6876	$_[0] =~ s/$pattern//;
334	1206					3426	$_[0] =~ s/^\s*//;
335	1206					18367	return $meta;
336							}
337							}
338	345					5189	return;
339							}
340
341							sub add_word_sense {
342	24			24	0	37	my $self = shift;
343	24					43	my $word_sense = shift;
344	24					91	my $sense = $word_sense->get_word();
345	24					119	$self->{word_senses}{$sense} = $word_sense;
346							}
347
348							sub get_word_sense {
349	0			0	0	0	my $self = shift;
350	0					0	my $sense = shift;
351	0					0	return $self->{word_senses}{$sense};
352							}
353
354
355							sub get_word_senses {
356	40			40	0	61	my $self = shift;
357	40	100				43	return [values %{$self->{word_senses} \|\| {}}];
	40					249
358							}
359
360							sub set_current_word_sense {
361	29			29	0	40	my $self = shift;
362	29					66	$self->{current_word_sense} = shift;
363							}
364
365							sub get_current_word_sense {
366	898			898	0	1037	my $self = shift;
367	898					3962	return $self->{current_word_sense};
368							}
369							1;