File Coverage

blib/lib/Text/Amuse/Preprocessor/TypographyFilters.pm

Criterion	Covered	Total	%
statement	85	86	98.8
branch	18	32	56.2
condition			n/a
subroutine	13	14	92.8
pod	6	6	100.0
total	122	138	88.4

line	stmt	bran	sub	pod	time	code
1						package Text::Amuse::Preprocessor::TypographyFilters;
2
3	13		13		817	use strict;
	13				33
	13				392
4	13		13		69	use warnings;
	13				26
	13				300
5	13		13		699	use utf8;
	13				37
	13				147
6						# use Encode;
7
8						=encoding utf8
9
10						=head1 NAME
11
12						Text::Amuse::Preprocessor::TypographyFilters - Text::Amuse::Preprocessor's filters
13
14						=head1 DESCRIPTION
15
16						Used internally by L.
17
18						=head1 FUNCTIONS
19
20						=head2 linkify($string);
21
22						Activate links in $string and returns it.
23
24						=cut
25
26						sub linkify {
27	2040		2040	1	3634	my $l = shift;
28	2040	50			4102	return unless defined $l;
29	2040				4555	$l =~ s{(?
30						((https?:\/\/) # protocol
31						(\w[\w\-\.]+\.\w+) # domain
32						(\:\d+)? # the port
33						(/ # a slash
34						[^\[<>\s]* # everything that is not a space, a < > and a [
35						[\w/] # but end with a letter or a slash
36						)?
37						)
38						(?!\]) # and look around
39						}{[[$1][$3]]}gx;
40	2040				4664	return $l;
41						}
42
43
44						=head2 characters
45
46						Return an hashref where keys are the language codes, and the values an
47						hashref with the definition of punctuation characters. Each of them
48						has the following keys: C, C, C,
49						C, C, C, C.
50
51						C refers to a dash between digits.
52
53						=cut
54
55						# EM-DASH: 2014
56						# EN-DASH: 2013
57
58						sub characters {
59						return {
60	57		57	1	15285	en => {
61						ldouble => "\x{201c}",
62						rdouble => "\x{201d}",
63						lsingle => "\x{2018}",
64						rsingle => "\x{2019}",
65						apos => "\x{2019}",
66						emdash => "\x{2014}",
67						endash => "\x{2013}",
68						dash => "\x{2014}",
69						},
70						# esperanto same as english, for the moment
71						eo => {
72						ldouble => "\x{201c}",
73						rdouble => "\x{201d}",
74						lsingle => "\x{2018}",
75						rsingle => "\x{2019}",
76						apos => "\x{2019}",
77						emdash => "\x{2014}",
78						endash => "\x{2013}",
79						dash => "\x{2014}",
80						},
81						# „…”
82						pl => {
83						ldouble => "\x{201e}",
84						rdouble => "\x{201d}",
85						lsingle => "\x{2018}",
86						rsingle => "\x{2019}",
87						apos => "\x{2019}",
88						emdash => "\x{2014}",
89						endash => "\x{2013}",
90						dash => "\x{2014}",
91						},
92						sq => {
93						ldouble => "\x{201c}",
94						rdouble => "\x{201d}",
95						lsingle => "\x{2018}",
96						rsingle => "\x{2019}",
97						apos => "\x{2019}",
98						emdash => "\x{2014}",
99						endash => "\x{2013}",
100						dash => "\x{2014}",
101						},
102						pt => {
103						ldouble => "\x{201c}",
104						rdouble => "\x{201d}",
105						lsingle => "\x{2018}",
106						rsingle => "\x{2019}",
107						apos => "\x{2019}",
108						emdash => "\x{2013}",
109						endash => "\x{2013}",
110						dash => "\x{2014}",
111						},
112						es => {
113						ldouble => "\x{ab}",
114						rdouble => "\x{bb}",
115						lsingle => "\x{2018}",
116						rsingle => "\x{2019}",
117						apos => "\x{2019}",
118						emdash => "\x{2014}",
119						endash => "-",
120						dash => "\x{2014}",
121						},
122						fr => {
123						ldouble => "\x{ab} ",
124						rdouble => " \x{bb}",
125						lsingle => "\x{2018}",
126						rsingle => "\x{2019}",
127						apos => "\x{2019}",
128						emdash => "\x{2014}",
129						endash => "\x{2013}",
130						dash => "\x{2014}",
131						},
132
133						# according to http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#Finnish_and_Swedish
134						sv => {
135						ldouble => "\x{201d}",
136						rdouble => "\x{201d}",
137						lsingle => "\x{2019}",
138						rsingle => "\x{2019}",
139						apos => "\x{2019}",
140						# finnish uses short dash
141						emdash => "\x{2013}",
142						endash => "-",
143						dash => "\x{2013}",
144						},
145						fi => {
146						ldouble => "\x{201d}",
147						rdouble => "\x{201d}",
148						lsingle => "\x{2019}",
149						rsingle => "\x{2019}",
150						apos => "\x{2019}",
151						# finnish uses short dash
152						emdash => "\x{2013}",
153						endash => "-",
154						dash => "\x{2013}",
155						},
156						id => {
157						ldouble => "\x{201c}",
158						rdouble => "\x{201d}",
159						lsingle => "\x{2018}",
160						rsingle => "\x{2019}",
161						apos => "\x{2019}",
162						# https://id.wikipedia.org/wiki/Tanda_pisah
163						emdash => "\x{2013}", # en dash –
164						endash => "\x{2013}", # en dash –
165						dash => "\x{2013}",
166						},
167						nl => {
168						ldouble => "\x{201c}",
169						rdouble => "\x{201d}",
170						lsingle => "\x{2018}",
171						rsingle => "\x{2019}",
172						apos => "\x{2019}",
173						emdash => "\x{2013}", # en dash –
174						endash => "-", # between numbers, use the hyphen
175						dash => "\x{2013}",
176						},
177						# like serbian
178						bg => {
179						# „članak o ’svicima’“
180						ldouble => "\x{201e}",
181						rdouble => "\x{201c}",
182						lsingle => "\x{2019}",
183						rsingle => "\x{2019}",
184						apos => "\x{2019}",
185						# serbian uses short dash.
186						emdash => "\x{2013}",
187						endash => "\x{2013}",
188						dash => "\x{2014}",
189						},
190						sr => {
191						# „članak o ’svicima’“
192						ldouble => "\x{201e}",
193						rdouble => "\x{201c}",
194						lsingle => "\x{2019}",
195						rsingle => "\x{2019}",
196						apos => "\x{2019}",
197						# serbian uses short dash.
198						emdash => "\x{2013}",
199						endash => "\x{2013}",
200						dash => "\x{2014}",
201						},
202						hr => {
203						# http://pravopis.hr/pravilo/navodnici/71/ „...” i »...«.
204						ldouble => "\x{201e}",
205						rdouble => "\x{201d}",
206						# http://pravopis.hr/pravilo/polunavodnici/73/ ‘...’
207						lsingle => "\x{2018}",
208						rsingle => "\x{2019}",
209						apos => "\x{2019}",
210						# croatian uses short dash:
211						# http://pravopis.hr/pravilo/crtica/69/
212						emdash => "\x{2013}",
213						endash => "\x{2013}",
214						dash => "\x{2014}",
215						},
216						ru => {
217						ldouble => "\x{ab}",
218						rdouble => "\x{bb}",
219						lsingle => "\x{201e}",
220						rsingle => "\x{201c}",
221						apos => "\x{2019}",
222						emdash => "\x{2014}",
223						endash => "-",
224						dash => "\x{2014}",
225						},
226						it => {
227						ldouble => "\x{201c}",
228						rdouble => "\x{201d}",
229						lsingle => "\x{2018}",
230						rsingle => "\x{2019}",
231						apos => "\x{2019}",
232						emdash => "\x{2013}",
233						endash => "-",
234						dash => "\x{2014}",
235						},
236						# Macedonian „…“ ’…‘
237						mk => {
238						ldouble => "\x{201e}",
239						rdouble => "\x{201c}",
240						lsingle => "\x{2019}",
241						rsingle => "\x{2018}",
242						apos => "\x{2019}",
243						emdash => "\x{2013}",
244						endash => "\x{2013}",
245						dash => "\x{2014}",
246						},
247						# http://de.wikipedia.org/wiki/Halbgeviertstrich
248						# http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#German_.28Germany_and_Austria.29
249						de => {
250						ldouble => "\x{201e}",
251						rdouble => "\x{201c}",
252						lsingle => "\x{201a}",
253						rsingle => "\x{2018}",
254						apos => "\x{2019}",
255						emdash => "\x{2013}",
256						endash => "\x{2013}",
257						dash => "\x{2013}",
258						},
259						# »Outer quotation ’inner’ hyphen-for-words – and a dash«
260						# (en-dash between spaces is correct)
261						da => {
262						ldouble => "\x{bb}",
263						rdouble => "\x{ab}",
264						lsingle => "\x{2019}",
265						rsingle => "\x{2019}",
266						apos => "\x{2019}",
267						emdash => "\x{2013}",
268						endash => "\x{2013}",
269						dash => "\x{2013}",
270						},
271						};
272						}
273
274
275						=head2 specific_filters
276
277						Return an hashref where the key is the language codes and the value a
278						subroutine to filter the line.
279
280						Here we put the routines which can't be abstracted away in a
281						language-indipendent fashion.
282
283						=cut
284
285						sub _english_specific {
286	226		226		383	my $l = shift;
287	226				938	$l =~ s!\b(\d+)(th\|rd\|st\|nd)\b!$1^$2!g;
288	226				498	return $l;
289						}
290
291						sub specific_filters {
292						return {
293	55		55	1	250	en => \&_english_specific,
294						};
295						}
296
297						=head2 specific_filter($lang)
298
299						Return the specific filter for lang, if present.
300
301						=cut
302
303						sub specific_filter {
304	55		55	1	145	my ($lang) = @_;
305	55	50			131	return unless $lang;
306	55				122	return specific_filters->{$lang};
307						}
308
309						=head2 filter($lang)
310
311						Return a sub for the typographical fixes for the language $lang.
312
313						=cut
314
315
316						sub filter {
317	56		56	1	1110	my ($lang) = @_;
318	56	50			145	return unless $lang;
319	56				129	my $all = characters();
320	56				185	my $chars = $all->{$lang};
321	56	50			145	return unless $chars;
322
323						# copy to avoid typos
324	56	50			133	my $ldouble = $chars->{ldouble} or die;
325	56	50			141	my $rdouble = $chars->{rdouble} or die;
326	56	50			140	my $lsingle = $chars->{lsingle} or die;
327	56	50			126	my $rsingle = $chars->{rsingle} or die;
328	56	50			129	my $apos = $chars->{apos} or die;
329	56	50			143	my $emdash = $chars->{emdash} or die;
330	56	50			149	my $endash = $chars->{endash} or die;
331	56	50			164	my $dash = $chars->{dash} or die;
332						my $filter = sub {
333	2149		2149		3541	my $l = shift;
334
335						# if there is nothing to do, speed up.
336	2149	100			6563	return $l unless $l =~ /['"`-]/;
337
338						# first, consider `` and '' opening and closing doubles
339	1266				2657	$l =~ s/``/$ldouble/g;
340
341	1266				2372	$l =~ s/`/$lsingle/g;
342
343						# but set it as ", we'll replace that later
344	1266				2497	$l =~ s/''/"/g;
345
346						# beginning of the line, long dash
347	1266				2394	$l =~ s/^-(?=\s)/$dash/gm;
348
349						# between spaces, just replace
350	1266				3254	$l =~ s/(?<=\S)(\x{20}+)-{1,3}(\x{20}+)(?=\S)/$1$emdash$2/g;
351
352						# end of line with
353	1266				3410	$l =~ s/(?<=\S) +-{1,3}$/ $emdash/gm;
354
355						# -word and word-, in the middle of a line
356	1266				2728	$l =~ s/(?<=\S)(\x{20}+)-(\w.+?\w)-(?=\x{20})/$1$emdash $2 $emdash/g;
357
358						# an opening before two digits probably is an apostrophe.
359						# Very common case.
360	1266				3008	$l =~ s/'(?=\d\d\b)/$apos/g;
361
362						# if it touches a word on the right, and on the left there is not a
363						# word, it's an opening quote
364	1266				3727	$l =~ s/(?<=\W)"(?=\w)/$ldouble/g;
365	1266				3536	$l =~ s/(?<=\W)'(?=\w)/$lsingle/g;
366
367						# beginning of line, opening
368	1266				2795	$l =~ s/^"/$ldouble/gm;
369	1266				2534	$l =~ s/^'/$lsingle/gm;
370
371						# end of line/chunk, closing
372	1266				3908	$l =~ s/"( *)$/$rdouble$1/gm;
373	1266				3229	$l =~ s/'( *)$/$rsingle$1/gm;
374
375						# if there is a space at the left, it's opening
376	1266				2833	$l =~ s/(?<=\s)"/$ldouble/g;
377	1266				2550	$l =~ s/(?<=\s)'/$lsingle/g;
378
379						# print encode('UTF-8', "**** $l");
380
381						# apostrophes, between non-white material, probably
382	1266				2823	$l =~ s/(?<=\w)'(?=\w)/$apos/g;
383
384						# print encode('UTF-8', "**** $l");
385
386						# or before a left quote
387	1266				4754	$l =~ s/(?<=\w)'(\Q$lsingle\E)/$apos$1/g;
388	1266				3979	$l =~ s/(?<=\w)'(\Q$ldouble\E)/$apos$1/g;
389
390						# print encode('UTF-8', "**** $l");
391
392						# word at the left, closing
393	1266				3429	$l =~ s/(?<=\w)"(?=\W)/$rdouble/g;
394	1266				3250	$l =~ s/(?<=\w)'(?=\W)/$rsingle/g;
395
396
397						# the others are right quotes, hopefully
398	1266				2513	$l =~ s/"/$rdouble/gs;
399	1266				2320	$l =~ s/'/$rsingle/g;
400
401						# replace with an endash, but only if between digits and not
402						# in the middle of something
403	1266				3288	$l =~ s/(?
404
405	1266				3729	return $l;
406	56				309	};
407	56				783	return $filter;
408						}
409
410						sub _nbsp_filters {
411	0		0		0	return { ru => \&_ru_nbsp_filter };
412						}
413
414						sub _nbsp_specs {
415						return {
416						# to read: add a space ...
417	15		15		266	ru => {
418						before_words => [
419						"\x{2013}", "\x{2014}", "\x{2212}",
420						"б", "ж", "ли", "же", "ль", "бы", "бы,", "же",
421						],
422						after_digit_before_words => [
423						"января", "февраля",
424						"марта", "апреля",
425						"мая", "июня", "июля",
426						"августа", "сентября",
427						"октября", "ноября",
428						"декабря", "г", "кг",
429						"мм", "дм", "см", "м",
430						"км", "л", "В", "А",
431						"ВТ", "W", "°C",
432						],
433						after_words => [
434						"в", "к", "о", "с", "у",
435						"В", "К", "О", "С", "У",
436						"на", "от", "об", "из", "за", "по", "до", "во",
437						"та", "ту", "то", "те", "ко", "со",
438						"На", "От", "Об", "Из", "За", "По", "До", "Во",
439						"Ко", "Та", "Ту", "То", "Те", "Со",
440						"А", "А,", "а", "а,",
441						"И", "И,", "и", "и,",
442						"но", "но,", "Но", "Но,",
443						"да", "да,", "Да", "Да,",
444						"не", "ни", "Не", "Ни",
445						"ну", "ну,", "Ну", "Ну,",
446						"с.", "ч.", "см.", "См.",
447						"им.", "Им.","т.", "п."
448						]
449						},
450						};
451						}
452
453						=head2 nbsp_filter($lang)
454
455						Return a sub (if the filter exists) to place non-breaking spaces in
456						language-specific places.
457
458						=cut
459
460						sub nbsp_filter {
461	15		15	1	37	my ($lang) = @_;
462	15	50			38	return unless $lang;
463	15				40	my $specs = _nbsp_specs()->{$lang};
464	15	100			88	return unless $specs;
465	5				10	my @patterns;
466	5				9	foreach my $token (@{ $specs->{before_words} }) {
	5				19
467	55				1114	push @patterns, [
468						qr/(?<=\S)
469						\s+
470						\Q$token\E
471						(?=\W\|$)/xm,
472						"\x{a0}$token"
473						];
474						}
475	5				14	foreach my $token (@{ $specs->{after_digit_before_words} }) {
	5				17
476	125				2292	push @patterns, [
477						qr/(?<=\d)
478						\s+
479						\Q$token\E
480						(?=\W\|$)
481						/xm,
482						"\x{a0}$token"
483						];
484						}
485	5				14	foreach my $token (@{ $specs->{after_words} }) {
	5				17
486	350				4754	push @patterns, [
487						qr/\b
488						\Q$token\E
489						\s+
490						(?=\S\|$)
491						/xm,
492						"$token\x{a0}"
493						];
494						}
495	5	50			20	return unless (@patterns);
496						return sub {
497	173		173		289	my $l = shift;
498	173				327	foreach my $pattern (@patterns) {
499	18338				33661	my ($from, $to) = @$pattern;
500	18338				47433	$l =~ s/$from/$to/g;
501						}
502	173				494	return $l;
503	5				56	};
504						}
505
506
507
508						1;