File Coverage

blib/lib/Text/Amuse/Preprocessor/TypographyFilters.pm
Criterion Covered Total %
statement 85 86 98.8
branch 18 32 56.2
condition n/a
subroutine 13 14 92.8
pod 6 6 100.0
total 122 138 88.4


line stmt bran cond sub pod time code
1             package Text::Amuse::Preprocessor::TypographyFilters;
2              
3 13     13   817 use strict;
  13         33  
  13         392  
4 13     13   69 use warnings;
  13         26  
  13         300  
5 13     13   699 use utf8;
  13         37  
  13         147  
6             # use Encode;
7              
8             =encoding utf8
9              
10             =head1 NAME
11              
12             Text::Amuse::Preprocessor::TypographyFilters - Text::Amuse::Preprocessor's filters
13              
14             =head1 DESCRIPTION
15              
16             Used internally by L.
17              
18             =head1 FUNCTIONS
19              
20             =head2 linkify($string);
21              
22             Activate links in $string and returns it.
23              
24             =cut
25              
26             sub linkify {
27 2040     2040 1 3634 my $l = shift;
28 2040 50       4102 return unless defined $l;
29 2040         4555 $l =~ s{(?
30             ((https?:\/\/) # protocol
31             (\w[\w\-\.]+\.\w+) # domain
32             (\:\d+)? # the port
33             (/ # a slash
34             [^\[<>\s]* # everything that is not a space, a < > and a [
35             [\w/] # but end with a letter or a slash
36             )?
37             )
38             (?!\]) # and look around
39             }{[[$1][$3]]}gx;
40 2040         4664 return $l;
41             }
42              
43              
44             =head2 characters
45              
46             Return an hashref where keys are the language codes, and the values an
47             hashref with the definition of punctuation characters. Each of them
48             has the following keys: C, C, C,
49             C, C, C, C.
50              
51             C refers to a dash between digits.
52              
53             =cut
54              
55             # EM-DASH: 2014
56             # EN-DASH: 2013
57              
58             sub characters {
59             return {
60 57     57 1 15285 en => {
61             ldouble => "\x{201c}",
62             rdouble => "\x{201d}",
63             lsingle => "\x{2018}",
64             rsingle => "\x{2019}",
65             apos => "\x{2019}",
66             emdash => "\x{2014}",
67             endash => "\x{2013}",
68             dash => "\x{2014}",
69             },
70             # esperanto same as english, for the moment
71             eo => {
72             ldouble => "\x{201c}",
73             rdouble => "\x{201d}",
74             lsingle => "\x{2018}",
75             rsingle => "\x{2019}",
76             apos => "\x{2019}",
77             emdash => "\x{2014}",
78             endash => "\x{2013}",
79             dash => "\x{2014}",
80             },
81             # „…”
82             pl => {
83             ldouble => "\x{201e}",
84             rdouble => "\x{201d}",
85             lsingle => "\x{2018}",
86             rsingle => "\x{2019}",
87             apos => "\x{2019}",
88             emdash => "\x{2014}",
89             endash => "\x{2013}",
90             dash => "\x{2014}",
91             },
92             sq => {
93             ldouble => "\x{201c}",
94             rdouble => "\x{201d}",
95             lsingle => "\x{2018}",
96             rsingle => "\x{2019}",
97             apos => "\x{2019}",
98             emdash => "\x{2014}",
99             endash => "\x{2013}",
100             dash => "\x{2014}",
101             },
102             pt => {
103             ldouble => "\x{201c}",
104             rdouble => "\x{201d}",
105             lsingle => "\x{2018}",
106             rsingle => "\x{2019}",
107             apos => "\x{2019}",
108             emdash => "\x{2013}",
109             endash => "\x{2013}",
110             dash => "\x{2014}",
111             },
112             es => {
113             ldouble => "\x{ab}",
114             rdouble => "\x{bb}",
115             lsingle => "\x{2018}",
116             rsingle => "\x{2019}",
117             apos => "\x{2019}",
118             emdash => "\x{2014}",
119             endash => "-",
120             dash => "\x{2014}",
121             },
122             fr => {
123             ldouble => "\x{ab} ",
124             rdouble => " \x{bb}",
125             lsingle => "\x{2018}",
126             rsingle => "\x{2019}",
127             apos => "\x{2019}",
128             emdash => "\x{2014}",
129             endash => "\x{2013}",
130             dash => "\x{2014}",
131             },
132              
133             # according to http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#Finnish_and_Swedish
134             sv => {
135             ldouble => "\x{201d}",
136             rdouble => "\x{201d}",
137             lsingle => "\x{2019}",
138             rsingle => "\x{2019}",
139             apos => "\x{2019}",
140             # finnish uses short dash
141             emdash => "\x{2013}",
142             endash => "-",
143             dash => "\x{2013}",
144             },
145             fi => {
146             ldouble => "\x{201d}",
147             rdouble => "\x{201d}",
148             lsingle => "\x{2019}",
149             rsingle => "\x{2019}",
150             apos => "\x{2019}",
151             # finnish uses short dash
152             emdash => "\x{2013}",
153             endash => "-",
154             dash => "\x{2013}",
155             },
156             id => {
157             ldouble => "\x{201c}",
158             rdouble => "\x{201d}",
159             lsingle => "\x{2018}",
160             rsingle => "\x{2019}",
161             apos => "\x{2019}",
162             # https://id.wikipedia.org/wiki/Tanda_pisah
163             emdash => "\x{2013}", # en dash –
164             endash => "\x{2013}", # en dash –
165             dash => "\x{2013}",
166             },
167             nl => {
168             ldouble => "\x{201c}",
169             rdouble => "\x{201d}",
170             lsingle => "\x{2018}",
171             rsingle => "\x{2019}",
172             apos => "\x{2019}",
173             emdash => "\x{2013}", # en dash –
174             endash => "-", # between numbers, use the hyphen
175             dash => "\x{2013}",
176             },
177             # like serbian
178             bg => {
179             # „članak o ’svicima’“
180             ldouble => "\x{201e}",
181             rdouble => "\x{201c}",
182             lsingle => "\x{2019}",
183             rsingle => "\x{2019}",
184             apos => "\x{2019}",
185             # serbian uses short dash.
186             emdash => "\x{2013}",
187             endash => "\x{2013}",
188             dash => "\x{2014}",
189             },
190             sr => {
191             # „članak o ’svicima’“
192             ldouble => "\x{201e}",
193             rdouble => "\x{201c}",
194             lsingle => "\x{2019}",
195             rsingle => "\x{2019}",
196             apos => "\x{2019}",
197             # serbian uses short dash.
198             emdash => "\x{2013}",
199             endash => "\x{2013}",
200             dash => "\x{2014}",
201             },
202             hr => {
203             # http://pravopis.hr/pravilo/navodnici/71/ „...” i »...«.
204             ldouble => "\x{201e}",
205             rdouble => "\x{201d}",
206             # http://pravopis.hr/pravilo/polunavodnici/73/ ‘...’
207             lsingle => "\x{2018}",
208             rsingle => "\x{2019}",
209             apos => "\x{2019}",
210             # croatian uses short dash:
211             # http://pravopis.hr/pravilo/crtica/69/
212             emdash => "\x{2013}",
213             endash => "\x{2013}",
214             dash => "\x{2014}",
215             },
216             ru => {
217             ldouble => "\x{ab}",
218             rdouble => "\x{bb}",
219             lsingle => "\x{201e}",
220             rsingle => "\x{201c}",
221             apos => "\x{2019}",
222             emdash => "\x{2014}",
223             endash => "-",
224             dash => "\x{2014}",
225             },
226             it => {
227             ldouble => "\x{201c}",
228             rdouble => "\x{201d}",
229             lsingle => "\x{2018}",
230             rsingle => "\x{2019}",
231             apos => "\x{2019}",
232             emdash => "\x{2013}",
233             endash => "-",
234             dash => "\x{2014}",
235             },
236             # Macedonian „…“ ’…‘
237             mk => {
238             ldouble => "\x{201e}",
239             rdouble => "\x{201c}",
240             lsingle => "\x{2019}",
241             rsingle => "\x{2018}",
242             apos => "\x{2019}",
243             emdash => "\x{2013}",
244             endash => "\x{2013}",
245             dash => "\x{2014}",
246             },
247             # http://de.wikipedia.org/wiki/Halbgeviertstrich
248             # http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#German_.28Germany_and_Austria.29
249             de => {
250             ldouble => "\x{201e}",
251             rdouble => "\x{201c}",
252             lsingle => "\x{201a}",
253             rsingle => "\x{2018}",
254             apos => "\x{2019}",
255             emdash => "\x{2013}",
256             endash => "\x{2013}",
257             dash => "\x{2013}",
258             },
259             # »Outer quotation ’inner’ hyphen-for-words – and a dash«
260             # (en-dash between spaces is correct)
261             da => {
262             ldouble => "\x{bb}",
263             rdouble => "\x{ab}",
264             lsingle => "\x{2019}",
265             rsingle => "\x{2019}",
266             apos => "\x{2019}",
267             emdash => "\x{2013}",
268             endash => "\x{2013}",
269             dash => "\x{2013}",
270             },
271             };
272             }
273              
274              
275             =head2 specific_filters
276              
277             Return an hashref where the key is the language codes and the value a
278             subroutine to filter the line.
279              
280             Here we put the routines which can't be abstracted away in a
281             language-indipendent fashion.
282              
283             =cut
284              
285             sub _english_specific {
286 226     226   383 my $l = shift;
287 226         938 $l =~ s!\b(\d+)(th|rd|st|nd)\b!$1$2!g;
288 226         498 return $l;
289             }
290              
291             sub specific_filters {
292             return {
293 55     55 1 250 en => \&_english_specific,
294             };
295             }
296              
297             =head2 specific_filter($lang)
298              
299             Return the specific filter for lang, if present.
300              
301             =cut
302              
303             sub specific_filter {
304 55     55 1 145 my ($lang) = @_;
305 55 50       131 return unless $lang;
306 55         122 return specific_filters->{$lang};
307             }
308              
309             =head2 filter($lang)
310              
311             Return a sub for the typographical fixes for the language $lang.
312              
313             =cut
314              
315              
316             sub filter {
317 56     56 1 1110 my ($lang) = @_;
318 56 50       145 return unless $lang;
319 56         129 my $all = characters();
320 56         185 my $chars = $all->{$lang};
321 56 50       145 return unless $chars;
322              
323             # copy to avoid typos
324 56 50       133 my $ldouble = $chars->{ldouble} or die;
325 56 50       141 my $rdouble = $chars->{rdouble} or die;
326 56 50       140 my $lsingle = $chars->{lsingle} or die;
327 56 50       126 my $rsingle = $chars->{rsingle} or die;
328 56 50       129 my $apos = $chars->{apos} or die;
329 56 50       143 my $emdash = $chars->{emdash} or die;
330 56 50       149 my $endash = $chars->{endash} or die;
331 56 50       164 my $dash = $chars->{dash} or die;
332             my $filter = sub {
333 2149     2149   3541 my $l = shift;
334              
335             # if there is nothing to do, speed up.
336 2149 100       6563 return $l unless $l =~ /['"`-]/;
337              
338             # first, consider `` and '' opening and closing doubles
339 1266         2657 $l =~ s/``/$ldouble/g;
340              
341 1266         2372 $l =~ s/`/$lsingle/g;
342              
343             # but set it as ", we'll replace that later
344 1266         2497 $l =~ s/''/"/g;
345              
346             # beginning of the line, long dash
347 1266         2394 $l =~ s/^-(?=\s)/$dash/gm;
348              
349             # between spaces, just replace
350 1266         3254 $l =~ s/(?<=\S)(\x{20}+)-{1,3}(\x{20}+)(?=\S)/$1$emdash$2/g;
351              
352             # end of line with
353 1266         3410 $l =~ s/(?<=\S) +-{1,3}$/ $emdash/gm;
354              
355             # -word and word-, in the middle of a line
356 1266         2728 $l =~ s/(?<=\S)(\x{20}+)-(\w.+?\w)-(?=\x{20})/$1$emdash $2 $emdash/g;
357              
358             # an opening before two digits *probably* is an apostrophe.
359             # Very common case.
360 1266         3008 $l =~ s/'(?=\d\d\b)/$apos/g;
361              
362             # if it touches a word on the right, and on the left there is not a
363             # word, it's an opening quote
364 1266         3727 $l =~ s/(?<=\W)"(?=\w)/$ldouble/g;
365 1266         3536 $l =~ s/(?<=\W)'(?=\w)/$lsingle/g;
366              
367             # beginning of line, opening
368 1266         2795 $l =~ s/^"/$ldouble/gm;
369 1266         2534 $l =~ s/^'/$lsingle/gm;
370              
371             # end of line/chunk, closing
372 1266         3908 $l =~ s/"( *)$/$rdouble$1/gm;
373 1266         3229 $l =~ s/'( *)$/$rsingle$1/gm;
374              
375             # if there is a space at the left, it's opening
376 1266         2833 $l =~ s/(?<=\s)"/$ldouble/g;
377 1266         2550 $l =~ s/(?<=\s)'/$lsingle/g;
378              
379             # print encode('UTF-8', "**** $l");
380              
381             # apostrophes, between non-white material, probably
382 1266         2823 $l =~ s/(?<=\w)'(?=\w)/$apos/g;
383              
384             # print encode('UTF-8', "**** $l");
385              
386             # or before a left quote
387 1266         4754 $l =~ s/(?<=\w)'(\Q$lsingle\E)/$apos$1/g;
388 1266         3979 $l =~ s/(?<=\w)'(\Q$ldouble\E)/$apos$1/g;
389              
390             # print encode('UTF-8', "**** $l");
391              
392             # word at the left, closing
393 1266         3429 $l =~ s/(?<=\w)"(?=\W)/$rdouble/g;
394 1266         3250 $l =~ s/(?<=\w)'(?=\W)/$rsingle/g;
395              
396              
397             # the others are right quotes, hopefully
398 1266         2513 $l =~ s/"/$rdouble/gs;
399 1266         2320 $l =~ s/'/$rsingle/g;
400              
401             # replace with an endash, but only if between digits and not
402             # in the middle of something
403 1266         3288 $l =~ s/(?
404              
405 1266         3729 return $l;
406 56         309 };
407 56         783 return $filter;
408             }
409              
410             sub _nbsp_filters {
411 0     0   0 return { ru => \&_ru_nbsp_filter };
412             }
413              
414             sub _nbsp_specs {
415             return {
416             # to read: add a space ...
417 15     15   266 ru => {
418             before_words => [
419             "\x{2013}", "\x{2014}", "\x{2212}",
420             "б", "ж", "ли", "же", "ль", "бы", "бы,", "же",
421             ],
422             after_digit_before_words => [
423             "января", "февраля",
424             "марта", "апреля",
425             "мая", "июня", "июля",
426             "августа", "сентября",
427             "октября", "ноября",
428             "декабря", "г", "кг",
429             "мм", "дм", "см", "м",
430             "км", "л", "В", "А",
431             "ВТ", "W", "°C",
432             ],
433             after_words => [
434             "в", "к", "о", "с", "у",
435             "В", "К", "О", "С", "У",
436             "на", "от", "об", "из", "за", "по", "до", "во",
437             "та", "ту", "то", "те", "ко", "со",
438             "На", "От", "Об", "Из", "За", "По", "До", "Во",
439             "Ко", "Та", "Ту", "То", "Те", "Со",
440             "А", "А,", "а", "а,",
441             "И", "И,", "и", "и,",
442             "но", "но,", "Но", "Но,",
443             "да", "да,", "Да", "Да,",
444             "не", "ни", "Не", "Ни",
445             "ну", "ну,", "Ну", "Ну,",
446             "с.", "ч.", "см.", "См.",
447             "им.", "Им.","т.", "п."
448             ]
449             },
450             };
451             }
452              
453             =head2 nbsp_filter($lang)
454              
455             Return a sub (if the filter exists) to place non-breaking spaces in
456             language-specific places.
457              
458             =cut
459              
460             sub nbsp_filter {
461 15     15 1 37 my ($lang) = @_;
462 15 50       38 return unless $lang;
463 15         40 my $specs = _nbsp_specs()->{$lang};
464 15 100       88 return unless $specs;
465 5         10 my @patterns;
466 5         9 foreach my $token (@{ $specs->{before_words} }) {
  5         19  
467 55         1114 push @patterns, [
468             qr/(?<=\S)
469             \s+
470             \Q$token\E
471             (?=\W|$)/xm,
472             "\x{a0}$token"
473             ];
474             }
475 5         14 foreach my $token (@{ $specs->{after_digit_before_words} }) {
  5         17  
476 125         2292 push @patterns, [
477             qr/(?<=\d)
478             \s+
479             \Q$token\E
480             (?=\W|$)
481             /xm,
482             "\x{a0}$token"
483             ];
484             }
485 5         14 foreach my $token (@{ $specs->{after_words} }) {
  5         17  
486 350         4754 push @patterns, [
487             qr/\b
488             \Q$token\E
489             \s+
490             (?=\S|$)
491             /xm,
492             "$token\x{a0}"
493             ];
494             }
495 5 50       20 return unless (@patterns);
496             return sub {
497 173     173   289 my $l = shift;
498 173         327 foreach my $pattern (@patterns) {
499 18338         33661 my ($from, $to) = @$pattern;
500 18338         47433 $l =~ s/$from/$to/g;
501             }
502 173         494 return $l;
503 5         56 };
504             }
505              
506              
507              
508             1;