line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
1136
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
44
|
|
2
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KSx::Analysis::StripAccents; |
5
|
1
|
|
|
1
|
|
15
|
use base qw( KinoSearch::Analysis::Analyzer ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1087
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.05'; |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
1383
|
use Encode qw 'encode decode'; |
|
1
|
|
|
|
|
13052
|
|
|
1
|
|
|
|
|
119
|
|
10
|
1
|
|
|
1
|
|
962
|
use Text::Unaccent 'unac_string_utf16'; |
|
1
|
|
|
|
|
2702
|
|
|
1
|
|
|
|
|
227
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub analyze_batch { |
13
|
3
|
|
|
3
|
0
|
2892
|
my ( $self, $batch ) = @_; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# lc and unaccent all of the terms, one by one |
16
|
3
|
|
|
|
|
39
|
while ( my $token = $batch->next ) { |
17
|
|
|
|
|
|
|
# I have to use UTF-16BE, since, although it’s not documented, |
18
|
|
|
|
|
|
|
# Text::Unaccent only supports big-endian. And I have to encode it, |
19
|
|
|
|
|
|
|
# since it doesn’t support Perl’s Unicode strings. (And it’ll con- |
20
|
|
|
|
|
|
|
# vert it to UTF-16 behind the scenes anyway, if I don’t.) |
21
|
1
|
|
|
1
|
|
19883
|
$token->set_text( |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
18
|
|
|
3
|
|
|
|
|
30
|
|
22
|
|
|
|
|
|
|
lc uc decode 'utf-16be', unac_string_utf16 |
23
|
|
|
|
|
|
|
encode 'UTF-16BE', $token->get_text ); |
24
|
|
|
|
|
|
|
# We have an ‘lc uc’ there, since some letters won’t be normalised |
25
|
|
|
|
|
|
|
# properly without it; e.g., ‘Σσς’ should be normalised to three |
26
|
|
|
|
|
|
|
# instances of the same character (‘σσσ’ as opposed to ‘σσς’). |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
3
|
|
|
|
|
64734
|
$batch->reset; |
30
|
3
|
|
|
|
|
37
|
return $batch; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
*transform = *analyze_batch; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
1; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
__END__ |