File Coverage

blib/lib/KSx/Analysis/StripAccents.pm
Criterion Covered Total %
statement 23 23 100.0
branch n/a
condition n/a
subroutine 7 7 100.0
pod 0 1 0.0
total 30 31 96.7


line stmt bran cond sub pod time code
1 1     1   1136 use strict;
  1         3  
  1         44  
2 1     1   6 use warnings;
  1         2  
  1         48  
3              
4             package KSx::Analysis::StripAccents;
5 1     1   15 use base qw( KinoSearch::Analysis::Analyzer );
  1         2  
  1         1087  
6              
7             our $VERSION = '0.05';
8              
9 1     1   1383 use Encode qw 'encode decode';
  1         13052  
  1         119  
10 1     1   962 use Text::Unaccent 'unac_string_utf16';
  1         2702  
  1         227  
11              
12             sub analyze_batch {
13 3     3 0 2892 my ( $self, $batch ) = @_;
14              
15             # lc and unaccent all of the terms, one by one
16 3         39 while ( my $token = $batch->next ) {
17             # I have to use UTF-16BE, since, although it’s not documented,
18             # Text::Unaccent only supports big-endian. And I have to encode it,
19             # since it doesn’t support Perl’s Unicode strings. (And it’ll con-
20             # vert it to UTF-16 behind the scenes anyway, if I don’t.)
21 1     1   19883 $token->set_text(
  1         3  
  1         18  
  3         30  
22             lc uc decode 'utf-16be', unac_string_utf16
23             encode 'UTF-16BE', $token->get_text );
24             # We have an ‘lc uc’ there, since some letters won’t be normalised
25             # properly without it; e.g., ‘Σσς’ should be normalised to three
26             # instances of the same character (‘σσσ’ as opposed to ‘σσς’).
27             }
28              
29 3         64734 $batch->reset;
30 3         37 return $batch;
31             }
32              
33             *transform = *analyze_batch;
34              
35             1;
36              
37             __END__