| line | stmt | bran | cond | sub | pod | time | code | 
| 1 | 1 |  |  | 1 |  | 1136 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 44 |  | 
| 2 | 1 |  |  | 1 |  | 6 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 48 |  | 
| 3 |  |  |  |  |  |  |  | 
| 4 |  |  |  |  |  |  | package KSx::Analysis::StripAccents; | 
| 5 | 1 |  |  | 1 |  | 15 | use base qw( KinoSearch::Analysis::Analyzer ); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 1087 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | our $VERSION = '0.05'; | 
| 8 |  |  |  |  |  |  |  | 
| 9 | 1 |  |  | 1 |  | 1383 | use Encode qw 'encode decode'; | 
|  | 1 |  |  |  |  | 13052 |  | 
|  | 1 |  |  |  |  | 119 |  | 
| 10 | 1 |  |  | 1 |  | 962 | use Text::Unaccent 'unac_string_utf16'; | 
|  | 1 |  |  |  |  | 2702 |  | 
|  | 1 |  |  |  |  | 227 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | sub analyze_batch { | 
| 13 | 3 |  |  | 3 | 0 | 2892 | my ( $self, $batch ) = @_; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | # lc and unaccent all of the terms, one by one | 
| 16 | 3 |  |  |  |  | 39 | while ( my $token = $batch->next ) { | 
| 17 |  |  |  |  |  |  | # I have to use UTF-16BE, since, although it’s not documented, | 
| 18 |  |  |  |  |  |  | # Text::Unaccent only supports big-endian. And I have to encode it, | 
| 19 |  |  |  |  |  |  | # since it doesn’t support Perl’s Unicode strings. (And it’ll con- | 
| 20 |  |  |  |  |  |  | # vert it to UTF-16 behind the scenes anyway, if I don’t.) | 
| 21 | 1 |  |  | 1 |  | 19883 | $token->set_text( | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 18 |  | 
|  | 3 |  |  |  |  | 30 |  | 
| 22 |  |  |  |  |  |  | lc uc decode 'utf-16be', unac_string_utf16 | 
| 23 |  |  |  |  |  |  | encode 'UTF-16BE', $token->get_text ); | 
| 24 |  |  |  |  |  |  | # We have an ‘lc uc’ there, since some letters won’t be normalised | 
| 25 |  |  |  |  |  |  | # properly without it; e.g., ‘Σσς’ should be normalised to three | 
| 26 |  |  |  |  |  |  | # instances of the same character (‘σσσ’ as opposed to ‘σσς’). | 
| 27 |  |  |  |  |  |  | } | 
| 28 |  |  |  |  |  |  |  | 
| 29 | 3 |  |  |  |  | 64734 | $batch->reset; | 
| 30 | 3 |  |  |  |  | 37 | return $batch; | 
| 31 |  |  |  |  |  |  | } | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | *transform = *analyze_batch; | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | 1; | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | __END__ |