File Coverage

blib/lib/KSx/Analysis/StripAccents.pm

Criterion	Covered	Total	%
statement	23	23	100.0
branch			n/a
condition			n/a
subroutine	7	7	100.0
pod	0	1	0.0
total	30	31	96.7

line	stmt	sub	pod	time	code
1	1	1		1136	use strict;
	1			3
	1			44
2	1	1		6	use warnings;
	1			2
	1			48
3
4					package KSx::Analysis::StripAccents;
5	1	1		15	use base qw( KinoSearch::Analysis::Analyzer );
	1			2
	1			1087
6
7					our $VERSION = '0.05';
8
9	1	1		1383	use Encode qw 'encode decode';
	1			13052
	1			119
10	1	1		962	use Text::Unaccent 'unac_string_utf16';
	1			2702
	1			227
11
12					sub analyze_batch {
13	3	3	0	2892	my ( $self, $batch ) = @_;
14
15					# lc and unaccent all of the terms, one by one
16	3			39	while ( my $token = $batch->next ) {
17					# I have to use UTF-16BE, since, although it’s not documented,
18					# Text::Unaccent only supports big-endian. And I have to encode it,
19					# since it doesn’t support Perl’s Unicode strings. (And it’ll con-
20					# vert it to UTF-16 behind the scenes anyway, if I don’t.)
21	1	1		19883	$token->set_text(
	1			3
	1			18
	3			30
22					lc uc decode 'utf-16be', unac_string_utf16
23					encode 'UTF-16BE', $token->get_text );
24					# We have an ‘lc uc’ there, since some letters won’t be normalised
25					# properly without it; e.g., ‘Σσς’ should be normalised to three
26					# instances of the same character (‘σσσ’ as opposed to ‘σσς’).
27					}
28
29	3			64734	$batch->reset;
30	3			37	return $batch;
31					}
32
33					*transform = *analyze_batch;
34
35					1;
36
37					__END__