line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::Analysis::LetterTokenizer; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Analysis::LetterTokenizer - Letter tokenizer |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# isa Plucene::Analysis::CharTokenizer |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 DESCRIPTION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
This is the letter tokenizer class, which divides text at non-letters. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Note: this does a decent job for most European languages, but does a |
16
|
|
|
|
|
|
|
terrible job for some Asian languages, where words are not separated |
17
|
|
|
|
|
|
|
by spaces. |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
20
|
|
|
|
|
|
|
|
21
|
19
|
|
|
19
|
|
119
|
use strict; |
|
19
|
|
|
|
|
37
|
|
|
19
|
|
|
|
|
626
|
|
22
|
19
|
|
|
19
|
|
107
|
use warnings; |
|
19
|
|
|
|
|
44
|
|
|
19
|
|
|
|
|
1104
|
|
23
|
|
|
|
|
|
|
|
24
|
19
|
|
|
19
|
|
100
|
use base 'Plucene::Analysis::CharTokenizer'; |
|
19
|
|
|
|
|
39
|
|
|
19
|
|
|
|
|
10955
|
|
25
|
|
|
|
|
|
|
|
26
|
143970
|
|
|
143970
|
1
|
543635
|
# Returns the token pattern as a compiled regex (qr//): one or more
# consecutive characters in the POSIX [[:alpha:]] class, so any
# non-letter character ends the current token.
# NOTE(review): presumably called by the Plucene::Analysis::CharTokenizer
# base class to extract tokens -- confirm against that module.
sub token_re { qr/[[:alpha:]]+/ } |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
1; |