File Coverage

blib/lib/Search/Tokenizer.pm
Criterion Covered Total %
statement 62 68 91.1
branch 20 28 71.4
condition 5 8 62.5
subroutine 8 11 72.7
pod 4 5 80.0
total 99 120 82.5


line stmt bran cond sub pod time code
1             package Search::Tokenizer;
2 1     1   78408 use warnings;
  1         3  
  1         39  
3 1     1   6 use strict;
  1         2  
  1         19  
4 1     1   520 use Unicode::CaseFold ();
  1         1080  
  1         477  
5              
6             our $VERSION = '1.02';
7              
8             sub new {
9 2     2 0 682 my $class = shift;
10              
11             # defaults
12 2         10 my $regex = qr/\w+/;
13 2         6 my $lower = 1;
14 2         6 my $filter = undef;
15 2         4 my $filter_in_place = undef;
16 2         5 my $stopwords = undef;
17              
18             # parse arguments
19 2 50       13 unshift @_, "regex" if @_ == 1; # positional API
20 2         10 while (my $arg = shift) {
21 7         11 my $val = shift;
22 7   100     30 $arg .= "=>" . (ref($val) || "NOREF");
23 7         15 for ($arg) {
24 7 100       23 /^regex=>Regexp$/ and do { $regex = $val; last};
  2         10  
  2         10  
25 5 100       14 /^lower=>NOREF$/ and do { $lower = !!$val; last};
  1         3  
  1         3  
26 4 50       13 /^filter=>CODE$/ and do { $filter = $val; last};
  0         0  
  0         0  
27 4 100       14 /^filter_in_place=>CODE$/ and do { $filter_in_place = $val; last};
  2         6  
  2         6  
28 2 50       10 /^stopwords=>HASH$/ and do { $stopwords = $val; last};
  2         3  
  2         6  
29 0         0 die "Invalid option or invalid operand: $arg";
30             }
31             }
32              
33             # check that regex doest not match the empty string
34 2 50       14 not "" =~ $regex
35             or die "regex $regex matches the empty string: cannot tokenize";
36              
37             # return tokenizer factory: closure
38             return sub {
39 5     5   2383 my $string = shift;
40 5         10 my $term_index = -1;
41              
42             # return tokenizer : additional closure on $string and $term_index
43             return sub {
44              
45             # get next occurrence of $regex in $string (thanks to the /g flag)
46 3026         21341 while ($string =~ /$regex/g) {
47              
48             # index of this term within the input string
49 3040         4629 $term_index += 1;
50              
51             # boundaries for the match
52 3040         4460 my $end = pos($string);
53 3040         5081 my $term = $&; # used to be slow in older perls, but now OK
54 3040         4680 my $start = $end - length($term);
55              
56             # the old way used to be as follows, but it is ridiculously slow on utf8 strings
57             # .. see https://github.com/Perl/perl5/issues/18786
58             #
59             # my ($start, $end) = ($-[0], $+[0]);
60             # my $term = substr($string, $start, $end-$start);
61              
62             # apply filtering and stopwords, if any
63 3040 100       7293 $term = Unicode::CaseFold::fc($term) if $lower;
64 3040 50       19762 $term = $filter->($term) if $filter;
65 3040 50       54267 $filter_in_place->($term) if $filter_in_place;
66 3040 100 66     20991 undef $term if $stopwords and $stopwords->{$term};
67              
68             # if $term was not cancelled by filters above, return it
69 3040 100       5191 if ($term) {
70 3021 50       11756 return wantarray ? ($term, length($term), $start, $end, $term_index)
71             : $term;
72             }
73             } # otherwise, loop again to extract next term
74              
75             # otherwise, no more term in input string, return undef or empty list
76 5         12 return;
77 5         52 };
78 2         20 };
79             }
80              
81             sub word {
82 0     0 1 0 __PACKAGE__->new(regex => qr/\w+/, @_);
83             }
84              
85             sub word_locale {
86 1     1   562 use locale;
  1         622  
  1         5  
87 0     0 1 0 __PACKAGE__->new(regex => qr/\w+/, @_);
88             }
89              
90             sub word_unicode {
91 1     1 1 783 __PACKAGE__->new(regex => qr/\p{Word}+/, @_);
  1     0   15  
  1         16  
  0         0  
92             }
93              
94             sub unaccent {
95 1     1 1 1831 require Text::Transliterator::Unaccent;
96 1         41170 my %args = @_;
97 1   33     8 my $want_lower = !exists $args{lower} || $args{lower};
98 1 50       5 my %unaccenter_args = $want_lower ? () : (upper => 0);
99 1         10 my $unaccenter = Text::Transliterator::Unaccent->new(%unaccenter_args);
100 1         40022 __PACKAGE__->new(regex => qr/\p{Word}+/,
101             filter_in_place => $unaccenter,
102             %args);
103             }
104              
105              
106             1; # End of Search::Tokenizer
107              
108              
109             __END__