File Coverage

blib/lib/Search/Tokenizer.pm
Criterion Covered Total %
statement 61 67 91.0
branch 20 28 71.4
condition 5 8 62.5
subroutine 8 11 72.7
pod 4 5 80.0
total 98 119 82.3


line stmt bran cond sub pod time code
1             package Search::Tokenizer;
2 1     1   62126 use warnings;
  1         3  
  1         34  
3 1     1   16 use strict;
  1         2  
  1         35  
4 1     1   1021 use Unicode::CaseFold ();
  1         1205  
  1         559  
5              
6             our $VERSION = '1.01';
7              
8             sub new {
9 2     2 0 398 my $class = shift;
10              
11             # defaults
12 2         11 my $regex = qr/\w+/;
13 2         6 my $lower = 1;
14 2         7 my $filter = undef;
15 2         5 my $filter_in_place = undef;
16 2         6 my $stopwords = undef;
17              
18             # parse arguments
19 2 50       11 unshift @_, "regex" if @_ == 1; # positional API
20 2         11 while (my $arg = shift) {
21 7         12 my $val = shift;
22 7   100     33 $arg .= "=>" . (ref($val) || "NOREF");
23 7         18 for ($arg) {
24 7 100       30 /^regex=>Regexp$/ and do { $regex = $val; last};
  2         5  
  2         15  
25 5 100       19 /^lower=>NOREF$/ and do { $lower = !!$val; last};
  1         4  
  1         5  
26 4 50       15 /^filter=>CODE$/ and do { $filter = $val; last};
  0         0  
  0         0  
27 4 100       16 /^filter_in_place=>CODE$/ and do { $filter_in_place = $val; last};
  2         5  
  2         7  
28 2 50       14 /^stopwords=>HASH$/ and do { $stopwords = $val; last};
  2         5  
  2         8  
29 0         0 die "Invalid option or invalid operand: $arg";
30             }
31             }
32              
33             # check that regex doest not match the empty string
34 2 50       14 not "" =~ $regex
35             or die "regex $regex matches the empty string: cannot tokenize";
36              
37             # return tokenizer factory: closure
38             return sub {
39 3     3   1290 my $string = shift;
40 3         7 my $term_index = -1;
41              
42             # return tokenizer : additional closure on $string and $term_index
43             return sub {
44              
45             # get next occurrence of $regex in $string (thanks to the /g flag)
46 10         160 while ($string =~ /$regex/g) {
47              
48             # index of this term within the input string
49 12         19 $term_index += 1;
50              
51             # boundaries for the match
52 12         46 my ($start, $end) = ($-[0], $+[0]);
53              
54             # extract matched substring (more efficient than $&)
55 12         36 my $term = substr($string, $start, $end-$start);
56              
57             # apply filtering and stopwords, if any
58 12 100       53 $term = Unicode::CaseFold::fc($term) if $lower;
59 12 50       80 $term = $filter->($term) if $filter;
60 12 50       283 $filter_in_place->($term) if $filter_in_place;
61 12 100 66     308 undef $term if $stopwords and $stopwords->{$term};
62              
63             # if $term was not cancelled by filters above, return it
64 12 100       60 if ($term) {
65 7 50       57 return wantarray ? ($term, length($term), $start, $end, $term_index)
66             : $term;
67             }
68             } # otherwise, loop again to extract next term
69              
70             # otherwise, no more term in input string, return undef or empty list
71 3         9 return;
72 3         25 };
73 2         23 };
74             }
75              
76             sub word {
77 0     0 1 0 __PACKAGE__->new(regex => qr/\w+/, @_);
78             }
79              
80             sub word_locale {
81 1     1   851 use locale;
  1         223  
  1         6  
82 0     0 1 0 __PACKAGE__->new(regex => qr/\w+/, @_);
83             }
84              
85             sub word_unicode {
86 1     1 1 1104 __PACKAGE__->new(regex => qr/\p{Word}+/, @_);
  1     0   12  
  1         14  
  0         0  
87             }
88              
89             sub unaccent {
90 1     1 1 2809 require Text::Transliterator::Unaccent;
91 1         98402 my %args = @_;
92 1   33     1031 my $want_lower = !exists $args{lower} || $args{lower};
93 1 50       7 my %unaccenter_args = $want_lower ? () : (upper => 0);
94 1         13 my $unaccenter = Text::Transliterator::Unaccent->new(%unaccenter_args);
95 1         40745 __PACKAGE__->new(regex => qr/\p{Word}+/,
96             filter_in_place => $unaccenter,
97             %args);
98             }
99              
100              
101             1; # End of Search::Tokenizer
102              
103              
104             __END__