File Coverage

blib/lib/Plucene/Analysis/CharTokenizer.pm
Criterion Covered Total %
statement 32 33 96.9
branch 7 8 87.5
condition 2 3 66.6
subroutine 7 8 87.5
pod 3 3 100.0
total 51 55 92.7


line stmt bran cond sub pod time code
1             package Plucene::Analysis::CharTokenizer;
2              
3             =head1 NAME
4              
5             Plucene::Analysis::CharTokenizer - base class for character tokenisers
6              
7             =head1 SYNOPSIS
8              
9             # isa Plucene::Analysis::Tokenizer
10              
11             my $next = $chartokenizer->next;
12            
13             =head1 DESCRIPTION
14              
15             This is an abstract base class for simple, character-oriented tokenizers.
16              
17             =head1 METHODS
18              
19             =cut
20              
21 19     19   119 use strict;
  19         46  
  19         1130  
22 19     19   106 use warnings;
  19         42  
  19         463  
23              
24 19     19   104 use Carp;
  19         45  
  19         2081  
25              
26 19     19   10687 use Plucene::Analysis::Token;
  19         57  
  19         155  
27              
28 19     19   754 use base 'Plucene::Analysis::Tokenizer';
  19         44  
  19         11664  
29              
30             =head2 token_re
31              
32             Returns the regular expression that matches a single token. Subclasses must override it; the base implementation dies.
33              
34             =cut
35              
36             # And here we deviate from the script
37 0     0 1 0 sub token_re { die "You should define this" }
38              
39             # Class::Virtually::Abstract doesn't like being called twice.
40              
41             =head2 normalize
42              
43             This will normalise the matched text before it is used as the token's text. The base implementation returns it unchanged.
44              
45             =cut
46              
47 382     382 1 2220 sub normalize { return $_[1] }
48              
49             =head2 next
50              
51             my $next = $chartokenizer->next;
52              
53             This will return the next token read from the tokenizer's reader, or
54             undef once the reader has reached end of file.
55            
56             =cut
57              
58             sub next {
59 144566     144566 1 210839 my $self = shift;
60 144566         385815 my $re = $self->token_re();
61 144566         267537 my $fh = $self->{reader};
62             retry:
63 159707 100 66     628270 if (!defined $self->{buffer} or !length $self->{buffer}) {
64 16240 100       57176 return if eof($fh);
65 15664         196373 $self->{start} = tell($fh);
66 15664         129748 $self->{buffer} .= <$fh>;
67             }
68 159131 50       1639977 return unless length $self->{buffer};
69              
70 159131 100       1075456 if ($self->{buffer} =~ s/(.*?)($re)//) {
71 143990         305185 $self->{start} += length $1;
72 143990         421935 my $word = $self->normalize($2);
73 143990         580586 my $rv = Plucene::Analysis::Token->new(
74             text => $word,
75             start => $self->{start},
76             end => ($self->{start} + length($word)));
77 143990         1964224 $self->{start} += length($word);
78 143990         701270 return $rv;
79             }
80              
81             # No match, rest of buffer is useless.
82 15141         29849 $self->{buffer} = "";
83              
84             # But we should try for some more text
85 15141         29562 goto retry;
86             }
87              
88             1;