File Coverage

blib/lib/Search/Tools/TokenListUtils.pm
Criterion Covered Total %
statement 55 76 72.3
branch 15 26 57.6
condition 9 14 64.2
subroutine 5 6 83.3
pod 4 4 100.0
total 88 126 69.8


line stmt bran cond sub pod time code
1             package Search::Tools::TokenListUtils;
2 32     32   13319 use Moo::Role;
  32         37  
  32         173  
3 32     32   6704 use Carp;
  32         41  
  32         16561  
4              
5             our $VERSION = '1.004';
6              
7             =head1 NAME
8              
9             Search::Tools::TokenListUtils - mixin methods for TokenList and TokenListPP
10              
11             =head1 SYNOPSIS
12              
13             my $tokens = $tokenizer->tokenize( $string );
14             if ( $tokens->str eq $string) {
15             print "string is same, before and after tokenize()\n";
16             }
17             else {
18             warn "I'm filing a bug report against Search::Tools right away!\n";
19             }
20            
21             my ($start_pos, $end_pos) = $tokens->get_window( 5, 20 );
22             # $start_pos probably == 0
23             # $end_pos probably == 25
24            
25             my $slice = $tokens->get_window_pos( 5, 20 );
26             for my $token (@$slice) {
27             print "token = $token\n";
28             }
29              
30             =head1 DESCRIPTION
31              
32             Search::Tools::TokenListUtils contains pure-Perl methods inhertited
33             by both Search::Tools::TokenList and Search::Tools::TokenListPP.
34              
35             =head1 METHODS
36              
37             =head2 str
38              
39             Returns a serialized version of the TokenList. If you haven't
40             altered the TokenList since you got it from tokenize(),
41             then str() returns a scalar string identical to (but not the same as)
42             the string you passed to tokenize().
43              
44             Both Search::Tools::TokenList and TokenListPP are overloaded
45             to stringify to the str() value.
46              
47             =cut
48              
49             sub str {
50 12     12 1 1190 my $self = shift;
51 12         22 my $joiner = shift(@_);
52 12 50       38 if ( !defined $joiner ) {
53 12         17 $joiner = '';
54             }
55 12         19 return join( $joiner, map {"$_"} @{ $self->as_array } );
  376         430  
  12         32  
56             }
57              
58             =head2 get_window( I [, I, I] )
59              
60             Returns array with two values: I and I positions
61             for the array of length I on either side of I.
62             Like taking a slice of the TokenList.
63              
64             Note that I is the number of B not B.
65             So if you're looking for the number of "words", think about
66             I*2.
67              
68             Note too that I is the number of B on B
69             side of I. So the entire window width (length of the returned
70             slice) is I*2 +/-1. The window is guaranteed to be bounded
71             by B.
72              
73             If I is true, the window is shifted to try and match
74             the first token prior to I that returns true for is_sentence_start().
75              
76             =cut
77              
78             sub get_window {
79 77     77 1 70 my $self = shift;
80 77         61 my $pos = shift;
81 77 50       125 if ( !defined $pos ) {
82 0         0 croak "pos required";
83             }
84              
85 77   50     123 my $size = int(shift) || 20;
86 77   50     184 my $as_sentence = shift || 0;
87 77         128 my $max_index = $self->len - 1;
88              
89 77 50 33     238 if ( $pos > $max_index or $pos < 0 ) {
90 0         0 croak "illegal pos value: no such index in TokenList";
91             }
92              
93             #warn "window size $size for pos $pos";
94              
95             # get the $size tokens on either side of $tok
96 77         58 my ( $start, $end );
97              
98             # is token too close to the top of the stack?
99 77 100       107 if ( $pos > $size ) {
100 62         53 $start = $pos - $size;
101             }
102              
103             # is token too close to the bottom of the stack?
104 77 100       107 if ( $pos < ( $max_index - $size ) ) {
105 76         70 $end = $pos + $size;
106             }
107 77   100     117 $start ||= 0;
108 77   66     103 $end ||= $max_index;
109              
110 77 50       97 if ($as_sentence) {
111 0         0 my $sentence_starts = $self->get_sentence_starts;
112              
113             # default to what we have.
114 0         0 my $start_for_pos = $start;
115 0         0 my $i = 0;
116              
117             #warn "looking for sentence_start for start = $start end = $end\n";
118 0         0 for (@$sentence_starts) {
119              
120             #warn " $_ [$i]\n";
121 0 0       0 if ( $_ >= $pos ) {
122 0         0 $start_for_pos = $sentence_starts->[$i];
123 0         0 last;
124             }
125 0         0 $i++;
126             }
127              
128             #warn "found $start_for_pos (start = $start end = $end)\n";
129 0 0       0 if ( $start_for_pos != $start ) {
130 0 0       0 if ( $start_for_pos < $start ) {
131 0         0 $end -= ( $start - $start_for_pos );
132             }
133             else {
134 0         0 $end += ( $start_for_pos - $start );
135             }
136 0         0 $start = $start_for_pos;
137             }
138              
139             #warn "now $start_for_pos (start = $start end = $end)\n";
140             }
141             else {
142              
143             # make sure window starts and ends with is_match
144 77         216 while ( !$self->get_token($start)->is_match ) {
145 31         65 $start++;
146             }
147 77         193 while ( !$self->get_token($end)->is_match ) {
148 42         87 $end--;
149             }
150             }
151              
152             #warn "return $start .. $end";
153             #warn "$size ~~ " . ( $end - $start );
154              
155 77         155 return ( $start, $end );
156             }
157              
158             =head2 get_window_tokens( I [, I] )
159              
160             Like get_window() but returns an array ref of a slice
161             of the TokenList containing Tokens.
162              
163             =cut
164              
165             sub get_window_tokens {
166 0     0 1 0 my $self = shift;
167 0         0 my ( $start, $end ) = $self->get_window(@_);
168 0         0 my @slice = ();
169 0         0 for ( $start .. $end ) {
170 0         0 push( @slice, $self->get_token($_) );
171             }
172 0         0 return \@slice;
173             }
174              
175             =head2 as_sentences([I])
176              
177             Returns a reference to an array of arrays,
178             where each child array is a "sentence" worth of Token objects.
179             You can stringify each sentence array like:
180              
181             my $sentences = $tokenlist->as_sentences;
182             for my $s (@$sentences) {
183             printf("sentence: %s\n", join("", map {"$_"} @$s));
184             }
185              
186             If you pass a single true value to as_sentences(),
187             then the array returned will consist of plain scalar strings
188             with whitespace normalized.
189              
190             =cut
191              
192             sub as_sentences {
193 2     2 1 375 my $self = shift;
194 2   100     11 my $stringed = shift || 0;
195 2         3 my @sents;
196             my @s;
197              
198             # use array method since we do not know the iterator position
199 2         3 for my $t ( @{ $self->as_array } ) {
  2         9  
200 144 100       255 if ( $t->is_sentence_start ) {
201              
202             # if has any, add anonymous copy to master
203 8 100       15 if (@s) {
204 6         17 push @sents, [@s];
205             }
206              
207             # reset
208 8         16 @s = ();
209             }
210              
211             # add
212 144         138 push @s, $t;
213             }
214 2 50       7 if (@s) {
215 2         7 push @sents, [@s];
216             }
217 2 100       8 if ($stringed) {
218 1         2 my @stringed;
219 1         3 for my $s (@sents) {
220 4         7 my $str = join( "", map {"$_"} @$s );
  72         136  
221 4         28 $str =~ s/\s\s+/\ /g;
222 4         24 $str =~ s/\s+$//;
223 4         11 push @stringed, $str;
224             }
225 1         11 return \@stringed;
226             }
227              
228 1         5 return \@sents;
229             }
230              
231             1;
232              
233             __END__