File Coverage

blib/lib/Text/Ngramize.pm
Criterion Covered Total %
statement 312 373 83.6
branch 54 100 54.0
condition 10 18 55.5
subroutine 39 44 88.6
pod 8 27 29.6
total 423 562 75.2


line stmt bran cond sub pod time code
1             package Text::Ngramize;
2              
3             require 5.008_000;
4 1     1   23954 use strict;
  1         2  
  1         34  
5 1     1   4 use warnings;
  1         2  
  1         24  
6 1     1   795 use integer;
  1         13  
  1         4  
7 1     1   22 use Carp;
  1         1  
  1         71  
8              
9 1     1   5 use constant INDEX_TOKEN => 0;
  1         1  
  1         93  
10 1     1   4 use constant INDEX_POSITION => 1;
  1         1  
  1         36  
11 1     1   4 use constant INDEX_LENGTH => 2;
  1         1  
  1         88  
12              
13             BEGIN {
14 1     1   6 use Exporter ();
  1         1  
  1         24  
15 1     1   5 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         338  
16 1     1   3 $VERSION = '1.03';
17 1         17 @ISA = qw(Exporter);
18 1         2 @EXPORT = qw();
19 1         2 @EXPORT_OK = qw();
20 1         2720 %EXPORT_TAGS = ();
21             }
22              
23             =head1 NAME
24              
25             C - Computes lists of n-grams from text.
26              
27             =head1 SYNOPSIS
28              
29             use Text::Ngramize;
30             use Data::Dump qw(dump);
31             my $ngramizer = Text::Ngramize->new (normalizeText => 1);
32             my $text = "This sentence has 7 words; doesn't it?";
33             dump $ngramizer-> (text => \$text);
34              
35             =head1 DESCRIPTION
36              
37             C is used to compute the list of n-grams derived from the
38             bytes, characters, or words of the text provided. Methods
39             are included that provide positional information about the n-grams
40             computed within the text.
41              
42             =head1 CONSTRUCTOR
43              
44             =head2 C
45              
46             use Text::Ngramize;
47             use Data::Dump qw(dump);
48             my $ngramizer = Text::Ngramize->new (normalizeText => 1);
49             my $text = ' To be.';
50             dump $ngramizer->getListOfNgrams (text => \$text);
51             # dumps:
52             # ["to ", "o b", " be", "be "]
53              
54             The constructor C has optional parameters that set how the
55             n-grams are computed. C is used to
56             set the type of tokens used in the n-grams, C is used to set
57             normalization of the text before the tokens are extracted, and C
58             is the character used to join n-grams of words.
59              
60             =over
61              
62             =item C
63              
64             typeOfNgrams => 'characters'
65              
66             C sets the type of tokens to extract from the text
67             to form the n-grams: C<'asc'>
68             indicates the list of ASC characters comprising the bytes of the text are to be used, C<'characters'> indicates the
69             list of characters are to be used, and
70             C<'words'> indicates the words in the text are to be used. Note a word is defined
71             as a substring that matches the Perl regular expression '\p{Alphabetic}+', see L
72             for details. The default is C<'characters'>.
73              
74             =item C
75              
76             sizeOfNgrams => 3
77              
78             C holds the size of the n-grams that are to be created from the tokens extracted. Note n-grams of
79             size one are the tokens themselves. C should be a positive integer; the default is three.
80              
81             =item C
82              
83             normalizeText => 0
84              
85             If C evalutes to true, the text is normalized before the tokens are extracted;
86             normalization proceeds by converting the text to lower case, replacing all non-alphabetic
87             characters with a space, compressing multiple spaces to a single space, and removing
88             any leading space but not a trailing space. The
89             default value of C is zero, or false.
90              
91             =item C
92              
93             ngramWordSeparator => ' '
94              
95             C is the character used to separate token-C when forming n-grams from them. It is
96             only used when C is set to C<'words'>; the default is a space. Note, this is used to
97             avoid having n-grams clash, for example, with bigrams the word pairs C<'a aaa'> and C<'aa aa'> would
98             produce the same n-gram C<'aaaa'> without a space separating them.
99              
100             =back
101              
102             =cut
103              
104             sub new
105             {
106 311     311 1 34638541 my ($Class, %Parameters) = @_;
107 311   33     4671 my $Self = bless ({}, ref ($Class) || $Class);
108              
109             # set the default type of tokens.
110 311         1287 my $typeOfNgrams = 'characters';
111              
112             # get the type of tokens to create from the text.
113 311 100       2648 $typeOfNgrams = lc $Parameters{typeOfNgrams} if exists $Parameters{typeOfNgrams};
114 311 50       3423 unless ($typeOfNgrams =~ /^(a|c|w)/)
115             {
116 0         0 croak "Token type '" . $Parameters{typeOfNgrams} . "' is invalid; should be 'asc', 'characters', or 'words'.\n";
117             }
118 311         2988 my %types = qw (a asc c characters w words);
119 311         1059 $typeOfNgrams = $types{substr ($typeOfNgrams, 0, 1)};
120 311         1274 $Self->{typeOfNgrams} = $typeOfNgrams;
121              
122             # set normalizeText.
123 311   33     1897 $Self->{normalizeText} = exists ($Parameters{normalizeText}) && $Parameters{normalizeText};
124              
125             # get the size of the ngrams.
126 311         1925 my $sizeOfNgrams = 3;
127 311 100       1647 $sizeOfNgrams = int abs $Parameters{sizeOfNgrams} if exists $Parameters{sizeOfNgrams};
128 311 50       1234 $sizeOfNgrams = 1 if ($sizeOfNgrams < 1);
129 311         789 $Self->{sizeOfNgrams} = $sizeOfNgrams;
130              
131             # get the delimiter for words
132 311         860 my $ngramWordSeparator = ' ';
133 311 50       973 $ngramWordSeparator = $Parameters{ngramWordSeparator} if exists $Parameters{ngramWordSeparator};
134 311         820 $Self->{ngramWordSeparator} = $ngramWordSeparator;
135              
136             # sets values used for returning hash values of n-grams.
137 311         1624 $Self->setBitsInInteger;
138 311         1612 $Self->setByteHashValues;
139              
140 311         2317 return $Self;
141             }
142              
143             =head1 METHODS
144              
145             =head2 C
146              
147             Returns the type of n-grams computed as a string, either C<'asc'>,
148             C<'characters'>, or C<'words'>.
149              
150             use Text::Ngramize;
151             use Data::Dump qw(dump);
152             my $ngramizer = Text::Ngramize->new ();
153             dump $ngramizer->getTypeOfNgrams;
154             # dumps:
155             # "characters"
156              
157             =cut
158              
159             sub getTypeOfNgrams
160             {
161 0     0 1 0 return $_[0]->{typeOfNgrams};
162             }
163              
164             =head2 C
165              
166             Returns the size of n-grams computed.
167              
168             use Text::Ngramize;
169             use Data::Dump qw(dump);
170             my $ngramizer = Text::Ngramize->new ();
171             dump $ngramizer->getSizeOfNgrams;
172             # dumps:
173             # 3
174              
175             =cut
176              
177             sub getSizeOfNgrams
178             {
179 0     0 1 0 return $_[0]->{sizeOfNgrams};
180             }
181              
182             =head2 C
183              
184             The function C returns an array reference to the list of n-grams computed
185             from the text provided or the list of tokens provided by C.
186              
187             =over
188              
189             =item C
190              
191             text => ...
192              
193             C holds the text that the tokens are to be extracted from. It can be a single string,
194             a reference to a string, a reference to an array of strings, or any combination of these.
195              
196             =item C
197              
198             listOfTokens => ...
199              
200             Optionally, if C is not defined, then the list of tokens to use in forming the
201             n-grams can be provided by C, which should point to an array reference of strings.
202              
203             =back
204              
205             An example using the method:
206              
207             use Text::Ngramize;
208             use Data::Dump qw(dump);
209             my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
210             my $text = "This isn't a sentence.";
211             dump $ngramizer->getListOfNgrams (text => \$text);
212             # dumps:
213             # ["this isn t", "isn t a", "t a sentence"]
214             dump $ngramizer->getListOfNgrams (listOfTokens => [qw(aa bb cc dd)]);
215             # dumps:
216             # ["aa bb cc", "bb cc dd"]
217              
218             =cut
219              
220             sub getListOfNgrams
221             {
222 130     130 1 4024 my ($Self, %Parameters) = @_;
223              
224             # get the size of the ngrams to compute.
225 130         717 my $sizeOfNgrams = $Self->{sizeOfNgrams};
226              
227             # get the list of tokens from the user to the text provided.
228 130         379 my $listOfTokens;
229 130 50       806 if (exists ($Parameters{text}))
    0          
230             {
231             # compute the list of tokens from the text provided.
232 130         766 $listOfTokens = $Self->getListOfTokens($Parameters{text});
233             }
234             elsif (exists ($Parameters{listOfTokens}))
235             {
236             # use the list of tokens provided by the user.
237 0         0 $listOfTokens = $Parameters{listOfTokens};
238             }
239             else
240             {
241             # gotta have some tokens to make n-grams.
242 0         0 croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
243             }
244              
245             # if the list of tokens is empty or the size of the n-grams is one,
246             # just return the list of tokens as the list of n-grams.
247 130 100 66     2306 return $listOfTokens if (($#$listOfTokens == -1) || ($sizeOfNgrams == 1));
248              
249             # get the string to use to merge the tokens.
250 100         340 my $separator = '';
251 100 50       773 $separator = $Self->{ngramWordSeparator} if ($Self->{typeOfNgrams} =~ /^w/);
252              
253             # compute the list of n-grams.
254 100         267 my @listOfNgrams;
255 100         287 my $indexOfLastTokenInNgram = $sizeOfNgrams - 1;
256 100         284 my $indexOfLastNgram = scalar (@$listOfTokens) - $sizeOfNgrams + 1;
257 100         560 for (my $i = 0; $i < $indexOfLastNgram; $i++, $indexOfLastTokenInNgram++)
258             {
259 887610         5393051 push @listOfNgrams, join($separator, @$listOfTokens[$i..$indexOfLastTokenInNgram]);
260             }
261              
262             # note, if the number of tokens in the list is less than sizeOfNgrams, then
263             # no n-grams are returned, that is, @listOfNgrams is empty.
264 100         244008 return \@listOfNgrams;
265             }
266              
267             =head2 C
268              
269             The function C returns an array reference to the list of n-grams computed
270             from the text provided or the list of tokens provided by C. Each item in the list returned
271             is of the form C<['n-gram', starting-index, n-gram-length]>; the starting index and n-gram length are
272             relative to the unnormalized text. When C is C<'asc'> the index and length refer to bytes,
273             when C is C<'characters'> or C<'words'> they refer to characters.
274              
275             =over
276              
277             =item C
278              
279             text => ...
280              
281             C holds the text that the tokens are to be extracted from. It can be a single string,
282             a reference to a string, a reference to an array of strings, or any combination of these.
283              
284             =item C
285              
286             listOfTokens => ...
287              
288             Optionally, if C is not defined, then the list of tokens to use in forming the
289             n-grams can be provided by C, which should point to an array reference where
290             each item in the array is of the form C<[token, starting-position, length]>, where
291             C and C are integers indicating the position of the token.
292              
293             =back
294              
295             An example using the method:
296              
297             use Text::Ngramize;
298             use Data::Dump qw(dump);
299             my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
300             my $text = " This isn't a sentence.";
301             dump $ngramizer->getListOfNgramsWithPositions (text => \$text);
302             # dumps:
303             # [
304             # ["this isn t", 1, 11],
305             # ["isn t a", 7, 7],
306             # ["t a sentence", 11, 13],
307             # ]
308              
309             =cut
310              
311             sub getListOfNgramsWithPositions
312             {
313 130     130 1 1520 my ($Self, %Parameters) = @_;
314              
315             # get the size of the ngrams to compute.
316 130         372 my $sizeOfNgrams = $Self->{sizeOfNgrams};
317              
318             # get the list of tokens from the user to the text provided.
319 130         256 my $listOfTokens;
320 130 50       834 if (exists ($Parameters{text}))
    0          
321             {
322             # compute the list of tokens from the text provided.
323 130         849 $listOfTokens = $Self->getListOfTokensWithPositions (%Parameters);
324             }
325             elsif (exists ($Parameters{listOfTokens}))
326             {
327             # use the list of tokens provided by the user.
328 0         0 $listOfTokens = $Parameters{listOfTokens};
329             }
330             else
331             {
332             # gotta have some tokens to make n-grams.
333 0         0 croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
334             }
335              
336             # if the list of tokens is empty or the size of the n-grams is one,
337             # just return the list of tokens as the list of n-grams.
338 130 100 66     2098 return $listOfTokens if (($#$listOfTokens == -1) || ($sizeOfNgrams == 1));
339              
340             # get the string to use to merge the tokens.
341 100         337 my $separator = '';
342 100 50       2082 $separator = $Self->{ngramWordSeparator} if ($Self->{typeOfNgrams} =~ /^w/);
343              
344             # compute the list of n-grams and their positions.
345 100         225 my @listOfNgramsWithPositions;
346 100         824 my @indices= (0..($sizeOfNgrams - 1));
347 100         297 my $indexOfLastTokenInNgram = $sizeOfNgrams - 1;
348 100         286 my $indexOfLastNgram = scalar (@$listOfTokens) - $sizeOfNgrams + 1;
349 100         752 for (my $i = 0; $i < $indexOfLastNgram; $i++, $indexOfLastTokenInNgram++)
350             {
351 887610         1649851 push @listOfNgramsWithPositions, [join ($separator, map {$listOfTokens->[$_+$i][0]} @indices),
  10994460         29936597  
352              
353             # index to start of first n-gram
354             $listOfTokens->[$i][1],
355              
356             # length of the n-gram.
357             $listOfTokens->[$i + $sizeOfNgrams - 1][1] + $listOfTokens->[$i + $sizeOfNgrams - 1][2] - $listOfTokens->[$i][1]];
358             }
359              
360             # note, if the number of tokens in the list is less than sizeOfNgrams, then
361             # no n-grams are returned, that is, @listOfNgrams is empty.
362 100         778830 return \@listOfNgramsWithPositions;
363             }
364              
365             =head2 C
366              
367             The function C returns an array reference to the list of integer hash values
368             computed from the n-grams
369             of the text provided or the list of tokens provided by C. The advantage of using
370             hashes over strings is that they take less memory and are theoretically faster to compute. With strings
371             the time to compute the n-grams is proportional to their size, with hashes it is not
372             since they are computed recursively. Also, the amount of memory used to store the n-gram strings
373             grows proportional to their size, with hashes it does not. The disadvantage lies with
374             hashing collisions, but these will be very rare. However, for small n-gram sizes hash values
375             may take more time to compute since all code is written in Perl.
376              
377             =over
378              
379             =item C
380              
381             text => ...
382              
383             C holds the text that the tokens are to be extracted from. It can be a single string,
384             a reference to a string, a reference to an array of strings, or any combination of these.
385              
386             =item C
387              
388             listOfTokens => ...
389              
390             Optionally, if C is not defined, then the list of tokens to use in forming the
391             n-grams can be provided by C, which should point to an array reference of strings.
392              
393             =back
394              
395             An example using the method:
396              
397             use Text::Ngramize;
398             use Data::Dump qw(dump);
399             my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
400             my $text = "This isn't a sentence.";
401             dump $ngramizer->getListOfNgramHashValues (text => \$text);
402             # NOTE: hash values may vary across computers.
403             # dumps:
404             # [
405             # "4038955636454686726",
406             # "5576083060948369410",
407             # "6093054335710494749",
408             # ] dump $ngramizer->getListOfNgramHashValues (listOfTokens => [qw(aa bb cc dd)]);
409             # dumps:
410             # ["7326140501871656967", "5557417594488258562"]
411              
412             =cut
413              
414             sub getListOfNgramHashValues
415             {
416 180     180 1 12202 my ($Self, %Parameters) = @_;
417              
418             # get the size of the ngrams to compute.
419 180         722 my $sizeOfNgrams = $Self->{sizeOfNgrams};
420              
421             # get the list of tokens from the user to the text provided.
422 180         438 my $listOfTokens;
423 180 50       1038 if (exists ($Parameters{text}))
    0          
424             {
425             # compute the list of tokens from the text provided.
426 180         1292 $listOfTokens = $Self->getListOfTokens($Parameters{text});
427             }
428             elsif (exists ($Parameters{listOfTokens}))
429             {
430             # use the list of tokens provided by the user.
431 0         0 $listOfTokens = $Parameters{listOfTokens};
432             }
433             else
434             {
435             # gotta have some tokens to make n-grams.
436 0         0 croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
437             }
438              
439             # get the list of hashes for the tokens.
440 180         1483 my $listOfHashesOfTokens = $Self->getHashValuesOfListOfStrings (listOfStrings => $listOfTokens);
441 180         827 my $totalTokens = $#$listOfHashesOfTokens + 1;
442              
443             # if the list of tokens is empty or the size of the n-grams is one,
444             # just return the list of hash values of the tokens.
445 180 100 66     2051 if (($totalTokens == 0) || ($sizeOfNgrams == 1))
446             {
447 30         39144 return $listOfHashesOfTokens;
448             }
449              
450             # holds the hash values computed.
451 150         391 my @listOfHashValues;
452              
453             # compute the shifts need to add a hash code to the running hash value.
454 150         692 my $bitInInteger = $Self->{bitsInInteger};
455 150         358 my $addShiftRight = $bitInInteger - 1;
456              
457             # compute the value of the first hash.
458 150         334 my $runningHashValue = 0;
459 150         599 for (my $i = 0; $i < $sizeOfNgrams; $i++)
460             {
461 1860         9593 $runningHashValue = ($runningHashValue << 1) ^ $Self->rshift($runningHashValue,$addShiftRight) ^ $listOfHashesOfTokens->[$i];
462             }
463 150         489 push @listOfHashValues, $runningHashValue;
464              
465             # compute the shifts needed to remove the oldest hash code in the running hash value.
466 150         2175 my $indexToRemove = 0;
467 150         372 my $removeShiftLeft = ($sizeOfNgrams - 1) % $bitInInteger;
468 150         326 my $removeShiftRight = $bitInInteger - $removeShiftLeft;
469 150         466 my $removeShiftMask = ~(-1 << $removeShiftLeft);
470 150         369 my $addShiftMask = ~(-1 << 1);
471              
472 150         704 for (my $indexToAdd = $sizeOfNgrams; $indexToAdd < $totalTokens; $indexToRemove++, $indexToAdd++)
473             {
474 984500         1453859 $runningHashValue ^= ($listOfHashesOfTokens->[$indexToRemove] << $removeShiftLeft) ^ (($listOfHashesOfTokens->[$indexToRemove] >> $removeShiftRight) & $removeShiftMask);
475 984500         1519265 $runningHashValue = ($runningHashValue << 1) ^ (($runningHashValue >> $addShiftRight) & $addShiftMask) ^ $listOfHashesOfTokens->[$indexToAdd];
476 984500         2365417 push @listOfHashValues, $runningHashValue;
477             }
478              
479             # note, if the number of tokens in the list is less than sizeOfNgrams, then
480             # no n-grams are returned, that is, @listOfHashValues is empty.
481 150         239619 return \@listOfHashValues;
482             }
483              
484             =head2 C
485              
486             The function C returns an array reference to the list of integer hash values and
487             n-gram positional information computed
488             from the text provided or the list of tokens provided by C. Each item in the list returned
489             is of the form C<['n-gram-hash', starting-index, n-gram-length]>; the starting index and n-gram length are
490             relative to the unnormalized text. When C is C<'asc'> the index and length refer to bytes,
491             when C is C<'characters'> or C<'words'> they refer to characters.
492              
493             The advantage of using
494             hashes over strings is that they take less memory and are theoretically faster to compute. With strings
495             the time to compute the n-grams is proportional to their size, with hashes it is not
496             since they are computed recursively. Also, the amount of memory used to store the n-gram strings
497             grows proportional to their size, with hashes it does not. The disadvantage lies with
498             hashing collisions, but these will be very rare. However, for small n-gram sizes hash values
499             may take more time to compute since all code is written in Perl.
500              
501             =over
502              
503             =item C
504              
505             text => ...
506              
507             C holds the text that the tokens are to be extracted from. It can be a single string,
508             a reference to a string, a reference to an array of strings, or any combination of these.
509              
510             =item C
511              
512             listOfTokens => ...
513              
514             Optionally, if C is not defined, then the list of tokens to use in forming the
515             n-grams can be provided by C, which should point to an array reference where
516             each item in the array is of the form C<[token, starting-position, length]>, where
517             C and C are integers indicating the position of the token.
518              
519             =back
520              
521             An example using the method:
522              
523             use Text::Ngramize;
524             use Data::Dump qw(dump);
525             my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
526             my $text = " This isn't a sentence.";
527             dump $ngramizer->getListOfNgramHashValuesWithPositions (text => \$text);
528             # NOTE: hash values may vary across computers.
529             # dumps:
530             # [
531             # ["4038955636454686726", 1, 11],
532             # ["5576083060948369410", 7, 7],
533             # ["6093054335710494749", 11, 13],
534             # ]
535              
536             =cut
537              
538             sub getListOfNgramHashValuesWithPositions
539             {
540 180     180 1 2869 my ($Self, %Parameters) = @_;
541              
542             # get the size of the ngrams to compute.
543 180         473 my $sizeOfNgrams = $Self->{sizeOfNgrams};
544              
545             # get the list of tokens from the user to the text provided.
546 180         540 my $listOfTokens;
547 180 50       774 if (exists ($Parameters{text}))
    0          
548             {
549             # compute the list of tokens from the text provided.
550 180         1284 $listOfTokens = $Self->getListOfTokensWithPositions(%Parameters);
551             }
552             elsif (exists ($Parameters{listOfTokens}))
553             {
554             # use the list of tokens provided by the user.
555 0         0 $listOfTokens = $Parameters{listOfTokens};
556             }
557             else
558             {
559             # gotta have some tokens to make n-grams.
560 0         0 croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
561             }
562              
563             # get the list of hashes for the tokens.
564 180         6751 my $listOfHashesOfTokens = $Self->getHashValuesOfListOfStrings (listOfStrings => [map {$_->[0]} @$listOfTokens]);
  1172300         2935227  
565 180         434947 my $totalTokens = $#$listOfHashesOfTokens + 1;
566              
567             # if the list of tokens is empty or the size of the n-grams is one,
568             # just return the list of hash values of the tokens.
569 180 100 66     2760 if (($totalTokens == 0) || ($sizeOfNgrams == 1))
570             {
571 30         130 for (my $i = 0; $i < $totalTokens; $i++)
572             {
573 185940         845053 $listOfHashesOfTokens->[$i] = [$listOfHashesOfTokens->[$i], $listOfTokens->[$i][1], $listOfTokens->[$i][2]];
574             }
575 30         79526 return $listOfHashesOfTokens;
576             }
577              
578             # holds the hash values computed.
579 150         454 my @listOfHashValues;
580              
581             # compute the shifts need to add a hash code to the running hash value.
582 150         889 my $bitInInteger = $Self->{bitsInInteger};
583 150         653 my $addShiftRight = $bitInInteger - 1;
584              
585             # compute the value of the first hash.
586 150         316 my $runningHashValue = 0;
587 150         834 for (my $i = 0; $i < $sizeOfNgrams; $i++)
588             {
589 1860         6695 $runningHashValue = ($runningHashValue << 1) ^ $Self->rshift($runningHashValue,$addShiftRight) ^ $listOfHashesOfTokens->[$i];
590             }
591 150         1588 push @listOfHashValues, [$runningHashValue, $listOfTokens->[0][1], $listOfTokens->[$sizeOfNgrams - 1][1] + $listOfTokens->[$sizeOfNgrams - 1][2] - $listOfTokens->[0][1]];
592              
593             # compute the shifts need to remove the oldest hash code in the running hash value.
594 150         297 my $indexToRemove = 0;
595 150         386 my $removeShiftLeft = ($sizeOfNgrams - 1) % $bitInInteger;
596 150         291 my $removeShiftRight = $bitInInteger - $removeShiftLeft;
597 150         351 my $removeShiftMask = ~(-1 << $removeShiftLeft);
598 150         241 my $addShiftMask = ~(-1 << 1);
599              
600 150         585 for (my $indexToAdd = $sizeOfNgrams; $indexToAdd < $totalTokens; $indexToRemove++, $indexToAdd++)
601             {
602 984500         1673640 $runningHashValue ^= ($listOfHashesOfTokens->[$indexToRemove] << $removeShiftLeft) ^ (($listOfHashesOfTokens->[$indexToRemove] >> $removeShiftRight) & $removeShiftMask);
603 984500         1602857 $runningHashValue = ($runningHashValue << 1) ^ (($runningHashValue >> $addShiftRight) & $addShiftMask) ^ $listOfHashesOfTokens->[$indexToAdd];
604 984500         5365035 push @listOfHashValues, [$runningHashValue, $listOfTokens->[$indexToRemove + 1][1], $listOfTokens->[$indexToRemove + $sizeOfNgrams][1] + $listOfTokens->[$indexToRemove + $sizeOfNgrams][2] - $listOfTokens->[$indexToRemove + 1][1]];
605             }
606              
607             # note, if the number of tokens in the list is less than sizeOfNgrams, then
608             # no n-grams are returned, that is, @listOfHashValues is empty.
609 150         579451 return \@listOfHashValues;
610             }
611              
612             sub getListOfTokens
613             {
614 310     310 0 1461 my ($Self, @Text) = @_;
615              
616             # if no text, return the empty list.
617 310 50       1494 return [] unless @Text;
618 310         933 my $text = \@Text;
619              
620             # get the tokens.
621 310 100       5943 if ($Self->{typeOfNgrams} =~ /^a/)
    100          
    50          
622             {
623 120         601 return $Self->getListOfAsc ($text);
624             }
625             elsif ($Self->{typeOfNgrams} =~ /^c/)
626             {
627 120         764 return $Self->getListOfCharacters ($text);
628             }
629             elsif ($Self->{typeOfNgrams} =~ /^w/)
630             {
631 70         345 return $Self->getListOfWords ($text);
632             }
633             else
634             {
635 0         0 croak "programming error: parameter typeOfNgrams has value '" . $Self->{typeOfNgrams}. "' and should not.\n";
636             }
637             }
638              
639             # returns an array reference to the list of tokens found text provided
640             # including each tokens' position and length in the original text. Each
641             # entry in the array is of the form ['token', starting-position, length]
642             sub getListOfTokensWithPositions # (text => '..')
643             {
644 310     310 0 1126 my ($Self, %Parameters) = @_;
645              
646             # if no text, return the empty list.
647 310 50       1591 return [] unless exists $Parameters{text};
648 310         772 my $text = $Parameters{text};
649              
650             # get the tokens.
651 310 100       3375 if ($Self->{typeOfNgrams} =~ /^a/)
    100          
    50          
652             {
653 120         609 return $Self->getListOfAscWithPositions (\$text);
654             }
655             elsif ($Self->{typeOfNgrams} =~ /^c/)
656             {
657 120         635 return $Self->getListOfCharactersWithPositions (\$text);
658             }
659             elsif ($Self->{typeOfNgrams} =~ /^w/)
660             {
661 70         385 return $Self->getListOfWordsWithCharacterPositions (\$text);
662             }
663             else
664             {
665 0         0 croak "programming error: parameter typeOfNgrams has value '" . $Self->{typeOfNgrams}. "' and should not.\n";
666             }
667             }
668              
669             # uses unpack to return the list of asc characters of the text.
670             sub getListOfAsc # ($text)
671             {
672 1     1   2021 use bytes;
  1         16  
  1         7  
673 120     120 0 295 my $Self = shift;
674              
675             # get the text as a list of string references.
676 120         725 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
677              
678 120         223 my @listOfTokens;
679 120         362 foreach my $stringRef (@$listOfText)
680             {
681 120 50       543 if ($Self->{normalizeText})
682             {
683 0         0 push @listOfTokens, map {chr} unpack ('C*', $Self->normalizeText ($stringRef));
  0         0  
684             }
685             else
686             {
687 120         95892 push @listOfTokens, map {chr} unpack ('C*', $$stringRef);
  1568610         3282478  
688             }
689             }
690 120         1957 return \@listOfTokens;
691             }
692              
693             # uses an empty split to get all the characters in the text.
694             sub getListOfCharacters # ($text)
695             {
696 120     120 0 292 my $Self = shift;
697 120         709 return $Self->getListOfTokensUsingRegexp ('', @_);
698             }
699              
700             # returns a list of all substrings of letters.
701             sub getListOfWords # ($text)
702             {
703 70     70 0 165 my $Self = shift;
704 70         316 return $Self->getListOfTokensUsingRegexp ('[^\p{IsAlphabetic}]+', @_);
705             }
706              
707             # regexp must be a regexp without bounding slashes.
708             sub getListOfTokensUsingRegexp # ($regexp, $text)
709             {
710 190     190 0 399 my $Self = shift;
711 190         381 my $regexp = shift;
712              
713             # get the text as a list of string references.
714 190         1047 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
715              
716 190         465 my @listOfTokens;
717 190         569 foreach my $stringRef (@$listOfText)
718             {
719 190 50       893 if ($Self->{normalizeText})
720             {
721 0         0 push @listOfTokens, split (/$regexp/, $Self->normalizeText ($stringRef));
722             }
723             else
724             {
725 190         629992 push @listOfTokens, split (/$regexp/, $$stringRef);
726             }
727             }
728 190         5714 return \@listOfTokens;
729             }
730              
731             # returns a copy of the text that is lower case, has all none letters replaced
732             # with spaces, and has all multiple spaces replaced with one space.
733             sub normalizeText # ($text)
734             {
735 0     0 1 0 my $Self = shift;
736 0         0 my $Text = shift;
737              
738             # make a copy of the text, and downcase it.
739 0         0 my $type = ref ($Text);
740 0 0       0 if ($type eq '')
    0          
741             {
742 0         0 $Text = lc $Text;
743             }
744             elsif ($type eq 'SCALAR')
745             {
746 0         0 $Text = lc $$Text;
747             }
748              
749             # convert all none letters to spaces; tried to use tr/// but \p{IsAlphabetic} gave a
750             # warning with it.
751 1     1   1719 $Text =~ s/\P{IsAlphabetic}/ /g;
  1         10  
  1         12  
  0         0  
752              
753             # compress mulitple spaces to one space.
754 0         0 $Text =~ tr/ / /s;
755              
756             # remove leading spaces.
757 0         0 $Text =~ s/^ +//;
758              
759 0         0 return $Text;
760             }
761              
762             # if the tokens are to be ASC, positions without normalization with be just the
763             # byte number. to get the byte number with filtering, first we need to convert
764             # original text into characters with positions that include their byte index. then
765             # filter the text. so really we convert to characters first and generate either
766             # character positions or byte positions. from there we can get byte positions or
767             # characters positions or word positions.
768              
769             # give a string reference returns an array reference of the form:
770             # [
771             # [character, position = $OffSet, 1],
772             # [character, position = $OffSet + 1, 1],
773             # ...
774             # [character, position = $Offset + length ($$StringRef) - 1, 1]
775             # ]
776             sub getCharactersWithCharacterPositions # ($StringRef, $OffSet)
777             {
778 190     190 0 573 my $Self = shift;
779 190         590 my $StringRef = shift;
780              
781             # default value of the offset is zero.
782 190         377 my $Offset = shift;
783 190 50       866 $Offset = 0 unless defined $Offset;
784              
785             # build the list of characters with their position.
786 190         375916 my @listOfCharacters = map { [$_, $Offset++, 1] } split //, $$StringRef;
  1000060         3824225  
787 190         365886 return \@listOfCharacters;
788             }
789              
790             # give a string reference returns an array reference of the form:
791             # [
792             # [character, byte-position, bytes::length (character)],
793             # ...
794             # [character, byte-position, bytes::length (character)]
795             # ]
796             sub getCharactersWithBytePositions # ($StringRef, $OffSet)
797             {
798 120     120 0 193 my $Self = shift;
799 120         202 my $StringRef = shift;
800              
801             # default value of the offset is zero.
802 120         183 my $Offset = shift;
803 120 50       355 $Offset = 0 unless defined $Offset;
804              
805             # build the list of characters with their byte position.
806 120         303900 my @listOfCharacters = split //, $$StringRef;
807             {
808 1     1   29867 use bytes;
  1         13  
  1         11  
  120         35546  
809 120         442 foreach my $char (@listOfCharacters)
810             {
811 623930         1411002 my $len = bytes::length ($char);
812 623930         2695834 $char = [$char, $Offset, $len];
813 623930         1033821 $Offset += $len;
814             }
815 1     1   56 no bytes;
  1         2  
  1         4  
816             }
817 120         767 return \@listOfCharacters;
818             }
819              
820             # given the list of characters returned from getCharactersWithCharacterPositions or
821             # from getCharactersWithBytePositions returns a list of the normalized characters.
822             sub normalizeCharacterList
823             {
824 0     0 0 0 my $Self = shift;
825              
826             # get the list of characters with position info.
827 0         0 my $ListOfCharacters = shift;
828              
829 0         0 my @filteredList;
830 0         0 my $previousCharIsSpace = 1;
831              
832 0         0 for (my $i = 0; $i < @$ListOfCharacters; $i++)
833             {
834             # get the pair [character, position].
835 0         0 my $charPos = $ListOfCharacters->[$i];
836              
837             # lowercase the character or convert it to a space.
838 0         0 my $newChar;
839 0 0       0 if ($charPos->[0] =~ m/^\p{IsAlphabetic}$/o)
840             {
841 0         0 $newChar = lc $charPos->[0];
842             }
843             else
844             {
845 0         0 $newChar = ' ';
846             }
847              
848             # add the new character to the list.
849 0 0       0 if ($newChar ne ' ')
    0          
850             {
851             # if the new character is not a space, add it.
852 0         0 $previousCharIsSpace = 0;
853 0         0 push @filteredList, [$newChar, $charPos->[1], $charPos->[2]];
854             }
855             elsif (!$previousCharIsSpace)
856             {
857             # if the new character is a space but the previous was not, add it.
858 0         0 $previousCharIsSpace = 1;
859 0         0 push @filteredList, [$newChar, $charPos->[1], $charPos->[2]];
860             }
861             }
862              
863             # return the new list of characters and their positions.
864 0         0 return \@filteredList;
865             }
866              
867             # given a list of text (strings), returns an array reference to the list
868             # of ASC characters comprizing all the text, with their bytes positions and
869             # length (always 1) in the original text.
870             sub getListOfAscWithPositions # ($Text)
871             {
872 120     120 0 265 my $Self = shift;
873              
874             # get the text as a list of string references.
875 120         599 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
876              
877 120         215 my @listOfTokensWithPosition;
878 120         246 my $offset = 0;
879 120         551 for (my $i = 0; $i < @$listOfText; $i++)
880             {
881 120         283 my $stringRef = $listOfText->[$i];
882              
883             # convert the string to a list of characters.
884 120         477 my $listOfCharacters = $Self->getCharactersWithBytePositions ($stringRef, $offset);
885              
886             # normalize of the list of characters.
887 120 50       937 if ($Self->{normalizeText})
888             {
889 0         0 $listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
890             }
891              
892             # expand each character into its list of bytes.
893             {
894 1     1   365 use bytes;
  1         2  
  1         3  
  120         459  
895              
896 120         611 for (my $j = 0; $j < @$listOfCharacters; $j++)
897             {
898             # get the characters, its position, and length.
899 623930         1136043 my $charPosLen = $listOfCharacters->[$j];
900              
901             # split the characters into ASC bytes.
902 623930         1510248 my @listOfAsc = map {chr} unpack ('C*', $charPosLen->[0]);
  1568610         3352913  
903              
904 623930         1166973 my $byteOffset = $charPosLen->[1];
905 623930         936597 foreach my $asc (@listOfAsc)
906             {
907 1568610         5803960 push @listOfTokensWithPosition, [$asc, $byteOffset++, 1];
908             }
909             }
910              
911 120         1422 $offset += bytes::length ($$stringRef);
912 1     1   85 no bytes;
  1         2  
  1         3  
913             }
914             }
915              
916 120         398380 return \@listOfTokensWithPosition;
917             }
918              
919             # given a list of text (strings), returns an array reference to the list
920             # of characters comprizing all the text, with their character positions and
921             # length in the original text.
922             sub getListOfCharactersWithPositions # ($Text)
923             {
924 120     120 0 248 my $Self = shift;
925              
926             # get the text as a list of string references.
927 120         666 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
928              
929 120         199 my @listOfTokensWithPosition;
930 120         1320 my $offset = 0;
931 120         604 for (my $i = 0; $i < @$listOfText; $i++)
932             {
933 120         307 my $stringRef = $listOfText->[$i];
934              
935             # convert the string to a list of characters.
936 120         614 my $listOfCharacters = $Self->getCharactersWithCharacterPositions ($stringRef, $offset);
937              
938             # normalize of the list of characters.
939 120 50       1199 if ($Self->{normalizeText})
940             {
941 0         0 $listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
942             }
943              
944             # append the list of characters.
945 120         172945 push @listOfTokensWithPosition, @$listOfCharacters;
946              
947             # accumulate the character offsets.
948 120         75449 $offset += length ($$stringRef);
949             }
950              
951 120         1478 return \@listOfTokensWithPosition;
952             }
953              
954             # given a list of text (strings), returns an array reference to the list
955             # of words comprizing all the text, with their character positions and
956             # length in the original text. [word, start-position, length]
957             sub getListOfWordsWithCharacterPositions # ($Text)
958             {
959 70     70 0 176 my $Self = shift;
960              
961             # get the text as a list of string references.
962 70         509 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
963              
964 70         144 my @listOfTokensWithPosition;
965 70         168 my $offset = 0;
966 70         329 for (my $i = 0; $i < @$listOfText; $i++)
967             {
968 70         278 my $stringRef = $listOfText->[$i];
969              
970             # convert the string to a list of characters.
971 70         363 my $listOfCharacters = $Self->getCharactersWithCharacterPositions ($stringRef, $offset);
972              
973             # normalize of the list of characters.
974 70 50       745 if ($Self->{normalizeText})
975             {
976 0         0 $listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
977             }
978              
979             # get the list of words.
980 70         576 my $listOfWords = $Self->getListOfWordsFromCharacterList ($listOfCharacters);
981              
982             # append the list of characters.
983 70         22492 push @listOfTokensWithPosition, @$listOfWords;
984              
985             # accumulate the character offsets.
986 70         293885 $offset += length ($$stringRef);
987             }
988              
989 70         1065 return \@listOfTokensWithPosition;
990             }
991              
992             # given a list of text (strings), returns an array reference to the list
993             # of words comprizing all the text, with their bytes positions and
994             # length in the original text. [word, start-position, length]
995             sub getListOfWordsWithBytePositions # ($Text)
996             {
997 0     0 0 0 my $Self = shift;
998              
999             # get the text as a list of string references.
1000 0         0 my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
1001              
1002 0         0 my @listOfTokensWithPosition;
1003 0         0 my $offset = 0;
1004 0         0 for (my $i = 0; $i < @$listOfText; $i++)
1005             {
1006 0         0 my $stringRef = $listOfText->[$i];
1007              
1008             # convert the string to a list of characters.
1009 0         0 my $listOfCharacters = $Self->getCharactersWithBytePositions ($stringRef, $offset);
1010              
1011             # normalize of the list of characters.
1012 0 0       0 if ($Self->{normalizeText})
1013             {
1014 0         0 $listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
1015             }
1016              
1017             # get the list of words.
1018 0         0 my $listOfWords = $Self->getListOfWordsFromCharacterList ($listOfCharacters);
1019              
1020             # append the list of characters.
1021 0         0 push @listOfTokensWithPosition, @$listOfWords;
1022              
1023             # accumulate the byte offsets.
1024 0         0 $offset += length ($$stringRef);
1025             }
1026              
1027 0         0 return \@listOfTokensWithPosition;
1028             }
1029              
1030             sub getListOfWordsFromCharacterList # ($ListOfCharactersWithPositions)
1031             {
1032 70     70 0 195 my $Self = shift;
1033              
1034             # get the list of characters with position info.
1035 70         172 my $ListOfCharacters = shift;
1036              
1037 70         152 my @listOfWords;
1038             my @currentWord;
1039 70         442 for (my $i = 0; $i < @$ListOfCharacters; $i++)
1040             {
1041             # get the pair [character, position, length].
1042 376130         694395 my $charPosLen = $ListOfCharacters->[$i];
1043              
1044 376130 100       1485875 if ($charPosLen->[0] =~ m/^\p{IsAlphabetic}$/)
    100          
1045             {
1046             # got a letter so accumulate the letters of the word.
1047 307250         881926 push @currentWord, $charPosLen;
1048             }
1049             elsif (@currentWord)
1050             {
1051             # none letter, so concat all the characters of the word.
1052 53700         105626 my $word = join ('', map {$_->[0]} @currentWord);
  306620         654781  
1053              
1054             # get the position of the first character in the word.
1055 53700         117709 my $position = $currentWord[0]->[1];
1056              
1057             # compute total characters in the word.
1058 53700         125627 my $length = $currentWord[-1]->[1] - $currentWord[0]->[1] + $currentWord[-1]->[2];
1059              
1060             # store the word info.
1061 53700         162666 push @listOfWords, [$word, $position, $length];
1062              
1063             # clear the cache or word characters.
1064 53700         206747 $#currentWord = -1;
1065             }
1066             }
1067              
1068             # store the last word if there is one.
1069 70 100       279 if (@currentWord)
1070             {
1071             # none letter, so concat all the characters of the word.
1072 60         197 my $word = join ('', map {$_->[0]} @currentWord);
  630         1103  
1073              
1074             # get the position of the first character in the word.
1075 60         198 my $position = $currentWord[0]->[1];
1076              
1077             # get to total characters in the word.
1078 60         229 my $length = $currentWord[-1]->[1] - $currentWord[0]->[1] + $currentWord[-1]->[2];
1079              
1080             # store the word info.
1081 60         226 push @listOfWords, [$word, $position, $length];
1082             }
1083              
1084             # return the new list of characters and their positions.
1085 70         577 return \@listOfWords;
1086             }
1087              
1088             # flattens a list of scalars, references, arrays, references to arrays, and
1089             # any combination of them into a list of references to the scalars.
1090             sub getListOfAllScalarsAsReferences
1091             {
1092 930     930 0 1876 my $Self = shift;
1093              
1094 930         1412 my @listOfRefsToScalars;
1095 930         3913 foreach my $item (@_)
1096             {
1097 930         2432 my $type = ref ($item);
1098 930 100       5190 if ($type eq '')
    100          
    50          
    0          
1099             {
1100 310         3870 push @listOfRefsToScalars, \$item;
1101             }
1102             elsif ($type eq 'SCALAR')
1103             {
1104 310         1520 push @listOfRefsToScalars, $item;
1105             }
1106             elsif ($type eq 'ARRAY')
1107             {
1108 310         596 push @listOfRefsToScalars, @{$Self->getListOfAllScalarsAsReferences (@$item)};
  310         1444  
1109             }
1110             elsif ($type eq 'REF')
1111             {
1112 0         0 push @listOfRefsToScalars, @{$Self->getListOfAllScalarsAsReferences ($$item)};
  0         0  
1113             }
1114             }
1115 930         9463 return \@listOfRefsToScalars;
1116             }
1117              
1118             # give an array reference defined using listOfStrings => ... that defines
1119             # a list of strings. this routine returns the hash values computed for the
1120             # strings.
1121             sub getHashValuesOfListOfStrings
1122             {
1123 1     1   781 use bytes;
  1         2  
  1         3  
1124              
1125 360     360 0 2520 my ($Self, %Parameters) = @_;
1126              
1127             # get the bits to circular shift the hash values by.
1128 360         1735 my $shiftBits = $Self->{bitsInInteger} - 1;
1129 360         906 my $mask = ~(-1 << 1);
1130              
1131             # get the hash values of the bytes.
1132 360         923 my $byteHashValues = $Self->{byteHashValues};
1133              
1134 360         772 my @listOfHashValues;
1135 360         867 foreach my $string (@{$Parameters{listOfStrings}})
  360         1685  
1136             {
1137 2344600         2709103 my $value = 0;
1138 2344600         4271473 foreach my $byte (unpack ('C*', $string))
1139             {
1140 4701240         9125080 $value = ($value << 1) ^ (($value >> $shiftBits) & $mask) ^ $byteHashValues->[$byte];
1141             }
1142 2344600         4830241 push @listOfHashValues, $value;
1143             }
1144 360         3934 return \@listOfHashValues;
1145             }
1146              
1147             # create and store the list of hash values used for bytes; used as a
1148             # basis for all hash values.
1149             sub setByteHashValues
1150             {
1151 311     311 0 861 my ($Self, %Parameters) = @_;
1152              
1153             # get the seed to use for the random number generator.
1154 311         702 my $randomSeed = 1093;
1155 311 50       1223 $randomSeed = $Parameters{randomSeed} if exists $Parameters{randomSeed};
1156 311         961 srand ($randomSeed);
1157              
1158             # create and store the list of byte hash values.
1159 311         546 my $size = 256;
1160 311         759 my @byteHashValues;
1161 311         1507 my $maxValue = $Self->rshift(~0,1);
1162 311         1593 for (my $i = 0; $i < $size; $i++)
1163             {
1164 79616         251805 push @byteHashValues, (int rand ($maxValue)) ^ ((int rand ($maxValue)) << 7);
1165             }
1166 311         1127 $Self->{byteHashValues} = \@byteHashValues;
1167 311         1047 return;
1168             }
1169              
1170             # compute and store the number of bits in an integer.
1171             sub setBitsInInteger
1172             {
1173 311     311 0 619 my $Self = shift;
1174              
1175             # maybe some day computer words will contain 1024 bits.
1176 311         639 my $maxIterations = 1024;
1177 311         455 my $bitsInInteger = 0;
1178 311         1289 for (my $maxInteger = ~0; $maxInteger; $maxInteger <<= 1)
1179             {
1180 19904         23011 ++$bitsInInteger;
1181              
1182             # ensure the loop is finite.
1183 19904 50       55413 last if (--$maxIterations < 1);
1184             }
1185 311         964 $Self->{bitsInInteger} = $bitsInInteger;
1186 311         975 return;
1187             }
1188              
1189             # does a right bit shift even if the int is signed.
1190             sub rshift
1191             {
1192 4031 50   4031 0 22669 return ($_[1] >> $_[2]) & ~(-1 << ($_[0]->{bitsInInteger} - $_[2])) if ($_[2]);
1193 0           return $_[1];
1194             }
1195              
1196             =head1 INSTALLATION
1197              
1198             To install the module run the following commands:
1199              
1200             perl Makefile.PL
1201             make
1202             make test
1203             make install
1204              
1205             If you are on a windows box you should use 'nmake' rather than 'make'.
1206              
1207             =head1 AUTHOR
1208              
1209             Jeff Kubina
1210              
1211             =head1 COPYRIGHT
1212              
1213             Copyright (c) 2009 Jeff Kubina. All rights reserved.
1214             This program is free software; you can redistribute
1215             it and/or modify it under the same terms as Perl itself.
1216              
1217             The full text of the license can be found in the
1218             LICENSE file included with this module.
1219              
1220             =head1 KEYWORDS
1221              
1222             information processing, ngram, ngrams, n-gram, n-grams, string, text
1223              
1224             =head1 SEE ALSO
1225              
1226             L, L, L
1227              
1228             =begin html
1229              
1230             n-gram, split, unpack
1231              
1232             =end html
1233              
1234             =cut
1235              
1236             1;
1237             # The preceding line will help the module return a true value