File Coverage

blib/lib/Text/Ngramize.pm

Criterion	Covered	Total	%
statement	312	373	83.6
branch	54	100	54.0
condition	10	18	55.5
subroutine	39	44	88.6
pod	8	27	29.6
total	423	562	75.2

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Ngramize;
2
3							require 5.008_000;
4	1			1		23954	use strict;
	1					2
	1					34
5	1			1		4	use warnings;
	1					2
	1					24
6	1			1		795	use integer;
	1					13
	1					4
7	1			1		22	use Carp;
	1					1
	1					71
8
9	1			1		5	use constant INDEX_TOKEN => 0;
	1					1
	1					93
10	1			1		4	use constant INDEX_POSITION => 1;
	1					1
	1					36
11	1			1		4	use constant INDEX_LENGTH => 2;
	1					1
	1					88
12
13							BEGIN {
14	1			1		6	use Exporter ();
	1					1
	1					24
15	1			1		5	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	1					1
	1					338
16	1			1		3	$VERSION = '1.03';
17	1					17	@ISA = qw(Exporter);
18	1					2	@EXPORT = qw();
19	1					2	@EXPORT_OK = qw();
20	1					2720	%EXPORT_TAGS = ();
21							}
22
23							=head1 NAME
24
25							C - Computes lists of n-grams from text.
26
27							=head1 SYNOPSIS
28
29							use Text::Ngramize;
30							use Data::Dump qw(dump);
31							my $ngramizer = Text::Ngramize->new (normalizeText => 1);
32							my $text = "This sentence has 7 words; doesn't it?";
33							dump $ngramizer-> (text => \$text);
34
35							=head1 DESCRIPTION
36
37							C is used to compute the list of n-grams derived from the
38							bytes, characters, or words of the text provided. Methods
39							are included that provide positional information about the n-grams
40							computed within the text.
41
42							=head1 CONSTRUCTOR
43
44							=head2 C
45
46							use Text::Ngramize;
47							use Data::Dump qw(dump);
48							my $ngramizer = Text::Ngramize->new (normalizeText => 1);
49							my $text = ' To be.';
50							dump $ngramizer->getListOfNgrams (text => \$text);
51							# dumps:
52							# ["to ", "o b", " be", "be "]
53
54							The constructor C has optional parameters that set how the
55							n-grams are computed. C is used to
56							set the type of tokens used in the n-grams, C is used to set
57							normalization of the text before the tokens are extracted, and C
58							is the character used to join n-grams of words.
59
60							=over
61
62							=item C
63
64							typeOfNgrams => 'characters'
65
66							C sets the type of tokens to extract from the text
67							to form the n-grams: C<'asc'>
68							indicates the list of ASC characters comprising the bytes of the text are to be used, C<'characters'> indicates the
69							list of characters are to be used, and
70							C<'words'> indicates the words in the text are to be used. Note a word is defined
71							as a substring that matches the Perl regular expression '\p{Alphabetic}+', see L
72							for details. The default is C<'characters'>.
73
74							=item C
75
76							sizeOfNgrams => 3
77
78							C holds the size of the n-grams that are to be created from the tokens extracted. Note n-grams of
79							size one are the tokens themselves. C should be a positive integer; the default is three.
80
81							=item C
82
83							normalizeText => 0
84
85							If C evalutes to true, the text is normalized before the tokens are extracted;
86							normalization proceeds by converting the text to lower case, replacing all non-alphabetic
87							characters with a space, compressing multiple spaces to a single space, and removing
88							any leading space but not a trailing space. The
89							default value of C is zero, or false.
90
91							=item C
92
93							ngramWordSeparator => ' '
94
95							C is the character used to separate token-C when forming n-grams from them. It is
96							only used when C is set to C<'words'>; the default is a space. Note, this is used to
97							avoid having n-grams clash, for example, with bigrams the word pairs C<'a aaa'> and C<'aa aa'> would
98							produce the same n-gram C<'aaaa'> without a space separating them.
99
100							=back
101
102							=cut
103
104							sub new
105							{
106	311			311	1	34638541	my ($Class, %Parameters) = @_;
107	311		33			4671	my $Self = bless ({}, ref ($Class) \|\| $Class);
108
109							# set the default type of tokens.
110	311					1287	my $typeOfNgrams = 'characters';
111
112							# get the type of tokens to create from the text.
113	311	100				2648	$typeOfNgrams = lc $Parameters{typeOfNgrams} if exists $Parameters{typeOfNgrams};
114	311	50				3423	unless ($typeOfNgrams =~ /^(a\|c\|w)/)
115							{
116	0					0	croak "Token type '" . $Parameters{typeOfNgrams} . "' is invalid; should be 'asc', 'characters', or 'words'.\n";
117							}
118	311					2988	my %types = qw (a asc c characters w words);
119	311					1059	$typeOfNgrams = $types{substr ($typeOfNgrams, 0, 1)};
120	311					1274	$Self->{typeOfNgrams} = $typeOfNgrams;
121
122							# set normalizeText.
123	311		33			1897	$Self->{normalizeText} = exists ($Parameters{normalizeText}) && $Parameters{normalizeText};
124
125							# get the size of the ngrams.
126	311					1925	my $sizeOfNgrams = 3;
127	311	100				1647	$sizeOfNgrams = int abs $Parameters{sizeOfNgrams} if exists $Parameters{sizeOfNgrams};
128	311	50				1234	$sizeOfNgrams = 1 if ($sizeOfNgrams < 1);
129	311					789	$Self->{sizeOfNgrams} = $sizeOfNgrams;
130
131							# get the delimiter for words
132	311					860	my $ngramWordSeparator = ' ';
133	311	50				973	$ngramWordSeparator = $Parameters{ngramWordSeparator} if exists $Parameters{ngramWordSeparator};
134	311					820	$Self->{ngramWordSeparator} = $ngramWordSeparator;
135
136							# sets values used for returning hash values of n-grams.
137	311					1624	$Self->setBitsInInteger;
138	311					1612	$Self->setByteHashValues;
139
140	311					2317	return $Self;
141							}
142
143							=head1 METHODS
144
145							=head2 C
146
147							Returns the type of n-grams computed as a string, either C<'asc'>,
148							C<'characters'>, or C<'words'>.
149
150							use Text::Ngramize;
151							use Data::Dump qw(dump);
152							my $ngramizer = Text::Ngramize->new ();
153							dump $ngramizer->getTypeOfNgrams;
154							# dumps:
155							# "characters"
156
157							=cut
158
159							sub getTypeOfNgrams
160							{
161	0			0	1	0	return $_[0]->{typeOfNgrams};
162							}
163
164							=head2 C
165
166							Returns the size of n-grams computed.
167
168							use Text::Ngramize;
169							use Data::Dump qw(dump);
170							my $ngramizer = Text::Ngramize->new ();
171							dump $ngramizer->getSizeOfNgrams;
172							# dumps:
173							# 3
174
175							=cut
176
177							sub getSizeOfNgrams
178							{
179	0			0	1	0	return $_[0]->{sizeOfNgrams};
180							}
181
182							=head2 C
183
184							The function C returns an array reference to the list of n-grams computed
185							from the text provided or the list of tokens provided by C.
186
187							=over
188
189							=item C
190
191							text => ...
192
193							C holds the text that the tokens are to be extracted from. It can be a single string,
194							a reference to a string, a reference to an array of strings, or any combination of these.
195
196							=item C
197
198							listOfTokens => ...
199
200							Optionally, if C is not defined, then the list of tokens to use in forming the
201							n-grams can be provided by C, which should point to an array reference of strings.
202
203							=back
204
205							An example using the method:
206
207							use Text::Ngramize;
208							use Data::Dump qw(dump);
209							my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
210							my $text = "This isn't a sentence.";
211							dump $ngramizer->getListOfNgrams (text => \$text);
212							# dumps:
213							# ["this isn t", "isn t a", "t a sentence"]
214							dump $ngramizer->getListOfNgrams (listOfTokens => [qw(aa bb cc dd)]);
215							# dumps:
216							# ["aa bb cc", "bb cc dd"]
217
218							=cut
219
220							sub getListOfNgrams
221							{
222	130			130	1	4024	my ($Self, %Parameters) = @_;
223
224							# get the size of the ngrams to compute.
225	130					717	my $sizeOfNgrams = $Self->{sizeOfNgrams};
226
227							# get the list of tokens from the user to the text provided.
228	130					379	my $listOfTokens;
229	130	50				806	if (exists ($Parameters{text}))
		0
230							{
231							# compute the list of tokens from the text provided.
232	130					766	$listOfTokens = $Self->getListOfTokens($Parameters{text});
233							}
234							elsif (exists ($Parameters{listOfTokens}))
235							{
236							# use the list of tokens provided by the user.
237	0					0	$listOfTokens = $Parameters{listOfTokens};
238							}
239							else
240							{
241							# gotta have some tokens to make n-grams.
242	0					0	croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
243							}
244
245							# if the list of tokens is empty or the size of the n-grams is one,
246							# just return the list of tokens as the list of n-grams.
247	130	100	66			2306	return $listOfTokens if (($#$listOfTokens == -1) \|\| ($sizeOfNgrams == 1));
248
249							# get the string to use to merge the tokens.
250	100					340	my $separator = '';
251	100	50				773	$separator = $Self->{ngramWordSeparator} if ($Self->{typeOfNgrams} =~ /^w/);
252
253							# compute the list of n-grams.
254	100					267	my @listOfNgrams;
255	100					287	my $indexOfLastTokenInNgram = $sizeOfNgrams - 1;
256	100					284	my $indexOfLastNgram = scalar (@$listOfTokens) - $sizeOfNgrams + 1;
257	100					560	for (my $i = 0; $i < $indexOfLastNgram; $i++, $indexOfLastTokenInNgram++)
258							{
259	887610					5393051	push @listOfNgrams, join($separator, @$listOfTokens[$i..$indexOfLastTokenInNgram]);
260							}
261
262							# note, if the number of tokens in the list is less than sizeOfNgrams, then
263							# no n-grams are returned, that is, @listOfNgrams is empty.
264	100					244008	return \@listOfNgrams;
265							}
266
267							=head2 C
268
269							The function C returns an array reference to the list of n-grams computed
270							from the text provided or the list of tokens provided by C. Each item in the list returned
271							is of the form C<['n-gram', starting-index, n-gram-length]>; the starting index and n-gram length are
272							relative to the unnormalized text. When C is C<'asc'> the index and length refer to bytes,
273							when C is C<'characters'> or C<'words'> they refer to characters.
274
275							=over
276
277							=item C
278
279							text => ...
280
281							C holds the text that the tokens are to be extracted from. It can be a single string,
282							a reference to a string, a reference to an array of strings, or any combination of these.
283
284							=item C
285
286							listOfTokens => ...
287
288							Optionally, if C is not defined, then the list of tokens to use in forming the
289							n-grams can be provided by C, which should point to an array reference where
290							each item in the array is of the form C<[token, starting-position, length]>, where
291							C and C are integers indicating the position of the token.
292
293							=back
294
295							An example using the method:
296
297							use Text::Ngramize;
298							use Data::Dump qw(dump);
299							my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
300							my $text = " This isn't a sentence.";
301							dump $ngramizer->getListOfNgramsWithPositions (text => \$text);
302							# dumps:
303							# [
304							# ["this isn t", 1, 11],
305							# ["isn t a", 7, 7],
306							# ["t a sentence", 11, 13],
307							# ]
308
309							=cut
310
311							sub getListOfNgramsWithPositions
312							{
313	130			130	1	1520	my ($Self, %Parameters) = @_;
314
315							# get the size of the ngrams to compute.
316	130					372	my $sizeOfNgrams = $Self->{sizeOfNgrams};
317
318							# get the list of tokens from the user to the text provided.
319	130					256	my $listOfTokens;
320	130	50				834	if (exists ($Parameters{text}))
		0
321							{
322							# compute the list of tokens from the text provided.
323	130					849	$listOfTokens = $Self->getListOfTokensWithPositions (%Parameters);
324							}
325							elsif (exists ($Parameters{listOfTokens}))
326							{
327							# use the list of tokens provided by the user.
328	0					0	$listOfTokens = $Parameters{listOfTokens};
329							}
330							else
331							{
332							# gotta have some tokens to make n-grams.
333	0					0	croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
334							}
335
336							# if the list of tokens is empty or the size of the n-grams is one,
337							# just return the list of tokens as the list of n-grams.
338	130	100	66			2098	return $listOfTokens if (($#$listOfTokens == -1) \|\| ($sizeOfNgrams == 1));
339
340							# get the string to use to merge the tokens.
341	100					337	my $separator = '';
342	100	50				2082	$separator = $Self->{ngramWordSeparator} if ($Self->{typeOfNgrams} =~ /^w/);
343
344							# compute the list of n-grams and their positions.
345	100					225	my @listOfNgramsWithPositions;
346	100					824	my @indices= (0..($sizeOfNgrams - 1));
347	100					297	my $indexOfLastTokenInNgram = $sizeOfNgrams - 1;
348	100					286	my $indexOfLastNgram = scalar (@$listOfTokens) - $sizeOfNgrams + 1;
349	100					752	for (my $i = 0; $i < $indexOfLastNgram; $i++, $indexOfLastTokenInNgram++)
350							{
351	887610					1649851	push @listOfNgramsWithPositions, [join ($separator, map {$listOfTokens->[$_+$i][0]} @indices),
	10994460					29936597
352
353							# index to start of first n-gram
354							$listOfTokens->[$i][1],
355
356							# length of the n-gram.
357							$listOfTokens->[$i + $sizeOfNgrams - 1][1] + $listOfTokens->[$i + $sizeOfNgrams - 1][2] - $listOfTokens->[$i][1]];
358							}
359
360							# note, if the number of tokens in the list is less than sizeOfNgrams, then
361							# no n-grams are returned, that is, @listOfNgrams is empty.
362	100					778830	return \@listOfNgramsWithPositions;
363							}
364
365							=head2 C
366
367							The function C returns an array reference to the list of integer hash values
368							computed from the n-grams
369							of the text provided or the list of tokens provided by C. The advantage of using
370							hashes over strings is that they take less memory and are theoretically faster to compute. With strings
371							the time to compute the n-grams is proportional to their size, with hashes it is not
372							since they are computed recursively. Also, the amount of memory used to store the n-gram strings
373							grows proportional to their size, with hashes it does not. The disadvantage lies with
374							hashing collisions, but these will be very rare. However, for small n-gram sizes hash values
375							may take more time to compute since all code is written in Perl.
376
377							=over
378
379							=item C
380
381							text => ...
382
383							C holds the text that the tokens are to be extracted from. It can be a single string,
384							a reference to a string, a reference to an array of strings, or any combination of these.
385
386							=item C
387
388							listOfTokens => ...
389
390							Optionally, if C is not defined, then the list of tokens to use in forming the
391							n-grams can be provided by C, which should point to an array reference of strings.
392
393							=back
394
395							An example using the method:
396
397							use Text::Ngramize;
398							use Data::Dump qw(dump);
399							my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
400							my $text = "This isn't a sentence.";
401							dump $ngramizer->getListOfNgramHashValues (text => \$text);
402							# NOTE: hash values may vary across computers.
403							# dumps:
404							# [
405							# "4038955636454686726",
406							# "5576083060948369410",
407							# "6093054335710494749",
408							# ] dump $ngramizer->getListOfNgramHashValues (listOfTokens => [qw(aa bb cc dd)]);
409							# dumps:
410							# ["7326140501871656967", "5557417594488258562"]
411
412							=cut
413
414							sub getListOfNgramHashValues
415							{
416	180			180	1	12202	my ($Self, %Parameters) = @_;
417
418							# get the size of the ngrams to compute.
419	180					722	my $sizeOfNgrams = $Self->{sizeOfNgrams};
420
421							# get the list of tokens from the user to the text provided.
422	180					438	my $listOfTokens;
423	180	50				1038	if (exists ($Parameters{text}))
		0
424							{
425							# compute the list of tokens from the text provided.
426	180					1292	$listOfTokens = $Self->getListOfTokens($Parameters{text});
427							}
428							elsif (exists ($Parameters{listOfTokens}))
429							{
430							# use the list of tokens provided by the user.
431	0					0	$listOfTokens = $Parameters{listOfTokens};
432							}
433							else
434							{
435							# gotta have some tokens to make n-grams.
436	0					0	croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
437							}
438
439							# get the list of hashes for the tokens.
440	180					1483	my $listOfHashesOfTokens = $Self->getHashValuesOfListOfStrings (listOfStrings => $listOfTokens);
441	180					827	my $totalTokens = $#$listOfHashesOfTokens + 1;
442
443							# if the list of tokens is empty or the size of the n-grams is one,
444							# just return the list of hash values of the tokens.
445	180	100	66			2051	if (($totalTokens == 0) \|\| ($sizeOfNgrams == 1))
446							{
447	30					39144	return $listOfHashesOfTokens;
448							}
449
450							# holds the hash values computed.
451	150					391	my @listOfHashValues;
452
453							# compute the shifts need to add a hash code to the running hash value.
454	150					692	my $bitInInteger = $Self->{bitsInInteger};
455	150					358	my $addShiftRight = $bitInInteger - 1;
456
457							# compute the value of the first hash.
458	150					334	my $runningHashValue = 0;
459	150					599	for (my $i = 0; $i < $sizeOfNgrams; $i++)
460							{
461	1860					9593	$runningHashValue = ($runningHashValue << 1) ^ $Self->rshift($runningHashValue,$addShiftRight) ^ $listOfHashesOfTokens->[$i];
462							}
463	150					489	push @listOfHashValues, $runningHashValue;
464
465							# compute the shifts needed to remove the oldest hash code in the running hash value.
466	150					2175	my $indexToRemove = 0;
467	150					372	my $removeShiftLeft = ($sizeOfNgrams - 1) % $bitInInteger;
468	150					326	my $removeShiftRight = $bitInInteger - $removeShiftLeft;
469	150					466	my $removeShiftMask = ~(-1 << $removeShiftLeft);
470	150					369	my $addShiftMask = ~(-1 << 1);
471
472	150					704	for (my $indexToAdd = $sizeOfNgrams; $indexToAdd < $totalTokens; $indexToRemove++, $indexToAdd++)
473							{
474	984500					1453859	$runningHashValue ^= ($listOfHashesOfTokens->[$indexToRemove] << $removeShiftLeft) ^ (($listOfHashesOfTokens->[$indexToRemove] >> $removeShiftRight) & $removeShiftMask);
475	984500					1519265	$runningHashValue = ($runningHashValue << 1) ^ (($runningHashValue >> $addShiftRight) & $addShiftMask) ^ $listOfHashesOfTokens->[$indexToAdd];
476	984500					2365417	push @listOfHashValues, $runningHashValue;
477							}
478
479							# note, if the number of tokens in the list is less than sizeOfNgrams, then
480							# no n-grams are returned, that is, @listOfHashValues is empty.
481	150					239619	return \@listOfHashValues;
482							}
483
484							=head2 C
485
486							The function C returns an array reference to the list of integer hash values and
487							n-gram positional information computed
488							from the text provided or the list of tokens provided by C. Each item in the list returned
489							is of the form C<['n-gram-hash', starting-index, n-gram-length]>; the starting index and n-gram length are
490							relative to the unnormalized text. When C is C<'asc'> the index and length refer to bytes,
491							when C is C<'characters'> or C<'words'> they refer to characters.
492
493							The advantage of using
494							hashes over strings is that they take less memory and are theoretically faster to compute. With strings
495							the time to compute the n-grams is proportional to their size, with hashes it is not
496							since they are computed recursively. Also, the amount of memory used to store the n-gram strings
497							grows proportional to their size, with hashes it does not. The disadvantage lies with
498							hashing collisions, but these will be very rare. However, for small n-gram sizes hash values
499							may take more time to compute since all code is written in Perl.
500
501							=over
502
503							=item C
504
505							text => ...
506
507							C holds the text that the tokens are to be extracted from. It can be a single string,
508							a reference to a string, a reference to an array of strings, or any combination of these.
509
510							=item C
511
512							listOfTokens => ...
513
514							Optionally, if C is not defined, then the list of tokens to use in forming the
515							n-grams can be provided by C, which should point to an array reference where
516							each item in the array is of the form C<[token, starting-position, length]>, where
517							C and C are integers indicating the position of the token.
518
519							=back
520
521							An example using the method:
522
523							use Text::Ngramize;
524							use Data::Dump qw(dump);
525							my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', normalizeText => 1);
526							my $text = " This isn't a sentence.";
527							dump $ngramizer->getListOfNgramHashValuesWithPositions (text => \$text);
528							# NOTE: hash values may vary across computers.
529							# dumps:
530							# [
531							# ["4038955636454686726", 1, 11],
532							# ["5576083060948369410", 7, 7],
533							# ["6093054335710494749", 11, 13],
534							# ]
535
536							=cut
537
538							sub getListOfNgramHashValuesWithPositions
539							{
540	180			180	1	2869	my ($Self, %Parameters) = @_;
541
542							# get the size of the ngrams to compute.
543	180					473	my $sizeOfNgrams = $Self->{sizeOfNgrams};
544
545							# get the list of tokens from the user to the text provided.
546	180					540	my $listOfTokens;
547	180	50				774	if (exists ($Parameters{text}))
		0
548							{
549							# compute the list of tokens from the text provided.
550	180					1284	$listOfTokens = $Self->getListOfTokensWithPositions(%Parameters);
551							}
552							elsif (exists ($Parameters{listOfTokens}))
553							{
554							# use the list of tokens provided by the user.
555	0					0	$listOfTokens = $Parameters{listOfTokens};
556							}
557							else
558							{
559							# gotta have some tokens to make n-grams.
560	0					0	croak "neither parameter 'text => ' nor 'listOfTokens => ' were defined; at least one of them must be defined.\n";
561							}
562
563							# get the list of hashes for the tokens.
564	180					6751	my $listOfHashesOfTokens = $Self->getHashValuesOfListOfStrings (listOfStrings => [map {$_->[0]} @$listOfTokens]);
	1172300					2935227
565	180					434947	my $totalTokens = $#$listOfHashesOfTokens + 1;
566
567							# if the list of tokens is empty or the size of the n-grams is one,
568							# just return the list of hash values of the tokens.
569	180	100	66			2760	if (($totalTokens == 0) \|\| ($sizeOfNgrams == 1))
570							{
571	30					130	for (my $i = 0; $i < $totalTokens; $i++)
572							{
573	185940					845053	$listOfHashesOfTokens->[$i] = [$listOfHashesOfTokens->[$i], $listOfTokens->[$i][1], $listOfTokens->[$i][2]];
574							}
575	30					79526	return $listOfHashesOfTokens;
576							}
577
578							# holds the hash values computed.
579	150					454	my @listOfHashValues;
580
581							# compute the shifts need to add a hash code to the running hash value.
582	150					889	my $bitInInteger = $Self->{bitsInInteger};
583	150					653	my $addShiftRight = $bitInInteger - 1;
584
585							# compute the value of the first hash.
586	150					316	my $runningHashValue = 0;
587	150					834	for (my $i = 0; $i < $sizeOfNgrams; $i++)
588							{
589	1860					6695	$runningHashValue = ($runningHashValue << 1) ^ $Self->rshift($runningHashValue,$addShiftRight) ^ $listOfHashesOfTokens->[$i];
590							}
591	150					1588	push @listOfHashValues, [$runningHashValue, $listOfTokens->[0][1], $listOfTokens->[$sizeOfNgrams - 1][1] + $listOfTokens->[$sizeOfNgrams - 1][2] - $listOfTokens->[0][1]];
592
593							# compute the shifts need to remove the oldest hash code in the running hash value.
594	150					297	my $indexToRemove = 0;
595	150					386	my $removeShiftLeft = ($sizeOfNgrams - 1) % $bitInInteger;
596	150					291	my $removeShiftRight = $bitInInteger - $removeShiftLeft;
597	150					351	my $removeShiftMask = ~(-1 << $removeShiftLeft);
598	150					241	my $addShiftMask = ~(-1 << 1);
599
600	150					585	for (my $indexToAdd = $sizeOfNgrams; $indexToAdd < $totalTokens; $indexToRemove++, $indexToAdd++)
601							{
602	984500					1673640	$runningHashValue ^= ($listOfHashesOfTokens->[$indexToRemove] << $removeShiftLeft) ^ (($listOfHashesOfTokens->[$indexToRemove] >> $removeShiftRight) & $removeShiftMask);
603	984500					1602857	$runningHashValue = ($runningHashValue << 1) ^ (($runningHashValue >> $addShiftRight) & $addShiftMask) ^ $listOfHashesOfTokens->[$indexToAdd];
604	984500					5365035	push @listOfHashValues, [$runningHashValue, $listOfTokens->[$indexToRemove + 1][1], $listOfTokens->[$indexToRemove + $sizeOfNgrams][1] + $listOfTokens->[$indexToRemove + $sizeOfNgrams][2] - $listOfTokens->[$indexToRemove + 1][1]];
605							}
606
607							# note, if the number of tokens in the list is less than sizeOfNgrams, then
608							# no n-grams are returned, that is, @listOfHashValues is empty.
609	150					579451	return \@listOfHashValues;
610							}
611
612							sub getListOfTokens
613							{
614	310			310	0	1461	my ($Self, @Text) = @_;
615
616							# if no text, return the empty list.
617	310	50				1494	return [] unless @Text;
618	310					933	my $text = \@Text;
619
620							# get the tokens.
621	310	100				5943	if ($Self->{typeOfNgrams} =~ /^a/)
		100
		50
622							{
623	120					601	return $Self->getListOfAsc ($text);
624							}
625							elsif ($Self->{typeOfNgrams} =~ /^c/)
626							{
627	120					764	return $Self->getListOfCharacters ($text);
628							}
629							elsif ($Self->{typeOfNgrams} =~ /^w/)
630							{
631	70					345	return $Self->getListOfWords ($text);
632							}
633							else
634							{
635	0					0	croak "programming error: parameter typeOfNgrams has value '" . $Self->{typeOfNgrams}. "' and should not.\n";
636							}
637							}
638
639							# returns an array reference to the list of tokens found text provided
640							# including each tokens' position and length in the original text. Each
641							# entry in the array is of the form ['token', starting-position, length]
642							sub getListOfTokensWithPositions # (text => '..')
643							{
644	310			310	0	1126	my ($Self, %Parameters) = @_;
645
646							# if no text, return the empty list.
647	310	50				1591	return [] unless exists $Parameters{text};
648	310					772	my $text = $Parameters{text};
649
650							# get the tokens.
651	310	100				3375	if ($Self->{typeOfNgrams} =~ /^a/)
		100
		50
652							{
653	120					609	return $Self->getListOfAscWithPositions (\$text);
654							}
655							elsif ($Self->{typeOfNgrams} =~ /^c/)
656							{
657	120					635	return $Self->getListOfCharactersWithPositions (\$text);
658							}
659							elsif ($Self->{typeOfNgrams} =~ /^w/)
660							{
661	70					385	return $Self->getListOfWordsWithCharacterPositions (\$text);
662							}
663							else
664							{
665	0					0	croak "programming error: parameter typeOfNgrams has value '" . $Self->{typeOfNgrams}. "' and should not.\n";
666							}
667							}
668
669							# uses unpack to return the list of asc characters of the text.
670							sub getListOfAsc # ($text)
671							{
672	1			1		2021	use bytes;
	1					16
	1					7
673	120			120	0	295	my $Self = shift;
674
675							# get the text as a list of string references.
676	120					725	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
677
678	120					223	my @listOfTokens;
679	120					362	foreach my $stringRef (@$listOfText)
680							{
681	120	50				543	if ($Self->{normalizeText})
682							{
683	0					0	push @listOfTokens, map {chr} unpack ('C*', $Self->normalizeText ($stringRef));
	0					0
684							}
685							else
686							{
687	120					95892	push @listOfTokens, map {chr} unpack ('C*', $$stringRef);
	1568610					3282478
688							}
689							}
690	120					1957	return \@listOfTokens;
691							}
692
693							# uses an empty split to get all the characters in the text.
694							sub getListOfCharacters # ($text)
695							{
696	120			120	0	292	my $Self = shift;
697	120					709	return $Self->getListOfTokensUsingRegexp ('', @_);
698							}
699
700							# returns a list of all substrings of letters.
701							sub getListOfWords # ($text)
702							{
703	70			70	0	165	my $Self = shift;
704	70					316	return $Self->getListOfTokensUsingRegexp ('[^\p{IsAlphabetic}]+', @_);
705							}
706
707							# regexp must be a regexp without bounding slashes.
708							sub getListOfTokensUsingRegexp # ($regexp, $text)
709							{
710	190			190	0	399	my $Self = shift;
711	190					381	my $regexp = shift;
712
713							# get the text as a list of string references.
714	190					1047	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
715
716	190					465	my @listOfTokens;
717	190					569	foreach my $stringRef (@$listOfText)
718							{
719	190	50				893	if ($Self->{normalizeText})
720							{
721	0					0	push @listOfTokens, split (/$regexp/, $Self->normalizeText ($stringRef));
722							}
723							else
724							{
725	190					629992	push @listOfTokens, split (/$regexp/, $$stringRef);
726							}
727							}
728	190					5714	return \@listOfTokens;
729							}
730
731							# returns a copy of the text that is lower case, has all none letters replaced
732							# with spaces, and has all multiple spaces replaced with one space.
733							sub normalizeText # ($text)
734							{
735	0			0	1	0	my $Self = shift;
736	0					0	my $Text = shift;
737
738							# make a copy of the text, and downcase it.
739	0					0	my $type = ref ($Text);
740	0	0				0	if ($type eq '')
		0
741							{
742	0					0	$Text = lc $Text;
743							}
744							elsif ($type eq 'SCALAR')
745							{
746	0					0	$Text = lc $$Text;
747							}
748
749							# convert all none letters to spaces; tried to use tr/// but \p{IsAlphabetic} gave a
750							# warning with it.
751	1			1		1719	$Text =~ s/\P{IsAlphabetic}/ /g;
	1					10
	1					12
	0					0
752
753							# compress mulitple spaces to one space.
754	0					0	$Text =~ tr/ / /s;
755
756							# remove leading spaces.
757	0					0	$Text =~ s/^ +//;
758
759	0					0	return $Text;
760							}
761
762							# if the tokens are to be ASC, positions without normalization with be just the
763							# byte number. to get the byte number with filtering, first we need to convert
764							# original text into characters with positions that include their byte index. then
765							# filter the text. so really we convert to characters first and generate either
766							# character positions or byte positions. from there we can get byte positions or
767							# characters positions or word positions.
768
769							# give a string reference returns an array reference of the form:
770							# [
771							# [character, position = $OffSet, 1],
772							# [character, position = $OffSet + 1, 1],
773							# ...
774							# [character, position = $Offset + length ($$StringRef) - 1, 1]
775							# ]
776							sub getCharactersWithCharacterPositions # ($StringRef, $OffSet)
777							{
778	190			190	0	573	my $Self = shift;
779	190					590	my $StringRef = shift;
780
781							# default value of the offset is zero.
782	190					377	my $Offset = shift;
783	190	50				866	$Offset = 0 unless defined $Offset;
784
785							# build the list of characters with their position.
786	190					375916	my @listOfCharacters = map { [$_, $Offset++, 1] } split //, $$StringRef;
	1000060					3824225
787	190					365886	return \@listOfCharacters;
788							}
789
790							# give a string reference returns an array reference of the form:
791							# [
792							# [character, byte-position, bytes::length (character)],
793							# ...
794							# [character, byte-position, bytes::length (character)]
795							# ]
796							sub getCharactersWithBytePositions # ($StringRef, $OffSet)
797							{
798	120			120	0	193	my $Self = shift;
799	120					202	my $StringRef = shift;
800
801							# default value of the offset is zero.
802	120					183	my $Offset = shift;
803	120	50				355	$Offset = 0 unless defined $Offset;
804
805							# build the list of characters with their byte position.
806	120					303900	my @listOfCharacters = split //, $$StringRef;
807							{
808	1			1		29867	use bytes;
	1					13
	1					11
	120					35546
809	120					442	foreach my $char (@listOfCharacters)
810							{
811	623930					1411002	my $len = bytes::length ($char);
812	623930					2695834	$char = [$char, $Offset, $len];
813	623930					1033821	$Offset += $len;
814							}
815	1			1		56	no bytes;
	1					2
	1					4
816							}
817	120					767	return \@listOfCharacters;
818							}
819
820							# given the list of characters returned from getCharactersWithCharacterPositions or
821							# from getCharactersWithBytePositions returns a list of the normalized characters.
822							sub normalizeCharacterList
823							{
824	0			0	0	0	my $Self = shift;
825
826							# get the list of characters with position info.
827	0					0	my $ListOfCharacters = shift;
828
829	0					0	my @filteredList;
830	0					0	my $previousCharIsSpace = 1;
831
832	0					0	for (my $i = 0; $i < @$ListOfCharacters; $i++)
833							{
834							# get the pair [character, position].
835	0					0	my $charPos = $ListOfCharacters->[$i];
836
837							# lowercase the character or convert it to a space.
838	0					0	my $newChar;
839	0	0				0	if ($charPos->[0] =~ m/^\p{IsAlphabetic}$/o)
840							{
841	0					0	$newChar = lc $charPos->[0];
842							}
843							else
844							{
845	0					0	$newChar = ' ';
846							}
847
848							# add the new character to the list.
849	0	0				0	if ($newChar ne ' ')
		0
850							{
851							# if the new character is not a space, add it.
852	0					0	$previousCharIsSpace = 0;
853	0					0	push @filteredList, [$newChar, $charPos->[1], $charPos->[2]];
854							}
855							elsif (!$previousCharIsSpace)
856							{
857							# if the new character is a space but the previous was not, add it.
858	0					0	$previousCharIsSpace = 1;
859	0					0	push @filteredList, [$newChar, $charPos->[1], $charPos->[2]];
860							}
861							}
862
863							# return the new list of characters and their positions.
864	0					0	return \@filteredList;
865							}
866
867							# given a list of text (strings), returns an array reference to the list
868							# of ASC characters comprizing all the text, with their bytes positions and
869							# length (always 1) in the original text.
870							sub getListOfAscWithPositions # ($Text)
871							{
872	120			120	0	265	my $Self = shift;
873
874							# get the text as a list of string references.
875	120					599	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
876
877	120					215	my @listOfTokensWithPosition;
878	120					246	my $offset = 0;
879	120					551	for (my $i = 0; $i < @$listOfText; $i++)
880							{
881	120					283	my $stringRef = $listOfText->[$i];
882
883							# convert the string to a list of characters.
884	120					477	my $listOfCharacters = $Self->getCharactersWithBytePositions ($stringRef, $offset);
885
886							# normalize of the list of characters.
887	120	50				937	if ($Self->{normalizeText})
888							{
889	0					0	$listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
890							}
891
892							# expand each character into its list of bytes.
893							{
894	1			1		365	use bytes;
	1					2
	1					3
	120					459
895
896	120					611	for (my $j = 0; $j < @$listOfCharacters; $j++)
897							{
898							# get the characters, its position, and length.
899	623930					1136043	my $charPosLen = $listOfCharacters->[$j];
900
901							# split the characters into ASC bytes.
902	623930					1510248	my @listOfAsc = map {chr} unpack ('C*', $charPosLen->[0]);
	1568610					3352913
903
904	623930					1166973	my $byteOffset = $charPosLen->[1];
905	623930					936597	foreach my $asc (@listOfAsc)
906							{
907	1568610					5803960	push @listOfTokensWithPosition, [$asc, $byteOffset++, 1];
908							}
909							}
910
911	120					1422	$offset += bytes::length ($$stringRef);
912	1			1		85	no bytes;
	1					2
	1					3
913							}
914							}
915
916	120					398380	return \@listOfTokensWithPosition;
917							}
918
919							# given a list of text (strings), returns an array reference to the list
920							# of characters comprizing all the text, with their character positions and
921							# length in the original text.
922							sub getListOfCharactersWithPositions # ($Text)
923							{
924	120			120	0	248	my $Self = shift;
925
926							# get the text as a list of string references.
927	120					666	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
928
929	120					199	my @listOfTokensWithPosition;
930	120					1320	my $offset = 0;
931	120					604	for (my $i = 0; $i < @$listOfText; $i++)
932							{
933	120					307	my $stringRef = $listOfText->[$i];
934
935							# convert the string to a list of characters.
936	120					614	my $listOfCharacters = $Self->getCharactersWithCharacterPositions ($stringRef, $offset);
937
938							# normalize of the list of characters.
939	120	50				1199	if ($Self->{normalizeText})
940							{
941	0					0	$listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
942							}
943
944							# append the list of characters.
945	120					172945	push @listOfTokensWithPosition, @$listOfCharacters;
946
947							# accumulate the character offsets.
948	120					75449	$offset += length ($$stringRef);
949							}
950
951	120					1478	return \@listOfTokensWithPosition;
952							}
953
954							# given a list of text (strings), returns an array reference to the list
955							# of words comprizing all the text, with their character positions and
956							# length in the original text. [word, start-position, length]
957							sub getListOfWordsWithCharacterPositions # ($Text)
958							{
959	70			70	0	176	my $Self = shift;
960
961							# get the text as a list of string references.
962	70					509	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
963
964	70					144	my @listOfTokensWithPosition;
965	70					168	my $offset = 0;
966	70					329	for (my $i = 0; $i < @$listOfText; $i++)
967							{
968	70					278	my $stringRef = $listOfText->[$i];
969
970							# convert the string to a list of characters.
971	70					363	my $listOfCharacters = $Self->getCharactersWithCharacterPositions ($stringRef, $offset);
972
973							# normalize of the list of characters.
974	70	50				745	if ($Self->{normalizeText})
975							{
976	0					0	$listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
977							}
978
979							# get the list of words.
980	70					576	my $listOfWords = $Self->getListOfWordsFromCharacterList ($listOfCharacters);
981
982							# append the list of characters.
983	70					22492	push @listOfTokensWithPosition, @$listOfWords;
984
985							# accumulate the character offsets.
986	70					293885	$offset += length ($$stringRef);
987							}
988
989	70					1065	return \@listOfTokensWithPosition;
990							}
991
992							# given a list of text (strings), returns an array reference to the list
993							# of words comprizing all the text, with their bytes positions and
994							# length in the original text. [word, start-position, length]
995							sub getListOfWordsWithBytePositions # ($Text)
996							{
997	0			0	0	0	my $Self = shift;
998
999							# get the text as a list of string references.
1000	0					0	my $listOfText = $Self->getListOfAllScalarsAsReferences (@_);
1001
1002	0					0	my @listOfTokensWithPosition;
1003	0					0	my $offset = 0;
1004	0					0	for (my $i = 0; $i < @$listOfText; $i++)
1005							{
1006	0					0	my $stringRef = $listOfText->[$i];
1007
1008							# convert the string to a list of characters.
1009	0					0	my $listOfCharacters = $Self->getCharactersWithBytePositions ($stringRef, $offset);
1010
1011							# normalize of the list of characters.
1012	0	0				0	if ($Self->{normalizeText})
1013							{
1014	0					0	$listOfCharacters = $Self->normalizeCharacterList ($listOfCharacters);
1015							}
1016
1017							# get the list of words.
1018	0					0	my $listOfWords = $Self->getListOfWordsFromCharacterList ($listOfCharacters);
1019
1020							# append the list of characters.
1021	0					0	push @listOfTokensWithPosition, @$listOfWords;
1022
1023							# accumulate the byte offsets.
1024	0					0	$offset += length ($$stringRef);
1025							}
1026
1027	0					0	return \@listOfTokensWithPosition;
1028							}
1029
1030							sub getListOfWordsFromCharacterList # ($ListOfCharactersWithPositions)
1031							{
1032	70			70	0	195	my $Self = shift;
1033
1034							# get the list of characters with position info.
1035	70					172	my $ListOfCharacters = shift;
1036
1037	70					152	my @listOfWords;
1038							my @currentWord;
1039	70					442	for (my $i = 0; $i < @$ListOfCharacters; $i++)
1040							{
1041							# get the pair [character, position, length].
1042	376130					694395	my $charPosLen = $ListOfCharacters->[$i];
1043
1044	376130	100				1485875	if ($charPosLen->[0] =~ m/^\p{IsAlphabetic}$/)
		100
1045							{
1046							# got a letter so accumulate the letters of the word.
1047	307250					881926	push @currentWord, $charPosLen;
1048							}
1049							elsif (@currentWord)
1050							{
1051							# none letter, so concat all the characters of the word.
1052	53700					105626	my $word = join ('', map {$_->[0]} @currentWord);
	306620					654781
1053
1054							# get the position of the first character in the word.
1055	53700					117709	my $position = $currentWord[0]->[1];
1056
1057							# compute total characters in the word.
1058	53700					125627	my $length = $currentWord[-1]->[1] - $currentWord[0]->[1] + $currentWord[-1]->[2];
1059
1060							# store the word info.
1061	53700					162666	push @listOfWords, [$word, $position, $length];
1062
1063							# clear the cache or word characters.
1064	53700					206747	$#currentWord = -1;
1065							}
1066							}
1067
1068							# store the last word if there is one.
1069	70	100				279	if (@currentWord)
1070							{
1071							# none letter, so concat all the characters of the word.
1072	60					197	my $word = join ('', map {$_->[0]} @currentWord);
	630					1103
1073
1074							# get the position of the first character in the word.
1075	60					198	my $position = $currentWord[0]->[1];
1076
1077							# get to total characters in the word.
1078	60					229	my $length = $currentWord[-1]->[1] - $currentWord[0]->[1] + $currentWord[-1]->[2];
1079
1080							# store the word info.
1081	60					226	push @listOfWords, [$word, $position, $length];
1082							}
1083
1084							# return the new list of characters and their positions.
1085	70					577	return \@listOfWords;
1086							}
1087
1088							# flattens a list of scalars, references, arrays, references to arrays, and
1089							# any combination of them into a list of references to the scalars.
1090							sub getListOfAllScalarsAsReferences
1091							{
1092	930			930	0	1876	my $Self = shift;
1093
1094	930					1412	my @listOfRefsToScalars;
1095	930					3913	foreach my $item (@_)
1096							{
1097	930					2432	my $type = ref ($item);
1098	930	100				5190	if ($type eq '')
		100
		50
		0
1099							{
1100	310					3870	push @listOfRefsToScalars, \$item;
1101							}
1102							elsif ($type eq 'SCALAR')
1103							{
1104	310					1520	push @listOfRefsToScalars, $item;
1105							}
1106							elsif ($type eq 'ARRAY')
1107							{
1108	310					596	push @listOfRefsToScalars, @{$Self->getListOfAllScalarsAsReferences (@$item)};
	310					1444
1109							}
1110							elsif ($type eq 'REF')
1111							{
1112	0					0	push @listOfRefsToScalars, @{$Self->getListOfAllScalarsAsReferences ($$item)};
	0					0
1113							}
1114							}
1115	930					9463	return \@listOfRefsToScalars;
1116							}
1117
1118							# give an array reference defined using listOfStrings => ... that defines
1119							# a list of strings. this routine returns the hash values computed for the
1120							# strings.
1121							sub getHashValuesOfListOfStrings
1122							{
1123	1			1		781	use bytes;
	1					2
	1					3
1124
1125	360			360	0	2520	my ($Self, %Parameters) = @_;
1126
1127							# get the bits to circular shift the hash values by.
1128	360					1735	my $shiftBits = $Self->{bitsInInteger} - 1;
1129	360					906	my $mask = ~(-1 << 1);
1130
1131							# get the hash values of the bytes.
1132	360					923	my $byteHashValues = $Self->{byteHashValues};
1133
1134	360					772	my @listOfHashValues;
1135	360					867	foreach my $string (@{$Parameters{listOfStrings}})
	360					1685
1136							{
1137	2344600					2709103	my $value = 0;
1138	2344600					4271473	foreach my $byte (unpack ('C*', $string))
1139							{
1140	4701240					9125080	$value = ($value << 1) ^ (($value >> $shiftBits) & $mask) ^ $byteHashValues->[$byte];
1141							}
1142	2344600					4830241	push @listOfHashValues, $value;
1143							}
1144	360					3934	return \@listOfHashValues;
1145							}
1146
1147							# create and store the list of hash values used for bytes; used as a
1148							# basis for all hash values.
1149							sub setByteHashValues
1150							{
1151	311			311	0	861	my ($Self, %Parameters) = @_;
1152
1153							# get the seed to use for the random number generator.
1154	311					702	my $randomSeed = 1093;
1155	311	50				1223	$randomSeed = $Parameters{randomSeed} if exists $Parameters{randomSeed};
1156	311					961	srand ($randomSeed);
1157
1158							# create and store the list of byte hash values.
1159	311					546	my $size = 256;
1160	311					759	my @byteHashValues;
1161	311					1507	my $maxValue = $Self->rshift(~0,1);
1162	311					1593	for (my $i = 0; $i < $size; $i++)
1163							{
1164	79616					251805	push @byteHashValues, (int rand ($maxValue)) ^ ((int rand ($maxValue)) << 7);
1165							}
1166	311					1127	$Self->{byteHashValues} = \@byteHashValues;
1167	311					1047	return;
1168							}
1169
1170							# compute and store the number of bits in an integer.
1171							sub setBitsInInteger
1172							{
1173	311			311	0	619	my $Self = shift;
1174
1175							# maybe some day computer words will contain 1024 bits.
1176	311					639	my $maxIterations = 1024;
1177	311					455	my $bitsInInteger = 0;
1178	311					1289	for (my $maxInteger = ~0; $maxInteger; $maxInteger <<= 1)
1179							{
1180	19904					23011	++$bitsInInteger;
1181
1182							# ensure the loop is finite.
1183	19904	50				55413	last if (--$maxIterations < 1);
1184							}
1185	311					964	$Self->{bitsInInteger} = $bitsInInteger;
1186	311					975	return;
1187							}
1188
1189							# does a right bit shift even if the int is signed.
1190							sub rshift
1191							{
1192	4031	50		4031	0	22669	return ($_[1] >> $_[2]) & ~(-1 << ($_[0]->{bitsInInteger} - $_[2])) if ($_[2]);
1193	0						return $_[1];
1194							}
1195
1196							=head1 INSTALLATION
1197
1198							To install the module run the following commands:
1199
1200							perl Makefile.PL
1201							make
1202							make test
1203							make install
1204
1205							If you are on a windows box you should use 'nmake' rather than 'make'.
1206
1207							=head1 AUTHOR
1208
1209							Jeff Kubina
1210
1211							=head1 COPYRIGHT
1212
1213							Copyright (c) 2009 Jeff Kubina. All rights reserved.
1214							This program is free software; you can redistribute
1215							it and/or modify it under the same terms as Perl itself.
1216
1217							The full text of the license can be found in the
1218							LICENSE file included with this module.
1219
1220							=head1 KEYWORDS
1221
1222							information processing, ngram, ngrams, n-gram, n-grams, string, text
1223
1224							=head1 SEE ALSO
1225
1226							L, L, L
1227
1228							=begin html
1229
1230							n-gram, split, unpack
1231
1232							=end html
1233
1234							=cut
1235
1236							1;
1237							# The preceding line will help the module return a true value