File Coverage

blib/lib/Text/StemTagPOS.pm

Criterion	Covered	Total	%
statement	343	444	77.2
branch	62	136	45.5
condition	12	27	44.4
subroutine	40	44	90.9
pod	7	7	100.0
total	464	658	70.5

line	stmt	bran	cond	sub	pod	time	code
1							package Text::StemTagPOS;
2
3							require 5.006002;
4	3			3		155992	use strict;
	3					11
	3					149
5	3			3		17	use warnings;
	3					7
	3					180
6	3			3		18	use Carp;
	3					10
	3					553
7	3			3		5432	use Encode;
	3					93626
	3					354
8	3			3		3191	use Lingua::Stem::Snowball;
	3					8033
	3					206
9	3			3		4021	use Lingua::EN::Tagger;
	3					529363
	3					180
10	3			3		5353	use Data::Dump qw(dump);
	3					59664
	3					407
11
12							BEGIN {
13	3			3		37	use Exporter ();
	3					8
	3					77
14	3			3		17	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	3					7
	3					368
15	3			3		31	$VERSION = '0.61';
16	3					66	@ISA = qw(Exporter);
17	3					7	@EXPORT = qw();
18	3					7	@EXPORT_OK = qw();
19	3					503	%EXPORT_TAGS = ();
20							}
21
22	3			3		21	use constant WORD_STEMMED => 0;
	3					6
	3					226
23	3			3		16	use constant WORD_ORIGINAL => 1;
	3					5
	3					130
24	3			3		15	use constant WORD_POSTAG => 2;
	3					14
	3					610
25	3			3		18	use constant WORD_INDEX => 3;
	3					5
	3					141
26	3			3		17	use constant WORD_CHAR_POSITION => 4;
	3					6
	3					123
27	3			3		16	use constant WORD_CHAR_LENGTH => 5;
	3					4
	3					119
28	3			3		15	use constant WORD_SENTENCE_ID => 6;
	3					5
	3					148
29	3			3		549	use constant WORD_USER_DEFINED => 7;
	3					7
	3					148
30
31	3			3		15	use constant POSTAGS_PERIOD => 'PP';
	3					6
	3					144
32	3			3		17	use constant POSTAGS_PUNCTUATION => qw(PP PGP PPC PPD PPL PPR PPS LRB RRB SYM);
	3					463
	3					220
33	3			3		19	use constant POSTAGS_NOUN => qw(NN NNP NNPS NNS);
	3					6
	3					160
34	3			3		16	use constant POSTAGS_ADJECTIVE => qw(CD JJ JJR JJS);
	3					7
	3					298
35	3			3		16	use constant POSTAGS_VERB => qw(VB VBD VBG VBN VBP VBZ);
	3					7
	3					177
36	3			3		24	use constant POSTAGS_ADVERB => qw(RB RBR RBS RP WRB);
	3					6
	3					153
37	3			3		17	use constant POSTAGS_CONTENT_ADVERB => qw(RBR RBS RP);
	3					5
	3					730
38	3			3		19	use constant POSTAGS_ALL => qw(CC CD DET EX FW IN JJ JJR JJS LS MD NN NNP NNPS NNS PDT POS PRP PRPS RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WPS WRB PP PGP PPC PPD PPL PPR PPS LRB RRB);
	3					5
	3					323
39	3			3		18	use constant POSTAGS_CONTENT => (POSTAGS_CONTENT_ADVERB, POSTAGS_VERB, POSTAGS_ADJECTIVE, POSTAGS_NOUN);
	3					5
	3					643
40	3			3		18	use constant POSTAGS_TEXTRANK => (POSTAGS_NOUN, POSTAGS_ADJECTIVE);
	3					7
	3					39782
41
42							=head1 NAME
43
44							C - Computes stemmed/POS tagged lists of text.
45
46							=head1 SYNOPSIS
47
48							use Text::StemTagPOS;
49							use Data::Dump qw(dump);
50							my $stemTagger = Text::StemTagPOS->new;
51							my $text = 'The first sentence. Sentence number two.';
52							my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
53							dump $listOfStemmedTaggedSentences;
54
55							=head1 DESCRIPTION
56
57							C uses the modules L and L
58							to do part-of-speech tagging and stemming of English text. It was developed
59							to pre-process text for other modules. Encoding of all text
60							should be in Perl's internal format; see L for converting text from
61							various encodes to a Perl string.
62
63							=head1 CONSTRUCTOR
64
65							=head2 C
66
67							The method C creates an instance of the C class with the following
68							parameters:
69
70							=over
71
72							=item C
73
74							isoLangCode => 'en'
75
76							C is the ISO language code of the language that will be tagged and
77							stemmed by the object. It must be 'en', which is the default; other languages
78							may be added when POS taggers for them are added to CPAN.
79
80							=item C
81
82							endingSentenceTag => 'PP'
83
84							C is the part-of-speech tag from L
85							that will be used to indicate
86							the end of a sentence. The default is 'PP'. The value of C must be
87							a tag generated by the module L; see method
88							C for all the possible tags; which are based on the
89							Penn Treebank tagset.
90
91
92							=item C and/or C
93
94							listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
95
96							The method C uses C and C
97							to build the default list of the
98							parts-of-speech to be retained when filtering previously tagged text.
99							The default list is C<[qw(TEXTRANK_WORDS)]>,
100							which is all the nouns and adjectives in the text, as used in the textrank algorithm. Permitted
101							types for C are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
102							'TEXTRANK_WORDS', and 'VERBS'. C provides finer control over the
103							parts-of-speech to be retained. For a list
104							of all the possible tags see method C.
105
106							=back
107
108							=cut
109
110							sub new
111							{
112							# create the class object.
113	4			4	1	648	my ($Class, %Parameters) = @_;
114	4		33			41	my $Self = bless {}, ref($Class) \|\| $Class;
115
116							# get the stemmer to normalize the words, default is english.
117	4					13	my $isoLangCode = 'en';
118	4	50				18	$isoLangCode = lc $Parameters{isoLangCode} if (exists ($Parameters{isoLangCode}));
119	4					53	$Self->{stemmer} = Lingua::Stem::Snowball->new (lang => $isoLangCode, encode => 'UTF-8');
120
121							# get the POS tagger.
122	4	50				495	if ($isoLangCode eq 'en')
123							{
124	4					48	$Self->{tagger} = new Lingua::EN::Tagger;
125							}
126							else
127							{
128	0					0	croak "parameter isoLangCode must be 'en'.\n";
129							}
130
131							# read in the part of speech tags.
132	4					1236548	$Self->_getPartOfSpeechTags ();
133
134							# get the ending sentence part of speech tag which will be
135							# used to break text into array of sentences.
136	4	50				39	unless (exists ($Parameters{endingSentenceTag}))
137							{
138	4					17	$Parameters{endingSentenceTag} = 'PP';
139							}
140	4					18	$Self->{endingSentenceTag} = uc $Parameters{endingSentenceTag};
141	4					19	$Self->{endingSentenceTag} =~ tr/A-Z//cd;
142	4	50				39	$Self->{endingSentenceTag} = '/' . $Self->{endingSentenceTag} if (substr ($Self->{endingSentenceTag}, 0, 1) ne '/');
143
144							# get the list of parts of speech to keep when filtering.
145							# the default is to keep only nouns and adjetives.
146	4					30	$Self->{hashOfPOSTagsToKeep} = $Self->_getHashOfPOSTagsToKeep (%Parameters, instantiation => 1);
147	4					30	return $Self;
148							}
149
150							=head1 METHODS
151
152							=head2 C
153
154							getStemmedAndTaggedText (@Text, $Text, \@Text)
155
156							The method C returns a hierarchy of array references containing the stemmed words,
157							the original words, their part-of-speech tag, and their word position index within the original text.
158							The hierarchy is of the form
159
160							[
161							[ # sentence level: first sentence.
162							[ # word level: first word.
163							stemmed word, original word, part-of-speech tag, word index, word position, word length
164							]
165							[ # word level: second word.
166							stemmed word, original word, part-of-speech tag, word index, word position, word length
167							]
168							...
169							]
170							[ # sentence level: second sentence.
171							[ # word level: first word.
172							stemmed word, original word, part-of-speech tag, word index, word position, word length
173							]
174							[ # word level: second word.
175							stemmed word, original word, part-of-speech tag, word index, word position, word length
176							]
177							...
178							]
179							]
180
181							Its only parameters are any combination of strings of text as scalars, references to
182							scalars, arrays of strings of text, or references to arrays of strings of text, etc...
183							The following examples below show the various ways to call the method; note that the constants
184							Text::StemTagPOS::WORD_STEMMED,
185							Text::StemTagPOS::WORD_ORIGINAL,
186							Text::StemTagPOS::WORD_POSTAG,
187							Text::StemTagPOS::WORD_INDEX,
188							Text::StemTagPOS::WORD_CHAR_POSITION,
189							Text::StemTagPOS::WORD_CHAR_LENGTH,
190							Text::StemTagPOS::WORD_SENTENCE_ID, and
191							Text::StemTagPOS::WORD_USER_DEFINED,
192							are used to access the information about each word.
193
194							use Text::StemTagPOS;
195							use Data::Dump qw(dump);
196							my $stemTagger = Text::StemTagPOS->new;
197							my $text = 'The first sentence. Sentence number two.';
198							my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
199							dump $listOfStemmedTaggedSentences;
200
201							# dumps:
202							# [
203							# [
204							# ["the", "The", "/DET", 0, 0, 3, 0],
205							# [" ", " ", "/PGP", 1, 3, 1, 0],
206							# ["first", "first", "/JJ", 2, 4, 5, 0],
207							# [" ", " ", "/PGP", 3, 9, 1, 0],
208							# ["sentenc", "sentence", "/NN", 4, 10, 8, 0],
209							# [".", ".", "/PP", 5, 18, 1, 0],
210							# [" ", " ", "/PGP", 6, 19, 1, 0],
211							# ],
212							# [
213							# ["sentenc", "Sentence", "/NN", 7, 20, 8, 1],
214							# [" ", " ", "/PGP", 8, 28, 1, 1],
215							# ["number", "number", "/NN", 9, 29, 6, 1],
216							# [" ", " ", "/PGP", 10, 35, 1, 1],
217							# ["two", "two", "/CD", 11, 36, 3, 1],
218							# [".", ".", "/PP", 12, 39, 1, 1],
219							# ],
220							# ]
221
222							my $word = $listOfStemmedTaggedSentences->[0][0];
223							print
224							'WORD_STEMMED: ' .
225							"'" . $word->[Text::StemTagPOS::WORD_STEMMED] . "'\n" .
226							'WORD_ORIGINAL: ' .
227							"'" . $word->[Text::StemTagPOS::WORD_ORIGINAL] . "'\n" .
228							'WORD_POSTAG: ' .
229							"'" . $word->[Text::StemTagPOS::WORD_POSTAG] . "'\n" .
230							'WORD_INDEX: ' .
231							$word->[Text::StemTagPOS::WORD_INDEX] . "\n" .
232							'WORD_CHAR_POSITION: ' .
233							$word->[Text::StemTagPOS::WORD_CHAR_POSITION] . "\n" .
234							'WORD_CHAR_LENGTH: ' .
235							$word->[Text::StemTagPOS::WORD_CHAR_LENGTH] . "\n";
236
237							# prints:
238							# WORD_STEMMED: 'the'
239							# WORD_ORIGINAL: 'The'
240							# WORD_POSTAG: '/DET'
241							# WORD_INDEX: 0
242							# WORD_CHAR_POSITION: 0
243							# WORD_CHAR_LENGTH: 3
244
245							The following example shows the various ways the text can be passed to the method:
246
247							use Text::StemTagPOS;
248							use Data::Dump qw(dump);
249							my $stemTagger = Text::StemTagPOS->new;
250							my $text = 'This is a sentence with seven words.';
251							dump $stemTagger->getStemmedAndTaggedText ($text,
252							[$text, \$text], ($text, \$text));
253
254							=cut
255
256							sub getStemmedAndTaggedText
257							{
258	4322			4322	1	11935	my ($Self, @Text) = @_;
259
260							# convert the data to process to a list of strings.
261	4322					17282	my $ListOfStrings = $Self->_flattenList (@Text);
262
263	4322					12952	my @sentences;
264	4322					19241	my $initialIndex = 0;
265	4322					13781	foreach my $text (@$ListOfStrings)
266							{
267							# skip undefined text.
268	4322	50				13633	next unless defined $text;
269
270							# replace all backslashes with a space, since the tagger adds these.
271	4322					8346	my $cleanedText = $text;
272	4322					21505	$cleanedText =~ tr/\/\-/ /;
273	4322	50				30279	if ($cleanedText !~ m/^\s*$/)
274							{
275							# tag the text.
276	4322					68847	my $taggedText = $Self->{tagger}->get_readable ($cleanedText);
277
278							# convert to a list of sentence word,tag pairs.
279	4322					7836324	my $listOfSentences = $Self->_convertTextToListOfSentenceWordTags ($taggedText);
280
281							# add the position in the original text and the length of the words.
282	4322					26863	$listOfSentences = $Self->_addPositionLengthWordInfo ($initialIndex, $text, $listOfSentences);
283
284							# save the list of sentences.
285	4322					18877	push @sentences, @$listOfSentences;
286							}
287
288							# update the initial index for the next string.
289	4322					12898	$initialIndex += length $text;
290							}
291
292	4322					9325	my $wordIndex = 0;
293	4322					8430	foreach my $sentenceList (@sentences)
294							{
295							# pull off the words in the sentence and put them in a list.
296							# words must be encoded to utf8 for stemmer.
297	4345					10326	my @wordList = map {encode("utf8", $_->[WORD_ORIGINAL])} @$sentenceList;
	28300					592412
298
299							# stem the list of words.
300	4345					194460	$Self->{stemmer}->stem_in_place (\@wordList);
301
302							# insert the stemmed words to the list as:
303							# [stemmed word, original word, part of speech tag, word index, word position, word length]
304	4345					29741	for (my $i = 0; $i < @wordList; $i++)
305							{
306	28300					62672	my $wordTags = $sentenceList->[$i];
307							# insert the stemmed word and convert it from utf8 to Perl's internal format.
308	28300					75286	$wordTags->[WORD_STEMMED] = lc decode ('utf8', $wordList[$i]);
309	28300					925358	$wordTags->[WORD_INDEX] = $wordIndex++;
310							}
311							}
312
313	4322					21171	return \@sentences;
314							}
315
316							# add the position of the words in the original text to the info about each
317							# of the words. really should rewrite Lingua::EN::Tagger to provide this
318							# information; sloppy to recompute it and method used is fragile.
319							sub _addPositionLengthWordInfo
320							{
321	4322			4322		11244	my ($Self, $PositionOffset, $Text, $ListOfSentences) = @_;
322
323	4322					9102	my $startingIndex = 0;
324	4322					7572	my @listOfWords;
325	4322					14707	foreach my $sentence (@$ListOfSentences)
326							{
327	4345					14781	foreach my $word (@$sentence)
328							{
329							# store all the words for subsequent analysis for missing words.
330	16356					29948	push @listOfWords, $word;
331
332							# get and store the length of the word.
333	16356					32459	my $length = length ($word->[WORD_ORIGINAL]);
334	16356					26712	$word->[WORD_CHAR_LENGTH] = $length;
335
336							# default position is -1, like index function.
337	16356					24380	$word->[WORD_CHAR_POSITION] = -1;
338
339							# find the starting position of the word.
340	16356					37780	my $position = index ($Text, $word->[WORD_ORIGINAL], $startingIndex);
341
342	16356	100				44072	if ($position >= $startingIndex)
343							{
344							# add position and update starting position for next search.
345	16342					32406	$word->[WORD_CHAR_POSITION] = $PositionOffset + $position;
346	16342					44747	$startingIndex = $position + $length;
347							}
348							}
349							}
350
351							# estimate the positions of the missing words.
352	4322					9092	my $totalWords = @listOfWords;
353	4322					12308	my $i = -1;
354	4322					14800	while (++$i < $totalWords)
355							{
356	16342	100				75231	if ($listOfWords[$i]->[WORD_CHAR_POSITION] == -1)
357							{
358							# get the starting and ending index of the sequence of missing words.
359	12					14	my $noPositionStartIndex = $i;
360	12					27	while (++$i < $totalWords)
361							{
362	14	100				97	last if ($listOfWords[$i]->[WORD_CHAR_POSITION] != -1);
363							}
364	12					17	my $noPositionEndIndex = $i;
365	12					15	my $totalMissingWords = $noPositionEndIndex - $noPositionStartIndex;
366	12	50				27	$totalMissingWords = 0 if ($totalMissingWords < 0);
367
368							# get the substring corresponding to the missing words.
369
370							# compute the starting index of the substring.
371	12					14	my $startIndexPosition = $PositionOffset;
372	12	50				27	if ($noPositionStartIndex > 0)
373							{
374	12					25	$startIndexPosition = $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_POSITION] + $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_LENGTH];
375							}
376
377							# get the ending string index.
378	12					13	my $endIndexPosition = length $Text;
379	12	50				27	if ($noPositionEndIndex < $totalWords)
380							{
381	12					17	$endIndexPosition = $listOfWords[$noPositionEndIndex]->[WORD_CHAR_POSITION];
382							}
383
384							# get the substring.
385	12					28	my $substring = substr ($Text, $startIndexPosition - $PositionOffset, $endIndexPosition - $startIndexPosition);
386
387							# extract the non-white space items.
388	12					14	my @listOfTokens;
389	12					65	while ($substring =~ m/([^\s]+)/g)
390							{
391	14					106	push @listOfTokens, [$-[0], $+[0] - $-[1], $1];
392							}
393
394	12					19	my $substringsToMatch = @listOfTokens;
395	12	50				25	$substringsToMatch = $totalMissingWords if ($totalMissingWords < $substringsToMatch);
396	12					31	for (my $i = 0; $ i < $substringsToMatch; $i++)
397							{
398	14					26	$listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_POSITION] = $startIndexPosition + $listOfTokens[$i]->[0];
399	14					70	$listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_LENGTH] = $listOfTokens[$i]->[1];
400							}
401							}
402							}
403
404							# add the gaps.
405							# still need some work here to get the words into the list of sentences that they belong in.
406	4322					30277	my $listOfWordsAndGaps = _addGapsToListOfWords (listOfWords => \@listOfWords, stringLength => length ($Text), positionOffset => $PositionOffset);
407
408							# add the sentence id to each gap.
409	4322					8821	my $currentSentenceId = 0;
410	4322					7982	my @newListOfSentences;
411	4322					10075	foreach my $word (@$listOfWordsAndGaps)
412							{
413	28300	100				79489	$newListOfSentences[$currentSentenceId] = [] unless defined $newListOfSentences[$currentSentenceId];
414
415	28300	100				69255	if ($word->[WORD_POSTAG] eq '/PGP')
416							{
417	11944					39849	$word->[WORD_SENTENCE_ID] = $currentSentenceId;
418	11944					30678	$word->[WORD_ORIGINAL] = substr ($Text, $word->[WORD_CHAR_POSITION] - $PositionOffset, $word->[WORD_CHAR_LENGTH]);
419							}
420							else
421							{
422	16356					29965	$currentSentenceId = $word->[WORD_SENTENCE_ID];
423							}
424
425	28300					34868	push @{$newListOfSentences[$currentSentenceId]}, $word;
	28300					81398
426							}
427
428	4322					28336	return \@newListOfSentences;
429							}
430
431
432							sub _addGapsToListOfWords
433							{
434							# get the parameters.
435	4322			4322		24572	my %Parameters = @_;
436
437							# get the length of the original string.
438	4322					10344	my $stringLength = $Parameters{stringLength};
439
440							# get the list of words.
441	4322					7379	my $listOfWords = $Parameters{listOfWords};
442
443							# get the position offset if it exists and is defined.
444	4322					9853	my $positionOffset = 0;
445	4322	50	33			34937	$positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset}));
446
447							# build the list of complete tokens.
448	4322					18258	my @listOfWordsAndGaps = @$listOfWords;
449
450							# get the list of missing substring positions.
451	4322					14904	my $listOfMissingSubstrings = _getListOfMissingSubstringPositions(listOfSubstringPositions => $listOfWords, stringLength => $stringLength, positionOffset => $positionOffset);
452
453							# add the gaps to the list.
454	4322					22439	foreach my $gapInfo (@$listOfMissingSubstrings)
455							{
456	11944					19574	$gapInfo->[WORD_STEMMED] = ' ';
457	11944					16293	$gapInfo->[WORD_ORIGINAL] = undef;
458	11944					24643	$gapInfo->[WORD_POSTAG] = '/PGP';
459	11944					24274	$gapInfo->[WORD_INDEX] = undef;
460							}
461
462							# sort the substrings by position.
463	4322					22350	@listOfWordsAndGaps = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfWordsAndGaps, @$listOfMissingSubstrings);
	60156					97182
464
465							# if test is true, make sure things were computed correctly.
466	4322	50	33			31613	if (exists($Parameters{test}) && $Parameters{test})
467							{
468	0					0	my $totalSubstrings = @listOfWordsAndGaps;
469
470	0					0	for (my $i = 1 ; $i < $totalSubstrings ; $i++)
471							{
472
473							# make sure the strings are sorted.
474	0	0				0	if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_POSITION] > $listOfWordsAndGaps[$i]->[WORD_CHAR_POSITION])
475							{
476	0					0	croak 'error: substrings in $listOfSubstringInfo are not sorted and should be.';
477							}
478
479							# make sure the strings have at least one character.
480	0	0				0	if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_LENGTH] < 1)
481							{
482	0					0	croak 'error: substrings in $listOfSubstringInfo has length less than one.';
483							}
484							}
485							}
486
487							# returns the complete list of tokens sorted by their starting index in ascending order.
488	4322					19660	return \@listOfWordsAndGaps;
489							}
490
491
492
493							# routine returns an array reference of the gaps or missing substrings given
494							# a list of substrings. for example, if listOfSubstringPositions is
495							# [[2,4], [9,2], [11,5], [20,1]] and stringLength is 25, then the list returned
496							# is [[0, 2], [6, 3], [16, 4], [21, 4]].
497							sub _getListOfMissingSubstringPositions # (listOfSubstringPositions => \@, stringLength => n, positionOffset => n)
498							{
499							# get the parameters.
500	4322			4322		20758	my %Parameters = @_;
501
502							# get the position offset if it exists and is defined.
503	4322					8091	my $positionOffset = 0;
504	4322	50	33			41703	$positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset}));
505
506							# if listOfSubstringPositions is not defined, we have one special case.
507	4322	50				19717	if (!defined($Parameters{listOfSubstringPositions}))
508							{
509	0	0				0	if (!defined($Parameters{stringLength}))
510							{
511
512							# no parameters defined, so return the empty list.
513	0					0	return [];
514							}
515							else
516							{
517	0	0				0	if (int($Parameters{stringLength}) > 0)
518							{
519
520							# positive string length, but no substrings, so gap is entire string.
521	0					0	return [ 0, int($Parameters{stringLength}) - 1 ];
522							}
523							else
524							{
525
526							# non-positive string length, so return empty list.
527	0					0	return [];
528							}
529							}
530							}
531
532							# get the list of [WORD_CHAR_POSITION, WORD_CHAR_LENGTH].
533	4322					9541	my $listOfSubstringPositions = $Parameters{listOfSubstringPositions};
534
535							# get the number of subtrings.
536	4322					9364	my $totalSubstrings = $#$listOfSubstringPositions + 1;
537
538							# skip substrings having length less than one or a negative position.
539	4322					7827	my @filteredListOfSubstringPositions;
540	4322					26441	for (my $i = 0 ; $i < $totalSubstrings ; $i++)
541							{
542	16356	50				45419	next if ($listOfSubstringPositions->[$i][WORD_CHAR_LENGTH] < 1);
543	16356	50				38942	next if ($listOfSubstringPositions->[$i][WORD_CHAR_POSITION] < 0);
544	16356					65483	push @filteredListOfSubstringPositions, $listOfSubstringPositions->[$i];
545							}
546	4322					19170	$listOfSubstringPositions = \@filteredListOfSubstringPositions;
547	4322					9429	$totalSubstrings = $#$listOfSubstringPositions + 1;
548
549							# get the entire strings length if defined.
550	4322					6370	my $stringLength;
551	4322	50				17977	$stringLength = int abs $Parameters{stringLength} if exists $Parameters{stringLength};
552
553							# if $stringLength is undefined use the last substring to compute the length
554							# of the entire string; the string will not end with a gap in this case.
555	4322	50	33			16624	if (!defined($stringLength) && $totalSubstrings)
556							{
557	0					0	$stringLength = 0;
558	0					0	foreach my $currentSubstringInfo (@$listOfSubstringPositions)
559							{
560	0					0	my $last = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH];
561	0	0				0	$stringLength = $last if $last > $stringLength;
562							}
563	0					0	$stringLength -= $positionOffset;
564							}
565
566							# if $stringLength is not defined at this point there are no gaps.
567	4322	50	33			29021	return [] unless ((defined $stringLength) && ($stringLength > 0));
568
569							# if $totalSubstrings is zero, then the entire string is a gap.
570	4322	50				11770	unless ($totalSubstrings)
571							{
572	0					0	my @substringGapInfo;
573	0					0	$substringGapInfo[WORD_CHAR_POSITION] = 0;
574	0					0	$substringGapInfo[WORD_CHAR_LENGTH] = $stringLength;
575	0					0	return [ \@substringGapInfo ];
576							}
577
578							# sort the pairs by their position.
579	4322					20966	my @listOfSubstringPositions = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } @$listOfSubstringPositions;
	20269					46141
580
581							# @listOfMissingSubstringPositions holds all the gaps.
582	4322					7288	my @listOfMissingSubstringPositions;
583
584							# get the first substring position.
585	4322					8557	my $currentSubstringInfo = $listOfSubstringPositions[0];
586
587							# if the first substring does not start with position 0, add the beginning gap.
588	4322	50				14378	if ($currentSubstringInfo->[WORD_CHAR_POSITION] > $positionOffset)
589							{
590	0					0	my @substringGapInfo;
591	0					0	$substringGapInfo[WORD_CHAR_POSITION] = $positionOffset;
592	0					0	$substringGapInfo[WORD_CHAR_LENGTH] = $currentSubstringInfo->[WORD_CHAR_POSITION] - $positionOffset;
593	0					0	push @listOfMissingSubstringPositions, \@substringGapInfo;
594							}
595
596							# compute the gaps.
597	4322					16662	for (my $i = 1 ; $i < $totalSubstrings ; $i++)
598							{
599
600							# get the information about the previous and current substrings.
601	12034					41451	my $previousSubstringInfo = $listOfSubstringPositions[ $i - 1 ];
602	12034					17506	my $currentSubstringInfo = $listOfSubstringPositions[$i];
603
604							# compute the starting index and length of the gap.
605	12034					22728	my $gapStartPosition = $previousSubstringInfo->[WORD_CHAR_POSITION] + $previousSubstringInfo->[WORD_CHAR_LENGTH];
606	12034					19475	my $gapEndPosition = $currentSubstringInfo->[WORD_CHAR_POSITION] - 1;
607	12034					19744	my $gapLength = $gapEndPosition - $gapStartPosition + 1;
608
609							# if the gap is not a positive size, skip it.
610							# maybe a warning should be logged since it really should not happen.
611	12034	100				49055	if ($gapLength > 0)
612							{
613
614							# store the information about the gap.
615	11944					15583	my @substringGapInfo;
616	11944					25344	$substringGapInfo[WORD_CHAR_POSITION] = $gapStartPosition;
617	11944					21944	$substringGapInfo[WORD_CHAR_LENGTH] = $gapLength;
618	11944					65016	push @listOfMissingSubstringPositions, \@substringGapInfo;
619							}
620							}
621
622							# add any trailing gap to the list.
623	4322					7861	$currentSubstringInfo = $listOfSubstringPositions[-1];
624	4322	50				28224	if ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH] < $stringLength)
625							{
626	0					0	my @substringGapInfo;
627	0					0	$substringGapInfo[WORD_CHAR_POSITION] = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH];
628	0					0	$substringGapInfo[WORD_CHAR_LENGTH] = $stringLength - ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH]);
629	0					0	push @listOfMissingSubstringPositions, \@substringGapInfo;
630							}
631
632							# if test is true, check if gaps were computed correctly.
633	4322	50	33			19722	if (exists($Parameters{test}) && $Parameters{test})
634							{
635	0					0	my @allSubstrings =
636	0					0	sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfSubstringPositions, @listOfMissingSubstringPositions);
637	0					0	my $totalSubstrings = @allSubstrings;
638
639	0					0	for (my $i = 1 ; $i < $totalSubstrings ; $i++)
640							{
641
642							# make sure the strings are sorted.
643	0	0				0	if ($allSubstrings[ $i - 1 ]->[WORD_CHAR_POSITION] + $allSubstrings[ $i - 1 ]->[WORD_CHAR_LENGTH] < $allSubstrings[$i]->[WORD_CHAR_POSITION])
644							{
645	0					0	my $logger = Log::Log4perl->get_logger();
646	0					0	$logger->logdie("error: missed computing a gap.");
647							}
648							}
649							}
650
651							# returns the list of missing substrings.
652	4322					27902	return \@listOfMissingSubstringPositions;
653							}
654
655
656							=head2 C
657
658							getTaggedTextToKeep (listOfStemmedTaggedSentences => [...],
659							listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]);
660
661							The method C returns all the array references of the words
662							that have a part-of-speech tag that is of a type specified by
663							C or C. The word lists
664							returned have the same hierarchical sentence structure used by C.
665							Note C and C
666							are optional parameters, if neither is defined, then the values used when the
667							object was instantiated are used. If one of them is defined, its values override the default
668							values.
669
670							=over
671
672							=item C
673
674							listOfStemmedTaggedSentences => [...]
675
676							C is the array reference returned by
677							C or
678							a previous call to C.
679
680							=item C and/or C
681
682							listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
683
684							C and C define the list of
685							parts-of-speech types to be retained when filtering previously tagged text.
686							Permitted values for C are
687							are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
688							'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C
689							see the method C.
690							Note C and C
691							are optional parameters, if neither is defined, then the values used when the
692							object was instantiated are used. If one of them is defined, its values override the default
693							values.
694
695							=back
696
697							use Text::StemTagPOS;
698							use Data::Dump qw(dump);
699							my $stemTagger = Text::StemTagPOS->new;
700							my $text = 'This is the first sentence. This is the last sentence.';
701							my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
702							dump $stemTagger->getTaggedTextToKeep (
703							listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences);
704
705							# dumps:
706							# [
707							# [
708							# ["first", "first", "/JJ", 6, 12, 5, 0],
709							# ["sentenc", "sentence", "/NN", 8, 18, 8, 0],
710							# ],
711							# [
712							# ["last", "last", "/JJ", 17, 40, 4, 1],
713							# ["sentenc", "sentence", "/NN", 19, 45, 8, 1],
714							# ],
715							# ]
716
717							=cut
718
719							sub getTaggedTextToKeep
720							{
721	0			0	1	0	my ($Self, %Parameters) = @_;
722
723							# get the list of sentences.
724	0					0	my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
725
726							# get the hash of tags to keep.
727	0					0	my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
728
729							# copy off the tokens to keep in each sentence.
730	0					0	my @listOfFilteredSentences;
731	0					0	foreach my $sentence (@$listOfStemmedTaggedSentences)
732							{
733	0					0	my @newSentence;
734	0					0	foreach my $token (@$sentence)
735							{
736	0	0				0	if (exists ($hashOfPOSToKeep->{$token->[WORD_POSTAG]}))
737							{
738	0					0	push @newSentence, $token;
739							}
740							}
741							#push @listOfFilteredSentences, \@newSentence if (@newSentence > 0);
742	0					0	push @listOfFilteredSentences, \@newSentence;
743							}
744
745							# return the list of filtered sentences.
746	0					0	return \@listOfFilteredSentences;
747							}
748
749							=head2 C
750
751							getWordsPhrasesInTaggedText (listOfStemmedTaggedSentences => ...,
752							listOfPhrasesToFind => [...], listOfPOSTypesToKeep => [...],
753							listOfPOSTagsToKeep => [...]);
754
755							The method C returns a reference to an array where
756							each entry in the array corresponds
757							to the word or phrase in C. The value of each entry is a list
758							of word indices
759							where the words or phrases were found. Each list contains integer
760							pairs of the form [first-word-index, last-word-index] where first-word-index is the index to the first
761							word of the phrase and last-word-index the index of the last word. The values of the index are those
762							assigned to the stemmed and tagged word in C.
763
764							[
765							[ # first phrase locations
766							[first word index, last word index],
767							[first word index, last word index], ...]
768							]
769							[ # second phrase locations
770							[first word index, last word index],
771							[first word index, last word index], ...]
772							]
773							...
774							]
775
776							=over
777
778							=item C
779
780							listOfStemmedTaggedSentences => [...]
781
782							C is the array reference returned by C or
783							C.
784
785							=item C
786
787							listOfPhrasesToFind => [...]
788
789							C is an array reference containing a list of strings of
790							text that are either single words or phrases that are to be located in the text
791							provided by C. Before the words or phrases are located they are filtered
792							using C or C.
793
794
795							=item C and/or C
796
797							listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
798
799							C and C defines the list of
800							parts-of-speech types to be retained when filtering previously tagged text.
801							Permitted values for C are
802							are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
803							'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C
804							see the method C.
805							Note C and C
806							are optional parameters, if neither is defined, then the values used when the
807							object was instantiated are used. If one of them is defined, its values override the default
808							values.
809
810							=back
811
812							The code below illustrates the output format:
813
814							use Text::StemTagPOS;
815							use Data::Dump qw(dump);
816							my $stemTagger = Text::StemTagPOS->new;
817							my $text = 'This is the first sentence. This is the last sentence.';
818							my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
819							dump $listOfStemmedTaggedSentences;
820							my $listOfWordsOrPhrasesToFind = ['first sentence','this is',
821							'third sentence', 'sentence'];
822							my $phraseLocations = $stemTagger->getWordsPhrasesInTaggedText (
823							listOfPOSTypesToKeep => [qw(ALL)],
824							listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences,
825							listOfWordsOrPhrasesToFind => $listOfWordsOrPhrasesToFind);
826							dump $phraseLocations;
827							# [
828							# [[6, 8]], # 'first sentence'
829							# [[0, 2], [11, 13]], # 'this is': note period in text has index 5.
830							# [], # 'third sentence'
831							# [[8, 8], [19, 19]] # 'sentence'
832							# ]
833
834							=cut
835
836							sub getWordsPhrasesInTaggedText
837							{
838	300			300	1	5355783	my ($Self, %Parameters) = @_;
839
840							# stem and tag all of the word and phrase text.
841							# put all the listOfStemmedTaggedSentences into an array,
842							# create the suffix array of the listOfStemmedTaggedSentences array.
843
844							# get the hash of tags to keep.
845	300					2939	my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
846
847							# get the tagged text.
848	300					1191	my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
849
850							# create a list of the filtered stemmed words of the text.
851	300					844	my @wordsOfText;
852	300					841	foreach my $sentence (@$listOfStemmedTaggedSentences)
853							{
854	29474					75427	foreach my $word (@$sentence)
855							{
856							# skip over the words types to ignore.
857	442744	50				1350551	next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
858	442744					885142	push @wordsOfText, $word;
859							}
860							}
861
862							# create the suffix array of the tagged words, sorting by the stemmed word.
863	300					1248	my $totalWords = @wordsOfText;
864
865							# hash all the indices of the stemmed words in the suffix array of the text.
866	300					979	my %wordIndices;
867	300					2964	for(my $index = 0; $index < $totalWords; $index++)
868							{
869	442744					840436	my $word = $wordsOfText[$index];
870	442744	50				2394697	$wordIndices{$word->[WORD_STEMMED]} = [] unless exists $wordIndices{$word->[WORD_STEMMED]};
871	442744					615658	push @{$wordIndices{$word->[WORD_STEMMED]}}, $index;
	442744					1830082
872							}
873
874							# stem and part-of-speech tag the word or phrases to find.
875	300					822	my @listOfStemmedWordsInPhrasesToFind;
876	300					1176	my $listOfWordsOrPhrasesToFind = $Parameters{listOfWordsOrPhrasesToFind};
877	300					1453	foreach my $wordOrPhrase (@$listOfWordsOrPhrasesToFind)
878							{
879	4321					24086	my $stemmed = $Self->getStemmedAndTaggedText ($wordOrPhrase);
880
881							# flatten the words of a phrase into on list.
882	4321					9594	my @allWords;
883	4321					11384	foreach my $sentence (@$stemmed)
884							{
885	4321					14860	foreach my $word (@$sentence)
886							{
887							# remove the words to ignore.
888	27041	100				84286	next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
889	15681					45001	push @allWords, $word;
890							}
891							}
892
893	4321					32282	push @listOfStemmedWordsInPhrasesToFind, \@allWords ;
894							}
895
896							# given the starting index into the suffix array and a word list, return
897							# the number of words matching the word list at that index.
898							my $getMatchingWords = sub
899							{
900	4321			4321		6689	my ($StartingIndex, $StemmedWordList) = @_;
901	4321					7881	my $stemmedIndex = 0;
902	4321					5796	my $textIndex = $StartingIndex;
903	4321		100			21753	while (($textIndex < $totalWords) && ($stemmedIndex < @$StemmedWordList))
904							{
905	15681	50				75726	last if ($wordsOfText[$textIndex]->[WORD_STEMMED] cmp $StemmedWordList->[$stemmedIndex][WORD_STEMMED]);
906	15681					24881	$textIndex++;
907	15681					68213	$stemmedIndex++;
908							}
909	4321					19467	return $textIndex - $StartingIndex;
910	300					4330	};
911
912							# find all the phrases in the filtered text.
913	300					711	my @phrasesFound;
914	300					841	my $phraseIndex = 0;
915	300					859	foreach my $phraseStemmedWordList (@listOfStemmedWordsInPhrasesToFind)
916							{
917	4321					5740	my @fullPhrasesFound;
918	4321					14044	my @stemmedPhraseWords = @$phraseStemmedWordList;
919	4321					6607	my $wordsInPhrase = $#stemmedPhraseWords + 1;
920
921							# get the first word in the phrase.
922	4321					7564	my $firstWord = $stemmedPhraseWords[0];
923	4321	50				19786	if (exists ($wordIndices{$firstWord->[WORD_STEMMED]}))
924							{
925							# get all the indices in the suffix array that the first word occurs at.
926	4321					29879	my $listOfStartingIndices = $wordIndices{$firstWord->[WORD_STEMMED]};
927
928							# compute the number of matching phrase words at each position.
929	4321					10688	foreach my $startingIndex (@$listOfStartingIndices)
930							{
931	4321					9559	my $matchingLength = &$getMatchingWords ($startingIndex, \@stemmedPhraseWords);
932	4321					11569	my $firstWordIndex = $wordsOfText[$startingIndex]->[WORD_INDEX];
933	4321					10116	my $lastWordIndex = $wordsOfText[$startingIndex + $matchingLength - 1]->[WORD_INDEX];
934
935	4321	50				17419	if ($matchingLength == $wordsInPhrase)
936							{
937							# full match of the phrase.
938	4321					21275	push @fullPhrasesFound, [$firstWordIndex, $lastWordIndex];
939							}
940							}
941							}
942	4321					16702	$phrasesFound[$phraseIndex++] = \@fullPhrasesFound;
943							}
944	300					908785	return \@phrasesFound;
945							}
946
947							=head2 C
948
949							The method C takes no parameters. It returns an array
950							reference where each item in the list is of the form C<[part of speech tag, description, examples]>.
951							It is meant for getting the part-of-speech tags that can be used to populate C.
952
953							use Text::StemTagPOS;
954							use Data::Dump qw(dump);
955							my $stemTagger = Text::StemTagPOS->new;
956							dump $stemTagger->getListOfPartOfSpeechTags;
957
958							=cut
959
960							# this method takes no parameters. it reads in the information about the
961							# part of speech tags stored in the DATA section of this file. the information
962							# is stored in the object at posTags as an array reference containing the
963							# array [POS tag, description, examples] for each possible tag.
964							sub getListOfPartOfSpeechTags
965							{
966	0			0	1	0	my $Self = shift;
967
968							# return a copy of the posTags array.
969	0					0	my @partOfSpeechTags = map {[@$_]} @{$Self->{posTags}};
	0					0
	0					0
970	0					0	return \@partOfSpeechTags;
971							}
972
973
974							# _convertTextToListOfSentenceWordTags->('string of tagged words returned from xxxx');
975							sub _convertTextToListOfSentenceWordTags
976							{
977							# get the method class object.
978	4322			4322		11660	my $Self = $_[0];
979
980							# split the string of tagged words into a list of the words and tags. note,
981							# all POS tags are a slash followed by 2, 3 or 4 uppercase letters.
982	4322					73375	my @list = split (/(\/[A-Z]{2,4})/, $_[1]);
983
984							# trim off any leading or trailing spaces from each word or tag.
985	4322					13494	foreach my $item (@list)
986							{
987	32712					116979	$item =~ s/^\s*//;
988	32712					70468	$item =~ s/\s+$//;
989							}
990
991							# restructure the list so each item is of the form [word, tag].
992	4322					9200	my $wordOrPhrase;
993							my @taggerWordList;
994	4322					11075	foreach my $item (@list)
995							{
996	32712	100				92258	if (substr ($item, 0, 1) eq '/')
997							{
998							# at this point the item in the list is a tag, so form the new [word,tag] pair
999							# and save it.
1000	16356					32512	my $wordInfo = [];
1001	16356					39199	$wordInfo->[WORD_ORIGINAL] = $wordOrPhrase;
1002	16356					35433	$wordInfo->[WORD_POSTAG] = $item;
1003	16356					40379	push @taggerWordList, $wordInfo;
1004	16356					34927	$wordOrPhrase = undef;
1005							}
1006							else
1007							{
1008							# at this point we have a new word, so append it to $wordOrPhrase.
1009	16356	50				38626	if (!defined ($wordOrPhrase))
1010							{
1011	16356					42972	$wordOrPhrase = $item;
1012							}
1013							else
1014							{
1015	0					0	$wordOrPhrase .= ' ' . $item;
1016							}
1017							}
1018							}
1019
1020							# now partition the list into sentences.
1021	4322					10570	my @sentences;
1022	4322					10521	my $currentSentence = [];
1023	4322					14299	foreach my $wordTag (@taggerWordList)
1024							{
1025	16356	100				50494	if ($wordTag->[WORD_POSTAG] eq $Self->{endingSentenceTag})
1026							{
1027							# if the sentence has no words, skip it.
1028	24	50				63	next unless ($#$currentSentence > -1);
1029
1030							# add the POS sentence ender tag to the end of the sentence list.
1031	24					43	push @$currentSentence, $wordTag;
1032
1033							# add the sentence to the list of sentences.
1034	24					37	push @sentences, $currentSentence;
1035	24					58	$currentSentence = [];
1036							}
1037							else
1038							{
1039	16332					38092	push @$currentSentence, $wordTag;
1040							}
1041							}
1042
1043							# if there is a sentence left, save it.
1044	4322	100				30933	if ($#$currentSentence > -1)
1045							{
1046	4321					10486	push @sentences, $currentSentence;
1047							}
1048
1049							# add the sentence id to each word.
1050	4322					16517	for (my $i = 0; $i < @sentences; $i++)
1051							{
1052	4345					19630	foreach my $word (@{$sentences[$i]})
	4345					35625
1053							{
1054	16356					46344	$word->[WORD_SENTENCE_ID] = $i;
1055							}
1056							}
1057
1058							# return the list of sentences of word,tag pairs.
1059	4322					24404	return \@sentences;
1060							}
1061
1062
1063							# converts a list of strings, string refs, arrays of strings, etc... to a
1064							# simpler list of strings; essentially flattening a list.
1065							sub _flattenList
1066							{
1067	4322			4322		10894	my ($Self, @Data) = @_;
1068
1069	4322					6442	my @ListOfStrings;
1070	4322					11793	foreach my $item (@_)
1071							{
1072	8644					18556	my $type = ref ($item);
1073	8644	100				64628	if ($type eq '')
		100
		50
		50
1074							{
1075	4321					15269	push @ListOfStrings, $item;
1076							}
1077							elsif ($type eq 'SCALAR')
1078							{
1079	1					4	push @ListOfStrings, $$item;
1080							}
1081							elsif ($type eq 'ARRAY')
1082							{
1083	0					0	push @ListOfStrings, @{$Self->_flattenList (@$item)};
	0					0
1084							}
1085							elsif ($type eq 'REF')
1086							{
1087	0					0	push @ListOfStrings, @{$Self->_flattenList ($$item)};
	0					0
1088							}
1089							}
1090	4322					14194	return \@ListOfStrings;
1091							}
1092
1093
1094							# this method takes no parameters. it reads in the information about the
1095							# part of speech tags stored in the DATA section of this file. the information
1096							# is stored in the object at posTags as an array reference containing the
1097							# array [POS tag, description, examples] for each possible tag.
1098							sub _getPartOfSpeechTags
1099							{
1100	4			4		15	my $Self = shift;
1101
1102	4					268	my $posTags =
1103							[
1104							['CC', 'Conjunction, coordinating', 'and, or'],
1105							['CD', 'Adjective, cardinal number', '3, fifteen'],
1106							['DET', 'Determiner', 'this, each, some'],
1107							['EX', 'Pronoun, existential there', 'there'],
1108							['FW', 'Foreign words', ''],
1109							['IN', 'Preposition / Conjunction', 'for, of, although, that'],
1110							['JJ', 'Adjective', 'happy, bad'],
1111							['JJR', 'Adjective, comparative', 'happier, worse'],
1112							['JJS', 'Adjective, superlative', 'happiest, worst'],
1113							['LS', 'Symbol, list item', 'A, A.'],
1114							['MD', 'Verb, modal', "can, could, 'll"],
1115							['NN', 'Noun', 'aircraft, data'],
1116							['NNP', 'Noun, proper', 'London, Michael'],
1117							['NNPS', 'Noun, proper, plural', 'Australians, Methodists'],
1118							['NNS', 'Noun, plural', 'women, books'],
1119							['PDT', 'Determiner, prequalifier', 'quite, all, half'],
1120							['POS', 'Possessive', "'s"],
1121							['PRP', 'Determiner, possessive second', 'mine, yours'],
1122							['PRPS', 'Determiner, possessive', 'their, your'],
1123							['RB', 'Adverb', 'often, not, very, here'],
1124							['RBR', 'Adverb, comparative', 'faster'],
1125							['RBS', 'Adverb, superlative', 'fastest'],
1126							['RP', 'Adverb, particle', 'up, off, out'],
1127							['SYM', 'Symbol', '*'],
1128							['TO', 'Preposition', 'to'],
1129							['UH', 'Interjection', 'oh, yes, mmm'],
1130							['VB', 'Verb, infinitive', 'take, live'],
1131							['VBD', 'Verb, past tense', 'took, lived'],
1132							['VBG', 'Verb, gerund', 'taking, living'],
1133							['VBN', 'Verb, past/passive participle', 'taken, lived'],
1134							['VBP', 'Verb, base present form', 'take, live'],
1135							['VBZ', 'Verb, present 3SG -s form', 'takes, lives'],
1136							['WDT', 'Determiner, question', 'which, whatever'],
1137							['WP', 'Pronoun, question', 'who, whoever'],
1138							['WPS', 'Determiner, possessive & question', 'whose'],
1139							['WRB', 'Adverb, question', 'when, how, however'],
1140							['PP', 'Punctuation, sentence ender', '., !, ?'],
1141							['PPC', 'Punctuation, comma', ','],
1142							['PGP', 'Punctuation, whitespace, gap', ' '],
1143							['PPD', 'Punctuation, dollar sign', '$'],
1144							['PPL', 'Punctuation, quotation mark left', '``'],
1145							['PPR', 'Punctuation, quotation mark right', "''"],
1146							['PPS', 'Punctuation, colon, semicolon, elipsis', ':, ..., -'],
1147							['LRB', 'Punctuation, left bracket', '(, {, ['],
1148							['RRB', 'Punctuation, right bracket', '), }, ]']
1149							];
1150
1151							# store the pos array.
1152	4					23	$Self->{posTags} = $posTags;
1153	4					16	return;
1154							}
1155
1156
1157							sub _getListOfPOSTagsFromPOSTypesList
1158							{
1159	1			1		3	my ($Self, %Parameters) = @_;
1160
1161	1					3	my $listOfPOSTypesToKeep = [];
1162	1	50				4	$listOfPOSTypesToKeep = $Parameters{listOfPOSTypesToKeep} if exists $Parameters{listOfPOSTypesToKeep};
1163
1164							# ALL, NOUNS, VERBS, PUNCTUATION, ADJECTIVES, ADVERBS, CONTENT_WORDS, TEXTRANK_WORDS, VERBS
1165	1					2	my @listOfPOSTagsToKeep;
1166	1					6	foreach my $type (@$listOfPOSTypesToKeep)
1167							{
1168	0					0	my $ucType = uc $type;
1169	0					0	$ucType =~ tr/A-Z\_//cd;
1170	0	0				0	if (($ucType cmp 'NOUNS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_NOUN; }
	0	0				0
		0
		0
		0
		0
		0
		0
		0
1171	0					0	elsif (($ucType cmp 'VERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_VERB; }
1172	0					0	elsif (($ucType cmp 'ADJECTIVES') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADJECTIVE; }
1173	0					0	elsif (($ucType cmp 'ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADVERB; }
1174	0					0	elsif (($ucType cmp 'CONTENT_ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT_ADVERB; }
1175	0					0	elsif (($ucType cmp 'PUNCTUATION') == 0) { push @listOfPOSTagsToKeep, POSTAGS_PUNCTUATION; }
1176	0					0	elsif (($ucType cmp 'CONTENT_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT; }
1177	0					0	elsif (($ucType cmp 'TEXTRANK_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_TEXTRANK; }
1178	0					0	elsif (($ucType cmp 'ALL') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ALL; }
1179							}
1180
1181	1					4	return \@listOfPOSTagsToKeep;
1182							}
1183
1184
1185							# returns a hash reference of the part of speech tags that are
1186							# provided in listOfPOSTagsToKeep.
1187							sub _getHashOfPOSTagsToKeep
1188							{
1189	304			304		1410	my ($Self, %Parameters) = @_;
1190
1191							# get the list of POS tags to keep.
1192	304					910	my $listOfPOSTagsToKeep;
1193	304	100	66			4823	if (!exists ($Parameters{listOfPOSTagsToKeep}) && !exists ($Parameters{listOfPOSTypesToKeep}))
1194							{
1195	303	100				3525	return $Self->{hashOfPOSTagsToKeep} unless exists $Parameters{instantiation};
1196	3					22	$listOfPOSTagsToKeep = [POSTAGS_TEXTRANK];
1197							}
1198							else
1199							{
1200	1					6	$listOfPOSTagsToKeep = $Self->_getListOfPOSTagsFromPOSTypesList (%Parameters);
1201	1	50				4	push @$listOfPOSTagsToKeep, @{$Parameters{listOfPOSTagsToKeep}} if (exists ($Parameters{listOfPOSTagsToKeep}));
	1					22
1202							}
1203
1204	4					11	my %hashOfPOSTagsToKeep;
1205	4					16	foreach my $pos (@$listOfPOSTagsToKeep)
1206							{
1207	68					76	my $posClean = uc $pos;
1208	68					72	$posClean =~ tr/A-Z0-9//cd;
1209
1210							# prefix with a slash.
1211	68					172	$hashOfPOSTagsToKeep{'/' . $posClean} = 1;
1212							}
1213
1214	4					31	return \%hashOfPOSTagsToKeep;
1215							}
1216
1217							=head2 C
1218
1219							The method C returns an array reference of the sorted stemmed
1220							words in the text given by C.
1221
1222							=over
1223
1224							=item C
1225
1226							listOfStemmedTaggedSentences => [...]
1227
1228							C is the array reference returned by C or
1229							C of the text.
1230
1231							=back
1232
1233							use Text::StemTagPOS;
1234							use Data::Dump qw(dump);
1235							my $stemTagger = Text::StemTagPOS->new;
1236							my $text = 'The first sentence. Sentence number two.';
1237							my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
1238							dump $listOfStemmedTaggedSentences;
1239
1240							=cut
1241
1242							sub getListOfStemmedWordsInText
1243							{
1244	0			0	1		my ($Self, %Parameters) = @_;
1245
1246							# get the hash of tags to keep.
1247	0						my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
1248
1249							# get the tagged text.
1250	0						my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
1251
1252							# create a list of the filtered stemmed words of the text.
1253	0						my %wordsOfText;
1254	0						foreach my $sentence (@$listOfStemmedTaggedSentences)
1255							{
1256	0						foreach my $word (@$sentence)
1257							{
1258							# skip over the words types to ignore.
1259	0	0					next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
1260	0						$wordsOfText{$word->[WORD_STEMMED]} = 1;
1261							}
1262							}
1263	0						my @words = sort keys %wordsOfText;
1264	0						return \@words;
1265							}
1266
1267							=head2 C
1268
1269							The method C returns an array reference of the sorted stemmed
1270							words of the intersection of all the words in the documents given by C;
1271
1272							=over
1273
1274							=item C
1275
1276							listOfStemmedTaggedDocuments => [...]
1277
1278							C is a list of document references returned by C or
1279							C.
1280
1281							=back
1282
1283							=cut
1284
1285							sub getListOfStemmedWordsInAllDocuments
1286							{
1287	0			0	1		my ($Self, %Parameters) = @_;
1288
1289							# get the list of documents to process.
1290	0						my $listOfStemmedTaggedDocuments = $Parameters{listOfStemmedTaggedDocuments};
1291	0						my $totalDocuments = @$listOfStemmedTaggedDocuments;
1292	0						my %wordOccurence;
1293
1294							# put the words of the first document into a hash.
1295	0						my $sentences = $listOfStemmedTaggedDocuments->[0];
1296	0						foreach my $sentence (@$sentences)
1297							{
1298	0						foreach my $word (@$sentence)
1299							{
1300	0						$wordOccurence{$word->[WORD_STEMMED]} = 1;
1301							}
1302							}
1303
1304							# add the words from the remaining documents, only if the word
1305							# occurred in all previous documents.
1306	0						for (my $i = 1; $i < $totalDocuments; $i++)
1307							{
1308	0						my $sentences = $listOfStemmedTaggedDocuments->[$i];
1309	0						foreach my $sentence (@$sentences)
1310							{
1311	0						foreach my $word (@$sentence)
1312							{
1313							# if the word has not occurred in a previous document, skip it.
1314	0	0					next unless exists $wordOccurence{$word->[WORD_STEMMED]};
1315
1316							# if the word has not occurred in all previous documents, delete it.
1317	0	0					if ($wordOccurence{$word->[WORD_STEMMED]} < $i)
1318							{
1319	0						delete $wordOccurence{$word->[WORD_STEMMED]};
1320	0						next;
1321							}
1322
1323							# set the words occurence.
1324	0						$wordOccurence{$word->[WORD_STEMMED]} = $i + 1;
1325							}
1326							}
1327							}
1328
1329							# return the words that occured in all the documents.
1330	0						my @wordsInAll;
1331	0						while (my ($word, $occurence) = each %wordOccurence)
1332							{
1333	0	0					push @wordsInAll, $word if ($occurence == $totalDocuments);
1334							}
1335	0						@wordsInAll = sort @wordsInAll;
1336	0						return \@wordsInAll;
1337							}
1338
1339							=head1 INSTALLATION
1340
1341							To install the module run the following commands:
1342
1343							perl Makefile.PL
1344							make
1345							make test
1346							make install
1347
1348							If you are on a windows box you should use 'nmake' rather than 'make'.
1349
1350							=head1 AUTHOR
1351
1352							Jeff Kubina
1353
1354							=head1 BUGS
1355
1356							Please email bugs reports or feature requests to C, or through
1357							the web interface at L. The author
1358							will be notified and you can be automatically notified of progress on the bug fix or feature request.
1359
1360							=head1 COPYRIGHT
1361
1362							Copyright (c) 2010 Jeff Kubina. All rights reserved.
1363							This program is free software; you can redistribute
1364							it and/or modify it under the same terms as Perl itself.
1365
1366							The full text of the license can be found in the
1367							LICENSE file included with this module.
1368
1369							=head1 KEYWORDS
1370
1371							natural language processing, NLP, part of speech tagging, POS, stemming
1372
1373							=head1 SEE ALSO
1374
1375							L, L, L, L,
1376							L, L
1377
1378							=begin html
1379
1380							See the Lingua::EN::Tagger README
1381							file for a list of the part-of-speech tags.
1382
1383							=end html
1384
1385							=cut
1386
1387							1;
1388							# The preceding line will help the module return a true value