File Coverage

blib/lib/Text/StemTagPOS.pm
Criterion Covered Total %
statement 343 444 77.2
branch 62 136 45.5
condition 12 27 44.4
subroutine 40 44 90.9
pod 7 7 100.0
total 464 658 70.5


line stmt bran cond sub pod time code
1             package Text::StemTagPOS;
2              
3             require 5.006002;
4 3     3   155992 use strict;
  3         11  
  3         149  
5 3     3   17 use warnings;
  3         7  
  3         180  
6 3     3   18 use Carp;
  3         10  
  3         553  
7 3     3   5432 use Encode;
  3         93626  
  3         354  
8 3     3   3191 use Lingua::Stem::Snowball;
  3         8033  
  3         206  
9 3     3   4021 use Lingua::EN::Tagger;
  3         529363  
  3         180  
10 3     3   5353 use Data::Dump qw(dump);
  3         59664  
  3         407  
11              
12             BEGIN {
13 3     3   37 use Exporter ();
  3         8  
  3         77  
14 3     3   17 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  3         7  
  3         368  
15 3     3   31 $VERSION = '0.61';
16 3         66 @ISA = qw(Exporter);
17 3         7 @EXPORT = qw();
18 3         7 @EXPORT_OK = qw();
19 3         503 %EXPORT_TAGS = ();
20             }
21              
22 3     3   21 use constant WORD_STEMMED => 0;
  3         6  
  3         226  
23 3     3   16 use constant WORD_ORIGINAL => 1;
  3         5  
  3         130  
24 3     3   15 use constant WORD_POSTAG => 2;
  3         14  
  3         610  
25 3     3   18 use constant WORD_INDEX => 3;
  3         5  
  3         141  
26 3     3   17 use constant WORD_CHAR_POSITION => 4;
  3         6  
  3         123  
27 3     3   16 use constant WORD_CHAR_LENGTH => 5;
  3         4  
  3         119  
28 3     3   15 use constant WORD_SENTENCE_ID => 6;
  3         5  
  3         148  
29 3     3   549 use constant WORD_USER_DEFINED => 7;
  3         7  
  3         148  
30              
31 3     3   15 use constant POSTAGS_PERIOD => 'PP';
  3         6  
  3         144  
32 3     3   17 use constant POSTAGS_PUNCTUATION => qw(PP PGP PPC PPD PPL PPR PPS LRB RRB SYM);
  3         463  
  3         220  
33 3     3   19 use constant POSTAGS_NOUN => qw(NN NNP NNPS NNS);
  3         6  
  3         160  
34 3     3   16 use constant POSTAGS_ADJECTIVE => qw(CD JJ JJR JJS);
  3         7  
  3         298  
35 3     3   16 use constant POSTAGS_VERB => qw(VB VBD VBG VBN VBP VBZ);
  3         7  
  3         177  
36 3     3   24 use constant POSTAGS_ADVERB => qw(RB RBR RBS RP WRB);
  3         6  
  3         153  
37 3     3   17 use constant POSTAGS_CONTENT_ADVERB => qw(RBR RBS RP);
  3         5  
  3         730  
38 3     3   19 use constant POSTAGS_ALL => qw(CC CD DET EX FW IN JJ JJR JJS LS MD NN NNP NNPS NNS PDT POS PRP PRPS RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WPS WRB PP PGP PPC PPD PPL PPR PPS LRB RRB);
  3         5  
  3         323  
39 3     3   18 use constant POSTAGS_CONTENT => (POSTAGS_CONTENT_ADVERB, POSTAGS_VERB, POSTAGS_ADJECTIVE, POSTAGS_NOUN);
  3         5  
  3         643  
40 3     3   18 use constant POSTAGS_TEXTRANK => (POSTAGS_NOUN, POSTAGS_ADJECTIVE);
  3         7  
  3         39782  
41              
42             =head1 NAME
43              
44             C - Computes stemmed/POS tagged lists of text.
45              
46             =head1 SYNOPSIS
47              
48             use Text::StemTagPOS;
49             use Data::Dump qw(dump);
50             my $stemTagger = Text::StemTagPOS->new;
51             my $text = 'The first sentence. Sentence number two.';
52             my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
53             dump $listOfStemmedTaggedSentences;
54              
55             =head1 DESCRIPTION
56              
57             C uses the modules L and L
58             to do part-of-speech tagging and stemming of English text. It was developed
59             to pre-process text for other modules. Encoding of all text
60             should be in Perl's internal format; see L for converting text from
61             various encodes to a Perl string.
62              
63             =head1 CONSTRUCTOR
64              
65             =head2 C
66              
67             The method C creates an instance of the C class with the following
68             parameters:
69              
70             =over
71              
72             =item C
73              
74             isoLangCode => 'en'
75              
76             C is the ISO language code of the language that will be tagged and
77             stemmed by the object. It must be 'en', which is the default; other languages
78             may be added when POS taggers for them are added to CPAN.
79              
80             =item C
81              
82             endingSentenceTag => 'PP'
83              
84             C is the part-of-speech tag from L
85             that will be used to indicate
86             the end of a sentence. The default is 'PP'. The value of C must be
87             a tag generated by the module L; see method
88             C for all the possible tags; which are based on the
89             Penn Treebank tagset.
90              
91              
92             =item C and/or C
93              
94             listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
95              
96             The method C uses C and C
97             to build the default list of the
98             parts-of-speech to be retained when filtering previously tagged text.
99             The default list is C<[qw(TEXTRANK_WORDS)]>,
100             which is all the nouns and adjectives in the text, as used in the textrank algorithm. Permitted
101             types for C are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
102             'TEXTRANK_WORDS', and 'VERBS'. C provides finer control over the
103             parts-of-speech to be retained. For a list
104             of all the possible tags see method C.
105              
106             =back
107              
108             =cut
109              
110             sub new
111             {
112             # create the class object.
113 4     4 1 648 my ($Class, %Parameters) = @_;
114 4   33     41 my $Self = bless {}, ref($Class) || $Class;
115              
116             # get the stemmer to normalize the words, default is english.
117 4         13 my $isoLangCode = 'en';
118 4 50       18 $isoLangCode = lc $Parameters{isoLangCode} if (exists ($Parameters{isoLangCode}));
119 4         53 $Self->{stemmer} = Lingua::Stem::Snowball->new (lang => $isoLangCode, encode => 'UTF-8');
120              
121             # get the POS tagger.
122 4 50       495 if ($isoLangCode eq 'en')
123             {
124 4         48 $Self->{tagger} = new Lingua::EN::Tagger;
125             }
126             else
127             {
128 0         0 croak "parameter isoLangCode must be 'en'.\n";
129             }
130              
131             # read in the part of speech tags.
132 4         1236548 $Self->_getPartOfSpeechTags ();
133              
134             # get the ending sentence part of speech tag which will be
135             # used to break text into array of sentences.
136 4 50       39 unless (exists ($Parameters{endingSentenceTag}))
137             {
138 4         17 $Parameters{endingSentenceTag} = 'PP';
139             }
140 4         18 $Self->{endingSentenceTag} = uc $Parameters{endingSentenceTag};
141 4         19 $Self->{endingSentenceTag} =~ tr/A-Z//cd;
142 4 50       39 $Self->{endingSentenceTag} = '/' . $Self->{endingSentenceTag} if (substr ($Self->{endingSentenceTag}, 0, 1) ne '/');
143              
144             # get the list of parts of speech to keep when filtering.
145             # the default is to keep only nouns and adjetives.
146 4         30 $Self->{hashOfPOSTagsToKeep} = $Self->_getHashOfPOSTagsToKeep (%Parameters, instantiation => 1);
147 4         30 return $Self;
148             }
149              
150             =head1 METHODS
151              
152             =head2 C
153              
154             getStemmedAndTaggedText (@Text, $Text, \@Text)
155              
156             The method C returns a hierarchy of array references containing the stemmed words,
157             the original words, their part-of-speech tag, and their word position index within the original text.
158             The hierarchy is of the form
159              
160             [
161             [ # sentence level: first sentence.
162             [ # word level: first word.
163             stemmed word, original word, part-of-speech tag, word index, word position, word length
164             ]
165             [ # word level: second word.
166             stemmed word, original word, part-of-speech tag, word index, word position, word length
167             ]
168             ...
169             ]
170             [ # sentence level: second sentence.
171             [ # word level: first word.
172             stemmed word, original word, part-of-speech tag, word index, word position, word length
173             ]
174             [ # word level: second word.
175             stemmed word, original word, part-of-speech tag, word index, word position, word length
176             ]
177             ...
178             ]
179             ]
180              
181             Its only parameters are any combination of strings of text as scalars, references to
182             scalars, arrays of strings of text, or references to arrays of strings of text, etc...
183             The following examples below show the various ways to call the method; note that the constants
184             Text::StemTagPOS::WORD_STEMMED,
185             Text::StemTagPOS::WORD_ORIGINAL,
186             Text::StemTagPOS::WORD_POSTAG,
187             Text::StemTagPOS::WORD_INDEX,
188             Text::StemTagPOS::WORD_CHAR_POSITION,
189             Text::StemTagPOS::WORD_CHAR_LENGTH,
190             Text::StemTagPOS::WORD_SENTENCE_ID, and
191             Text::StemTagPOS::WORD_USER_DEFINED,
192             are used to access the information about each word.
193              
194             use Text::StemTagPOS;
195             use Data::Dump qw(dump);
196             my $stemTagger = Text::StemTagPOS->new;
197             my $text = 'The first sentence. Sentence number two.';
198             my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
199             dump $listOfStemmedTaggedSentences;
200              
201             # dumps:
202             # [
203             # [
204             # ["the", "The", "/DET", 0, 0, 3, 0],
205             # [" ", " ", "/PGP", 1, 3, 1, 0],
206             # ["first", "first", "/JJ", 2, 4, 5, 0],
207             # [" ", " ", "/PGP", 3, 9, 1, 0],
208             # ["sentenc", "sentence", "/NN", 4, 10, 8, 0],
209             # [".", ".", "/PP", 5, 18, 1, 0],
210             # [" ", " ", "/PGP", 6, 19, 1, 0],
211             # ],
212             # [
213             # ["sentenc", "Sentence", "/NN", 7, 20, 8, 1],
214             # [" ", " ", "/PGP", 8, 28, 1, 1],
215             # ["number", "number", "/NN", 9, 29, 6, 1],
216             # [" ", " ", "/PGP", 10, 35, 1, 1],
217             # ["two", "two", "/CD", 11, 36, 3, 1],
218             # [".", ".", "/PP", 12, 39, 1, 1],
219             # ],
220             # ]
221              
222             my $word = $listOfStemmedTaggedSentences->[0][0];
223             print
224             'WORD_STEMMED: ' .
225             "'" . $word->[Text::StemTagPOS::WORD_STEMMED] . "'\n" .
226             'WORD_ORIGINAL: ' .
227             "'" . $word->[Text::StemTagPOS::WORD_ORIGINAL] . "'\n" .
228             'WORD_POSTAG: ' .
229             "'" . $word->[Text::StemTagPOS::WORD_POSTAG] . "'\n" .
230             'WORD_INDEX: ' .
231             $word->[Text::StemTagPOS::WORD_INDEX] . "\n" .
232             'WORD_CHAR_POSITION: ' .
233             $word->[Text::StemTagPOS::WORD_CHAR_POSITION] . "\n" .
234             'WORD_CHAR_LENGTH: ' .
235             $word->[Text::StemTagPOS::WORD_CHAR_LENGTH] . "\n";
236              
237             # prints:
238             # WORD_STEMMED: 'the'
239             # WORD_ORIGINAL: 'The'
240             # WORD_POSTAG: '/DET'
241             # WORD_INDEX: 0
242             # WORD_CHAR_POSITION: 0
243             # WORD_CHAR_LENGTH: 3
244              
245             The following example shows the various ways the text can be passed to the method:
246              
247             use Text::StemTagPOS;
248             use Data::Dump qw(dump);
249             my $stemTagger = Text::StemTagPOS->new;
250             my $text = 'This is a sentence with seven words.';
251             dump $stemTagger->getStemmedAndTaggedText ($text,
252             [$text, \$text], ($text, \$text));
253              
254             =cut
255              
256             sub getStemmedAndTaggedText
257             {
258 4322     4322 1 11935 my ($Self, @Text) = @_;
259              
260             # convert the data to process to a list of strings.
261 4322         17282 my $ListOfStrings = $Self->_flattenList (@Text);
262              
263 4322         12952 my @sentences;
264 4322         19241 my $initialIndex = 0;
265 4322         13781 foreach my $text (@$ListOfStrings)
266             {
267             # skip undefined text.
268 4322 50       13633 next unless defined $text;
269              
270             # replace all backslashes with a space, since the tagger adds these.
271 4322         8346 my $cleanedText = $text;
272 4322         21505 $cleanedText =~ tr/\/\-/ /;
273 4322 50       30279 if ($cleanedText !~ m/^\s*$/)
274             {
275             # tag the text.
276 4322         68847 my $taggedText = $Self->{tagger}->get_readable ($cleanedText);
277              
278             # convert to a list of sentence word,tag pairs.
279 4322         7836324 my $listOfSentences = $Self->_convertTextToListOfSentenceWordTags ($taggedText);
280              
281             # add the position in the original text and the length of the words.
282 4322         26863 $listOfSentences = $Self->_addPositionLengthWordInfo ($initialIndex, $text, $listOfSentences);
283              
284             # save the list of sentences.
285 4322         18877 push @sentences, @$listOfSentences;
286             }
287              
288             # update the initial index for the next string.
289 4322         12898 $initialIndex += length $text;
290             }
291              
292 4322         9325 my $wordIndex = 0;
293 4322         8430 foreach my $sentenceList (@sentences)
294             {
295             # pull off the words in the sentence and put them in a list.
296             # words must be encoded to utf8 for stemmer.
297 4345         10326 my @wordList = map {encode("utf8", $_->[WORD_ORIGINAL])} @$sentenceList;
  28300         592412  
298              
299             # stem the list of words.
300 4345         194460 $Self->{stemmer}->stem_in_place (\@wordList);
301              
302             # insert the stemmed words to the list as:
303             # [stemmed word, original word, part of speech tag, word index, word position, word length]
304 4345         29741 for (my $i = 0; $i < @wordList; $i++)
305             {
306 28300         62672 my $wordTags = $sentenceList->[$i];
307             # insert the stemmed word and convert it from utf8 to Perl's internal format.
308 28300         75286 $wordTags->[WORD_STEMMED] = lc decode ('utf8', $wordList[$i]);
309 28300         925358 $wordTags->[WORD_INDEX] = $wordIndex++;
310             }
311             }
312              
313 4322         21171 return \@sentences;
314             }
315              
316             # add the position of the words in the original text to the info about each
317             # of the words. really should rewrite Lingua::EN::Tagger to provide this
318             # information; sloppy to recompute it and method used is fragile.
319             sub _addPositionLengthWordInfo
320             {
321 4322     4322   11244 my ($Self, $PositionOffset, $Text, $ListOfSentences) = @_;
322              
323 4322         9102 my $startingIndex = 0;
324 4322         7572 my @listOfWords;
325 4322         14707 foreach my $sentence (@$ListOfSentences)
326             {
327 4345         14781 foreach my $word (@$sentence)
328             {
329             # store all the words for subsequent analysis for missing words.
330 16356         29948 push @listOfWords, $word;
331              
332             # get and store the length of the word.
333 16356         32459 my $length = length ($word->[WORD_ORIGINAL]);
334 16356         26712 $word->[WORD_CHAR_LENGTH] = $length;
335              
336             # default position is -1, like index function.
337 16356         24380 $word->[WORD_CHAR_POSITION] = -1;
338              
339             # find the starting position of the word.
340 16356         37780 my $position = index ($Text, $word->[WORD_ORIGINAL], $startingIndex);
341              
342 16356 100       44072 if ($position >= $startingIndex)
343             {
344             # add position and update starting position for next search.
345 16342         32406 $word->[WORD_CHAR_POSITION] = $PositionOffset + $position;
346 16342         44747 $startingIndex = $position + $length;
347             }
348             }
349             }
350              
351             # estimate the positions of the missing words.
352 4322         9092 my $totalWords = @listOfWords;
353 4322         12308 my $i = -1;
354 4322         14800 while (++$i < $totalWords)
355             {
356 16342 100       75231 if ($listOfWords[$i]->[WORD_CHAR_POSITION] == -1)
357             {
358             # get the starting and ending index of the sequence of missing words.
359 12         14 my $noPositionStartIndex = $i;
360 12         27 while (++$i < $totalWords)
361             {
362 14 100       97 last if ($listOfWords[$i]->[WORD_CHAR_POSITION] != -1);
363             }
364 12         17 my $noPositionEndIndex = $i;
365 12         15 my $totalMissingWords = $noPositionEndIndex - $noPositionStartIndex;
366 12 50       27 $totalMissingWords = 0 if ($totalMissingWords < 0);
367              
368             # get the substring corresponding to the missing words.
369              
370             # compute the starting index of the substring.
371 12         14 my $startIndexPosition = $PositionOffset;
372 12 50       27 if ($noPositionStartIndex > 0)
373             {
374 12         25 $startIndexPosition = $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_POSITION] + $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_LENGTH];
375             }
376              
377             # get the ending string index.
378 12         13 my $endIndexPosition = length $Text;
379 12 50       27 if ($noPositionEndIndex < $totalWords)
380             {
381 12         17 $endIndexPosition = $listOfWords[$noPositionEndIndex]->[WORD_CHAR_POSITION];
382             }
383              
384             # get the substring.
385 12         28 my $substring = substr ($Text, $startIndexPosition - $PositionOffset, $endIndexPosition - $startIndexPosition);
386              
387             # extract the non-white space items.
388 12         14 my @listOfTokens;
389 12         65 while ($substring =~ m/([^\s]+)/g)
390             {
391 14         106 push @listOfTokens, [$-[0], $+[0] - $-[1], $1];
392             }
393              
394 12         19 my $substringsToMatch = @listOfTokens;
395 12 50       25 $substringsToMatch = $totalMissingWords if ($totalMissingWords < $substringsToMatch);
396 12         31 for (my $i = 0; $ i < $substringsToMatch; $i++)
397             {
398 14         26 $listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_POSITION] = $startIndexPosition + $listOfTokens[$i]->[0];
399 14         70 $listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_LENGTH] = $listOfTokens[$i]->[1];
400             }
401             }
402             }
403            
404             # add the gaps.
405             # still need some work here to get the words into the list of sentences that they belong in.
406 4322         30277 my $listOfWordsAndGaps = _addGapsToListOfWords (listOfWords => \@listOfWords, stringLength => length ($Text), positionOffset => $PositionOffset);
407            
408             # add the sentence id to each gap.
409 4322         8821 my $currentSentenceId = 0;
410 4322         7982 my @newListOfSentences;
411 4322         10075 foreach my $word (@$listOfWordsAndGaps)
412             {
413 28300 100       79489 $newListOfSentences[$currentSentenceId] = [] unless defined $newListOfSentences[$currentSentenceId];
414              
415 28300 100       69255 if ($word->[WORD_POSTAG] eq '/PGP')
416             {
417 11944         39849 $word->[WORD_SENTENCE_ID] = $currentSentenceId;
418 11944         30678 $word->[WORD_ORIGINAL] = substr ($Text, $word->[WORD_CHAR_POSITION] - $PositionOffset, $word->[WORD_CHAR_LENGTH]);
419             }
420             else
421             {
422 16356         29965 $currentSentenceId = $word->[WORD_SENTENCE_ID];
423             }
424            
425 28300         34868 push @{$newListOfSentences[$currentSentenceId]}, $word;
  28300         81398  
426             }
427            
428 4322         28336 return \@newListOfSentences;
429             }
430              
431              
432             sub _addGapsToListOfWords
433             {
434             # get the parameters.
435 4322     4322   24572 my %Parameters = @_;
436            
437             # get the length of the original string.
438 4322         10344 my $stringLength = $Parameters{stringLength};
439              
440             # get the list of words.
441 4322         7379 my $listOfWords = $Parameters{listOfWords};
442            
443             # get the position offset if it exists and is defined.
444 4322         9853 my $positionOffset = 0;
445 4322 50 33     34937 $positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset}));
446              
447             # build the list of complete tokens.
448 4322         18258 my @listOfWordsAndGaps = @$listOfWords;
449              
450             # get the list of missing substring positions.
451 4322         14904 my $listOfMissingSubstrings = _getListOfMissingSubstringPositions(listOfSubstringPositions => $listOfWords, stringLength => $stringLength, positionOffset => $positionOffset);
452            
453             # add the gaps to the list.
454 4322         22439 foreach my $gapInfo (@$listOfMissingSubstrings)
455             {
456 11944         19574 $gapInfo->[WORD_STEMMED] = ' ';
457 11944         16293 $gapInfo->[WORD_ORIGINAL] = undef;
458 11944         24643 $gapInfo->[WORD_POSTAG] = '/PGP';
459 11944         24274 $gapInfo->[WORD_INDEX] = undef;
460             }
461              
462             # sort the substrings by position.
463 4322         22350 @listOfWordsAndGaps = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfWordsAndGaps, @$listOfMissingSubstrings);
  60156         97182  
464              
465             # if test is true, make sure things were computed correctly.
466 4322 50 33     31613 if (exists($Parameters{test}) && $Parameters{test})
467             {
468 0         0 my $totalSubstrings = @listOfWordsAndGaps;
469              
470 0         0 for (my $i = 1 ; $i < $totalSubstrings ; $i++)
471             {
472              
473             # make sure the strings are sorted.
474 0 0       0 if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_POSITION] > $listOfWordsAndGaps[$i]->[WORD_CHAR_POSITION])
475             {
476 0         0 croak 'error: substrings in $listOfSubstringInfo are not sorted and should be.';
477             }
478              
479             # make sure the strings have at least one character.
480 0 0       0 if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_LENGTH] < 1)
481             {
482 0         0 croak 'error: substrings in $listOfSubstringInfo has length less than one.';
483             }
484             }
485             }
486              
487             # returns the complete list of tokens sorted by their starting index in ascending order.
488 4322         19660 return \@listOfWordsAndGaps;
489             }
490              
491              
492              
493             # routine returns an array reference of the gaps or missing substrings given
494             # a list of substrings. for example, if listOfSubstringPositions is
495             # [[2,4], [9,2], [11,5], [20,1]] and stringLength is 25, then the list returned
496             # is [[0, 2], [6, 3], [16, 4], [21, 4]].
497             sub _getListOfMissingSubstringPositions # (listOfSubstringPositions => \@, stringLength => n, positionOffset => n)
498             {
499             # get the parameters.
500 4322     4322   20758 my %Parameters = @_;
501            
502             # get the position offset if it exists and is defined.
503 4322         8091 my $positionOffset = 0;
504 4322 50 33     41703 $positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset}));
505              
506             # if listOfSubstringPositions is not defined, we have one special case.
507 4322 50       19717 if (!defined($Parameters{listOfSubstringPositions}))
508             {
509 0 0       0 if (!defined($Parameters{stringLength}))
510             {
511              
512             # no parameters defined, so return the empty list.
513 0         0 return [];
514             }
515             else
516             {
517 0 0       0 if (int($Parameters{stringLength}) > 0)
518             {
519              
520             # positive string length, but no substrings, so gap is entire string.
521 0         0 return [ 0, int($Parameters{stringLength}) - 1 ];
522             }
523             else
524             {
525              
526             # non-positive string length, so return empty list.
527 0         0 return [];
528             }
529             }
530             }
531              
532             # get the list of [WORD_CHAR_POSITION, WORD_CHAR_LENGTH].
533 4322         9541 my $listOfSubstringPositions = $Parameters{listOfSubstringPositions};
534              
535             # get the number of subtrings.
536 4322         9364 my $totalSubstrings = $#$listOfSubstringPositions + 1;
537              
538             # skip substrings having length less than one or a negative position.
539 4322         7827 my @filteredListOfSubstringPositions;
540 4322         26441 for (my $i = 0 ; $i < $totalSubstrings ; $i++)
541             {
542 16356 50       45419 next if ($listOfSubstringPositions->[$i][WORD_CHAR_LENGTH] < 1);
543 16356 50       38942 next if ($listOfSubstringPositions->[$i][WORD_CHAR_POSITION] < 0);
544 16356         65483 push @filteredListOfSubstringPositions, $listOfSubstringPositions->[$i];
545             }
546 4322         19170 $listOfSubstringPositions = \@filteredListOfSubstringPositions;
547 4322         9429 $totalSubstrings = $#$listOfSubstringPositions + 1;
548              
549             # get the entire strings length if defined.
550 4322         6370 my $stringLength;
551 4322 50       17977 $stringLength = int abs $Parameters{stringLength} if exists $Parameters{stringLength};
552              
553             # if $stringLength is undefined use the last substring to compute the length
554             # of the entire string; the string will not end with a gap in this case.
555 4322 50 33     16624 if (!defined($stringLength) && $totalSubstrings)
556             {
557 0         0 $stringLength = 0;
558 0         0 foreach my $currentSubstringInfo (@$listOfSubstringPositions)
559             {
560 0         0 my $last = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH];
561 0 0       0 $stringLength = $last if $last > $stringLength;
562             }
563 0         0 $stringLength -= $positionOffset;
564             }
565              
566             # if $stringLength is not defined at this point there are no gaps.
567 4322 50 33     29021 return [] unless ((defined $stringLength) && ($stringLength > 0));
568              
569             # if $totalSubstrings is zero, then the entire string is a gap.
570 4322 50       11770 unless ($totalSubstrings)
571             {
572 0         0 my @substringGapInfo;
573 0         0 $substringGapInfo[WORD_CHAR_POSITION] = 0;
574 0         0 $substringGapInfo[WORD_CHAR_LENGTH] = $stringLength;
575 0         0 return [ \@substringGapInfo ];
576             }
577              
578             # sort the pairs by their position.
579 4322         20966 my @listOfSubstringPositions = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } @$listOfSubstringPositions;
  20269         46141  
580              
581             # @listOfMissingSubstringPositions holds all the gaps.
582 4322         7288 my @listOfMissingSubstringPositions;
583              
584             # get the first substring position.
585 4322         8557 my $currentSubstringInfo = $listOfSubstringPositions[0];
586              
587             # if the first substring does not start with position 0, add the beginning gap.
588 4322 50       14378 if ($currentSubstringInfo->[WORD_CHAR_POSITION] > $positionOffset)
589             {
590 0         0 my @substringGapInfo;
591 0         0 $substringGapInfo[WORD_CHAR_POSITION] = $positionOffset;
592 0         0 $substringGapInfo[WORD_CHAR_LENGTH] = $currentSubstringInfo->[WORD_CHAR_POSITION] - $positionOffset;
593 0         0 push @listOfMissingSubstringPositions, \@substringGapInfo;
594             }
595            
596             # compute the gaps.
597 4322         16662 for (my $i = 1 ; $i < $totalSubstrings ; $i++)
598             {
599              
600             # get the information about the previous and current substrings.
601 12034         41451 my $previousSubstringInfo = $listOfSubstringPositions[ $i - 1 ];
602 12034         17506 my $currentSubstringInfo = $listOfSubstringPositions[$i];
603              
604             # compute the starting index and length of the gap.
605 12034         22728 my $gapStartPosition = $previousSubstringInfo->[WORD_CHAR_POSITION] + $previousSubstringInfo->[WORD_CHAR_LENGTH];
606 12034         19475 my $gapEndPosition = $currentSubstringInfo->[WORD_CHAR_POSITION] - 1;
607 12034         19744 my $gapLength = $gapEndPosition - $gapStartPosition + 1;
608              
609             # if the gap is not a positive size, skip it.
610             # maybe a warning should be logged since it really should not happen.
611 12034 100       49055 if ($gapLength > 0)
612             {
613              
614             # store the information about the gap.
615 11944         15583 my @substringGapInfo;
616 11944         25344 $substringGapInfo[WORD_CHAR_POSITION] = $gapStartPosition;
617 11944         21944 $substringGapInfo[WORD_CHAR_LENGTH] = $gapLength;
618 11944         65016 push @listOfMissingSubstringPositions, \@substringGapInfo;
619             }
620             }
621              
622             # add any trailing gap to the list.
623 4322         7861 $currentSubstringInfo = $listOfSubstringPositions[-1];
624 4322 50       28224 if ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH] < $stringLength)
625             {
626 0         0 my @substringGapInfo;
627 0         0 $substringGapInfo[WORD_CHAR_POSITION] = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH];
628 0         0 $substringGapInfo[WORD_CHAR_LENGTH] = $stringLength - ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH]);
629 0         0 push @listOfMissingSubstringPositions, \@substringGapInfo;
630             }
631              
632             # if test is true, check if gaps were computed correctly.
633 4322 50 33     19722 if (exists($Parameters{test}) && $Parameters{test})
634             {
635 0         0 my @allSubstrings =
636 0         0 sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfSubstringPositions, @listOfMissingSubstringPositions);
637 0         0 my $totalSubstrings = @allSubstrings;
638              
639 0         0 for (my $i = 1 ; $i < $totalSubstrings ; $i++)
640             {
641              
642             # make sure the strings are sorted.
643 0 0       0 if ($allSubstrings[ $i - 1 ]->[WORD_CHAR_POSITION] + $allSubstrings[ $i - 1 ]->[WORD_CHAR_LENGTH] < $allSubstrings[$i]->[WORD_CHAR_POSITION])
644             {
645 0         0 my $logger = Log::Log4perl->get_logger();
646 0         0 $logger->logdie("error: missed computing a gap.");
647             }
648             }
649             }
650              
651             # returns the list of missing substrings.
652 4322         27902 return \@listOfMissingSubstringPositions;
653             }
654              
655              
656             =head2 C
657              
658             getTaggedTextToKeep (listOfStemmedTaggedSentences => [...],
659             listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]);
660              
661             The method C returns all the array references of the words
662             that have a part-of-speech tag that is of a type specified by
663             C or C. The word lists
664             returned have the same hierarchical sentence structure used by C.
665             Note C and C
666             are optional parameters, if neither is defined, then the values used when the
667             object was instantiated are used. If one of them is defined, its values override the default
668             values.
669              
670             =over
671              
672             =item C
673              
674             listOfStemmedTaggedSentences => [...]
675              
676             C is the array reference returned by
677             C or
678             a previous call to C.
679              
680             =item C and/or C
681              
682             listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
683              
684             C and C define the list of
685             parts-of-speech types to be retained when filtering previously tagged text.
686             Permitted values for C are
687             are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
688             'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C
689             see the method C.
690             Note C and C
691             are optional parameters, if neither is defined, then the values used when the
692             object was instantiated are used. If one of them is defined, its values override the default
693             values.
694              
695             =back
696              
697             use Text::StemTagPOS;
698             use Data::Dump qw(dump);
699             my $stemTagger = Text::StemTagPOS->new;
700             my $text = 'This is the first sentence. This is the last sentence.';
701             my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
702             dump $stemTagger->getTaggedTextToKeep (
703             listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences);
704              
705             # dumps:
706             # [
707             # [
708             # ["first", "first", "/JJ", 6, 12, 5, 0],
709             # ["sentenc", "sentence", "/NN", 8, 18, 8, 0],
710             # ],
711             # [
712             # ["last", "last", "/JJ", 17, 40, 4, 1],
713             # ["sentenc", "sentence", "/NN", 19, 45, 8, 1],
714             # ],
715             # ]
716              
717             =cut
718              
719             sub getTaggedTextToKeep
720             {
721 0     0 1 0 my ($Self, %Parameters) = @_;
722              
723             # get the list of sentences.
724 0         0 my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
725            
726             # get the hash of tags to keep.
727 0         0 my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
728              
729             # copy off the tokens to keep in each sentence.
730 0         0 my @listOfFilteredSentences;
731 0         0 foreach my $sentence (@$listOfStemmedTaggedSentences)
732             {
733 0         0 my @newSentence;
734 0         0 foreach my $token (@$sentence)
735             {
736 0 0       0 if (exists ($hashOfPOSToKeep->{$token->[WORD_POSTAG]}))
737             {
738 0         0 push @newSentence, $token;
739             }
740             }
741             #push @listOfFilteredSentences, \@newSentence if (@newSentence > 0);
742 0         0 push @listOfFilteredSentences, \@newSentence;
743             }
744              
745             # return the list of filtered sentences.
746 0         0 return \@listOfFilteredSentences;
747             }
748              
749             =head2 C
750              
751             getWordsPhrasesInTaggedText (listOfStemmedTaggedSentences => ...,
752             listOfPhrasesToFind => [...], listOfPOSTypesToKeep => [...],
753             listOfPOSTagsToKeep => [...]);
754              
755             The method C returns a reference to an array where
756             each entry in the array corresponds
757             to the word or phrase in C. The value of each entry is a list
758             of word indices
759             where the words or phrases were found. Each list contains integer
760             pairs of the form [first-word-index, last-word-index] where first-word-index is the index to the first
761             word of the phrase and last-word-index the index of the last word. The values of the index are those
762             assigned to the stemmed and tagged word in C.
763              
764             [
765             [ # first phrase locations
766             [first word index, last word index],
767             [first word index, last word index], ...]
768             ]
769             [ # second phrase locations
770             [first word index, last word index],
771             [first word index, last word index], ...]
772             ]
773             ...
774             ]
775              
776             =over
777              
778             =item C
779              
780             listOfStemmedTaggedSentences => [...]
781              
782             C is the array reference returned by C or
783             C.
784              
785             =item C
786              
787             listOfPhrasesToFind => [...]
788              
789             C is an array reference containing a list of strings of
790             text that are either single words or phrases that are to be located in the text
791             provided by C. Before the words or phrases are located they are filtered
792             using C or C.
793              
794              
795             =item C and/or C
796              
797             listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]
798              
799             C and C defines the list of
800             parts-of-speech types to be retained when filtering previously tagged text.
801             Permitted values for C are
802             are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION',
803             'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C
804             see the method C.
805             Note C and C
806             are optional parameters, if neither is defined, then the values used when the
807             object was instantiated are used. If one of them is defined, its values override the default
808             values.
809              
810             =back
811              
812             The code below illustrates the output format:
813              
814             use Text::StemTagPOS;
815             use Data::Dump qw(dump);
816             my $stemTagger = Text::StemTagPOS->new;
817             my $text = 'This is the first sentence. This is the last sentence.';
818             my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
819             dump $listOfStemmedTaggedSentences;
820             my $listOfWordsOrPhrasesToFind = ['first sentence','this is',
821             'third sentence', 'sentence'];
822             my $phraseLocations = $stemTagger->getWordsPhrasesInTaggedText (
823             listOfPOSTypesToKeep => [qw(ALL)],
824             listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences,
825             listOfWordsOrPhrasesToFind => $listOfWordsOrPhrasesToFind);
826             dump $phraseLocations;
827             # [
828             # [[6, 8]], # 'first sentence'
829             # [[0, 2], [11, 13]], # 'this is': note period in text has index 5.
830             # [], # 'third sentence'
831             # [[8, 8], [19, 19]] # 'sentence'
832             # ]
833              
834             =cut
835              
836             sub getWordsPhrasesInTaggedText
837             {
838 300     300 1 5355783 my ($Self, %Parameters) = @_;
839              
840             # stem and tag all of the word and phrase text.
841             # put all the listOfStemmedTaggedSentences into an array,
842             # create the suffix array of the listOfStemmedTaggedSentences array.
843              
844             # get the hash of tags to keep.
845 300         2939 my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
846              
847             # get the tagged text.
848 300         1191 my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
849              
850             # create a list of the filtered stemmed words of the text.
851 300         844 my @wordsOfText;
852 300         841 foreach my $sentence (@$listOfStemmedTaggedSentences)
853             {
854 29474         75427 foreach my $word (@$sentence)
855             {
856             # skip over the words types to ignore.
857 442744 50       1350551 next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
858 442744         885142 push @wordsOfText, $word;
859             }
860             }
861              
862             # create the suffix array of the tagged words, sorting by the stemmed word.
863 300         1248 my $totalWords = @wordsOfText;
864              
865             # hash all the indices of the stemmed words in the suffix array of the text.
866 300         979 my %wordIndices;
867 300         2964 for(my $index = 0; $index < $totalWords; $index++)
868             {
869 442744         840436 my $word = $wordsOfText[$index];
870 442744 50       2394697 $wordIndices{$word->[WORD_STEMMED]} = [] unless exists $wordIndices{$word->[WORD_STEMMED]};
871 442744         615658 push @{$wordIndices{$word->[WORD_STEMMED]}}, $index;
  442744         1830082  
872             }
873              
874             # stem and part-of-speech tag the word or phrases to find.
875 300         822 my @listOfStemmedWordsInPhrasesToFind;
876 300         1176 my $listOfWordsOrPhrasesToFind = $Parameters{listOfWordsOrPhrasesToFind};
877 300         1453 foreach my $wordOrPhrase (@$listOfWordsOrPhrasesToFind)
878             {
879 4321         24086 my $stemmed = $Self->getStemmedAndTaggedText ($wordOrPhrase);
880              
881             # flatten the words of a phrase into on list.
882 4321         9594 my @allWords;
883 4321         11384 foreach my $sentence (@$stemmed)
884             {
885 4321         14860 foreach my $word (@$sentence)
886             {
887             # remove the words to ignore.
888 27041 100       84286 next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
889 15681         45001 push @allWords, $word;
890             }
891             }
892              
893 4321         32282 push @listOfStemmedWordsInPhrasesToFind, \@allWords ;
894             }
895              
896             # given the starting index into the suffix array and a word list, return
897             # the number of words matching the word list at that index.
898             my $getMatchingWords = sub
899             {
900 4321     4321   6689 my ($StartingIndex, $StemmedWordList) = @_;
901 4321         7881 my $stemmedIndex = 0;
902 4321         5796 my $textIndex = $StartingIndex;
903 4321   100     21753 while (($textIndex < $totalWords) && ($stemmedIndex < @$StemmedWordList))
904             {
905 15681 50       75726 last if ($wordsOfText[$textIndex]->[WORD_STEMMED] cmp $StemmedWordList->[$stemmedIndex][WORD_STEMMED]);
906 15681         24881 $textIndex++;
907 15681         68213 $stemmedIndex++;
908             }
909 4321         19467 return $textIndex - $StartingIndex;
910 300         4330 };
911              
912             # find all the phrases in the filtered text.
913 300         711 my @phrasesFound;
914 300         841 my $phraseIndex = 0;
915 300         859 foreach my $phraseStemmedWordList (@listOfStemmedWordsInPhrasesToFind)
916             {
917 4321         5740 my @fullPhrasesFound;
918 4321         14044 my @stemmedPhraseWords = @$phraseStemmedWordList;
919 4321         6607 my $wordsInPhrase = $#stemmedPhraseWords + 1;
920              
921             # get the first word in the phrase.
922 4321         7564 my $firstWord = $stemmedPhraseWords[0];
923 4321 50       19786 if (exists ($wordIndices{$firstWord->[WORD_STEMMED]}))
924             {
925             # get all the indices in the suffix array that the first word occurs at.
926 4321         29879 my $listOfStartingIndices = $wordIndices{$firstWord->[WORD_STEMMED]};
927              
928             # compute the number of matching phrase words at each position.
929 4321         10688 foreach my $startingIndex (@$listOfStartingIndices)
930             {
931 4321         9559 my $matchingLength = &$getMatchingWords ($startingIndex, \@stemmedPhraseWords);
932 4321         11569 my $firstWordIndex = $wordsOfText[$startingIndex]->[WORD_INDEX];
933 4321         10116 my $lastWordIndex = $wordsOfText[$startingIndex + $matchingLength - 1]->[WORD_INDEX];
934              
935 4321 50       17419 if ($matchingLength == $wordsInPhrase)
936             {
937             # full match of the phrase.
938 4321         21275 push @fullPhrasesFound, [$firstWordIndex, $lastWordIndex];
939             }
940             }
941             }
942 4321         16702 $phrasesFound[$phraseIndex++] = \@fullPhrasesFound;
943             }
944 300         908785 return \@phrasesFound;
945             }
946              
947             =head2 C
948              
949             The method C takes no parameters. It returns an array
950             reference where each item in the list is of the form C<[part of speech tag, description, examples]>.
951             It is meant for getting the part-of-speech tags that can be used to populate C.
952              
953             use Text::StemTagPOS;
954             use Data::Dump qw(dump);
955             my $stemTagger = Text::StemTagPOS->new;
956             dump $stemTagger->getListOfPartOfSpeechTags;
957              
958             =cut
959              
960             # this method takes no parameters. it reads in the information about the
961             # part of speech tags stored in the DATA section of this file. the information
962             # is stored in the object at posTags as an array reference containing the
963             # array [POS tag, description, examples] for each possible tag.
964             sub getListOfPartOfSpeechTags
965             {
966 0     0 1 0 my $Self = shift;
967              
968             # return a copy of the posTags array.
969 0         0 my @partOfSpeechTags = map {[@$_]} @{$Self->{posTags}};
  0         0  
  0         0  
970 0         0 return \@partOfSpeechTags;
971             }
972              
973              
974             # _convertTextToListOfSentenceWordTags->('string of tagged words returned from xxxx');
975             sub _convertTextToListOfSentenceWordTags
976             {
977             # get the method class object.
978 4322     4322   11660 my $Self = $_[0];
979              
980             # split the string of tagged words into a list of the words and tags. note,
981             # all POS tags are a slash followed by 2, 3 or 4 uppercase letters.
982 4322         73375 my @list = split (/(\/[A-Z]{2,4})/, $_[1]);
983              
984             # trim off any leading or trailing spaces from each word or tag.
985 4322         13494 foreach my $item (@list)
986             {
987 32712         116979 $item =~ s/^\s*//;
988 32712         70468 $item =~ s/\s+$//;
989             }
990              
991             # restructure the list so each item is of the form [word, tag].
992 4322         9200 my $wordOrPhrase;
993             my @taggerWordList;
994 4322         11075 foreach my $item (@list)
995             {
996 32712 100       92258 if (substr ($item, 0, 1) eq '/')
997             {
998             # at this point the item in the list is a tag, so form the new [word,tag] pair
999             # and save it.
1000 16356         32512 my $wordInfo = [];
1001 16356         39199 $wordInfo->[WORD_ORIGINAL] = $wordOrPhrase;
1002 16356         35433 $wordInfo->[WORD_POSTAG] = $item;
1003 16356         40379 push @taggerWordList, $wordInfo;
1004 16356         34927 $wordOrPhrase = undef;
1005             }
1006             else
1007             {
1008             # at this point we have a new word, so append it to $wordOrPhrase.
1009 16356 50       38626 if (!defined ($wordOrPhrase))
1010             {
1011 16356         42972 $wordOrPhrase = $item;
1012             }
1013             else
1014             {
1015 0         0 $wordOrPhrase .= ' ' . $item;
1016             }
1017             }
1018             }
1019              
1020             # now partition the list into sentences.
1021 4322         10570 my @sentences;
1022 4322         10521 my $currentSentence = [];
1023 4322         14299 foreach my $wordTag (@taggerWordList)
1024             {
1025 16356 100       50494 if ($wordTag->[WORD_POSTAG] eq $Self->{endingSentenceTag})
1026             {
1027             # if the sentence has no words, skip it.
1028 24 50       63 next unless ($#$currentSentence > -1);
1029              
1030             # add the POS sentence ender tag to the end of the sentence list.
1031 24         43 push @$currentSentence, $wordTag;
1032              
1033             # add the sentence to the list of sentences.
1034 24         37 push @sentences, $currentSentence;
1035 24         58 $currentSentence = [];
1036             }
1037             else
1038             {
1039 16332         38092 push @$currentSentence, $wordTag;
1040             }
1041             }
1042              
1043             # if there is a sentence left, save it.
1044 4322 100       30933 if ($#$currentSentence > -1)
1045             {
1046 4321         10486 push @sentences, $currentSentence;
1047             }
1048            
1049             # add the sentence id to each word.
1050 4322         16517 for (my $i = 0; $i < @sentences; $i++)
1051             {
1052 4345         19630 foreach my $word (@{$sentences[$i]})
  4345         35625  
1053             {
1054 16356         46344 $word->[WORD_SENTENCE_ID] = $i;
1055             }
1056             }
1057              
1058             # return the list of sentences of word,tag pairs.
1059 4322         24404 return \@sentences;
1060             }
1061              
1062              
1063             # converts a list of strings, string refs, arrays of strings, etc... to a
1064             # simpler list of strings; essentially flattening a list.
1065             sub _flattenList
1066             {
1067 4322     4322   10894 my ($Self, @Data) = @_;
1068              
1069 4322         6442 my @ListOfStrings;
1070 4322         11793 foreach my $item (@_)
1071             {
1072 8644         18556 my $type = ref ($item);
1073 8644 100       64628 if ($type eq '')
    100          
    50          
    50          
1074             {
1075 4321         15269 push @ListOfStrings, $item;
1076             }
1077             elsif ($type eq 'SCALAR')
1078             {
1079 1         4 push @ListOfStrings, $$item;
1080             }
1081             elsif ($type eq 'ARRAY')
1082             {
1083 0         0 push @ListOfStrings, @{$Self->_flattenList (@$item)};
  0         0  
1084             }
1085             elsif ($type eq 'REF')
1086             {
1087 0         0 push @ListOfStrings, @{$Self->_flattenList ($$item)};
  0         0  
1088             }
1089             }
1090 4322         14194 return \@ListOfStrings;
1091             }
1092              
1093              
1094             # this method takes no parameters. it reads in the information about the
1095             # part of speech tags stored in the DATA section of this file. the information
1096             # is stored in the object at posTags as an array reference containing the
1097             # array [POS tag, description, examples] for each possible tag.
1098             sub _getPartOfSpeechTags
1099             {
1100 4     4   15 my $Self = shift;
1101              
1102 4         268 my $posTags =
1103             [
1104             ['CC', 'Conjunction, coordinating', 'and, or'],
1105             ['CD', 'Adjective, cardinal number', '3, fifteen'],
1106             ['DET', 'Determiner', 'this, each, some'],
1107             ['EX', 'Pronoun, existential there', 'there'],
1108             ['FW', 'Foreign words', ''],
1109             ['IN', 'Preposition / Conjunction', 'for, of, although, that'],
1110             ['JJ', 'Adjective', 'happy, bad'],
1111             ['JJR', 'Adjective, comparative', 'happier, worse'],
1112             ['JJS', 'Adjective, superlative', 'happiest, worst'],
1113             ['LS', 'Symbol, list item', 'A, A.'],
1114             ['MD', 'Verb, modal', "can, could, 'll"],
1115             ['NN', 'Noun', 'aircraft, data'],
1116             ['NNP', 'Noun, proper', 'London, Michael'],
1117             ['NNPS', 'Noun, proper, plural', 'Australians, Methodists'],
1118             ['NNS', 'Noun, plural', 'women, books'],
1119             ['PDT', 'Determiner, prequalifier', 'quite, all, half'],
1120             ['POS', 'Possessive', "'s"],
1121             ['PRP', 'Determiner, possessive second', 'mine, yours'],
1122             ['PRPS', 'Determiner, possessive', 'their, your'],
1123             ['RB', 'Adverb', 'often, not, very, here'],
1124             ['RBR', 'Adverb, comparative', 'faster'],
1125             ['RBS', 'Adverb, superlative', 'fastest'],
1126             ['RP', 'Adverb, particle', 'up, off, out'],
1127             ['SYM', 'Symbol', '*'],
1128             ['TO', 'Preposition', 'to'],
1129             ['UH', 'Interjection', 'oh, yes, mmm'],
1130             ['VB', 'Verb, infinitive', 'take, live'],
1131             ['VBD', 'Verb, past tense', 'took, lived'],
1132             ['VBG', 'Verb, gerund', 'taking, living'],
1133             ['VBN', 'Verb, past/passive participle', 'taken, lived'],
1134             ['VBP', 'Verb, base present form', 'take, live'],
1135             ['VBZ', 'Verb, present 3SG -s form', 'takes, lives'],
1136             ['WDT', 'Determiner, question', 'which, whatever'],
1137             ['WP', 'Pronoun, question', 'who, whoever'],
1138             ['WPS', 'Determiner, possessive & question', 'whose'],
1139             ['WRB', 'Adverb, question', 'when, how, however'],
1140             ['PP', 'Punctuation, sentence ender', '., !, ?'],
1141             ['PPC', 'Punctuation, comma', ','],
1142             ['PGP', 'Punctuation, whitespace, gap', ' '],
1143             ['PPD', 'Punctuation, dollar sign', '$'],
1144             ['PPL', 'Punctuation, quotation mark left', '``'],
1145             ['PPR', 'Punctuation, quotation mark right', "''"],
1146             ['PPS', 'Punctuation, colon, semicolon, elipsis', ':, ..., -'],
1147             ['LRB', 'Punctuation, left bracket', '(, {, ['],
1148             ['RRB', 'Punctuation, right bracket', '), }, ]']
1149             ];
1150              
1151             # store the pos array.
1152 4         23 $Self->{posTags} = $posTags;
1153 4         16 return;
1154             }
1155              
1156              
1157             sub _getListOfPOSTagsFromPOSTypesList
1158             {
1159 1     1   3 my ($Self, %Parameters) = @_;
1160              
1161 1         3 my $listOfPOSTypesToKeep = [];
1162 1 50       4 $listOfPOSTypesToKeep = $Parameters{listOfPOSTypesToKeep} if exists $Parameters{listOfPOSTypesToKeep};
1163              
1164             # ALL, NOUNS, VERBS, PUNCTUATION, ADJECTIVES, ADVERBS, CONTENT_WORDS, TEXTRANK_WORDS, VERBS
1165 1         2 my @listOfPOSTagsToKeep;
1166 1         6 foreach my $type (@$listOfPOSTypesToKeep)
1167             {
1168 0         0 my $ucType = uc $type;
1169 0         0 $ucType =~ tr/A-Z\_//cd;
1170 0 0       0 if (($ucType cmp 'NOUNS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_NOUN; }
  0 0       0  
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1171 0         0 elsif (($ucType cmp 'VERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_VERB; }
1172 0         0 elsif (($ucType cmp 'ADJECTIVES') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADJECTIVE; }
1173 0         0 elsif (($ucType cmp 'ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADVERB; }
1174 0         0 elsif (($ucType cmp 'CONTENT_ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT_ADVERB; }
1175 0         0 elsif (($ucType cmp 'PUNCTUATION') == 0) { push @listOfPOSTagsToKeep, POSTAGS_PUNCTUATION; }
1176 0         0 elsif (($ucType cmp 'CONTENT_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT; }
1177 0         0 elsif (($ucType cmp 'TEXTRANK_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_TEXTRANK; }
1178 0         0 elsif (($ucType cmp 'ALL') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ALL; }
1179             }
1180              
1181 1         4 return \@listOfPOSTagsToKeep;
1182             }
1183              
1184              
1185             # returns a hash reference of the part of speech tags that are
1186             # provided in listOfPOSTagsToKeep.
1187             sub _getHashOfPOSTagsToKeep
1188             {
1189 304     304   1410 my ($Self, %Parameters) = @_;
1190              
1191             # get the list of POS tags to keep.
1192 304         910 my $listOfPOSTagsToKeep;
1193 304 100 66     4823 if (!exists ($Parameters{listOfPOSTagsToKeep}) && !exists ($Parameters{listOfPOSTypesToKeep}))
1194             {
1195 303 100       3525 return $Self->{hashOfPOSTagsToKeep} unless exists $Parameters{instantiation};
1196 3         22 $listOfPOSTagsToKeep = [POSTAGS_TEXTRANK];
1197             }
1198             else
1199             {
1200 1         6 $listOfPOSTagsToKeep = $Self->_getListOfPOSTagsFromPOSTypesList (%Parameters);
1201 1 50       4 push @$listOfPOSTagsToKeep, @{$Parameters{listOfPOSTagsToKeep}} if (exists ($Parameters{listOfPOSTagsToKeep}));
  1         22  
1202             }
1203              
1204 4         11 my %hashOfPOSTagsToKeep;
1205 4         16 foreach my $pos (@$listOfPOSTagsToKeep)
1206             {
1207 68         76 my $posClean = uc $pos;
1208 68         72 $posClean =~ tr/A-Z0-9//cd;
1209              
1210             # prefix with a slash.
1211 68         172 $hashOfPOSTagsToKeep{'/' . $posClean} = 1;
1212             }
1213              
1214 4         31 return \%hashOfPOSTagsToKeep;
1215             }
1216              
1217             =head2 C
1218              
1219             The method C returns an array reference of the sorted stemmed
1220             words in the text given by C.
1221              
1222             =over
1223              
1224             =item C
1225              
1226             listOfStemmedTaggedSentences => [...]
1227              
1228             C is the array reference returned by C or
1229             C of the text.
1230              
1231             =back
1232              
1233             use Text::StemTagPOS;
1234             use Data::Dump qw(dump);
1235             my $stemTagger = Text::StemTagPOS->new;
1236             my $text = 'The first sentence. Sentence number two.';
1237             my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text);
1238             dump $listOfStemmedTaggedSentences;
1239              
1240             =cut
1241              
1242             sub getListOfStemmedWordsInText
1243             {
1244 0     0 1   my ($Self, %Parameters) = @_;
1245              
1246             # get the hash of tags to keep.
1247 0           my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters);
1248              
1249             # get the tagged text.
1250 0           my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
1251              
1252             # create a list of the filtered stemmed words of the text.
1253 0           my %wordsOfText;
1254 0           foreach my $sentence (@$listOfStemmedTaggedSentences)
1255             {
1256 0           foreach my $word (@$sentence)
1257             {
1258             # skip over the words types to ignore.
1259 0 0         next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]}));
1260 0           $wordsOfText{$word->[WORD_STEMMED]} = 1;
1261             }
1262             }
1263 0           my @words = sort keys %wordsOfText;
1264 0           return \@words;
1265             }
1266              
1267             =head2 C
1268              
1269             The method C returns an array reference of the sorted stemmed
1270             words of the intersection of all the words in the documents given by C;
1271              
1272             =over
1273              
1274             =item C
1275              
1276             listOfStemmedTaggedDocuments => [...]
1277              
1278             C is a list of document references returned by C or
1279             C.
1280              
1281             =back
1282              
1283             =cut
1284              
1285             sub getListOfStemmedWordsInAllDocuments
1286             {
1287 0     0 1   my ($Self, %Parameters) = @_;
1288              
1289             # get the list of documents to process.
1290 0           my $listOfStemmedTaggedDocuments = $Parameters{listOfStemmedTaggedDocuments};
1291 0           my $totalDocuments = @$listOfStemmedTaggedDocuments;
1292 0           my %wordOccurence;
1293              
1294             # put the words of the first document into a hash.
1295 0           my $sentences = $listOfStemmedTaggedDocuments->[0];
1296 0           foreach my $sentence (@$sentences)
1297             {
1298 0           foreach my $word (@$sentence)
1299             {
1300 0           $wordOccurence{$word->[WORD_STEMMED]} = 1;
1301             }
1302             }
1303              
1304             # add the words from the remaining documents, only if the word
1305             # occurred in all previous documents.
1306 0           for (my $i = 1; $i < $totalDocuments; $i++)
1307             {
1308 0           my $sentences = $listOfStemmedTaggedDocuments->[$i];
1309 0           foreach my $sentence (@$sentences)
1310             {
1311 0           foreach my $word (@$sentence)
1312             {
1313             # if the word has not occurred in a previous document, skip it.
1314 0 0         next unless exists $wordOccurence{$word->[WORD_STEMMED]};
1315              
1316             # if the word has not occurred in all previous documents, delete it.
1317 0 0         if ($wordOccurence{$word->[WORD_STEMMED]} < $i)
1318             {
1319 0           delete $wordOccurence{$word->[WORD_STEMMED]};
1320 0           next;
1321             }
1322              
1323             # set the words occurence.
1324 0           $wordOccurence{$word->[WORD_STEMMED]} = $i + 1;
1325             }
1326             }
1327             }
1328              
1329             # return the words that occured in all the documents.
1330 0           my @wordsInAll;
1331 0           while (my ($word, $occurence) = each %wordOccurence)
1332             {
1333 0 0         push @wordsInAll, $word if ($occurence == $totalDocuments);
1334             }
1335 0           @wordsInAll = sort @wordsInAll;
1336 0           return \@wordsInAll;
1337             }
1338              
1339             =head1 INSTALLATION
1340              
1341             To install the module run the following commands:
1342              
1343             perl Makefile.PL
1344             make
1345             make test
1346             make install
1347              
1348             If you are on a windows box you should use 'nmake' rather than 'make'.
1349              
1350             =head1 AUTHOR
1351              
1352             Jeff Kubina
1353            
1354             =head1 BUGS
1355              
1356             Please email bugs reports or feature requests to C, or through
1357             the web interface at L. The author
1358             will be notified and you can be automatically notified of progress on the bug fix or feature request.
1359              
1360             =head1 COPYRIGHT
1361              
1362             Copyright (c) 2010 Jeff Kubina. All rights reserved.
1363             This program is free software; you can redistribute
1364             it and/or modify it under the same terms as Perl itself.
1365              
1366             The full text of the license can be found in the
1367             LICENSE file included with this module.
1368              
1369             =head1 KEYWORDS
1370              
1371             natural language processing, NLP, part of speech tagging, POS, stemming
1372              
1373             =head1 SEE ALSO
1374              
1375             L, L, L, L,
1376             L, L
1377              
1378             =begin html
1379              
1380             See the Lingua::EN::Tagger README
1381             file for a list of the part-of-speech tags.
1382              
1383             =end html
1384              
1385             =cut
1386              
1387             1;
1388             # The preceding line will help the module return a true value