File Coverage

lib/Lingua/EN/Ngram.pm
Criterion Covered Total %
statement 104 105 99.0
branch 27 28 96.4
condition 1 3 33.3
subroutine 9 9 100.0
pod 5 5 100.0
total 146 150 97.3


line stmt bran cond sub pod time code
1             package Lingua::EN::Ngram;
2              
3             # Ngram.pm - Extract and count words and phrases from texts
4              
5             # Eric Lease Morgan
6             # September 12, 2010 - first investigations; based on Lingua::EN::Bigram
7             # November 25, 2010 - added non-Latin characters; Happy Thanksgiving!
8             # March 28, 2018 - removed lower-casing the text; why did I do lower case previously?
9              
10              
11             # include
12 1     1   553 use strict;
  1         1  
  1         22  
13 1     1   4 use warnings;
  1         1  
  1         349  
14              
15             # define
16             our $VERSION = '0.03';
17              
18              
19             sub new {
20              
21             # get input
22 9     9 1 2321 my ( $class, %options ) = @_;
23            
24             # initialize
25 9         12 my $self = {};
26              
27             # process optional options
28 9 100       20 if ( %options ) {
29            
30             # check for one and only one argument
31 7         13 my @keys = keys %options;
32 7 100       13 if ( scalar @keys != 1 ) { die "This method -- new -- can only take one and only one option (text or file)." }
  1         8  
33            
34             # initalize from text
35 6 100       15 if ( $options{ 'text' } ) { $self->{ text } = $options{ 'text' } }
  1 100       2  
36            
37             # initialize from file
38             elsif ( $options{ 'file' } ) {
39            
40             # slurp
41 4         5 my $file = $options{ 'file' };
42 4 100       128 open F, $file or die "The file argument ($file) passed to this method is invalid: $!\n";
43 3         7 my $text = do { local $/; };
  3         10  
  3         2699  
44 3         26 close F;
45            
46             # do the work
47 3         13 $self->{ text } = $text;
48            
49             }
50            
51             # invalid option
52 1         8 else { die "This method -- new -- can only take one option, text or file." }
53            
54             }
55            
56             # return
57 6         16 return bless $self, $class;
58            
59             }
60              
61              
62             sub text {
63              
64             # get input
65 11     11 1 25 my ( $self, $text ) = @_;
66            
67             # set
68 11 100       17 if ( $text ) { $self->{ text } = $text }
  1         53  
69            
70             # return
71 11         53 return $self->{ text };
72            
73             }
74              
75              
76             sub ngram {
77              
78             # get input
79 8     8 1 3506 my ( $self, $n ) = @_;
80              
81             # sanity check
82 8 100       18 if ( ! $n ) { die "This method -- ngram -- requires an integer as an argument." }
  1         18  
83 7 100       25 if ( $n =~ /\D/ ) { die "This method -- ngram -- requires an integer as an argument." }
  1         8  
84            
85             # initalize
86 6         10 my $text = $self->text;
87 6         6783 $text =~ tr/a-zA-Zà-ƶÀ-Ƶ'()\-,.?!;:/\n/cs;
88 6         25374 $text =~ s/([,.?!:;()\-])/\n$1\n/g;
89 6         77682 $text =~ s/\n+/\n/g;
90 6         41182 my @words = split /\n/, $text;
91              
92 6         25 my @ngrams = ();
93 6         8 my %count = ();
94 1     1   6 no warnings;
  1         1  
  1         401  
95              
96             # process each word
97 6         22 for ( my $i = 0; $i <= $#words; $i++ ) {
98            
99             # repeat n number of times
100 296023         232698 my $tokens = '';
101 296023         328889 for ( my $j = $i; $j < $i + $n; $j++ ) { $tokens .= $words[ $j ] . ' ' }
  1425737         1768998  
102            
103             # remove the trailing space
104 296023         219700 chop $tokens;
105            
106             # build the ngram and count
107 296023         271561 $ngrams[ $i ] = $tokens;
108 296023         638086 $count{ $ngrams[ $i ] }++;
109            
110             }
111            
112             # done
113 6         38499 return \%count;
114              
115             }
116              
117              
118             sub tscore {
119              
120             # get input
121 1     1 1 1038 my ( $self ) = shift;
122            
123             # get all the words
124 1         3 my $text = $self->text;
125 1         231 $text =~ tr/a-zA-Z'()\-,.?!;:/\n/cs;
126 1         868 $text =~ s/([,.?!:;()\-])/\n$1\n/g;
127 1         2643 $text =~ s/\n+/\n/g;
128 1         1128 my @words = split /\n/, $text;
129              
130             # count the words
131 1         4 my %word_count = ();
132 1         14 for ( my $i = 0; $i <= $#words; $i++ ) { $word_count{ $words[ $i ] }++ }
  9664         12455  
133            
134             # get all the bigrams
135 1         2 my @bigrams = ();
136 1         3 for ( my $i = 0; $i < $#words; $i++ ) {
137            
138             # repeat n number of times
139 9663         7469 my $tokens = '';
140 9663         11353 for ( my $j = $i; $j < $i + 2; $j++ ) { $tokens .= $words[ $j ] . ' ' }
  19326         24240  
141            
142             # remove the trailing space
143 9663         6907 chop $tokens;
144            
145             # build the ngram
146 9663         13121 $bigrams[ $i ] = $tokens;
147            
148             }
149              
150             # count the bigrams
151 1         2 my %bigram_count = ();
152 1         3 for ( my $i = 0; $i < $#words; $i++ ) { $bigram_count{ $bigrams[ $i ] }++ }
  9663         12997  
153              
154             # calculate t-score
155 1         3 my %tscore = ();
156 1         4 for ( my $i = 0; $i < $#words; $i++ ) {
157              
158             $tscore{ $bigrams[ $i ] } = ( $bigram_count{ $bigrams[ $i ] } -
159             $word_count{ $words[ $i ] } *
160             $word_count{ $words[ $i + 1 ] } /
161 9663         23934 ( $#words + 1 ) ) / sqrt( $bigram_count{ $bigrams[ $i ] } );
162              
163             }
164            
165             # done
166 1         1629 return \%tscore;
167            
168             }
169              
170              
171             sub intersection {
172              
173             # get input
174 5     5 1 1057 my ( $self, %options ) = @_;
175              
176             # initialize
177 5         7 my %intersections = ();
178 5         6 my %ngrams = ();
179 5         6 my @counts = ();
180 5         5 my $objects = '';
181 5         6 my $length = 0;
182 1     1   6 no warnings;
  1         1  
  1         264  
183            
184             # sanity checks
185 5 100       10 if ( ! %options ) { die 'This method -- interesection -- requires two options: corpus and length.' }
  1         9  
186             else {
187            
188 4 100 33     21 if ( scalar keys %options != 2 ) { die 'This method -- interesection -- requires two options: corpus and length.' }
  1 50       8  
189 0         0 elsif ( ! $options{ 'corpus' } or ! $options{ 'length' } ) { die 'This method -- interesection -- requires two options: corpus and length.' }
190             else {
191            
192 3         5 $objects = $options{ 'corpus' };
193 3 100       7 if ( ref( $objects ) ne 'ARRAY' ) { die 'The corpus option passed to the interesections method must be an array reference.' }
  1         8  
194            
195 2         3 $length = $options{ 'length' };
196 2 100       11 if ( $length =~ /\D/ ) { die "The length option passed to the intersections method mus be an integer." }
  1         8  
197              
198             }
199            
200             }
201            
202            
203             # process each object
204 1         4 for ( my $i = 0; $i <= $#$objects; $i++ ) {
205            
206             # count each ngram
207 2         10 my $count = $$objects[ $i ]->ngram( $length );
208 2         80583 foreach ( keys %$count ) { $ngrams{ $_ }++ }
  256558         344962  
209            
210             # save counts for later reference; all puns intended
211 2         20792 $counts[ $i ] = $count;
212            
213             }
214            
215             # process each ngram
216 1         73876 foreach ( keys %ngrams ) {
217              
218             # check for intersection; ngram in all total number of objects
219 256066 100       368590 if ( $ngrams{ $_ } == ( $#$objects + 1 )) {
220            
221             # calculate total occurances
222 492         424 my $total = 0;
223 492         610 for ( my $i = 0; $i <= $#$objects; $i++ ) { $total += $counts[ $i ]{ $_ }}
  984         1734  
224            
225             # update result
226 492         848 $intersections{ $_ } = $total;
227            
228             }
229              
230             }
231              
232             # done
233 1         137407 return \%intersections;
234            
235             }
236              
237              
238             =head1 NAME
239              
240             Lingua::EN::Ngram - Extract n-grams from texts and list them according to frequency and/or T-Score
241              
242              
243             =head1 SYNOPSIS
244              
245             # initalize
246             use Lingua::EN::Ngram;
247             $ngram = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
248              
249             # calculate t-score; t-score is only available for bigrams
250             $tscore = $ngram->tscore;
251             foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
252              
253             print "$$tscore{ $_ }\t" . "$_\n";
254              
255             }
256              
257             # list trigrams according to frequency
258             $trigrams = $ngram->ngram( 3 );
259             foreach my $trigram ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
260              
261             print $$trigrams{ $trigram }, "\t$trigram\n";
262              
263             }
264              
265              
266             =head1 DESCRIPTION
267              
268             This module is designed to extract n-grams from texts and list them according to frequency and/or T-Score.
269              
270             To elaborate, the purpose of Lingua::EN::Ngram is to: 1) pull out all of the ngrams (multi-word phrases) in a given text, and 2) list these phrases according to their frequency. Using this module is it possible to create lists of the most common phrases in a text as well as order them by their probable occurance, thus implying significance. This process is useful for the purposes of textual analysis and "distant reading".
271              
272             The two-word phrases (bigrams) are also listable by their T-Score. The T-Score, as well as a number of the module's other methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
273              
274             Finally, the intersection method enables the developer to find ngrams common in an arbitrary number of texts. Use this to look for common themes across a corpus.
275              
276              
277             =head1 METHODS
278              
279              
280             =head2 new
281              
282             Create a new Lingua::EN::Ngram object:
283              
284             # initalize
285             $ngram = Lingua::EN::Ngram->new;
286              
287              
288             =head2 new( text => $scalar )
289              
290             Create a new Lingua::EN::Ngram object whose contents equal the content of a scalar:
291              
292             # initalize with scalar
293             $ngram = Lingua::EN::Ngram->new( text => 'All good things must come to an end...' );
294              
295              
296             =head2 new( file => $scalar )
297              
298             Create a new Lingua::EN::Ngram object whose contents equal the content of a file:
299              
300             # initalize with file
301             $ngram = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
302              
303              
304             =head2 text
305              
306             Set or get the text to be analyzed:
307              
308             # fill Lingua::EN::Ngram object with content
309             $ngram->text( 'All good things must come to an end...' );
310              
311             # get the Lingua::EN::Bigram object's content
312             $text = $ngram->text;
313              
314              
315             =head2 tscore
316              
317             Return a reference to a hash whose keys are a bigram and whose values are a T-Score -- a probabalistic calculation determining the significance of the bigram occuring in the text:
318              
319             # get t-score
320             $tscore = $ngrams->tscore;
321              
322             # list bigrams according to t-score
323             foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
324              
325             print "$$tscore{ $_ }\t" . "$_\n";
326              
327             }
328              
329             T-Score can only be computed against bigrams.
330              
331              
332             =head2 ngram( $scalar )
333              
334             Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in the text:
335              
336             # create a list of trigrams
337             $trigrams = $ngrams->ngram( 3 );
338              
339             # display frequency
340             foreach ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
341              
342             print $$trigrams{ $_ }, "\t$_\n";
343              
344             }
345              
346             This method requires a single parameter and that parameter must be an integer. For example, to get a list of bigrams, pass 2 to ngram. To get a list of quadgrams, pass 4.
347              
348              
349             =head2 intersection( corpus => [ @array ], length => $scalar )
350              
351             Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in a corpus of texts:
352              
353             # build corpus
354             $walden = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
355             $rivers = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
356             $corpus = Lingua::EN::Ngram->new;
357              
358             # compute intersections
359             $intersections = $corpus->intersection( corpus => [ ( $walden, $rivers ) ], length => 5 );
360              
361             # display frequency
362             foreach ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {
363              
364             print $$intersections{ $_ }, "\t$_\n";
365              
366             }
367              
368             The value of corpus must be an array reference, and each element must be Lingua::EN::Ngram objects. The value of length must be an integer.
369              
370              
371             =head1 DISCUSSION
372              
373             Given the increasing availability of full text materials, this module is intended to help "digital humanists" apply mathematical methods to the analysis of texts. For example, the developer can extract the high-frequency words using the ngram method and allow the user to search for those words in a concordance. The use of ngram( 2 ) simply returns the frequency of bigrams in a text, but the tscore method can order them in a more finely tuned manner.
374              
375             Consider using T-Score-weighted bigrams as classification terms to supplement the "aboutness" of texts. Concatonate many texts together and look for common phrases written by the author. Compare these commonly used phrases to the commonly used phrases of other authors.
376              
377             All ngrams return by the ngram method include punctuation. This is intentional. Developers may need want to remove ngrams containing such values from the output. Similarly, no effort has been made to remove commonly used words -- stop words -- from the methods. Consider the use of Lingua::StopWords, Lingua::EN::StopWords, or the creation of your own stop word list to make output more meaningful. The distribution came with a script (bin/ngrams.pl) demonstrating how to remove puncutation from the displayed output. Another script (bin/intesections.pl) demonstrates how to extract and count ngrams across two texts.
378              
379             Finally, this is not the only module supporting ngram extraction. See also Text::NSP.
380              
381              
382             =head1 TODO
383              
384             There are probably a number of ways the module can be improved:
385              
386             =over
387              
388             * the distribution's license should probably be changed to the Perl Aristic License
389              
390             * the addition of alternative T-Score calculations would be nice
391              
392             * make sure the module works with character sets beyond ASCII (done, I think, as of version 0.02)
393              
394             =back
395              
396              
397             =head1 CHANGES
398              
399             =over
400              
401             * March 28, 2018 (version 0.03) - removed lower casing of letters and install ngrams script
402              
403             * November 25, 2010 (version 0.02) - added non-Latin characters
404              
405             * September 12, 2010 (version 0.01) - initial release but an almost complete rewrite of Lingua::EN::Bigram
406              
407              
408              
409             =back
410              
411             =head1 ACKNOWLEDGEMENTS
412              
413             T-Score, as well as a number of the module's methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
414              
415              
416             =head1 AUTHOR
417              
418             Eric Lease Morgan
419              
420             =cut
421              
422             # return true or die
423             1;