File Coverage

lib/Lingua/EN/Ngram.pm
Criterion Covered Total %
statement 104 105 99.0
branch 27 28 96.4
condition 1 3 33.3
subroutine 9 9 100.0
pod 5 5 100.0
total 146 150 97.3


line stmt bran cond sub pod time code
1             package Lingua::EN::Ngram;
2              
3             # Ngram.pm - Extract and count words and phrases from texts
4              
5             # Eric Lease Morgan
6             # September 12, 2010 - first investigations; based on Lingua::EN::Bigram
7             # November 25, 2010 - added non-Latin characters; Happy Thanksgiving!
8              
9              
10             # include
11 1     1   993 use strict;
  1         3  
  1         51  
12 1     1   5 use warnings;
  1         3  
  1         572  
13              
14             # define
15             our $VERSION = '0.02';
16              
17              
18             sub new {
19              
20             # get input
21 9     9 1 3491 my ( $class, %options ) = @_;
22            
23             # initialize
24 9         16 my $self = {};
25              
26             # process optional options
27 9 100       28 if ( %options ) {
28            
29             # check for one and only one argument
30 7         19 my @keys = keys %options;
31 7 100       20 if ( scalar @keys != 1 ) { die "This method -- new -- can only take one and only one option (text or file)." }
  1         14  
32            
33             # initalize from text
34 6 100       22 if ( $options{ 'text' } ) { $self->{ text } = $options{ 'text' } }
  1 100       6  
35            
36             # initialize from file
37             elsif ( $options{ 'file' } ) {
38            
39             # slurp
40 4         7 my $file = $options{ 'file' };
41 4 100       191 open F, $file or die "The file argument ($file) passed to this method is invalid: $!\n";
42 3         5 my $text = do { local $/; };
  3         13  
  3         4500  
43 3         64 close F;
44            
45             # do the work
46 3         23 $self->{ text } = $text;
47            
48             }
49            
50             # invalid option
51 1         16 else { die "This method -- new -- can only take one option, text or file." }
52            
53             }
54            
55             # return
56 6         42 return bless $self, $class;
57            
58             }
59              
60              
61             sub text {
62              
63             # get input
64 11     11 1 39 my ( $self, $text ) = @_;
65            
66             # set
67 11 100       34 if ( $text ) { $self->{ text } = $text }
  1         120  
68            
69             # return
70 11         64 return $self->{ text };
71            
72             }
73              
74              
75             sub ngram {
76              
77             # get input
78 8     8 1 8598 my ( $self, $n ) = @_;
79              
80             # sanity check
81 8 100       31 if ( ! $n ) { die "This method -- ngram -- requires an integer as an argument." }
  1         13  
82 7 100       41 if ( $n =~ /\D/ ) { die "This method -- ngram -- requires an integer as an argument." }
  1         9  
83            
84             # initalize
85 6         25 my $text = $self->text;
86 6         10497 $text =~ tr/a-zA-Zà-ƶÀ-Ƶ'()\-,.?!;:/\n/cs;
87 6         43248 $text =~ s/([,.?!:;()\-])/\n$1\n/g;
88 6         148688 $text =~ s/\n+/\n/g;
89 6         120336 my @words = split /\n/, lc( $text );
90              
91 6         16737 my @ngrams = ();
92 6         32 my %count = ();
93 1     1   25 no warnings;
  1         2  
  1         764  
94              
95             # process each word
96 6         57 for ( my $i = 0; $i <= $#words; $i++ ) {
97            
98             # repeat n number of times
99 296023         321461 my $tokens = '';
100 296023         681420 for ( my $j = $i; $j < $i + $n; $j++ ) { $tokens .= $words[ $j ] . ' ' }
  1425737         3664717  
101            
102             # remove the trailing space
103 296023         328387 chop $tokens;
104            
105             # build the ngram and count
106 296023         456593 $ngrams[ $i ] = $tokens;
107 296023         1469063 $count{ $ngrams[ $i ] }++;
108            
109             }
110            
111             # done
112 6         66921 return \%count;
113              
114             }
115              
116              
117             sub tscore {
118              
119             # get input
120 1     1 1 5291 my ( $self ) = shift;
121            
122             # get all the words
123 1         7 my $text = $self->text;
124 1         348 $text =~ tr/a-zA-Z'()\-,.?!;:/\n/cs;
125 1         1425 $text =~ s/([,.?!:;()\-])/\n$1\n/g;
126 1         5239 $text =~ s/\n+/\n/g;
127 1         3615 my @words = split /\n/, lc( $text );
128              
129             # count the words
130 1         364 my %word_count = ();
131 1         11 for ( my $i = 0; $i <= $#words; $i++ ) { $word_count{ $words[ $i ] }++ }
  9664         22745  
132            
133             # get all the bigrams
134 1         3 my @bigrams = ();
135 1         7 for ( my $i = 0; $i < $#words; $i++ ) {
136            
137             # repeat n number of times
138 9663         9778 my $tokens = '';
139 9663         18959 for ( my $j = $i; $j < $i + 2; $j++ ) { $tokens .= $words[ $j ] . ' ' }
  19326         46229  
140            
141             # remove the trailing space
142 9663         9660 chop $tokens;
143            
144             # build the ngram
145 9663         33502 $bigrams[ $i ] = $tokens;
146            
147             }
148              
149             # count the bigrams
150 1         7 my %bigram_count = ();
151 1         6 for ( my $i = 0; $i < $#words; $i++ ) { $bigram_count{ $bigrams[ $i ] }++ }
  9663         28613  
152              
153             # calculate t-score
154 1         5 my %tscore = ();
155 1         7 for ( my $i = 0; $i < $#words; $i++ ) {
156              
157 9663         56504 $tscore{ $bigrams[ $i ] } = ( $bigram_count{ $bigrams[ $i ] } -
158             $word_count{ $words[ $i ] } *
159             $word_count{ $words[ $i + 1 ] } /
160             ( $#words + 1 ) ) / sqrt( $bigram_count{ $bigrams[ $i ] } );
161              
162             }
163            
164             # done
165 1         5532 return \%tscore;
166            
167             }
168              
169              
170             sub intersection {
171              
172             # get input
173 5     5 1 1786 my ( $self, %options ) = @_;
174              
175             # initialize
176 5         9 my %intersections = ();
177 5         8 my %ngrams = ();
178 5         8 my @counts = ();
179 5         8 my $objects = '';
180 5         6 my $length = 0;
181 1     1   7 no warnings;
  1         2  
  1         431  
182            
183             # sanity checks
184 5 100       10 if ( ! %options ) { die 'This method -- interesection -- requires two options: corpus and length.' }
  1         18  
185             else {
186            
187 4 100 33     26 if ( scalar keys %options != 2 ) { die 'This method -- interesection -- requires two options: corpus and length.' }
  1 50       10  
188 0         0 elsif ( ! $options{ 'corpus' } or ! $options{ 'length' } ) { die 'This method -- interesection -- requires two options: corpus and length.' }
189             else {
190            
191 3         6 $objects = $options{ 'corpus' };
192 3 100       10 if ( ref( $objects ) ne 'ARRAY' ) { die 'The corpus option passed to the interesections method must be an array reference.' }
  1         7  
193            
194 2         5 $length = $options{ 'length' };
195 2 100       19 if ( $length =~ /\D/ ) { die "The length option passed to the intersections method mus be an integer." }
  1         13  
196              
197             }
198            
199             }
200            
201            
202             # process each object
203 1         5 for ( my $i = 0; $i <= $#$objects; $i++ ) {
204            
205             # count each ngram
206 2         18 my $count = $$objects[ $i ]->ngram( $length );
207 2         117806 foreach ( keys %$count ) { $ngrams{ $_ }++ }
  256466         596464  
208            
209             # save counts for later reference; all puns intended
210 2         50472 $counts[ $i ] = $count;
211            
212             }
213            
214             # process each ngram
215 1         207694 foreach ( keys %ngrams ) {
216              
217             # check for intersection; ngram in all total number of objects
218 255948 100       781057 if ( $ngrams{ $_ } == ( $#$objects + 1 )) {
219            
220             # calculate total occurances
221 518         743 my $total = 0;
222 518         1653 for ( my $i = 0; $i <= $#$objects; $i++ ) { $total += $counts[ $i ]{ $_ }}
  1036         4960  
223            
224             # update result
225 518         1869 $intersections{ $_ } = $total;
226            
227             }
228              
229             }
230              
231             # done
232 1         306573 return \%intersections;
233            
234             }
235              
236              
237             =head1 NAME
238              
239             Lingua::EN::Ngram - Extract n-grams from texts and list them according to frequency and/or T-Score
240              
241              
242             =head1 SYNOPSIS
243              
244             # initalize
245             use Lingua::EN::Ngram;
246             $ngram = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
247              
248             # calculate t-score; t-score is only available for bigrams
249             $tscore = $ngram->tscore;
250             foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
251              
252             print "$$tscore{ $_ }\t" . "$_\n";
253              
254             }
255              
256             # list trigrams according to frequency
257             $trigrams = $ngram->ngram( 3 );
258             foreach my $trigram ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
259              
260             print $$trigrams{ $trigram }, "\t$trigram\n";
261              
262             }
263              
264              
265             =head1 DESCRIPTION
266              
267             This module is designed to extract n-grams from texts and list them according to frequency and/or T-Score.
268              
269             To elaborate, the purpose of Lingua::EN::Ngram is to: 1) pull out all of the ngrams (multi-word phrases) in a given text, and 2) list these phrases according to their frequency. Using this module is it possible to create lists of the most common phrases in a text as well as order them by their probable occurance, thus implying significance. This process is useful for the purposes of textual analysis and "distant reading".
270              
271             The two-word phrases (bigrams) are also listable by their T-Score. The T-Score, as well as a number of the module's other methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
272              
273             Finally, the intersection method enables the developer to find ngrams common in an arbitrary number of texts. Use this to look for common themes across a corpus.
274              
275              
276             =head1 METHODS
277              
278              
279             =head2 new
280              
281             Create a new Lingua::EN::Ngram object:
282              
283             # initalize
284             $ngram = Lingua::EN::Ngram->new;
285              
286              
287             =head2 new( text => $scalar )
288              
289             Create a new Lingua::EN::Ngram object whose contents equal the content of a scalar:
290              
291             # initalize with scalar
292             $ngram = Lingua::EN::Ngram->new( text => 'All good things must come to an end...' );
293              
294              
295             =head2 new( file => $scalar )
296              
297             Create a new Lingua::EN::Ngram object whose contents equal the content of a file:
298              
299             # initalize with file
300             $ngram = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
301              
302              
303             =head2 text
304              
305             Set or get the text to be analyzed:
306              
307             # fill Lingua::EN::Ngram object with content
308             $ngram->text( 'All good things must come to an end...' );
309              
310             # get the Lingua::EN::Bigram object's content
311             $text = $ngram->text;
312              
313              
314             =head2 tscore
315              
316             Return a reference to a hash whose keys are a bigram and whose values are a T-Score -- a probabalistic calculation determining the significance of the bigram occuring in the text:
317              
318             # get t-score
319             $tscore = $ngrams->tscore;
320              
321             # list bigrams according to t-score
322             foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
323              
324             print "$$tscore{ $_ }\t" . "$_\n";
325              
326             }
327              
328             T-Score can only be computed against bigrams.
329              
330              
331             =head2 ngram( $scalar )
332              
333             Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in the text:
334              
335             # create a list of trigrams
336             $trigrams = $ngrams->ngram( 3 );
337              
338             # display frequency
339             foreach ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
340              
341             print $$trigrams{ $_ }, "\t$_\n";
342              
343             }
344              
345             This method requires a single parameter and that parameter must be an integer. For example, to get a list of bigrams, pass 2 to ngram. To get a list of quadgrams, pass 4.
346              
347              
348             =head2 intersection( corpus => [ @array ], length => $scalar )
349              
350             Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in a corpus of texts:
351              
352             # build corpus
353             $walden = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
354             $rivers = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
355             $corpus = Lingua::EN::Ngram->new;
356              
357             # compute intersections
358             $intersections = $corpus->intersection( corpus => [ ( $walden, $rivers ) ], length => 5 );
359              
360             # display frequency
361             foreach ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {
362              
363             print $$intersections{ $_ }, "\t$_\n";
364              
365             }
366              
367             The value of corpus must be an array reference, and each element must be Lingua::EN::Ngram objects. The value of length must be an integer.
368              
369              
370             =head1 DISCUSSION
371              
372             Given the increasing availability of full text materials, this module is intended to help "digital humanists" apply mathematical methods to the analysis of texts. For example, the developer can extract the high-frequency words using the ngram method and allow the user to search for those words in a concordance. The use of ngram( 2 ) simply returns the frequency of bigrams in a text, but the tscore method can order them in a more finely tuned manner.
373              
374             Consider using T-Score-weighted bigrams as classification terms to supplement the "aboutness" of texts. Concatonate many texts together and look for common phrases written by the author. Compare these commonly used phrases to the commonly used phrases of other authors.
375              
376             All ngrams return by the ngram method include punctuation. This is intentional. Developers may need want to remove ngrams containing such values from the output. Similarly, no effort has been made to remove commonly used words -- stop words -- from the methods. Consider the use of Lingua::StopWords, Lingua::EN::StopWords, or the creation of your own stop word list to make output more meaningful. The distribution came with a script (bin/ngrams.pl) demonstrating how to remove puncutation from the displayed output. Another script (bin/intesections.pl) demonstrates how to extract and count ngrams across two texts.
377              
378             Finally, this is not the only module supporting ngram extraction. See also Text::NSP.
379              
380              
381             =head1 TODO
382              
383             There are probably a number of ways the module can be improved:
384              
385             =over
386              
387             * the distribution's license should probably be changed to the Perl Aristic License
388              
389             * the addition of alternative T-Score calculations would be nice
390              
391             * make sure the module works with character sets beyond ASCII (done, I think, as of version 0.02)
392              
393             =back
394              
395              
396             =head1 CHANGES
397              
398             =over
399              
400             * September 12, 2010 (version 0.01) - initial release but an almost complete rewrite of Lingua::EN::Bigram
401              
402             * November 25, 2010 (version 0.02) - added non-Latin characters
403              
404             =back
405              
406             =head1 ACKNOWLEDGEMENTS
407              
408             T-Score, as well as a number of the module's methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
409              
410              
411             =head1 AUTHOR
412              
413             Eric Lease Morgan
414              
415             =cut
416              
417             # return true or die
418             1;