File Coverage

lib/Lingua/EN/Ngram.pm

Criterion	Covered	Total	%
statement	104	105	99.0
branch	27	28	96.4
condition	1	3	33.3
subroutine	9	9	100.0
pod	5	5	100.0
total	146	150	97.3

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Ngram;
2
3							# Ngram.pm - Extract and count words and phrases from texts
4
5							# Eric Lease Morgan
6							# September 12, 2010 - first investigations; based on Lingua::EN::Bigram
7							# November 25, 2010 - added non-Latin characters; Happy Thanksgiving!
8							# March 28, 2018 - removed lower-casing the text; why did I do lower case previously?
9
10
11							# include
12	1			1		553	use strict;
	1					1
	1					22
13	1			1		4	use warnings;
	1					1
	1					349
14
15							# define
16							our $VERSION = '0.03';
17
18
19							sub new {
20
21							# get input
22	9			9	1	2321	my ( $class, %options ) = @_;
23
24							# initialize
25	9					12	my $self = {};
26
27							# process optional options
28	9	100				20	if ( %options ) {
29
30							# check for one and only one argument
31	7					13	my @keys = keys %options;
32	7	100				13	if ( scalar @keys != 1 ) { die "This method -- new -- can only take one and only one option (text or file)." }
	1					8
33
34							# initalize from text
35	6	100				15	if ( $options{ 'text' } ) { $self->{ text } = $options{ 'text' } }
	1	100				2
36
37							# initialize from file
38							elsif ( $options{ 'file' } ) {
39
40							# slurp
41	4					5	my $file = $options{ 'file' };
42	4	100				128	open F, $file or die "The file argument ($file) passed to this method is invalid: $!\n";
43	3					7	my $text = do { local $/; };
	3					10
	3					2699
44	3					26	close F;
45
46							# do the work
47	3					13	$self->{ text } = $text;
48
49							}
50
51							# invalid option
52	1					8	else { die "This method -- new -- can only take one option, text or file." }
53
54							}
55
56							# return
57	6					16	return bless $self, $class;
58
59							}
60
61
62							sub text {
63
64							# get input
65	11			11	1	25	my ( $self, $text ) = @_;
66
67							# set
68	11	100				17	if ( $text ) { $self->{ text } = $text }
	1					53
69
70							# return
71	11					53	return $self->{ text };
72
73							}
74
75
76							sub ngram {
77
78							# get input
79	8			8	1	3506	my ( $self, $n ) = @_;
80
81							# sanity check
82	8	100				18	if ( ! $n ) { die "This method -- ngram -- requires an integer as an argument." }
	1					18
83	7	100				25	if ( $n =~ /\D/ ) { die "This method -- ngram -- requires an integer as an argument." }
	1					8
84
85							# initalize
86	6					10	my $text = $self->text;
87	6					6783	$text =~ tr/a-zA-Zà-ƶÀ-Ƶ'()\-,.?!;:/\n/cs;
88	6					25374	$text =~ s/([,.?!:;()\-])/\n$1\n/g;
89	6					77682	$text =~ s/\n+/\n/g;
90	6					41182	my @words = split /\n/, $text;
91
92	6					25	my @ngrams = ();
93	6					8	my %count = ();
94	1			1		6	no warnings;
	1					1
	1					401
95
96							# process each word
97	6					22	for ( my $i = 0; $i <= $#words; $i++ ) {
98
99							# repeat n number of times
100	296023					232698	my $tokens = '';
101	296023					328889	for ( my $j = $i; $j < $i + $n; $j++ ) { $tokens .= $words[ $j ] . ' ' }
	1425737					1768998
102
103							# remove the trailing space
104	296023					219700	chop $tokens;
105
106							# build the ngram and count
107	296023					271561	$ngrams[ $i ] = $tokens;
108	296023					638086	$count{ $ngrams[ $i ] }++;
109
110							}
111
112							# done
113	6					38499	return \%count;
114
115							}
116
117
118							sub tscore {
119
120							# get input
121	1			1	1	1038	my ( $self ) = shift;
122
123							# get all the words
124	1					3	my $text = $self->text;
125	1					231	$text =~ tr/a-zA-Z'()\-,.?!;:/\n/cs;
126	1					868	$text =~ s/([,.?!:;()\-])/\n$1\n/g;
127	1					2643	$text =~ s/\n+/\n/g;
128	1					1128	my @words = split /\n/, $text;
129
130							# count the words
131	1					4	my %word_count = ();
132	1					14	for ( my $i = 0; $i <= $#words; $i++ ) { $word_count{ $words[ $i ] }++ }
	9664					12455
133
134							# get all the bigrams
135	1					2	my @bigrams = ();
136	1					3	for ( my $i = 0; $i < $#words; $i++ ) {
137
138							# repeat n number of times
139	9663					7469	my $tokens = '';
140	9663					11353	for ( my $j = $i; $j < $i + 2; $j++ ) { $tokens .= $words[ $j ] . ' ' }
	19326					24240
141
142							# remove the trailing space
143	9663					6907	chop $tokens;
144
145							# build the ngram
146	9663					13121	$bigrams[ $i ] = $tokens;
147
148							}
149
150							# count the bigrams
151	1					2	my %bigram_count = ();
152	1					3	for ( my $i = 0; $i < $#words; $i++ ) { $bigram_count{ $bigrams[ $i ] }++ }
	9663					12997
153
154							# calculate t-score
155	1					3	my %tscore = ();
156	1					4	for ( my $i = 0; $i < $#words; $i++ ) {
157
158							$tscore{ $bigrams[ $i ] } = ( $bigram_count{ $bigrams[ $i ] } -
159							$word_count{ $words[ $i ] } *
160							$word_count{ $words[ $i + 1 ] } /
161	9663					23934	( $#words + 1 ) ) / sqrt( $bigram_count{ $bigrams[ $i ] } );
162
163							}
164
165							# done
166	1					1629	return \%tscore;
167
168							}
169
170
171							sub intersection {
172
173							# get input
174	5			5	1	1057	my ( $self, %options ) = @_;
175
176							# initialize
177	5					7	my %intersections = ();
178	5					6	my %ngrams = ();
179	5					6	my @counts = ();
180	5					5	my $objects = '';
181	5					6	my $length = 0;
182	1			1		6	no warnings;
	1					1
	1					264
183
184							# sanity checks
185	5	100				10	if ( ! %options ) { die 'This method -- interesection -- requires two options: corpus and length.' }
	1					9
186							else {
187
188	4	100	33			21	if ( scalar keys %options != 2 ) { die 'This method -- interesection -- requires two options: corpus and length.' }
	1	50				8
189	0					0	elsif ( ! $options{ 'corpus' } or ! $options{ 'length' } ) { die 'This method -- interesection -- requires two options: corpus and length.' }
190							else {
191
192	3					5	$objects = $options{ 'corpus' };
193	3	100				7	if ( ref( $objects ) ne 'ARRAY' ) { die 'The corpus option passed to the interesections method must be an array reference.' }
	1					8
194
195	2					3	$length = $options{ 'length' };
196	2	100				11	if ( $length =~ /\D/ ) { die "The length option passed to the intersections method mus be an integer." }
	1					8
197
198							}
199
200							}
201
202
203							# process each object
204	1					4	for ( my $i = 0; $i <= $#$objects; $i++ ) {
205
206							# count each ngram
207	2					10	my $count = $$objects[ $i ]->ngram( $length );
208	2					80583	foreach ( keys %$count ) { $ngrams{ $_ }++ }
	256558					344962
209
210							# save counts for later reference; all puns intended
211	2					20792	$counts[ $i ] = $count;
212
213							}
214
215							# process each ngram
216	1					73876	foreach ( keys %ngrams ) {
217
218							# check for intersection; ngram in all total number of objects
219	256066	100				368590	if ( $ngrams{ $_ } == ( $#$objects + 1 )) {
220
221							# calculate total occurances
222	492					424	my $total = 0;
223	492					610	for ( my $i = 0; $i <= $#$objects; $i++ ) { $total += $counts[ $i ]{ $_ }}
	984					1734
224
225							# update result
226	492					848	$intersections{ $_ } = $total;
227
228							}
229
230							}
231
232							# done
233	1					137407	return \%intersections;
234
235							}
236
237
238							=head1 NAME
239
240							Lingua::EN::Ngram - Extract n-grams from texts and list them according to frequency and/or T-Score
241
242
243							=head1 SYNOPSIS
244
245							# initalize
246							use Lingua::EN::Ngram;
247							$ngram = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
248
249							# calculate t-score; t-score is only available for bigrams
250							$tscore = $ngram->tscore;
251							foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
252
253							print "$$tscore{ $_ }\t" . "$_\n";
254
255							}
256
257							# list trigrams according to frequency
258							$trigrams = $ngram->ngram( 3 );
259							foreach my $trigram ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
260
261							print $$trigrams{ $trigram }, "\t$trigram\n";
262
263							}
264
265
266							=head1 DESCRIPTION
267
268							This module is designed to extract n-grams from texts and list them according to frequency and/or T-Score.
269
270							To elaborate, the purpose of Lingua::EN::Ngram is to: 1) pull out all of the ngrams (multi-word phrases) in a given text, and 2) list these phrases according to their frequency. Using this module is it possible to create lists of the most common phrases in a text as well as order them by their probable occurance, thus implying significance. This process is useful for the purposes of textual analysis and "distant reading".
271
272							The two-word phrases (bigrams) are also listable by their T-Score. The T-Score, as well as a number of the module's other methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
273
274							Finally, the intersection method enables the developer to find ngrams common in an arbitrary number of texts. Use this to look for common themes across a corpus.
275
276
277							=head1 METHODS
278
279
280							=head2 new
281
282							Create a new Lingua::EN::Ngram object:
283
284							# initalize
285							$ngram = Lingua::EN::Ngram->new;
286
287
288							=head2 new( text => $scalar )
289
290							Create a new Lingua::EN::Ngram object whose contents equal the content of a scalar:
291
292							# initalize with scalar
293							$ngram = Lingua::EN::Ngram->new( text => 'All good things must come to an end...' );
294
295
296							=head2 new( file => $scalar )
297
298							Create a new Lingua::EN::Ngram object whose contents equal the content of a file:
299
300							# initalize with file
301							$ngram = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
302
303
304							=head2 text
305
306							Set or get the text to be analyzed:
307
308							# fill Lingua::EN::Ngram object with content
309							$ngram->text( 'All good things must come to an end...' );
310
311							# get the Lingua::EN::Bigram object's content
312							$text = $ngram->text;
313
314
315							=head2 tscore
316
317							Return a reference to a hash whose keys are a bigram and whose values are a T-Score -- a probabalistic calculation determining the significance of the bigram occuring in the text:
318
319							# get t-score
320							$tscore = $ngrams->tscore;
321
322							# list bigrams according to t-score
323							foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
324
325							print "$$tscore{ $_ }\t" . "$_\n";
326
327							}
328
329							T-Score can only be computed against bigrams.
330
331
332							=head2 ngram( $scalar )
333
334							Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in the text:
335
336							# create a list of trigrams
337							$trigrams = $ngrams->ngram( 3 );
338
339							# display frequency
340							foreach ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
341
342							print $$trigrams{ $_ }, "\t$_\n";
343
344							}
345
346							This method requires a single parameter and that parameter must be an integer. For example, to get a list of bigrams, pass 2 to ngram. To get a list of quadgrams, pass 4.
347
348
349							=head2 intersection( corpus => [ @array ], length => $scalar )
350
351							Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in a corpus of texts:
352
353							# build corpus
354							$walden = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
355							$rivers = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
356							$corpus = Lingua::EN::Ngram->new;
357
358							# compute intersections
359							$intersections = $corpus->intersection( corpus => [ ( $walden, $rivers ) ], length => 5 );
360
361							# display frequency
362							foreach ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {
363
364							print $$intersections{ $_ }, "\t$_\n";
365
366							}
367
368							The value of corpus must be an array reference, and each element must be Lingua::EN::Ngram objects. The value of length must be an integer.
369
370
371							=head1 DISCUSSION
372
373							Given the increasing availability of full text materials, this module is intended to help "digital humanists" apply mathematical methods to the analysis of texts. For example, the developer can extract the high-frequency words using the ngram method and allow the user to search for those words in a concordance. The use of ngram( 2 ) simply returns the frequency of bigrams in a text, but the tscore method can order them in a more finely tuned manner.
374
375							Consider using T-Score-weighted bigrams as classification terms to supplement the "aboutness" of texts. Concatonate many texts together and look for common phrases written by the author. Compare these commonly used phrases to the commonly used phrases of other authors.
376
377							All ngrams return by the ngram method include punctuation. This is intentional. Developers may need want to remove ngrams containing such values from the output. Similarly, no effort has been made to remove commonly used words -- stop words -- from the methods. Consider the use of Lingua::StopWords, Lingua::EN::StopWords, or the creation of your own stop word list to make output more meaningful. The distribution came with a script (bin/ngrams.pl) demonstrating how to remove puncutation from the displayed output. Another script (bin/intesections.pl) demonstrates how to extract and count ngrams across two texts.
378
379							Finally, this is not the only module supporting ngram extraction. See also Text::NSP.
380
381
382							=head1 TODO
383
384							There are probably a number of ways the module can be improved:
385
386							=over
387
388							* the distribution's license should probably be changed to the Perl Aristic License
389
390							* the addition of alternative T-Score calculations would be nice
391
392							* make sure the module works with character sets beyond ASCII (done, I think, as of version 0.02)
393
394							=back
395
396
397							=head1 CHANGES
398
399							=over
400
401							* March 28, 2018 (version 0.03) - removed lower casing of letters and install ngrams script
402
403							* November 25, 2010 (version 0.02) - added non-Latin characters
404
405							* September 12, 2010 (version 0.01) - initial release but an almost complete rewrite of Lingua::EN::Bigram
406
407
408
409							=back
410
411							=head1 ACKNOWLEDGEMENTS
412
413							T-Score, as well as a number of the module's methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
414
415
416							=head1 AUTHOR
417
418							Eric Lease Morgan
419
420							=cut
421
422							# return true or die
423							1;