File Coverage

lib/Lingua/EN/Ngram.pm

Criterion	Covered	Total	%
statement	104	105	99.0
branch	27	28	96.4
condition	1	3	33.3
subroutine	9	9	100.0
pod	5	5	100.0
total	146	150	97.3

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Ngram;
2
3							# Ngram.pm - Extract and count words and phrases from texts
4
5							# Eric Lease Morgan
6							# September 12, 2010 - first investigations; based on Lingua::EN::Bigram
7							# November 25, 2010 - added non-Latin characters; Happy Thanksgiving!
8
9
10							# include
11	1			1		993	use strict;
	1					3
	1					51
12	1			1		5	use warnings;
	1					3
	1					572
13
14							# define
15							our $VERSION = '0.02';
16
17
18							sub new {
19
20							# get input
21	9			9	1	3491	my ( $class, %options ) = @_;
22
23							# initialize
24	9					16	my $self = {};
25
26							# process optional options
27	9	100				28	if ( %options ) {
28
29							# check for one and only one argument
30	7					19	my @keys = keys %options;
31	7	100				20	if ( scalar @keys != 1 ) { die "This method -- new -- can only take one and only one option (text or file)." }
	1					14
32
33							# initalize from text
34	6	100				22	if ( $options{ 'text' } ) { $self->{ text } = $options{ 'text' } }
	1	100				6
35
36							# initialize from file
37							elsif ( $options{ 'file' } ) {
38
39							# slurp
40	4					7	my $file = $options{ 'file' };
41	4	100				191	open F, $file or die "The file argument ($file) passed to this method is invalid: $!\n";
42	3					5	my $text = do { local $/; };
	3					13
	3					4500
43	3					64	close F;
44
45							# do the work
46	3					23	$self->{ text } = $text;
47
48							}
49
50							# invalid option
51	1					16	else { die "This method -- new -- can only take one option, text or file." }
52
53							}
54
55							# return
56	6					42	return bless $self, $class;
57
58							}
59
60
61							sub text {
62
63							# get input
64	11			11	1	39	my ( $self, $text ) = @_;
65
66							# set
67	11	100				34	if ( $text ) { $self->{ text } = $text }
	1					120
68
69							# return
70	11					64	return $self->{ text };
71
72							}
73
74
75							sub ngram {
76
77							# get input
78	8			8	1	8598	my ( $self, $n ) = @_;
79
80							# sanity check
81	8	100				31	if ( ! $n ) { die "This method -- ngram -- requires an integer as an argument." }
	1					13
82	7	100				41	if ( $n =~ /\D/ ) { die "This method -- ngram -- requires an integer as an argument." }
	1					9
83
84							# initalize
85	6					25	my $text = $self->text;
86	6					10497	$text =~ tr/a-zA-Zà-ƶÀ-Ƶ'()\-,.?!;:/\n/cs;
87	6					43248	$text =~ s/([,.?!:;()\-])/\n$1\n/g;
88	6					148688	$text =~ s/\n+/\n/g;
89	6					120336	my @words = split /\n/, lc( $text );
90
91	6					16737	my @ngrams = ();
92	6					32	my %count = ();
93	1			1		25	no warnings;
	1					2
	1					764
94
95							# process each word
96	6					57	for ( my $i = 0; $i <= $#words; $i++ ) {
97
98							# repeat n number of times
99	296023					321461	my $tokens = '';
100	296023					681420	for ( my $j = $i; $j < $i + $n; $j++ ) { $tokens .= $words[ $j ] . ' ' }
	1425737					3664717
101
102							# remove the trailing space
103	296023					328387	chop $tokens;
104
105							# build the ngram and count
106	296023					456593	$ngrams[ $i ] = $tokens;
107	296023					1469063	$count{ $ngrams[ $i ] }++;
108
109							}
110
111							# done
112	6					66921	return \%count;
113
114							}
115
116
117							sub tscore {
118
119							# get input
120	1			1	1	5291	my ( $self ) = shift;
121
122							# get all the words
123	1					7	my $text = $self->text;
124	1					348	$text =~ tr/a-zA-Z'()\-,.?!;:/\n/cs;
125	1					1425	$text =~ s/([,.?!:;()\-])/\n$1\n/g;
126	1					5239	$text =~ s/\n+/\n/g;
127	1					3615	my @words = split /\n/, lc( $text );
128
129							# count the words
130	1					364	my %word_count = ();
131	1					11	for ( my $i = 0; $i <= $#words; $i++ ) { $word_count{ $words[ $i ] }++ }
	9664					22745
132
133							# get all the bigrams
134	1					3	my @bigrams = ();
135	1					7	for ( my $i = 0; $i < $#words; $i++ ) {
136
137							# repeat n number of times
138	9663					9778	my $tokens = '';
139	9663					18959	for ( my $j = $i; $j < $i + 2; $j++ ) { $tokens .= $words[ $j ] . ' ' }
	19326					46229
140
141							# remove the trailing space
142	9663					9660	chop $tokens;
143
144							# build the ngram
145	9663					33502	$bigrams[ $i ] = $tokens;
146
147							}
148
149							# count the bigrams
150	1					7	my %bigram_count = ();
151	1					6	for ( my $i = 0; $i < $#words; $i++ ) { $bigram_count{ $bigrams[ $i ] }++ }
	9663					28613
152
153							# calculate t-score
154	1					5	my %tscore = ();
155	1					7	for ( my $i = 0; $i < $#words; $i++ ) {
156
157	9663					56504	$tscore{ $bigrams[ $i ] } = ( $bigram_count{ $bigrams[ $i ] } -
158							$word_count{ $words[ $i ] } *
159							$word_count{ $words[ $i + 1 ] } /
160							( $#words + 1 ) ) / sqrt( $bigram_count{ $bigrams[ $i ] } );
161
162							}
163
164							# done
165	1					5532	return \%tscore;
166
167							}
168
169
170							sub intersection {
171
172							# get input
173	5			5	1	1786	my ( $self, %options ) = @_;
174
175							# initialize
176	5					9	my %intersections = ();
177	5					8	my %ngrams = ();
178	5					8	my @counts = ();
179	5					8	my $objects = '';
180	5					6	my $length = 0;
181	1			1		7	no warnings;
	1					2
	1					431
182
183							# sanity checks
184	5	100				10	if ( ! %options ) { die 'This method -- interesection -- requires two options: corpus and length.' }
	1					18
185							else {
186
187	4	100	33			26	if ( scalar keys %options != 2 ) { die 'This method -- interesection -- requires two options: corpus and length.' }
	1	50				10
188	0					0	elsif ( ! $options{ 'corpus' } or ! $options{ 'length' } ) { die 'This method -- interesection -- requires two options: corpus and length.' }
189							else {
190
191	3					6	$objects = $options{ 'corpus' };
192	3	100				10	if ( ref( $objects ) ne 'ARRAY' ) { die 'The corpus option passed to the interesections method must be an array reference.' }
	1					7
193
194	2					5	$length = $options{ 'length' };
195	2	100				19	if ( $length =~ /\D/ ) { die "The length option passed to the intersections method mus be an integer." }
	1					13
196
197							}
198
199							}
200
201
202							# process each object
203	1					5	for ( my $i = 0; $i <= $#$objects; $i++ ) {
204
205							# count each ngram
206	2					18	my $count = $$objects[ $i ]->ngram( $length );
207	2					117806	foreach ( keys %$count ) { $ngrams{ $_ }++ }
	256466					596464
208
209							# save counts for later reference; all puns intended
210	2					50472	$counts[ $i ] = $count;
211
212							}
213
214							# process each ngram
215	1					207694	foreach ( keys %ngrams ) {
216
217							# check for intersection; ngram in all total number of objects
218	255948	100				781057	if ( $ngrams{ $_ } == ( $#$objects + 1 )) {
219
220							# calculate total occurances
221	518					743	my $total = 0;
222	518					1653	for ( my $i = 0; $i <= $#$objects; $i++ ) { $total += $counts[ $i ]{ $_ }}
	1036					4960
223
224							# update result
225	518					1869	$intersections{ $_ } = $total;
226
227							}
228
229							}
230
231							# done
232	1					306573	return \%intersections;
233
234							}
235
236
237							=head1 NAME
238
239							Lingua::EN::Ngram - Extract n-grams from texts and list them according to frequency and/or T-Score
240
241
242							=head1 SYNOPSIS
243
244							# initalize
245							use Lingua::EN::Ngram;
246							$ngram = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
247
248							# calculate t-score; t-score is only available for bigrams
249							$tscore = $ngram->tscore;
250							foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
251
252							print "$$tscore{ $_ }\t" . "$_\n";
253
254							}
255
256							# list trigrams according to frequency
257							$trigrams = $ngram->ngram( 3 );
258							foreach my $trigram ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
259
260							print $$trigrams{ $trigram }, "\t$trigram\n";
261
262							}
263
264
265							=head1 DESCRIPTION
266
267							This module is designed to extract n-grams from texts and list them according to frequency and/or T-Score.
268
269							To elaborate, the purpose of Lingua::EN::Ngram is to: 1) pull out all of the ngrams (multi-word phrases) in a given text, and 2) list these phrases according to their frequency. Using this module is it possible to create lists of the most common phrases in a text as well as order them by their probable occurance, thus implying significance. This process is useful for the purposes of textual analysis and "distant reading".
270
271							The two-word phrases (bigrams) are also listable by their T-Score. The T-Score, as well as a number of the module's other methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
272
273							Finally, the intersection method enables the developer to find ngrams common in an arbitrary number of texts. Use this to look for common themes across a corpus.
274
275
276							=head1 METHODS
277
278
279							=head2 new
280
281							Create a new Lingua::EN::Ngram object:
282
283							# initalize
284							$ngram = Lingua::EN::Ngram->new;
285
286
287							=head2 new( text => $scalar )
288
289							Create a new Lingua::EN::Ngram object whose contents equal the content of a scalar:
290
291							# initalize with scalar
292							$ngram = Lingua::EN::Ngram->new( text => 'All good things must come to an end...' );
293
294
295							=head2 new( file => $scalar )
296
297							Create a new Lingua::EN::Ngram object whose contents equal the content of a file:
298
299							# initalize with file
300							$ngram = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
301
302
303							=head2 text
304
305							Set or get the text to be analyzed:
306
307							# fill Lingua::EN::Ngram object with content
308							$ngram->text( 'All good things must come to an end...' );
309
310							# get the Lingua::EN::Bigram object's content
311							$text = $ngram->text;
312
313
314							=head2 tscore
315
316							Return a reference to a hash whose keys are a bigram and whose values are a T-Score -- a probabalistic calculation determining the significance of the bigram occuring in the text:
317
318							# get t-score
319							$tscore = $ngrams->tscore;
320
321							# list bigrams according to t-score
322							foreach ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {
323
324							print "$$tscore{ $_ }\t" . "$_\n";
325
326							}
327
328							T-Score can only be computed against bigrams.
329
330
331							=head2 ngram( $scalar )
332
333							Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in the text:
334
335							# create a list of trigrams
336							$trigrams = $ngrams->ngram( 3 );
337
338							# display frequency
339							foreach ( sort { $$trigrams{ $b } <=> $$trigrams{ $a } } keys %$trigrams ) {
340
341							print $$trigrams{ $_ }, "\t$_\n";
342
343							}
344
345							This method requires a single parameter and that parameter must be an integer. For example, to get a list of bigrams, pass 2 to ngram. To get a list of quadgrams, pass 4.
346
347
348							=head2 intersection( corpus => [ @array ], length => $scalar )
349
350							Return a hash reference whose keys are ngrams of length $scalar and whose values are the number of times the ngrams appear in a corpus of texts:
351
352							# build corpus
353							$walden = Lingua::EN::Ngram->new( file => './etc/walden.txt' );
354							$rivers = Lingua::EN::Ngram->new( file => './etc/rivers.txt' );
355							$corpus = Lingua::EN::Ngram->new;
356
357							# compute intersections
358							$intersections = $corpus->intersection( corpus => [ ( $walden, $rivers ) ], length => 5 );
359
360							# display frequency
361							foreach ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {
362
363							print $$intersections{ $_ }, "\t$_\n";
364
365							}
366
367							The value of corpus must be an array reference, and each element must be Lingua::EN::Ngram objects. The value of length must be an integer.
368
369
370							=head1 DISCUSSION
371
372							Given the increasing availability of full text materials, this module is intended to help "digital humanists" apply mathematical methods to the analysis of texts. For example, the developer can extract the high-frequency words using the ngram method and allow the user to search for those words in a concordance. The use of ngram( 2 ) simply returns the frequency of bigrams in a text, but the tscore method can order them in a more finely tuned manner.
373
374							Consider using T-Score-weighted bigrams as classification terms to supplement the "aboutness" of texts. Concatonate many texts together and look for common phrases written by the author. Compare these commonly used phrases to the commonly used phrases of other authors.
375
376							All ngrams return by the ngram method include punctuation. This is intentional. Developers may need want to remove ngrams containing such values from the output. Similarly, no effort has been made to remove commonly used words -- stop words -- from the methods. Consider the use of Lingua::StopWords, Lingua::EN::StopWords, or the creation of your own stop word list to make output more meaningful. The distribution came with a script (bin/ngrams.pl) demonstrating how to remove puncutation from the displayed output. Another script (bin/intesections.pl) demonstrates how to extract and count ngrams across two texts.
377
378							Finally, this is not the only module supporting ngram extraction. See also Text::NSP.
379
380
381							=head1 TODO
382
383							There are probably a number of ways the module can be improved:
384
385							=over
386
387							* the distribution's license should probably be changed to the Perl Aristic License
388
389							* the addition of alternative T-Score calculations would be nice
390
391							* make sure the module works with character sets beyond ASCII (done, I think, as of version 0.02)
392
393							=back
394
395
396							=head1 CHANGES
397
398							=over
399
400							* September 12, 2010 (version 0.01) - initial release but an almost complete rewrite of Lingua::EN::Bigram
401
402							* November 25, 2010 (version 0.02) - added non-Latin characters
403
404							=back
405
406							=head1 ACKNOWLEDGEMENTS
407
408							T-Score, as well as a number of the module's methods, is calculated as per Nugues, P. M. (2006). An introduction to language processing with Perl and Prolog: An outline of theories, implementation, and application with special consideration of English, French, and German. Cognitive technologies. Berlin: Springer.
409
410
411							=head1 AUTHOR
412
413							Eric Lease Morgan
414
415							=cut
416
417							# return true or die
418							1;