File Coverage

lib/Lingua/EN/Fathom.pm

Criterion	Covered	Total	%
statement	102	163	62.5
branch	18	32	56.2
condition	3	9	33.3
subroutine	17	26	65.3
pod	17	18	94.4
total	157	248	63.3

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::Fathom - Measure readability of English text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::Fathom;
8
9							my $text = Lingua::EN::Fathom->new();
10
11							$text->analyse_file("sample.txt"); # Analyse contents of a text file
12
13							$accumulate = 1;
14							$text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
15
16							# Methods to return statistics on the analysed text
17							$text->num_chars;
18							$text->num_words;
19							$text->percent_complex_words;
20							$text->num_sentences;
21							$text->num_text_lines;
22							$text->num_non_text_lines;
23							$text->num_blank_lines; # trailing EOLs are ignored
24							$text->num_paragraphs;
25							$text->syllables_per_word;
26							$text->words_per_sentence;
27							$text->unique_words;
28							$text->fog;
29							$text->flesch;
30							$text->kincaid;
31
32							# Call all of the above methods and present as a formatted report
33							print($text->report);
34
35							# get a hash of unique words, keyed by word and occurrence as the value
36							$text->unique_words
37
38							# Print a list of unique words
39							%words = $text->unique_words;
40							foreach $word ( sort keys %words )
41							{
42							print("$words{$word} :$word\n");
43							}
44
45
46							=head1 REQUIRES
47
48							Lingua::EN::Syllable, Lingua::EN::Sentence
49
50
51							=head1 DESCRIPTION
52
53							This module analyses English text in either a string or file. Totals are
54							then calculated for the number of characters, words, sentences, blank
55							and non blank (text) lines and paragraphs.
56
57							Three common readability statistics are also derived, the Fog, Flesch and
58							Kincaid indices.
59
60							All of these properties can be accessed through individual methods, or by
61							generating a text report.
62
63							A hash of all unique words and the number of times they occur is generated.
64
65
66							=head1 METHODS
67
68							=head2 new
69
70							The C method creates an instance of an text object This must be called
71							before any of the following methods are invoked. Note that the object only
72							needs to be created once, and can be reused with new input data.
73
74							my $text = Lingua::EN::Fathom->new();
75
76							=head2 analyse_file
77
78							The C method takes as input the name of a text file. Various
79							text based statistics are calculated for the file. This method and
80							C are prerequisites for all the following methods. An optional
81							argument may be supplied to control accumulation of statistics. If set to
82							a non zero value, all statistics are accumulated with each successive call.
83
84							$text->analyse_file("sample.txt");
85
86
87							=head2 analyse_block
88
89							The C method takes as input a text string. Various
90							text based statistics are calculated for the file. This method and
91							C are prerequisites for all the following methods. An optional
92							argument may be supplied to control accumulation of statistics. If set to
93							a non zero value, all statistics are accumulated with each successive call.
94
95							$text->analyse_block($text_str);
96
97							=head2 num_chars
98
99							Returns the number of characters in the analysed text file or block. This
100							includes characters such as spaces, and punctuation marks.
101
102							=head2 num_words
103
104							Returns the number of words in the analysed text file or block. A word must
105							consist of letters a-z with at least one vowel sound, and optionally an
106							apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
107
108							=head2 percent_complex_words
109
110							Returns the percentage of complex words in the analysed text file or block. A
111							complex word must consist of three or more syllables. This statistic is used to
112							calculate the fog index.
113
114							=head2 num_sentences
115
116							Returns the number of sentences in the analysed text file or block. A sentence
117							is any group of words and non words terminated with a single full stop. Spaces
118							may occur before and after the full stop.
119
120
121							=head2 num_text_lines
122
123							Returns the number of lines containing some text in the analysed
124							text file or block.
125
126							=head2 num_blank_lines
127
128							Returns the number of lines NOT containing any text in the analysed
129							text file or block.
130
131							=head2 num_paragraphs
132
133							Returns the number of paragraphs in the analysed text file or block.
134
135							=head2 syllables_per_word
136
137							Returns the average number of syllables per word in the analysed
138							text file or block.
139
140							=head2 words_per_sentence
141
142							Returns the average number of words per sentence in the analysed
143							text file or block.
144
145
146
147							=head2 READABILITY
148
149							Three indices of text readability are calculated. They all measure complexity as
150							a function of syllables per word and words per sentence. They assume the text is
151							well formed and logical. You could analyse a passage of nonsensical English and
152							find the readability is quite good, provided the words are not too complex and
153							the sentences not too long.
154
155							For more information see: L
156
157
158							=head2 fog
159
160							Returns the Fog index for the analysed text file or block.
161
162							( words_per_sentence + percent_complex_words ) * 0.4
163
164							The Fog index, developed by Robert Gunning, is a well known and simple
165							formula for measuring readability. The index indicates the number of years
166							of formal education a reader of average intelligence would need to read the
167							text once and understand that piece of writing with its word sentence workload.
168
169							18 unreadable
170							14 difficult
171							12 ideal
172							10 acceptable
173							8 childish
174
175
176							=head2 flesch
177
178							Returns the Flesch reading ease score for the analysed text file or block.
179
180							206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
181
182							This score rates text on a 100 point scale. The higher the score, the easier
183							it is to understand the text. A score of 60 to 70 is considered to be optimal.
184
185
186							=head2 kincaid
187
188							Returns the Flesch-Kincaid grade level score for the analysed text
189							file or block.
190
191							(11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
192
193							This score rates text on U.S. grade school level. So a score of 8.0 means
194							that the document can be understood by an eighth grader. A score of 7.0 to
195							8.0 is considered to be optimal.
196
197							=head2 unique_words
198
199							Returns a hash of unique words. The words (in lower case) are held in
200							the hash keys while the number of occurrences are held in the hash values.
201
202
203							=head2 report
204
205							print($text->report);
206
207							Produces a text based report containing all Fathom statistics for
208							the currently analysed text block or file. For example:
209
210							Number of characters : 813
211							Number of words : 135
212							Percent of complex words : 20.00
213							Average syllables per word : 1.7704
214							Number of sentences : 12
215							Average words per sentence : 11.2500
216							Number of text lines : 13
217							Number of blank lines : 8
218							Number of paragraphs : 4
219
220
221							READABILITY INDICES
222
223							Fog : 12.5000
224							Flesch : 45.6429
225							Flesch-Kincaid : 9.6879
226
227							The return value is a string containing the report contents
228
229
230							=head1 SEE ALSO
231
232							L,L,L
233
234
235							=head1 POSSIBLE EXTENSIONS
236
237							Count white space and punctuation characters
238							Allow user control over what strictly defines a word
239
240							=head1 LIMITATIONS
241
242							The syllable count provided in Lingua::EN::Syllable is about 90% accurate
243							Acronyms that contain vowels, like GPO, will be counted as words.
244							The fog index should exclude proper names
245
246
247
248							=head1 AUTHOR
249
250							Lingua::EN::Fathom was written by Kim Ryan .
251
252							=head1 COPYRIGHT AND LICENSE
253
254							Copyright (c) 2023 Kim Ryan. All rights reserved.
255
256							This library is free software; you can redistribute it and/or modify
257							it under the same terms as Perl itself.
258
259							=cut
260
261							#------------------------------------------------------------------------------
262
263							package Lingua::EN::Fathom;
264
265	1			1		78671	use Lingua::EN::Syllable;
	1					544
	1					63
266	1			1		555	use Lingua::EN::Sentence;
	1					18078
	1					50
267	1			1		7	use strict;
	1					2
	1					34
268	1			1		6	use warnings;
	1					4
	1					1839
269
270							our $VERSION = '1.24';
271
272							#------------------------------------------------------------------------------
273							# Create a new instance of a text object.
274
275							sub new
276							{
277	1			1	1	93	my $class = shift;
278
279	1					4	my $text = {};
280	1					3	bless($text,$class);
281	1					3	$text = &_initialize($text);
282	1					2	return($text);
283							}
284							#------------------------------------------------------------------------------
285							# Analyse text stored in a file, reading from the file one line at a time
286
287							sub analyse_file
288							{
289	0			0	1	0	my $text = shift;
290	0					0	my ($file_name,$accumulate) = @_;
291
292	0	0				0	unless ( $accumulate )
293							{
294	0					0	$text = _initialize($text);
295							}
296
297	0					0	$text->{file_name} = $file_name;
298
299							# Only analyse non-empty text files
300	0	0	0			0	unless ( -T $file_name and -s $file_name )
301							{
302	0					0	return($text);
303							}
304
305	0					0	open(IN_FH,"<$file_name");
306
307	0					0	my $in_paragraph = 0;
308	0					0	my $all_text;
309	0					0	while ( )
310							{
311	0					0	my $one_line = $_;
312	0					0	$all_text .= $one_line;
313	0					0	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
314							}
315	0					0	close(IN_FH);
316
317	0					0	my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
318	0					0	$text->{num_sentences} = scalar(@$sentences);
319	0					0	$text->_calculate_readability;
320
321	0					0	return($text);
322							}
323							#------------------------------------------------------------------------------
324							# Analyse a block of text, stored as a string. The string may contain line
325							# terminators.
326
327							sub analyse_block
328							{
329	1			1	1	5	my $text = shift;
330	1					3	my ($block,$accumulate) = @_;
331
332	1	50				4	unless ( $accumulate )
333							{
334	1					3	$text = _initialize($text);
335							}
336
337	1	50				4	unless ( $block )
338							{
339	0					0	return($text);
340							}
341
342	1					2	my $in_paragraph = 0;
343
344							# Split on EOL character
345							# repeating trailing line terminators are stripped
346	1					6	my @all_lines = split(/\n/,$block);
347	1					3	my $one_line;
348	1					24	foreach $one_line ( @all_lines )
349							{
350	9					15	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
351							}
352
353	1					5	my $sentences= Lingua::EN::Sentence::get_sentences($block);
354	1	50				3761	if (defined($sentences))
355							{
356	1					5	$text->{num_sentences} = scalar(@$sentences);
357							}
358
359
360	1					8	$text->_calculate_readability;
361
362	1					4	return($text);
363							}
364							#------------------------------------------------------------------------------
365							sub num_chars
366							{
367	1			1	1	6	my $text = shift;
368	1					7	return($text->{num_chars});
369							}
370							#------------------------------------------------------------------------------
371							sub num_words
372							{
373	1			1	1	5	my $text = shift;
374	1					4	return($text->{num_words});
375							}
376							#------------------------------------------------------------------------------
377							sub percent_complex_words
378							{
379	0			0	1	0	my $text = shift;
380	0					0	return($text->{percent_complex_words});
381							}
382							#------------------------------------------------------------------------------
383							sub num_sentences
384							{
385	1			1	1	2	my $text = shift;
386	1					5	return($text->{num_sentences});
387							}
388							#------------------------------------------------------------------------------
389							sub num_text_lines
390							{
391	1			1	1	3	my $text = shift;
392	1					4	return($text->{num_text_lines});
393							}
394							#------------------------------------------------------------------------------
395							sub num_non_text_lines
396							{
397	1			1	0	7	my $text = shift;
398	1					5	return($text->{num_non_text_lines});
399							}
400							#------------------------------------------------------------------------------
401							sub num_blank_lines
402							{
403	1			1	1	2	my $text = shift;
404	1					7	return($text->{num_blank_lines});
405							}
406							#------------------------------------------------------------------------------
407							sub num_paragraphs
408							{
409	1			1	1	3	my $text = shift;
410	1					4	return($text->{num_paragraphs});
411							}
412							#------------------------------------------------------------------------------
413							sub syllables_per_word
414							{
415	0			0	1	0	my $text = shift;
416	0					0	return($text->{syllables_per_word});
417							}
418							#------------------------------------------------------------------------------
419							sub words_per_sentence
420							{
421	0			0	1	0	my $text = shift;
422	0					0	return($text->{words_per_sentence});
423							}
424							#------------------------------------------------------------------------------
425							sub fog
426							{
427	0			0	1	0	my $text = shift;
428	0					0	return($text->{fog});
429							}
430							#------------------------------------------------------------------------------
431							sub flesch
432							{
433	0			0	1	0	my $text = shift;
434	0					0	return($text->{flesch});
435							}
436							#------------------------------------------------------------------------------
437							sub kincaid
438							{
439	0			0	1	0	my $text = shift;
440	0					0	return($text->{kincaid});
441							}
442							#------------------------------------------------------------------------------
443							# Return anonymous hash of all the unique words in analysed text. The words
444							# occurrence count is stored in the hash value.
445
446							sub unique_words
447							{
448	0			0	1	0	my $text = shift;
449	0	0				0	if ( $text->{unique_words} )
450							{
451	0					0	return( %{ $text->{unique_words} } );
	0					0
452							}
453							else
454							{
455	0					0	return(undef);
456							}
457							}
458							#------------------------------------------------------------------------------
459							# Provide a formatted text report of all statistics for a text object.
460							# Return report as a string.
461
462							sub report
463							{
464	0			0	1	0	my $text = shift;
465	0					0	my $report = '';
466
467
468							$text->{file_name} and
469	0	0				0	$report .= sprintf("File name : %s\n",$text->{file_name} );
470
471	0					0	$report .= sprintf("Number of characters : %d\n", $text->num_chars);
472	0					0	$report .= sprintf("Number of words : %d\n", $text->num_words);
473	0					0	$report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
474	0					0	$report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
475	0					0	$report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
476	0					0	$report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
477	0					0	$report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
478	0					0	$report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
479	0					0	$report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
480	0					0	$report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
481
482	0					0	$report .= "\n\nREADABILITY INDICES\n\n";
483	0					0	$report .= sprintf("Fog : %.4f\n",$text->fog);
484	0					0	$report .= sprintf("Flesch : %.4f\n",$text->flesch);
485	0					0	$report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
486
487	0					0	return($report);
488							}
489
490							#------------------------------------------------------------------------------
491							# PRIVATE METHODS
492							#------------------------------------------------------------------------------
493							sub _initialize
494							{
495	2			2		3	my $text = shift;
496
497	2					9	$text->{num_chars} = 0;
498	2					3	$text->{num_syllables} = 0;
499	2					4	$text->{num_words} = 0;
500	2					3	$text->{num_complex_words} = 0;
501	2					3	$text->{syllables_per_word} = 0;
502	2					3	$text->{words_per_sentence} = 0;
503	2					3	$text->{percent_complex_words} = 0;
504	2					5	$text->{num_text_lines} = 0;
505	2					3	$text->{num_non_text_lines} = 0;
506	2					2	$text->{num_blank_lines} = 0;
507	2					4	$text->{num_paragraphs} = 0;
508	2					3	$text->{num_sentences} = 0;
509	2					3	$text->{unique_words} = ();
510	2					3	$text->{file_name} = '';
511
512	2					3	$text->{fog} = 0;
513	2					4	$text->{flesch} = 0;
514	2					3	$text->{kincaid} = 0;
515
516	2					4	return($text);
517							}
518							#------------------------------------------------------------------------------
519							# Increment number of text lines, blank lines and paragraphs
520
521							sub _analyse_line
522							{
523	9			9		12	my $text = shift;
524
525	9					17	my ($one_line,$in_paragraph) = @_;
526	9	100				33	if ( $one_line =~ /\w/ )
		100
		50
527							{
528	6					11	chomp($one_line);
529	6					11	$text = _analyse_words($text,$one_line);
530	6					10	$text->{num_text_lines}++;
531
532	6	100				14	unless ( $in_paragraph )
533							{
534	2					3	$text->{num_paragraphs}++;
535	2					3	$in_paragraph = 1;
536							}
537							}
538							elsif ($one_line eq '' ) # empty line
539							{
540	2					3	$text->{num_blank_lines}++;
541	2					3	$in_paragraph = 0;
542							}
543							elsif ($one_line =~ /^\W+$/ ) # non text
544							{
545	1					3	$text->{num_non_text_lines}++;
546	1					1	$in_paragraph = 0;
547							}
548	9					24	return($in_paragraph,$text);
549							}
550							#------------------------------------------------------------------------------
551							# Try to detect real words in line. Increment syllable, word, and complex word counters.
552
553							sub _analyse_words
554							{
555	6			6		8	my $text = shift;
556	6					8	my ($one_line) = @_;
557
558	6					10	$text->{num_chars} += length($one_line);
559
560							# Word found, such as: twice, BOTH, a, I'd, non-plussed ..
561
562							# Ignore words like 'Mr.', K12, &, X.Y.Z ...
563							# It could be argued that Mr. is a word, but this approach should detect most of the non words
564							# which have punctuation or numbers in them
565
566	6					23	while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
567							{
568	57					104	my $one_word = $1;
569
570							# Try to filter out acronyms and abbreviations by accepting
571							# words with a vowel sound. This won't work for GPO etc.
572	57	100				126	next unless $one_word =~ /[aeiouy]/i;
573
574							# Test for valid hyphenated word like be-bop
575	55	100				98	if ( $one_word =~ /-/ )
576							{
577	1	50				15	next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
578							}
579
580							# word frequency count
581	54					152	$text->{unique_words}{lc($one_word)}++;
582
583	54					57	$text->{num_words}++;
584
585							# Use subroutine from Lingua::EN::Syllable
586	54					90	my $num_syllables_current_word = syllable($one_word);
587	54					10019	$text->{num_syllables} += $num_syllables_current_word;
588
589							# Required for Fog index, count non hyphenated words of 3 or more
590							# syllables. Should add check for proper names in here as well
591	54	100	66			245	if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
592							{
593	4					18	$text->{num_complex_words}++;
594							}
595							}
596
597	6					17	return($text);
598							}
599							#------------------------------------------------------------------------------
600							# Determine the three readability indices
601
602							sub _calculate_readability
603							{
604	1			1		4	my $text = shift;
605
606	1	50	33			7	if ( $text->{num_sentences} and $text->{num_words} )
607							{
608	1					6	$text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
609	1					4	$text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
610							$text->{percent_complex_words} =
611	1					5	( $text->{num_complex_words} / $text->{num_words} ) * 100;
612
613	1					3	$text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
614
615							$text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
616	1					4	(84.6 * $text->{syllables_per_word});
617
618							$text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
619	1					5	(0.39 * $text->{words_per_sentence}) - 15.59;
620							}
621							else
622							{
623	0						$text->{words_per_sentence} = 0;
624	0						$text->{syllables_per_word} = 0;
625	0						$text->{num_complex_words} = 0;
626	0						$text->{fog} = 0;
627	0						$text->{flesch} = 0;
628	0						$text->{kincaid} = 0;
629							}
630							}
631							#------------------------------------------------------------------------------
632							return(1);