File Coverage

lib/Lingua/EN/Fathom.pm

Criterion	Covered	Total	%
statement	102	167	61.0
branch	18	32	56.2
condition	3	9	33.3
subroutine	17	28	60.7
pod	18	20	90.0
total	158	256	61.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::Fathom - Measure readability of English text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::Fathom;
8
9							my $text = Lingua::EN::Fathom->new();
10
11							$text->analyse_file("sample.txt"); # Analyse contents of a text file
12
13							$accumulate = 1;
14							my $text_string = q{
15							Returns the number of words in the analysed text file or block. A word must
16							consist of letters a-z with at least one vowel sound, and optionally an
17							apostrophe or hyphen.
18
19							##########################################
20							Items such as "&, K108, NSW" are not counted as words.
21							Common abbreviations such a U.S. or numbers like 1.23 will not denote the end of
22							a sentence.
23							};
24
25							$text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
26
27							print($text->report); # Create a formatted report
28
29							Number of characters : 312
30							Number of words : 54
31							Percent of complex words : 7.41
32							Average syllables per word : 1.4259
33							Number of sentences : 4
34							Average words per sentence : 13.5000
35							Number of text lines : 6
36							Number of non-text lines : 1
37							Number of blank lines : 2
38							Number of paragraphs : 2
39
40							READABILITY INDICES
41
42							Fog : 8.3630
43							Flesch : 72.4992
44							Flesch-Kincaid : 6.5009
45
46							# Methods to return statistics on the analysed text
47							$text->num_chars;
48							$text->num_words;
49							$text->percent_complex_words;
50							$text->num_sentences;
51							$text->num_text_lines;
52							$text->num_non_text_lines;
53							$text->num_blank_lines; # trailing EOLs are ignored
54							$text->num_paragraphs;
55							$text->syllables_per_word;
56							$text->words_per_sentence;
57							$text->unique_words;
58							$text->fog;
59							$text->flesch;
60							$text->kincaid;
61
62							# get a hash of unique words, keyed by word and occurrence as the value
63							$text->unique_words
64
65							# Print a list of unique words
66							%words = $text->unique_words;
67							foreach $word ( sort keys %words )
68							{
69							print("$words{$word} :$word\n");
70							}
71
72							=head1 REQUIRES
73
74							Lingua::EN::Syllable, Lingua::EN::Sentence
75
76
77							=head1 DESCRIPTION
78
79							This module analyses English text in either a string or file. Totals are
80							then calculated for the number of characters, words, sentences, blank
81							and non blank (text) lines and paragraphs.
82
83							Three common readability statistics are also derived, the Fog, Flesch and
84							Kincaid indices.
85
86							All of these properties can be accessed through individual methods, or by
87							generating a text report.
88
89							A hash of all unique words and the number of times they occur is generated.
90
91
92							=head1 METHODS
93
94							=head2 new
95
96							The C<new> method creates an instance of a text object. This must be called
97							before any of the following methods are invoked. Note that the object only
98							needs to be created once, and can be reused with new input data.
99
100							my $text = Lingua::EN::Fathom->new();
101
102							=head2 analyse_file
103
104							The C<analyse_file> method takes as input the name of a text file. Various
105							text based statistics are calculated for the file. This method and
106							C<analyse_block> are prerequisites for all the following methods. An optional
107							argument may be supplied to control accumulation of statistics. If set to
108							a non zero value, all statistics are accumulated with each successive call.
109
110							$text->analyse_file("sample.txt");
111
112
113							=head2 analyse_block
114
115							The C<analyse_block> method takes as input a text string. Various
116							text based statistics are calculated for the string. This method and
117							C<analyse_file> are prerequisites for all the following methods. An optional
118							argument may be supplied to control accumulation of statistics. If set to
119							a non zero value, all statistics are accumulated with each successive call.
120
121							$text->analyse_block($text_str,$accumulate);
122
123							=head2 num_chars
124
125							Returns the number of characters in the analysed text file or block. This
126							includes characters such as spaces, and punctuation marks.
127
128							=head2 num_words
129
130							Returns the number of words in the analysed text file or block. A word must
131							consist of letters a-z with at least one vowel sound, and optionally an
132							apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
133
134							=head2 percent_complex_words
135
136							Returns the percentage of complex words in the analysed text file or block. A
137							complex word must consist of three or more syllables. This statistic is used to
138							calculate the fog index.
139
140							=head2 num_sentences
141
142							Returns the number of sentences in the analysed text file or block. A sentence
143							is any group of words and non words terminated with a single full stop. Spaces
144							may occur before and after the full stop.
145
146							=head2 num_text_lines
147
148							Returns the number of lines containing some text in the analysed
149							text file or block.
150
151							=head2 num_non_text_lines
152
153							Returns the number of lines containing no text in the analysed
154							text file or block.
155
156							=head2 num_blank_lines
157
158							Returns the number of empty lines in the analysed
159							text file or block.
160
161							=head2 num_paragraphs
162
163							Returns the number of paragraphs in the analysed text file or block.
164
165							=head2 syllables_per_word
166
167							Returns the average number of syllables per word in the analysed
168							text file or block.
169
170							=head2 words_per_sentence
171
172							Returns the average number of words per sentence in the analysed
173							text file or block.
174
175
176							=head2 READABILITY
177
178							Three indices of text readability are calculated. They all measure complexity as
179							a function of syllables per word and words per sentence. They assume the text is
180							well formed and logical. You could analyse a passage of nonsensical English and
181							find the readability is quite good, provided the words are not too complex and
182							the sentences not too long.
183
184							For more information see: L
185
186
187							=head2 fog
188
189							Returns the Fog index for the analysed text file or block.
190
191							( words_per_sentence + percent_complex_words ) * 0.4
192
193							The Fog index, developed by Robert Gunning, is a well known and simple
194							formula for measuring readability. The index indicates the number of years
195							of formal education a reader of average intelligence would need to read the
196							text once and understand that piece of writing with its word sentence workload.
197
198							18 unreadable
199							14 difficult
200							12 ideal
201							10 acceptable
202							8 childish
203
204
205							=head2 flesch
206
207							Returns the Flesch reading ease score for the analysed text file or block.
208
209							206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
210
211							This score rates text on a 100 point scale. The higher the score, the easier
212							it is to understand the text. A score of 60 to 70 is considered to be optimal.
213
214
215							=head2 kincaid
216
217							Returns the Flesch-Kincaid grade level score for the analysed text
218							file or block.
219
220							(11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
221
222							This score rates text on a U.S. grade school level. So a score of 8.0 means
223							that the document can be understood by an eighth grader. A score of 7.0 to
224							8.0 is considered to be optimal.
225
226							=head2 unique_words
227
228							Returns a hash of unique words. The words (in lower case) are held in
229							the hash keys while the number of occurrences are held in the hash values.
230
231
232							=head2 report
233
234							print($text->report);
235
236							Produces a text based report containing all Fathom statistics for
237							the currently analysed text block or file. For example:
238
239							Number of characters : 813
240							Number of words : 135
241							Percent of complex words : 20.00
242							Average syllables per word : 1.7704
243							Number of sentences : 12
244							Average words per sentence : 11.2500
245							Number of text lines : 13
246							Number of non-text lines : 0
247							Number of blank lines : 8
248							Number of paragraphs : 4
249
250
251							READABILITY INDICES
252
253							Fog : 12.5000
254							Flesch : 45.6429
255							Flesch-Kincaid : 9.6879
256
257							The return value is a string containing the report contents
258
259
260							=head1 SEE ALSO
261
262							L<Lingua::EN::Syllable>, L<Lingua::EN::Sentence>, L<B::Fathom>
263
264
265							=head1 POSSIBLE EXTENSIONS
266
267							Count white space and punctuation characters
268							Allow user control over what strictly defines a word
269
270							=head1 LIMITATIONS
271
272							The syllable count provided in Lingua::EN::Syllable is about 90% accurate.
273							Acronyms that contain vowels, like GPO, will be counted as words.
274							The fog index should exclude proper names.
275
276
277
278							=head1 AUTHOR
279
280							Lingua::EN::Fathom was written by Kim Ryan.
281
282							=head1 COPYRIGHT AND LICENSE
283
284							Copyright (c) 2023 Kim Ryan. All rights reserved.
285
286							This library is free software; you can redistribute it and/or modify
287							it under the same terms as Perl itself.
288
289							=cut
290
291							#------------------------------------------------------------------------------
292
293							package Lingua::EN::Fathom;
294
295	1			1		70598	use Lingua::EN::Syllable;
	1					506
	1					54
296	1			1		505	use Lingua::EN::Sentence;
	1					16485
	1					52
297	1			1		6	use strict;
	1					2
	1					33
298	1			1		5	use warnings;
	1					2
	1					1842
299
300							our $VERSION = '1.26';
301
302							#------------------------------------------------------------------------------
303							# Create a new instance of a text object.
304
305							sub new
306							{
307	1			1	1	88	my $class = shift;
308
309	1					3	my $text = {};
310	1					3	bless($text,$class);
311	1					4	$text = &_initialize($text);
312	1					2	return($text);
313							}
314							#------------------------------------------------------------------------------
315							# Analyse text stored in a file, reading from the file one line at a time
316
317							sub analyse_file
318							{
319	0			0	1	0	my $text = shift;
320	0					0	my ($file_name,$accumulate) = @_;
321
322	0	0				0	unless ( $accumulate )
323							{
324	0					0	$text = _initialize($text);
325							}
326
327	0					0	$text->{file_name} = $file_name;
328
329							# Only analyse non-empty text files
330	0	0	0			0	unless ( -T $file_name and -s $file_name )
331							{
332	0					0	return($text);
333							}
334
335	0					0	open(IN_FH,"<$file_name");
336
337	0					0	my $in_paragraph = 0;
338	0					0	my $all_text;
339	0					0	while ( <IN_FH> )
340							{
341	0					0	my $one_line = $_;
342	0					0	$all_text .= $one_line;
343	0					0	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
344							}
345	0					0	close(IN_FH);
346
347	0					0	my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
348	0					0	$text->{num_sentences} += scalar(@$sentences);
349	0					0	$text->_calculate_readability;
350
351	0					0	return($text);
352							}
353							#------------------------------------------------------------------------------
354							# Analyse a block of text, stored as a string. The string may contain line
355							# terminators.
356
357							sub analyse_block
358							{
359	1			1	1	5	my $text = shift;
360	1					2	my ($block,$accumulate) = @_;
361
362	1	50				4	unless ( $accumulate )
363							{
364	1					2	$text = _initialize($text);
365							}
366
367	1	50				3	unless ( $block )
368							{
369	0					0	return($text);
370							}
371
372	1					2	my $in_paragraph = 0;
373
374							# Split on EOL character
375							# repeating trailing line terminators are stripped
376	1					17	my @all_lines = split(/\n/,$block);
377	1					3	my $one_line;
378	1					19	foreach $one_line ( @all_lines )
379							{
380	9					19	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
381							}
382
383	1					5	my $sentences= Lingua::EN::Sentence::get_sentences($block);
384	1	50				3592	if (defined($sentences))
385							{
386	1					4	$text->{num_sentences} += scalar(@$sentences);
387							}
388
389
390	1					7	$text->_calculate_readability;
391
392	1					5	return($text);
393							}
394							#------------------------------------------------------------------------------
395							sub num_chars
396							{
397	1			1	1	5	my $text = shift;
398	1					7	return($text->{num_chars});
399							}
400							#------------------------------------------------------------------------------
401							sub num_words
402							{
403	1			1	1	3	my $text = shift;
404	1					4	return($text->{num_words});
405							}
406							#------------------------------------------------------------------------------
407							sub percent_complex_words
408							{
409	0			0	1	0	my $text = shift;
410	0					0	return($text->{percent_complex_words});
411							}
412							#------------------------------------------------------------------------------
413							sub num_sentences
414							{
415	1			1	1	3	my $text = shift;
416	1					5	return($text->{num_sentences});
417							}
418							#------------------------------------------------------------------------------
419							sub num_text_lines
420							{
421	1			1	1	3	my $text = shift;
422	1					4	return($text->{num_text_lines});
423							}
424							#------------------------------------------------------------------------------
425							sub num_non_text_lines
426							{
427	1			1	1	2	my $text = shift;
428	1					5	return($text->{num_non_text_lines});
429							}
430							#------------------------------------------------------------------------------
431							sub num_blank_lines
432							{
433	1			1	1	2	my $text = shift;
434	1					4	return($text->{num_blank_lines});
435							}
436							#------------------------------------------------------------------------------
437							sub num_paragraphs
438							{
439	1			1	1	2	my $text = shift;
440	1					4	return($text->{num_paragraphs});
441							}
442							#------------------------------------------------------------------------------
443							sub syllables_per_word
444							{
445	0			0	1	0	my $text = shift;
446	0					0	return($text->{syllables_per_word});
447							}
448							#------------------------------------------------------------------------------
449							sub words_per_sentence
450							{
451	0			0	1	0	my $text = shift;
452	0					0	return($text->{words_per_sentence});
453							}
454							#------------------------------------------------------------------------------
455							# extra
456							sub num_syllables
457							{
458	0			0	0	0	my $text = shift;
459	0					0	return($text->{num_syllables});
460							}
461							#------------------------------------------------------------------------------
462							sub complex_words
463							{
464	0			0	0	0	my $text = shift;
465	0					0	return($text->{num_complex_words});
466							}
467							#------------------------------------------------------------------------------
468							sub fog
469							{
470	0			0	1	0	my $text = shift;
471	0					0	return($text->{fog});
472							}
473							#------------------------------------------------------------------------------
474							sub flesch
475							{
476	0			0	1	0	my $text = shift;
477	0					0	return($text->{flesch});
478							}
479							#------------------------------------------------------------------------------
480							sub kincaid
481							{
482	0			0	1	0	my $text = shift;
483	0					0	return($text->{kincaid});
484							}
485							#------------------------------------------------------------------------------
486							# Return a hash (as a list) of all the unique words in the analysed text.
487							# Each word's occurrence count is stored in the hash value.
488
489							sub unique_words
490							{
491	0			0	1	0	my $text = shift;
492	0	0				0	if ( $text->{unique_words} )
493							{
494	0					0	return( %{ $text->{unique_words} } );
	0					0
495							}
496							else
497							{
498	0					0	return(undef);
499							}
500							}
501							#------------------------------------------------------------------------------
502							# Provide a formatted text report of all statistics for a text object.
503							# Return report as a string.
504
505							sub report
506							{
507	0			0	1	0	my $text = shift;
508	0					0	my $report = '';
509
510
511							$text->{file_name} and
512	0	0				0	$report .= sprintf("File name : %s\n",$text->{file_name} );
513
514	0					0	$report .= sprintf("Number of characters : %d\n", $text->num_chars);
515	0					0	$report .= sprintf("Number of words : %d\n", $text->num_words);
516	0					0	$report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
517	0					0	$report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
518	0					0	$report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
519	0					0	$report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
520	0					0	$report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
521	0					0	$report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
522	0					0	$report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
523	0					0	$report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
524
525	0					0	$report .= "\n\nREADABILITY INDICES\n\n";
526	0					0	$report .= sprintf("Fog : %.4f\n",$text->fog);
527	0					0	$report .= sprintf("Flesch : %.4f\n",$text->flesch);
528	0					0	$report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
529
530	0					0	return($report);
531							}
532
533							#------------------------------------------------------------------------------
534							# PRIVATE METHODS
535							#------------------------------------------------------------------------------
536							sub _initialize
537							{
538	2			2		5	my $text = shift;
539
540	2					7	$text->{num_chars} = 0;
541	2					4	$text->{num_syllables} = 0;
542	2					2	$text->{num_words} = 0;
543	2					3	$text->{num_complex_words} = 0;
544	2					2	$text->{syllables_per_word} = 0;
545	2					4	$text->{words_per_sentence} = 0;
546	2					3	$text->{percent_complex_words} = 0;
547	2					4	$text->{num_text_lines} = 0;
548	2					3	$text->{num_non_text_lines} = 0;
549	2					3	$text->{num_blank_lines} = 0;
550	2					3	$text->{num_paragraphs} = 0;
551	2					3	$text->{num_sentences} = 0;
552	2					3	$text->{unique_words} = ();
553	2					3	$text->{file_name} = '';
554
555	2					3	$text->{fog} = 0;
556	2					4	$text->{flesch} = 0;
557	2					3	$text->{kincaid} = 0;
558
559	2					5	return($text);
560							}
561							#------------------------------------------------------------------------------
562							# Increment number of text lines, non-text lines, blank lines and paragraphs
563
564							sub _analyse_line
565							{
566	9			9		11	my $text = shift;
567
568	9					14	my ($one_line,$in_paragraph) = @_;
569	9	100				32	if ( $one_line =~ /\w/ )
		100
		50
570							{
571	6					9	chomp($one_line);
572	6					12	$text = _analyse_words($text,$one_line);
573	6					9	$text->{num_text_lines}++;
574
575	6	100				11	unless ( $in_paragraph )
576							{
577	2					2	$text->{num_paragraphs}++;
578	2					3	$in_paragraph = 1;
579							}
580							}
581							elsif ($one_line eq '' ) # empty line
582							{
583	2					4	$text->{num_blank_lines}++;
584	2					2	$in_paragraph = 0;
585							}
586							elsif ($one_line =~ /^\W+$/ ) # non text
587							{
588	1					2	$text->{num_non_text_lines}++;
589	1					1	$in_paragraph = 0;
590							}
591	9					19	return($in_paragraph,$text);
592							}
593							#------------------------------------------------------------------------------
594							# Try to detect real words in line. Increment syllable, word, and complex word counters.
595
596							sub _analyse_words
597							{
598	6			6		7	my $text = shift;
599	6					8	my ($one_line) = @_;
600
601	6					10	$text->{num_chars} += length($one_line);
602
603							# Word found, such as: twice, BOTH, a, I'd, non-plussed ..
604
605							# Ignore words like 'Mr.', K12, &, X.Y.Z ...
606							# It could be argued that Mr. is a word, but this approach should detect most of the non words
607							# which have punctuation or numbers in them
608
609	6					35	while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
610							{
611	57					112	my $one_word = $1;
612
613							# Try to filter out acronyms and abbreviations by accepting
614							# words with a vowel sound. This won't work for GPO etc.
615	57	100				139	next unless $one_word =~ /[aeiouy]/i;
616
617							# Test for valid hyphenated word like be-bop
618	55	100				93	if ( $one_word =~ /-/ )
619							{
620	1	50				7	next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
621							}
622
623							# word frequency count
624	54					111	$text->{unique_words}{lc($one_word)}++;
625
626	54					55	$text->{num_words}++;
627
628							# Use subroutine from Lingua::EN::Syllable
629	54					95	my $num_syllables_current_word = syllable($one_word);
630	54					9746	$text->{num_syllables} += $num_syllables_current_word;
631
632							# Required for Fog index, count non hyphenated words of 3 or more
633							# syllables. Should add check for proper names in here as well
634	54	100	66			237	if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
635							{
636	4					16	$text->{num_complex_words}++;
637							}
638							}
639
640	6					12	return($text);
641							}
642							#------------------------------------------------------------------------------
643							# Determine the three readability indices
644
645							sub _calculate_readability
646							{
647	1			1		2	my $text = shift;
648
649	1	50	33			9	if ( $text->{num_sentences} and $text->{num_words} )
650							{
651	1					5	$text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
652	1					3	$text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
653							$text->{percent_complex_words} =
654	1					5	( $text->{num_complex_words} / $text->{num_words} ) * 100;
655
656	1					2	$text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
657
658							$text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
659	1					5	(84.6 * $text->{syllables_per_word});
660
661							$text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
662	1					3	(0.39 * $text->{words_per_sentence}) - 15.59;
663							}
664							else
665							{
666	0						$text->{words_per_sentence} = 0;
667	0						$text->{syllables_per_word} = 0;
668	0						$text->{num_complex_words} = 0;
669	0						$text->{fog} = 0;
670	0						$text->{flesch} = 0;
671	0						$text->{kincaid} = 0;
672							}
673							}
674							#------------------------------------------------------------------------------
675							return(1);