File Coverage

lib/Lingua/EN/Fathom.pm

Criterion	Covered	Total	%
statement	102	167	61.0
branch	18	32	56.2
condition	3	9	33.3
subroutine	17	28	60.7
pod	18	20	90.0
total	158	256	61.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::Fathom - Measure readability of English text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::Fathom;
8
9							my $text = Lingua::EN::Fathom->new();
10
11							$text->analyse_file("sample.txt"); # Analyse contents of a text file
12
13							$accumulate = 1;
14							my $text_string = q{
15							Returns the number of words in the analysed text file or block. A word must
16							consist of letters a-z with at least one vowel sound, and optionally an
17							apostrophe or hyphen.
18
19							##########################################
20							Items such as "&, K108, NSW" are not counted as words.
21							Common abbreviations such a U.S. or numbers like 1.23 will not denote the end of
22							a sentence.
23							};
24
25							$text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
26
27							print($text->report); # Create a formatted report
28
29							Number of characters : 312
30							Number of words : 54
31							Percent of complex words : 7.41
32							Average syllables per word : 1.4259
33							Number of sentences : 4
34							Average words per sentence : 13.5000
35							Number of text lines : 6
36							Number of non-text lines : 1
37							Number of blank lines : 2
38							Number of paragraphs : 2
39
40							READABILITY INDICES
41
42							Fog : 8.3630
43							Flesch : 72.4992
44							Flesch-Kincaid : 6.5009
45
46							# Methods to return statistics on the analysed text
47							$text->num_chars;
48							$text->num_words;
49							$text->percent_complex_words;
50							$text->num_sentences;
51							$text->num_text_lines;
52							$text->num_non_text_lines;
53							$text->num_blank_lines; # trailing EOLs are ignored
54							$text->num_paragraphs;
55							$text->syllables_per_word;
56							$text->words_per_sentence;
57							$text->unique_words;
58							$text->fog;
59							$text->flesch;
60							$text->kincaid;
61
62							# Get a hash of unique words, keyed by word, with the occurrence count as the value
63							$text->unique_words
64
65							# Print a list of unique words
66							%words = $text->unique_words;
67							foreach $word ( sort keys %words )
68							{
69							print("$words{$word} :$word\n");
70							}
71
72							=head1 REQUIRES
73
74							Lingua::EN::Syllable, Lingua::EN::Sentence
75
76
77							=head1 DESCRIPTION
78
79							This module analyses English text in either a string or file. Totals are
80							then calculated for the number of characters, words, sentences, blank
81							and non blank (text) lines and paragraphs.
82
83							Three common readability statistics are also derived, the Fog, Flesch and
84							Kincaid indices.
85
86							All of these properties can be accessed through individual methods, or by
87							generating a text report.
88
89							A hash of all unique words and the number of times they occur is generated.
90
91
92							=head1 METHODS
93
94							=head2 new
95
96							The C<new> method creates an instance of a text object. This must be called
97							before any of the following methods are invoked. Note that the object only
98							needs to be created once, and can be reused with new input data.
99
100							my $text = Lingua::EN::Fathom->new();
101
102							=head2 analyse_file
103
104							The C<analyse_file> method takes as input the name of a text file. Various
105							text based statistics are calculated for the file. This method and
106							C<analyse_block> are prerequisites for all the following methods. An optional
107							argument may be supplied to control accumulation of statistics. If set to
108							a non zero value, all statistics are accumulated with each successive call.
109
110							$text->analyse_file("sample.txt");
111
112
113							=head2 analyse_block
114
115							The C<analyse_block> method takes as input a text string. Various
116							text based statistics are calculated for the string. This method and
117							C<analyse_file> are prerequisites for all the following methods. An optional
118							argument may be supplied to control accumulation of statistics. If set to
119							a non zero value, all statistics are accumulated with each successive call.
120
121							$text->analyse_block($text_str,$accumulate);
122
123							=head2 num_chars
124
125							Returns the number of characters in the analysed text file or block. This
126							includes characters such as spaces, and punctuation marks.
127
128							=head2 num_words
129
130							Returns the number of words in the analysed text file or block. A word must
131							consist of letters a-z with at least one vowel sound, and optionally an
132							apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
133
134							=head2 percent_complex_words
135
136							Returns the percentage of complex words in the analysed text file or block. A
137							complex word must consist of three or more syllables. This statistic is used to
138							calculate the fog index.
139
140							=head2 num_sentences
141
142							Returns the number of sentences in the analysed text file or block. A sentence
143							is any group of words and non words terminated with a single full stop. Spaces
144							may occur before and after the full stop.
145
146							=head2 num_text_lines
147
148							Returns the number of lines containing some text in the analysed
149							text file or block.
150
151							=head2 num_non_text_lines
152
153							Returns the number of lines containing no text in the analysed
154							text file or block.
155
156							=head2 num_blank_lines
157
158							Returns the number of empty lines in the analysed
159							text file or block.
160
161							=head2 num_paragraphs
162
163							Returns the number of paragraphs in the analysed text file or block.
164
165							=head2 syllables_per_word
166
167							Returns the average number of syllables per word in the analysed
168							text file or block.
169
170							=head2 words_per_sentence
171
172							Returns the average number of words per sentence in the analysed
173							text file or block.
174
175
176							=head2 READABILITY
177
178							Three indices of text readability are calculated. They all measure complexity as
179							a function of syllables per word and words per sentence. They assume the text is
180							well formed and logical. You could analyse a passage of nonsensical English and
181							find the readability is quite good, provided the words are not too complex and
182							the sentences not too long.
183
184							For more information see: L
185
186
187							=head2 fog
188
189							Returns the Fog index for the analysed text file or block.
190
191							( words_per_sentence + percent_complex_words ) * 0.4
192
193							The Fog index, developed by Robert Gunning, is a well known and simple
194							formula for measuring readability. The index indicates the number of years
195							of formal education a reader of average intelligence would need to read the
196							text once and understand that piece of writing with its word sentence workload.
197
198							18 unreadable
199							14 difficult
200							12 ideal
201							10 acceptable
202							8 childish
203
204
205							=head2 flesch
206
207							Returns the Flesch reading ease score for the analysed text file or block.
208
209							206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
210
211							This score rates text on a 100 point scale. The higher the score, the easier
212							it is to understand the text. A score of 60 to 70 is considered to be optimal.
213
214
215							=head2 kincaid
216
217							Returns the Flesch-Kincaid grade level score for the analysed text
218							file or block.
219
220							(11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
221
222							This score rates text on U.S. grade school level. So a score of 8.0 means
223							that the document can be understood by an eighth grader. A score of 7.0 to
224							8.0 is considered to be optimal.
225
226							=head2 unique_words
227
228							Returns a hash of unique words. The words (in lower case) are held in
229							the hash keys while the number of occurrences are held in the hash values.
230
231
232							=head2 report
233
234							print($text->report);
235
236							Produces a text based report containing all Fathom statistics for
237							the currently analysed text block or file. For example:
238
239							Number of characters : 813
240							Number of words : 135
241							Percent of complex words : 20.00
242							Average syllables per word : 1.7704
243							Number of sentences : 12
244							Average words per sentence : 11.2500
245							Number of text lines : 13
246							Number of non text lines : 0
247							Number of blank lines : 8
248							Number of paragraphs : 4
249
250
251							READABILITY INDICES
252
253							Fog : 12.5000
254							Flesch : 45.6429
255							Flesch-Kincaid : 9.6879
256
257							The return value is a string containing the report contents
258
259
260							=head1 SEE ALSO
261
262							L,L,L
263
264
265							=head1 POSSIBLE EXTENSIONS
266
267							Count white space and punctuation characters
268							Allow user control over what strictly defines a word
269
270							=head1 LIMITATIONS
271
272							The syllable count provided in Lingua::EN::Syllable is about 90% accurate
273							Acronyms that contain vowels, like GPO, will be counted as words.
274							The fog index should exclude proper names
275
276
277
278							=head1 AUTHOR
279
280							Lingua::EN::Fathom was written by Kim Ryan .
281
282							=head1 COPYRIGHT AND LICENSE
283
284							Copyright (c) 2023 Kim Ryan. All rights reserved.
285
286							This library is free software; you can redistribute it and/or modify
287							it under the same terms as Perl itself.
288
289							=cut
290
291							#------------------------------------------------------------------------------
292
293							package Lingua::EN::Fathom;
294
295	1			1		74672	use Lingua::EN::Syllable;
	1					523
	1					61
296	1			1		548	use Lingua::EN::Sentence;
	1					16850
	1					51
297	1			1		7	use strict;
	1					2
	1					34
298	1			1		7	use warnings;
	1					1
	1					1907
299
300							our $VERSION = '1.27';
301
302							#------------------------------------------------------------------------------
303							# Create a new instance of a text object.
304
305							sub new
306							{
307	1			1	1	87	my $class = shift;
308
309	1					2	my $text = {};
310	1					2	bless($text,$class);
311	1					3	$text = &_initialize($text);
312	1					3	return($text);
313							}
314							#------------------------------------------------------------------------------
315							# Analyse text stored in a file, reading from the file one line at a time
316
317							sub analyse_file
318							{
319	0			0	1	0	my $text = shift;
320	0					0	my ($file_name,$accumulate) = @_;
321
322	0	0				0	unless ( $accumulate )
323							{
324	0					0	$text = _initialize($text);
325							}
326
327	0					0	$text->{file_name} = $file_name;
328
329							# Only analyse non-empty text files
330	0	0	0			0	unless ( -T $file_name and -s $file_name )
331							{
332	0					0	return($text);
333							}
334
335	0					0	open(IN_FH,"<$file_name");
336
337	0					0	my $in_paragraph = 0;
338	0					0	my $all_text;
339	0					0	while ( <IN_FH> )
340							{
341	0					0	my $one_line = $_;
342	0					0	$all_text .= $one_line;
343	0					0	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
344							}
345	0					0	close(IN_FH);
346
347	0					0	my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
348	0					0	$text->{num_sentences} += scalar(@$sentences);
349	0					0	$text->_calculate_readability;
350
351	0					0	return($text);
352							}
353							#------------------------------------------------------------------------------
354							# Analyse a block of text, stored as a string. The string may contain line
355							# terminators.
356
357							sub analyse_block
358							{
359	1			1	1	6	my $text = shift;
360	1					4	my ($block,$accumulate) = @_;
361
362	1	50				4	unless ( $accumulate )
363							{
364	1					2	$text = _initialize($text);
365							}
366
367	1	50				4	unless ( $block )
368							{
369	0					0	return($text);
370							}
371
372	1					1	my $in_paragraph = 0;
373
374							# Split on EOL character
375							# repeating trailing line terminators are stripped
376	1					16	my @all_lines = split(/\n/,$block);
377	1					3	my $one_line;
378	1					24	foreach $one_line ( @all_lines )
379							{
380	9					19	($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
381							}
382
383	1					9	my $sentences= Lingua::EN::Sentence::get_sentences($block);
384	1	50				3727	if (defined($sentences))
385							{
386	1					4	$text->{num_sentences} += scalar(@$sentences);
387							}
388
389
390	1					6	$text->_calculate_readability;
391
392	1					4	return($text);
393							}
394							#------------------------------------------------------------------------------
395							sub num_chars
396							{
397	1			1	1	5	my $text = shift;
398	1					4	return($text->{num_chars});
399							}
400							#------------------------------------------------------------------------------
401							sub num_words
402							{
403	1			1	1	3	my $text = shift;
404	1					3	return($text->{num_words});
405							}
406							#------------------------------------------------------------------------------
407							sub percent_complex_words
408							{
409	0			0	1	0	my $text = shift;
410	0					0	return($text->{percent_complex_words});
411							}
412							#------------------------------------------------------------------------------
413							sub num_sentences
414							{
415	1			1	1	2	my $text = shift;
416	1					4	return($text->{num_sentences});
417							}
418							#------------------------------------------------------------------------------
419							sub num_text_lines
420							{
421	1			1	1	4	my $text = shift;
422	1					4	return($text->{num_text_lines});
423							}
424							#------------------------------------------------------------------------------
425							sub num_non_text_lines
426							{
427	1			1	1	3	my $text = shift;
428	1					3	return($text->{num_non_text_lines});
429							}
430							#------------------------------------------------------------------------------
431							sub num_blank_lines
432							{
433	1			1	1	3	my $text = shift;
434	1					4	return($text->{num_blank_lines});
435							}
436							#------------------------------------------------------------------------------
437							sub num_paragraphs
438							{
439	1			1	1	2	my $text = shift;
440	1					6	return($text->{num_paragraphs});
441							}
442							#------------------------------------------------------------------------------
443							sub syllables_per_word
444							{
445	0			0	1	0	my $text = shift;
446	0					0	return($text->{syllables_per_word});
447							}
448							#------------------------------------------------------------------------------
449							sub words_per_sentence
450							{
451	0			0	1	0	my $text = shift;
452	0					0	return($text->{words_per_sentence});
453							}
454							#------------------------------------------------------------------------------
455							sub num_syllables
456							{
457	0			0	0	0	my $text = shift;
458	0					0	return($text->{num_syllables});
459							}
460							#------------------------------------------------------------------------------
461							sub complex_words
462							{
463	0			0	0	0	my $text = shift;
464	0					0	return($text->{num_complex_words});
465							}
466							#------------------------------------------------------------------------------
467							sub fog
468							{
469	0			0	1	0	my $text = shift;
470	0					0	return($text->{fog});
471							}
472							#------------------------------------------------------------------------------
473							sub flesch
474							{
475	0			0	1	0	my $text = shift;
476	0					0	return($text->{flesch});
477							}
478							#------------------------------------------------------------------------------
479							sub kincaid
480							{
481	0			0	1	0	my $text = shift;
482	0					0	return($text->{kincaid});
483							}
484							#------------------------------------------------------------------------------
485							# Return a hash of all the unique words in the analysed text, or undef if
486							# none were recorded. Each word's occurrence count is stored in the hash value.
487
488							sub unique_words
489							{
490	0			0	1	0	my $text = shift;
491	0	0				0	if ( $text->{unique_words} )
492							{
493	0					0	return( %{ $text->{unique_words} } );
	0					0
494							}
495							else
496							{
497	0					0	return(undef);
498							}
499							}
500							#------------------------------------------------------------------------------
501							# Provide a formatted text report of all statistics for a text object.
502							# Return report as a string.
503
504							sub report
505							{
506	0			0	1	0	my $text = shift;
507	0					0	my $report = '';
508
509
510							$text->{file_name} and
511	0	0				0	$report .= sprintf("File name : %s\n",$text->{file_name} );
512
513	0					0	$report .= sprintf("Number of characters : %d\n", $text->num_chars);
514	0					0	$report .= sprintf("Number of words : %d\n", $text->num_words);
515	0					0	$report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
516	0					0	$report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
517	0					0	$report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
518	0					0	$report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
519	0					0	$report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
520	0					0	$report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
521	0					0	$report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
522	0					0	$report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
523
524	0					0	$report .= "\n\nREADABILITY INDICES\n\n";
525	0					0	$report .= sprintf("Fog : %.4f\n",$text->fog);
526	0					0	$report .= sprintf("Flesch : %.4f\n",$text->flesch);
527	0					0	$report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
528
529	0					0	return($report);
530							}
531
532							#------------------------------------------------------------------------------
533							# PRIVATE METHODS
534							#------------------------------------------------------------------------------
535							sub _initialize
536							{
537	2			2		4	my $text = shift;
538
539	2					8	$text->{num_chars} = 0;
540	2					3	$text->{num_syllables} = 0;
541	2					3	$text->{num_words} = 0;
542	2					3	$text->{num_complex_words} = 0;
543	2					3	$text->{syllables_per_word} = 0;
544	2					3	$text->{words_per_sentence} = 0;
545	2					2	$text->{percent_complex_words} = 0;
546	2					4	$text->{num_text_lines} = 0;
547	2					3	$text->{num_non_text_lines} = 0;
548	2					4	$text->{num_blank_lines} = 0;
549	2					3	$text->{num_paragraphs} = 0;
550	2					3	$text->{num_sentences} = 0;
551	2					2	$text->{unique_words} = ();
552	2					3	$text->{file_name} = '';
553
554	2					3	$text->{fog} = 0;
555	2					4	$text->{flesch} = 0;
556	2					2	$text->{kincaid} = 0;
557
558	2					3	return($text);
559							}
560							#------------------------------------------------------------------------------
561							# Increment number of text lines, blank lines and paragraphs
562
563							sub _analyse_line
564							{
565	9			9		11	my $text = shift;
566
567	9					15	my ($one_line,$in_paragraph) = @_;
568	9	100				35	if ( $one_line =~ /\w/ )
		100
		50
569							{
570	6					9	chomp($one_line);
571	6					10	$text = _analyse_words($text,$one_line);
572	6					7	$text->{num_text_lines}++;
573
574	6	100				13	unless ( $in_paragraph )
575							{
576	2					3	$text->{num_paragraphs}++;
577	2					4	$in_paragraph = 1;
578							}
579							}
580							elsif ($one_line eq '' ) # empty line
581							{
582	2					3	$text->{num_blank_lines}++;
583	2					2	$in_paragraph = 0;
584							}
585							elsif ($one_line =~ /^\W+$/ ) # non text
586							{
587	1					1	$text->{num_non_text_lines}++;
588	1					1	$in_paragraph = 0;
589							}
590	9					22	return($in_paragraph,$text);
591							}
592							#------------------------------------------------------------------------------
593							# Try to detect real words in line. Increment syllable, word, and complex word counters.
594
595							sub _analyse_words
596							{
597	6			6		6	my $text = shift;
598	6					7	my ($one_line) = @_;
599
600	6					7	$text->{num_chars} += length($one_line);
601
602							# Word found, such as: twice, BOTH, a, I'd, non-plussed ..
603
604							# Ignore words like 'Mr.', K12, &, X.Y.Z ...
605							# It could be argued that Mr. is a word, but this approach should detect most of the non words
606							# which have punctuation or numbers in them
607
608	6					35	while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
609							{
610	57					108	my $one_word = $1;
611
612							# Try to filter out acronyms and abbreviations by accepting
613							# words with a vowel sound. This won't work for GPO etc.
614	57	100				123	next unless $one_word =~ /[aeiouy]/i;
615
616							# Test for valid hyphenated word like be-bop
617	55	100				92	if ( $one_word =~ /-/ )
618							{
619	1	50				8	next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
620							}
621
622							# word frequency count
623	54					134	$text->{unique_words}{lc($one_word)}++;
624
625	54					54	$text->{num_words}++;
626
627							# Use subroutine from Lingua::EN::Syllable
628	54					88	my $num_syllables_current_word = syllable($one_word);
629	54					9857	$text->{num_syllables} += $num_syllables_current_word;
630
631							# Required for Fog index, count non hyphenated words of 3 or more
632							# syllables. Should add check for proper names in here as well
633	54	100	66			233	if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
634							{
635	4					14	$text->{num_complex_words}++;
636							}
637							}
638
639	6					12	return($text);
640							}
641							#------------------------------------------------------------------------------
642							# Determine the three readability indices
643
644							sub _calculate_readability
645							{
646	1			1		2	my $text = shift;
647
648	1	50	33			7	if ( $text->{num_sentences} and $text->{num_words} )
649							{
650	1					3	$text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
651	1					2	$text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
652							$text->{percent_complex_words} =
653	1					4	( $text->{num_complex_words} / $text->{num_words} ) * 100;
654
655	1					3	$text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
656
657							$text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
658	1					3	(84.6 * $text->{syllables_per_word});
659
660							$text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
661	1					5	(0.39 * $text->{words_per_sentence}) - 15.59;
662							}
663							else
664							{
665	0						$text->{words_per_sentence} = 0;
666	0						$text->{syllables_per_word} = 0;
667	0						$text->{num_complex_words} = 0;
668	0						$text->{fog} = 0;
669	0						$text->{flesch} = 0;
670	0						$text->{kincaid} = 0;
671							}
672							}
673							#------------------------------------------------------------------------------
674							return(1);