File Coverage

lib/Lingua/EN/Fathom.pm
Criterion Covered Total %
statement 102 163 62.5
branch 18 32 56.2
condition 3 9 33.3
subroutine 17 26 65.3
pod 17 18 94.4
total 157 248 63.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::Fathom - Measure readability of English text
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::Fathom;
8              
9             my $text = Lingua::EN::Fathom->new();
10              
11             $text->analyse_file("sample.txt"); # Analyse contents of a text file
12              
13             $accumulate = 1;
14             $text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
15              
16             # Methods to return statistics on the analysed text
17             $text->num_chars;
18             $text->num_words;
19             $text->percent_complex_words;
20             $text->num_sentences;
21             $text->num_text_lines;
22             $text->num_non_text_lines;
23             $text->num_blank_lines; # trailing EOLs are ignored
24             $text->num_paragraphs;
25             $text->syllables_per_word;
26             $text->words_per_sentence;
27             $text->unique_words;
28             $text->fog;
29             $text->flesch;
30             $text->kincaid;
31              
32             # Call all of the above methods and present as a formatted report
33             print($text->report);
34              
35             # get a hash of unique words, keyed by word and occurrence as the value
36             $text->unique_words
37              
38             # Print a list of unique words
39             %words = $text->unique_words;
40             foreach $word ( sort keys %words )
41             {
42             print("$words{$word} :$word\n");
43             }
44              
45              
46             =head1 REQUIRES
47              
48             Lingua::EN::Syllable, Lingua::EN::Sentence
49              
50              
51             =head1 DESCRIPTION
52              
53             This module analyses English text in either a string or file. Totals are
54             then calculated for the number of characters, words, sentences, blank
55             and non blank (text) lines and paragraphs.
56              
57             Three common readability statistics are also derived, the Fog, Flesch and
58             Kincaid indices.
59              
60             All of these properties can be accessed through individual methods, or by
61             generating a text report.
62              
63             A hash of all unique words and the number of times they occur is generated.
64              
65              
66             =head1 METHODS
67              
68             =head2 new
69              
70             The C method creates an instance of an text object This must be called
71             before any of the following methods are invoked. Note that the object only
72             needs to be created once, and can be reused with new input data.
73              
74             my $text = Lingua::EN::Fathom->new();
75              
76             =head2 analyse_file
77              
78             The C method takes as input the name of a text file. Various
79             text based statistics are calculated for the file. This method and
80             C are prerequisites for all the following methods. An optional
81             argument may be supplied to control accumulation of statistics. If set to
82             a non zero value, all statistics are accumulated with each successive call.
83              
84             $text->analyse_file("sample.txt");
85              
86              
87             =head2 analyse_block
88              
89             The C method takes as input a text string. Various
90             text based statistics are calculated for the file. This method and
91             C are prerequisites for all the following methods. An optional
92             argument may be supplied to control accumulation of statistics. If set to
93             a non zero value, all statistics are accumulated with each successive call.
94              
95             $text->analyse_block($text_str);
96              
97             =head2 num_chars
98              
99             Returns the number of characters in the analysed text file or block. This
100             includes characters such as spaces, and punctuation marks.
101              
102             =head2 num_words
103              
104             Returns the number of words in the analysed text file or block. A word must
105             consist of letters a-z with at least one vowel sound, and optionally an
106             apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
107              
108             =head2 percent_complex_words
109              
110             Returns the percentage of complex words in the analysed text file or block. A
111             complex word must consist of three or more syllables. This statistic is used to
112             calculate the fog index.
113              
114             =head2 num_sentences
115              
116             Returns the number of sentences in the analysed text file or block. A sentence
117             is any group of words and non words terminated with a single full stop. Spaces
118             may occur before and after the full stop.
119              
120              
121             =head2 num_text_lines
122              
123             Returns the number of lines containing some text in the analysed
124             text file or block.
125              
126             =head2 num_blank_lines
127              
128             Returns the number of lines NOT containing any text in the analysed
129             text file or block.
130              
131             =head2 num_paragraphs
132              
133             Returns the number of paragraphs in the analysed text file or block.
134              
135             =head2 syllables_per_word
136              
137             Returns the average number of syllables per word in the analysed
138             text file or block.
139              
140             =head2 words_per_sentence
141              
142             Returns the average number of words per sentence in the analysed
143             text file or block.
144              
145              
146              
147             =head2 READABILITY
148              
149             Three indices of text readability are calculated. They all measure complexity as
150             a function of syllables per word and words per sentence. They assume the text is
151             well formed and logical. You could analyse a passage of nonsensical English and
152             find the readability is quite good, provided the words are not too complex and
153             the sentences not too long.
154              
155             For more information see: L
156              
157              
158             =head2 fog
159              
160             Returns the Fog index for the analysed text file or block.
161              
162             ( words_per_sentence + percent_complex_words ) * 0.4
163              
164             The Fog index, developed by Robert Gunning, is a well known and simple
165             formula for measuring readability. The index indicates the number of years
166             of formal education a reader of average intelligence would need to read the
167             text once and understand that piece of writing with its word sentence workload.
168              
169             18 unreadable
170             14 difficult
171             12 ideal
172             10 acceptable
173             8 childish
174              
175              
176             =head2 flesch
177              
178             Returns the Flesch reading ease score for the analysed text file or block.
179              
180             206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
181              
182             This score rates text on a 100 point scale. The higher the score, the easier
183             it is to understand the text. A score of 60 to 70 is considered to be optimal.
184              
185              
186             =head2 kincaid
187              
188             Returns the Flesch-Kincaid grade level score for the analysed text
189             file or block.
190              
191             (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
192              
193             This score rates text on U.S. grade school level. So a score of 8.0 means
194             that the document can be understood by an eighth grader. A score of 7.0 to
195             8.0 is considered to be optimal.
196              
197             =head2 unique_words
198              
199             Returns a hash of unique words. The words (in lower case) are held in
200             the hash keys while the number of occurrences are held in the hash values.
201              
202              
203             =head2 report
204              
205             print($text->report);
206              
207             Produces a text based report containing all Fathom statistics for
208             the currently analysed text block or file. For example:
209            
210             Number of characters : 813
211             Number of words : 135
212             Percent of complex words : 20.00
213             Average syllables per word : 1.7704
214             Number of sentences : 12
215             Average words per sentence : 11.2500
216             Number of text lines : 13
217             Number of blank lines : 8
218             Number of paragraphs : 4
219              
220              
221             READABILITY INDICES
222              
223             Fog : 12.5000
224             Flesch : 45.6429
225             Flesch-Kincaid : 9.6879
226              
227             The return value is a string containing the report contents
228              
229              
230             =head1 SEE ALSO
231              
232             L,L,L
233              
234              
235             =head1 POSSIBLE EXTENSIONS
236              
237             Count white space and punctuation characters
238             Allow user control over what strictly defines a word
239              
240             =head1 LIMITATIONS
241              
242             The syllable count provided in Lingua::EN::Syllable is about 90% accurate
243             Acronyms that contain vowels, like GPO, will be counted as words.
244             The fog index should exclude proper names
245              
246              
247              
248             =head1 AUTHOR
249              
250             Lingua::EN::Fathom was written by Kim Ryan .
251              
252             =head1 COPYRIGHT AND LICENSE
253              
254             Copyright (c) 2023 Kim Ryan. All rights reserved.
255              
256             This library is free software; you can redistribute it and/or modify
257             it under the same terms as Perl itself.
258              
259             =cut
260              
261             #------------------------------------------------------------------------------
262              
263             package Lingua::EN::Fathom;
264              
265 1     1   78671 use Lingua::EN::Syllable;
  1         544  
  1         63  
266 1     1   555 use Lingua::EN::Sentence;
  1         18078  
  1         50  
267 1     1   7 use strict;
  1         2  
  1         34  
268 1     1   6 use warnings;
  1         4  
  1         1839  
269              
270             our $VERSION = '1.24';
271              
272             #------------------------------------------------------------------------------
273             # Create a new instance of a text object.
274              
275             sub new
276             {
277 1     1 1 93 my $class = shift;
278              
279 1         4 my $text = {};
280 1         3 bless($text,$class);
281 1         3 $text = &_initialize($text);
282 1         2 return($text);
283             }
284             #------------------------------------------------------------------------------
285             # Analyse text stored in a file, reading from the file one line at a time
286              
287             sub analyse_file
288             {
289 0     0 1 0 my $text = shift;
290 0         0 my ($file_name,$accumulate) = @_;
291              
292 0 0       0 unless ( $accumulate )
293             {
294 0         0 $text = _initialize($text);
295             }
296              
297 0         0 $text->{file_name} = $file_name;
298              
299             # Only analyse non-empty text files
300 0 0 0     0 unless ( -T $file_name and -s $file_name )
301             {
302 0         0 return($text);
303             }
304              
305 0         0 open(IN_FH,"<$file_name");
306              
307 0         0 my $in_paragraph = 0;
308 0         0 my $all_text;
309 0         0 while ( )
310             {
311 0         0 my $one_line = $_;
312 0         0 $all_text .= $one_line;
313 0         0 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
314             }
315 0         0 close(IN_FH);
316            
317 0         0 my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
318 0         0 $text->{num_sentences} = scalar(@$sentences);
319 0         0 $text->_calculate_readability;
320              
321 0         0 return($text);
322             }
323             #------------------------------------------------------------------------------
324             # Analyse a block of text, stored as a string. The string may contain line
325             # terminators.
326              
327             sub analyse_block
328             {
329 1     1 1 5 my $text = shift;
330 1         3 my ($block,$accumulate) = @_;
331              
332 1 50       4 unless ( $accumulate )
333             {
334 1         3 $text = _initialize($text);
335             }
336              
337 1 50       4 unless ( $block )
338             {
339 0         0 return($text);
340             }
341              
342 1         2 my $in_paragraph = 0;
343              
344             # Split on EOL character
345             # repeating trailing line terminators are stripped
346 1         6 my @all_lines = split(/\n/,$block);
347 1         3 my $one_line;
348 1         24 foreach $one_line ( @all_lines )
349             {
350 9         15 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
351             }
352            
353 1         5 my $sentences= Lingua::EN::Sentence::get_sentences($block);
354 1 50       3761 if (defined($sentences))
355             {
356 1         5 $text->{num_sentences} = scalar(@$sentences);
357             }
358            
359            
360 1         8 $text->_calculate_readability;
361            
362 1         4 return($text);
363             }
364             #------------------------------------------------------------------------------
365             sub num_chars
366             {
367 1     1 1 6 my $text = shift;
368 1         7 return($text->{num_chars});
369             }
370             #------------------------------------------------------------------------------
371             sub num_words
372             {
373 1     1 1 5 my $text = shift;
374 1         4 return($text->{num_words});
375             }
376             #------------------------------------------------------------------------------
377             sub percent_complex_words
378             {
379 0     0 1 0 my $text = shift;
380 0         0 return($text->{percent_complex_words});
381             }
382             #------------------------------------------------------------------------------
383             sub num_sentences
384             {
385 1     1 1 2 my $text = shift;
386 1         5 return($text->{num_sentences});
387             }
388             #------------------------------------------------------------------------------
389             sub num_text_lines
390             {
391 1     1 1 3 my $text = shift;
392 1         4 return($text->{num_text_lines});
393             }
394             #------------------------------------------------------------------------------
395             sub num_non_text_lines
396             {
397 1     1 0 7 my $text = shift;
398 1         5 return($text->{num_non_text_lines});
399             }
400             #------------------------------------------------------------------------------
401             sub num_blank_lines
402             {
403 1     1 1 2 my $text = shift;
404 1         7 return($text->{num_blank_lines});
405             }
406             #------------------------------------------------------------------------------
407             sub num_paragraphs
408             {
409 1     1 1 3 my $text = shift;
410 1         4 return($text->{num_paragraphs});
411             }
412             #------------------------------------------------------------------------------
413             sub syllables_per_word
414             {
415 0     0 1 0 my $text = shift;
416 0         0 return($text->{syllables_per_word});
417             }
418             #------------------------------------------------------------------------------
419             sub words_per_sentence
420             {
421 0     0 1 0 my $text = shift;
422 0         0 return($text->{words_per_sentence});
423             }
424             #------------------------------------------------------------------------------
425             sub fog
426             {
427 0     0 1 0 my $text = shift;
428 0         0 return($text->{fog});
429             }
430             #------------------------------------------------------------------------------
431             sub flesch
432             {
433 0     0 1 0 my $text = shift;
434 0         0 return($text->{flesch});
435             }
436             #------------------------------------------------------------------------------
437             sub kincaid
438             {
439 0     0 1 0 my $text = shift;
440 0         0 return($text->{kincaid});
441             }
442             #------------------------------------------------------------------------------
443             # Return anonymous hash of all the unique words in analysed text. The words
444             # occurrence count is stored in the hash value.
445              
446             sub unique_words
447             {
448 0     0 1 0 my $text = shift;
449 0 0       0 if ( $text->{unique_words} )
450             {
451 0         0 return( %{ $text->{unique_words} } );
  0         0  
452             }
453             else
454             {
455 0         0 return(undef);
456             }
457             }
458             #------------------------------------------------------------------------------
459             # Provide a formatted text report of all statistics for a text object.
460             # Return report as a string.
461              
462             sub report
463             {
464 0     0 1 0 my $text = shift;
465 0         0 my $report = '';
466            
467              
468             $text->{file_name} and
469 0 0       0 $report .= sprintf("File name : %s\n",$text->{file_name} );
470              
471 0         0 $report .= sprintf("Number of characters : %d\n", $text->num_chars);
472 0         0 $report .= sprintf("Number of words : %d\n", $text->num_words);
473 0         0 $report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
474 0         0 $report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
475 0         0 $report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
476 0         0 $report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
477 0         0 $report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
478 0         0 $report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
479 0         0 $report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
480 0         0 $report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
481              
482 0         0 $report .= "\n\nREADABILITY INDICES\n\n";
483 0         0 $report .= sprintf("Fog : %.4f\n",$text->fog);
484 0         0 $report .= sprintf("Flesch : %.4f\n",$text->flesch);
485 0         0 $report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
486              
487 0         0 return($report);
488             }
489              
490             #------------------------------------------------------------------------------
491             # PRIVATE METHODS
492             #------------------------------------------------------------------------------
493             sub _initialize
494             {
495 2     2   3 my $text = shift;
496              
497 2         9 $text->{num_chars} = 0;
498 2         3 $text->{num_syllables} = 0;
499 2         4 $text->{num_words} = 0;
500 2         3 $text->{num_complex_words} = 0;
501 2         3 $text->{syllables_per_word} = 0;
502 2         3 $text->{words_per_sentence} = 0;
503 2         3 $text->{percent_complex_words} = 0;
504 2         5 $text->{num_text_lines} = 0;
505 2         3 $text->{num_non_text_lines} = 0;
506 2         2 $text->{num_blank_lines} = 0;
507 2         4 $text->{num_paragraphs} = 0;
508 2         3 $text->{num_sentences} = 0;
509 2         3 $text->{unique_words} = ();
510 2         3 $text->{file_name} = '';
511              
512 2         3 $text->{fog} = 0;
513 2         4 $text->{flesch} = 0;
514 2         3 $text->{kincaid} = 0;
515              
516 2         4 return($text);
517             }
518             #------------------------------------------------------------------------------
519             # Increment number of text lines, blank lines and paragraphs
520              
521             sub _analyse_line
522             {
523 9     9   12 my $text = shift;
524            
525 9         17 my ($one_line,$in_paragraph) = @_;
526 9 100       33 if ( $one_line =~ /\w/ )
    100          
    50          
527             {
528 6         11 chomp($one_line);
529 6         11 $text = _analyse_words($text,$one_line);
530 6         10 $text->{num_text_lines}++;
531            
532 6 100       14 unless ( $in_paragraph )
533             {
534 2         3 $text->{num_paragraphs}++;
535 2         3 $in_paragraph = 1;
536             }
537             }
538             elsif ($one_line eq '' ) # empty line
539             {
540 2         3 $text->{num_blank_lines}++;
541 2         3 $in_paragraph = 0;
542             }
543             elsif ($one_line =~ /^\W+$/ ) # non text
544             {
545 1         3 $text->{num_non_text_lines}++;
546 1         1 $in_paragraph = 0;
547             }
548 9         24 return($in_paragraph,$text);
549             }
550             #------------------------------------------------------------------------------
551             # Try to detect real words in line. Increment syllable, word, and complex word counters.
552              
553             sub _analyse_words
554             {
555 6     6   8 my $text = shift;
556 6         8 my ($one_line) = @_;
557              
558 6         10 $text->{num_chars} += length($one_line);
559              
560             # Word found, such as: twice, BOTH, a, I'd, non-plussed ..
561            
562             # Ignore words like 'Mr.', K12, &, X.Y.Z ...
563             # It could be argued that Mr. is a word, but this approach should detect most of the non words
564             # which have punctuation or numbers in them
565            
566 6         23 while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
567             {
568 57         104 my $one_word = $1;
569              
570             # Try to filter out acronyms and abbreviations by accepting
571             # words with a vowel sound. This won't work for GPO etc.
572 57 100       126 next unless $one_word =~ /[aeiouy]/i;
573              
574             # Test for valid hyphenated word like be-bop
575 55 100       98 if ( $one_word =~ /-/ )
576             {
577 1 50       15 next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
578             }
579              
580             # word frequency count
581 54         152 $text->{unique_words}{lc($one_word)}++;
582            
583 54         57 $text->{num_words}++;
584              
585             # Use subroutine from Lingua::EN::Syllable
586 54         90 my $num_syllables_current_word = syllable($one_word);
587 54         10019 $text->{num_syllables} += $num_syllables_current_word;
588              
589             # Required for Fog index, count non hyphenated words of 3 or more
590             # syllables. Should add check for proper names in here as well
591 54 100 66     245 if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
592             {
593 4         18 $text->{num_complex_words}++;
594             }
595             }
596              
597 6         17 return($text);
598             }
599             #------------------------------------------------------------------------------
600             # Determine the three readability indices
601              
602             sub _calculate_readability
603             {
604 1     1   4 my $text = shift;
605              
606 1 50 33     7 if ( $text->{num_sentences} and $text->{num_words} )
607             {
608 1         6 $text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
609 1         4 $text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
610             $text->{percent_complex_words} =
611 1         5 ( $text->{num_complex_words} / $text->{num_words} ) * 100;
612              
613 1         3 $text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
614              
615             $text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
616 1         4 (84.6 * $text->{syllables_per_word});
617              
618             $text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
619 1         5 (0.39 * $text->{words_per_sentence}) - 15.59;
620             }
621             else
622             {
623 0           $text->{words_per_sentence} = 0;
624 0           $text->{syllables_per_word} = 0;
625 0           $text->{num_complex_words} = 0;
626 0           $text->{fog} = 0;
627 0           $text->{flesch} = 0;
628 0           $text->{kincaid} = 0;
629             }
630             }
631             #------------------------------------------------------------------------------
632             return(1);