File Coverage

lib/Lingua/EN/Fathom.pm
Criterion Covered Total %
statement 102 167 61.0
branch 18 32 56.2
condition 3 9 33.3
subroutine 17 28 60.7
pod 18 20 90.0
total 158 256 61.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::Fathom - Measure readability of English text
4            
5             =head1 SYNOPSIS
6            
7             use Lingua::EN::Fathom;
8            
9             my $text = Lingua::EN::Fathom->new();
10            
11             $text->analyse_file("sample.txt"); # Analyse contents of a text file
12            
13             $accumulate = 1;
14             my $text_string = q{
15             Returns the number of words in the analysed text file or block. A word must
16             consist of letters a-z with at least one vowel sound, and optionally an
17             apostrophe or hyphen.
18            
19             ##########################################
20             Items such as "&, K108, NSW" are not counted as words.
21             Common abbreviations such a U.S. or numbers like 1.23 will not denote the end of
22             a sentence.
23             };
24            
25             $text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
26            
27             print($text->report); # Create a formatted report
28            
29             Number of characters : 312
30             Number of words : 54
31             Percent of complex words : 7.41
32             Average syllables per word : 1.4259
33             Number of sentences : 4
34             Average words per sentence : 13.5000
35             Number of text lines : 6
36             Number of non-text lines : 1
37             Number of blank lines : 2
38             Number of paragraphs : 2
39            
40             READABILITY INDICES
41            
42             Fog : 8.3630
43             Flesch : 72.4992
44             Flesch-Kincaid : 6.5009
45            
46             # Methods to return statistics on the analysed text
47             $text->num_chars;
48             $text->num_words;
49             $text->percent_complex_words;
50             $text->num_sentences;
51             $text->num_text_lines;
52             $text->num_non_text_lines;
53             $text->num_blank_lines; # trailing EOLs are ignored
54             $text->num_paragraphs;
55             $text->syllables_per_word;
56             $text->words_per_sentence;
57             $text->unique_words;
58             $text->fog;
59             $text->flesch;
60             $text->kincaid;
61            
62             # get a hash of unique words, keyed by word and occurrence as the value
63             $text->unique_words
64            
65             # Print a list of unique words
66             %words = $text->unique_words;
67             foreach $word ( sort keys %words )
68             {
69             print("$words{$word} :$word\n");
70             }
71            
72             =head1 REQUIRES
73              
74             Lingua::EN::Syllable, Lingua::EN::Sentence
75              
76              
77             =head1 DESCRIPTION
78              
79             This module analyses English text in either a string or file. Totals are
80             then calculated for the number of characters, words, sentences, blank
81             and non blank (text) lines and paragraphs.
82              
83             Three common readability statistics are also derived, the Fog, Flesch and
84             Kincaid indices.
85              
86             All of these properties can be accessed through individual methods, or by
87             generating a text report.
88              
89             A hash of all unique words and the number of times they occur is generated.
90              
91              
92             =head1 METHODS
93              
94             =head2 new
95              
96             The C<new> method creates an instance of a text object. This must be called
97             before any of the following methods are invoked. Note that the object only
98             needs to be created once, and can be reused with new input data.
99              
100             my $text = Lingua::EN::Fathom->new();
101              
102             =head2 analyse_file
103              
104             The C<analyse_file> method takes as input the name of a text file. Various
105             text based statistics are calculated for the file. This method and
106             C<analyse_block> are prerequisites for all the following methods. An optional
107             argument may be supplied to control accumulation of statistics. If set to
108             a non zero value, all statistics are accumulated with each successive call.
109              
110             $text->analyse_file("sample.txt");
111              
112              
113             =head2 analyse_block
114              
115             The C<analyse_block> method takes as input a text string. Various
116             text based statistics are calculated for the string. This method and
117             C<analyse_file> are prerequisites for all the following methods. An optional
118             argument may be supplied to control accumulation of statistics. If set to
119             a non zero value, all statistics are accumulated with each successive call.
120              
121             $text->analyse_block($text_str,$accumulate);
122              
123             =head2 num_chars
124              
125             Returns the number of characters in the analysed text file or block. This
126             includes characters such as spaces, and punctuation marks.
127              
128             =head2 num_words
129              
130             Returns the number of words in the analysed text file or block. A word must
131             consist of letters a-z with at least one vowel sound, and optionally an
132             apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
133              
134             =head2 percent_complex_words
135              
136             Returns the percentage of complex words in the analysed text file or block. A
137             complex word must consist of three or more syllables. This statistic is used to
138             calculate the fog index.
139              
140             =head2 num_sentences
141              
142             Returns the number of sentences in the analysed text file or block. A sentence
143             is any group of words and non words terminated with a single full stop. Spaces
144             may occur before and after the full stop.
145              
146             =head2 num_text_lines
147              
148             Returns the number of lines containing some text in the analysed
149             text file or block.
150              
151             =head2 num_non_text_lines
152              
153             Returns the number of lines containing no text in the analysed
154             text file or block.
155              
156             =head2 num_blank_lines
157              
158             Returns the number of empty lines in the analysed
159             text file or block.
160              
161             =head2 num_paragraphs
162              
163             Returns the number of paragraphs in the analysed text file or block.
164              
165             =head2 syllables_per_word
166              
167             Returns the average number of syllables per word in the analysed
168             text file or block.
169              
170             =head2 words_per_sentence
171              
172             Returns the average number of words per sentence in the analysed
173             text file or block.
174              
175              
176             =head2 READABILITY
177              
178             Three indices of text readability are calculated. They all measure complexity as
179             a function of syllables per word and words per sentence. They assume the text is
180             well formed and logical. You could analyse a passage of nonsensical English and
181             find the readability is quite good, provided the words are not too complex and
182             the sentences not too long.
183              
184             For more information see: L
185              
186              
187             =head2 fog
188              
189             Returns the Fog index for the analysed text file or block.
190              
191             ( words_per_sentence + percent_complex_words ) * 0.4
192              
193             The Fog index, developed by Robert Gunning, is a well known and simple
194             formula for measuring readability. The index indicates the number of years
195             of formal education a reader of average intelligence would need to read the
196             text once and understand that piece of writing with its word sentence workload.
197              
198             18 unreadable
199             14 difficult
200             12 ideal
201             10 acceptable
202             8 childish
203              
204              
205             =head2 flesch
206              
207             Returns the Flesch reading ease score for the analysed text file or block.
208              
209             206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
210              
211             This score rates text on a 100 point scale. The higher the score, the easier
212             it is to understand the text. A score of 60 to 70 is considered to be optimal.
213              
214              
215             =head2 kincaid
216              
217             Returns the Flesch-Kincaid grade level score for the analysed text
218             file or block.
219              
220             (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
221              
222             This score rates text on U.S. grade school level. So a score of 8.0 means
223             that the document can be understood by an eighth grader. A score of 7.0 to
224             8.0 is considered to be optimal.
225              
226             =head2 unique_words
227              
228             Returns a hash of unique words. The words (in lower case) are held in
229             the hash keys while the number of occurrences are held in the hash values.
230              
231              
232             =head2 report
233              
234             print($text->report);
235              
236             Produces a text based report containing all Fathom statistics for
237             the currently analysed text block or file. For example:
238            
239             Number of characters : 813
240             Number of words : 135
241             Percent of complex words : 20.00
242             Average syllables per word : 1.7704
243             Number of sentences : 12
244             Average words per sentence : 11.2500
245             Number of text lines : 13
246             Number of non-text lines : 0
247             Number of blank lines : 8
248             Number of paragraphs : 4
249              
250              
251             READABILITY INDICES
252              
253             Fog : 12.5000
254             Flesch : 45.6429
255             Flesch-Kincaid : 9.6879
256              
257             The return value is a string containing the report contents
258              
259              
260             =head1 SEE ALSO
261              
262             L<Lingua::EN::Syllable>, L<Lingua::EN::Sentence>, L<B::Fathom>
263              
264              
265             =head1 POSSIBLE EXTENSIONS
266              
267             Count white space and punctuation characters
268             Allow user control over what strictly defines a word
269              
270             =head1 LIMITATIONS
271              
272             The syllable count provided in Lingua::EN::Syllable is about 90% accurate
273             Acronyms that contain vowels, like GPO, will be counted as words.
274             The fog index should exclude proper names
275              
276              
277              
278             =head1 AUTHOR
279              
280             Lingua::EN::Fathom was written by Kim Ryan.
281              
282             =head1 COPYRIGHT AND LICENSE
283              
284             Copyright (c) 2023 Kim Ryan. All rights reserved.
285              
286             This library is free software; you can redistribute it and/or modify
287             it under the same terms as Perl itself.
288              
289             =cut
290              
291             #------------------------------------------------------------------------------
292              
293             package Lingua::EN::Fathom;
294              
295 1     1   70598 use Lingua::EN::Syllable;
  1         506  
  1         54  
296 1     1   505 use Lingua::EN::Sentence;
  1         16485  
  1         52  
297 1     1   6 use strict;
  1         2  
  1         33  
298 1     1   5 use warnings;
  1         2  
  1         1842  
299              
300             our $VERSION = '1.26';
301              
302             #------------------------------------------------------------------------------
303             # Create a new instance of a text object.
304              
305             sub new
306             {
307 1     1 1 88 my $class = shift;
308              
309 1         3 my $text = {};
310 1         3 bless($text,$class);
311 1         4 $text = &_initialize($text);
312 1         2 return($text);
313             }
314             #------------------------------------------------------------------------------
315             # Analyse text stored in a file, reading from the file one line at a time
316              
317             sub analyse_file
318             {
319 0     0 1 0 my $text = shift;
320 0         0 my ($file_name,$accumulate) = @_;
321              
322 0 0       0 unless ( $accumulate )
323             {
324 0         0 $text = _initialize($text);
325             }
326              
327 0         0 $text->{file_name} = $file_name;
328              
329             # Only analyse non-empty text files
330 0 0 0     0 unless ( -T $file_name and -s $file_name )
331             {
332 0         0 return($text);
333             }
334              
335 0         0 open(IN_FH,"<$file_name");
336              
337 0         0 my $in_paragraph = 0;
338 0         0 my $all_text;
339 0         0 while ( <IN_FH> )
340             {
341 0         0 my $one_line = $_;
342 0         0 $all_text .= $one_line;
343 0         0 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
344             }
345 0         0 close(IN_FH);
346            
347 0         0 my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
348 0         0 $text->{num_sentences} += scalar(@$sentences);
349 0         0 $text->_calculate_readability;
350              
351 0         0 return($text);
352             }
353             #------------------------------------------------------------------------------
354             # Analyse a block of text, stored as a string. The string may contain line
355             # terminators.
356              
357             sub analyse_block
358             {
359 1     1 1 5 my $text = shift;
360 1         2 my ($block,$accumulate) = @_;
361              
362 1 50       4 unless ( $accumulate )
363             {
364 1         2 $text = _initialize($text);
365             }
366              
367 1 50       3 unless ( $block )
368             {
369 0         0 return($text);
370             }
371              
372 1         2 my $in_paragraph = 0;
373              
374             # Split on EOL character
375             # repeating trailing line terminators are stripped
376 1         17 my @all_lines = split(/\n/,$block);
377 1         3 my $one_line;
378 1         19 foreach $one_line ( @all_lines )
379             {
380 9         19 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
381             }
382            
383 1         5 my $sentences= Lingua::EN::Sentence::get_sentences($block);
384 1 50       3592 if (defined($sentences))
385             {
386 1         4 $text->{num_sentences} += scalar(@$sentences);
387             }
388            
389            
390 1         7 $text->_calculate_readability;
391            
392 1         5 return($text);
393             }
394             #------------------------------------------------------------------------------
395             sub num_chars
396             {
397 1     1 1 5 my $text = shift;
398 1         7 return($text->{num_chars});
399             }
400             #------------------------------------------------------------------------------
401             sub num_words
402             {
403 1     1 1 3 my $text = shift;
404 1         4 return($text->{num_words});
405             }
406             #------------------------------------------------------------------------------
407             sub percent_complex_words
408             {
409 0     0 1 0 my $text = shift;
410 0         0 return($text->{percent_complex_words});
411             }
412             #------------------------------------------------------------------------------
413             sub num_sentences
414             {
415 1     1 1 3 my $text = shift;
416 1         5 return($text->{num_sentences});
417             }
418             #------------------------------------------------------------------------------
419             sub num_text_lines
420             {
421 1     1 1 3 my $text = shift;
422 1         4 return($text->{num_text_lines});
423             }
424             #------------------------------------------------------------------------------
425             sub num_non_text_lines
426             {
427 1     1 1 2 my $text = shift;
428 1         5 return($text->{num_non_text_lines});
429             }
430             #------------------------------------------------------------------------------
431             sub num_blank_lines
432             {
433 1     1 1 2 my $text = shift;
434 1         4 return($text->{num_blank_lines});
435             }
436             #------------------------------------------------------------------------------
437             sub num_paragraphs
438             {
439 1     1 1 2 my $text = shift;
440 1         4 return($text->{num_paragraphs});
441             }
442             #------------------------------------------------------------------------------
443             sub syllables_per_word
444             {
445 0     0 1 0 my $text = shift;
446 0         0 return($text->{syllables_per_word});
447             }
448             #------------------------------------------------------------------------------
449             sub words_per_sentence
450             {
451 0     0 1 0 my $text = shift;
452 0         0 return($text->{words_per_sentence});
453             }
454             #------------------------------------------------------------------------------
455             # extra
456             sub num_syllables
457             {
458 0     0 0 0 my $text = shift;
459 0         0 return($text->{num_syllables});
460             }
461             #------------------------------------------------------------------------------
462             sub complex_words
463             {
464 0     0 0 0 my $text = shift;
465 0         0 return($text->{num_complex_words});
466             }
467             #------------------------------------------------------------------------------
468             sub fog
469             {
470 0     0 1 0 my $text = shift;
471 0         0 return($text->{fog});
472             }
473             #------------------------------------------------------------------------------
474             sub flesch
475             {
476 0     0 1 0 my $text = shift;
477 0         0 return($text->{flesch});
478             }
479             #------------------------------------------------------------------------------
480             sub kincaid
481             {
482 0     0 1 0 my $text = shift;
483 0         0 return($text->{kincaid});
484             }
485             #------------------------------------------------------------------------------
486             # Return a hash (flattened key/value list) of all the unique words in the
487             # analysed text. Each word's occurrence count is stored in the hash value.
488              
489             sub unique_words
490             {
491 0     0 1 0 my $text = shift;
492 0 0       0 if ( $text->{unique_words} )
493             {
494 0         0 return( %{ $text->{unique_words} } );
  0         0  
495             }
496             else
497             {
498 0         0 return(undef);
499             }
500             }
501             #------------------------------------------------------------------------------
502             # Provide a formatted text report of all statistics for a text object.
503             # Return report as a string.
504              
505             sub report
506             {
507 0     0 1 0 my $text = shift;
508 0         0 my $report = '';
509            
510              
511             $text->{file_name} and
512 0 0       0 $report .= sprintf("File name : %s\n",$text->{file_name} );
513              
514 0         0 $report .= sprintf("Number of characters : %d\n", $text->num_chars);
515 0         0 $report .= sprintf("Number of words : %d\n", $text->num_words);
516 0         0 $report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
517 0         0 $report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
518 0         0 $report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
519 0         0 $report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
520 0         0 $report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
521 0         0 $report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
522 0         0 $report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
523 0         0 $report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
524              
525 0         0 $report .= "\n\nREADABILITY INDICES\n\n";
526 0         0 $report .= sprintf("Fog : %.4f\n",$text->fog);
527 0         0 $report .= sprintf("Flesch : %.4f\n",$text->flesch);
528 0         0 $report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
529              
530 0         0 return($report);
531             }
532              
533             #------------------------------------------------------------------------------
534             # PRIVATE METHODS
535             #------------------------------------------------------------------------------
536             sub _initialize
537             {
538 2     2   5 my $text = shift;
539              
540 2         7 $text->{num_chars} = 0;
541 2         4 $text->{num_syllables} = 0;
542 2         2 $text->{num_words} = 0;
543 2         3 $text->{num_complex_words} = 0;
544 2         2 $text->{syllables_per_word} = 0;
545 2         4 $text->{words_per_sentence} = 0;
546 2         3 $text->{percent_complex_words} = 0;
547 2         4 $text->{num_text_lines} = 0;
548 2         3 $text->{num_non_text_lines} = 0;
549 2         3 $text->{num_blank_lines} = 0;
550 2         3 $text->{num_paragraphs} = 0;
551 2         3 $text->{num_sentences} = 0;
552 2         3 $text->{unique_words} = ();
553 2         3 $text->{file_name} = '';
554              
555 2         3 $text->{fog} = 0;
556 2         4 $text->{flesch} = 0;
557 2         3 $text->{kincaid} = 0;
558              
559 2         5 return($text);
560             }
561             #------------------------------------------------------------------------------
562             # Increment number of text lines, non-text lines, blank lines and paragraphs
563              
564             sub _analyse_line
565             {
566 9     9   11 my $text = shift;
567            
568 9         14 my ($one_line,$in_paragraph) = @_;
569 9 100       32 if ( $one_line =~ /\w/ )
    100          
    50          
570             {
571 6         9 chomp($one_line);
572 6         12 $text = _analyse_words($text,$one_line);
573 6         9 $text->{num_text_lines}++;
574            
575 6 100       11 unless ( $in_paragraph )
576             {
577 2         2 $text->{num_paragraphs}++;
578 2         3 $in_paragraph = 1;
579             }
580             }
581             elsif ($one_line eq '' ) # empty line
582             {
583 2         4 $text->{num_blank_lines}++;
584 2         2 $in_paragraph = 0;
585             }
586             elsif ($one_line =~ /^\W+$/ ) # non text
587             {
588 1         2 $text->{num_non_text_lines}++;
589 1         1 $in_paragraph = 0;
590             }
591 9         19 return($in_paragraph,$text);
592             }
593             #------------------------------------------------------------------------------
594             # Try to detect real words in line. Increment syllable, word, and complex word counters.
595              
596             sub _analyse_words
597             {
598 6     6   7 my $text = shift;
599 6         8 my ($one_line) = @_;
600              
601 6         10 $text->{num_chars} += length($one_line);
602              
603             # Word found, such as: twice, BOTH, a, I'd, non-plussed ..
604            
605             # Ignore words like 'Mr.', K12, &, X.Y.Z ...
606             # It could be argued that Mr. is a word, but this approach should detect most of the non words
607             # which have punctuation or numbers in them
608            
609 6         35 while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
610             {
611 57         112 my $one_word = $1;
612              
613             # Try to filter out acronyms and abbreviations by accepting
614             # words with a vowel sound. This won't work for GPO etc.
615 57 100       139 next unless $one_word =~ /[aeiouy]/i;
616              
617             # Test for valid hyphenated word like be-bop
618 55 100       93 if ( $one_word =~ /-/ )
619             {
620 1 50       7 next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
621             }
622              
623             # word frequency count
624 54         111 $text->{unique_words}{lc($one_word)}++;
625            
626 54         55 $text->{num_words}++;
627              
628             # Use subroutine from Lingua::EN::Syllable
629 54         95 my $num_syllables_current_word = syllable($one_word);
630 54         9746 $text->{num_syllables} += $num_syllables_current_word;
631              
632             # Required for Fog index, count non hyphenated words of 3 or more
633             # syllables. Should add check for proper names in here as well
634 54 100 66     237 if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
635             {
636 4         16 $text->{num_complex_words}++;
637             }
638             }
639              
640 6         12 return($text);
641             }
642             #------------------------------------------------------------------------------
643             # Determine the three readability indices
644              
645             sub _calculate_readability
646             {
647 1     1   2 my $text = shift;
648              
649 1 50 33     9 if ( $text->{num_sentences} and $text->{num_words} )
650             {
651 1         5 $text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
652 1         3 $text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
653             $text->{percent_complex_words} =
654 1         5 ( $text->{num_complex_words} / $text->{num_words} ) * 100;
655              
656 1         2 $text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
657              
658             $text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
659 1         5 (84.6 * $text->{syllables_per_word});
660              
661             $text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
662 1         3 (0.39 * $text->{words_per_sentence}) - 15.59;
663             }
664             else
665             {
666 0           $text->{words_per_sentence} = 0;
667 0           $text->{syllables_per_word} = 0;
668 0           $text->{num_complex_words} = 0;
669 0           $text->{fog} = 0;
670 0           $text->{flesch} = 0;
671 0           $text->{kincaid} = 0;
672             }
673             }
674             #------------------------------------------------------------------------------
675             return(1);