File Coverage

lib/Lingua/EN/Fathom.pm
Criterion Covered Total %
statement 102 167 61.0
branch 18 32 56.2
condition 3 9 33.3
subroutine 17 28 60.7
pod 18 20 90.0
total 158 256 61.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::Fathom - Measure readability of English text
4            
5             =head1 SYNOPSIS
6            
7             use Lingua::EN::Fathom;
8            
9             my $text = Lingua::EN::Fathom->new();
10            
11             $text->analyse_file("sample.txt"); # Analyse contents of a text file
12            
13             $accumulate = 1;
14             my $text_string = q{
15             Returns the number of words in the analysed text file or block. A word must
16             consist of letters a-z with at least one vowel sound, and optionally an
17             apostrophe or hyphen.
18            
19             ##########################################
20             Items such as "&, K108, NSW" are not counted as words.
21             Common abbreviations such a U.S. or numbers like 1.23 will not denote the end of
22             a sentence.
23             };
24            
25             $text->analyse_block($text_string,$accumulate); # Analyse contents of a text string
26            
27             print($text->report); # Create a formatted report
28            
29             Number of characters : 312
30             Number of words : 54
31             Percent of complex words : 7.41
32             Average syllables per word : 1.4259
33             Number of sentences : 4
34             Average words per sentence : 13.5000
35             Number of text lines : 6
36             Number of non-text lines : 1
37             Number of blank lines : 2
38             Number of paragraphs : 2
39            
40             READABILITY INDICES
41            
42             Fog : 8.3630
43             Flesch : 72.4992
44             Flesch-Kincaid : 6.5009
45            
46             # Methods to return statistics on the analysed text
47             $text->num_chars;
48             $text->num_words;
49             $text->percent_complex_words;
50             $text->num_sentences;
51             $text->num_text_lines;
52             $text->num_non_text_lines;
53             $text->num_blank_lines; # trailing EOLs are ignored
54             $text->num_paragraphs;
55             $text->syllables_per_word;
56             $text->words_per_sentence;
57             $text->unique_words;
58             $text->fog;
59             $text->flesch;
60             $text->kincaid;
61            
62             # get a hash of unique words, keyed by word and occurrence as the value
63             $text->unique_words
64            
65             # Print a list of unique words
66             %words = $text->unique_words;
67             foreach $word ( sort keys %words )
68             {
69             print("$words{$word} :$word\n");
70             }
71            
72             =head1 REQUIRES
73              
74             Lingua::EN::Syllable, Lingua::EN::Sentence
75              
76              
77             =head1 DESCRIPTION
78              
79             This module analyses English text in either a string or file. Totals are
80             then calculated for the number of characters, words, sentences, blank
81             and non blank (text) lines and paragraphs.
82              
83             Three common readability statistics are also derived, the Fog, Flesch and
84             Kincaid indices.
85              
86             All of these properties can be accessed through individual methods, or by
87             generating a text report.
88              
89             A hash of all unique words and the number of times they occur is generated.
90              
91              
92             =head1 METHODS
93              
94             =head2 new
95              
96             The C method creates an instance of an text object This must be called
97             before any of the following methods are invoked. Note that the object only
98             needs to be created once, and can be reused with new input data.
99              
100             my $text = Lingua::EN::Fathom->new();
101              
102             =head2 analyse_file
103              
104             The C method takes as input the name of a text file. Various
105             text based statistics are calculated for the file. This method and
106             C are prerequisites for all the following methods. An optional
107             argument may be supplied to control accumulation of statistics. If set to
108             a non zero value, all statistics are accumulated with each successive call.
109              
110             $text->analyse_file("sample.txt");
111              
112              
113             =head2 analyse_block
114              
115             The C method takes as input a text string. Various
116             text based statistics are calculated for the file. This method and
117             C are prerequisites for all the following methods. An optional
118             argument may be supplied to control accumulation of statistics. If set to
119             a non zero value, all statistics are accumulated with each successive call.
120              
121             $text->analyse_block($text_str,$accumulate);
122              
123             =head2 num_chars
124              
125             Returns the number of characters in the analysed text file or block. This
126             includes characters such as spaces, and punctuation marks.
127              
128             =head2 num_words
129              
130             Returns the number of words in the analysed text file or block. A word must
131             consist of letters a-z with at least one vowel sound, and optionally an
132             apostrophe or hyphen. Items such as "&, K108, NW" are not counted as words.
133              
134             =head2 percent_complex_words
135              
136             Returns the percentage of complex words in the analysed text file or block. A
137             complex word must consist of three or more syllables. This statistic is used to
138             calculate the fog index.
139              
140             =head2 num_sentences
141              
142             Returns the number of sentences in the analysed text file or block. A sentence
143             is any group of words and non words terminated with a single full stop. Spaces
144             may occur before and after the full stop.
145              
146             =head2 num_text_lines
147              
148             Returns the number of lines containing some text in the analysed
149             text file or block.
150              
151             =head2 num_non_text_lines
152              
153             Returns the number of lines containing no text in the analysed
154             text file or block.
155              
156             =head2 num_blank_lines
157              
158             Returns the number of empty lines in the analysed
159             text file or block.
160              
161             =head2 num_paragraphs
162              
163             Returns the number of paragraphs in the analysed text file or block.
164              
165             =head2 syllables_per_word
166              
167             Returns the average number of syllables per word in the analysed
168             text file or block.
169              
170             =head2 words_per_sentence
171              
172             Returns the average number of words per sentence in the analysed
173             text file or block.
174              
175              
176             =head2 READABILITY
177              
178             Three indices of text readability are calculated. They all measure complexity as
179             a function of syllables per word and words per sentence. They assume the text is
180             well formed and logical. You could analyse a passage of nonsensical English and
181             find the readability is quite good, provided the words are not too complex and
182             the sentences not too long.
183              
184             For more information see: L
185              
186              
187             =head2 fog
188              
189             Returns the Fog index for the analysed text file or block.
190              
191             ( words_per_sentence + percent_complex_words ) * 0.4
192              
193             The Fog index, developed by Robert Gunning, is a well known and simple
194             formula for measuring readability. The index indicates the number of years
195             of formal education a reader of average intelligence would need to read the
196             text once and understand that piece of writing with its word sentence workload.
197              
198             18 unreadable
199             14 difficult
200             12 ideal
201             10 acceptable
202             8 childish
203              
204              
205             =head2 flesch
206              
207             Returns the Flesch reading ease score for the analysed text file or block.
208              
209             206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
210              
211             This score rates text on a 100 point scale. The higher the score, the easier
212             it is to understand the text. A score of 60 to 70 is considered to be optimal.
213              
214              
215             =head2 kincaid
216              
217             Returns the Flesch-Kincaid grade level score for the analysed text
218             file or block.
219              
220             (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59;
221              
222             This score rates text on U.S. grade school level. So a score of 8.0 means
223             that the document can be understood by an eighth grader. A score of 7.0 to
224             8.0 is considered to be optimal.
225              
226             =head2 unique_words
227              
228             Returns a hash of unique words. The words (in lower case) are held in
229             the hash keys while the number of occurrences are held in the hash values.
230              
231              
232             =head2 report
233              
234             print($text->report);
235              
236             Produces a text based report containing all Fathom statistics for
237             the currently analysed text block or file. For example:
238            
239             Number of characters : 813
240             Number of words : 135
241             Percent of complex words : 20.00
242             Average syllables per word : 1.7704
243             Number of sentences : 12
244             Average words per sentence : 11.2500
245             Number of text lines : 13
246             Number of non text lines : 0
247             Number of blank lines : 8
248             Number of paragraphs : 4
249              
250              
251             READABILITY INDICES
252              
253             Fog : 12.5000
254             Flesch : 45.6429
255             Flesch-Kincaid : 9.6879
256              
257             The return value is a string containing the report contents
258              
259              
260             =head1 SEE ALSO
261              
262             L,L,L
263              
264              
265             =head1 POSSIBLE EXTENSIONS
266              
267             Count white space and punctuation characters
268             Allow user control over what strictly defines a word
269              
270             =head1 LIMITATIONS
271              
272             The syllable count provided in Lingua::EN::Syllable is about 90% accurate
273             Acronyms that contain vowels, like GPO, will be counted as words.
274             The fog index should exclude proper names
275              
276              
277              
278             =head1 AUTHOR
279              
280             Lingua::EN::Fathom was written by Kim Ryan .
281              
282             =head1 COPYRIGHT AND LICENSE
283              
284             Copyright (c) 2023 Kim Ryan. All rights reserved.
285              
286             This library is free software; you can redistribute it and/or modify
287             it under the same terms as Perl itself.
288              
289             =cut
290              
291             #------------------------------------------------------------------------------
292              
293             package Lingua::EN::Fathom;
294              
295 1     1   74672 use Lingua::EN::Syllable;
  1         523  
  1         61  
296 1     1   548 use Lingua::EN::Sentence;
  1         16850  
  1         51  
297 1     1   7 use strict;
  1         2  
  1         34  
298 1     1   7 use warnings;
  1         1  
  1         1907  
299              
300             our $VERSION = '1.27';
301              
302             #------------------------------------------------------------------------------
303             # Create a new instance of a text object.
304              
305             sub new
306             {
307 1     1 1 87 my $class = shift;
308              
309 1         2 my $text = {};
310 1         2 bless($text,$class);
311 1         3 $text = &_initialize($text);
312 1         3 return($text);
313             }
314             #------------------------------------------------------------------------------
315             # Analyse text stored in a file, reading from the file one line at a time
316              
317             sub analyse_file
318             {
319 0     0 1 0 my $text = shift;
320 0         0 my ($file_name,$accumulate) = @_;
321              
322 0 0       0 unless ( $accumulate )
323             {
324 0         0 $text = _initialize($text);
325             }
326              
327 0         0 $text->{file_name} = $file_name;
328              
329             # Only analyse non-empty text files
330 0 0 0     0 unless ( -T $file_name and -s $file_name )
331             {
332 0         0 return($text);
333             }
334              
335 0         0 open(IN_FH,"<$file_name");
336              
337 0         0 my $in_paragraph = 0;
338 0         0 my $all_text;
339 0         0 while ( )
340             {
341 0         0 my $one_line = $_;
342 0         0 $all_text .= $one_line;
343 0         0 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
344             }
345 0         0 close(IN_FH);
346            
347 0         0 my $sentences= Lingua::EN::Sentence::get_sentences($all_text);
348 0         0 $text->{num_sentences} += scalar(@$sentences);
349 0         0 $text->_calculate_readability;
350              
351 0         0 return($text);
352             }
353             #------------------------------------------------------------------------------
354             # Analyse a block of text, stored as a string. The string may contain line
355             # terminators.
356              
357             sub analyse_block
358             {
359 1     1 1 6 my $text = shift;
360 1         4 my ($block,$accumulate) = @_;
361              
362 1 50       4 unless ( $accumulate )
363             {
364 1         2 $text = _initialize($text);
365             }
366              
367 1 50       4 unless ( $block )
368             {
369 0         0 return($text);
370             }
371              
372 1         1 my $in_paragraph = 0;
373              
374             # Split on EOL character
375             # repeating trailing line terminators are stripped
376 1         16 my @all_lines = split(/\n/,$block);
377 1         3 my $one_line;
378 1         24 foreach $one_line ( @all_lines )
379             {
380 9         19 ($in_paragraph,$text) = _analyse_line($text,$one_line,$in_paragraph);
381             }
382            
383 1         9 my $sentences= Lingua::EN::Sentence::get_sentences($block);
384 1 50       3727 if (defined($sentences))
385             {
386 1         4 $text->{num_sentences} += scalar(@$sentences);
387             }
388            
389            
390 1         6 $text->_calculate_readability;
391            
392 1         4 return($text);
393             }
394             #------------------------------------------------------------------------------
395             sub num_chars
396             {
397 1     1 1 5 my $text = shift;
398 1         4 return($text->{num_chars});
399             }
400             #------------------------------------------------------------------------------
401             sub num_words
402             {
403 1     1 1 3 my $text = shift;
404 1         3 return($text->{num_words});
405             }
406             #------------------------------------------------------------------------------
407             sub percent_complex_words
408             {
409 0     0 1 0 my $text = shift;
410 0         0 return($text->{percent_complex_words});
411             }
412             #------------------------------------------------------------------------------
413             sub num_sentences
414             {
415 1     1 1 2 my $text = shift;
416 1         4 return($text->{num_sentences});
417             }
418             #------------------------------------------------------------------------------
419             sub num_text_lines
420             {
421 1     1 1 4 my $text = shift;
422 1         4 return($text->{num_text_lines});
423             }
424             #------------------------------------------------------------------------------
425             sub num_non_text_lines
426             {
427 1     1 1 3 my $text = shift;
428 1         3 return($text->{num_non_text_lines});
429             }
430             #------------------------------------------------------------------------------
431             sub num_blank_lines
432             {
433 1     1 1 3 my $text = shift;
434 1         4 return($text->{num_blank_lines});
435             }
436             #------------------------------------------------------------------------------
437             sub num_paragraphs
438             {
439 1     1 1 2 my $text = shift;
440 1         6 return($text->{num_paragraphs});
441             }
442             #------------------------------------------------------------------------------
443             sub syllables_per_word
444             {
445 0     0 1 0 my $text = shift;
446 0         0 return($text->{syllables_per_word});
447             }
448             #------------------------------------------------------------------------------
449             sub words_per_sentence
450             {
451 0     0 1 0 my $text = shift;
452 0         0 return($text->{words_per_sentence});
453             }
454             #------------------------------------------------------------------------------
455             sub num_syllables
456             {
457 0     0 0 0 my $text = shift;
458 0         0 return($text->{num_syllables});
459             }
460             #------------------------------------------------------------------------------
461             sub complex_words
462             {
463 0     0 0 0 my $text = shift;
464 0         0 return($text->{num_complex_words});
465             }
466             #------------------------------------------------------------------------------
467             sub fog
468             {
469 0     0 1 0 my $text = shift;
470 0         0 return($text->{fog});
471             }
472             #------------------------------------------------------------------------------
473             sub flesch
474             {
475 0     0 1 0 my $text = shift;
476 0         0 return($text->{flesch});
477             }
478             #------------------------------------------------------------------------------
479             sub kincaid
480             {
481 0     0 1 0 my $text = shift;
482 0         0 return($text->{kincaid});
483             }
484             #------------------------------------------------------------------------------
485             # Return anonymous hash of all the unique words in analysed text. The words
486             # occurrence count is stored in the hash value.
487              
488             sub unique_words
489             {
490 0     0 1 0 my $text = shift;
491 0 0       0 if ( $text->{unique_words} )
492             {
493 0         0 return( %{ $text->{unique_words} } );
  0         0  
494             }
495             else
496             {
497 0         0 return(undef);
498             }
499             }
500             #------------------------------------------------------------------------------
501             # Provide a formatted text report of all statistics for a text object.
502             # Return report as a string.
503              
504             sub report
505             {
506 0     0 1 0 my $text = shift;
507 0         0 my $report = '';
508            
509              
510             $text->{file_name} and
511 0 0       0 $report .= sprintf("File name : %s\n",$text->{file_name} );
512              
513 0         0 $report .= sprintf("Number of characters : %d\n", $text->num_chars);
514 0         0 $report .= sprintf("Number of words : %d\n", $text->num_words);
515 0         0 $report .= sprintf("Percent of complex words : %.2f\n",$text->percent_complex_words);
516 0         0 $report .= sprintf("Average syllables per word : %.4f\n",$text->syllables_per_word);
517 0         0 $report .= sprintf("Number of sentences : %d\n", $text->num_sentences);
518 0         0 $report .= sprintf("Average words per sentence : %.4f\n",$text->words_per_sentence);
519 0         0 $report .= sprintf("Number of text lines : %d\n", $text->num_text_lines);
520 0         0 $report .= sprintf("Number of non-text lines : %d\n", $text->num_non_text_lines);
521 0         0 $report .= sprintf("Number of blank lines : %d\n", $text->num_blank_lines);
522 0         0 $report .= sprintf("Number of paragraphs : %d\n", $text->num_paragraphs);
523              
524 0         0 $report .= "\n\nREADABILITY INDICES\n\n";
525 0         0 $report .= sprintf("Fog : %.4f\n",$text->fog);
526 0         0 $report .= sprintf("Flesch : %.4f\n",$text->flesch);
527 0         0 $report .= sprintf("Flesch-Kincaid : %.4f\n",$text->kincaid);
528              
529 0         0 return($report);
530             }
531              
532             #------------------------------------------------------------------------------
533             # PRIVATE METHODS
534             #------------------------------------------------------------------------------
535             sub _initialize
536             {
537 2     2   4 my $text = shift;
538              
539 2         8 $text->{num_chars} = 0;
540 2         3 $text->{num_syllables} = 0;
541 2         3 $text->{num_words} = 0;
542 2         3 $text->{num_complex_words} = 0;
543 2         3 $text->{syllables_per_word} = 0;
544 2         3 $text->{words_per_sentence} = 0;
545 2         2 $text->{percent_complex_words} = 0;
546 2         4 $text->{num_text_lines} = 0;
547 2         3 $text->{num_non_text_lines} = 0;
548 2         4 $text->{num_blank_lines} = 0;
549 2         3 $text->{num_paragraphs} = 0;
550 2         3 $text->{num_sentences} = 0;
551 2         2 $text->{unique_words} = ();
552 2         3 $text->{file_name} = '';
553              
554 2         3 $text->{fog} = 0;
555 2         4 $text->{flesch} = 0;
556 2         2 $text->{kincaid} = 0;
557              
558 2         3 return($text);
559             }
560             #------------------------------------------------------------------------------
561             # Increment number of text lines, blank lines and paragraphs
562              
563             sub _analyse_line
564             {
565 9     9   11 my $text = shift;
566            
567 9         15 my ($one_line,$in_paragraph) = @_;
568 9 100       35 if ( $one_line =~ /\w/ )
    100          
    50          
569             {
570 6         9 chomp($one_line);
571 6         10 $text = _analyse_words($text,$one_line);
572 6         7 $text->{num_text_lines}++;
573            
574 6 100       13 unless ( $in_paragraph )
575             {
576 2         3 $text->{num_paragraphs}++;
577 2         4 $in_paragraph = 1;
578             }
579             }
580             elsif ($one_line eq '' ) # empty line
581             {
582 2         3 $text->{num_blank_lines}++;
583 2         2 $in_paragraph = 0;
584             }
585             elsif ($one_line =~ /^\W+$/ ) # non text
586             {
587 1         1 $text->{num_non_text_lines}++;
588 1         1 $in_paragraph = 0;
589             }
590 9         22 return($in_paragraph,$text);
591             }
592             #------------------------------------------------------------------------------
593             # Try to detect real words in line. Increment syllable, word, and complex word counters.
594              
595             sub _analyse_words
596             {
597 6     6   6 my $text = shift;
598 6         7 my ($one_line) = @_;
599              
600 6         7 $text->{num_chars} += length($one_line);
601              
602             # Word found, such as: twice, BOTH, a, I'd, non-plussed ..
603            
604             # Ignore words like 'Mr.', K12, &, X.Y.Z ...
605             # It could be argued that Mr. is a word, but this approach should detect most of the non words
606             # which have punctuation or numbers in them
607            
608 6         35 while ( $one_line =~ /\b([a-z][-'a-z]*)\b/ig )
609             {
610 57         108 my $one_word = $1;
611              
612             # Try to filter out acronyms and abbreviations by accepting
613             # words with a vowel sound. This won't work for GPO etc.
614 57 100       123 next unless $one_word =~ /[aeiouy]/i;
615              
616             # Test for valid hyphenated word like be-bop
617 55 100       92 if ( $one_word =~ /-/ )
618             {
619 1 50       8 next unless $one_word =~ /[a-z]{2,}-[a-z]{2,}/i;
620             }
621              
622             # word frequency count
623 54         134 $text->{unique_words}{lc($one_word)}++;
624            
625 54         54 $text->{num_words}++;
626              
627             # Use subroutine from Lingua::EN::Syllable
628 54         88 my $num_syllables_current_word = syllable($one_word);
629 54         9857 $text->{num_syllables} += $num_syllables_current_word;
630              
631             # Required for Fog index, count non hyphenated words of 3 or more
632             # syllables. Should add check for proper names in here as well
633 54 100 66     233 if ( $num_syllables_current_word > 2 and $one_word !~ /-/ )
634             {
635 4         14 $text->{num_complex_words}++;
636             }
637             }
638              
639 6         12 return($text);
640             }
641             #------------------------------------------------------------------------------
642             # Determine the three readability indices
643              
644             sub _calculate_readability
645             {
646 1     1   2 my $text = shift;
647              
648 1 50 33     7 if ( $text->{num_sentences} and $text->{num_words} )
649             {
650 1         3 $text->{words_per_sentence} = $text->{num_words} / $text->{num_sentences};
651 1         2 $text->{syllables_per_word} = $text->{num_syllables} / $text->{num_words};
652             $text->{percent_complex_words} =
653 1         4 ( $text->{num_complex_words} / $text->{num_words} ) * 100;
654              
655 1         3 $text->{fog} = ( $text->{words_per_sentence} + $text->{percent_complex_words} ) * 0.4;
656              
657             $text->{flesch} = 206.835 - (1.015 * $text->{words_per_sentence}) -
658 1         3 (84.6 * $text->{syllables_per_word});
659              
660             $text->{kincaid} = (11.8 * $text->{syllables_per_word}) +
661 1         5 (0.39 * $text->{words_per_sentence}) - 15.59;
662             }
663             else
664             {
665 0           $text->{words_per_sentence} = 0;
666 0           $text->{syllables_per_word} = 0;
667 0           $text->{num_complex_words} = 0;
668 0           $text->{fog} = 0;
669 0           $text->{flesch} = 0;
670 0           $text->{kincaid} = 0;
671             }
672             }
673             #------------------------------------------------------------------------------
674             return(1);