package Text::IQ;

use warnings;
use strict;
use Carp;
use Search::Tools::Tokenizer;
use Search::Tools::UTF8;
use Search::Tools::SpellCheck;
use Scalar::Util qw( openhandle );
use File::Basename;
use Data::Dump qw( dump );

our $VERSION = '0.005';

=head1 NAME

Text::IQ - naive intelligence about a body of text

=head1 SYNOPSIS

 use Text::IQ::EN;    # English text
 my $file = 'path/to/file';
 my $iq = Text::IQ::EN->new( $file );
 printf("Number of words: %d\n", $iq->num_words);
 printf("Avg word length: %d\n", $iq->word_length);
 printf("Number of sentences: %d\n", $iq->num_sentences);
 printf("Avg sentence length: %d\n", $iq->sentence_length);
 printf("Misspellings: %d\n", $iq->num_misspellings);
 printf("Grammar errors: %d\n", $iq->num_grammar_errors);

 # access internal Search::Tools::TokenList
 my $tokens = $iq->tokens;

=cut
=head1 METHODS

=head2 new( I<path/to/file> )

=head2 new( I<scalar_ref> )

Constructor method. Returns a Text::IQ object. The single argument
is either the path to a file or a reference to a simple scalar
string.
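
For example, either calling style should work; the file path and the
sample sentence below are only illustrative:

 # from a file on disk
 my $iq = Text::IQ::EN->new( 'path/to/file' );

 # from an in-memory string
 my $text = 'The quick brown fox jumps over the lazy dog.';
 my $iq2  = Text::IQ::EN->new( \$text );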
=cut
sub new {
    my $class = shift;
    my $self  = bless {
        num_words             => 0,
        total_word_length     => 0,
        num_sentences         => 0,
        total_sentence_length => 0,
        tmp_sent_len          => 0,
        num_syllables         => 0,
        num_complex_words     => 0,
    }, $class;
    my $text = shift;
    if ( !defined $text ) {
        croak "text required";
    }
    if ( ref $text eq 'SCALAR' ) {
        $self->{_text} = to_utf8($$text);
    }
    else {
        $self->{_text} = to_utf8( Search::Tools->slurp($text) );
    }
    my $tokenizer = Search::Tools::Tokenizer->new();
    $self->{_tokens}
        = $tokenizer->tokenize( $self->{_text}, sub { $self->_examine(@_) },
        );
    $self->{avg_word_length}
        = $self->{total_word_length} / $self->{num_words};
    $self->{avg_sentence_length}
        = $self->{total_sentence_length} / $self->{num_sentences};
    return $self;
}

# per-token callback for the tokenizer: updates the word, syllable
# and sentence counters that the public accessors report.
sub _examine {
    my $self  = shift;
    my $token = shift;
    $self->{num_words}++;
    $self->{total_word_length} += $token->u8len;
    my $syll = $self->get_num_syllables("$token");
    $self->{num_syllables} += $syll;
    if ( $syll > 2 and $token !~ m/\-/ ) {
        $self->{num_complex_words}++;
    }
    if ( $token->is_sentence_start ) {
        $self->{num_sentences}++;
        $self->{total_sentence_length} += $self->{tmp_sent_len};
        $self->{tmp_sent_len} = 0;
    }
    $self->{tmp_sent_len}++;
}
=head2 get_sentences

Wrapper around the L<Search::Tools::TokenList> as_sentences() method.
Passes through the same arguments as as_sentences().
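
A minimal call looks like this; the structure of the return value is
whatever as_sentences() produces, so see L<Search::Tools::TokenList>
for details:

 my $sentences = $iq->get_sentences();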
=head2 num_words

Returns the number of words in the text.

=head2 num_sentences

Returns the number of sentences in the text.

=head2 avg_word_length

Returns the average number of characters in each word.

=head2 avg_sentence_length

Returns the average length of each sentence.

=head2 num_complex_words

Returns the number of words with more than 2 syllables.

=head2 num_syllables

Returns the total number of syllables in the text.

=cut

sub get_sentences       { shift->{_tokens}->as_sentences(@_) }
sub num_words           { shift->{num_words} }
sub num_sentences       { shift->{num_sentences} }
sub avg_word_length     { shift->{avg_word_length} }
sub avg_sentence_length { shift->{avg_sentence_length} }
sub num_complex_words   { shift->{num_complex_words} }
sub num_syllables       { shift->{num_syllables} }

# see http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test
# and http://www.plainlanguage.com/Resources/readability.html
# and Lingua::EN::Fathom

=head2 flesch

Returns the Flesch score per L<Lingua::EN::Fathom>.
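
As implemented below, the score is computed from the num_words,
num_sentences and num_syllables counts gathered while tokenizing:

 206.835 - ( 1.015 * ( num_words / num_sentences ) )
         - ( 84.6  * ( num_syllables / num_words ) )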
=cut

sub flesch {
    my $self = shift;
    return
          206.835
        - ( 1.015 * ( $self->{num_words} / $self->{num_sentences} ) )
        - ( 84.6 * ( $self->{num_syllables} / $self->{num_words} ) );
}
=head2 fog

Returns the Fog score per L<Lingua::EN::Fathom>.
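
As implemented below, the score combines average sentence length with
the proportion of complex words:

 ( ( num_words / num_sentences )
   + ( ( num_complex_words / num_words ) * 100 ) ) * 0.4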
=cut

sub fog {
    my $self = shift;
    return ( ( $self->{num_words} / $self->{num_sentences} )
        + ( ( $self->{num_complex_words} / $self->{num_words} ) * 100 ) )
        * 0.4;
}
=head2 kincaid

Returns the Kincaid score per L<Lingua::EN::Fathom>.
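
As implemented below, the score is:

 ( 11.8 * ( num_syllables / num_words ) )
   + ( 0.39 * ( num_words / num_sentences ) ) - 15.59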
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=cut |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub kincaid { |
175
|
|
|
|
|
|
|
my $self = shift; |
176
|
|
|
|
|
|
|
return ( 11.8 * ( $self->{num_syllables} / $self->{num_words} ) ) |
177
|
|
|
|
|
|
|
+ ( 0.39 * ( $self->{num_words} / $self->{num_sentences} ) ) - 15.59; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
1; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
__END__ |