File Coverage

blib/lib/Text/TFIDF/Ngram.pm

Criterion	Covered	Total	%
statement	76	76	100.0
branch	23	24	95.8
condition	9	11	81.8
subroutine	13	13	100.0
pod	4	5	80.0
total	125	129	96.9

line	stmt	bran	cond	sub	pod	time	code
1							package Text::TFIDF::Ngram;
2							our $AUTHORITY = 'cpan:GENE';
3
4							# ABSTRACT: Compute the TF-IDF measure for ngram phrases
5
6							our $VERSION = '0.0509';
7
8	1			1		1048	use strictures 2;
	1					1389
	1					34
9	1			1		169	use Carp;
	1					13
	1					51
10	1			1		397	use Lingua::EN::Ngram;
	1					1142
	1					29
11	1			1		357	use Lingua::StopWords qw( getStopWords );
	1					282
	1					51
12	1			1		8	use List::Util qw( sum0 );
	1					1
	1					78
13	1			1		531	use Moo;
	1					9426
	1					4
14	1			1		1572	use namespace::clean;
	1					9276
	1					7
15
16
17							has files => (
18							is => 'ro',
19							isa => sub { croak 'Invalid files list' unless ref $_[0] eq 'ARRAY' },
20							);
21
22
23							has size => (
24							is => 'ro',
25							isa => sub { croak 'Invalid integer' unless $_[0] && $_[0] =~ /^\d+$/ && $_[0] > 0 },
26							default => sub { 1 },
27							);
28
29
30							has stopwords => (
31							is => 'ro',
32							isa => sub { croak 'Invalid Boolean' unless defined $_[0] },
33							default => sub { 1 },
34							);
35
36
37							has punctuation => (
38							is => 'ro',
39							default => sub { qr/(?!')[[:punct:]]/ },
40							);
41
42
43							has lowercase => (
44							is => 'ro',
45							default => sub { 0 },
46							);
47
48
49							has counts => (
50							is => 'ro',
51							init_arg => undef,
52							);
53
54
55							has file_tfidf => (
56							is => 'ro',
57							init_arg => undef,
58							);
59
60
61							sub BUILD {
62	8			8	0	69	my ( $self, $args ) = @_;
63
64	8	100				21	return unless $args->{files};
65
66	7		50			17	$args->{size} \|\|= 1;
67
68	7					9	for my $file ( @{ $args->{files} } ) {
	7					14
69	11					23	$self->_process_ngrams( $file, $args->{size} );
70							}
71							}
72
73							sub _process_ngrams {
74	11			11		21	my ( $self, $file, $size ) = @_;
75
76	11					36	my $ngram = Lingua::EN::Ngram->new( file => $file );
77	11					1108	my $phrase = $ngram->ngram($size);
78
79	11	100				1167	if ( $self->lowercase ) {
80	2					8	$phrase = { map { lc $_ => $phrase->{$_} } keys %$phrase };
	25					75
81							}
82
83	11					37	my $stop = getStopWords('en');
84
85	11					53179	my $counts;
86
87	11					45	for my $p ( keys %$phrase ) {
88							next if $self->stopwords
89	91	100	100			184	&& grep { $stop->{$_} } split /\s+/, $p; # Exclude stopwords
	8					26
90
91	86					106	my $pat = $self->punctuation;
92	86	100				248	$p =~ s/$pat//g if $pat; # Remove unwanted punctuation
93
94							# Skip if we don't have an ngram of the requested size anymore
95	86					203	my @p = grep { $_ } split /\s+/, $p;
	142					247
96	86	100				169	next unless @p == $size;
97
98							# Skip an ngram with a lone single quote (allowed by the default punctuation)
99	67	50				77	next if grep { $_ eq "'" } @p;
	123					196
100
101	67	100				116	$p = lc $p if $self->lowercase;
102
103	67					155	$counts->{$p} = $phrase->{$p};
104							}
105
106	11					230	$self->{counts}{$file} = $counts;
107							}
108
109
110							sub tf {
111	15			15	1	4653	my ( $self, $file, $phrase ) = @_;
112	15	100	66			73	return 0 unless exists $self->counts->{$file} && exists $self->counts->{$file}{$phrase};
113	14					29	return $self->counts->{$file}{$phrase} / sum0( values %{ $self->counts->{$file} } );
	14					103
114							}
115
116
117							sub idf {
118	20			20	1	2745	my ( $self, $phrase ) = @_;
119
120	20					27	my $count = 0;
121
122	20					23	for my $file ( keys %{ $self->counts } ) {
	20					53
123	40	100				81	$count++ if exists $self->counts->{$file}{$phrase};
124							}
125
126	20	100				55	unless ( $count ) {
127	2					234	carp "'$phrase' is not present in any document";
128	2					150	return undef;
129							}
130
131	18					21	return - log( $count / keys %{ $self->counts } ) / log(10) + 0;
	18					91
132							}
133
134
135							sub tfidf {
136	14			14	1	2518	my ( $self, $file, $phrase ) = @_;
137	14					20	my $idf = $self->idf($phrase);
138	14	100				31	return undef unless $idf;
139	8					15	return $self->tf( $file, $phrase ) * $idf;
140							}
141
142
143							sub tfidf_by_file {
144	1			1	1	327	my ($self) = @_;
145
146	1					3	my %seen;
147
148	1					2	for my $file ( keys %{ $self->counts } ) {
	1					5
149	2					3	for my $phrase ( keys %{ $self->counts->{$file} } ) {
	2					6
150	8					13	my $tfidf = $self->tfidf( $file, $phrase );
151
152	8	100	100			27	next if $seen{$phrase}++ \|\| !defined $tfidf;
153
154	4					9	$self->{file_tfidf}{$file}{$phrase} = $tfidf;
155							}
156							}
157
158	1					4	return $self->file_tfidf;
159							}
160
161							1;
162
163							__END__