File Coverage

blib/lib/Text/WordCounter.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1	1	1	23730	use strict;
	1		2
	1		50
2	1	1	6	use warnings;
	1		2
	1		65
3
4				package Text::WordCounter;
5				{
6				$Text::WordCounter::VERSION = '0.001';
7				}
8
9	1	1	1017	use namespace::autoclean;
	1		30995
	1		8
10	1	1	647	use Moose;
	0
	0
11
12				use Lingua::ZH::MMSEG;
13				use Unicode::UCD qw(charinfo);
14				use URI::Find;
15				use Lingua::Stem;
16
17				has stemming => (is => 'rw', isa => 'Int', default => 0);
18				has stopwords => (is => 'ro', isa => 'HashRef', default => sub { {} });
19
20				sub is_stop_word {
21				my( $self, $word, $script ) = @_;
22				return 0 if( $script eq 'Han' );
23				return 1 if exists $self->stopwords->{lc $word};
24				return length($word) <= 3;
25				}
26
27				sub normalize {
28				my ($self, $word) = @_;
29
30				if ($self->stemming) {
31				my $stemmed = Lingua::Stem::stem($word)->[0];
32				if ($stemmed ne '') {
33				return $stemmed;
34				}
35				}
36				return lc $word
37				}
38
39				my %char_cache = ();
40				sub split_scripts {
41				my ( $self, $text ) = @_;
42				my @parts;
43				while ( $text =~ /(\X)/g ) {
44				my $part = $1;
45				my $pos = pos( $text );
46				my $ord = ord $part;
47
48				unless ($char_cache{$ord}) {
49				if (scalar(keys(%char_cache)) > 5000) {
50				# XXX: Some LRU cache would be more appropriate, but this cleaning
51				# will probably happen very rarely or never, so there's (hopefully) no
52				# need to bother about it too much
53				undef %char_cache;
54				}
55
56				$char_cache{$ord} = charinfo($ord);
57				}
58				my $charinfo = $char_cache{$ord};
59
60				if( ! defined $charinfo ){
61				warn "$1 does not look like good UTF8 - no charinfo";
62				next;
63				}
64				my $script = $charinfo->{script};
65				if( ! defined $script ){
66				warn "$1 does not look like good UTF8 - no script";
67				next;
68				}
69				next if $script eq 'Common';
70				$text=~ /((\p{$script}\|[-0-9:])*)/g;
71				$part .= $1;
72				push @parts, { text => $part, script => $script };
73				}
74				# warn join ' \| ', map { $_->{text} } @parts;
75				return @parts;
76				}
77
78				sub word_count {
79				my ( $self, $text, $features ) = @_;
80				$features \|\|= {};
81				for my $part ( $self->split_scripts( $text ) ){
82				my @words = ( $part->{text} );
83				if( $part->{script} eq 'Han' ){
84				@words = mmseg( $part->{text} );
85				}
86				for my $word ( @words ){
87				next if $self->is_stop_word( $word, $part->{script} );
88				$features->{ $self->normalize( $word ) }++;
89				}
90				}
91				return $features;
92				}
93
94
95				__PACKAGE__->meta()->make_immutable();
96
97				1;
98
99				# ABSTRACT: counting words in multilingual texts
100
101				__END__
102
103				=pod
104
105				=head1 NAME
106
107				Text::WordCounter - counting words in multilingual texts
108
109				=head1 VERSION
110
111				version 0.001
112
113				=head1 SYNOPSIS
114
115				my $counter = Text::WordCounter->new();
116
117				my $word_count = $counter->word_count( $text )
118
119				=head1 DESCRIPTION
120
121				It is quite heuristic, for example '-' and digits inside word characters
122				are treated as a word character, see the tests to find out how all the special
123				cases are resolved,
124
125				The features parameter should be a hashref and is an accumulator for found
126				features.
127
128				=head1 ATTRIBUTES
129
130				=head2 stemming
131
132				If set stemming via Lingua::Stem is performed on the words.
133				We never managed to make it sanely in multilingual texts.
134
135				=head2 stopwords
136
137				A hashref with words to discard.
138
139				=head1 INSTANCE METHODS
140
141				=head2 C<is_stop_word>
142
143				=head2 C<normalize>
144
145				Lowercases words and stemms them if the C<stemming> attribute is true.
146
147				=head2 C<split_scripts>
148
149				=head2 C<word_count>
150
151				Returns a hashref with word counts.
152
153				=head1 LIMITATIONS
154
155				From languages that don't use spaces only Chinese is currently supported
156				(using Lingua::ZH::MMSEG).
157
158				=head1 SEE ALSO
159
160				__END__
161
162				=head1 AUTHORS
163
164				=over 4
165
166				=item *
167
168				Zbigniew Lukasiak <zlukasiak@opera.com>
169
170				=item *
171
172				Tadeusz SoÅ›nierz, tsosnierz@opera.com
173
174				=back
175
176				=head1 COPYRIGHT AND LICENSE
177
178				This software is Copyright (c) 2012 by Opera Software ASA.
179
180				This is free software, licensed under:
181
182				The Artistic License 2.0 (GPL Compatible)
183
184				=cut