File Coverage

blib/lib/Lingua/IdSplitter.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package Lingua::IdSplitter;
2				# ABSTRACT: split identifiers into words
3
4	4	4	82963	use strict;
	4		9
	4		101
5	4	4	21	use warnings;
	4		9
	4		112
6
7	4	4	4134	use Text::Aspell;
	0
	0
8				use LWP::Simple;
9				use String::CamelCase qw/decamelize/;
10				use File::ShareDir ':ALL';
11				use Data::Dumper;
12
13				sub new {
14				my ($class, @dicts) = @_;
15				my $self = bless({}, $class);
16
17				$self->{dicts} = [];
18				foreach (@dicts) {
19				if (ref($_) eq 'HASH') {
20				push @{$self->{dicts}}, $_;
21				}
22				if (ref($_) eq '') {
23				my $d = $self->_load_dict($_);
24				push @{$self->{dicts}}, $d;
25				}
26				}
27				$self->{speller} = Text::Aspell->new;
28
29				return $self;
30				}
31
32				sub _load_dict {
33				my ($self, $name) = @_;
34				$name .= '.csv' unless ($name =~ m/\.csv$/);
35
36				my $file;
37				$file = $name if (-e $name);
38				unless ($file) {
39				$file = "share/dictionaries/$name" if (-e "share/dictionaries/$name");
40				}
41				eval "require Lingua::IdSplitter;"; # XXX - be nice
42				unless ($file) {
43				$file = dist_file('Lingua-IdSplitter', "dictionaries/$name") unless $@;
44				}
45				unless ($file) {
46				print "$name not found";
47				exit;
48				}
49
50				my $words = {};
51				open F, '<', $file;
52				while () {
53				chomp;
54				my ($left, $right) = split /\s,\s/, $_;
55				$words->{lc $left} = lc $right;
56				}
57
58				return { weight=>0.6, words=>$words };
59				}
60
61				sub soft_split {
62				my ($self, $id) = @_;
63				$id = lc $id;
64				return () unless ($self->{speller} and $id);
65
66				# test if the id is a single word or abbreviation
67				my $test = $self->_valid_word($id);
68				if ($test and $test->{w} > 0) {
69				push @{$self->{explain_rank}}, "$test->{t}(<-$test->{s}) ---> $test->{w}\n" if ($test->{w} ne $test->{s});
70				return ($test);
71				}
72
73				# set initial values
74				$self->{full} = $id;
75				$self->{max} = length($id);
76				$self->{found} = {};
77				$self->{cand} = [];
78
79				# create possible words for each level
80				my @chars = split //, $id;
81				my $i = 0;
82				while ($i < length($id)) {
83				$self->{found}->{$i} = [$self->_find_words(join('', @chars[$i .. length($id)-1]))];
84				$i++;
85				}
86
87				# create list of possible candidates
88				foreach (@{$self->{found}->{0}}) {
89				$self->_find_next(length($_->{s}), $_);
90				}
91
92				# post-process candidates list
93				$self->_post_process;
94
95				# compute rank for each solution and sort by rank
96				my @rank;
97				foreach (@{$self->{cand}}) {
98				my $expr = $self->_calc_score($_);
99				my $score = eval $expr;
100				push @rank, {terms=>$_, expr=>$expr, score=>$score};
101				}
102				@rank = sort {$b->{score}<=>$a->{score}} @rank;
103				$self->{rank} = [@rank];
104
105				my $top = shift @rank;
106				push @{$self->{explain_rank}}, $self->_explain_rank();
107
108				return $top ? @{$top->{terms}} : ({s=>$self->{full},t=>$self->{full}});
109				}
110
111				sub _find_words {
112				my ($self, $term) = @_;
113				my @res;
114
115				my @chars = split //, $term;
116				my $left = '';
117				while (@chars) {
118				$left .= shift @chars;
119				push @res, $self->_valid_word($left) if ($self->_valid_word($left));
120				}
121
122				return @res;
123				}
124
125				sub _find_next {
126				my ($self, $lvl, @curr) = @_;
127
128				if ($lvl < $self->{max}) {
129				foreach (@{$self->{found}->{$lvl}}) {
130				$self->_find_next($lvl+length($_->{s}), @curr, $_);
131				}
132				}
133				else {
134				my @strs = map {$_->{s}} @curr;
135				push @{$self->{cand}}, [@curr] if (join('', @strs) eq $self->{full});
136				}
137				}
138
139				sub _calc_score {
140				my ($self,$cand) = @_;
141
142				my @mul = ();
143				my $max_len = 0;
144				foreach (@$cand) {
145				push @mul, '('.$_->{w}.'*'.($_->{s}?length($_->{s}):0).')';
146				$max_len = length($_->{t}) if length($_->{t})>$max_len;
147				}
148				my $expr = '('.join('', @mul).') '.$max_len.' / ('.scalar(@$cand).'*'.scalar(@$cand).')';
149				#my $expr = '('.join('', @mul).') / ('.scalar(@$cand).''.scalar(@$cand).')';
150
151				return $expr;
152				}
153
154				sub _valid_word {
155				my ($self, $word) = @_;
156
157				# consider number valid words
158				if ($word =~ m/^\d+$/) {
159				return {s=>$word,t=>$word,w=>0.3};
160				}
161
162				foreach my $d (@{$self->{dicts}}) {
163				foreach my $w (keys %{$d->{words}}) {
164				my $o = $w;
165				$w =~ s#/##g;
166
167				return {s=>$o,t=>$d->{words}->{$o},w=>$d->{weight}} if ($w eq $word);
168				}
169				}
170
171				if ($self->{speller}->check($word)) {
172				return {s=>$word,t=>$word,w=>0.3};
173				}
174
175				# word not found
176				return undef;
177				}
178
179				# experimental
180				sub _post_process {
181				my ($self) = @_;
182
183				foreach my $cand (@{$self->{cand}}) {
184				$self->_post_process_cand($cand, 0, 2);
185				$self->_post_process_cand($cand, 0, 3);
186				}
187				}
188
189				sub _post_process_cand {
190				my ($self, $cand, $offset, $n) = @_;
191				my @cand = @$cand;
192				return unless ( scalar(@cand)>=($offset+$offset+$n) );
193
194				my @bef = splice @cand, 0, $offset;
195				my (@itens) = splice @cand, $offset, $offset+$n;
196				foreach (@itens) { return unless $_; }
197				#return unless ($a and $b and $c);
198				if ( $self->_post_process_cand_verify(@itens) ) {
199				my $word = join('', map {$_->{s}} @itens);
200				my @new = ( @bef, {s=>$word,t=>$word,w=>0.1}, @cand );
201				push @{$self->{cand}}, [@new];
202				}
203
204				my @next = @$cand;
205				if ( scalar(@next) >= $n ) {
206				shift @next;
207				$self->_post_process_cand([@next],$offset+1,$n);
208				}
209				}
210
211				sub _post_process_cand_verify {
212				my ($self, @cand) = @_;
213
214				foreach (@cand) {
215				return 0 unless ($_ and length($_->{s})==1 and length($_->{t})==1);
216				}
217
218				return 1;
219				}
220
221				sub hard_split {
222				my ($self, $id) = @_;
223
224				my @first;
225				if ($id =~ m/[_\.\-\:]/) {
226				$id =~ s/^_+//g;
227				$id =~ s/_+$//g;
228
229				@first = split /[_\.\-\:]+/, $id;
230				push @{$self->{hard}}, {tech=>"'_' separator", terms=>[@first]};
231				}
232				push @first, $id unless @first;
233
234				my @res;
235				foreach my $i (@first) {
236				if ( ($i =~ m/[A-Z][a-z0-9]+(.?)[A-Z][a-z0-9]+/) or ($i =~ m/[a-z0-9]+(.?)[A-Z]/) ) { # FIXME CamelCase detection
237				my @snd = split /_/, decamelize($i);
238				@snd = map {lc} @snd;
239				push @res, @snd;
240				push @{$self->{hard}}, {tech=>'CamelCase', terms=>[@res]};
241				}
242				else {
243				push @res, $i;
244				}
245				}
246
247				my @final;
248				if (@res) {
249				push @final, {s=>$_, t=>$_} foreach @res;
250				}
251				else {
252				push @final, {s=>$id, t=>$id};
253				}
254				return @final;
255				}
256
257				sub split {
258				my ($self, $id) = @_;
259
260				# hard splits first
261				my @res = $self->hard_split($id);
262
263				# soft splits second
264				my @final;
265				foreach (@res) {
266				push @final, $self->soft_split($_->{s});
267				}
268
269				return @final;
270				}
271
272				sub explain {
273				my ($self) = @_;
274				my $str;
275
276				if ($self->{hard}) {
277				$str .= "\n## hard split\n";
278				foreach (@{$self->{hard}}) {
279				$str .= "Technique: $_->{tech}\n";
280				$str .= "Terms: ".join(',',@{$_->{terms}});
281				$str .= "\n";
282				}
283				}
284
285				if ( $self->{explain_rank}) {
286				$str .= "\n## soft split rank(s):\n";
287				$str .= join("\n", @{$self->{explain_rank}});
288				}
289
290				return $str;
291				}
292
293				sub _explain_rank {
294				my ($self) = @_;
295
296				my $r;
297				foreach (@{$self->{rank}}) {
298				my @parts;
299				foreach (@{$_->{terms}}) {
300				if ($_->{t} eq $_->{s}) {
301				push @parts, $_->{t};
302				}
303				else {
304				push @parts, "$_->{t}(<-$_->{s})";
305				}
306				}
307				$r .= join(',',@parts) . " ---> $_->{expr} = $_->{score}\n";
308				}
309
310				return $r;
311				}
312
313				1;
314
315				__END__