File Coverage

blib/lib/Lingua/EN/Segment.pm
Criterion Covered Total %
statement 54 54 100.0
branch 3 4 75.0
condition 9 15 60.0
subroutine 16 16 100.0
pod 5 5 100.0
total 87 94 92.5


line stmt bran cond sub pod time code
1             package Lingua::EN::Segment;
2              
3 3     3   7136 use strict;
  3         5  
  3         66  
4 3     3   12 use warnings;
  3         4  
  3         74  
5 3     3   11 no warnings 'uninitialized';
  3         4  
  3         134  
6              
7             our $VERSION = '0.003';
8             $VERSION = eval $VERSION;
9              
10 3     3   12 use Carp;
  3         4  
  3         157  
11 3     3   1200 use English qw(-no_match_vars);
  3         2534  
  3         13  
12 3     3   1223 use File::ShareDir;
  3         4868  
  3         107  
13 3     3   12 use List::Util qw(min);
  3         5  
  3         191  
14 3     3   1442 use Memoize;
  3         5699  
  3         2156  
15              
16             =head1 NAME
17              
18             Lingua::EN::Segment - split English-language domain names etc. into words
19              
20             =head1 SYNOPSIS
21              
22             my $segmenter = Lingua::EN::Segment->new;
23             for my $domain (<>) {
24             chomp $domain;
25             my @words = $segmenter->segment($domain);
26             print "$domain: ", join(', ', @words), "\n";
27             }
28              
29             =head1 DESCRIPTION
30              
31             Sometimes you have a string that to a human eye is clearly made up of many
32             words glommed together without spaces or hyphens. This module uses some mild
33             cunning and a large list of known words from Google to try and work out how
34             the string should be split into words.
35              
36             =head2 new
37              
38             Out: $segmenter
39              
40             Returns a Lingua::EN::Segment object.
41              
42             =cut
43              
44             sub new {
               # Constructor. Any key/value pairs passed in become the object's
               # fields as-is; the accessors in this file read dist_dir, unigrams
               # and bigrams from this hash, so callers can pre-seed them here.
45 2     2 1 907 my ($package, %args) = @_;
46              
               # Works as a class method or an instance method: when invoked on an
               # object, the new object is blessed into that object's class.
47 2   33     15 return bless \%args => ref($package) || $package;
48             }
49              
50             =head2 dist_dir
51              
52             Out: $dist_dir
53              
54             Returns the name of the directory where distribution-specific files are
55             installed.
56              
57             =cut
58              
59             sub dist_dir {
60 3     3 1 16 my ($self) = @_;
61              
               # Keep a true dist_dir already stored on the object; otherwise ask
               # File::ShareDir for the 'Lingua-EN-Segment' directory and cache it.
               # There is no explicit return: the value of the ||= assignment is
               # the sub's result.
62 3   66     21 $self->{dist_dir} ||= File::ShareDir::dist_dir('Lingua-EN-Segment');
63             }
64              
65             =head2 segment
66              
67             In: $unsegmented_string
68             Out: @words
69              
70             Supplied with an unsegmented string - e.g. a domain name - returns a list of
71             words that are most statistically likely to be the words that make up this
72             string.
73              
74             =cut
75              
76             sub segment {
77 19     19 1 11463 my ($self, $unsegmented_string) = @_;
78              
               # Nothing to segment for an empty or undefined string: the bare
               # return gives an empty list in list context.
79 19 100       54 return if !length($unsegmented_string);
               # The empty string as "previous word" marks the start of the input.
80 18         316 my $combination = $self->_best_combination($unsegmented_string, '');
               # The winning combination carries its words as an arrayref; hand
               # them back as a flat list.
81 18         157 return @{ $combination->{words} };
  18         139  
82             }
83              
84             # Supplied with an unsegmented string and the previous word (or ''
85             # if this is the beginning of the input string), splits up the unsegmented
86             # string into a word and a remainder, segments the remainder in turn,
87             # and returns the most likely match.
               # The result is a hashref with at least the keys 'words' (arrayref)
               # and 'probability'.
               #
               # The memoize cache key is built from the string and the previous
               # word only; $self ($_[0]) is not part of it, so cached results are
               # shared by every object in the process.
               # NOTE(review): confirm that is intended if objects can be built
               # with different unigram/bigram tables.
88             memoize('_best_combination', NORMALIZER => sub { "$_[1] $_[2]" });
89             sub _best_combination {
90             my ($self, $unsegmented_string, $previous_word) = @_;
91              
92             # Work out all the possible words at the beginning of this string.
93             # (31 characters is the longest word in our corpus that is genuinely
94             # a real word, and not other words glommed together.)
95             # Then run this whole algorithm on the remainder, thus effectively
96             # working on the string from both the front and the back.
97             my @possible_combinations;
98             for my $prefix_length (1..min(length($unsegmented_string), 31)) {
99             my $current_word = substr($unsegmented_string, 0, $prefix_length);
100             my $current_probability
101             = $self->_probability($current_word, $previous_word);
102             my $remainder_word = substr($unsegmented_string, $prefix_length);
               # NOTE(review): $remainder_word is tested for truth, not length, so
               # a remainder of exactly "0" takes the else branch below and that
               # trailing "0" is absent from the candidate's words.
               # length($remainder_word) may have been intended.
103             if ($remainder_word
104             and my $remainder
105             = $self->_best_combination($remainder_word, $current_word))
106             {
107             my $combination = {
108             current => {
109             words => [$current_word],
110             probability => $current_probability,
111              
112             },
113             remainder => $remainder
114             };
               # Flatten: this word followed by the words of the best split of the
               # remainder.
115             $combination->{words} = [map { @{ $combination->{$_}{words} } }
116             qw(current remainder)];
               # Probability of the whole split is the product of the two parts.
117             $combination->{probability} = $combination->{current}{probability}
118             * $combination->{remainder}{probability};
119             push @possible_combinations, $combination;
120             } else {
               # No remainder to segment: the prefix is the only word of this
               # candidate.
121             push @possible_combinations,
122             {
123             probability => $current_probability,
124             words => [$current_word],
125             };
126             }
127             }
               # Sort candidates by descending probability and return the first.
128             return (sort { $b->{probability} <=> $a->{probability} }
129             @possible_combinations)[0];
130             }
131              
132             # Supplied with a word and the previous word, returns the probability of it
133             # matching something legitimate, either from the bigram corpus, or falling back
134             # to the unigram corpus.
135              
               # Memoized on "word previous_word"; $self is not part of the cache
               # key.
136             memoize('_probability', NORMALIZER => sub { "$_[1] $_[2]" });
137             sub _probability {
138             my ($self, $word, $previous_word) = @_;
139            
               # Bigram table keys have the form "previous current", joined by a
               # single space.
140             my $biword = $previous_word . ' ' . $word;
               # Use the bigram only when both the pair and the previous word are
               # present in their tables.
141             if ( exists $self->bigrams->{$biword}
142             && exists $self->unigrams->{$previous_word})
143             {
               # Conditional estimate: likelihood of the pair divided by the
               # likelihood of the previous word.
144             return $self->bigrams->{$biword}
145             / $self->_unigram_probability($previous_word);
146             } else {
               # No usable bigram: score the word on its own.
147             return $self->_unigram_probability($word);
148             }
149             }
150              
151             sub _unigram_probability {
               # Returns the likelihood of a single word on its own.
152 360632     360632   499448 my ($self, $word) = @_;
153              
               # A word with a true (non-zero) entry uses that stored likelihood.
               # Anything else falls back to the __unknown__ closure that
               # _read_file stores in the table, which scores the word from its
               # length alone.
154 360632   66     506141 return $self->unigrams->{$word} || $self->unigrams->{__unknown__}->($word);
155             }
156              
157             =head2 unigrams
158              
159             Out: \%unigrams
160              
161             Returns a hashref of word => likelihood to appear in Google's huge list of
162             words that they got off the Internet. The higher the likelihood, the more
163             likely that this is a genuine regularly-used word, rather than an obscure
164             word or a typo.
165              
166             =cut
167              
168             sub unigrams {
169 663179     663179 1 803718 my ($self) = @_;
170              
               # Lazy load: count_1w.txt is read on first use (unless a table was
               # already supplied on the object) and the resulting hashref is
               # cached on the object.
171 663179   66     1830023 return $self->{unigrams} ||= $self->_read_file('count_1w.txt');
172             }
173              
174             =head2 bigrams
175              
176             Out: \%bigrams
177              
178             As L</unigrams>, but returns a lookup table of "word1 word2" => likelihood
179             for combinations of words.
180              
181             =cut
182              
183             sub bigrams {
184 360959     360959 1 418984 my ($self) = @_;
185              
               # Lazy load: count_2w.txt is read on first use (unless a table was
               # already supplied on the object) and the resulting hashref is
               # cached on the object.
186 360959   66     1199350 return $self->{bigrams} ||= $self->_read_file('count_2w.txt');
187             }
188              
189             sub _read_file {
               # Reads a tab-separated "entry<TAB>count" file from the dist
               # directory and returns a hashref of entry => count / total count,
               # plus an __unknown__ entry holding a fallback scoring closure.
               # Croaks if the file cannot be opened.
190 3     3   7 my ($self, $filename) = @_;
191              
192 3         8 my $full_filename = $self->dist_dir . '/' . $filename;
               # NOTE(review): the message says "unigrams", but this sub is also
               # called for count_2w.txt (see bigrams), so it can mislabel the file.
193 3 50       259 open(my $fh, '<', $full_filename)
194             or croak "Couldn't read unigrams from $full_filename: $OS_ERROR";
195 3         7 my (%count, $total_count);
196 3         43311 while (<$fh>) {
197 953024         1066634 chomp;
               # One or more tabs separate the entry from its count.
198 953024         2285173 my ($word, $count) = split(/\t+/, $_);
199 953024         2349827 $count{$word} = $count;
200 953024         1908656 $total_count += $count;
201             }
               # NOTE(review): map iterates %count in list context, i.e. over keys
               # AND values, so every count value also becomes a key of
               # %likelihood. The execution count on the next row (1906048) is
               # exactly twice the 953024 lines read. "keys %count" was probably
               # intended.
202 3         393962 my %likelihood = map { $_ => $count{$_} / $total_count } %count;
  1906048         5124807  
               # Fallback for entries not in the table: the score shrinks by a
               # factor of 10 for each character of the word. The closure captures
               # $total_count from this call.
203             $likelihood{__unknown__} = sub {
204 302178     302178   412616 my $word = shift;
205 302178         1152435 return 10 / ($total_count * 10**length($word));
206 3         355948 };
207 3         453077 return \%likelihood;
208             }
209              
210              
211             =head1 ACKNOWLEDGEMENTS
212              
213             This code is based on
214             L<chapter 14 of Peter Norvig's book Beautiful Data|http://norvig.com/ngrams/>.
215              
216             =cut
217              
218             1;