| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Lingua::EN::Numericalize - Replaces English descriptions of numbers with numerals |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=cut |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
package Lingua::EN::Numericalize; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
require Exporter; |
|
12
|
|
|
|
|
|
|
our @ISA = qw/Exporter/; |
|
13
|
|
|
|
|
|
|
our @EXPORT = qw/&str2nbr/; |
|
14
|
|
|
|
|
|
|
our $VERSION = substr q$Revision: 1.52 $, 10; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
local $\ = $/; |
|
17
|
|
|
|
|
|
|
our $debug = 0; |
|
18
|
|
|
|
|
|
|
our $UK = 0; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Lingua::EN::Numericalize; |
|
23
|
|
|
|
|
|
|
print str2nbr("one thousand maniacs"); |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$_ = "six hundred three-score and six"; |
|
26
|
|
|
|
|
|
|
str2nbr(); |
|
27
|
|
|
|
|
|
|
print; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$Lingua::EN::Numericalize::UK = 1; |
|
30
|
|
|
|
|
|
|
print str2nbr("one billion"); # 1,000,000,000,000 |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This module replaces English descriptions of numbers in a given string with their numeric counterparts. It supports both ordinal and cardinal numbers, negative numbers, and very large numbers. |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
The module exports a single function into the caller's namespace as follows: |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=over |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item B<str2nbr([string])> |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
This function receives an optional string (using $_ if none is passed) and converts all English text that describes a number into its numeric equivalent. When called in a void context, the function sets $_ to the new value. |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=back |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
The module's behaviour is affected by the following variables: |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=over |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Replace every English description of a number in a string with numerals.
#
# Arguments:   an optional string; when no argument is passed, the current
#              value of $_ is used instead.
# Returns:     the lower-cased string with its number words replaced.
# Side effect: in void context the result is stored in $_; in scalar or
#              list context the caller's $_ is left untouched.
sub str2nbr {
    # Pick up the implicit argument *before* $_ is localised below.
    my $s = lc(@_ ? shift : $_);
    local $_ if defined wantarray;

    # Textual clean-ups. Each replacement in %strrep is a string holding a
    # quoted expression, hence the double evaluation (/ee).
    $s =~ s/$_/$strrep{$_}/eeg for keys %strrep;

    # Tokenise on word boundaries: digit runs and anything containing a
    # non-alphanumeric character pass through verbatim, everything else is
    # handed to word2num(), which reads the token from $_.
    my @ret;
    for (split /\b/, $s) {
        push(@ret, $_), next if /^\d+$/;
        push(@ret, $_), next if /[^a-zA-Z0-9]/;
        push(@ret, word2num());
    }

    # generate number sequences: a number becomes an array reference that
    # swallows the numbers following it; conjunctions standing between a
    # sequence and a further number/conjunction are discarded.
    my $i = 0;
    while ($i < $#ret) {
        $ret[$i] = [ $ret[$i] ] if isnbr($ret[$i]);
        if (ref $ret[$i]) {
            my $next = $ret[$i + 1];
            if (isnbr($next)) {
                push @{ $ret[$i] }, $next;
                splice(@ret, $i + 1, 1);
                next;
            }

            # $nexxt is undefined when $next is the last token; a trailing
            # conjunction must then be kept as it is.
            my $nexxt = $ret[$i + 2];
            if (defined $nexxt
                && isconj($next)
                && (isnbr($nexxt) || isconj($nexxt))) {
                splice(@ret, $i + 1, 1);
                next;
            }
        }
        $i++;
    }

    # calculate sequences: collapse every collected sequence to one value
    for my $item (@ret) {
        $item = seq2int(@$item) if ref $item;
    }

    return $_ = join "", @ret;
}
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item B<$Lingua::EN::Numericalize::UK> |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
This variable may be set to indicate that the UK meaning of C<billion> should be used. By default, this module uses the American meaning of this word :( Please note that all the related larger numbers e.g. trillion, quadrillion, etc. assume the chosen behaviour as well. |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=item B<$Lingua::EN::Numericalize::debug> |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
If set to true, the module prints messages useful for debugging to standard error. |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=back |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=cut |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Tell whether every argument is written as a decimal number: an optional
# sign, digits with an optional fractional part, and an optional exponent.
# Returns 1 when all arguments qualify (also when there are none) and an
# empty return as soon as one of them does not.
sub isnbr {
    for my $candidate (@_) {
        return
            unless $candidate =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/;
    }
    return 1;
}
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Tell whether a token is a separator that may stand between the parts of
# a spelled-out number.
#
# Arguments: the token to test; $_ is used only when no argument is passed.
# Returns:   1 when the whole token consists of one or more of the patterns
#            in @conj (e.g. "and", "-", ", "), an empty return otherwise.
#
# The match is anchored on purpose: a word that merely contains a
# conjunction ("offer", "sandwich") is not a conjunction itself.
sub isconj {
    my $w = @_ ? shift : $_;
    return unless defined $w;

    my $alternatives = join '|', @conj;
    return unless length $alternatives;    # nothing configured, no match

    return 1 if $w =~ /\A(?:$alternatives)+\z/;
    return;
}
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# splits "fortytwo", "onehundred", etc. |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Split a run-together number word into its components.
#
# Arguments: the word to split; falls back to $_ when the argument is false
#            and returns nothing when both are false.
# Returns:   the list of components in reading order. Every known number
#            word found at the end of the string is replaced by its value
#            from %word2num; whatever cannot be split any further stays as
#            the first element. In scalar context the final reverse()
#            yields a reversed string rather than a list.
sub compound {
    my $w = shift || $_ || return;

    # Expand a trailing abbreviation that follows a digit ("5k" -> "5000").
    $w =~ s/(\d)$_$/$1$abb{$_}/ for keys %abb;

    # Try the longest words first and break ties alphabetically, so that the
    # outcome does not depend on hash ordering and a short word cannot
    # pre-empt a longer one ending in it ("eleventyone" versus "one").
    my @w2n = sort { length($b) <=> length($a) || $a cmp $b }
              grep { length } keys %word2num;

    # Peel known words off the end of the string. After every hit the whole
    # word list is tried again from its first entry.
    my @ret;
    STRIP: while (length $w) {
        for my $word (@w2n) {
            next unless $w =~ s/${word}$//;
            push @ret, $word2num{$word};
            next STRIP;
        }
        last STRIP;    # nothing matched: the rest cannot be split
    }

    # Keep the unsplittable rest, including a lone "0".
    push @ret, $w if length $w;

    return reverse @ret;
}
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Convert a single word into the number(s) it spells.
#
# Arguments: the word to convert; falls back to $_ when the argument is
#            false and returns nothing when both are false.
# Returns:   the list of numeric components of the word ("fortytwo" gives
#            40 and 2), or the word exactly as received when any part of it
#            cannot be turned into a number.
sub word2num {
    my $w = shift || $_ || return;
    my $original = $w;

    # Normalise endings. The patterns are applied in sorted order so the
    # result does not vary with hash ordering; for the stock %tokrep this
    # strips the plural before the ordinal "th" and rewrites "tie" last
    # ("twentieths" -> "twentieth" -> "twentie" -> "twenty").
    $w =~ s/$_/$tokrep{$_}/g for sort keys %tokrep;

    my @ret;
    for my $part (compound($w)) {
        # A stem in front of a known suffix ("thir" + "teen") is converted
        # by that suffix's handler.
        for my $sfx (keys %suffix) {
            my ($stem) = $part =~ /(.*)${sfx}$/;
            $stem ||= "";
            next unless $word2num{$stem};
            $part = $suffix{$sfx}->($word2num{$stem});
            last;
        }
        push @ret, $part;
    }

    # All or nothing: a word that is not a number through and through is
    # handed back untouched, so ordinary words do not lose their endings
    # ("maniacs") or get cut into pieces ("from").
    # NOTE(review): a %word2num value that isnbr() rejects has the same
    # effect; presumably this applies to "googolplex" - confirm.
    return $original if grep { !isnbr($_) } @ret;

    return @ret;
}
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Collapse a sequence of number components into a single value.
#
# Arguments: the components in reading order, e.g. (6, 100, 3, 20, 6).
# Returns:   the value they spell, e.g. 666; 0 for an empty list.
#
# The largest component acts as a multiplier for everything before it, and
# whatever follows it is evaluated on its own and added. When $debug is
# true, each call reports its arguments on standard error.
sub seq2int {
    my @seq = @_;
    print STDERR "seq2int(): ", join("-", @seq), "\n" if $debug;

    return 0 unless @seq;

    # Locate the first occurrence of the largest component.
    my ($i, $max) = (0, 0);
    for my $k (0 .. $#seq) {
        ($max, $i) = ($seq[$k], $k) if $max < $seq[$k];
    }

    # Nothing precedes the largest component, so there is nothing to
    # multiply. The remainder is still evaluated recursively because it may
    # hold products of its own: (1000, 2, 100) is 1000 + 2 * 100.
    if ($i == 0) {
        my ($head, @rest) = @seq;
        return $head + seq2int(@rest);
    }

    return $seq[$i] * seq2int(@seq[0 .. $i - 1])
         + seq2int(@seq[$i + 1 .. $#seq]);
}
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Patterns for the separators that may appear between the parts of a
# spelled-out number; isconj() interpolates each one into a regex.
our @conj = (
    'and',
    'of',
    '\s+',
    '-',
    ',',
);
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Magnitude abbreviations that may follow a digit, mapped to the zeroes
# they stand for ("5k" is 5 followed by three zeroes).
# NOTE: $UK is consulted once, when this hash is built at load time; a
# later change of $UK does not alter the "b" entry.
our %abb = (
    k => '0' x 3,
    m => '0' x 6,
    b => '0' x ($UK ? 12 : 9),
);
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Whole-string clean-ups applied before tokenising. Keys are regular
# expressions; values are strings containing a quoted expression, since
# str2nbr() substitutes them with s///ee.
our %strrep = (
    # common misspelling
    'milion'                    => '"million"',
    # thousands separators inside a number can simply be dropped
    '(\d)\s*,\s*(\d)'           => '"$1$2"',
    # colloquialism
    q{baker('?s)?(\s+)?dozen}   => '"baker"',
    # ordinal endings on digits: 1st, 2nd, 3rd, 4th
    '(\d)(st|nd|rd|th)'         => '"$1"',
);
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# Per-word normalisations that word2num() applies before looking a word up:
# pattern => replacement.
our %tokrep = (
    'th$'       => '',      # ordinal ending: four[th]
    '(s?e)?s$'  => '',      # plural endings: dozen[s], six[es]
    'tie'       => 'ty',    # for[tie]th
);
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Suffix handlers used by word2num(): each receives the numeric value of
# the stem in front of the suffix and returns the value of the whole word.
our %suffix = (
    teen    => sub { 10 + shift },      # thir-teen
    ty      => sub { 10 * shift },      # for-ty
    # m-illiard = 10**9, b-illiard = 10**15, ...
    illiard => sub { 10 ** (9 + 6 * (shift() - 1)) },
    # A million is 10**6 on both scales. Each further step multiplies by
    # 10**3 on the short (American) scale and by 10**6 on the long (UK)
    # scale, so a UK billion is 10**12.
    illion  => sub {
        my $k = shift;
        my $step = $UK ? 6 : 3;
        return 10 ** ($step * ($k - 1) + 6);
    },
);
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Latin number stems and the values they stand for; these are merged into
# %word2num further down.
our %latin = (
    un          => 1,
    duo         => 2,
    tre         => 3,
    tr          => 3,
    quattuor    => 4,
    quadr       => 4,
    quin        => 5,
    quint       => 5,
    sex         => 6,
    sext        => 6,
    septen      => 7,
    sept        => 7,
    octo        => 8,
    oct         => 8,
    novem       => 9,
    non         => 9,
    dec         => 10,
    undec       => 11,
    duodec      => 12,
    tredec      => 13,
    quattuordec => 14,
    quindec     => 15,
    hex         => 16,
    vigint      => 20,
    vig         => 20,
    trig        => 30,
    cent        => 100,
);
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# Number words, and the stems they take in front of a suffix, with their
# values.
our %word2num = (
    # irregular ordinals and zero
    naught          => 0,
    first           => 1,
    second          => 2,
    third           => 3,

    # units, each followed by the stems used in compounds
    zero            => 0,
    one             => 1,
    two             => 2,
    three           => 3,
    thir            => 3,
    four            => 4,
    for             => 4,
    five            => 5,
    fif             => 5,
    six             => 6,
    seven           => 7,
    eight           => 8,
    eigh            => 8,
    nine            => 9,
    nin             => 9,
    ten             => 10,
    eleven          => 11,
    twelve          => 12,
    twelf           => 12,
    twen            => 2,
    hundred         => 100,
    thousand        => 1000,

    # stems in front of -illion / -illiard
    m               => 1,       # million/milliard
    b               => 2,       # billion

    # named quantities
    googol          => 10 ** 100,
    googolplex      => 10 ** (10 ** 100),
    score           => 20,
    gros            => 12 * 12, # gross
    dozen           => 12,
    baker           => 13,
    eleventyone     => 111,
    eleventyfirst   => 111,
);

# Fold the Latin stems into the same lookup table.
@word2num{ keys %latin } = values %latin;
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
1; |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
__END__ |