| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
2
|
|
|
2
|
|
46310
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
67
|
|
|
2
|
2
|
|
|
2
|
|
8
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
106
|
|
|
3
|
|
|
|
|
|
|
package Lingua::EN::Tokenizer::Offsets; |
|
4
|
|
|
|
|
|
|
{ |
|
5
|
|
|
|
|
|
|
$Lingua::EN::Tokenizer::Offsets::VERSION = '0.03'; |
|
6
|
|
|
|
|
|
|
} |
|
7
|
2
|
|
|
2
|
|
1668
|
use utf8::all; |
|
|
2
|
|
|
|
|
135433
|
|
|
|
2
|
|
|
|
|
16
|
|
|
8
|
2
|
|
|
2
|
|
45215
|
use Data::Dump qw/dump/; |
|
|
2
|
|
|
|
|
17767
|
|
|
|
2
|
|
|
|
|
181
|
|
|
9
|
2
|
|
|
2
|
|
20
|
use feature qw/say/; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
87
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
10
|
use base 'Exporter'; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
1983
|
|
|
12
|
|
|
|
|
|
|
our @EXPORT_OK = qw/ |
|
13
|
|
|
|
|
|
|
initial_offsets |
|
14
|
|
|
|
|
|
|
token_offsets |
|
15
|
|
|
|
|
|
|
adjust_offsets |
|
16
|
|
|
|
|
|
|
get_tokens |
|
17
|
|
|
|
|
|
|
tokenize |
|
18
|
|
|
|
|
|
|
offsets2tokens |
|
19
|
|
|
|
|
|
|
/; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# ABSTRACT: Finds word (token) boundaries, and returns their offsets. |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sub tokenize { |
|
26
|
0
|
|
|
0
|
1
|
0
|
my ($text) = @_; |
|
27
|
0
|
|
|
|
|
0
|
my $tokens = get_tokens($text); |
|
28
|
0
|
|
|
|
|
0
|
return join ' ',@$tokens; |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub token_offsets { |
|
34
|
2
|
|
|
2
|
0
|
4
|
my ($text) = @_; |
|
35
|
2
|
50
|
|
|
|
11
|
return [] unless defined $text; |
|
36
|
2
|
|
|
|
|
9
|
my $offsets = initial_offsets($text); |
|
37
|
2
|
|
|
|
|
24
|
$offsets = adjust_offsets($text,$offsets); |
|
38
|
2
|
|
|
|
|
16
|
return $offsets; |
|
39
|
|
|
|
|
|
|
} |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub get_tokens { |
|
44
|
2
|
|
|
2
|
1
|
26
|
my ($text) = @_; |
|
45
|
2
|
|
|
|
|
11
|
my $offsets = token_offsets($text); |
|
46
|
2
|
|
|
|
|
13
|
my $tokens = offsets2tokens($text,$offsets); |
|
47
|
2
|
|
|
|
|
61
|
return $tokens; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub adjust_offsets { |
|
54
|
4
|
|
|
4
|
1
|
10
|
my ($text,$offsets) = @_; |
|
55
|
4
|
50
|
|
|
|
11
|
$text = $$text if ref($text); |
|
56
|
4
|
|
|
|
|
8
|
my $size = @$offsets; |
|
57
|
4
|
|
|
|
|
13
|
for(my $i=0; $i<$size; $i++){ |
|
58
|
947
|
|
|
|
|
1088
|
my $start = $offsets->[$i][0]; |
|
59
|
947
|
|
|
|
|
937
|
my $end = $offsets->[$i][1]; |
|
60
|
947
|
|
|
|
|
917
|
my $length = $end - $start; |
|
61
|
947
|
100
|
|
|
|
1450
|
if ($length <= 0){ |
|
62
|
31
|
|
|
|
|
41
|
delete $offsets->[$i]; |
|
63
|
31
|
|
|
|
|
64
|
next; |
|
64
|
|
|
|
|
|
|
} |
|
65
|
916
|
|
|
|
|
1524
|
my $s = substr($text,$start,$length); |
|
66
|
916
|
100
|
|
|
|
1999
|
if ($s =~ /^\s*$/){ |
|
67
|
67
|
|
|
|
|
133
|
delete $offsets->[$i]; |
|
68
|
67
|
|
|
|
|
149
|
next; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
849
|
|
|
|
|
3741
|
$s =~ /^(\s*).*?(\s*)$/s; |
|
71
|
849
|
50
|
|
|
|
1547
|
if(defined($1)){ $start += length($1); } |
|
|
849
|
|
|
|
|
1071
|
|
|
72
|
849
|
50
|
|
|
|
1466
|
if(defined($2)){ $end -= length($2); } |
|
|
849
|
|
|
|
|
1004
|
|
|
73
|
849
|
|
|
|
|
2535
|
$offsets->[$i] = [$start, $end]; |
|
74
|
|
|
|
|
|
|
} |
|
75
|
4
|
|
|
|
|
15
|
my $new_offsets = [ grep { defined } @$offsets ]; |
|
|
945
|
|
|
|
|
1134
|
|
|
76
|
4
|
|
|
|
|
15
|
return $new_offsets; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub initial_offsets { |
|
81
|
2
|
|
|
2
|
1
|
4
|
my ($text) = @_; |
|
82
|
2
|
50
|
|
|
|
8
|
$text = $$text if ref($text); |
|
83
|
2
|
|
|
|
|
4
|
my $end; |
|
84
|
2
|
|
|
|
|
15
|
my $text_end = length($text); |
|
85
|
2
|
|
|
|
|
6
|
my $offsets = [[0,$text_end]]; |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# token patterns |
|
88
|
2
|
|
|
|
|
44
|
my @patterns = ( |
|
89
|
|
|
|
|
|
|
qr{([^\p{IsAlnum}\s\.\'\`\,\-’])}, |
|
90
|
|
|
|
|
|
|
qr{(?
|
|
91
|
|
|
|
|
|
|
qr{(?<=\p{IsN})(,)(?!\d)}, |
|
92
|
|
|
|
|
|
|
qr{(?
|
|
93
|
|
|
|
|
|
|
qr{(?
|
|
94
|
|
|
|
|
|
|
qr{(?
|
|
95
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})(['`’])(?!\p{isAlpha})}, |
|
96
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})()['`’](?=\p{isAlpha})}, |
|
97
|
|
|
|
|
|
|
qr{(?:^|\s)(\S+)(?:$|\s)}, |
|
98
|
|
|
|
|
|
|
qr{(?:^|[^\.])(\.\.+)(?:$|[^\.])}, |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
qr{(?<=\p{isAlpha})['`]()(?=\p{isAlpha})}, |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
); |
|
103
|
|
|
|
|
|
|
|
|
104
|
2
|
|
|
|
|
7
|
for my $pat (@patterns){ |
|
105
|
22
|
|
|
|
|
30
|
my $size = @$offsets; |
|
106
|
22
|
|
|
|
|
54
|
for(my $i=0; $i<$size; $i++){ |
|
107
|
2096
|
|
|
|
|
2895
|
my $start = $offsets->[$i][0]; |
|
108
|
2096
|
|
|
|
|
2454
|
my $length = $offsets->[$i][1]-$start; |
|
109
|
2096
|
|
|
|
|
8141
|
my $s = substr($text,$start,$length); |
|
110
|
|
|
|
|
|
|
|
|
111
|
2096
|
|
|
|
|
2500
|
my $split_points = []; |
|
112
|
|
|
|
|
|
|
|
|
113
|
2096
|
100
|
|
|
|
8785
|
if($s =~ /^$pat(?!$)/g){ |
|
114
|
37
|
|
|
|
|
81
|
my $first = $-[1]; |
|
115
|
37
|
|
|
|
|
110
|
push @$split_points,[$start+$first,$start+$first]; |
|
116
|
37
|
|
|
|
|
70
|
my $second = $+[1]; |
|
117
|
37
|
50
|
|
|
|
132
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
2096
|
|
|
|
|
14910
|
while($s =~ /(?
|
|
120
|
182
|
|
|
|
|
2377
|
my $first = $-[1]; |
|
121
|
182
|
|
|
|
|
478
|
push @$split_points,[$start+$first,$start+$first]; |
|
122
|
182
|
|
|
|
|
1169
|
my $second = $+[1]; |
|
123
|
182
|
100
|
|
|
|
1721
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
2096
|
100
|
|
|
|
9705
|
if($s =~ /(?
|
|
126
|
39
|
|
|
|
|
417
|
my $first = $-[1]; |
|
127
|
39
|
|
|
|
|
103
|
push @$split_points,[$start+$first,$start+$first]; |
|
128
|
39
|
|
|
|
|
92
|
my $second = $+[1]; |
|
129
|
39
|
50
|
|
|
|
171
|
push @$split_points,[$start+$second,$start+$second] if $first != $second; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
|
|
132
|
2096
|
100
|
|
|
|
10441
|
_split_tokens($offsets,$i,[ sort { $a->[0] <=> $b->[0] } @$split_points ]) if @$split_points; |
|
|
619
|
|
|
|
|
987
|
|
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
} |
|
135
|
2
|
|
|
|
|
18
|
return _nonbp($text,$offsets); |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub _split_tokens { |
|
139
|
50
|
|
|
50
|
|
74
|
my ($offsets,$i,$split_points) = @_; |
|
140
|
50
|
|
|
|
|
56
|
my ($end,$start) = @{shift @$split_points}; |
|
|
50
|
|
|
|
|
93
|
|
|
141
|
50
|
|
|
|
|
69
|
my $last = $offsets->[$i][1]; |
|
142
|
50
|
|
|
|
|
69
|
$offsets->[$i][1] = $end; |
|
143
|
50
|
|
|
|
|
129
|
while(my $p = shift @$split_points){ |
|
144
|
464
|
50
|
|
|
|
1273
|
push @$offsets, [$start,$p->[0]] unless $start == $p->[0]; |
|
145
|
464
|
|
|
|
|
1140
|
$start = $p->[1]; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
50
|
|
|
|
|
351
|
push @$offsets, [$start, $last]; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub offsets2tokens { |
|
153
|
2
|
|
|
2
|
1
|
5
|
my ($text, $offsets) = @_; |
|
154
|
2
|
50
|
|
|
|
10
|
$text = $$text if ref($text); |
|
155
|
2
|
|
|
|
|
5
|
my $tokens = []; |
|
156
|
2
|
|
|
|
|
14
|
foreach my $o ( sort {$a->[0] <=> $b->[0]} @$offsets) { |
|
|
439
|
|
|
|
|
483
|
|
|
157
|
431
|
|
|
|
|
405
|
my $start = $o->[0]; |
|
158
|
431
|
|
|
|
|
449
|
my $length = $o->[1]-$o->[0]; |
|
159
|
431
|
|
|
|
|
820
|
push @$tokens, substr($text,$start,$length); |
|
160
|
|
|
|
|
|
|
} |
|
161
|
2
|
|
|
|
|
8
|
return $tokens; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub _load_prefixes { |
|
166
|
2
|
|
|
2
|
|
5
|
my ($prefixref) = @_; |
|
167
|
2
|
|
|
|
|
15
|
$INC{'Lingua/EN/Tokenizer/Offsets.pm'} =~ m{\.pm$}; |
|
168
|
2
|
|
|
|
|
13
|
my $prefixfile = "$`/nonbreaking_prefix.en"; |
|
169
|
|
|
|
|
|
|
|
|
170
|
2
|
50
|
|
|
|
253
|
open my $prefix, '<', $prefixfile or die "Could not open file '$prefixfile'!"; |
|
171
|
2
|
|
|
|
|
290
|
while (<$prefix>) { |
|
172
|
214
|
100
|
100
|
|
|
1103
|
next if /^#/ or /^\s*$/; |
|
173
|
186
|
|
|
|
|
215
|
my $item = $_; |
|
174
|
186
|
|
|
|
|
195
|
chomp($item); |
|
175
|
186
|
100
|
|
|
|
279
|
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { $prefixref->{$1} = 2; } |
|
|
6
|
|
|
|
|
39
|
|
|
176
|
180
|
|
|
|
|
660
|
else { $prefixref->{$item} = 1; } |
|
177
|
|
|
|
|
|
|
} |
|
178
|
2
|
|
|
|
|
76
|
close($prefix); |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub _nonbp { |
|
182
|
2
|
|
|
2
|
|
7
|
my ($text,$offsets) = @_; |
|
183
|
2
|
50
|
|
|
|
12
|
$text = $$text if ref($text); |
|
184
|
2
|
|
|
|
|
6
|
my $nonbpref = {}; |
|
185
|
2
|
|
|
|
|
39
|
_load_prefixes($nonbpref); |
|
186
|
2
|
|
|
|
|
8
|
my $new_offsets = adjust_offsets($text,$offsets); |
|
187
|
2
|
|
|
|
|
16
|
$new_offsets = [ sort { $a->[0] <=> $b->[0] } @$new_offsets ]; |
|
|
969
|
|
|
|
|
1023
|
|
|
188
|
2
|
|
|
|
|
14
|
my $size = @$new_offsets; |
|
189
|
2
|
|
|
|
|
5
|
my $extra = []; |
|
190
|
2
|
|
|
|
|
14
|
for(my $i=0; $i<$size-1; $i++){ |
|
191
|
416
|
|
|
|
|
456
|
my $start = $new_offsets->[$i][0]; |
|
192
|
416
|
|
|
|
|
404
|
my $end = $new_offsets->[$i][1]; |
|
193
|
416
|
|
|
|
|
392
|
my $length = $end-$start; |
|
194
|
416
|
|
|
|
|
631
|
my $s = substr($text,$start,$length); |
|
195
|
416
|
|
|
|
|
370
|
my $j=$i+1; |
|
196
|
416
|
|
|
|
|
743
|
my $t = substr($text,$new_offsets->[$j][0], $new_offsets->[$j][1]-$new_offsets->[$j][0]); |
|
197
|
|
|
|
|
|
|
|
|
198
|
416
|
100
|
|
|
|
1246
|
if($s =~ /^(\S+)\.\s?$/){ |
|
199
|
24
|
|
|
|
|
39
|
my $pre = $1; |
|
200
|
24
|
0
|
100
|
|
|
259
|
unless ( |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
201
|
|
|
|
|
|
|
($pre =~ /\./ and $pre =~ /\p{IsAlpha}/) |
|
202
|
|
|
|
|
|
|
or ($nonbpref->{$pre} and $nonbpref->{$pre}==1) |
|
203
|
|
|
|
|
|
|
or ($t =~ /^[\p{IsLower}]/) |
|
204
|
|
|
|
|
|
|
or ( |
|
205
|
|
|
|
|
|
|
$nonbpref->{$pre} |
|
206
|
|
|
|
|
|
|
and $nonbpref->{$pre}==2 |
|
207
|
|
|
|
|
|
|
and $t =~ /^\d+/) |
|
208
|
|
|
|
|
|
|
){ |
|
209
|
15
|
|
|
|
|
50
|
$s =~ /^(.*[^\s\.])\.\s*?$/; |
|
210
|
15
|
100
|
|
|
|
53
|
next unless defined($+[1]); |
|
211
|
13
|
|
|
|
|
38
|
push @$extra, [$start+$+[1],$end]; |
|
212
|
13
|
|
|
|
|
59
|
$new_offsets->[$i][1] = $start+$+[1]; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
} |
|
216
|
2
|
|
|
|
|
13
|
return [ sort { $a->[0] <=> $b->[0] } (@$new_offsets,@$extra) ]; |
|
|
639
|
|
|
|
|
764
|
|
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
1; |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
__END__ |