line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::RU::OpenCorpora::Tokenizer; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
86983
|
use utf8; |
|
2
|
|
|
|
|
15
|
|
|
2
|
|
|
|
|
13
|
|
4
|
2
|
|
|
2
|
|
60
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
68
|
|
5
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
62
|
|
6
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
114883
|
use Unicode::Normalize; |
|
2
|
|
|
|
|
10247
|
|
|
2
|
|
|
|
|
209
|
|
8
|
2
|
|
|
2
|
|
1881
|
use Lingua::RU::OpenCorpora::Tokenizer::List; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
62
|
|
9
|
2
|
|
|
2
|
|
1487
|
use Lingua::RU::OpenCorpora::Tokenizer::Vectors; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
3504
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = 0.06; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
14
|
1
|
|
|
1
|
1
|
28
|
my $class = shift; |
15
|
|
|
|
|
|
|
|
16
|
1
|
|
|
|
|
5
|
my $self = bless {@_}, $class; |
17
|
1
|
|
|
|
|
6
|
$self->_init; |
18
|
|
|
|
|
|
|
|
19
|
1
|
|
|
|
|
6
|
$self; |
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub tokens { |
23
|
0
|
|
|
0
|
1
|
0
|
my($self, $text, $options) = @_; |
24
|
|
|
|
|
|
|
|
25
|
0
|
0
|
|
|
|
0
|
$options = {} unless defined $options; |
26
|
0
|
|
|
|
|
0
|
$options->{want_tokens} = 1; |
27
|
0
|
0
|
|
|
|
0
|
$options->{threshold} = 1 unless defined $options->{threshold}; |
28
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
0
|
$self->_do_tokenize($text, $options); |
30
|
|
|
|
|
|
|
|
31
|
0
|
|
|
|
|
0
|
$self->{tokens}; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub tokens_bounds { |
35
|
6
|
|
|
6
|
1
|
28269
|
my($self, $text, $options) = @_; |
36
|
|
|
|
|
|
|
|
37
|
6
|
50
|
|
|
|
39
|
$options = {} unless defined $options; |
38
|
6
|
|
|
|
|
22
|
$options->{want_tokens} = 0; |
39
|
|
|
|
|
|
|
|
40
|
6
|
|
|
|
|
28
|
$self->_do_tokenize($text, $options); |
41
|
|
|
|
|
|
|
|
42
|
6
|
|
|
|
|
36
|
$self->{bounds}; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub _do_tokenize { |
46
|
6
|
|
|
6
|
|
19
|
my($self, $text, $options) = @_; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# workaround for The Unicode Bug |
49
|
|
|
|
|
|
|
# see https://metacpan.org/module/perlunicode#The-Unicode-Bug |
50
|
6
|
|
|
|
|
17
|
utf8::upgrade($text); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# normalize Unicode to prevent decomposed characters to be processed separately |
53
|
6
|
|
|
|
|
152
|
$text = NFC($text); |
54
|
|
|
|
|
|
|
|
55
|
6
|
|
|
|
|
128
|
my $chars = $self->{chars} = [split //, $text]; |
56
|
6
|
|
|
|
|
56
|
$self->{bounds} = []; |
57
|
6
|
|
|
|
|
36
|
$self->{tokens} = []; |
58
|
|
|
|
|
|
|
|
59
|
6
|
|
|
|
|
14
|
my $token; |
60
|
6
|
|
|
|
|
13
|
for(my $i = 0; $i <= $#{ $chars }; $i++) { |
|
170
|
|
|
|
|
476
|
|
61
|
164
|
100
|
|
|
|
1003
|
my $ctx = { |
62
|
|
|
|
|
|
|
char => $chars->[$i], |
63
|
|
|
|
|
|
|
prevchar => $i ? $chars->[$i - 1] : '', |
64
|
|
|
|
|
|
|
nextchar => $chars->[$i + 1], |
65
|
|
|
|
|
|
|
nnextchar => $chars->[$i + 2], |
66
|
|
|
|
|
|
|
pos => $i, |
67
|
|
|
|
|
|
|
}; |
68
|
|
|
|
|
|
|
not defined $ctx->{$_} and $ctx->{$_} = '' |
69
|
164
|
|
100
|
|
|
869
|
for qw(nextchar nnextchar); |
70
|
|
|
|
|
|
|
|
71
|
164
|
|
|
|
|
385
|
$self->_get_char_sequences($ctx); |
72
|
164
|
|
|
|
|
327
|
$self->_vectorize($ctx); |
73
|
|
|
|
|
|
|
|
74
|
164
|
|
|
|
|
580
|
my $coeff = $self->{vectors}->in_list($ctx->{vector}); |
75
|
164
|
50
|
|
|
|
392
|
$coeff = 0.5 unless defined $coeff; |
76
|
|
|
|
|
|
|
|
77
|
164
|
50
|
|
|
|
328
|
if($options->{want_tokens}) { |
78
|
0
|
|
|
|
|
0
|
$token .= $chars->[$i]; |
79
|
|
|
|
|
|
|
|
80
|
0
|
0
|
0
|
|
|
0
|
if( |
81
|
0
|
|
|
|
|
0
|
$coeff >= $options->{threshold} |
82
|
|
|
|
|
|
|
or $ctx->{pos} == $#{ $chars } |
83
|
|
|
|
|
|
|
) |
84
|
|
|
|
|
|
|
{ |
85
|
0
|
|
|
|
|
0
|
$token =~ s{^\s+|\s+$}{}g; |
86
|
0
|
0
|
|
|
|
0
|
push @{ $self->{tokens} }, $token if $token; |
|
0
|
|
|
|
|
0
|
|
87
|
0
|
|
|
|
|
0
|
$token = ''; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
else { |
91
|
164
|
100
|
|
|
|
684
|
if($coeff) { |
92
|
29
|
|
|
|
|
33
|
push @{ $self->{bounds} }, [$ctx->{pos}, $coeff]; |
|
29
|
|
|
|
|
193
|
|
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub _get_char_sequences { |
99
|
164
|
|
|
164
|
|
227
|
my($self, $ctx) = @_; |
100
|
|
|
|
|
|
|
|
101
|
164
|
|
|
|
|
253
|
my $seq = my $seq_left = my $seq_right = ''; |
102
|
164
|
|
|
|
|
179
|
my $spacer = ''; |
103
|
|
|
|
|
|
|
|
104
|
164
|
100
|
100
|
|
|
952
|
if( |
105
|
|
|
|
|
|
|
$ctx->{nextchar} =~ m|([-./?=:&"!+()])| |
106
|
|
|
|
|
|
|
or $ctx->{char} =~ m|([-./?=:&"!+()])| |
107
|
|
|
|
|
|
|
) |
108
|
|
|
|
|
|
|
{ |
109
|
6
|
|
|
|
|
15
|
$spacer = $1; |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
164
|
100
|
|
|
|
347
|
if(length $spacer) { |
113
|
|
|
|
|
|
|
# go left |
114
|
6
|
|
|
|
|
25
|
for(my $i = $ctx->{pos}; $i >= 0; $i--) { |
115
|
55
|
|
|
|
|
85
|
my $ch = $self->{chars}[$i]; |
116
|
|
|
|
|
|
|
|
117
|
55
|
|
33
|
|
|
124
|
my $case1 = !!( |
118
|
|
|
|
|
|
|
_is_hyphen($spacer) |
119
|
|
|
|
|
|
|
and ( |
120
|
|
|
|
|
|
|
_is_cyr($ch) |
121
|
|
|
|
|
|
|
or _is_hyphen($ch) |
122
|
|
|
|
|
|
|
or _is_single_quote($ch) |
123
|
|
|
|
|
|
|
) |
124
|
|
|
|
|
|
|
); |
125
|
55
|
|
66
|
|
|
96
|
my $case2 = !!( |
126
|
|
|
|
|
|
|
not _is_hyphen($spacer) |
127
|
|
|
|
|
|
|
and not _is_space($ch) |
128
|
|
|
|
|
|
|
); |
129
|
|
|
|
|
|
|
|
130
|
55
|
100
|
66
|
|
|
181
|
if($case1 or $case2) { |
131
|
49
|
|
|
|
|
84
|
$seq_left = $ch . $seq_left; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
else { |
134
|
6
|
|
|
|
|
13
|
last; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
49
|
100
|
|
|
|
226
|
$seq_left = substr $seq_left, 0, -1 |
138
|
|
|
|
|
|
|
if substr($seq_left, -1) eq $spacer; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# go right |
142
|
6
|
|
|
|
|
16
|
for(my $i = $ctx->{pos} + 1; $i <= $#{ $self->{chars} }; $i++) { |
|
9
|
|
|
|
|
30
|
|
143
|
3
|
|
|
|
|
7
|
my $ch = $self->{chars}[$i]; |
144
|
|
|
|
|
|
|
|
145
|
3
|
|
33
|
|
|
8
|
my $case1 = !!( |
146
|
|
|
|
|
|
|
_is_hyphen($spacer) |
147
|
|
|
|
|
|
|
and ( |
148
|
|
|
|
|
|
|
_is_cyr($ch) |
149
|
|
|
|
|
|
|
or _is_hyphen($ch) |
150
|
|
|
|
|
|
|
or _is_single_quote($ch) |
151
|
|
|
|
|
|
|
) |
152
|
|
|
|
|
|
|
); |
153
|
3
|
|
33
|
|
|
8
|
my $case2 = !!( |
154
|
|
|
|
|
|
|
not _is_hyphen($spacer) |
155
|
|
|
|
|
|
|
and not _is_space($ch) |
156
|
|
|
|
|
|
|
); |
157
|
|
|
|
|
|
|
|
158
|
3
|
50
|
33
|
|
|
15
|
if($case1 or $case2) { |
159
|
3
|
|
|
|
|
6
|
$seq_right .= $ch; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
else { |
162
|
0
|
|
|
|
|
0
|
last; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
3
|
50
|
|
|
|
20
|
$seq_right = substr $seq_right, 0, 1 |
166
|
|
|
|
|
|
|
if substr($seq_right, -1) eq $spacer; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
6
|
|
|
|
|
16
|
$seq = join '', $seq_left, $spacer, $seq_right; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
164
|
|
|
|
|
296
|
$ctx->{spacer} = $spacer; |
173
|
164
|
|
|
|
|
273
|
$ctx->{seq} = $seq; |
174
|
164
|
|
|
|
|
360
|
$ctx->{seq_left} = $seq_left; |
175
|
164
|
|
|
|
|
248
|
$ctx->{seq_right} = $seq_right; |
176
|
|
|
|
|
|
|
|
177
|
164
|
|
|
|
|
289
|
return; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub _vectorize { |
181
|
164
|
|
|
|
|
671
|
my $ckey = join ',', _is_hyphen($_[1]->{spacer}), |
182
|
164
|
|
|
164
|
|
357
|
@{$_[1]}{qw(spacer prevchar char nextchar nnextchar seq_left seq seq_right)}; |
183
|
164
|
|
66
|
|
|
910
|
$_[1]->{vector} = $_[0]->{_vectors_cache}{$ckey} ||= $_[0]->_do_vectorize($_[1]); |
184
|
|
|
|
|
|
|
|
185
|
164
|
|
|
|
|
233
|
return; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub _do_vectorize { |
189
|
153
|
|
|
153
|
|
198
|
my($self, $ctx) = @_; |
190
|
|
|
|
|
|
|
|
191
|
153
|
|
|
|
|
260
|
my $spacer = !!length $ctx->{spacer}; |
192
|
153
|
|
66
|
|
|
302
|
my $spacer_is_hyphen = $spacer && _is_hyphen($ctx->{spacer}); |
193
|
|
|
|
|
|
|
|
194
|
153
|
50
|
66
|
|
|
310
|
my @bits = ( |
|
|
50
|
66
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
195
|
|
|
|
|
|
|
_char_class($ctx->{char}), |
196
|
|
|
|
|
|
|
_char_class($ctx->{nextchar}), |
197
|
|
|
|
|
|
|
_is_digit($ctx->{prevchar}), |
198
|
|
|
|
|
|
|
_is_digit($ctx->{nnextchar}), |
199
|
|
|
|
|
|
|
$spacer_is_hyphen |
200
|
|
|
|
|
|
|
? _is_dict_seq($self->{hyphens}, $ctx->{seq}) |
201
|
|
|
|
|
|
|
: 0, |
202
|
|
|
|
|
|
|
$spacer_is_hyphen |
203
|
|
|
|
|
|
|
? _is_suffix($ctx->{seq_right}) |
204
|
|
|
|
|
|
|
: 0, |
205
|
|
|
|
|
|
|
_is_same_pm($ctx->{char}, $ctx->{nextchar}), |
206
|
|
|
|
|
|
|
($spacer and not $spacer_is_hyphen) |
207
|
|
|
|
|
|
|
? _looks_like_url($ctx->{seq}, $ctx->{seq_right}) |
208
|
|
|
|
|
|
|
: 0, |
209
|
|
|
|
|
|
|
($spacer and not $spacer_is_hyphen) |
210
|
|
|
|
|
|
|
? _is_exception_seq($self->{exceptions}, $ctx->{seq}) |
211
|
|
|
|
|
|
|
: 0, |
212
|
|
|
|
|
|
|
$spacer_is_hyphen |
213
|
|
|
|
|
|
|
? _is_prefix($self->{prefixes}, $ctx->{seq_left}) |
214
|
|
|
|
|
|
|
: 0, |
215
|
|
|
|
|
|
|
(_is_colon($ctx->{spacer}) and !!length $ctx->{seq_right}) |
216
|
|
|
|
|
|
|
? _looks_like_time($ctx->{seq_left}, $ctx->{seq_right}) |
217
|
|
|
|
|
|
|
: 0, |
218
|
|
|
|
|
|
|
); |
219
|
|
|
|
|
|
|
|
220
|
153
|
|
|
|
|
1244
|
oct join '', '0b', @bits; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
sub _init { |
224
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
225
|
|
|
|
|
|
|
|
226
|
1
|
|
|
|
|
3
|
for(qw(exceptions prefixes hyphens)) { |
227
|
3
|
|
|
|
|
44
|
my $list = Lingua::RU::OpenCorpora::Tokenizer::List->new( |
228
|
|
|
|
|
|
|
$_, |
229
|
|
|
|
|
|
|
{ |
230
|
|
|
|
|
|
|
data_dir => $self->{data_dir}, |
231
|
|
|
|
|
|
|
}, |
232
|
|
|
|
|
|
|
); |
233
|
3
|
|
|
|
|
24
|
$self->{$_} = $list; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
1
|
|
|
|
|
21
|
my $vectors = Lingua::RU::OpenCorpora::Tokenizer::Vectors->new({ |
237
|
|
|
|
|
|
|
data_dir => $self->{data_dir}, |
238
|
|
|
|
|
|
|
}); |
239
|
1
|
|
|
|
|
7
|
$self->{vectors} = $vectors; |
240
|
|
|
|
|
|
|
|
241
|
1
|
|
|
|
|
3
|
return; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
21
|
100
|
|
21
|
|
92
|
sub _is_pmark { $_[0] =~ /^[,?!";«»]$/ ? 1 : 0 } |
245
|
|
|
|
|
|
|
|
246
|
2
|
100
|
|
2
|
|
12
|
sub _is_latin { $_[0] =~ /^\p{Latin}$/ ? 1 : 0 } |
|
2
|
|
|
16
|
|
3
|
|
|
2
|
|
|
|
|
42
|
|
|
16
|
|
|
|
|
115
|
|
247
|
|
|
|
|
|
|
|
248
|
306
|
100
|
|
306
|
|
1574
|
sub _is_cyr { $_[0] =~ /^\p{Cyrillic}$/ ? 1 : 0 } |
249
|
|
|
|
|
|
|
|
250
|
322
|
50
|
|
322
|
|
1212
|
sub _is_digit { $_[0] =~ /^[0-9]$/ ? 1 : 0 } |
251
|
|
|
|
|
|
|
|
252
|
8
|
50
|
|
8
|
|
37
|
sub _is_bracket1 { $_[0] =~ /^[\[({<]$/ ? 1 : 0 } |
253
|
|
|
|
|
|
|
|
254
|
8
|
50
|
|
8
|
|
39
|
sub _is_bracket2 { $_[0] =~ /^[\])}>]$/ ? 1 : 0 } |
255
|
|
|
|
|
|
|
|
256
|
0
|
0
|
|
0
|
|
0
|
sub _is_suffix { $_[0] =~ /^(?:то|таки|с|ка|де)$/ ? 1 : 0 } |
257
|
|
|
|
|
|
|
|
258
|
119
|
100
|
|
119
|
|
492
|
sub _is_space { $_[0] =~ /^\s$/ ? 1 : 0 } |
259
|
|
|
|
|
|
|
|
260
|
302
|
50
|
|
302
|
|
790
|
sub _is_hyphen { $_[0] eq '-' ? 1 : 0 } |
261
|
|
|
|
|
|
|
|
262
|
25
|
100
|
|
25
|
|
112
|
sub _is_dot { $_[0] eq '.' ? 1 : 0 } |
263
|
|
|
|
|
|
|
|
264
|
8
|
50
|
|
8
|
|
41
|
sub _is_single_quote { $_[0] eq "'" ? 1 : 0 } |
265
|
|
|
|
|
|
|
|
266
|
8
|
50
|
|
8
|
|
29
|
sub _is_slash { $_[0] eq '/' ? 1 : 0 } |
267
|
|
|
|
|
|
|
|
268
|
161
|
100
|
|
161
|
|
750
|
sub _is_colon { $_[0] eq ':' ? 1 : 0 } |
269
|
|
|
|
|
|
|
|
270
|
153
|
100
|
|
153
|
|
966
|
sub _is_same_pm { $_[0] eq $_[1] ? 1 : 0 } |
271
|
|
|
|
|
|
|
|
272
|
0
|
0
|
|
0
|
|
0
|
sub _is_prefix { $_[0]->in_list($_[1]) ? 1 : 0 } |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub _is_dict_seq { |
275
|
0
|
0
|
0
|
0
|
|
0
|
return 0 if not $_[1] or substr $_[1], 0, 1 eq '-'; |
276
|
|
|
|
|
|
|
|
277
|
0
|
0
|
|
|
|
0
|
$_[0]->in_list($_[1]) ? 1 : 0; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub _is_exception_seq { |
281
|
6
|
|
|
6
|
|
12
|
my $seq = $_[1]; # need a copy |
282
|
|
|
|
|
|
|
|
283
|
6
|
50
|
|
|
|
36
|
return 1 if $_[0]->in_list($seq); |
284
|
|
|
|
|
|
|
|
285
|
6
|
50
|
|
|
|
5427
|
return 0 unless $seq =~ /^\W|\W$/; |
286
|
|
|
|
|
|
|
|
287
|
6
|
|
|
|
|
19
|
$seq =~ s/^\W+//; |
288
|
6
|
50
|
|
|
|
24
|
return 1 if $_[0]->in_list($seq); |
289
|
|
|
|
|
|
|
|
290
|
6
|
|
|
|
|
38
|
while($seq =~ s/\W$//) { |
291
|
9
|
50
|
|
|
|
30
|
return 1 if $_[0]->in_list($seq); |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
6
|
|
|
|
|
21
|
0; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub _looks_like_url { |
298
|
6
|
100
|
|
6
|
|
34
|
return 0 unless !!length $_[1]; |
299
|
3
|
50
|
|
|
|
12
|
return 0 if length $_[0] < 5; |
300
|
3
|
50
|
|
|
|
12
|
return 0 if substr $_[0], 0, 1 eq '.'; |
301
|
|
|
|
|
|
|
|
302
|
3
|
50
|
33
|
|
|
62
|
$_[0] =~ m{^\W*https?://?} |
|
|
|
33
|
|
|
|
|
303
|
|
|
|
|
|
|
or $_[0] =~ m{^\W*www\.} |
304
|
|
|
|
|
|
|
or $_[0] =~ m<.\.(?:[a-z]{2,3}|ру|рф)\W*$>i |
305
|
|
|
|
|
|
|
or return 0; |
306
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
0
|
1; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub _looks_like_time { |
311
|
1
|
|
|
1
|
|
2
|
my($seq_left, $seq_right) = @_; # need copies |
312
|
|
|
|
|
|
|
|
313
|
1
|
|
|
|
|
7
|
$seq_left =~ s/^[^0-9]{1,2}//; |
314
|
1
|
|
|
|
|
7
|
$seq_right =~ s/[^0-9]+$//; |
315
|
|
|
|
|
|
|
|
316
|
1
|
50
|
33
|
|
|
13
|
return 0 if $seq_left !~ /^[0-9]{1,2}$/ |
317
|
|
|
|
|
|
|
or $seq_right !~ /^[0-9]{2}$/; |
318
|
|
|
|
|
|
|
|
319
|
0
|
0
|
0
|
|
|
0
|
($seq_left < 24 and $seq_right < 60) |
320
|
|
|
|
|
|
|
? 1 |
321
|
|
|
|
|
|
|
: 0; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
sub _char_class { |
325
|
306
|
100
|
|
306
|
|
525
|
_is_cyr($_[0]) ? '0001' : |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
326
|
|
|
|
|
|
|
_is_space($_[0]) ? '0010' : |
327
|
|
|
|
|
|
|
_is_dot($_[0]) ? '0011' : |
328
|
|
|
|
|
|
|
_is_pmark($_[0]) ? '0100' : |
329
|
|
|
|
|
|
|
_is_hyphen($_[0]) ? '0101' : |
330
|
|
|
|
|
|
|
_is_digit($_[0]) ? '0110' : |
331
|
|
|
|
|
|
|
_is_latin($_[0]) ? '0111' : |
332
|
|
|
|
|
|
|
_is_bracket1($_[0]) ? '1000' : |
333
|
|
|
|
|
|
|
_is_bracket2($_[0]) ? '1001' : |
334
|
|
|
|
|
|
|
_is_single_quote($_[0]) ? '1010' : |
335
|
|
|
|
|
|
|
_is_slash($_[0]) ? '1011' : |
336
|
|
|
|
|
|
|
_is_colon($_[0]) ? '1100' : '0000'; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
1; |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
__END__ |