| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package MKDoc::Text::Structured::Inline; |
|
2
|
21
|
|
|
21
|
|
57281
|
use URI::Find; |
|
|
21
|
|
|
|
|
311477
|
|
|
|
21
|
|
|
|
|
1622
|
|
|
3
|
21
|
|
|
21
|
|
235
|
use warnings; |
|
|
21
|
|
|
|
|
46
|
|
|
|
21
|
|
|
|
|
641
|
|
|
4
|
21
|
|
|
21
|
|
105
|
use strict; |
|
|
21
|
|
|
|
|
44
|
|
|
|
21
|
|
|
|
|
36553
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $Text = ''; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $LongestWord = 78; |
|
9
|
|
|
|
|
|
|
our $NoFollow = 0; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
MKDoc::Text::Structured::Inline - convert text to HTML without handling block-level tags |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $text = some_structured_text(); |
|
18
|
|
|
|
|
|
|
my $this = MKDoc::Text::Structured::Inline::process ($text); |
|
19
|
|
|
|
|
|
|
my $that = MKDoc::Text::Structured::Inline::process_entities_only ($text); |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SUMMARY |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
L is used by L to |
|
24
|
|
|
|
|
|
|
generate inline HTML elements such as hyperlinks, emphasis and entities. |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
This module is also useful directly when the full block-level rendering of |
|
27
|
|
|
|
|
|
|
L is unwanted. |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 USAGE |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head2 Processing text and adding HTML tags |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
For example, when processing text that is going to end up in an header, |
|
34
|
|
|
|
|
|
|
you wouldn't want any block level tags generated: |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
$header = "< My (c) symbol should be *bold* > -- and http://example.com/ 'linked'"; |
|
37
|
|
|
|
|
|
|
$header = MKDoc::Text::Structured::Inline::process ($title); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$header is now: |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
< My © symbol should be bold > — and http://example.com/ ‘linked’ |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub process |
|
46
|
|
|
|
|
|
|
{ |
|
47
|
93
|
|
|
93
|
0
|
160
|
local $Text; |
|
48
|
93
|
|
|
|
|
169
|
$Text = shift; |
|
49
|
93
|
|
|
|
|
247
|
$Text = " $Text "; |
|
50
|
93
|
|
|
|
|
206
|
$Text =~ s/\n/ /gsm; |
|
51
|
|
|
|
|
|
|
|
|
52
|
93
|
|
|
|
|
223
|
_make_entities(); |
|
53
|
|
|
|
|
|
|
|
|
54
|
93
|
|
|
|
|
167
|
$Text =~ s/>/ >/g; |
|
55
|
|
|
|
|
|
|
# automagically finds hyperlinks |
|
56
|
|
|
|
|
|
|
my $finder = URI::Find->new ( |
|
57
|
|
|
|
|
|
|
sub { |
|
58
|
10
|
|
|
10
|
|
62553
|
my ($uri, $orig_uri) = @_; |
|
59
|
10
|
|
|
|
|
34
|
$orig_uri =~ s/^mailto://; |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# http://googleblog.blogspot.com/2005/01/preventing-comment-spam.html |
|
62
|
10
|
100
|
|
|
|
52
|
if ($NoFollow) |
|
63
|
|
|
|
|
|
|
{ |
|
64
|
3
|
|
|
|
|
14
|
return qq|$orig_uri|; |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
else |
|
67
|
|
|
|
|
|
|
{ |
|
68
|
7
|
|
|
|
|
31
|
return qq|$orig_uri|; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
93
|
|
|
|
|
1254
|
); |
|
72
|
93
|
|
|
|
|
1439
|
$finder->find (\$Text); |
|
73
|
93
|
|
|
|
|
33405
|
$Text =~ s/ >/>/g; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# abbreviations |
|
76
|
93
|
|
|
|
|
379
|
while ($Text =~ s/([[:upper:]][[:upper:]]+)\s+(\(.*?\))/_make_abbr_implicit ($1, $2)/e) {}; # implicit |
|
|
2
|
|
|
|
|
10
|
|
|
77
|
93
|
|
|
|
|
492
|
while ($Text =~ s/([[:upper:]][[:upper:]]+)(\(.*?\))/_make_abbr_explicit ($1, $2)/e) {}; # explicit |
|
|
9
|
|
|
|
|
37
|
|
|
78
|
93
|
|
|
|
|
245
|
_make_simplequotes(); |
|
79
|
93
|
|
|
|
|
286
|
_make_doublequotes(); |
|
80
|
93
|
|
|
|
|
272
|
_make_strong(); |
|
81
|
93
|
|
|
|
|
257
|
_make_em(); |
|
82
|
93
|
|
|
|
|
263
|
_make_smilies(); |
|
83
|
93
|
|
|
|
|
319
|
_break_long_words(); |
|
84
|
|
|
|
|
|
|
|
|
85
|
93
|
|
|
|
|
386
|
$Text =~ s/^ //; |
|
86
|
93
|
|
|
|
|
405
|
$Text =~ s/ $//; |
|
87
|
93
|
|
|
|
|
1138
|
return $Text; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=pod |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head2 Processing text without adding tags |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Another example, if you were processing text that will end up in an HTML |
|
95
|
|
|
|
|
|
|
tag, this tag should never contain any other tags, so you should use |
|
96
|
|
|
|
|
|
|
the MKDoc::Text::Structured::Inline::process_entities_only() method: |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$title = "< My (c) symbol shouldn't be *bold* > -- or http://example.com/ 'linked'"; |
|
99
|
|
|
|
|
|
|
$title = MKDoc::Text::Structured::Inline::process_entities_only ($title); |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
$title is now: |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
< My © symbol shouldn't be *bold* — > or http://example.com/ ‘linked’ |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub process_entities_only |
|
108
|
|
|
|
|
|
|
{ |
|
109
|
2
|
|
|
2
|
0
|
955
|
local $Text; |
|
110
|
2
|
|
|
|
|
2
|
$Text = shift; |
|
111
|
2
|
|
|
|
|
6
|
$Text = " $Text "; |
|
112
|
2
|
|
|
|
|
5
|
$Text =~ s/\n/ /gsm; |
|
113
|
|
|
|
|
|
|
|
|
114
|
2
|
|
|
|
|
5
|
_make_entities(); |
|
115
|
2
|
|
|
|
|
6
|
_make_simplequotes(); |
|
116
|
2
|
|
|
|
|
6
|
_make_doublequotes(); |
|
117
|
2
|
|
|
|
|
5
|
_break_long_words(); |
|
118
|
|
|
|
|
|
|
|
|
119
|
2
|
|
|
|
|
6
|
$Text =~ s/^ //; |
|
120
|
2
|
|
|
|
|
5
|
$Text =~ s/ $//; |
|
121
|
2
|
|
|
|
|
6
|
return $Text; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub _make_entities |
|
126
|
|
|
|
|
|
|
{ |
|
127
|
95
|
|
|
95
|
|
340
|
$Text =~ s/&/&/g; |
|
128
|
95
|
|
|
|
|
166
|
$Text =~ s/</g; |
|
129
|
95
|
|
|
|
|
166
|
$Text =~ s/>/>/g; |
|
130
|
95
|
|
|
|
|
165
|
$Text =~ s/"/"/g; |
|
131
|
|
|
|
|
|
|
|
|
132
|
95
|
|
|
|
|
183
|
$Text =~ s/(?<=(?:\s|\n))--(?=(?:\s|\n))/\—/g; # -- becomes em-dash |
|
133
|
95
|
|
|
|
|
184
|
$Text =~ s/(?<=(?:\s|\n))-(?=(?:\s|\n))/\–/g; # - becomes en-dash |
|
134
|
95
|
|
|
|
|
229
|
$Text =~ s/(?
|
|
135
|
|
|
|
|
|
|
|
|
136
|
21
|
|
|
21
|
|
40006
|
$Text =~ s/\(tm\)(?=(?:\s|\n|\p{IsPunct}))/\™/gi; # (tm) becomes trademark |
|
|
21
|
|
|
|
|
241
|
|
|
|
21
|
|
|
|
|
321
|
|
|
|
95
|
|
|
|
|
223
|
|
|
137
|
95
|
|
|
|
|
351
|
$Text =~ s/\(r\)(?=(?:\s|\n|\p{IsPunct}))/\®/gi; # (r) becomes registered |
|
138
|
95
|
|
|
|
|
210
|
$Text =~ s/\(c\)(?=(?:\s|\n|\p{IsPunct}))/\©/gi; # (c) becomes copyright |
|
139
|
95
|
|
|
|
|
210
|
$Text =~ s/(?<=(?:\s|\n))(\d+)\s*x\s*(\d+)(?=(?:\s|\n|\p{isPunct}))/$1\×$2/g; # x becomes dimension |
|
140
|
|
|
|
|
|
|
} |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub _make_abbr_implicit |
|
144
|
|
|
|
|
|
|
{ |
|
145
|
2
|
|
|
2
|
|
6
|
my $abbr = shift; |
|
146
|
2
|
|
|
|
|
5
|
my $brack = shift; |
|
147
|
2
|
|
|
|
|
4
|
my $title = $brack; |
|
148
|
2
|
|
|
|
|
11
|
$title =~ s/^\s*\(\s*//; |
|
149
|
2
|
|
|
|
|
12
|
$title =~ s/\s*\)\s*$//; |
|
150
|
2
|
|
|
|
|
31
|
return qq|$abbr ($title)|; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub _make_abbr_explicit |
|
155
|
|
|
|
|
|
|
{ |
|
156
|
9
|
|
|
9
|
|
24
|
my $abbr = shift; |
|
157
|
9
|
|
|
|
|
19
|
my $brack = shift; |
|
158
|
9
|
|
|
|
|
18
|
my $title = $brack; |
|
159
|
9
|
|
|
|
|
47
|
$title =~ s/^\s*\(\s*//; |
|
160
|
9
|
|
|
|
|
57
|
$title =~ s/\s*\)\s*$//; |
|
161
|
9
|
|
|
|
|
263
|
return qq|$abbr|; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub _make_simplequotes |
|
166
|
|
|
|
|
|
|
{ |
|
167
|
179
|
|
|
|
|
411
|
$Text = join '', map { |
|
168
|
95
|
|
|
95
|
|
272
|
my $stuff = $_; |
|
169
|
179
|
|
|
|
|
355
|
$stuff = " $stuff "; |
|
170
|
179
|
|
|
|
|
647
|
while ($stuff =~ s/ |
|
171
|
|
|
|
|
|
|
(?<=(?:\s|\n)) # must start with space or carriage return |
|
172
|
|
|
|
|
|
|
\' # simple quote |
|
173
|
|
|
|
|
|
|
([^ \t\n\']|[^ \t\n\'].*?[^ \t\n\']) # stuff to capture and smart-quotize |
|
174
|
|
|
|
|
|
|
\' # simple quote |
|
175
|
|
|
|
|
|
|
(?=(?:<|\s|\n|\p{IsPunct}(?:\s|\n|<))) # must be followed by space, \n or (punctuation + space or \n) |
|
176
|
6
|
|
|
|
|
22
|
/_make_simplequotes_wrap ($1)/xes) {} |
|
177
|
|
|
|
|
|
|
|
|
178
|
179
|
|
|
|
|
584
|
$stuff =~ s/^ //; |
|
179
|
179
|
|
|
|
|
774
|
$stuff =~ s/ $//; |
|
180
|
179
|
|
|
|
|
792
|
$stuff; |
|
181
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
sub _make_simplequotes_wrap |
|
186
|
|
|
|
|
|
|
{ |
|
187
|
6
|
|
|
6
|
|
15
|
my $stuff = shift; |
|
188
|
6
|
|
|
|
|
14
|
local $Text = $stuff; |
|
189
|
6
|
|
|
|
|
53
|
return "‘$Text’"; |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub _make_doublequotes |
|
195
|
|
|
|
|
|
|
{ |
|
196
|
179
|
|
|
|
|
256
|
$Text = join '', map { |
|
197
|
95
|
|
|
95
|
|
213
|
my $stuff = $_; |
|
198
|
179
|
|
|
|
|
342
|
$stuff = " $stuff "; |
|
199
|
179
|
|
|
|
|
462
|
$stuff =~ s/"//g; |
|
200
|
179
|
|
|
|
|
534
|
$stuff =~ s/"/"/g; |
|
201
|
179
|
|
|
|
|
534
|
while ($stuff =~ s/ |
|
202
|
|
|
|
|
|
|
(?<=(?:\s|\n)) # must start with space or carriage return |
|
203
|
|
|
|
|
|
|
\" # double quote |
|
204
|
|
|
|
|
|
|
([^ \t\n\"]|[^ \t\n\"].*?[^ \t\n\"]) # stuff to capture and smart-quotize |
|
205
|
|
|
|
|
|
|
\" # double quote |
|
206
|
|
|
|
|
|
|
(?=(?:<|\s|\n|\p{IsPunct}(?:\s|\n|<))) # must be followed by space, \n or (punctuation + space or \n) |
|
207
|
6
|
|
|
|
|
20
|
/_make_doublequotes_wrap ($1)/xes) {} |
|
208
|
|
|
|
|
|
|
|
|
209
|
179
|
|
|
|
|
554
|
$stuff =~ s/^ //; |
|
210
|
179
|
|
|
|
|
585
|
$stuff =~ s/ $//; |
|
211
|
179
|
|
|
|
|
346
|
$stuff =~ s/"/"/g; |
|
212
|
179
|
|
|
|
|
295
|
$stuff =~ s//"/g; |
|
213
|
179
|
|
|
|
|
571
|
$stuff; |
|
214
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub _make_doublequotes_wrap |
|
219
|
|
|
|
|
|
|
{ |
|
220
|
6
|
|
|
6
|
|
60
|
my $stuff = shift; |
|
221
|
6
|
|
|
|
|
13
|
local $Text = $stuff; |
|
222
|
6
|
|
|
|
|
54
|
return "“$Text”"; |
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub _make_strong |
|
227
|
|
|
|
|
|
|
{ |
|
228
|
181
|
|
|
|
|
251
|
$Text = join '', map { |
|
229
|
97
|
|
|
97
|
|
218
|
my $stuff = $_; |
|
230
|
181
|
|
|
|
|
575
|
$stuff = " $stuff "; |
|
231
|
181
|
|
|
|
|
508
|
while ($stuff =~ s/ |
|
232
|
|
|
|
|
|
|
(?<=(?:\s|\n)) # must start with space or carriage return |
|
233
|
|
|
|
|
|
|
\* # star |
|
234
|
|
|
|
|
|
|
(\S|\S.*?\S) # stuff to capture and emphasize |
|
235
|
|
|
|
|
|
|
\* # star |
|
236
|
|
|
|
|
|
|
(?=(?:<|\s|\n|\p{IsPunct}(?:\s|\n|<))) # must be followed by space, \n or (punctuation + space or \n) |
|
237
|
5
|
|
|
|
|
16
|
/_make_strong_wrap ($1)/xes) {} |
|
238
|
|
|
|
|
|
|
|
|
239
|
181
|
|
|
|
|
14605
|
$stuff =~ s/^ //; |
|
240
|
181
|
|
|
|
|
543
|
$stuff =~ s/ $//; |
|
241
|
181
|
|
|
|
|
582
|
$stuff; |
|
242
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
sub _make_strong_wrap |
|
247
|
|
|
|
|
|
|
{ |
|
248
|
5
|
|
|
5
|
|
12
|
my $stuff = shift; |
|
249
|
5
|
|
|
|
|
12
|
local $Text = $stuff; |
|
250
|
5
|
|
|
|
|
26
|
_make_em ($Text); |
|
251
|
5
|
|
|
|
|
44
|
return "$Text"; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
sub _make_em |
|
256
|
|
|
|
|
|
|
{ |
|
257
|
202
|
|
|
|
|
290
|
$Text = join '', map { |
|
258
|
98
|
|
|
98
|
|
258
|
my $stuff = $_; |
|
259
|
202
|
|
|
|
|
369
|
$stuff = " $stuff "; |
|
260
|
202
|
|
|
|
|
591
|
while ($stuff =~ s/ |
|
261
|
|
|
|
|
|
|
(?<=(?:\s|\n)) # must start with space or carriage return |
|
262
|
|
|
|
|
|
|
_ # underscore |
|
263
|
|
|
|
|
|
|
(\S|\S.*?\S) # stuff to capture and emphasize |
|
264
|
|
|
|
|
|
|
_ # underscore |
|
265
|
|
|
|
|
|
|
(?=(?:<|\s|\n|\p{IsPunct}(?:\s|\n))) # must be followed by space, \n or (punctuation + space or \n) |
|
266
|
4
|
|
|
|
|
12
|
/_make_em_wrap ($1)/xes) {} |
|
267
|
|
|
|
|
|
|
|
|
268
|
202
|
|
|
|
|
575
|
$stuff =~ s/^ //; |
|
269
|
202
|
|
|
|
|
568
|
$stuff =~ s/ $//; |
|
270
|
202
|
|
|
|
|
633
|
$stuff; |
|
271
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub _make_em_wrap |
|
276
|
|
|
|
|
|
|
{ |
|
277
|
4
|
|
|
4
|
|
8
|
my $stuff = shift; |
|
278
|
|
|
|
|
|
|
|
|
279
|
4
|
|
|
|
|
7
|
local $Text = $stuff; |
|
280
|
4
|
|
|
|
|
17
|
_make_strong ($Text); |
|
281
|
4
|
|
|
|
|
26
|
return "$Text"; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub _make_smilies |
|
286
|
|
|
|
|
|
|
{ |
|
287
|
213
|
|
|
|
|
287
|
$Text = join '', map { |
|
288
|
93
|
|
|
93
|
|
208
|
my $stuff = $_; |
|
289
|
213
|
100
|
|
|
|
631
|
$stuff =~ s/:-\)/:-)<\/span>/g unless ($stuff =~ /^); |
|
290
|
213
|
100
|
|
|
|
588
|
$stuff =~ s/:-\(/:-(<\/span>/g unless ($stuff =~ /^); |
|
291
|
|
|
|
|
|
|
# don't do ;-) think about what happens with &-) |
|
292
|
213
|
|
|
|
|
488
|
$stuff; |
|
293
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub _break_long_words |
|
298
|
|
|
|
|
|
|
{ |
|
299
|
241
|
|
|
|
|
332
|
$Text = join '', map { |
|
300
|
97
|
|
|
97
|
|
3678
|
my $stuff = $_; |
|
301
|
241
|
100
|
|
|
|
4147
|
$stuff = _insert_spaces ($stuff, $LongestWord) unless ($stuff =~ /^); |
|
302
|
241
|
|
|
|
|
631
|
$stuff; |
|
303
|
|
|
|
|
|
|
} _tokenize ($Text); |
|
304
|
|
|
|
|
|
|
} |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub _insert_spaces |
|
308
|
|
|
|
|
|
|
{ |
|
309
|
172
|
|
|
172
|
|
10365
|
my $text = shift; |
|
310
|
172
|
|
100
|
|
|
1270
|
my $length = shift || return $text; |
|
311
|
|
|
|
|
|
|
# we can break continuous non-space text after "/", ";" or "-" |
|
312
|
169
|
|
|
|
|
1435
|
$text =~ s/(\S{$length}[\/;-])(?=\S)/$1 /g; |
|
313
|
|
|
|
|
|
|
# we can break continuous non-space text so long as it doesn't contain an ampersand |
|
314
|
169
|
|
|
|
|
983
|
$text =~ s/([^[:space:]&]{$length})(?=\S)/$1 /g; |
|
315
|
169
|
|
|
|
|
475
|
return $text; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
sub _tokenize |
|
320
|
|
|
|
|
|
|
{ |
|
321
|
575
|
|
|
575
|
|
822
|
my $text = shift; |
|
322
|
575
|
|
|
|
|
3544
|
my @res = $text =~ /([^<]+)|(<.+?>)/g; |
|
323
|
575
|
|
|
|
|
1130
|
return grep { defined $_ } @res; |
|
|
2390
|
|
|
|
|
7510
|
|
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
1; |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
__END__ |