| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package PDF::Builder::Content::Hyphenate_basic; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
9282
|
use base 'PDF::Builder::Content::Text'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
90
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
19
|
|
|
6
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
942
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '3.024'; # VERSION |
|
9
|
|
|
|
|
|
|
our $LAST_UPDATE = '3.024'; # manually update whenever code is changed |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
These are internal routines that are somewhat experimental, and may (or may |
|
18
|
|
|
|
|
|
|
not) be extended in the future. They are called from various Content routines |
|
19
|
|
|
|
|
|
|
that take long strings of text and split them into fixed-length lines. |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Words are split to fill the line most completely, without regard to widows and |
|
22
|
|
|
|
|
|
|
orphans, long runs of hyphens at the right edge, "rivers" of space flowing |
|
23
|
|
|
|
|
|
|
through a paragraph, and other problems. Also, only simple splitting is done |
|
24
|
|
|
|
|
|
|
(not actually I<words>), on a simple, language-independent basis. No dictionary |
|
25
|
|
|
|
|
|
|
or rules-based splitting is currently done. |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This functionality may well be replaced by "hooks" to call language-specific |
|
28
|
|
|
|
|
|
|
word-splitting rules, as well as worrying about the appearance of the results |
|
29
|
|
|
|
|
|
|
(such as Knuth-Plass). |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# splitWord($self, $word, $width, %opts)
#
# Main entry. Tries to split $word so that its left portion fits within
# $width, as measured by $self->advancewidth().
#
# Options (each defaults to 1 = enabled; a dashed name such as '-spHH' is
# accepted as an alias when the undashed name is not given):
#   spHH  OK to split on hard (explicit) hyphen U+002D
#   spOP  OK to split after most punctuation
#   spDR  OK to split after run of digit(s)
#   spLR  OK to split after run of ASCII letter(s)
#   spCC  OK to split camelCase on ASCII lc-to-UC transition
#
# Returns a two-element list: the left portion of the word (with a hyphen
# appended, unless it already ends in a dash or slash) to stick on the end of
# the current line, and the remaining (right) portion to go on the next line.
# If no acceptable split is found, the left portion is '' and the right
# portion is the word exactly as passed in.
sub splitWord {
    my ($self, $word, $width, %opts) = @_;

    # copy dashed option names to preferred undashed names
    foreach my $name (qw(spHH spOP spDR spLR spCC)) {
        if (defined $opts{"-$name"} && !defined $opts{$name}) {
            $opts{$name} = delete($opts{"-$name"});
        }
    }

    # default return values: nothing on this line, whole word on the next
    my $leftWord  = '';
    my $rightWord = $word;

    # nothing to split (also avoids undef warnings further down)
    return ($leftWord, $rightWord) unless defined $word && length $word;

    # various settings, some of which may be language-specific
    my $minBegin = 2;  # minimum 2 characters before split
    my $minEnd   = 2;  # minimum 2 characters to next line
    #my $hyphen = '-';
    my $hyphen = "\xAD";  # add a hyphen at split, unless splitting at -
                          # or other dash character
    # NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\xAD) when
    #       splitting a word at the end of the line, so that when text is
    #       extracted for a screen reader, etc., the closed-up word can have
    #       the "visible" hyphen removed. PDF readers should render as -.

    # ASCII/Latin-1/UTF-8 ordinals to NOT add a hyphen after
    #                                   -   en-dash em-dash  /
    my %suppressHyphen = map { $_ => 1 } (45, 8211,   8212,  47);

    my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1;
    my $otherPunc  = defined($opts{'spOP'})? $opts{'spOP'}: 1;
    my $digitRun   = defined($opts{'spDR'})? $opts{'spDR'}: 1;
    my $letterRun  = defined($opts{'spLR'})? $opts{'spLR'}: 1;
    my $camelCase  = defined($opts{'spCC'})? $opts{'spCC'}: 1;

    # note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
    # hyphen". The first is probably rare enough to not be worth the bother,
    # and the second won't be split at anyway.

    # Each entry in @splitLoc is the index (within the SHY-free word) of the
    # last character that would stay on the current line.
    my @splitLoc;

    # highest priority for splits: hard and soft hyphens
    # remove SHYs, remember any break points
    ($word, @splitLoc) = _removeSHY($word);

    my @chars   = split //, $word;
    my $lastIdx = $#chars;

    # remember any break points due to hard coded hyphens
    # (note that unlike SHY, - is not removed)
    if ($splitHardH) {
        push @splitLoc, grep { $chars[$_] eq '-' } 0 .. $lastIdx;
    }

    # All candidate kinds are currently given equal priority: every enabled
    # rule adds to @splitLoc, and the rightmost candidate that fits wins.
    #
    # The loops below look at the following character, so they stop one short
    # of the end. A split after the final character would be discarded by the
    # $minEnd check anyway.

    if ($otherPunc) {
        # look for other punctuation to split after.
        # don't split on ' or " or other quotes (<, <<, etc.)
        # !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
        # en-dash, em-dash should ideally be split after, whether they are
        # free floating or embedded between words.
        my %ASCII_punct = map { $_ => 1 }
            ( '!', '.', '?', ',', '%', '&', ':', ';',
              '<', '>', ')', ']', chr(125), '_', '~',
              '^', '+', '*', '/', );
        #                                     en-dash em-dash
        my %UTF8_punct  = map { $_ => 1 } ( 8211,   8212, );

        foreach my $i (0 .. $lastIdx-1) {
            # don't split if next char is -
            # (defer split to after hard hyphen - [if allowed]).
            next if $chars[$i+1] eq '-';
            if ($ASCII_punct{$chars[$i]} || $UTF8_punct{ord($chars[$i])}) {
                push @splitLoc, $i;
            }
        }
    }

    # group digit runs, letter runs, and camelCase together at same priority
    if ($digitRun) {
        # split after a run of digits: any digit NOT followed by another
        # digit, and not followed by - (defer to the hard hyphen).
        foreach my $i (0 .. $lastIdx-1) {
            my ($cur, $next) = @chars[$i, $i+1];
            next if $next eq '-';
            if ($cur =~ m/\A[0-9]\z/ && $next !~ m/\A[0-9]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    if ($letterRun) {
        # split after a run of ASCII letters: any letter NOT followed by
        # another letter, and not followed by - (defer to the hard hyphen).
        foreach my $i (0 .. $lastIdx-1) {
            my ($cur, $next) = @chars[$i, $i+1];
            next if $next eq '-';
            if ($cur =~ m/\A[A-Za-z]\z/ && $next !~ m/\A[A-Za-z]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    if ($camelCase) {
        # look for camelCase to split on lowercase to
        # uppercase transitions. just ASCII letters for now.
        # Note that this will split names like McIlroy -> Mc-Ilroy
        # and MacDonald -> Mac-Donald.
        foreach my $i (0 .. $lastIdx-1) {
            if ($chars[$i]   =~ m/\A[a-z]\z/ &&
                $chars[$i+1] =~ m/\A[A-Z]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    # look for real English word split locations
    # TBD

    # sort final @splitLoc, dropping any split points violating the "min"
    # settings: the left part has index+1 characters and the right part has
    # length-index-1 characters. It is unnecessary to have unique values.
    my $len = length($word);
    @splitLoc = sort { $a <=> $b }
                grep { $_ >= $minBegin-1 && $_ < $len-$minEnd } @splitLoc;

    # scan R to L through @splitLoc to try splitting there
    # TBD estimate starting position in @splitLoc by dividing $width by
    #     1em to get approximate split location; pick highest @splitLoc
    #     element that does not exceed it, and move right (probably) or left
    #     to get proper split point.
    while (@splitLoc) {
        my $j = pop @splitLoc;   # proposed split rightmost on list
        # this is the left fragment at the end of the line. make sure
        # there is room for it and the hyphen (if added).
        my $trial = substr($word, 0, $j+1);

        # does the left fragment already end in -, etc.?
        # if it does, don't add a $hyphen.
        my $h = $suppressHyphen{ ord(substr($trial, -1, 1)) } ? '' : $hyphen;

        # $width should already count the trailing space in the existing
        # line, or full width if empty
        next if $self->advancewidth("$trial$h", %opts) > $width;

        # any letter doubling needed? (e.g., in German) TBD
        $leftWord  = $trial.$h;
        $rightWord = substr($word, $j+1);
        last;
    }
    # if fell through because no fragment was short enough (or there were no
    # candidate splits at all), $leftWord and $rightWord still hold their
    # defaults, and the effect is to leave the entire word for the next line.

    return ($leftWord, $rightWord);
}
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# _removeSHY($word)
#
# Remove soft hyphens (SHYs) from a word. A SHY is assumed to always be
# character #173 (good for Latin-1, CP-1252, UTF-8; might not work for some
# encodings). TBD might want to pass in current encoding, or what SHY value is.
#
# Returns a list: the word with all SHYs removed, followed by zero or more
# break points. Each break point is the index, within the returned word, of
# the character immediately preceding a removed SHY. A SHY at the very start
# of the word has no preceding character and yields no break point.
# Consecutive SHYs yield repeated break points.
sub _removeSHY {
    my ($word) = @_;

    # nothing to scan; avoid "uninitialized" warnings on undef input
    return ('') unless defined $word;

    my @SHYs = ();
    my $out  = '';

    foreach my $char (split //, $word) {
        if (ord($char) == 173) {
            # it's a SHY, so leave it out of the word and remember the index
            # of the last character kept so far (if there is one)
            push @SHYs, length($out) - 1 if length($out);
            next;
        }
        $out .= $char;
    }

    return ($out, @SHYs);
}
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
1; |