File Coverage

blib/lib/PDF/Builder/Content/Hyphenate_basic.pm
Criterion Covered Total %
statement 9 135 6.6
branch 0 78 0.0
condition 0 57 0.0
subroutine 3 5 60.0
pod 0 1 0.0
total 12 276 4.3


line stmt bran cond sub pod time code
1             package PDF::Builder::Content::Hyphenate_basic;
2              
3 1     1   11573 use base 'PDF::Builder::Content::Text';
  1         3  
  1         105  
4              
5 1     1   7 use strict;
  1         2  
  1         22  
6 1     1   5 use warnings;
  1         2  
  1         1454  
7              
8             our $VERSION = '3.025'; # VERSION
9             our $LAST_UPDATE = '3.025'; # manually update whenever code is changed
10              
11             =head1 NAME
12              
13             PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability
14              
15             =head1 SYNOPSIS
16              
17             These are internal routines that are somewhat experimental, and may (or may
18             not) be extended in the future. They are called from various Content routines
19             that take long strings of text and split them into fixed-length lines.
20              
21             Words are split to fill the line most completely, without regard to widows and
22             orphans, long runs of hyphens at the right edge, "rivers" of space flowing
23             through a paragraph, and other problems. Also, only simple splitting is done
24             (not actually I), on a simple, language-independent basis. No dictionary
25             or rules-based splitting is currently done.
26              
27             This functionality may well be replaced by "hooks" to call language-specific
28             word-splitting rules, as well as worrying about the appearance of the results
29             (such as Knuth-Plass).
30              
31             =cut
32              
33             # Main entry. Returns array of left portion of word (and -) to stick on end of
34             # sentence (may be empty) and remaining (right) portion of word to go on next
35             # line (usually not empty).
36             sub splitWord {
37 0     0 0   my ($self, $word, $width, %opts) = @_;
38             # copy dashed option names to preferred undashed names
39 0 0 0       if (defined $opts{'-spHH'} && !defined $opts{'spHH'}) { $opts{'spHH'} = delete($opts{'-spHH'}); }
  0            
40 0 0 0       if (defined $opts{'-spOP'} && !defined $opts{'spOP'}) { $opts{'spOP'} = delete($opts{'-spOP'}); }
  0            
41 0 0 0       if (defined $opts{'-spDR'} && !defined $opts{'spDR'}) { $opts{'spDR'} = delete($opts{'-spDR'}); }
  0            
42 0 0 0       if (defined $opts{'-spLR'} && !defined $opts{'spLR'}) { $opts{'spLR'} = delete($opts{'-spLR'}); }
  0            
43 0 0 0       if (defined $opts{'-spCC'} && !defined $opts{'spCC'}) { $opts{'spCC'} = delete($opts{'-spCC'}); }
  0            
44              
45 0           my ($leftWord, $rightWord, @splitLoc, @chars, $i, $j, $len);
46              
47             # various settings, some of which may be language-specific
48 0           my $minBegin = 2; # minimum 2 characters before split (English rules)
49 0 0         if (defined $opts{'min_prefix'}) { $minBegin = $opts{'min_prefix'}; }
  0            
50 0           my $minEnd = 3; # minimum 3 characters to next line (English rules)
51 0 0         if (defined $opts{'min_suffix'}) { $minEnd = $opts{'min_suffix'}; }
  0            
52 0           my $hyphen = '-';
53             #my $hyphen = "\xAD"; # add a hyphen at split, unless splitting at -
54             # or other dash character
55             # NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\AD) when splitting
56             # a word at the end of the line, so that when text is extracted for
57             # a screen reader, etc., the closed-up word can have the "visible"
58             # hyphen removed. PDF readers should render as -.
59 0           my @suppressHyphen = ( # ASCII/Latin-1/UTF-8 ordinals to NOT add - after
60             # - en-dash em-dash /
61             45, 8211, 8212, 47,
62             );
63 0 0         my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1; # 1=OK to split on hard (explicit) hyphen U+002D
64 0 0         my $otherPunc = defined($opts{'spOP'})? $opts{'spOP'}: 1; # 1=OK to split after most punctuation
65 0 0         my $digitRun = defined($opts{'spDR'})? $opts{'spDR'}: 1; # 1=OK to split after run of digit(s)
66 0 0         my $letterRun = defined($opts{'spLR'})? $opts{'spLR'}: 1; # 1=OK to split after run of ASCII letter(s)
67 0 0         my $camelCase = defined($opts{'spCC'})? $opts{'spCC'}: 1; # 1=OK to split camelCase on ASCII lc-to-UC transition
68 0 0         my $splitReqBlnk = defined($opts{'spRB'})? $opts{'spRB'}: 0; # 1=OK to split on required blank (NBSP) -- desperation move
69 0 0         my $splitAnywhere = defined($opts{'spFS'})? $opts{'spFS'}: 0; # 1=OK to split to fit available space -- super desperation move
70 0 0         if ($splitAnywhere) {
71             # if requesting to split within a certain length, suppress all other flags
72 0           $splitHardH = $otherPunc = $digitRun = $letterRun = $camelCase =
73             $splitReqBlnk = 0;
74             }
75              
76             # note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
77             # hyphen". The first is probably rare enough to not be worth the bother,
78             # and the second won't be split at anyway.
79              
80 0           $leftWord = ''; # default return values
81 0           $rightWord = $word;
82              
83 0           @splitLoc = (); # no known OK splits yet
84              
85             # highest priority for splits: hard and soft hyphens
86             # remove SHYs, remember any break points
87 0           ($word, @splitLoc) = _removeSHY($word);
88             # remember any break points due to hard coded hyphens
89 0           @chars = split //, $word;
90 0           for ($i=0; $i
91 0 0 0       if ($chars[$i] eq '-' && $splitHardH) { push @splitLoc, $i; }
  0            
92             # note that unlike SHY, - is not removed
93             }
94              
95             # If nothing in @splitLoc, proceed to find other splits. If @splitLoc
96             # has at least one entry, could make it the top priority and split there,
97             # and not look at other possible splits. Or, keep adding to @splitLoc
98             # (equal priority for all possible splits). Mix and match is OK
99             # (grouping criteria, as hard and soft hyphens were done together).
100              
101             #if (!@splitLoc) {
102 0 0         if ($otherPunc) {
103             # look for other punctuation to split after.
104             # don't split on ' or " or other quotes (<, <<, etc.)
105             # !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
106             # en-dash, em-dash should ideally be split after, whether they are
107             # free floating or embedded between words.
108 0           my @ASCII_punct = ( '!', '.', '?', ',', '%', '&', ':', ';',
109             '<', '>', ')', ']', chr(125), '_', '~',
110             '^', '+', '*', '/', );
111             # en-dash em-dash
112 0           my @UTF8_punct = ( 8211, 8212, );
113             # remember not to split if next char is -
114             # (defer split to after hard hyphen - [if allowed]).
115 0           for ($i=0; $i
116 0           foreach (@ASCII_punct) {
117 0 0 0       if ($chars[$i] eq $_ && $chars[$i+1] ne '-') {
118 0           push @splitLoc, $i;
119 0           last;
120             }
121             }
122 0           foreach (@UTF8_punct) {
123 0 0 0       if (ord($chars[$i]) == $_ && $chars[$i+1] ne '-') {
124 0           push @splitLoc, $i;
125 0           last;
126             }
127             }
128             }
129             }
130             #}
131              
132             # group digit runs and camelCase together at same priority
133             #if (!@splitLoc) {
134 0 0         if ($digitRun) {
135             # look for a run of digits to split after.
136             # that is, any digit NOT followed by another digit.
137             # remember not to split if next char is -
138             # (defer split to after hard hyphen - [if allowed]).
139 0           for ($i=0; $i
140 0 0 0       if ($chars[$i] ge '0' && $chars[$i] le '9' &&
      0        
      0        
141             !($chars[$i+1] ge '0' && $chars[$i+1] le '9' ||
142             $chars[$i+1] eq '-')) {
143 0           push @splitLoc, $i;
144             }
145             }
146             }
147              
148 0 0         if ($letterRun) {
149             # look for a run of letters (ASCII) to split after.
150             # that is, any letter NOT followed by another letter.
151             # remember not to split if next char is -
152             # (defer split to after hard hyphen - [if allowed]).
153 0           for ($i=0; $i
154 0 0 0       if (($chars[$i] ge 'a' && $chars[$i] le 'z' ||
      0        
      0        
155             $chars[$i] ge 'A' && $chars[$i] le 'Z' ) &&
156             !($chars[$i+1] ge 'a' && $chars[$i+1] le 'z' ||
157             $chars[$i+1] ge 'A' && $chars[$i+1] le 'Z' ||
158             $chars[$i+1] eq '-') ) {
159 0           push @splitLoc, $i;
160             }
161             }
162             }
163              
164 0 0         if ($camelCase) {
165             # look for camelCase to split on lowercase to
166             # uppercase transitions. just ASCII letters for now.
167             # Note that this will split names like McIlroy -> Mc-Ilroy
168             # and MacDonald -> Mac-Donald.
169 0           for ($i=0; $i
170 0 0 0       if ($chars[$i] ge 'a' && $chars[$i] le 'z' &&
      0        
      0        
171             $chars[$i+1] ge 'A' && $chars[$i+1] le 'Z') {
172 0           push @splitLoc, $i;
173             }
174             }
175             }
176             #}
177              
178             #if (!@splitLoc) {
179             # look for real English word split locations
180             # TBD
181             #}
182              
183 0 0 0       if (!@splitLoc && $splitReqBlnk) {
184             # remember any break points due to desperation split at NBSP
185 0           @chars = split //, $word;
186 0           for ($i=0; $i
187 0 0         if ($chars[$i] eq "\xA0") { push @splitLoc, $i; }
  0            
188             # note that NBSP converted to regular space (x20). we will need
189             # to overwrite the split one with the hyphen
190             }
191             }
192            
193 0 0 0       if (!@splitLoc && $splitAnywhere) {
194             # remember any break point due to desperation split at available length
195 0           @chars = split //, $word;
196 0           my $trial = '';
197 0           for ($i=0; $i
198 0           $trial .= $chars[$i];
199 0 0         if ($self->advancewidth("$trial$hyphen") > $width) { last; }
  0            
200             }
201             # nothing fit? force one letter, even though it overflows
202 0 0         if ($i == 0) { $i = 1; }
  0            
203 0           push @splitLoc, $i-1;
204             # disable minimum prefix and suffix for this
205 0           $minBegin = $minEnd = 1;
206             }
207              
208             # sort final @splitLoc, remove any split points violating "min" settings
209             # set $leftWord and $rightWord if find successful split
210 0 0         if (@splitLoc) {
211 0           @splitLoc = sort { $a <=> $b } @splitLoc;
  0            
212             # unnecessary to have unique values
213 0           $len = length($word);
214 0           $j = -1;
215 0           for ($i=0; $i
216 0 0         if ($splitLoc[$i] >= $minBegin-1) { last; }
  0            
217 0           $j = $i;
218             }
219 0 0         if ($j >= 0) { splice(@splitLoc, 0, $j+1); } # remove j+1 els
  0            
220 0           $j = -1;
221 0           for ($i=$#splitLoc; $i>=0; $i--) {
222 0 0         if ($splitLoc[$i] < $len-$minEnd) { last; }
  0            
223 0           $j = $i;
224             }
225 0 0         if ($j >= 0) { splice(@splitLoc, $j); } # remove els >= j-th
  0            
226              
227             # scan R to L through @splitLoc to try splitting there
228             # TBD estimate starting position in @splitLoc by dividing $width by
229             # 1em to get approximate split location; pick highest @splitLoc
230             # element that does not exceed it, and move right (probably) or left
231             # to get proper split point.
232 0           while (@splitLoc) {
233 0           $j = pop @splitLoc; # proposed split rightmost on list
234 0           my $trial = substr($word, 0, $j+1);
235             # this is the left fragment at the end of the line. make sure
236             # there is room for the space before it, the hyphen (if added),
237             # and any letter doubling (e.g., in German or Dutch)
238              
239             # does the left fragment already end in -, etc.?
240             # if it does, don't add a $hyphen.
241 0           my $h = $hyphen;
242 0           $i = ord(substr($trial, -1, 1)); # last character in left fragment
243 0           foreach (@suppressHyphen) {
244 0 0         if ($i == $_) { $h = ''; last; }
  0            
  0            
245             }
246             # left fragment ends in a space (used to be an NBSP)?
247             # remove space, and no hyphen
248 0 0         if ($i eq ' ') {
249 0           chop($trial);
250 0           $h = '';
251             }
252              
253             # $width should already count the trailing space in the existing
254             # line, or full width if empty
255 0           $len = $self->advancewidth("$trial$h", %opts);
256 0 0         if ($len > $width) { next; }
  0            
257              
258             # any letter doubling needed?
259 0           $leftWord = $trial.$h;
260 0           $rightWord = substr($word, $j+1);
261 0           last;
262             }
263             # if fell through because no fragment was short enough, $leftWord and
264             # $rightWord were never reassigned, and effect is to leave the entire
265             # word for the next line.
266             }
267             # if 0 elements in @splitLoc, $leftWord and $rightWord already defaulted
268              
269 0           return ($leftWord, $rightWord);
270             }
271              
272             # remove soft hyphens (SHYs) from a word. assume is always #173 (good for
273             # Latin-1, CP-1252, UTF-8; might not work for some encodings) TBD might want
274             # to pass in current encoding, or what SHY value is.
275             # return list of break points where SHYs were removed
276             sub _removeSHY {
277 0     0     my ($word) = @_;
278              
279 0           my @SHYs = ();
280 0           my $i = 0;
281              
282 0           my @chars = split //, $word;
283 0           my $out = '';
284 0           foreach (@chars) {
285 0 0         if (ord($_) == 173) {
286             # it's a SHY, so remove from word, add to list
287 0           push @SHYs, ($i - 1);
288 0           next;
289             }
290 0           $out .= $_;
291 0           $i++;
292             }
293 0           return ($out, @SHYs);
294             }
295              
296             1;