File Coverage

blib/lib/PDF/Builder/Content/Hyphenate_basic.pm

Criterion	Covered	Total	%
statement	9	108	8.3
branch	0	56	0.0
condition	0	51	0.0
subroutine	3	5	60.0
pod	0	1	0.0
total	12	221	5.4

line	stmt	bran	cond	sub	pod	time	code
1							package PDF::Builder::Content::Hyphenate_basic;
2
3	1			1		9282	use base 'PDF::Builder::Content::Text';
	1					2
	1					90
4
5	1			1		6	use strict;
	1					2
	1					19
6	1			1		4	use warnings;
	1					2
	1					942
7
8							our $VERSION = '3.024'; # VERSION
9							our $LAST_UPDATE = '3.024'; # manually update whenever code is changed
10
11							=head1 NAME
12
13							PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability
14
15							=head1 SYNOPSIS
16
17							These are internal routines that are somewhat experimental, and may (or may
18							not) be extended in the future. They are called from various Content routines
19							that take long strings of text and split them into fixed-length lines.
20
21							Words are split to fill the line most completely, without regard to widows and
22							orphans, long runs of hyphens at the right edge, "rivers" of space flowing
23							through a paragraph, and other problems. Also, only simple splitting is done
24							(not actually I<words>), on a simple, language-independent basis. No dictionary
25							or rules-based splitting is currently done.
26
27							This functionality may well be replaced by "hooks" to call language-specific
28							word-splitting rules, as well as worrying about the appearance of the results
29							(such as Knuth-Plass).
30
31							=cut
32
33							# Main entry. Returns array of left portion of word (and -) to stick on end of
34							# sentence (may be empty) and remaining (right) portion of word to go on next
35							# line (usually not empty).
36							sub splitWord {
37	0			0	0		my ($self, $word, $width, %opts) = @_;
38							# copy dashed option names to preferred undashed names
39	0	0	0				if (defined $opts{'-spHH'} && !defined $opts{'spHH'}) { $opts{'spHH'} = delete($opts{'-spHH'}); }
	0
40	0	0	0				if (defined $opts{'-spOP'} && !defined $opts{'spOP'}) { $opts{'spOP'} = delete($opts{'-spOP'}); }
	0
41	0	0	0				if (defined $opts{'-spDR'} && !defined $opts{'spDR'}) { $opts{'spDR'} = delete($opts{'-spDR'}); }
	0
42	0	0	0				if (defined $opts{'-spLR'} && !defined $opts{'spLR'}) { $opts{'spLR'} = delete($opts{'-spLR'}); }
	0
43	0	0	0				if (defined $opts{'-spCC'} && !defined $opts{'spCC'}) { $opts{'spCC'} = delete($opts{'-spCC'}); }
	0
44
45	0						my ($leftWord, $rightWord, @splitLoc, @chars, $i, $j, $len);
46
47							# various settings, some of which may be language-specific
48	0						my $minBegin = 2; # minimum 2 characters before split
49	0						my $minEnd = 2; # minimum 2 characters to next line
50							#my $hyphen = '-';
51	0						my $hyphen = "\xAD"; # add a hyphen at split, unless splitting at -
52							# or other dash character
53							# NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\AD) when splitting
54							# a word at the end of the line, so that when text is extracted for
55							# a screen reader, etc., the closed-up word can have the "visible"
56							# hyphen removed. PDF readers should render as -.
57	0						my @suppressHyphen = ( # ASCII/Latin-1/UTF-8 ordinals to NOT add - after
58							# - en-dash em-dash /
59							45, 8211, 8212, 47,
60							);
61	0	0					my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1; # 1=OK to split on hard (explicit) hyphen U+002D
62	0	0					my $otherPunc = defined($opts{'spOP'})? $opts{'spOP'}: 1; # 1=OK to split after most punctuation
63	0	0					my $digitRun = defined($opts{'spDR'})? $opts{'spDR'}: 1; # 1=OK to split after run of digit(s)
64	0	0					my $letterRun = defined($opts{'spLR'})? $opts{'spLR'}: 1; # 1=OK to split after run of ASCII letter(s)
65	0	0					my $camelCase = defined($opts{'spCC'})? $opts{'spCC'}: 1; # 1=OK to split camelCase on ASCII lc-to-UC transition
66
67							# note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
68							# hyphen". The first is probably rare enough to not be worth the bother,
69							# and the second won't be split at anyway.
70
71	0						$leftWord = ''; # default return values
72	0						$rightWord = $word;
73
74	0						@splitLoc = (); # no known OK splits yet
75
76							# highest priority for splits: hard and soft hyphens
77							# remove SHYs, remember any break points
78	0						($word, @splitLoc) = _removeSHY($word);
79							# remember any break points due to hard coded hyphens
80	0						@chars = split //, $word;
81	0						for ($i=0; $i<scalar(@chars); $i++) {
82	0	0	0				if ($chars[$i] eq '-' && $splitHardH) { push @splitLoc, $i; }
	0
83							# note that unlike SHY, - is not removed
84							}
85
86							# If nothing in @splitLoc, proceed to find other splits. If @splitLoc
87							# has at least one entry, could make it the top priority and split there,
88							# and not look at other possible splits. Or, keep adding to @splitLoc
89							# (equal priority for all possible splits). Mix and match is OK
90							# (grouping criteria, as hard and soft hyphens were done together).
91
92							#if (!@splitLoc) {
93	0	0					if ($otherPunc) {
94							# look for other punctuation to split after.
95							# don't split on ' or " or other quotes (<, <<, etc.)
96							# !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
97							# en-dash, em-dash should ideally be split after, whether they are
98							# free floating or embedded between words.
99	0						my @ASCII_punct = ( '!', '.', '?', ',', '%', '&', ':', ';',
100							'<', '>', ')', ']', chr(125), '_', '~',
101							'^', '+', '*', '/', );
102							# en-dash em-dash
103	0						my @UTF8_punct = ( 8211, 8212, );
104							# remember not to split if next char is -
105							# (defer split to after hard hyphen - [if allowed]).
106	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
107	0						foreach (@ASCII_punct) {
108	0	0	0				if ($chars[$i] eq $_ && $chars[$i+1] ne '-') {
109	0						push @splitLoc, $i;
110	0						last;
111							}
112							}
113	0						foreach (@UTF8_punct) {
114	0	0	0				if (ord($chars[$i]) == $_ && $chars[$i+1] ne '-') {
115	0						push @splitLoc, $i;
116	0						last;
117							}
118							}
119							}
120							}
121							#}
122
123							# group digit runs and camelCase together at same priority
124							#if (!@splitLoc) {
125	0	0					if ($digitRun) {
126							# look for a run of digits to split after.
127							# that is, any digit NOT followed by another digit.
128							# remember not to split if next char is -
129							# (defer split to after hard hyphen - [if allowed]).
130	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
131	0	0	0				if ($chars[$i] ge '0' && $chars[$i] le '9' &&
			0
			0
132							!($chars[$i+1] ge '0' && $chars[$i+1] le '9' ||
133							$chars[$i+1] eq '-')) {
134	0						push @splitLoc, $i;
135							}
136							}
137							}
138
139	0	0					if ($letterRun) {
140							# look for a run of letters (ASCII) to split after.
141							# that is, any letter NOT followed by another letter.
142							# remember not to split if next char is -
143							# (defer split to after hard hyphen - [if allowed]).
144	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
145	0	0	0				if (($chars[$i] ge 'a' && $chars[$i] le 'z' ||
			0
			0
146							$chars[$i] ge 'A' && $chars[$i] le 'Z' ) &&
147							!($chars[$i+1] ge 'a' && $chars[$i+1] le 'z' ||
148							$chars[$i+1] ge 'A' && $chars[$i+1] le 'Z' ||
149							$chars[$i+1] eq '-') ) {
150	0						push @splitLoc, $i;
151							}
152							}
153							}
154
155	0	0					if ($camelCase) {
156							# look for camelCase to split on lowercase to
157							# uppercase transitions. just ASCII letters for now.
158							# Note that this will split names like McIlroy -> Mc-Ilroy
159							# and MacDonald -> Mac-Donald.
160	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
161	0	0	0				if ($chars[$i] ge 'a' && $chars[$i] le 'z' &&
			0
			0
162							$chars[$i+1] ge 'A' && $chars[$i+1] le 'Z') {
163	0						push @splitLoc, $i;
164							}
165							}
166							}
167							#}
168
169							#if (!@splitLoc) {
170							# look for real English word split locations
171							# TBD
172							#}
173
174							# sort final @splitLoc, remove any split points violating "min" settings
175							# set $leftWord and $rightWord if find successful split
176	0	0					if (@splitLoc) {
177	0						@splitLoc = sort { $a <=> $b } @splitLoc;
	0
178							# unnecessary to have unique values
179	0						$len = length($word);
180	0						$j = -1;
181	0						for ($i=0; $i<scalar(@splitLoc); $i++) {
182	0	0					if ($splitLoc[$i] >= $minBegin-1) { last; }
	0
183	0						$j = $i;
184							}
185	0	0					if ($j >= 0) { splice(@splitLoc, 0, $j+1); } # remove j+1 els
	0
186	0						$j = -1;
187	0						for ($i=$#splitLoc; $i>=0; $i--) {
188	0	0					if ($splitLoc[$i] < $len-$minEnd) { last; }
	0
189	0						$j = $i;
190							}
191	0	0					if ($j >= 0) { splice(@splitLoc, $j); } # remove els >= j-th
	0
192
193							# scan R to L through @splitLoc to try splitting there
194							# TBD estimate starting position in @splitLoc by dividing $width by
195							# 1em to get approximate split location; pick highest @splitLoc
196							# element that does not exceed it, and move right (probably) or left
197							# to get proper split point.
198	0						while (@splitLoc) {
199	0						$j = pop @splitLoc; # proposed split rightmost on list
200	0						my $trial = substr($word, 0, $j+1);
201							# this is the left fragment at the end of the line. make sure
202							# there is room for the space before it, the hyphen (if added),
203							# and any letter doubling (e.g., in German)
204
205							# does the left fragment already end in -, etc.?
206							# if it does, don't add a $hyphen.
207	0						my $h = $hyphen;
208	0						$i = ord(substr($trial, -1, 1)); # last character in left fragment
209	0						foreach (@suppressHyphen) {
210	0	0					if ($i == $_) { $h = ''; last; }
	0
	0
211							}
212							# $width should already count the trailing space in the existing
213							# line, or full width if empty
214	0						$len = $self->advancewidth("$trial$h", %opts);
215	0	0					if ($len > $width) { next; }
	0
216
217							# any letter doubling needed?
218	0						$leftWord = $trial.$h;
219	0						$rightWord = substr($word, $j+1);
220	0						last;
221							}
222							# if fell through because no fragment was short enough, $leftWord and
223							# $rightWord were never reassigned, and effect is to leave the entire
224							# word for the next line.
225							}
226							# if 0 elements in @splitLoc, $leftWord and $rightWord already defaulted
227
228	0						return ($leftWord, $rightWord);
229							}
230
231							# remove soft hyphens (SHYs) from a word. assume is always #173 (good for
232							# Latin-1, CP-1252, UTF-8; might not work for some encodings) TBD might want
233							# to pass in current encoding, or what SHY value is.
234							# return list of break points where SHYs were removed
235							sub _removeSHY {
236	0			0			my ($word) = @_;
237
238	0						my @SHYs = ();
239	0						my $i = 0;
240
241	0						my @chars = split //, $word;
242	0						my $out = '';
243	0						foreach (@chars) {
244	0	0					if (ord($_) == 173) {
245							# it's a SHY, so remove from word, add to list
246	0						push @SHYs, ($i - 1);
247	0						next;
248							}
249	0						$out .= $_;
250	0						$i++;
251							}
252	0						return ($out, @SHYs);
253							}
254
255							1;