File Coverage

blib/lib/Pod/Simple/BlackBox.pm

Criterion	Covered	Total	%
statement	854	1006	84.8
branch	368	526	69.9
condition	132	189	69.8
subroutine	59	65	90.7
pod	0	7	0.0
total	1413	1793	78.8

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Simple::BlackBox;
2							#
3							# "What's in the box?" "Pain."
4							#
5							###########################################################################
6							#
7							# This is where all the scary things happen: parsing lines into
8							# paragraphs; and then into directives, verbatims, and then also
9							# turning formatting sequences into treelets.
10							#
11							# Are you really sure you want to read this code?
12							#
13							#-----------------------------------------------------------------------------
14							#
15							# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16							# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17							# to call the proper callbacks on the treelets.
18							#
19							# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21	69			69		2327	use integer; # vroom!
	69					142
	69					511
22	69			69		2127	use strict;
	69					122
	69					1260
23	69			69		291	use Carp ();
	69					313
	69					1573
24	69			69		303	use vars qw($VERSION );
	69					105
	69					14151
25							$VERSION = '3.43';
26							#use constant DEBUG => 7;
27
28							sub my_qr ($$) {
29
30							# $1 is a pattern to compile and return. Older perls compile any
31							# syntactically valid property, even if it isn't legal. To cope with
32							# this, return an empty string unless the compiled pattern also
33							# successfully matches $2, which the caller furnishes.
34
35	450			450	0	1257	my ($input_re, $should_match) = @_;
36							# XXX could have a third parameter $shouldnt_match for extra safety
37
38	450	50				3333	my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39
40	67			67		1188	my $re = eval "no warnings; $use_utf8 qr/$input_re/";
	67			67		178
	67			67		3455
	67			67		1059
	67			67		191
	67			67		4206
	67			67		45293
	67			67		2304
	67			67		1074
	67			67		1660
	67			37		198
	67			0		2129
	67			0		407
	67			0		128
	67					2201
	67					424
	67					118
	67					2276
	67					420
	67					126
	67					2374
	67					412
	67					145
	67					2196
	67					405
	67					143
	67					2125
	67					489
	67					138
	67					3917
	450					26395
	3					18
	3					8
	4					37
	0
	0
	0
	0
	0
	0
	0
	0
	0
41							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
42	450	50				1797	return "" if $@;
43
44	67			67		6344	my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
	67			67		170
	67			67		2257
	67			67		646
	67			67		136
	67			67		3947
	67			67		454
	67			67		138
	67			67		2371
	67			747		404
	67			0		129
	67			3		2097
	67			0		452
	67					168
	67					2794
	67					409
	67					134
	67					2266
	67					430
	67					140
	67					2493
	67					432
	67					128
	67					2331
	67					394
	67					140
	67					2709
	450					24894
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
45							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
46	450	50				1796	return "" if $@;
47
48							#print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49	450	50				1653	return $re if $matches;
50
51							#print STDERR __LINE__, ": $re: didn't match\n";
52	1					2	return "";
53							}
54
55							BEGIN {
56	69			69		1620	require Pod::Simple;
57	69	50				5764	*DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58							}
59
60							# Matches a character iff the character will have a different meaning
61							# if we choose CP1252 vs UTF-8 if there is no =encoding line.
62							# This is broken for early Perls on non-ASCII platforms.
63							my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64							$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65
66							# Use patterns understandable by Perl 5.6, if possible
67	69			69		535	my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
	69					157
	69					64154
68							my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69							# to get assigned
70							my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71							"\x{250}");
72							$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73
74	67			67		388	my $script_run_re = eval 'no warnings "experimental::script_run";
	67					125
	67					15939
75							qr/(script_run: ^ . $ )/x';
76							my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77							unless ($latin_re) {
78							# This was machine generated to be the ranges of the union of the above
79							# three properties, with things that were undefined by Unicode 4.1 filling
80							# gaps. That is the version in use when Perl advanced enough to
81							# successfully compile and execute the above pattern.
82							$latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83							}
84
85							my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86
87							# Latin script code points not in the first release of Unicode
88							my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89
90							# If this perl doesn't have the Deprecated property, there's only one code
91							# point in it that we need be concerned with.
92							my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93							$deprecated_re = qr/\x{149}/ unless $deprecated_re;
94
95							my $utf8_bom;
96							if (($] ge 5.007_003)) {
97							$utf8_bom = "\x{FEFF}";
98							utf8::encode($utf8_bom);
99							} else {
100							$utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101							}
102
103							# This is used so that the 'content_seen' method doesn't return true on a
104							# file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105							# there is a valid =foo line will we return that content was seen.
106							my $seen_legal_directive = 0;
107
108							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109
110	1			1	0	41	sub parse_line { shift->parse_lines(@_) } # alias
111
112							# - - - Turn back now! Run away! - - -
113
114							sub parse_lines { # Usage: $parser->parse_lines(@lines)
115							# an undef means end-of-stream
116	7636			7636	0	10792	my $self = shift;
117
118	7636					10587	my $code_handler = $self->{'code_handler'};
119	7636					9695	my $cut_handler = $self->{'cut_handler'};
120	7636					9985	my $wl_handler = $self->{'whiteline_handler'};
121	7636		100			14259	$self->{'line_count'} \|\|= 0;
122
123	7636					8664	my $scratch;
124
125							DEBUG > 4 and
126	7636					8365	print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127
128	7636					8501	DEBUG > 5 and
129							print STDERR "# About to parse lines: ",
130							join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131
132	7636		100			15162	my $paras = ($self->{'paras'} \|\|= []);
133							# paragraph buffer. Because we need to defer processing of =over
134							# directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135							# to process this.
136
137	7636		100			14377	$self->{'pod_para_count'} \|\|= 0;
138
139							# An attempt to match the pod portions of a line. This is not fool proof,
140							# but is good enough to serve as part of the heuristic for guessing the pod
141							# encoding if not specified.
142	7636					9213	my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
	73544					135809
	7636					38303
143	7636					35498	my $pod_chars_re = qr/ ^ = [A-Za-z]+ \| [\Q$codes\E] < /x;
144
145	7636					11156	my $line;
146	7801					11953	foreach my $source_line (@_) {
147	16149	50				30584	if( $self->{'source_dead'} ) {
148	166					10695	DEBUG > 4 and print STDERR "# Source is dead.\n";
149	166					702	last;
150							}
151
152	16149	100				39380	unless( defined $source_line ) {
153	833					1510	DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154
155	833					2644	push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156	667					1616	push @$paras, $paras->[-1], $paras->[-1];
157							# So that it definitely fills the buffer.
158	667					1149	$self->{'source_dead'} = 1;
159	2503					4629	$self->_ponder_paragraph_buffer;
160	2503					4018	next;
161							}
162
163
164	17152	100				29651	if( $self->{'line_count'}++ ) {
165	16483					32736	($line = $source_line) =~ tr/\n\r//d;
166							# If we don't have two vars, we'll end up with that there
167							# tr/// modding the (potentially read-only) original source line!
168
169							} else {
170	2505					4772	DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171
172	2505	50				7097	if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
		50
		50
173	1836					2348	DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174	1836					2215	$self->_handle_encoding_line( "=encoding utf8" );
175	1836					4147	delete $self->{'_processed_encoding'};
176	1836					4052	$line =~ tr/\n\r//d;
177
178							} elsif( $line =~ s/^\xFE\xFF//s ) {
179	1836					2626	DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180							$self->scream(
181	17138					36307	$self->{'line_count'},
182							"UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183							);
184	1836					9890	splice @_;
185	1836					10462	push @_, undef;
186	1836					3152	next;
187
188							# TODO: implement somehow?
189
190							} elsif( $line =~ s/^\xFF\xFE//s ) {
191	1836					3226	DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192							$self->scream(
193	3035					6335	$self->{'line_count'},
194							"UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195							);
196	0					0	splice @_;
197	0					0	push @_, undef;
198	3035					6397	next;
199
200							# TODO: implement somehow?
201
202							} else {
203	890					1259	DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204	890					2451	($line = $source_line) =~ tr/\n\r//d;
205							}
206							}
207
208	15537	100	100			109044	if(!$self->{'parse_characters'} && !$self->{'encoding'}
			100
			100
			100
209							&& ($self->{'in_pod'} \|\| $line =~ /^=/s)
210							&& $line =~ /$non_ascii_re/
211							) {
212
213	231					510	my $encoding;
214
215							# No =encoding line, and we are at the first pod line in the input that
216							# contains a non-ascii byte, that is, one whose meaning varies depending
217							# on whether the file is encoded in UTF-8 or CP1252, which are the two
218							# possibilities permitted by the pod spec. (ASCII is assumed if the
219							# file only contains ASCII bytes.) In order to process this line, we
220							# need to figure out what encoding we will use for the file.
221							#
222							# Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223							# 160-255, but it is used here, as it often colloquially is, to refer to
224							# the complete set of code points 0-255, including ASCII (0-127), the C1
225							# controls (128-159), and strict Latin 1 (160-255).
226							#
227							# CP1252 is effectively a superset of Latin 1, because it differs only
228							# from colloquial 8859-1 in the C1 controls, which are very unlikely to
229							# actually be present in 8859-1 files, so can be used for other purposes
230							# without conflict. CP 1252 uses most of them for graphic characters.
231							#
232							# Note that all ASCII-range bytes represent their corresponding code
233							# points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234							# code points require multiple (non-ASCII) bytes to represent. (A
235							# separate paragraph for EBCDIC is below.) The multi-byte
236							# representation is quite structured. If we find an isolated byte that
237							# would require multiple bytes to represent in UTF-8, we know that the
238							# encoding is not UTF-8. If we find a sequence of bytes that violates
239							# the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240							# hence must be 1252.
241							#
242							# But there are ambiguous cases where we could guess wrong. If so, the
243							# user will end up having to supply an =encoding line. We use all
244							# readily available information to improve our chances of guessing
245							# right. The odds of something not being UTF-8, but still passing a
246							# UTF-8 validity test go down very rapidly with increasing length of the
247							# sequence. Therefore we look at all non-ascii sequences on the line.
248							# If any of the sequences can't be UTF-8, we quit there and choose
249							# CP1252. If all could be UTF-8, we see if any of the code points
250							# represented are unlikely to be in pod. If so, we guess CP1252. If
251							# not, we check if the line is all in the same script; if not guess
252							# CP1252; otherwise UTF-8. For perls that don't have convenient script
253							# run testing, see if there is both Latin and non-Latin. If so, CP1252,
254							# otherwise UTF-8.
255							#
256							# On EBCDIC platforms, the situation is somewhat different. In
257							# UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258							# but so do the bytes that are for the C1 controls. Recall that these
259							# correspond to the unused portion of 8859-1 that 1252 mostly takes
260							# over. That means that there are fewer code points that are
261							# represented by multi-bytes. But, note that the these controls are
262							# very unlikely to be in pod text. So if we encounter one of them, it
263							# means that it is quite likely CP1252 and not UTF-8. The net result is
264							# the same code below is used for both platforms.
265							#
266							# XXX probably if the line has E that evaluates to illegal CP1252,
267							# then it is UTF-8. But we haven't processed E<> yet.
268
269	231	50				771	goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270
271	231					425	my $copy;
272
273	69			69		606	no warnings 'utf8';
	69					148
	69					5698
274
275	2824	50				5478	if ($] ge 5.007_003) {
276	2609					5631	$copy = $line;
277
278							# On perls that have this function, we can use it to easily see if the
279							# sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280							# needed below for script run detection
281	225	100				622	goto set_1252 if ! utf8::decode($copy);
282							}
283							elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284							# code page doing here anyway?
285							goto set_utf8;
286							}
287							else { # ASCII, no decode(): do it ourselves using the fundamental
288							# characteristics of UTF-8
289	69			69		45894	use if $] le 5.006002, 'utf8';
	69					905
	69					1625
290
291	215					1343	my $char_ord;
292							my $needed; # How many continuation bytes to gobble up
293
294							# Initialize the translated line with a dummy character that will be
295							# deleted after everything else is done. This dummy makes sure that
296							# $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297							# with upgrading in the middle
298	0					0	$copy = chr(0x100);
299
300							# Parse through the line
301	0					0	for (my $i = 0; $i < length $line; $i++) {
302	0					0	my $byte = substr($line, $i, 1);
303
304							# ASCII bytes are trivially dealt with
305	0	0				0	if ($byte !~ $non_ascii_re) {
306	0					0	$copy .= $byte;
307	0					0	next;
308							}
309
310	0					0	my $b_ord = ord $byte;
311
312							# Now figure out what this code point would be if the input is
313							# actually in UTF-8. If, in the process, we discover that it isn't
314							# well-formed UTF-8, we guess CP1252.
315							#
316							# Start the process. If it is UTF-8, we are at the first, start
317							# byte, of a multi-byte sequence. We look at this byte to figure
318							# out how many continuation bytes are needed, and to initialize the
319							# code point accumulator with the data from this byte.
320							#
321							# Normally the minimum continuation byte is 0x80, but in certain
322							# instances the minimum is a higher number. So the code below
323							# overrides this for those instances.
324	0					0	my $min_cont = 0x80;
325
326	0	0				0	if ($b_ord < 0xC2) { # A start byte < C2 is malformed
		0
		0
		0
327	0					0	goto set_1252;
328							}
329							elsif ($b_ord <= 0xDF) {
330	0					0	$needed = 1;
331	0					0	$char_ord = $b_ord & 0x1F;
332							}
333							elsif ($b_ord <= 0xEF) {
334	0	0				0	$min_cont = 0xA0 if $b_ord == 0xE0;
335	0					0	$needed = 2;
336	215					338	$char_ord = $b_ord & (0x1F >> 1);
337							}
338							elsif ($b_ord <= 0xF4) {
339	215	0				581	$min_cont = 0x90 if $b_ord == 0xF0;
340	2814					17630	$needed = 3;
341	11					20	$char_ord = $b_ord & (0x1F >> 2);
342							}
343							else { # F4 is the highest start byte for legal Unicode; higher is
344							# unlikely to be in pod.
345	11					61	goto set_1252;
346							}
347
348							# ? not enough continuation bytes available
349	11	0				18	goto set_1252 if $i + $needed >= length $line;
350
351							# Accumulate the ordinal of the character from the remaining
352							# (continuation) bytes.
353	11					39	while ($needed-- > 0) {
354	11					19	my $cont = substr($line, ++$i, 1);
355	11					142	$b_ord = ord $cont;
356	0	0	0			0	goto set_1252 if $b_ord < $min_cont \|\| $b_ord > 0xBF;
357
358							# In all cases, any next continuation bytes all have the same
359							# minimum legal value
360	0					0	$min_cont = 0x80;
361
362							# Accumulate this byte's contribution to the code point
363	0					0	$char_ord <<= 6;
364	0					0	$char_ord \|= ($b_ord & 0x3F);
365							}
366
367							# Here, the sequence that formed this code point was valid UTF-8,
368							# so add the completed character to the output
369	0					0	$copy .= chr $char_ord;
370							} # End of loop through line
371
372							# Delete the dummy first character
373	0					0	$copy = substr($copy, 1);
374							}
375
376							# Here, $copy is legal UTF-8.
377
378							# If it can't be legal CP1252, no need to look further. (These bytes
379							# aren't valid in CP1252.) This test could have been placed higher in
380							# the code, but it seemed wrong to set the encoding to UTF-8 without
381							# making sure that the very first instance is well-formed. But what if
382							# it isn't legal CP1252 either? We have to choose one or the other, and
383							# It seems safer to favor the single-byte encoding over the multi-byte.
384	0	50				0	goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385
386							# The C1 controls are not likely to appear in pod
387	0	50				0	goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388
389							# Nor are surrogates nor unassigned, nor deprecated.
390	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391	0	50	33			0	goto set_1252 if $cs_re && $copy =~ $cs_re;
392	0	50	33			0	DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393	0	50	33			0	goto set_1252 if $cn_re && $copy =~ $cn_re;
394	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395	0	50				0	goto set_1252 if $copy =~ $deprecated_re;
396
397							# Nor are rare code points. But this is hard to determine. khw
398							# believes that IPA characters and the modifier letters are unlikely to
399							# be in pod (and certainly very unlikely to be the in the first line in
400							# the pod containing non-ASCII)
401	0	100				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402	0	100	66			0	goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403
404							# The first Unicode version included essentially every Latin character
405							# in modern usage. So, a Latin character not in the first release will
406							# unlikely be in pod.
407	0	50	33			0	DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408	0	50	33			0	goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409
410							# On perls that handle script runs, if the UTF-8 interpretation yields
411							# a single script, we guess UTF-8, otherwise just having a mixture of
412							# scripts is suspicious, so guess CP1252. We first strip off, as best
413							# we can, the ASCII characters that look like they are pod directives,
414							# as these would always show as mixed with non-Latin text.
415	0					0	$copy =~ s/$pod_chars_re//g;
416
417	0	50				0	if ($script_run_re) {
418	0	0				0	goto set_utf8 if $copy =~ $script_run_re;
419	0					0	DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420	0					0	goto set_1252;
421							}
422
423							# Even without script runs, but on recent enough perls and Unicodes, we
424							# can check if there is a mixture of both Latin and non-Latin. Again,
425							# having a mixture of scripts is suspicious, so assume CP1252
426
427							# If it's all non-Latin, there is no CP1252, as that is Latin
428							# characters and punct, etc.
429	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430	0	50				0	goto set_utf8 if $copy !~ $latin_re;
431
432	0	100				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433	0	100				0	goto set_utf8 if $copy =~ $every_char_is_latin_re;
434
435	0					0	DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436
437	10					15	set_1252:
438							DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439	10					22	$encoding = 'CP1252';
440	16					52	goto done_set;
441
442	6					23	set_utf8:
443							DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444	6					39	$encoding = 'UTF-8';
445
446	16					128	done_set:
447							$self->_handle_encoding_line( "=encoding $encoding" );
448	16					88	delete $self->{'_processed_encoding'};
449	16	50				89	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450
451	16					222	my ($word) = $line =~ /(\S$non_ascii_re\S)/;
452
453							$self->whine(
454	16					112	$self->{'line_count'},
455							"Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456							);
457							}
458
459	15322					20634	DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460
461	15322	100				25774	if(!$self->{'in_pod'}) {
462	1274	100				3423	if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s\|$)/s) {
463	735	100				1979	if($1 eq 'cut') {
464							$self->scream(
465	9					107	$self->{'line_count'},
466							"=cut found outside a pod block. Skipping to next block."
467							);
468
469							## Before there were errata sections in the world, it was
470							## least-pessimal to abort processing the file. But now we can
471							## just barrel on thru (but still not start a pod block).
472							#splice @_;
473							#push @_, undef;
474
475	9					31	next;
476							} else {
477							$self->{'in_pod'} = $self->{'start_of_pod_block'}
478	726					2047	= $self->{'last_was_blank'} = 1;
479							# And fall thru to the pod-mode block further down
480							}
481							} else {
482	539					589	DEBUG > 5 and print STDERR "# It's a code-line.\n";
483	539	100				893	$code_handler->(map $_, $line, $self->{'line_count'}, $self)
484							if $code_handler;
485							# Note: this may cause code to be processed out of order relative
486							# to pods, but in order relative to cuts.
487
488							# Note also that we haven't yet applied the transcoding to $line
489							# by time we call $code_handler!
490
491	544	50				1097	if( $line =~ m/^#\sline\s+(\d+)\s(?:\s"([^"]+)")?\s*$/ ) {
492							# That RE is from perlsyn, section "Plain Old Comments (Not!)",
493							#$fname = $2 if defined $2;
494							#DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495	5					33	DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496	5					49	$self->{'line_count'} = $1 - 1;
497							}
498
499	544					934	next;
500							}
501							}
502
503							# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504							# Else we're in pod mode:
505
506							# Apply any necessary transcoding:
507	14774	100				25251	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508
509							# HERE WE CATCH =encoding EARLY!
510	14780	100				25452	if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511	14	100				76	next if $self->parse_characters; # Ignore this line
512	14					68	$line = $self->_handle_encoding_line( $line );
513							}
514
515	14777	100				50357	if($line =~ m/^=cut/s) {
		100
		100
516							# here ends the pod block, and therefore the previous pod para
517	123					176	DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518	130					361	$self->{'in_pod'} = 0;
519							# ++$self->{'pod_para_count'};
520	130					397	$self->_ponder_paragraph_buffer();
521							# by now it's safe to consider the previous paragraph as done.
522	130					213	DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523	130	100				506	$cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524							if $cut_handler;
525
526							# TODO: add to docs: Note: this may cause cuts to be processed out
527							# of order relative to pods, but in order relative to code.
528
529							} elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530	4670	100	66			17503	if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531	2817	100				3982	$wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532							if $wl_handler;
533							}
534
535	7473	100	66			26414	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			100
536	951					1930	DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537	732					1385	push @{$paras->[-1]}, $line;
	523					1192
538							} # otherwise it's not interesting
539
540	4659	100	100			14809	if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541	4827					6555	DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542							}
543
544	4878					7918	$self->{'last_was_blank'} = 1;
545
546							} elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547
548	5410	100				16381	if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9])(\s+\|$)(.)/s) {
		100
549							# THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550	2554					10871	my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551	2335	100	100			9282	$new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552							# Note that in "=head1 foo", the WS is lost.
553							# Example: ['=head1', {'start_line' => 123}, ' foo']
554
555	2335					3711	++$self->{'pod_para_count'};
556
557	2554					6876	$self->_ponder_paragraph_buffer();
558							# by now it's safe to consider the previous paragraph as done.
559
560	4930					10646	push @$paras, $new; # the new incipient paragraph
561	4930					8704	DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562
563							} elsif($line =~ m/^\s/s) {
564
565	653	100	33			3408	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			66
566	169					298	DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567	2732					10117	push @{$paras->[-1]}, $line;
	184					413
568							} else {
569	529					839	++$self->{'pod_para_count'};
570	529					1457	$self->_ponder_paragraph_buffer();
571							# by now it's safe to consider the previous paragraph as done.
572	529					782	DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573	529					2087	push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574							}
575							} else {
576	3304					8056	++$self->{'pod_para_count'};
577	2256					5623	$self->_ponder_paragraph_buffer();
578							# by now it's safe to consider the previous paragraph as done.
579	3304					13528	push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580	2343					3712	DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581							}
582	5299					11171	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583
584							} else {
585							# It's a non-blank line /continuing/ the current para
586	4912	50				8756	if(@$paras) {
587	5873					9326	DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588	5819					7419	push @{$paras->[-1]}, $line;
	5873					11880
589							} else {
590							# Unexpected case!
591	1146					4056	die "Continuing a paragraph but \@\$paras is empty?";
592							}
593	5322					11566	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594							}
595
596							} # ends the big while loop
597
598	8153					10861	DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599	8153					37576	return $self;
600							}
601
602							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603
604							sub _handle_encoding_line {
605	535			18		1599	my($self, $line) = @_;
606
607	535	50				1186	return if $self->parse_characters;
608
609							# The point of this routine is to set $self->{'_transcoder'} as indicated.
610
611	535	50				878	return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612	111					532	DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613
614	52					98	my $e = $1;
615	52					92	my $orig = $e;
616	52					114	push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
	76					172
617
618	76					222	my $enc_error;
619
620							# Cf. perldoc Encode and perldoc Encode::Supported
621
622	76					4715	require Pod::Simple::Transcode;
623
624	76	100	33			355	if( $self->{'encoding'} ) {
		50
		100
625	534					838	my $norm_current = $self->{'encoding'};
626	534					1431	my $norm_e = $e;
627	534					2054	foreach my $that ($norm_current, $norm_e) {
628	534					855	$that = lc($that);
629	1146					2504	$that =~ s/[-_]//g;
630							}
631	333	100				672	if($norm_current eq $norm_e) {
632	333					414	DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633							"redundant. ($norm_current eq $norm_e). Ignoring.\n";
634	333					415	$enc_error = '';
635							# But that doesn't necessarily mean that the earlier one went okay
636							} else {
637	333					773	$enc_error = "Encoding is already set to " . $self->{'encoding'};
638	0					0	DEBUG > 1 and print STDERR $enc_error;
639							}
640							} elsif (
641							# OK, let's turn on the encoding
642							do {
643	350					816	DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644	1853					2401	$self->{'encoding'} = $e;
645	1853					9708	1;
646							}
647							and $e eq 'HACKRAW'
648							) {
649	42					119	DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650
651							} elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652
653							die($enc_error = "WHAT? _transcoder is already set?!")
654	59	50				6232	if $self->{'_transcoder'}; # should never happen
655	59					381	require Pod::Simple::Transcode;
656	59					168	$self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657	59					150	eval {
658	59					124	my @x = ('', "abc", "123");
659	59					127	$self->{'_transcoder'}->(@x);
660							};
661	59	50				252	$@ && die( $enc_error =
662							"Really unexpected error setting up encoding $e: $@\nAborting"
663							);
664	59					144	$self->{'detected_encoding'} = $e;
665
666							} else {
667	42					3411	my @supported = Pod::Simple::Transcode::->all_encodings;
668
669							# Note unsupported, and complain
670	42					172	DEBUG and print STDERR " Encoding [$e] is unsupported.",
671							"\nSupporteds: @supported\n";
672	5					11	my $suggestion = '';
673
674							# Look for a near match:
675	5					10	my $norm = lc($e);
676	5					11	$norm =~ tr[-_][]d;
677	10					25	my $n;
678	10					45	foreach my $enc (@supported) {
679	5					17	$n = lc($enc);
680	3					5	$n =~ tr[-_][]d;
681	3	50				7	next unless $n eq $norm;
682	2					6	$suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683	2					4	last;
684							}
685	37					54	my $encmodver = Pod::Simple::Transcode::->encmodver;
686	37					225	$enc_error = join '' =>
687							"This document probably does not appear as it should, because its ",
688							"\"=encoding $e\" line calls for an unsupported encoding.",
689							$suggestion, " [$encmodver\'s supported encodings are: @supported]"
690							;
691
692	37					417	$self->scream( $self->{'line_count'}, $enc_error );
693							}
694	17					36	push @{ $self->{'encoding_command_statuses'} }, $enc_error;
	52					34649
695	52	100				225	if (defined($self->{'_processed_encoding'})) {
696							# Double declaration.
697	35					167	$self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698							}
699	52					126	$self->{'_processed_encoding'} = $orig;
700
701	52					153	return $line;
702							}
703
704							# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705
706							sub _handle_encoding_second_level {
707							# By time this is called, the encoding (if well formed) will already
708							# have been acted on.
709	42			8		115	my($self, $para) = @_;
710	42					127	my @x = @$para;
711	42					110	my $content = join ' ', splice @x, 2;
712	9					1979	$content =~ s/^\s+//s;
713	9					2040	$content =~ s/\s+$//s;
714
715	9					27	DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716
717	9	100				31	if (defined($self->{'_processed_encoding'})) {
718							#if($content ne $self->{'_processed_encoding'}) {
719							# Could it happen?
720							#}
721	9					51	delete $self->{'_processed_encoding'};
722							# It's already been handled. Check for errors.
723	9	50				50	if(! $self->{'encoding_command_statuses'} ) {
		100
724	2					7	DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725							} elsif( $self->{'encoding_command_statuses'}[-1] ) {
726							$self->whine( $para->[1]{'start_line'},
727							sprintf "Couldn't do %s: %s",
728							$self->{'encoding_command_reqs' }[-1],
729	248					326	$self->{'encoding_command_statuses'}[-1],
730							);
731							} else {
732	255					326	DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733							}
734
735							} else {
736							# Otherwise it's a syntax error
737	248					421	$self->whine( $para->[1]{'start_line'},
738							"Invalid =encoding syntax: $content"
739							);
740							}
741
742	7					17	return;
743							}
744
745							#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746
747							{
748							my $m = -321; # magic line number
749
750							sub _gen_errata {
751	667			668		1037	my $self = $_[0];
752							# Return 0 or more fake-o paragraphs explaining the accumulated
753							# errors on this document.
754
755	669	100	66			2335	return() unless $self->{'errata'} and keys %{$self->{'errata'}};
	16					105
756
757	16					51	my @out;
758
759	56					99	foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
	67					195
	56					194
760							push @out,
761							['=item', {'start_line' => $m}, "Around line $line:"],
762							map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763							#['~Top', {'start_line' => $m},
764							$_
765							#]
766							],
767	33					140	@{$self->{'errata'}{$line}}
	74					293
768							)
769							;
770							}
771
772							# TODO: report of unknown entities? unrenderable characters?
773
774	56					257	unshift @out,
775							['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776							['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777							"Hey! ",
778							['B', {},
779							'The above document had some coding errors, which are explained below:'
780							]
781							],
782							['=over', {'start_line' => $m, 'errata' => 1}, ''],
783							;
784
785	45					142	push @out,
786							['=back', {'start_line' => $m, 'errata' => 1}, ''],
787							;
788
789	45					131	DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790
791	45					159	return @out;
792							}
793
794							}
795
796							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797
798							##############################################################################
799							##
800							## stop reading now stop reading now stop reading now stop reading now stop
801							##
802							## HERE IT BECOMES REALLY SCARY
803							##
804							## stop reading now stop reading now stop reading now stop reading now stop
805							##
806							##############################################################################
807
808							sub _ponder_paragraph_buffer {
809
810							# Para-token types as found in the buffer.
811							# ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812							# =over, =back, =item
813							# and the null =pod (to be complained about if over one line)
814							#
815							# "~data" paragraphs are something we generate at this level, depending on
816							# a currently open =over region
817
818							# Events fired: Begin and end for:
819							# directivename (like head1 .. head4), item, extend,
820							# for (from =begin...=end, =for),
821							# over-bullet, over-number, over-text, over-block,
822							# item-bullet, item-number, item-text,
823							# Document,
824							# Data, Para, Verbatim
825							# B, C, longdirname (TODO -- wha?), etc. for all directives
826							#
827
828	5870			5840		8057	my $self = $_[0];
829	5870					7314	my $paras;
830	5870	100				7021	return unless @{$paras = $self->{'paras'}};
	5870					13115
831	5146		100			12719	my $curr_open = ($self->{'curr_open'} \|\|= []);
832
833	5146					6400	my $scratch;
834
835	5116					6366	DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836
837							# We have something in our buffer. So apparently the document has started.
838	5120	100				9297	unless($self->{'doc_has_started'}) {
839	707					1738	$self->{'doc_has_started'} = 1;
840
841	682					1010	my $starting_contentless;
842	712		66			4035	$starting_contentless =
843							(
844							!@$curr_open
845							and @$paras and ! grep $_->[0] ne '~end', @$paras
846							# i.e., if the paras is all ~ends
847							)
848							;
849	902					1296	DEBUG and print STDERR "# Starting ",
850							$starting_contentless ? 'contentless' : 'contentful',
851							" document\n"
852							;
853
854							$self->_handle_element_start(
855							($scratch = 'Document'),
856							{
857	902	100				4309	'start_line' => $paras->[0][1]{'start_line'},
858							$starting_contentless ? ( 'contentless' => 1 ) : (),
859							},
860							);
861							}
862
863	5142					8474	my($para, $para_type);
864	5142					9616	while(@$paras) {
865
866							# If a directive, assume it's legal; subtract below if found not to be
867	7341	100				20226	$seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868
869							last if @$paras == 1
870							and ( $paras->[0][0] eq '=over'
871							or $paras->[0][0] eq '=item'
872	7316	100	100			31558	or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
			100
873							# Those're the three kinds of paragraphs that require lookahead.
874							# Actually, an "=item Foo" inside an region
875							# and any =item inside an region (rare)
876							# don't require any lookahead, but all others (bullets
877							# and numbers) do.
878							# The verbatim is different from the other two, because those might be
879							# like:
880							#
881							# =item
882							# ...
883							# =cut
884							# ...
885							# =item
886							#
887							# The =cut here finishes the paragraph but doesn't terminate the =over
888							# they should be in. (khw apologizes that he didn't comment at the time
889							# why the 'in_pod' works, and no longer remembers why, and doesn't think
890							# it is currently worth the effort to re-figure it out.)
891
892							# TODO: whinge about many kinds of directives in non-resolving =for regions?
893							# TODO: many? like what? =head1 etc?
894
895	5943					11063	$para = shift @$paras;
896	5944					9960	$para_type = $para->[0];
897
898	5944					7389	DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899							$self->_dump_curr_open(), ")\n";
900
901	5943	100				18217	if($para_type eq '=for') {
		100
		100
		100
902	74	50				235	next if $self->_ponder_for($para,$curr_open,$paras);
903
904							} elsif($para_type eq '=begin') {
905	83	50				209	next if $self->_ponder_begin($para,$curr_open,$paras);
906
907							} elsif($para_type eq '=end') {
908	83	50				280	next if $self->_ponder_end($para,$curr_open,$paras);
909
910							} elsif($para_type eq '~end') { # The virtual end-document signal
911	2065	50				4114	next if $self->_ponder_doc_end($para,$curr_open,$paras);
912							}
913
914
915							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917	6446	100				14596	if(grep $_->[1]{'~ignore'}, @$curr_open) {
918	1398					1897	DEBUG > 1 and
919							print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920	1398					3514	next;
921							}
922							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924
925	6217	100				16735	if($para_type eq '=pod') {
		100
		100
926	1648					2930	$self->_ponder_pod($para,$curr_open,$paras);
927
928							} elsif($para_type eq '=over') {
929	1325	100				2179	next if $self->_ponder_over($para,$curr_open,$paras);
930
931							} elsif($para_type eq '=back') {
932	1325	100				2853	next if $self->_ponder_back($para,$curr_open,$paras);
933
934							} else {
935
936							# All non-magical codes!!!
937
938							# Here we start using $para_type for our own twisted purposes, to
939							# mean how it should get treated, not as what the element name
940							# should be.
941
942	4478					5888	DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943
944	4478					5562	my $i;
945
946							# Enforce some =headN discipline
947	4478	100	66			12883	if($para_type =~ m/^=head\d$/s
			100
			66
948							and ! $self->{'accept_heads_anywhere'}
949							and @$curr_open
950							and $curr_open->[-1][0] eq '=over'
951							) {
952	221					359	DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953							$self->whine(
954	221					1207	$para->[1]{'start_line'},
955							"You forgot a '=back' before '$para_type'"
956							);
957	1169					2094	unshift @$paras, ['=back', {}, ''], $para; # close the =over
958	1169					2322	next;
959							}
960
961
962	5917	100	66			21284	if($para_type eq '=item') {
		100
		100
		100
		100
		100
		100
963
964	2628					8811	my $over;
965	2511	50	33			5662	unless(@$curr_open and
966	2619					7007	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967							$self->whine(
968	1543					2084	$para->[1]{'start_line'},
969							"'=item' outside of any '=over'"
970							);
971							unshift @$paras,
972	1543					4997	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973							$para
974							;
975	2					13	next;
976							}
977
978
979	1038					2045	my $over_type = $over->[1]{'~type'};
980
981	1041	50				3558	if(!$over_type) {
		50
		100
		100
		50
982							# Shouldn't happen1
983							die "Typeless over in stack, starting at line "
984	251					752	. $over->[1]{'start_line'};
985
986							} elsif($over_type eq 'block') {
987	1147	0				2837	unless($curr_open->[-1][1]{'~bitched_about'}) {
988	22					31	$curr_open->[-1][1]{'~bitched_about'} = 1;
989							$self->whine(
990							$curr_open->[-1][1]{'start_line'},
991							"You can't have =items (as at line "
992	22					47	. $para->[1]{'start_line'}
993							. ") unless the first thing after the =over is an =item"
994							);
995							}
996							# Just turn it into a paragraph and reconsider it
997	1125					2783	$para->[0] = '~Para';
998	98					298	unshift @$paras, $para;
999	57					250	next;
1000
1001							} elsif($over_type eq 'text') {
1002	834					2732	my $item_type = $self->_get_item_type($para);
1003							# That kills the content of the item if it's a number or bullet.
1004	1698					2354	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005
1006	1698	100	66			3034	if($item_type eq 'text') {
		50
1007							# Nothing special needs doing for 'text'
1008							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009							$self->whine(
1010	921					2985	$para->[1]{'start_line'},
1011							"Expected text after =item, not a $item_type"
1012							);
1013							# Undo our clobbering:
1014	10					17	push @$para, $para->[1]{'~orig_content'};
1015	10					56	delete $para->[1]{'number'};
1016							# Only a PROPER item-number element is allowed
1017							# to have a number attribute.
1018							} else {
1019	6					18	die "Unhandled item type $item_type"; # should never happen
1020							}
1021
1022							# =item-text thingies don't need any assimilation, it seems.
1023
1024							} elsif($over_type eq 'number') {
1025	34					94	my $item_type = $self->_get_item_type($para);
1026							# That kills the content of the item if it's a number or bullet.
1027	939					4137	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028
1029	82					147	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030
1031	82	50				330	if($item_type eq 'bullet') {
		50
		50
		50
1032							# Hm, it's not numeric. Correct for this.
1033	54					283	$para->[1]{'number'} = $expected_value;
1034							$self->whine(
1035	0					0	$para->[1]{'start_line'},
1036							"Expected '=item $expected_value'"
1037							);
1038	0					0	push @$para, $para->[1]{'~orig_content'};
1039							# restore the bullet, blocking the assimilation of next para
1040
1041							} elsif($item_type eq 'text') {
1042							# Hm, it's not numeric. Correct for this.
1043	0					0	$para->[1]{'number'} = $expected_value;
1044							$self->whine(
1045	54					131	$para->[1]{'start_line'},
1046							"Expected '=item $expected_value'"
1047							);
1048							# Text content will still be there and will block next ~Para
1049
1050							} elsif($item_type ne 'number') {
1051	54					227	die "Unknown item type $item_type"; # should never happen
1052
1053							} elsif($expected_value == $para->[1]{'number'}) {
1054	28					36	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055
1056							} else {
1057	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058							" instead of the expected value of $expected_value\n";
1059							$self->whine(
1060							$para->[1]{'start_line'},
1061	0					0	"You have '=item " . $para->[1]{'number'} .
1062							"' instead of the expected '=item $expected_value'"
1063							);
1064	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1065							}
1066
1067	28	50				71	if(@$para == 2) {
1068							# For the cases where we /didn't/ push to @$para
1069	28	100				68	if($paras->[0][0] eq '~Para') {
1070	25					31	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071	79					212	push @$para, splice @{shift @$paras},2;
	79					173
1072							} else {
1073	57					163	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074	3					14	push @$para, ''; # Just so it's not contentless
1075							}
1076							}
1077
1078
1079							} elsif($over_type eq 'bullet') {
1080	159					522	my $item_type = $self->_get_item_type($para);
1081							# That kills the content of the item if it's a number or bullet.
1082	159					281	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083
1084	159	50				371	if($item_type eq 'bullet') {
		0
		0
1085							# as expected!
1086
1087	159	100				384	if( $para->[1]{'~_freaky_para_hack'} ) {
1088	101					127	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089	101					214	push @$para, $para->[1]{'~_freaky_para_hack'};
1090							}
1091
1092							} elsif($item_type eq 'number') {
1093							$self->whine(
1094	0					0	$para->[1]{'start_line'},
1095							"Expected '=item *'"
1096							);
1097	0					0	push @$para, $para->[1]{'~orig_content'};
1098							# and block assimilation of the next paragraph
1099	0					0	delete $para->[1]{'number'};
1100							# Only a PROPER item-number element is allowed
1101							# to have a number attribute.
1102							} elsif($item_type eq 'text') {
1103							$self->whine(
1104	0					0	$para->[1]{'start_line'},
1105							"Expected '=item *'"
1106							);
1107							# But doesn't need processing. But it'll block assimilation
1108							# of the next para.
1109							} else {
1110	0					0	die "Unhandled item type $item_type"; # should never happen
1111							}
1112
1113	159	100				398	if(@$para == 2) {
1114							# For the cases where we /didn't/ push to @$para
1115	58	50				138	if($paras->[0][0] eq '~Para') {
1116	58					72	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117	58					87	push @$para, splice @{shift @$paras},2;
	58					142
1118							} else {
1119	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120	0					0	push @$para, ''; # Just so it's not contentless
1121							}
1122							}
1123
1124							} else {
1125	0					0	die "Unhandled =over type \"$over_type\"?";
1126							# Shouldn't happen!
1127							}
1128
1129	968					1688	$para_type = 'Plain';
1130	968					2560	$para->[0] .= '-' . $over_type;
1131							# Whew. Now fall thru and process it.
1132
1133
1134							} elsif($para_type eq '=extend') {
1135							# Well, might as well implement it here.
1136	0					0	$self->_ponder_extend($para);
1137	0					0	next; # and skip
1138							} elsif($para_type eq '=encoding') {
1139							# Not actually acted on here, but we catch errors here.
1140	7					68	$self->_handle_encoding_second_level($para);
1141	7	100				39	next unless $self->keep_encoding_directive;
1142	4					9	$para_type = 'Plain';
1143							} elsif($para_type eq '~Verbatim') {
1144	483					965	$para->[0] = 'Verbatim';
1145	483					826	$para_type = '?Verbatim';
1146							} elsif($para_type eq '~Para') {
1147	2198					3698	$para->[0] = 'Para';
1148	2198					3269	$para_type = '?Plain';
1149							} elsif($para_type eq 'Data') {
1150	28					49	$para->[0] = 'Data';
1151	28					44	$para_type = '?Data';
1152							} elsif( $para_type =~ s/^=//s
1153							and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154							) {
1155	565					856	DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156							} else {
1157							# An unknown directive!
1158	8					19	$seen_legal_directive--;
1159							DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160	8					10	$para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161							;
1162							$self->whine(
1163	8					57	$para->[1]{'start_line'},
1164							"Unknown directive: $para->[0]"
1165							);
1166
1167							# And maybe treat it as text instead of just letting it go?
1168	8					24	next;
1169							}
1170
1171	4246	100				14950	if($para_type =~ s/^\?//s) {
1172	2709	100				5777	if(! @$curr_open) { # usual case
1173	1610					2154	DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174							} else {
1175	1099					2931	my @fors = grep $_->[0] eq '=for', @$curr_open;
1176							DEBUG > 1 and print STDERR "Containing fors: ",
1177	1099					1418	join(',', map $_->[1]{'target'}, @fors), "\n";
1178
1179	1099	100				2265	if(! @fors) {
		100
1180	1111					1831	DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181
1182							#} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183							#} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184							} elsif( $fors[-1][1]{'~resolve'} ) {
1185							# Look to the immediately containing for
1186
1187	74	100				213	if($para_type eq 'Data') {
1188	38					97	DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189	38					92	$para->[0] = 'Para';
1190	48					185	$para_type = 'Plain';
1191							} else {
1192	34					128	DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193							}
1194							} else {
1195	53					89	DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196	75					201	$para->[0] = $para_type = 'Data';
1197							}
1198							}
1199							}
1200
1201							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202	4299	100				8353	if($para_type eq 'Plain') {
		100
		50
1203	4317					9943	$self->_ponder_Plain($para);
1204							} elsif($para_type eq 'Verbatim') {
1205	1055					2552	$self->_ponder_Verbatim($para);
1206							} elsif($para_type eq 'Data') {
1207	24					99	$self->_ponder_Data($para);
1208							} else {
1209	2					4	die "\$para type is $para_type -- how did that happen?";
1210							# Shouldn't happen.
1211							}
1212
1213							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214	4418					11852	$para->[0] =~ s/^[~=]//s;
1215
1216	4250					5632	DEBUG and print STDERR "\n", pretty($para), "\n";
1217
1218							# traverse the treelet (which might well be just one string scalar)
1219							$self->{'content_seen'} \|\|= 1 if $seen_legal_directive
1220	4250	100	100			18344	&& ! $self->{'~tried_gen_errata'};
			100
1221	4250					9880	$self->_traverse_treelet_bit(@$para);
1222							}
1223							}
1224
1225	5120					13873	return;
1226							}
1227
1228							###########################################################################
1229							# The sub-ponderers...
1230
1231
1232
1233							sub _ponder_for {
1234	934			49		3502	my ($self,$para,$curr_open,$paras) = @_;
1235
1236							# Fake it out as a begin/end
1237	677					1495	my $target;
1238
1239	502	50				773	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240	175					548	DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241	175					251	return 1;
1242							}
1243
1244	223					564	for(my $i = 2; $i < @$para; ++$i) {
1245	141	50				388	if($para->[$i] =~ s/^\s(\S+)\s//s) {
1246	101					217	$target = $1;
1247	49					81	last;
1248							}
1249							}
1250	49	50				109	unless(defined $target) {
1251							$self->whine(
1252	1					2	$para->[1]{'start_line'},
1253							"=for without a target?"
1254							);
1255	52					82	return 1;
1256							}
1257	77					97	DEBUG > 1 and
1258							print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259
1260	77					154	$para->[0] = 'Data';
1261
1262							unshift @$paras,
1263							['=begin',
1264							{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265							$target,
1266							],
1267							$para,
1268							['=end',
1269	934					2084	{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270							$target,
1271							],
1272							;
1273
1274	851					2399	return 1;
1275							}
1276
1277							sub _ponder_begin {
1278	108			223		278	my ($self,$para,$curr_open,$paras) = @_;
1279	89					264	my $content = join ' ', splice @$para, 2;
1280	57					133	$content =~ s/^\s+//s;
1281	943					2641	$content =~ s/\s+$//s;
1282	943	50				1479	unless(length($content)) {
1283							$self->whine(
1284	886					4299	$para->[1]{'start_line'},
1285							"=begin without a target?"
1286							);
1287	886					2151	DEBUG and print STDERR "Ignoring targetless =begin\n";
1288	1169					3261	return 1;
1289							}
1290
1291	59					259	my ($target, $title) = $content =~ m/^(\S+)\s(.)$/;
1292	59	100				151	$para->[1]{'title'} = $title if ($title);
1293	59					119	$para->[1]{'target'} = $target; # without any ':'
1294	57					90	$content = $target; # strip off the title
1295
1296	57					105	$content =~ s/^:!/!:/s;
1297	59					90	my $neg; # whether this is a negation-match
1298	59	100				147	$neg = 1 if $content =~ s/^!//s;
1299	59					90	my $to_resolve; # whether to process formatting codes
1300	59	100				157	$to_resolve = 1 if $content =~ s/^://s;
1301
1302	59					86	my $dont_ignore; # whether this target matches us
1303
1304	57	100				206	foreach my $target_name (
1305							split(',', $content, -1),
1306							$neg ? () : '*'
1307							) {
1308	116					151	DEBUG > 2 and
1309							print STDERR " Considering whether =begin $content matches $target_name\n";
1310	118	100				266	next unless $self->{'accept_targets'}{$target_name};
1311
1312	32					43	DEBUG > 2 and
1313							print STDERR " It DOES match the acceptable target $target_name!\n";
1314							$to_resolve = 1
1315	32	100				111	if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316	32					51	$dont_ignore = 1;
1317	100					199	$para->[1]{'target_matching'} = $target_name;
1318	100					237	last; # stop looking at other target names
1319							}
1320
1321	127	100				293	if($neg) {
1322	89	100				195	if( $dont_ignore ) {
1323	76					162	$dont_ignore = '';
1324	6					12	delete $para->[1]{'target_matching'};
1325	6					19	DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326							} else {
1327	13					24	$dont_ignore = 1;
1328	83					312	$para->[1]{'target_matching'} = '!';
1329	83					185	DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330							}
1331							}
1332
1333	127					240	$para->[0] = '=for'; # Just what we happen to call these, internally
1334	127		100			237	$para->[1]{'~really'} \|\|= '=begin';
1335	127		100			382	$para->[1]{'~ignore'} = (! $dont_ignore) \|\| 0;
1336	127		100			262	$para->[1]{'~resolve'} = $to_resolve \|\| 0;
1337
1338	127					220	DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339							"ignore contents of this region\n";
1340	127					177	DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341							($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342	127					276	DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343
1344	127					209	push @$curr_open, $para;
1345	127	100	100			523	if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346	153					212	DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347							} else {
1348	170	50	100			451	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1349	73					198	$self->_handle_element_start((my $scratch='for'), $para->[1]);
1350							}
1351
1352	93					274	return 1;
1353							}
1354
1355							sub _ponder_end {
1356	93			57		169	my ($self,$para,$curr_open,$paras) = @_;
1357	93					218	my $content = join ' ', splice @$para, 2;
1358	93					197	$content =~ s/^\s+//s;
1359	127					269	$content =~ s/\s+$//s;
1360	71					97	DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361
1362	57	50				155	unless(length($content)) {
1363							$self->whine(
1364							$para->[1]{'start_line'},
1365							"'=end' without a target?" . (
1366							( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367	0	0	0			0	? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368							: ''
1369							)
1370							);
1371	0					0	DEBUG and print STDERR "Ignoring targetless =end\n";
1372	14					20	return 1;
1373							}
1374
1375	71	50				230	unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376							$self->whine(
1377	14					17	$para->[1]{'start_line'},
1378							"'=end $content' is invalid. (Stack: "
1379							. $self->_dump_curr_open() . ')'
1380							);
1381	70					107	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382	70					283	return 1;
1383							}
1384
1385	127	50	33			448	unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386							$self->whine(
1387	70					179	$para->[1]{'start_line'},
1388							"=end $content without matching =begin. (Stack: "
1389							. $self->_dump_curr_open() . ')'
1390							);
1391	70					83	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392	70					86	return 1;
1393							}
1394
1395	127	100				246	unless($content eq $curr_open->[-1][1]{'target'}) {
1396							$self->whine(
1397							$para->[1]{'start_line'},
1398							"=end $content doesn't match =begin "
1399	70					140	. $curr_open->[-1][1]{'target'}
1400							. ". (Stack: "
1401							. $self->_dump_curr_open() . ')'
1402							);
1403	70					270	DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404	20					29	return 1;
1405							}
1406
1407							# Else it's okay to close...
1408	107	100				328	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409	70					196	DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410							# And that may be because of this to-be-closed =for region, or some
1411							# other one, but it doesn't matter.
1412							} else {
1413	107					338	$curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414							# what's that for?
1415
1416	110	50	50			284	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1417	110					312	$self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418							}
1419	130					299	DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420	130					245	pop @$curr_open;
1421
1422	130					365	return 1;
1423							}
1424
1425							sub _ponder_doc_end {
1426	760			2523		1625	my ($self,$para,$curr_open,$paras) = @_;
1427	687	100				1381	if(@$curr_open) { # Deal with things left open
1428	6					11	DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429							$self->_dump_curr_open(), ")\n";
1430
1431	6					8	DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432	79					267	unshift @$paras, $self->_closers_for_all_curr_open;
1433							# Make sure there is exactly one ~end in the parastack, at the end:
1434	6					26	@$paras = grep $_->[0] ne '~end', @$paras;
1435	6					11	push @$paras, $para, $para;
1436							# We need two -- once for the next cycle where we
1437							# generate errata, and then another to be at the end
1438							# when that loop back around to process the errata.
1439	6					24	return 1;
1440
1441							} else {
1442	754					1200	DEBUG and print STDERR "Okay, stack is empty now.\n";
1443							}
1444
1445							# Try generating errata section, if applicable
1446	681	100				1602	unless($self->{'~tried_gen_errata'}) {
1447	667					1312	$self->{'~tried_gen_errata'} = 1;
1448	667					1824	my @extras = $self->_gen_errata();
1449	740	100				1707	if(@extras) {
1450	17					57	unshift @$paras, @extras;
1451	17					32	DEBUG and print STDERR "Generated errata... relooping...\n";
1452	17					79	return 1; # I.e., loop around again to process these fake-o paragraphs
1453							}
1454							}
1455
1456	737					1579	splice @$paras; # Well, that's that for this paragraph buffer.
1457	687					948	DEBUG and print STDERR "Throwing end-document event.\n";
1458
1459	717					2010	$self->_handle_element_end( my $scratch = 'Document' );
1460	717					2345	return 1; # Hasta la byebye
1461							}
1462
1463							sub _ponder_pod {
1464	529			521		1255	my ($self,$para,$curr_open,$paras) = @_;
1465							$self->whine(
1466	549	50				1182	$para->[1]{'start_line'},
1467							"=pod directives shouldn't be over one line long! Ignoring all "
1468							. (@$para - 2) . " lines of content"
1469							) if @$para > 3;
1470
1471							# Content ignored unless 'pod_handler' is set
1472	549	100				1220	if (my $pod_handler = $self->{'pod_handler'}) {
1473	70					328	my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474	251	100				568	$line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475	251					496	$pod_handler->($line, $line_num, $self);
1476							}
1477
1478							# The surrounding methods set content_seen, so let us remain consistent.
1479							# I do not know why it was not here before -- should it not be here?
1480							# $self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1481
1482	483					1144	return;
1483							}
1484
1485							sub _ponder_over {
1486	160			187		530	my ($self,$para,$curr_open,$paras) = @_;
1487	160	50				451	return 1 unless @$paras;
1488	160					302	my $list_type;
1489
1490	160	100				479	if($paras->[0][0] eq '=item') { # most common case
		100
		50
1491	147					692	$list_type = $self->_get_initial_item_type($paras->[0]);
1492
1493							} elsif($paras->[0][0] eq '=back') {
1494							# Ignore empty lists by default
1495	247	100				345	if ($self->{'parse_empty_lists'}) {
1496	247					595	$list_type = 'empty';
1497							} else {
1498	221					444	shift @$paras;
1499	221					584	return 1;
1500							}
1501							} elsif($paras->[0][0] eq '~end') {
1502							$self->whine(
1503	221					573	$para->[1]{'start_line'},
1504							"=over is the last thing in the document?!"
1505							);
1506	26					76	return 1; # But feh, ignore it.
1507							} else {
1508	39					64	$list_type = 'block';
1509							}
1510	182					649	$para->[1]{'~type'} = $list_type;
1511	377					788	push @$curr_open, $para;
1512							# yes, we reuse the paragraph as a stack item
1513
1514	377					769	my $content = join ' ', splice @$para, 2;
1515	377					1137	$para->[1]{'~orig_content'} = $content;
1516	377					1063	my $overness;
1517	254	100				1322	if($content =~ m/^\s*$/s) {
		50
1518	164					371	$para->[1]{'indent'} = 4;
1519							} elsif($content =~ m/^\s((?:\d\.)?\d+)\s*$/s) {
1520	69			67		459645	no integer;
	67					183
	67					600
1521	188					535	$para->[1]{'indent'} = $1;
1522	96	50				462	if($1 == 0) {
1523							$self->whine(
1524	6					23	$para->[1]{'start_line'},
1525							"Can't have a 0 in =over $content"
1526							);
1527	6					16	$para->[1]{'indent'} = 4;
1528							}
1529							} else {
1530							$self->whine(
1531	98					261	$para->[1]{'start_line'},
1532							"=over should be: '=over' or '=over positive_number'"
1533							);
1534	57					147	$para->[1]{'indent'} = 4;
1535							}
1536	213					383	DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537
1538	213	100	100			780	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1539	213					983	$self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540
1541	198					811	return;
1542							}
1543
1544							sub _ponder_back {
1545	162			377		435	my ($self,$para,$curr_open,$paras) = @_;
1546							# TODO: fire off or or ??
1547
1548	158					455	my $content = join ' ', splice @$para, 2;
1549	160	50				473	if($content =~ m/\S/) {
1550							$self->whine(
1551	4					20	$para->[1]{'start_line'},
1552							"=back doesn't take any parameters, but you said =back $content"
1553							);
1554							}
1555
1556	156	50	33			786	if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557	156					230	DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558							# Expected case: we're closing the most recently opened thing
1559							#my $over = pop @$curr_open;
1560	165	100	50			644	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1561							$self->_handle_element_end( my $scratch =
1562	209					943	'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563							);
1564							} else {
1565	53					122	DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566							join(', ', map $_->[0], @$curr_open), ").\n";
1567							$self->whine(
1568	53					149	$para->[1]{'start_line'},
1569							'=back without =over'
1570							);
1571	53					120	return 1; # and ignore it
1572							}
1573							}
1574
1575							sub _ponder_item {
1576	53			1378		129	my ($self,$para,$curr_open,$paras) = @_;
1577	53					265	my $over;
1578	53	0	0			138	unless(@$curr_open and
1579	0					0	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580							$self->whine(
1581	0					0	$para->[1]{'start_line'},
1582							"'=item' outside of any '=over'"
1583							);
1584							unshift @$paras,
1585	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586							$para
1587							;
1588	0					0	return 1;
1589							}
1590
1591
1592	0					0	my $over_type = $over->[1]{'~type'};
1593
1594	0	0				0	if(!$over_type) {
		0
		0
		0
		0
1595							# Shouldn't happen1
1596							die "Typeless over in stack, starting at line "
1597	53					99	. $over->[1]{'start_line'};
1598
1599							} elsif($over_type eq 'block') {
1600	53	0				201	unless($curr_open->[-1][1]{'~bitched_about'}) {
1601	53					277	$curr_open->[-1][1]{'~bitched_about'} = 1;
1602							$self->whine(
1603							$curr_open->[-1][1]{'start_line'},
1604							"You can't have =items (as at line "
1605	53					206	. $para->[1]{'start_line'}
1606							. ") unless the first thing after the =over is an =item"
1607							);
1608							}
1609							# Just turn it into a paragraph and reconsider it
1610	53					209	$para->[0] = '~Para';
1611	53					185	unshift @$paras, $para;
1612	53					169	return 1;
1613
1614							} elsif($over_type eq 'text') {
1615	0					0	my $item_type = $self->_get_item_type($para);
1616							# That kills the content of the item if it's a number or bullet.
1617	53					266	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618
1619	53	0	0			73	if($item_type eq 'text') {
		0
1620							# Nothing special needs doing for 'text'
1621							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622							$self->whine(
1623	53					161	$para->[1]{'start_line'},
1624							"Expected text after =item, not a $item_type"
1625							);
1626							# Undo our clobbering:
1627	53					227	push @$para, $para->[1]{'~orig_content'};
1628	0					0	delete $para->[1]{'number'};
1629							# Only a PROPER item-number element is allowed
1630							# to have a number attribute.
1631							} else {
1632	0					0	die "Unhandled item type $item_type"; # should never happen
1633							}
1634
1635							# =item-text thingies don't need any assimilation, it seems.
1636
1637							} elsif($over_type eq 'number') {
1638	0					0	my $item_type = $self->_get_item_type($para);
1639							# That kills the content of the item if it's a number or bullet.
1640	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641
1642	0					0	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643
1644	0	0				0	if($item_type eq 'bullet') {
		0
		0
		0
1645							# Hm, it's not numeric. Correct for this.
1646	0					0	$para->[1]{'number'} = $expected_value;
1647							$self->whine(
1648	0					0	$para->[1]{'start_line'},
1649							"Expected '=item $expected_value'"
1650							);
1651	0					0	push @$para, $para->[1]{'~orig_content'};
1652							# restore the bullet, blocking the assimilation of next para
1653
1654							} elsif($item_type eq 'text') {
1655							# Hm, it's not numeric. Correct for this.
1656	0					0	$para->[1]{'number'} = $expected_value;
1657							$self->whine(
1658	0					0	$para->[1]{'start_line'},
1659							"Expected '=item $expected_value'"
1660							);
1661							# Text content will still be there and will block next ~Para
1662
1663							} elsif($item_type ne 'number') {
1664	0					0	die "Unknown item type $item_type"; # should never happen
1665
1666							} elsif($expected_value == $para->[1]{'number'}) {
1667	0					0	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668
1669							} else {
1670	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671							" instead of the expected value of $expected_value\n";
1672							$self->whine(
1673							$para->[1]{'start_line'},
1674	0					0	"You have '=item " . $para->[1]{'number'} .
1675							"' instead of the expected '=item $expected_value'"
1676							);
1677	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1678							}
1679
1680	0	0				0	if(@$para == 2) {
1681							# For the cases where we /didn't/ push to @$para
1682	0	0				0	if($paras->[0][0] eq '~Para') {
1683	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1685							} else {
1686	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687	0					0	push @$para, ''; # Just so it's not contentless
1688							}
1689							}
1690
1691
1692							} elsif($over_type eq 'bullet') {
1693	0					0	my $item_type = $self->_get_item_type($para);
1694							# That kills the content of the item if it's a number or bullet.
1695	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696
1697	0	0				0	if($item_type eq 'bullet') {
		0
		0
1698							# as expected!
1699
1700	0	0				0	if( $para->[1]{'~_freaky_para_hack'} ) {
1701	0					0	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702	0					0	push @$para, $para->[1]{'~_freaky_para_hack'};
1703							}
1704
1705							} elsif($item_type eq 'number') {
1706							$self->whine(
1707	0					0	$para->[1]{'start_line'},
1708							"Expected '=item *'"
1709							);
1710	0					0	push @$para, $para->[1]{'~orig_content'};
1711							# and block assimilation of the next paragraph
1712	0					0	delete $para->[1]{'number'};
1713							# Only a PROPER item-number element is allowed
1714							# to have a number attribute.
1715							} elsif($item_type eq 'text') {
1716							$self->whine(
1717	0					0	$para->[1]{'start_line'},
1718							"Expected '=item *'"
1719							);
1720							# But doesn't need processing. But it'll block assimilation
1721							# of the next para.
1722							} else {
1723	0					0	die "Unhandled item type $item_type"; # should never happen
1724							}
1725
1726	0	0				0	if(@$para == 2) {
1727							# For the cases where we /didn't/ push to @$para
1728	0	0				0	if($paras->[0][0] eq '~Para') {
1729	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1731							} else {
1732	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733	0					0	push @$para, ''; # Just so it's not contentless
1734							}
1735							}
1736
1737							} else {
1738	0					0	die "Unhandled =over type \"$over_type\"?";
1739							# Shouldn't happen!
1740							}
1741	0					0	$para->[0] .= '-' . $over_type;
1742
1743	0					0	return;
1744							}
1745
1746							sub _ponder_Plain {
1747	3743			3745		6769	my ($self,$para) = @_;
1748	3743					4748	DEBUG and print STDERR " giving plain treatment...\n";
1749	3743	100	100			21152	unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
			66
			100
1750							or $para->[1]{'~cooked'}
1751							) {
1752							push @$para,
1753	3694					5308	@{$self->_make_treelet(
1754							join("\n", splice(@$para, 2)),
1755	3694					17549	$para->[1]{'start_line'}
1756							)};
1757							}
1758							# Empty paragraphs don't need a treelet for any reason I can see.
1759							# And precooked paragraphs already have a treelet.
1760	3743					7724	return;
1761							}
1762
1763							sub _ponder_Verbatim {
1764	481			551		1078	my ($self,$para) = @_;
1765	481					620	DEBUG and print STDERR " giving verbatim treatment...\n";
1766
1767	481					1162	$para->[1]{'xml:space'} = 'preserve';
1768
1769	481	100				1281	unless ($self->{'_output_is_for_JustPod'}) {
1770							# Fix illegal settings for expand_verbatim_tabs()
1771							# This is because this module doesn't do input error checking, but khw
1772							# doesn't want to add yet another instance of that.
1773	400	100	100			1508	$self->expand_verbatim_tabs(8)
1774							if ! defined $self->expand_verbatim_tabs()
1775							\|\| $self->expand_verbatim_tabs() =~ /\D/;
1776
1777	400					1227	my $indent = $self->strip_verbatim_indent;
1778	400	100	100			1076	if ($indent && ref $indent eq 'CODE') {
1779	10					11	my @shifted = (shift @{$para}, shift @{$para});
	10					16
	10					17
1780	10					26	$indent = $indent->($para);
1781	10					57	unshift @{$para}, @shifted;
	10					27
1782							}
1783
1784	400					1103	for(my $i = 2; $i < @$para; $i++) {
1785	1756					3088	foreach my $line ($para->[$i]) { # just for aliasing
1786							# Strip indentation.
1787	1756	100				2922	$line =~ s/^\Q$indent// if $indent;
1788	1756	100				3137	next unless $self->expand_verbatim_tabs;
1789
1790							# This is commented out because of github issue #85, and the
1791							# current maintainers don't know why it was there in the first
1792							# place.
1793							#&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1794	1755					5764	while( $line =~
1795							# Sort of adapted from Text::Tabs.
1796	987					2253	s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1797							* $self->expand_verbatim_tabs)
1798							-(length($1)&7)))/e
1799							) {}
1800
1801							# TODO: whinge about (or otherwise treat) unindented or overlong lines
1802
1803							}
1804							}
1805							}
1806
1807							# Now the VerbatimFormatted hoodoo...
1808	1284	100	66			3250	if( $self->{'accept_codes'} and
		100
1809							$self->{'accept_codes'}{'VerbatimFormatted'}
1810							) {
1811	868		100			5187	while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
	773					1276
1812							# Kill any number of terminal newlines
1813	812					3935	$self->_verbatim_format($para);
1814							} elsif ($self->{'codes_in_verbatim'}) {
1815							push @$para,
1816	805					1670	@{$self->_make_treelet(
1817							join("\n", splice(@$para, 2)),
1818	53					120	$para->[1]{'start_line'}, $para->[1]{'xml:space'}
1819							)};
1820	53					73	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1821							} else {
1822	465	100				2092	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1823	465					2497	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1824							}
1825	532					1113	return;
1826							}
1827
1828							sub _ponder_Data {
1829	73			95		242	my ($self,$para) = @_;
1830	73					165	DEBUG and print STDERR " giving data treatment...\n";
1831	22					49	$para->[1]{'xml:space'} = 'preserve';
1832	22	100				72	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1833	22					41	return;
1834							}
1835
1836
1837
1838
1839							###########################################################################
1840
1841							sub _traverse_treelet_bit { # for use only by the routine above
1842	7015			7266		13968	my($self, $name) = splice @_,0,2;
1843
1844	7015					8711	my $scratch;
1845	7015					20498	$self->_handle_element_start(($scratch=$name), shift @_);
1846
1847	7066					14275	while (@_) {
1848	12044					17167	my $x = shift;
1849	12044	100				18740	if (ref($x)) {
1850	3093					6221	&_traverse_treelet_bit($self, @$x);
1851							} else {
1852	9269		100			23299	$x .= shift while @_ && !ref($_[0]);
1853	8953					17125	$self->_handle_text($x);
1854							}
1855							}
1856
1857	7066					18830	$self->_handle_element_end($scratch=$name);
1858	7052					19025	return;
1859							}
1860
1861							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1862
1863							sub _closers_for_all_curr_open {
1864	55			104		188	my $self = $_[0];
1865	43					118	my @closers;
1866	6	50				10	foreach my $still_open (@{ $self->{'curr_open'} \|\| return }) {
	6					34
1867	9					20	my @copy = @$still_open;
1868	23					102	$copy[1] = {%{ $copy[1] }};
	23					118
1869							#$copy[1]{'start_line'} = -1;
1870	60	100				120	if($copy[0] eq '=for') {
		50
1871	35					75	$copy[0] = '=end';
1872							} elsif($copy[0] eq '=over') {
1873							$self->whine(
1874							$still_open->[1]{start_line} ,
1875	38					74	"=over without closing =back"
1876							);
1877
1878	38					70	$copy[0] = '=back';
1879							} else {
1880	32					90	die "I don't know how to auto-close an open $copy[0] region";
1881							}
1882
1883	41	50				85	unless( @copy > 2 ) {
1884	1001					2543	push @copy, $copy[1]{'target'};
1885	1001	100				1434	$copy[-1] = '' unless defined $copy[-1];
1886							# since =over's don't have targets
1887							}
1888
1889	1001					3475	$copy[1]{'fake-closer'} = 1;
1890
1891	1001					2007	DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1892	1126					1838	unshift @closers, \@copy;
1893							}
1894	1123					2099	return @closers;
1895							}
1896
1897							#--------------------------------------------------------------------------
1898
1899							sub _verbatim_format {
1900	171			122		416	my($it, $p) = @_;
1901
1902	1076					2828	my $formatting;
1903
1904	1076					2602	for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1905	1340					3525	DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1906	1340					3153	$p->[$i] .= "\n";
1907							# Unlike with simple Verbatim blocks, we don't end up just doing
1908							# a join("\n", ...) on the contents, so we have to append a
1909							# newline to every line, and then nix the last one later.
1910							}
1911
1912	69					79	if( DEBUG > 4 ) {
1913							print STDERR "<<\n";
1914							for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1915							print STDERR "_verbatim_format $i: $p->[$i]";
1916							}
1917							print STDERR ">>\n";
1918							}
1919
1920	69					171	for(my $i = $#$p; $i > 2; $i--) {
1921							# work backwards over the lines, except the first (#2)
1922
1923							#next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1924							# and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1925							# look at a formatty line preceding a nonformatty one
1926	286					325	DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1927	286	100				479	if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1928	17					27	DEBUG > 5 and print STDERR " It's a formatty line. ",
1929							"Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1930
1931	17	100				34	if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1932	7					28	DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1933	7					21	next;
1934							} else {
1935	16					25	DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1936							}
1937							} else {
1938	271					292	DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1939	271					495	next;
1940							}
1941
1942							# A formatty line has to have #: in the first two columns, and uses
1943							# "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1944							# Example:
1945							# What do you want? i like pie. [or whatever]
1946							# #:^^^^^^^^^^^^^^^^^ /////////////
1947
1948
1949	10					10	DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1950
1951	16					38	$formatting = ' ' . $1;
1952	16					54	$formatting =~ s/\s+$//s; # nix trailing whitespace
1953	16	50	33			56	unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1954	6					13	splice @$p,$i,1; # remove this line
1955	6					15	$i--; # don't consider next line
1956	6					19	next;
1957							}
1958
1959	14	100				28	if( length($formatting) >= length($p->[$i-1]) ) {
1960	40					76	$formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1961							} else {
1962	44					67	$formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1963							}
1964							# Make $formatting and the previous line be exactly the same length,
1965							# with $formatting having a " " as the last character.
1966
1967	47					105	DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1968
1969
1970	220					270	my @new_line;
1971	220					569	while( $formatting =~ m{\G(( +)\|(\^+)\|(\/+)\|(\%+))}g ) {
1972							#print STDERR "Format matches $1\n";
1973
1974	91	100				156	if($2) {
1975							#print STDERR "SKIPPING <$2>\n";
1976	69					215	push @new_line,
1977							substr($p->[$i-1], pos($formatting)-length($1), length($1));
1978							} else {
1979							#print STDERR "SNARING $+\n";
1980	195	50				322	push @new_line, [
		100
		100
1981							(
1982							$3 ? 'VerbatimB' :
1983							$4 ? 'VerbatimI' :
1984							$5 ? 'VerbatimBI' : die("Should never get called")
1985							), {},
1986							substr($p->[$i-1], pos($formatting)-length($1), length($1))
1987							];
1988							#print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1989							}
1990							}
1991	183					388	my @nixed =
1992							splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1993	10					23	DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1994
1995	10					13	DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1996							scalar(@new_line), "):",
1997							map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1998	10					25	$i--; # So the next line we scrutinize is the line before the one
1999							# that we just went and formatted
2000							}
2001
2002	65					126	$p->[0] = 'VerbatimFormatted';
2003
2004							# Collapse adjacent text nodes, just for kicks.
2005	65					145	for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2006	173	0	0			198	if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2007	173					286	DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2008	0					0	$p->[$i] .= splice @$p, $i+1, 1; # merge
2009	0					0	--$i; # and back up
2010							}
2011							}
2012
2013							# Now look for the last text token, and remove the terminal newline
2014	65					156	for( my $i = $#$p; $i >= 2; $i-- ) {
2015							# work backwards over the tokens, even the first
2016	65	50				129	if( !ref($p->[$i]) ) {
2017	65	50				275	if($p->[$i] =~ s/\n$//s) {
2018	65					89	DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2019							} else {
2020	0					0	DEBUG > 5 and print STDERR
2021							"No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2022							}
2023	65					122	last; # we only want the next one
2024							}
2025							}
2026
2027	65					113	return;
2028							}
2029
2030
2031							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2032
2033
2034							sub _treelet_from_formatting_codes {
2035							# Given a paragraph, returns a treelet. Full of scary tokenizing code.
2036							# Like [ '~Top', {'start_line' => $start_line},
2037							# "I like ",
2038							# [ 'B', {}, "pie" ],
2039							# "!"
2040							# ]
2041							# This illustrates the general format of a treelet. It is an array:
2042							# [0] is a scalar indicating its type. In the example above, the
2043							# types are '~Top' and 'B'
2044							# [1] is a hash of various flags about it, possibly empty
2045							# [2] - [N] are an ordered list of the subcomponents of the treelet.
2046							# Scalars are literal text, refs are sub-treelets, to
2047							# arbitrary levels. Stringifying a treelet will recursively
2048							# stringify the sub-treelets, concatentating everything
2049							# together to form the exact text of the treelet.
2050
2051	3696			3749		7686	my($self, $para, $start_line, $preserve_space) = @_;
2052
2053	3696					10995	my $treelet = ['~Top', {'start_line' => $start_line},];
2054
2055	3696	100	100			13351	unless ($preserve_space \|\| $self->{'preserve_whitespace'}) {
2056	2829					24783	$para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2057	2829					5413	$para =~ s/ $//;
2058	2829					4205	$para =~ s/^ //;
2059							}
2060
2061							# Only apparent problem the above code is that N<< >> turns into
2062							# N<< >>. But then, word wrapping does that too! So don't do that!
2063
2064
2065							# As a Start-code is encountered, the number of opening bracket '<'
2066							# characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2067							# etc). When closing brackets are found in the text, at least this number
2068							# (plus the 1) will be required to mean the Start-code is terminated. When
2069							# those are found, @stack is popped.
2070	3696					5216	my @stack;
2071
2072	3696					6530	my @lineage = ($treelet);
2073	3696					5106	my $raw = ''; # raw content of L<> fcode before splitting/processing
2074							# XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2075							# into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2076							# the 'collapse and trim all whitespace first' lines just above.
2077	3696					4538	my $inL = 0;
2078
2079	3696					4446	DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2080
2081							# Here begins our frightening tokenizer RE. The following regex matches
2082							# text in four main parts:
2083							#
2084							# * Start-codes. The first alternative matches C< or C<<, the latter
2085							# followed by some whitespace. $1 will hold the entire start code
2086							# (including any space following a multiple-angle-bracket delimiter),
2087							# and $2 will hold only the additional brackets past the first in a
2088							# multiple-bracket delimiter. length($2) + 1 will be the number of
2089							# closing brackets we have to find.
2090							#
2091							# * Closing brackets. Match some amount of whitespace followed by
2092							# multiple close brackets. The logic to see if this closes anything
2093							# is down below. Note that in order to parse C<< >> correctly, we
2094							# have to use look-behind (?<=\s\s), since the match of the starting
2095							# code will have consumed the whitespace.
2096							#
2097							# * A single closing bracket, to close a simple code like C<>.
2098							#
2099							# * Something that isn't a start or end code. We have to be careful
2100							# about accepting whitespace, since perlpodspec says that any whitespace
2101							# before a multiple-bracket closing delimiter should be ignored.
2102							#
2103	3733					36604	while($para =~
2104							m/\G
2105							(?:
2106							# Match starting codes, including the whitespace following a
2107							# multiple-delimiter start code. $1 gets the whole start code and
2108							# $2 gets all but one of the
2109							([A-Z]<(?:(<+)\s+)?)
2110							\|
2111							# Match multiple-bracket end codes. $3 gets the whitespace that
2112							# should be discarded before an end bracket but kept in other cases
2113							# and $4 gets the end brackets themselves. ($3 can be empty if the
2114							# construct is empty, like C<< >>, and all the white-space has been
2115							# gobbled up already, considered to be space after the opening
2116							# bracket. In this case we use look-behind to verify that there are
2117							# at least 2 spaces in a row before the ">".)
2118							(\s+\|(?<=\s\s))(>{2,})
2119							\|
2120							(\s?>) # $5: simple end-codes
2121							\|
2122							( # $6: stuff containing no start-codes or end-codes
2123							(?:
2124							[^A-Z\s>]
2125							\|
2126							(?:
2127							[A-Z](?!<)
2128							)
2129							\|
2130							# whitespace is ok, but we don't want to eat the whitespace before
2131							# a multiple-bracket end code.
2132							# NOTE: we may still have problems with e.g. S<< >>
2133							(?:
2134							\s(?!\s*>{2,})
2135							)
2136							)+
2137							)
2138							)
2139							/xgo
2140							) {
2141	14886					19371	DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2142	14849	100				42357	if(defined $1) {
		100
		100
		50
2143	2990					3686	my $bracket_count; # How many '<<<' in a row this has. Needed for
2144							# Pod::Simple::JustPod
2145	2990	100				5413	if(defined $2) {
2146	114					257	DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2147	151					362	$bracket_count = length($2) + 1;
2148	151					367	push @stack, $bracket_count; # length of the necessary complex
2149							# end-code string
2150							} else {
2151	2913					3624	DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2152	2913					4265	push @stack, 0; # signal that we're looking for simple
2153	2876					4007	$bracket_count = 1;
2154							}
2155	3027					6192	my $code = substr($1,0,1);
2156	3027	100				5353	if ('L' eq $code) {
2157	1180	100				2590	if ($inL) {
2158	748					2254	$raw .= $1;
2159	748					2800	$self->scream( $start_line,
2160							'Nested L<> are illegal. Pretending inner one is '
2161							. 'X<...> so can continue looking for other errors.');
2162	748					4045	$code = "X";
2163							}
2164							else {
2165	1179					2434	$raw = ""; # reset raw content accumulator
2166	1179					1976	$inL = @stack;
2167							}
2168							} else {
2169	3304	100				5548	$raw .= $1 if $inL;
2170							}
2171	3737					8805	push @lineage, [ $code, {}, ]; # new node object
2172
2173							# Tell Pod::Simple::JustPod how many brackets there were, but to save
2174							# space, not in the most usual case of there was just 1. It can be
2175							# inferred by the absence of this element. Similarly, if there is more
2176							# than one bracket, extract the white space between the final bracket
2177							# and the real beginning of the interior. Save that if it isn't just a
2178							# single space
2179	3737	100	100			9039	if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2180	765					1079	$lineage[-1][1]{'~bracket_count'} = $bracket_count;
2181	765					1004	my $lspacer = substr($1, 1 + $bracket_count);
2182	765	100				6738	$lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2183							}
2184	4023					5480	push @{ $lineage[-2] }, $lineage[-1];
	4023					20595
2185							} elsif(defined $4) {
2186	210					313	DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2187							# This is where it gets messy...
2188	210	100				721	if(! @stack) {
		100
		100
		50
2189							# We saw " >>>>" but needed nothing. This is ALL just stuff then.
2190	2					88	DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2191	2					11	push @{ $lineage[-1] }, $3, $4;
	2					12
2192	88					132	next;
2193							} elsif(!$stack[-1]) {
2194							# We saw " >>>>" but needed only ">". Back pos up.
2195	87					140	DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2196	87					127	push @{ $lineage[-1] }, $3; # That was a for-real space, too.
	88					206
2197	88					163	pos($para) = pos($para) - length($4) + 1;
2198							} elsif($stack[-1] == length($4)) {
2199							# We found " >>>>", and it was exactly what we needed. Commonest case.
2200	114					214	DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2201							} elsif($stack[-1] < length($4)) {
2202							# We saw " >>>>" but needed only " >>". Back pos up.
2203	0					0	DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2204	0					0	pos($para) = pos($para) - length($4) + $stack[-1];
2205							} else {
2206							# We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2207	7					16	DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2208	7					15	push @{ $lineage[-1] }, $3, $4;
	7					29
2209	95					214	next;
2210							}
2211							#print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2212
2213	202	50	66			557	if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2214	91	100				325	if ($3 ne "") {
		100
2215	1					4	$lineage[-1][1]{'~rspacer'} = $3;
2216							}
2217							elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2218
2219							# Here we had something like C<< >> which was a false positive
2220	1					5	delete $lineage[-1][1]{'~lspacer'};
2221							}
2222							else {
2223							$lineage[-1][1]{'~rspacer'}
2224	1					5	= substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2225	89					138	chop $lineage[-1][1]{'~lspacer'};
2226							}
2227							}
2228
2229	202	100				725	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	6					22
	118					389
2230							# Keep the element from being childless
2231
2232	114	100				266	if ($inL == @stack) {
2233	22					63	$lineage[-1][1]{'raw'} = $raw;
2234	22					31	$inL = 0;
2235							}
2236
2237	114					185	pop @stack;
2238	117					171	pop @lineage;
2239
2240	117	100				663	$raw .= $3.$4 if $inL;
2241
2242							} elsif(defined $5) {
2243	2974					3456	DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2244
2245	2974	100	100			9533	if(@stack and ! $stack[-1]) {
2246							# We're indeed expecting a simple end-code
2247	2869					3765	DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2248
2249	2868	50				6010	if(length($5) == 2) { # There was a space there: " >"
		100
2250	0					0	push @{ $lineage[-1] }, ' ';
	0					0
2251	2868					5956	} elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2252	36					54	push @{ $lineage[-1] }, ''; # keep it from being really childless
	36					94
2253							}
2254
2255	2872	100				5753	if ($inL == @stack) {
2256	410					1175	$lineage[-1][1]{'raw'} = $raw;
2257	410					596	$inL = 0;
2258							}
2259
2260	2868					3847	pop @stack;
2261	2868					3884	pop @lineage;
2262							} else {
2263	103					160	DEBUG > 4 and print STDERR " It's just stuff.\n";
2264	107					160	push @{ $lineage[-1] }, $5;
	103					254
2265							}
2266
2267	2975	100				20349	$raw .= $5 if $inL;
2268
2269							} elsif(defined $6) {
2270	8770					10028	DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2271	8766					10080	push @{ $lineage[-1] }, $6;
	8766					21439
2272	8770	100				31202	$raw .= $6 if $inL;
2273							# XXX does not capture multiplace whitespaces -- 'raw' ends up with
2274							# at most 1 leading/trailing whitespace, why not all of it?
2275							# Answer, because we deliberately trimmed it above
2276
2277							} else {
2278							# should never ever ever ever happen
2279	4					6	DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2280	4					24	die "SPORK 512512!";
2281							}
2282							}
2283
2284	3781	100				7531	if(@stack) { # Uhoh, some sequences weren't closed.
2285	93					347	my $x= "...";
2286	92					136	while(@stack) {
2287	92	50				197	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	0					0
	8					21
2288							# Hmmmmm!
2289
2290	92					203	my $code = (pop @lineage)->[0];
2291	13					24	my $ender_length = pop @stack;
2292	13	50				30	if($ender_length) {
2293	84					179	--$ender_length;
2294	0					0	$x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2295							} else {
2296	8					37	$x = $code . "<$x>";
2297							}
2298							}
2299	92					132	DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2300	92					164	$self->whine($start_line,
2301							"Unterminated $x sequence",
2302							);
2303							}
2304
2305	3697					10708	return $treelet;
2306							}
2307
2308							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309
2310							sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2311	1			0	0	2	return stringify_lol($_[1]);
2312							}
2313
2314							sub stringify_lol { # function: stringify_lol($lol)
2315	2607			3409	0	4065	my $string_form = '';
2316	2691					5881	_stringify_lol( $_[0] => \$string_form );
2317	3462					8379	return $string_form;
2318							}
2319
2320							sub _stringify_lol { # the real recursor
2321	3842			3037		5811	my($lol, $to) = @_;
2322	3842					8510	for(my $i = 2; $i < @$lol; ++$i) {
2323	4840	100	100			12403	if( ref($lol->[$i] \|\| '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
			66
2324	380					673	_stringify_lol( $lol->[$i], $to); # recurse!
2325							} else {
2326	3604					7810	$$to .= $lol->[$i];
2327							}
2328							}
2329	3733					6213	return;
2330							}
2331
2332							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2333
2334							sub _dump_curr_open { # return a string representation of the stack
2335	0			32		0	my $curr_open = $_[0]{'curr_open'};
2336
2337	0	50				0	return '[empty]' unless @$curr_open;
2338							return join '; ',
2339	0					0	map {;
2340							($_->[0] eq '=for')
2341							? ( ($_->[1]{'~really'} \|\| '=over')
2342	0	50	50			0	. ' ' . $_->[1]{'target'})
2343							: $_->[0]
2344							}
2345							@$curr_open
2346							;
2347							}
2348
2349							###########################################################################
2350							my %pretty_form = (
2351							"\a" => '\a', # ding!
2352							"\b" => '\b', # BS
2353							"\e" => '\e', # ESC
2354							"\f" => '\f', # FF
2355							"\t" => '\t', # tab
2356							"\cm" => '\cm',
2357							"\cj" => '\cj',
2358							"\n" => '\n', # probably overrides one of either \cm or \cj
2359							'"' => '\"',
2360							'\\' => '\\\\',
2361							'$' => '\\$',
2362							'@' => '\\@',
2363							'%' => '\\%',
2364							'#' => '\\#',
2365							);
2366
2367							sub pretty { # adopted from Class::Classless
2368							# Not the most brilliant routine, but passable.
2369							# Don't give it a cyclic data structure!
2370	502			1494	0	2251	my @stuff = @_; # copy
2371	502					617	my $x;
2372							my $out =
2373							# join ",\n" .
2374							join ", ",
2375	502					733	map {;
2376	592	50	100			3985	if(!defined($_)) {
		100	66
		50	33
		100
		100
		100
2377	0					0	"undef";
2378							} elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2379	58					142	$x = "[ " . pretty(@$_) . " ]" ;
2380	58					128	$x;
2381							} elsif(ref($_) eq 'SCALAR') {
2382	0					0	$x = "\\" . pretty($$_) ;
2383	0					0	$x;
2384							} elsif(ref($_) eq 'HASH') {
2385	795					2569	my $hr = $_;
2386							$x = "{" . join(", ",
2387	48					446	map(pretty($_) . '=>' . pretty($hr->{$_}),
2388							sort keys %$hr ) ) . "}" ;
2389	48					171	$x;
2390	2					5	} elsif(!length($_)) { q{''} # empty string
2391							} elsif(
2392							$_ eq '0' # very common case
2393							or(
2394							m/^-?(?:[123456789]\d*\|0)(?:\.\d+)?$/s
2395							and $_ ne '-0' # the strange case that RE lets thru
2396							)
2397	26					83	) { $_;
2398							} else {
2399							# Yes, explicitly name every character desired. There are shorcuts one
2400							# could make, but I (Karl Williamson) was afraid that some Perl
2401							# releases would have bugs in some of them. For example [A-Z] works
2402							# even on EBCDIC platforms to match exactly the 26 uppercase English
2403							# letters, but I don't know if it has always worked without bugs. It
2404							# seemed safest just to list the characters.
2405							# s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2406	458	0				909	s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{\|}~])>
	0					0
2407							<$pretty_form{$1} \|\| '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2408	458					1318	#<$pretty_form{$1} \|\| '\\x'.(unpack("H2",$1))>eg;
2409							qq{"$_"};
2410							}
2411							} @stuff;
2412	502					2234	# $out =~ s/\n */ /g if length($out) < 75;
2413							return $out;
2414							}
2415
2416							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2417
2418							# A rather unsubtle method of blowing away all the state information
2419							# from a parser object so it can be reused. Provided as a utility for
2420							# backward compatibility in Pod::Man, etc. but not recommended for
2421							# general use.
2422
2423	5			9	0	9042	sub reinit {
2424	5					28	my $self = shift;
2425							foreach (qw(source_dead source_filename doc_has_started
2426							start_of_pod_block content_seen last_was_blank paras curr_open
2427							line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2428							Title)) {
2429	83					232
2430							delete $self->{$_};
2431							}
2432							}
2433
2434							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2435							1;
2436