File Coverage

blib/lib/Pod/Simple/BlackBox.pm

Criterion	Covered	Total	%
statement	854	1006	84.8
branch	368	526	69.9
condition	132	189	69.8
subroutine	59	65	90.7
pod	0	7	0.0
total	1413	1793	78.8

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Simple::BlackBox;
2							#
3							# "What's in the box?" "Pain."
4							#
5							###########################################################################
6							#
7							# This is where all the scary things happen: parsing lines into
8							# paragraphs; and then into directives, verbatims, and then also
9							# turning formatting sequences into treelets.
10							#
11							# Are you really sure you want to read this code?
12							#
13							#-----------------------------------------------------------------------------
14							#
15							# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16							# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17							# to call the proper callbacks on the treelets.
18							#
19							# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21	69			69		2300	use integer; # vroom!
	69					137
	69					477
22	69			69		1969	use strict;
	69					123
	69					1165
23	69			69		285	use Carp ();
	69					297
	69					1475
24	69			69		298	use vars qw($VERSION );
	69					112
	69					13240
25							$VERSION = '3.42';
26							#use constant DEBUG => 7;
27
28							sub my_qr ($$) {
29
30							# $1 is a pattern to compile and return. Older perls compile any
31							# syntactically valid property, even if it isn't legal. To cope with
32							# this, return an empty string unless the compiled pattern also
33							# successfully matches $2, which the caller furnishes.
34
35	450			450	0	1196	my ($input_re, $should_match) = @_;
36							# XXX could have a third parameter $shouldnt_match for extra safety
37
38	450	50				3177	my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39
40	67			67		1173	my $re = eval "no warnings; $use_utf8 qr/$input_re/";
	67			67		153
	67			67		3190
	67			67		940
	67			67		196
	67			67		3997
	67			67		41336
	67			67		2237
	67			67		1010
	67			67		1583
	67			37		190
	67			0		2180
	67			0		411
	67			0		115
	67					2394
	67					450
	67					127
	67					2156
	67					399
	67					144
	67					2266
	67					381
	67					121
	67					2127
	67					458
	67					130
	67					2046
	67					539
	67					139
	67					3909
	450					24847
	3					7
	3					6
	4					32
	0
	0
	0
	0
	0
	0
	0
	0
	0
41							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
42	450	50				1789	return "" if $@;
43
44	67			67		6362	my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
	67			67		136
	67			67		2204
	67			67		581
	67			67		127
	67			67		3649
	67			67		426
	67			67		128
	67			67		2276
	67			747		378
	67			0		120
	67			3		2135
	67			0		420
	67					123
	67					2512
	67					375
	67					118
	67					10279
	67					407
	67					161
	67					2753
	67					394
	67					127
	67					2231
	67					386
	67					136
	67					2721
	450					23237
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
	0
45							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
46	450	50				1765	return "" if $@;
47
48							#print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49	450	50				1507	return $re if $matches;
50
51							#print STDERR __LINE__, ": $re: didn't match\n";
52	1					2	return "";
53							}
54
55							BEGIN {
56	69			69		1484	require Pod::Simple;
57	69	50				5341	*DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58							}
59
60							# Matches a character iff the character will have a different meaning
61							# if we choose CP1252 vs UTF-8 if there is no =encoding line.
62							# This is broken for early Perls on non-ASCII platforms.
63							my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64							$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65
66							# Use patterns understandable by Perl 5.6, if possible
67	69			69		482	my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
	69					145
	69					61971
68							my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69							# to get assigned
70							my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71							"\x{250}");
72							$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73
74	67			67		482	my $script_run_re = eval 'no warnings "experimental::script_run";
	67					124
	67					14669
75							qr/(script_run: ^ . $ )/x';
76							my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77							unless ($latin_re) {
78							# This was machine generated to be the ranges of the union of the above
79							# three properties, with things that were undefined by Unicode 4.1 filling
80							# gaps. That is the version in use when Perl advanced enough to
81							# successfully compile and execute the above pattern.
82							$latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83							}
84
85							my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86
87							# Latin script code points not in the first release of Unicode
88							my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89
90							# If this perl doesn't have the Deprecated property, there's only one code
91							# point in it that we need be concerned with.
92							my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93							$deprecated_re = qr/\x{149}/ unless $deprecated_re;
94
95							my $utf8_bom;
96							if (($] ge 5.007_003)) {
97							$utf8_bom = "\x{FEFF}";
98							utf8::encode($utf8_bom);
99							} else {
100							$utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101							}
102
103							# This is used so that the 'content_seen' method doesn't return true on a
104							# file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105							# there is a valid =foo line will we return that content was seen.
106							my $seen_legal_directive = 0;
107
108							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109
110	1			1	0	43	sub parse_line { shift->parse_lines(@_) } # alias
111
112							# - - - Turn back now! Run away! - - -
113
114							sub parse_lines { # Usage: $parser->parse_lines(@lines)
115							# an undef means end-of-stream
116	7636			7636	0	11107	my $self = shift;
117
118	7636					11017	my $code_handler = $self->{'code_handler'};
119	7636					10494	my $cut_handler = $self->{'cut_handler'};
120	7636					9610	my $wl_handler = $self->{'whiteline_handler'};
121	7636		100			14580	$self->{'line_count'} \|\|= 0;
122
123	7636					9447	my $scratch;
124
125							DEBUG > 4 and
126	7636					8841	print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127
128	7636					8297	DEBUG > 5 and
129							print STDERR "# About to parse lines: ",
130							join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131
132	7636		100			15455	my $paras = ($self->{'paras'} \|\|= []);
133							# paragraph buffer. Because we need to defer processing of =over
134							# directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135							# to process this.
136
137	7636		100			14814	$self->{'pod_para_count'} \|\|= 0;
138
139							# An attempt to match the pod portions of a line. This is not fool proof,
140							# but is good enough to serve as part of the heuristic for guessing the pod
141							# encoding if not specified.
142	7636					9515	my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
	73544					141937
	7636					38502
143	7636					35142	my $pod_chars_re = qr/ ^ = [A-Za-z]+ \| [\Q$codes\E] < /x;
144
145	7636					11204	my $line;
146	7801					12918	foreach my $source_line (@_) {
147	16149	50				28357	if( $self->{'source_dead'} ) {
148	166					9788	DEBUG > 4 and print STDERR "# Source is dead.\n";
149	166					632	last;
150							}
151
152	16149	100				39613	unless( defined $source_line ) {
153	833					1443	DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154
155	833					2537	push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156	667					1524	push @$paras, $paras->[-1], $paras->[-1];
157							# So that it definitely fills the buffer.
158	667					1184	$self->{'source_dead'} = 1;
159	2503					4189	$self->_ponder_paragraph_buffer;
160	2503					3686	next;
161							}
162
163
164	17152	100				26782	if( $self->{'line_count'}++ ) {
165	16483					28907	($line = $source_line) =~ tr/\n\r//d;
166							# If we don't have two vars, we'll end up with that there
167							# tr/// modding the (potentially read-only) original source line!
168
169							} else {
170	2505					4283	DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171
172	2505	50				5898	if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
		50
		50
173	1836					1950	DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174	1836					2053	$self->_handle_encoding_line( "=encoding utf8" );
175	1836					3440	delete $self->{'_processed_encoding'};
176	1836					3423	$line =~ tr/\n\r//d;
177
178							} elsif( $line =~ s/^\xFE\xFF//s ) {
179	1836					2180	DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180							$self->scream(
181	17138					31961	$self->{'line_count'},
182							"UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183							);
184	1836					8522	splice @_;
185	1836					9187	push @_, undef;
186	1836					2584	next;
187
188							# TODO: implement somehow?
189
190							} elsif( $line =~ s/^\xFF\xFE//s ) {
191	1836					2796	DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192							$self->scream(
193	3035					5680	$self->{'line_count'},
194							"UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195							);
196	0					0	splice @_;
197	0					0	push @_, undef;
198	3035					5687	next;
199
200							# TODO: implement somehow?
201
202							} else {
203	890					1260	DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204	890					2396	($line = $source_line) =~ tr/\n\r//d;
205							}
206							}
207
208	15537	100	100			100585	if(!$self->{'parse_characters'} && !$self->{'encoding'}
			100
			100
			100
209							&& ($self->{'in_pod'} \|\| $line =~ /^=/s)
210							&& $line =~ /$non_ascii_re/
211							) {
212
213	231					391	my $encoding;
214
215							# No =encoding line, and we are at the first pod line in the input that
216							# contains a non-ascii byte, that is, one whose meaning varies depending
217							# on whether the file is encoded in UTF-8 or CP1252, which are the two
218							# possibilities permitted by the pod spec. (ASCII is assumed if the
219							# file only contains ASCII bytes.) In order to process this line, we
220							# need to figure out what encoding we will use for the file.
221							#
222							# Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223							# 160-255, but it is used here, as it often colloquially is, to refer to
224							# the complete set of code points 0-255, including ASCII (0-127), the C1
225							# controls (128-159), and strict Latin 1 (160-255).
226							#
227							# CP1252 is effectively a superset of Latin 1, because it differs only
228							# from colloquial 8859-1 in the C1 controls, which are very unlikely to
229							# actually be present in 8859-1 files, so can be used for other purposes
230							# without conflict. CP 1252 uses most of them for graphic characters.
231							#
232							# Note that all ASCII-range bytes represent their corresponding code
233							# points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234							# code points require multiple (non-ASCII) bytes to represent. (A
235							# separate paragraph for EBCDIC is below.) The multi-byte
236							# representation is quite structured. If we find an isolated byte that
237							# would require multiple bytes to represent in UTF-8, we know that the
238							# encoding is not UTF-8. If we find a sequence of bytes that violates
239							# the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240							# hence must be 1252.
241							#
242							# But there are ambiguous cases where we could guess wrong. If so, the
243							# user will end up having to supply an =encoding line. We use all
244							# readily available information to improve our chances of guessing
245							# right. The odds of something not being UTF-8, but still passing a
246							# UTF-8 validity test go down very rapidly with increasing length of the
247							# sequence. Therefore we look at all non-ascii sequences on the line.
248							# If any of the sequences can't be UTF-8, we quit there and choose
249							# CP1252. If all could be UTF-8, we see if any of the code points
250							# represented are unlikely to be in pod. If so, we guess CP1252. If
251							# not, we check if the line is all in the same script; if not guess
252							# CP1252; otherwise UTF-8. For perls that don't have convenient script
253							# run testing, see if there is both Latin and non-Latin. If so, CP1252,
254							# otherwise UTF-8.
255							#
256							# On EBCDIC platforms, the situation is somewhat different. In
257							# UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258							# but so do the bytes that are for the C1 controls. Recall that these
259							# correspond to the unused portion of 8859-1 that 1252 mostly takes
260							# over. That means that there are fewer code points that are
261							# represented by multi-bytes. But, note that the these controls are
262							# very unlikely to be in pod text. So if we encounter one of them, it
263							# means that it is quite likely CP1252 and not UTF-8. The net result is
264							# the same code below is used for both platforms.
265							#
266							# XXX probably if the line has E that evaluates to illegal CP1252,
267							# then it is UTF-8. But we haven't processed E<> yet.
268
269	231	50				575	goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270
271	231					365	my $copy;
272
273	69			69		551	no warnings 'utf8';
	69					150
	69					5190
274
275	2824	50				5117	if ($] ge 5.007_003) {
276	2609					5272	$copy = $line;
277
278							# On perls that have this function, we can use it to easily see if the
279							# sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280							# needed below for script run detection
281	225	100				410	goto set_1252 if ! utf8::decode($copy);
282							}
283							elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284							# code page doing here anyway?
285							goto set_utf8;
286							}
287							else { # ASCII, no decode(): do it ourselves using the fundamental
288							# characteristics of UTF-8
289	69			69		40518	use if $] le 5.006002, 'utf8';
	69					851
	69					1467
290
291	215					1173	my $char_ord;
292							my $needed; # How many continuation bytes to gobble up
293
294							# Initialize the translated line with a dummy character that will be
295							# deleted after everything else is done. This dummy makes sure that
296							# $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297							# with upgrading in the middle
298	0					0	$copy = chr(0x100);
299
300							# Parse through the line
301	0					0	for (my $i = 0; $i < length $line; $i++) {
302	0					0	my $byte = substr($line, $i, 1);
303
304							# ASCII bytes are trivially dealt with
305	0	0				0	if ($byte !~ $non_ascii_re) {
306	0					0	$copy .= $byte;
307	0					0	next;
308							}
309
310	0					0	my $b_ord = ord $byte;
311
312							# Now figure out what this code point would be if the input is
313							# actually in UTF-8. If, in the process, we discover that it isn't
314							# well-formed UTF-8, we guess CP1252.
315							#
316							# Start the process. If it is UTF-8, we are at the first, start
317							# byte, of a multi-byte sequence. We look at this byte to figure
318							# out how many continuation bytes are needed, and to initialize the
319							# code point accumulator with the data from this byte.
320							#
321							# Normally the minimum continuation byte is 0x80, but in certain
322							# instances the minimum is a higher number. So the code below
323							# overrides this for those instances.
324	0					0	my $min_cont = 0x80;
325
326	0	0				0	if ($b_ord < 0xC2) { # A start byte < C2 is malformed
		0
		0
		0
327	0					0	goto set_1252;
328							}
329							elsif ($b_ord <= 0xDF) {
330	0					0	$needed = 1;
331	0					0	$char_ord = $b_ord & 0x1F;
332							}
333							elsif ($b_ord <= 0xEF) {
334	0	0				0	$min_cont = 0xA0 if $b_ord == 0xE0;
335	0					0	$needed = 2;
336	215					286	$char_ord = $b_ord & (0x1F >> 1);
337							}
338							elsif ($b_ord <= 0xF4) {
339	215	0				528	$min_cont = 0x90 if $b_ord == 0xF0;
340	2814					15789	$needed = 3;
341	11					21	$char_ord = $b_ord & (0x1F >> 2);
342							}
343							else { # F4 is the highest start byte for legal Unicode; higher is
344							# unlikely to be in pod.
345	11					61	goto set_1252;
346							}
347
348							# ? not enough continuation bytes available
349	11	0				20	goto set_1252 if $i + $needed >= length $line;
350
351							# Accumulate the ordinal of the character from the remaining
352							# (continuation) bytes.
353	11					40	while ($needed-- > 0) {
354	11					20	my $cont = substr($line, ++$i, 1);
355	11					126	$b_ord = ord $cont;
356	0	0	0			0	goto set_1252 if $b_ord < $min_cont \|\| $b_ord > 0xBF;
357
358							# In all cases, any next continuation bytes all have the same
359							# minimum legal value
360	0					0	$min_cont = 0x80;
361
362							# Accumulate this byte's contribution to the code point
363	0					0	$char_ord <<= 6;
364	0					0	$char_ord \|= ($b_ord & 0x3F);
365							}
366
367							# Here, the sequence that formed this code point was valid UTF-8,
368							# so add the completed character to the output
369	0					0	$copy .= chr $char_ord;
370							} # End of loop through line
371
372							# Delete the dummy first character
373	0					0	$copy = substr($copy, 1);
374							}
375
376							# Here, $copy is legal UTF-8.
377
378							# If it can't be legal CP1252, no need to look further. (These bytes
379							# aren't valid in CP1252.) This test could have been placed higher in
380							# the code, but it seemed wrong to set the encoding to UTF-8 without
381							# making sure that the very first instance is well-formed. But what if
382							# it isn't legal CP1252 either? We have to choose one or the other, and
383							# It seems safer to favor the single-byte encoding over the multi-byte.
384	0	50				0	goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385
386							# The C1 controls are not likely to appear in pod
387	0	50				0	goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388
389							# Nor are surrogates nor unassigned, nor deprecated.
390	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391	0	50	33			0	goto set_1252 if $cs_re && $copy =~ $cs_re;
392	0	50	33			0	DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393	0	50	33			0	goto set_1252 if $cn_re && $copy =~ $cn_re;
394	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395	0	50				0	goto set_1252 if $copy =~ $deprecated_re;
396
397							# Nor are rare code points. But this is hard to determine. khw
398							# believes that IPA characters and the modifier letters are unlikely to
399							# be in pod (and certainly very unlikely to be the in the first line in
400							# the pod containing non-ASCII)
401	0	100				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402	0	100	66			0	goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403
404							# The first Unicode version included essentially every Latin character
405							# in modern usage. So, a Latin character not in the first release will
406							# unlikely be in pod.
407	0	50	33			0	DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408	0	50	33			0	goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409
410							# On perls that handle script runs, if the UTF-8 interpretation yields
411							# a single script, we guess UTF-8, otherwise just having a mixture of
412							# scripts is suspicious, so guess CP1252. We first strip off, as best
413							# we can, the ASCII characters that look like they are pod directives,
414							# as these would always show as mixed with non-Latin text.
415	0					0	$copy =~ s/$pod_chars_re//g;
416
417	0	50				0	if ($script_run_re) {
418	0	0				0	goto set_utf8 if $copy =~ $script_run_re;
419	0					0	DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420	0					0	goto set_1252;
421							}
422
423							# Even without script runs, but on recent enough perls and Unicodes, we
424							# can check if there is a mixture of both Latin and non-Latin. Again,
425							# having a mixture of scripts is suspicious, so assume CP1252
426
427							# If it's all non-Latin, there is no CP1252, as that is Latin
428							# characters and punct, etc.
429	0	50				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430	0	50				0	goto set_utf8 if $copy !~ $latin_re;
431
432	0	100				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433	0	100				0	goto set_utf8 if $copy =~ $every_char_is_latin_re;
434
435	0					0	DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436
437	10					13	set_1252:
438							DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439	10					24	$encoding = 'CP1252';
440	16					46	goto done_set;
441
442	6					25	set_utf8:
443							DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444	6					40	$encoding = 'UTF-8';
445
446	16					107	done_set:
447							$self->_handle_encoding_line( "=encoding $encoding" );
448	16					75	delete $self->{'_processed_encoding'};
449	16	50				72	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450
451	16					198	my ($word) = $line =~ /(\S$non_ascii_re\S)/;
452
453							$self->whine(
454	16					109	$self->{'line_count'},
455							"Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456							);
457							}
458
459	15322					18813	DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460
461	15322	100				23130	if(!$self->{'in_pod'}) {
462	1274	100				3355	if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s\|$)/s) {
463	735	100				1988	if($1 eq 'cut') {
464							$self->scream(
465	9					88	$self->{'line_count'},
466							"=cut found outside a pod block. Skipping to next block."
467							);
468
469							## Before there were errata sections in the world, it was
470							## least-pessimal to abort processing the file. But now we can
471							## just barrel on thru (but still not start a pod block).
472							#splice @_;
473							#push @_, undef;
474
475	9					25	next;
476							} else {
477							$self->{'in_pod'} = $self->{'start_of_pod_block'}
478	726					1876	= $self->{'last_was_blank'} = 1;
479							# And fall thru to the pod-mode block further down
480							}
481							} else {
482	539					710	DEBUG > 5 and print STDERR "# It's a code-line.\n";
483	539	100				910	$code_handler->(map $_, $line, $self->{'line_count'}, $self)
484							if $code_handler;
485							# Note: this may cause code to be processed out of order relative
486							# to pods, but in order relative to cuts.
487
488							# Note also that we haven't yet applied the transcoding to $line
489							# by time we call $code_handler!
490
491	544	50				1020	if( $line =~ m/^#\sline\s+(\d+)\s(?:\s"([^"]+)")?\s*$/ ) {
492							# That RE is from perlsyn, section "Plain Old Comments (Not!)",
493							#$fname = $2 if defined $2;
494							#DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495	5					31	DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496	5					46	$self->{'line_count'} = $1 - 1;
497							}
498
499	544					996	next;
500							}
501							}
502
503							# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504							# Else we're in pod mode:
505
506							# Apply any necessary transcoding:
507	14774	100				22917	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508
509							# HERE WE CATCH =encoding EARLY!
510	14780	100				22844	if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511	14	100				78	next if $self->parse_characters; # Ignore this line
512	14					74	$line = $self->_handle_encoding_line( $line );
513							}
514
515	14777	100				47339	if($line =~ m/^=cut/s) {
		100
		100
516							# here ends the pod block, and therefore the previous pod para
517	123					196	DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518	130					267	$self->{'in_pod'} = 0;
519							# ++$self->{'pod_para_count'};
520	130					351	$self->_ponder_paragraph_buffer();
521							# by now it's safe to consider the previous paragraph as done.
522	130					211	DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523	130	100				505	$cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524							if $cut_handler;
525
526							# TODO: add to docs: Note: this may cause cuts to be processed out
527							# of order relative to pods, but in order relative to code.
528
529							} elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530	4670	100	66			15791	if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531	2817	100				3691	$wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532							if $wl_handler;
533							}
534
535	7473	100	66			24072	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			100
536	951					1651	DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537	732					1325	push @{$paras->[-1]}, $line;
	523					1139
538							} # otherwise it's not interesting
539
540	4659	100	100			13265	if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541	4827					5655	DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542							}
543
544	4878					6926	$self->{'last_was_blank'} = 1;
545
546							} elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547
548	5410	100				14218	if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9])(\s+\|$)(.)/s) {
		100
549							# THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550	2554					9156	my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551	2335	100	100			8224	$new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552							# Note that in "=head1 foo", the WS is lost.
553							# Example: ['=head1', {'start_line' => 123}, ' foo']
554
555	2335					3349	++$self->{'pod_para_count'};
556
557	2554					6082	$self->_ponder_paragraph_buffer();
558							# by now it's safe to consider the previous paragraph as done.
559
560	4930					9869	push @$paras, $new; # the new incipient paragraph
561	4930					7717	DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562
563							} elsif($line =~ m/^\s/s) {
564
565	653	100	33			3086	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			66
566	169					263	DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567	2732					9506	push @{$paras->[-1]}, $line;
	184					356
568							} else {
569	529					754	++$self->{'pod_para_count'};
570	529					1305	$self->_ponder_paragraph_buffer();
571							# by now it's safe to consider the previous paragraph as done.
572	529					711	DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573	529					1939	push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574							}
575							} else {
576	3304					6941	++$self->{'pod_para_count'};
577	2256					5171	$self->_ponder_paragraph_buffer();
578							# by now it's safe to consider the previous paragraph as done.
579	3304					11643	push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580	2343					3303	DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581							}
582	5299					9694	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583
584							} else {
585							# It's a non-blank line /continuing/ the current para
586	4912	50				7351	if(@$paras) {
587	5873					8066	DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588	5819					6208	push @{$paras->[-1]}, $line;
	5873					10932
589							} else {
590							# Unexpected case!
591	1146					3262	die "Continuing a paragraph but \@\$paras is empty?";
592							}
593	5322					9957	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594							}
595
596							} # ends the big while loop
597
598	8153					11266	DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599	8153					37390	return $self;
600							}
601
602							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603
604							sub _handle_encoding_line {
605	535			18		1422	my($self, $line) = @_;
606
607	535	50				996	return if $self->parse_characters;
608
609							# The point of this routine is to set $self->{'_transcoder'} as indicated.
610
611	535	50				827	return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612	111					534	DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613
614	52					90	my $e = $1;
615	52					87	my $orig = $e;
616	52					105	push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
	76					161
617
618	76					165	my $enc_error;
619
620							# Cf. perldoc Encode and perldoc Encode::Supported
621
622	76					3593	require Pod::Simple::Transcode;
623
624	76	100	33			301	if( $self->{'encoding'} ) {
		50
		100
625	534					725	my $norm_current = $self->{'encoding'};
626	534					1276	my $norm_e = $e;
627	534					1895	foreach my $that ($norm_current, $norm_e) {
628	534					778	$that = lc($that);
629	1146					2161	$that =~ s/[-_]//g;
630							}
631	333	100				601	if($norm_current eq $norm_e) {
632	333					389	DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633							"redundant. ($norm_current eq $norm_e). Ignoring.\n";
634	333					437	$enc_error = '';
635							# But that doesn't necessarily mean that the earlier one went okay
636							} else {
637	333					721	$enc_error = "Encoding is already set to " . $self->{'encoding'};
638	0					0	DEBUG > 1 and print STDERR $enc_error;
639							}
640							} elsif (
641							# OK, let's turn on the encoding
642							do {
643	350					756	DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644	1853					2088	$self->{'encoding'} = $e;
645	1853					8644	1;
646							}
647							and $e eq 'HACKRAW'
648							) {
649	42					109	DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650
651							} elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652
653							die($enc_error = "WHAT? _transcoder is already set?!")
654	59	50				7781	if $self->{'_transcoder'}; # should never happen
655	59					292	require Pod::Simple::Transcode;
656	59					159	$self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657	59					139	eval {
658	59					129	my @x = ('', "abc", "123");
659	59					126	$self->{'_transcoder'}->(@x);
660							};
661	59	50				222	$@ && die( $enc_error =
662							"Really unexpected error setting up encoding $e: $@\nAborting"
663							);
664	59					124	$self->{'detected_encoding'} = $e;
665
666							} else {
667	42					2770	my @supported = Pod::Simple::Transcode::->all_encodings;
668
669							# Note unsupported, and complain
670	42					134	DEBUG and print STDERR " Encoding [$e] is unsupported.",
671							"\nSupporteds: @supported\n";
672	5					13	my $suggestion = '';
673
674							# Look for a near match:
675	5					9	my $norm = lc($e);
676	5					12	$norm =~ tr[-_][]d;
677	10					23	my $n;
678	10					46	foreach my $enc (@supported) {
679	5					16	$n = lc($enc);
680	3					7	$n =~ tr[-_][]d;
681	3	50				5	next unless $n eq $norm;
682	2					7	$suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683	2					3	last;
684							}
685	37					85	my $encmodver = Pod::Simple::Transcode::->encmodver;
686	37					122	$enc_error = join '' =>
687							"This document probably does not appear as it should, because its ",
688							"\"=encoding $e\" line calls for an unsupported encoding.",
689							$suggestion, " [$encmodver\'s supported encodings are: @supported]"
690							;
691
692	37					390	$self->scream( $self->{'line_count'}, $enc_error );
693							}
694	17					30	push @{ $self->{'encoding_command_statuses'} }, $enc_error;
	52					21219
695	52	100				209	if (defined($self->{'_processed_encoding'})) {
696							# Double declaration.
697	35					161	$self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698							}
699	52					125	$self->{'_processed_encoding'} = $orig;
700
701	52					138	return $line;
702							}
703
704							# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705
706							sub _handle_encoding_second_level {
707							# By time this is called, the encoding (if well formed) will already
708							# have been acted on.
709	42			8		118	my($self, $para) = @_;
710	42					123	my @x = @$para;
711	42					119	my $content = join ' ', splice @x, 2;
712	9					1719	$content =~ s/^\s+//s;
713	9					1897	$content =~ s/\s+$//s;
714
715	9					17	DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716
717	9	100				39	if (defined($self->{'_processed_encoding'})) {
718							#if($content ne $self->{'_processed_encoding'}) {
719							# Could it happen?
720							#}
721	9					20	delete $self->{'_processed_encoding'};
722							# It's already been handled. Check for errors.
723	9	50				46	if(! $self->{'encoding_command_statuses'} ) {
		100
724	2					6	DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725							} elsif( $self->{'encoding_command_statuses'}[-1] ) {
726							$self->whine( $para->[1]{'start_line'},
727							sprintf "Couldn't do %s: %s",
728							$self->{'encoding_command_reqs' }[-1],
729	248					299	$self->{'encoding_command_statuses'}[-1],
730							);
731							} else {
732	255					290	DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733							}
734
735							} else {
736							# Otherwise it's a syntax error
737	248					383	$self->whine( $para->[1]{'start_line'},
738							"Invalid =encoding syntax: $content"
739							);
740							}
741
742	7					15	return;
743							}
744
745							#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746
747							{
748							my $m = -321; # magic line number
749
750							sub _gen_errata {
751	667			668		919	my $self = $_[0];
752							# Return 0 or more fake-o paragraphs explaining the accumulated
753							# errors on this document.
754
755	669	100	66			2073	return() unless $self->{'errata'} and keys %{$self->{'errata'}};
	16					96
756
757	16					48	my @out;
758
759	56					98	foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
	69					172
	56					174
760							push @out,
761							['=item', {'start_line' => $m}, "Around line $line:"],
762							map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763							#['~Top', {'start_line' => $m},
764							$_
765							#]
766							],
767	33					117	@{$self->{'errata'}{$line}}
	74					246
768							)
769							;
770							}
771
772							# TODO: report of unknown entities? unrenderable characters?
773
774	56					265	unshift @out,
775							['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776							['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777							"Hey! ",
778							['B', {},
779							'The above document had some coding errors, which are explained below:'
780							]
781							],
782							['=over', {'start_line' => $m, 'errata' => 1}, ''],
783							;
784
785	45					123	push @out,
786							['=back', {'start_line' => $m, 'errata' => 1}, ''],
787							;
788
789	45					109	DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790
791	45					141	return @out;
792							}
793
794							}
795
796							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797
798							##############################################################################
799							##
800							## stop reading now stop reading now stop reading now stop reading now stop
801							##
802							## HERE IT BECOMES REALLY SCARY
803							##
804							## stop reading now stop reading now stop reading now stop reading now stop
805							##
806							##############################################################################
807
808							sub _ponder_paragraph_buffer {
809
810							# Para-token types as found in the buffer.
811							# ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812							# =over, =back, =item
813							# and the null =pod (to be complained about if over one line)
814							#
815							# "~data" paragraphs are something we generate at this level, depending on
816							# a currently open =over region
817
818							# Events fired: Begin and end for:
819							# directivename (like head1 .. head4), item, extend,
820							# for (from =begin...=end, =for),
821							# over-bullet, over-number, over-text, over-block,
822							# item-bullet, item-number, item-text,
823							# Document,
824							# Data, Para, Verbatim
825							# B, C, longdirname (TODO -- wha?), etc. for all directives
826							#
827
828	5870			5840		7352	my $self = $_[0];
829	5870					6527	my $paras;
830	5870	100				6583	return unless @{$paras = $self->{'paras'}};
	5870					12570
831	5146		100			11648	my $curr_open = ($self->{'curr_open'} \|\|= []);
832
833	5146					5972	my $scratch;
834
835	5116					5654	DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836
837							# We have something in our buffer. So apparently the document has started.
838	5120	100				8793	unless($self->{'doc_has_started'}) {
839	707					1730	$self->{'doc_has_started'} = 1;
840
841	682					1211	my $starting_contentless;
842	712		66			4196	$starting_contentless =
843							(
844							!@$curr_open
845							and @$paras and ! grep $_->[0] ne '~end', @$paras
846							# i.e., if the paras is all ~ends
847							)
848							;
849	902					1240	DEBUG and print STDERR "# Starting ",
850							$starting_contentless ? 'contentless' : 'contentful',
851							" document\n"
852							;
853
854							$self->_handle_element_start(
855							($scratch = 'Document'),
856							{
857	902	100				4034	'start_line' => $paras->[0][1]{'start_line'},
858							$starting_contentless ? ( 'contentless' => 1 ) : (),
859							},
860							);
861							}
862
863	5142					7628	my($para, $para_type);
864	5142					8778	while(@$paras) {
865
866							# If a directive, assume it's legal; subtract below if found not to be
867	7341	100				18324	$seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868
869							last if @$paras == 1
870							and ( $paras->[0][0] eq '=over'
871							or $paras->[0][0] eq '=item'
872	7316	100	100			27907	or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
			100
873							# Those're the three kinds of paragraphs that require lookahead.
874							# Actually, an "=item Foo" inside an region
875							# and any =item inside an region (rare)
876							# don't require any lookahead, but all others (bullets
877							# and numbers) do.
878							# The verbatim is different from the other two, because those might be
879							# like:
880							#
881							# =item
882							# ...
883							# =cut
884							# ...
885							# =item
886							#
887							# The =cut here finishes the paragraph but doesn't terminate the =over
888							# they should be in. (khw apologizes that he didn't comment at the time
889							# why the 'in_pod' works, and no longer remembers why, and doesn't think
890							# it is currently worth the effort to re-figure it out.)
891
892							# TODO: whinge about many kinds of directives in non-resolving =for regions?
893							# TODO: many? like what? =head1 etc?
894
895	5943					10695	$para = shift @$paras;
896	5944					9227	$para_type = $para->[0];
897
898	5944					6907	DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899							$self->_dump_curr_open(), ")\n";
900
901	5943	100				16938	if($para_type eq '=for') {
		100
		100
		100
902	74	50				214	next if $self->_ponder_for($para,$curr_open,$paras);
903
904							} elsif($para_type eq '=begin') {
905	83	50				189	next if $self->_ponder_begin($para,$curr_open,$paras);
906
907							} elsif($para_type eq '=end') {
908	83	50				216	next if $self->_ponder_end($para,$curr_open,$paras);
909
910							} elsif($para_type eq '~end') { # The virtual end-document signal
911	2065	50				3747	next if $self->_ponder_doc_end($para,$curr_open,$paras);
912							}
913
914
915							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917	6446	100				13173	if(grep $_->[1]{'~ignore'}, @$curr_open) {
918	1398					1624	DEBUG > 1 and
919							print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920	1398					2932	next;
921							}
922							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924
925	6217	100				14132	if($para_type eq '=pod') {
		100
		100
926	1648					2764	$self->_ponder_pod($para,$curr_open,$paras);
927
928							} elsif($para_type eq '=over') {
929	1325	100				2102	next if $self->_ponder_over($para,$curr_open,$paras);
930
931							} elsif($para_type eq '=back') {
932	1325	100				2608	next if $self->_ponder_back($para,$curr_open,$paras);
933
934							} else {
935
936							# All non-magical codes!!!
937
938							# Here we start using $para_type for our own twisted purposes, to
939							# mean how it should get treated, not as what the element name
940							# should be.
941
942	4478					5115	DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943
944	4478					4943	my $i;
945
946							# Enforce some =headN discipline
947	4478	100	66			11780	if($para_type =~ m/^=head\d$/s
			100
			66
948							and ! $self->{'accept_heads_anywhere'}
949							and @$curr_open
950							and $curr_open->[-1][0] eq '=over'
951							) {
952	221					291	DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953							$self->whine(
954	221					1015	$para->[1]{'start_line'},
955							"You forgot a '=back' before '$para_type'"
956							);
957	1169					1933	unshift @$paras, ['=back', {}, ''], $para; # close the =over
958	1169					2024	next;
959							}
960
961
962	5917	100	66			18360	if($para_type eq '=item') {
		100
		100
		100
		100
		100
		100
963
964	2628					7872	my $over;
965	2511	50	33			4773	unless(@$curr_open and
966	2619					6133	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967							$self->whine(
968	1543					2250	$para->[1]{'start_line'},
969							"'=item' outside of any '=over'"
970							);
971							unshift @$paras,
972	1543					4385	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973							$para
974							;
975	2					52	next;
976							}
977
978
979	1038					1705	my $over_type = $over->[1]{'~type'};
980
981	1041	50				2602	if(!$over_type) {
		50
		100
		100
		50
982							# Shouldn't happen1
983							die "Typeless over in stack, starting at line "
984	251					715	. $over->[1]{'start_line'};
985
986							} elsif($over_type eq 'block') {
987	1147	0				2642	unless($curr_open->[-1][1]{'~bitched_about'}) {
988	22					25	$curr_open->[-1][1]{'~bitched_about'} = 1;
989							$self->whine(
990							$curr_open->[-1][1]{'start_line'},
991							"You can't have =items (as at line "
992	22					39	. $para->[1]{'start_line'}
993							. ") unless the first thing after the =over is an =item"
994							);
995							}
996							# Just turn it into a paragraph and reconsider it
997	1125					2592	$para->[0] = '~Para';
998	98					257	unshift @$paras, $para;
999	57					262	next;
1000
1001							} elsif($over_type eq 'text') {
1002	834					2045	my $item_type = $self->_get_item_type($para);
1003							# That kills the content of the item if it's a number or bullet.
1004	1698					2072	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005
1006	1698	100	66			2637	if($item_type eq 'text') {
		50
1007							# Nothing special needs doing for 'text'
1008							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009							$self->whine(
1010	921					2718	$para->[1]{'start_line'},
1011							"Expected text after =item, not a $item_type"
1012							);
1013							# Undo our clobbering:
1014	10					18	push @$para, $para->[1]{'~orig_content'};
1015	10					45	delete $para->[1]{'number'};
1016							# Only a PROPER item-number element is allowed
1017							# to have a number attribute.
1018							} else {
1019	6					19	die "Unhandled item type $item_type"; # should never happen
1020							}
1021
1022							# =item-text thingies don't need any assimilation, it seems.
1023
1024							} elsif($over_type eq 'number') {
1025	34					92	my $item_type = $self->_get_item_type($para);
1026							# That kills the content of the item if it's a number or bullet.
1027	939					3813	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028
1029	82					154	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030
1031	82	50				331	if($item_type eq 'bullet') {
		50
		50
		50
1032							# Hm, it's not numeric. Correct for this.
1033	54					269	$para->[1]{'number'} = $expected_value;
1034							$self->whine(
1035	0					0	$para->[1]{'start_line'},
1036							"Expected '=item $expected_value'"
1037							);
1038	0					0	push @$para, $para->[1]{'~orig_content'};
1039							# restore the bullet, blocking the assimilation of next para
1040
1041							} elsif($item_type eq 'text') {
1042							# Hm, it's not numeric. Correct for this.
1043	0					0	$para->[1]{'number'} = $expected_value;
1044							$self->whine(
1045	54					120	$para->[1]{'start_line'},
1046							"Expected '=item $expected_value'"
1047							);
1048							# Text content will still be there and will block next ~Para
1049
1050							} elsif($item_type ne 'number') {
1051	54					228	die "Unknown item type $item_type"; # should never happen
1052
1053							} elsif($expected_value == $para->[1]{'number'}) {
1054	28					44	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055
1056							} else {
1057	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058							" instead of the expected value of $expected_value\n";
1059							$self->whine(
1060							$para->[1]{'start_line'},
1061	0					0	"You have '=item " . $para->[1]{'number'} .
1062							"' instead of the expected '=item $expected_value'"
1063							);
1064	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1065							}
1066
1067	28	50				77	if(@$para == 2) {
1068							# For the cases where we /didn't/ push to @$para
1069	28	100				63	if($paras->[0][0] eq '~Para') {
1070	25					29	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071	79					220	push @$para, splice @{shift @$paras},2;
	79					149
1072							} else {
1073	57					178	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074	3					6	push @$para, ''; # Just so it's not contentless
1075							}
1076							}
1077
1078
1079							} elsif($over_type eq 'bullet') {
1080	159					436	my $item_type = $self->_get_item_type($para);
1081							# That kills the content of the item if it's a number or bullet.
1082	159					232	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083
1084	159	50				326	if($item_type eq 'bullet') {
		0
		0
1085							# as expected!
1086
1087	159	100				323	if( $para->[1]{'~_freaky_para_hack'} ) {
1088	101					104	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089	101					240	push @$para, $para->[1]{'~_freaky_para_hack'};
1090							}
1091
1092							} elsif($item_type eq 'number') {
1093							$self->whine(
1094	0					0	$para->[1]{'start_line'},
1095							"Expected '=item *'"
1096							);
1097	0					0	push @$para, $para->[1]{'~orig_content'};
1098							# and block assimilation of the next paragraph
1099	0					0	delete $para->[1]{'number'};
1100							# Only a PROPER item-number element is allowed
1101							# to have a number attribute.
1102							} elsif($item_type eq 'text') {
1103							$self->whine(
1104	0					0	$para->[1]{'start_line'},
1105							"Expected '=item *'"
1106							);
1107							# But doesn't need processing. But it'll block assimilation
1108							# of the next para.
1109							} else {
1110	0					0	die "Unhandled item type $item_type"; # should never happen
1111							}
1112
1113	159	100				340	if(@$para == 2) {
1114							# For the cases where we /didn't/ push to @$para
1115	58	50				137	if($paras->[0][0] eq '~Para') {
1116	58					85	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117	58					81	push @$para, splice @{shift @$paras},2;
	58					154
1118							} else {
1119	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120	0					0	push @$para, ''; # Just so it's not contentless
1121							}
1122							}
1123
1124							} else {
1125	0					0	die "Unhandled =over type \"$over_type\"?";
1126							# Shouldn't happen!
1127							}
1128
1129	968					1317	$para_type = 'Plain';
1130	968					2089	$para->[0] .= '-' . $over_type;
1131							# Whew. Now fall thru and process it.
1132
1133
1134							} elsif($para_type eq '=extend') {
1135							# Well, might as well implement it here.
1136	0					0	$self->_ponder_extend($para);
1137	0					0	next; # and skip
1138							} elsif($para_type eq '=encoding') {
1139							# Not actually acted on here, but we catch errors here.
1140	7					55	$self->_handle_encoding_second_level($para);
1141	7	100				45	next unless $self->keep_encoding_directive;
1142	4					12	$para_type = 'Plain';
1143							} elsif($para_type eq '~Verbatim') {
1144	483					787	$para->[0] = 'Verbatim';
1145	483					754	$para_type = '?Verbatim';
1146							} elsif($para_type eq '~Para') {
1147	2198					3086	$para->[0] = 'Para';
1148	2198					2922	$para_type = '?Plain';
1149							} elsif($para_type eq 'Data') {
1150	28					44	$para->[0] = 'Data';
1151	28					39	$para_type = '?Data';
1152							} elsif( $para_type =~ s/^=//s
1153							and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154							) {
1155	565					879	DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156							} else {
1157							# An unknown directive!
1158	8					15	$seen_legal_directive--;
1159							DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160	8					10	$para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161							;
1162							$self->whine(
1163	8					55	$para->[1]{'start_line'},
1164							"Unknown directive: $para->[0]"
1165							);
1166
1167							# And maybe treat it as text instead of just letting it go?
1168	8					21	next;
1169							}
1170
1171	4246	100				12755	if($para_type =~ s/^\?//s) {
1172	2709	100				4878	if(! @$curr_open) { # usual case
1173	1610					1959	DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174							} else {
1175	1099					2388	my @fors = grep $_->[0] eq '=for', @$curr_open;
1176							DEBUG > 1 and print STDERR "Containing fors: ",
1177	1099					1261	join(',', map $_->[1]{'target'}, @fors), "\n";
1178
1179	1099	100				1817	if(! @fors) {
		100
1180	1111					1413	DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181
1182							#} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183							#} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184							} elsif( $fors[-1][1]{'~resolve'} ) {
1185							# Look to the immediately containing for
1186
1187	74	100				206	if($para_type eq 'Data') {
1188	38					102	DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189	38					75	$para->[0] = 'Para';
1190	48					165	$para_type = 'Plain';
1191							} else {
1192	34					108	DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193							}
1194							} else {
1195	53					103	DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196	75					148	$para->[0] = $para_type = 'Data';
1197							}
1198							}
1199							}
1200
1201							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202	4299	100				7409	if($para_type eq 'Plain') {
		100
		50
1203	4317					8547	$self->_ponder_Plain($para);
1204							} elsif($para_type eq 'Verbatim') {
1205	1055					2255	$self->_ponder_Verbatim($para);
1206							} elsif($para_type eq 'Data') {
1207	24					103	$self->_ponder_Data($para);
1208							} else {
1209	2					4	die "\$para type is $para_type -- how did that happen?";
1210							# Shouldn't happen.
1211							}
1212
1213							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214	4418					10805	$para->[0] =~ s/^[~=]//s;
1215
1216	4250					5004	DEBUG and print STDERR "\n", pretty($para), "\n";
1217
1218							# traverse the treelet (which might well be just one string scalar)
1219							$self->{'content_seen'} \|\|= 1 if $seen_legal_directive
1220	4250	100	100			16130	&& ! $self->{'~tried_gen_errata'};
			100
1221	4250					9799	$self->_traverse_treelet_bit(@$para);
1222							}
1223							}
1224
1225	5120					12628	return;
1226							}
1227
1228							###########################################################################
1229							# The sub-ponderers...
1230
1231
1232
1233							sub _ponder_for {
1234	934			49		3116	my ($self,$para,$curr_open,$paras) = @_;
1235
1236							# Fake it out as a begin/end
1237	677					1299	my $target;
1238
1239	502	50				701	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240	175					467	DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241	175					221	return 1;
1242							}
1243
1244	223					476	for(my $i = 2; $i < @$para; ++$i) {
1245	141	50				347	if($para->[$i] =~ s/^\s(\S+)\s//s) {
1246	101					208	$target = $1;
1247	49					72	last;
1248							}
1249							}
1250	49	50				90	unless(defined $target) {
1251							$self->whine(
1252	1					2	$para->[1]{'start_line'},
1253							"=for without a target?"
1254							);
1255	52					70	return 1;
1256							}
1257	77					88	DEBUG > 1 and
1258							print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259
1260	77					133	$para->[0] = 'Data';
1261
1262							unshift @$paras,
1263							['=begin',
1264							{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265							$target,
1266							],
1267							$para,
1268							['=end',
1269	934					2075	{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270							$target,
1271							],
1272							;
1273
1274	851					2043	return 1;
1275							}
1276
1277							sub _ponder_begin {
1278	108			223		259	my ($self,$para,$curr_open,$paras) = @_;
1279	89					247	my $content = join ' ', splice @$para, 2;
1280	57					148	$content =~ s/^\s+//s;
1281	943					2458	$content =~ s/\s+$//s;
1282	943	50				1335	unless(length($content)) {
1283							$self->whine(
1284	886					3724	$para->[1]{'start_line'},
1285							"=begin without a target?"
1286							);
1287	886					1961	DEBUG and print STDERR "Ignoring targetless =begin\n";
1288	1169					2825	return 1;
1289							}
1290
1291	59					226	my ($target, $title) = $content =~ m/^(\S+)\s(.)$/;
1292	59	100				203	$para->[1]{'title'} = $title if ($title);
1293	59					155	$para->[1]{'target'} = $target; # without any ':'
1294	57					79	$content = $target; # strip off the title
1295
1296	57					91	$content =~ s/^:!/!:/s;
1297	59					85	my $neg; # whether this is a negation-match
1298	59	100				146	$neg = 1 if $content =~ s/^!//s;
1299	59					81	my $to_resolve; # whether to process formatting codes
1300	59	100				131	$to_resolve = 1 if $content =~ s/^://s;
1301
1302	59					77	my $dont_ignore; # whether this target matches us
1303
1304	57	100				201	foreach my $target_name (
1305							split(',', $content, -1),
1306							$neg ? () : '*'
1307							) {
1308	116					127	DEBUG > 2 and
1309							print STDERR " Considering whether =begin $content matches $target_name\n";
1310	118	100				265	next unless $self->{'accept_targets'}{$target_name};
1311
1312	32					41	DEBUG > 2 and
1313							print STDERR " It DOES match the acceptable target $target_name!\n";
1314							$to_resolve = 1
1315	32	100				109	if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316	32					50	$dont_ignore = 1;
1317	100					174	$para->[1]{'target_matching'} = $target_name;
1318	100					202	last; # stop looking at other target names
1319							}
1320
1321	127	100				257	if($neg) {
1322	89	100				194	if( $dont_ignore ) {
1323	76					129	$dont_ignore = '';
1324	6					11	delete $para->[1]{'target_matching'};
1325	6					7	DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326							} else {
1327	13					15	$dont_ignore = 1;
1328	83					282	$para->[1]{'target_matching'} = '!';
1329	83					160	DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330							}
1331							}
1332
1333	127					234	$para->[0] = '=for'; # Just what we happen to call these, internally
1334	127		100			245	$para->[1]{'~really'} \|\|= '=begin';
1335	127		100			269	$para->[1]{'~ignore'} = (! $dont_ignore) \|\| 0;
1336	127		100			229	$para->[1]{'~resolve'} = $to_resolve \|\| 0;
1337
1338	127					277	DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339							"ignore contents of this region\n";
1340	127					154	DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341							($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342	127					236	DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343
1344	127					201	push @$curr_open, $para;
1345	127	100	100			421	if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346	153					165	DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347							} else {
1348	170	50	100			377	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1349	73					172	$self->_handle_element_start((my $scratch='for'), $para->[1]);
1350							}
1351
1352	93					255	return 1;
1353							}
1354
1355							sub _ponder_end {
1356	93			57		151	my ($self,$para,$curr_open,$paras) = @_;
1357	93					200	my $content = join ' ', splice @$para, 2;
1358	93					179	$content =~ s/^\s+//s;
1359	127					232	$content =~ s/\s+$//s;
1360	71					91	DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361
1362	57	50				117	unless(length($content)) {
1363							$self->whine(
1364							$para->[1]{'start_line'},
1365							"'=end' without a target?" . (
1366							( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367	0	0	0			0	? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368							: ''
1369							)
1370							);
1371	0					0	DEBUG and print STDERR "Ignoring targetless =end\n";
1372	14					20	return 1;
1373							}
1374
1375	71	50				191	unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376							$self->whine(
1377	14					26	$para->[1]{'start_line'},
1378							"'=end $content' is invalid. (Stack: "
1379							. $self->_dump_curr_open() . ')'
1380							);
1381	70					101	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382	70					270	return 1;
1383							}
1384
1385	127	50	33			415	unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386							$self->whine(
1387	70					169	$para->[1]{'start_line'},
1388							"=end $content without matching =begin. (Stack: "
1389							. $self->_dump_curr_open() . ')'
1390							);
1391	70					77	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392	70					76	return 1;
1393							}
1394
1395	127	100				198	unless($content eq $curr_open->[-1][1]{'target'}) {
1396							$self->whine(
1397							$para->[1]{'start_line'},
1398							"=end $content doesn't match =begin "
1399	70					120	. $curr_open->[-1][1]{'target'}
1400							. ". (Stack: "
1401							. $self->_dump_curr_open() . ')'
1402							);
1403	70					237	DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404	20					22	return 1;
1405							}
1406
1407							# Else it's okay to close...
1408	107	100				294	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409	70					180	DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410							# And that may be because of this to-be-closed =for region, or some
1411							# other one, but it doesn't matter.
1412							} else {
1413	107					281	$curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414							# what's that for?
1415
1416	110	50	50			246	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1417	110					299	$self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418							}
1419	130					253	DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420	130					232	pop @$curr_open;
1421
1422	130					304	return 1;
1423							}
1424
1425							sub _ponder_doc_end {
1426	760			2523		1581	my ($self,$para,$curr_open,$paras) = @_;
1427	687	100				1273	if(@$curr_open) { # Deal with things left open
1428	6					10	DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429							$self->_dump_curr_open(), ")\n";
1430
1431	6					8	DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432	79					233	unshift @$paras, $self->_closers_for_all_curr_open;
1433							# Make sure there is exactly one ~end in the parastack, at the end:
1434	6					26	@$paras = grep $_->[0] ne '~end', @$paras;
1435	6					14	push @$paras, $para, $para;
1436							# We need two -- once for the next cycle where we
1437							# generate errata, and then another to be at the end
1438							# when that loop back around to process the errata.
1439	6					33	return 1;
1440
1441							} else {
1442	754					1184	DEBUG and print STDERR "Okay, stack is empty now.\n";
1443							}
1444
1445							# Try generating errata section, if applicable
1446	681	100				1490	unless($self->{'~tried_gen_errata'}) {
1447	667					1343	$self->{'~tried_gen_errata'} = 1;
1448	667					1645	my @extras = $self->_gen_errata();
1449	740	100				1751	if(@extras) {
1450	17					54	unshift @$paras, @extras;
1451	17					31	DEBUG and print STDERR "Generated errata... relooping...\n";
1452	17					68	return 1; # I.e., loop around again to process these fake-o paragraphs
1453							}
1454							}
1455
1456	737					1412	splice @$paras; # Well, that's that for this paragraph buffer.
1457	687					931	DEBUG and print STDERR "Throwing end-document event.\n";
1458
1459	717					1962	$self->_handle_element_end( my $scratch = 'Document' );
1460	717					2341	return 1; # Hasta la byebye
1461							}
1462
1463							sub _ponder_pod {
1464	529			521		1126	my ($self,$para,$curr_open,$paras) = @_;
1465							$self->whine(
1466	549	50				1072	$para->[1]{'start_line'},
1467							"=pod directives shouldn't be over one line long! Ignoring all "
1468							. (@$para - 2) . " lines of content"
1469							) if @$para > 3;
1470
1471							# Content ignored unless 'pod_handler' is set
1472	549	100				1114	if (my $pod_handler = $self->{'pod_handler'}) {
1473	70					275	my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474	251	100				493	$line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475	251					460	$pod_handler->($line, $line_num, $self);
1476							}
1477
1478							# The surrounding methods set content_seen, so let us remain consistent.
1479							# I do not know why it was not here before -- should it not be here?
1480							# $self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1481
1482	483					1177	return;
1483							}
1484
1485							sub _ponder_over {
1486	160			187		410	my ($self,$para,$curr_open,$paras) = @_;
1487	160	50				365	return 1 unless @$paras;
1488	160					247	my $list_type;
1489
1490	160	100				497	if($paras->[0][0] eq '=item') { # most common case
		100
		50
1491	147					585	$list_type = $self->_get_initial_item_type($paras->[0]);
1492
1493							} elsif($paras->[0][0] eq '=back') {
1494							# Ignore empty lists by default
1495	247	100				320	if ($self->{'parse_empty_lists'}) {
1496	247					542	$list_type = 'empty';
1497							} else {
1498	221					378	shift @$paras;
1499	221					616	return 1;
1500							}
1501							} elsif($paras->[0][0] eq '~end') {
1502							$self->whine(
1503	221					490	$para->[1]{'start_line'},
1504							"=over is the last thing in the document?!"
1505							);
1506	26					110	return 1; # But feh, ignore it.
1507							} else {
1508	39					63	$list_type = 'block';
1509							}
1510	182					463	$para->[1]{'~type'} = $list_type;
1511	377					682	push @$curr_open, $para;
1512							# yes, we reuse the paragraph as a stack item
1513
1514	377					926	my $content = join ' ', splice @$para, 2;
1515	377					959	$para->[1]{'~orig_content'} = $content;
1516	377					869	my $overness;
1517	254	100				978	if($content =~ m/^\s*$/s) {
		50
1518	164					326	$para->[1]{'indent'} = 4;
1519							} elsif($content =~ m/^\s((?:\d\.)?\d+)\s*$/s) {
1520	69			67		435512	no integer;
	67					171
	67					557
1521	188					474	$para->[1]{'indent'} = $1;
1522	96	50				400	if($1 == 0) {
1523							$self->whine(
1524	6					15	$para->[1]{'start_line'},
1525							"Can't have a 0 in =over $content"
1526							);
1527	6					72	$para->[1]{'indent'} = 4;
1528							}
1529							} else {
1530							$self->whine(
1531	98					219	$para->[1]{'start_line'},
1532							"=over should be: '=over' or '=over positive_number'"
1533							);
1534	57					141	$para->[1]{'indent'} = 4;
1535							}
1536	213					377	DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537
1538	213	100	100			723	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1539	213					814	$self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540
1541	198					679	return;
1542							}
1543
1544							sub _ponder_back {
1545	162			377		394	my ($self,$para,$curr_open,$paras) = @_;
1546							# TODO: fire off or or ??
1547
1548	158					440	my $content = join ' ', splice @$para, 2;
1549	160	50				393	if($content =~ m/\S/) {
1550							$self->whine(
1551	4					19	$para->[1]{'start_line'},
1552							"=back doesn't take any parameters, but you said =back $content"
1553							);
1554							}
1555
1556	156	50	33			777	if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557	156					210	DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558							# Expected case: we're closing the most recently opened thing
1559							#my $over = pop @$curr_open;
1560	165	100	50			514	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1561							$self->_handle_element_end( my $scratch =
1562	209					795	'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563							);
1564							} else {
1565	53					125	DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566							join(', ', map $_->[0], @$curr_open), ").\n";
1567							$self->whine(
1568	53					146	$para->[1]{'start_line'},
1569							'=back without =over'
1570							);
1571	53					120	return 1; # and ignore it
1572							}
1573							}
1574
1575							sub _ponder_item {
1576	53			1378		95	my ($self,$para,$curr_open,$paras) = @_;
1577	53					201	my $over;
1578	53	0	0			127	unless(@$curr_open and
1579	0					0	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580							$self->whine(
1581	0					0	$para->[1]{'start_line'},
1582							"'=item' outside of any '=over'"
1583							);
1584							unshift @$paras,
1585	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586							$para
1587							;
1588	0					0	return 1;
1589							}
1590
1591
1592	0					0	my $over_type = $over->[1]{'~type'};
1593
1594	0	0				0	if(!$over_type) {
		0
		0
		0
		0
1595							# Shouldn't happen1
1596							die "Typeless over in stack, starting at line "
1597	53					70	. $over->[1]{'start_line'};
1598
1599							} elsif($over_type eq 'block') {
1600	53	0				175	unless($curr_open->[-1][1]{'~bitched_about'}) {
1601	53					250	$curr_open->[-1][1]{'~bitched_about'} = 1;
1602							$self->whine(
1603							$curr_open->[-1][1]{'start_line'},
1604							"You can't have =items (as at line "
1605	53					197	. $para->[1]{'start_line'}
1606							. ") unless the first thing after the =over is an =item"
1607							);
1608							}
1609							# Just turn it into a paragraph and reconsider it
1610	53					149	$para->[0] = '~Para';
1611	53					157	unshift @$paras, $para;
1612	53					163	return 1;
1613
1614							} elsif($over_type eq 'text') {
1615	0					0	my $item_type = $self->_get_item_type($para);
1616							# That kills the content of the item if it's a number or bullet.
1617	53					264	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618
1619	53	0	0			78	if($item_type eq 'text') {
		0
1620							# Nothing special needs doing for 'text'
1621							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622							$self->whine(
1623	53					152	$para->[1]{'start_line'},
1624							"Expected text after =item, not a $item_type"
1625							);
1626							# Undo our clobbering:
1627	53					227	push @$para, $para->[1]{'~orig_content'};
1628	0					0	delete $para->[1]{'number'};
1629							# Only a PROPER item-number element is allowed
1630							# to have a number attribute.
1631							} else {
1632	0					0	die "Unhandled item type $item_type"; # should never happen
1633							}
1634
1635							# =item-text thingies don't need any assimilation, it seems.
1636
1637							} elsif($over_type eq 'number') {
1638	0					0	my $item_type = $self->_get_item_type($para);
1639							# That kills the content of the item if it's a number or bullet.
1640	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641
1642	0					0	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643
1644	0	0				0	if($item_type eq 'bullet') {
		0
		0
		0
1645							# Hm, it's not numeric. Correct for this.
1646	0					0	$para->[1]{'number'} = $expected_value;
1647							$self->whine(
1648	0					0	$para->[1]{'start_line'},
1649							"Expected '=item $expected_value'"
1650							);
1651	0					0	push @$para, $para->[1]{'~orig_content'};
1652							# restore the bullet, blocking the assimilation of next para
1653
1654							} elsif($item_type eq 'text') {
1655							# Hm, it's not numeric. Correct for this.
1656	0					0	$para->[1]{'number'} = $expected_value;
1657							$self->whine(
1658	0					0	$para->[1]{'start_line'},
1659							"Expected '=item $expected_value'"
1660							);
1661							# Text content will still be there and will block next ~Para
1662
1663							} elsif($item_type ne 'number') {
1664	0					0	die "Unknown item type $item_type"; # should never happen
1665
1666							} elsif($expected_value == $para->[1]{'number'}) {
1667	0					0	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668
1669							} else {
1670	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671							" instead of the expected value of $expected_value\n";
1672							$self->whine(
1673							$para->[1]{'start_line'},
1674	0					0	"You have '=item " . $para->[1]{'number'} .
1675							"' instead of the expected '=item $expected_value'"
1676							);
1677	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1678							}
1679
1680	0	0				0	if(@$para == 2) {
1681							# For the cases where we /didn't/ push to @$para
1682	0	0				0	if($paras->[0][0] eq '~Para') {
1683	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1685							} else {
1686	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687	0					0	push @$para, ''; # Just so it's not contentless
1688							}
1689							}
1690
1691
1692							} elsif($over_type eq 'bullet') {
1693	0					0	my $item_type = $self->_get_item_type($para);
1694							# That kills the content of the item if it's a number or bullet.
1695	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696
1697	0	0				0	if($item_type eq 'bullet') {
		0
		0
1698							# as expected!
1699
1700	0	0				0	if( $para->[1]{'~_freaky_para_hack'} ) {
1701	0					0	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702	0					0	push @$para, $para->[1]{'~_freaky_para_hack'};
1703							}
1704
1705							} elsif($item_type eq 'number') {
1706							$self->whine(
1707	0					0	$para->[1]{'start_line'},
1708							"Expected '=item *'"
1709							);
1710	0					0	push @$para, $para->[1]{'~orig_content'};
1711							# and block assimilation of the next paragraph
1712	0					0	delete $para->[1]{'number'};
1713							# Only a PROPER item-number element is allowed
1714							# to have a number attribute.
1715							} elsif($item_type eq 'text') {
1716							$self->whine(
1717	0					0	$para->[1]{'start_line'},
1718							"Expected '=item *'"
1719							);
1720							# But doesn't need processing. But it'll block assimilation
1721							# of the next para.
1722							} else {
1723	0					0	die "Unhandled item type $item_type"; # should never happen
1724							}
1725
1726	0	0				0	if(@$para == 2) {
1727							# For the cases where we /didn't/ push to @$para
1728	0	0				0	if($paras->[0][0] eq '~Para') {
1729	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1731							} else {
1732	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733	0					0	push @$para, ''; # Just so it's not contentless
1734							}
1735							}
1736
1737							} else {
1738	0					0	die "Unhandled =over type \"$over_type\"?";
1739							# Shouldn't happen!
1740							}
1741	0					0	$para->[0] .= '-' . $over_type;
1742
1743	0					0	return;
1744							}
1745
1746							sub _ponder_Plain {
1747	3743			3745		6068	my ($self,$para) = @_;
1748	3743					4297	DEBUG and print STDERR " giving plain treatment...\n";
1749	3743	100	100			18852	unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
			66
			100
1750							or $para->[1]{'~cooked'}
1751							) {
1752							push @$para,
1753	3694					4675	@{$self->_make_treelet(
1754							join("\n", splice(@$para, 2)),
1755	3694					15063	$para->[1]{'start_line'}
1756							)};
1757							}
1758							# Empty paragraphs don't need a treelet for any reason I can see.
1759							# And precooked paragraphs already have a treelet.
1760	3743					6660	return;
1761							}
1762
1763							sub _ponder_Verbatim {
1764	481			551		899	my ($self,$para) = @_;
1765	481					591	DEBUG and print STDERR " giving verbatim treatment...\n";
1766
1767	481					1024	$para->[1]{'xml:space'} = 'preserve';
1768
1769	481	100				1054	unless ($self->{'_output_is_for_JustPod'}) {
1770							# Fix illegal settings for expand_verbatim_tabs()
1771							# This is because this module doesn't do input error checking, but khw
1772							# doesn't want to add yet another instance of that.
1773	400	100	100			1146	$self->expand_verbatim_tabs(8)
1774							if ! defined $self->expand_verbatim_tabs()
1775							\|\| $self->expand_verbatim_tabs() =~ /\D/;
1776
1777	400					1043	my $indent = $self->strip_verbatim_indent;
1778	400	100	100			931	if ($indent && ref $indent eq 'CODE') {
1779	10					11	my @shifted = (shift @{$para}, shift @{$para});
	10					14
	10					16
1780	10					22	$indent = $indent->($para);
1781	10					51	unshift @{$para}, @shifted;
	10					21
1782							}
1783
1784	400					954	for(my $i = 2; $i < @$para; $i++) {
1785	1756					2480	foreach my $line ($para->[$i]) { # just for aliasing
1786							# Strip indentation.
1787	1756	100				2620	$line =~ s/^\Q$indent// if $indent;
1788	1756	100				2758	next unless $self->expand_verbatim_tabs;
1789
1790							# This is commented out because of github issue #85, and the
1791							# current maintainers don't know why it was there in the first
1792							# place.
1793							#&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1794	1755					4843	while( $line =~
1795							# Sort of adapted from Text::Tabs.
1796	987					1952	s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1797							* $self->expand_verbatim_tabs)
1798							-(length($1)&7)))/e
1799							) {}
1800
1801							# TODO: whinge about (or otherwise treat) unindented or overlong lines
1802
1803							}
1804							}
1805							}
1806
1807							# Now the VerbatimFormatted hoodoo...
1808	1284	100	66			3445	if( $self->{'accept_codes'} and
		100
1809							$self->{'accept_codes'}{'VerbatimFormatted'}
1810							) {
1811	868		100			4772	while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
	773					1201
1812							# Kill any number of terminal newlines
1813	812					3541	$self->_verbatim_format($para);
1814							} elsif ($self->{'codes_in_verbatim'}) {
1815							push @$para,
1816	805					1564	@{$self->_make_treelet(
1817							join("\n", splice(@$para, 2)),
1818	53					113	$para->[1]{'start_line'}, $para->[1]{'xml:space'}
1819							)};
1820	53					73	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1821							} else {
1822	465	100				1880	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1823	465					2229	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1824							}
1825	532					1059	return;
1826							}
1827
1828							sub _ponder_Data {
1829	73			95		196	my ($self,$para) = @_;
1830	73					144	DEBUG and print STDERR " giving data treatment...\n";
1831	22					45	$para->[1]{'xml:space'} = 'preserve';
1832	22	100				62	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1833	22					46	return;
1834							}
1835
1836
1837
1838
1839							###########################################################################
1840
1841							sub _traverse_treelet_bit { # for use only by the routine above
1842	7015			7266		12888	my($self, $name) = splice @_,0,2;
1843
1844	7015					7764	my $scratch;
1845	7015					18801	$self->_handle_element_start(($scratch=$name), shift @_);
1846
1847	7066					13280	while (@_) {
1848	12044					15352	my $x = shift;
1849	12044	100				16539	if (ref($x)) {
1850	3093					5518	&_traverse_treelet_bit($self, @$x);
1851							} else {
1852	9269		100			20859	$x .= shift while @_ && !ref($_[0]);
1853	8953					15729	$self->_handle_text($x);
1854							}
1855							}
1856
1857	7066					16723	$self->_handle_element_end($scratch=$name);
1858	7052					17085	return;
1859							}
1860
1861							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1862
1863							sub _closers_for_all_curr_open {
1864	55			104		208	my $self = $_[0];
1865	43					119	my @closers;
1866	6	50				10	foreach my $still_open (@{ $self->{'curr_open'} \|\| return }) {
	6					37
1867	9					24	my @copy = @$still_open;
1868	23					75	$copy[1] = {%{ $copy[1] }};
	23					123
1869							#$copy[1]{'start_line'} = -1;
1870	60	100				126	if($copy[0] eq '=for') {
		50
1871	35					78	$copy[0] = '=end';
1872							} elsif($copy[0] eq '=over') {
1873							$self->whine(
1874							$still_open->[1]{start_line} ,
1875	38					73	"=over without closing =back"
1876							);
1877
1878	38					67	$copy[0] = '=back';
1879							} else {
1880	32					74	die "I don't know how to auto-close an open $copy[0] region";
1881							}
1882
1883	41	50				96	unless( @copy > 2 ) {
1884	1001					2135	push @copy, $copy[1]{'target'};
1885	1001	100				1287	$copy[-1] = '' unless defined $copy[-1];
1886							# since =over's don't have targets
1887							}
1888
1889	1001					3195	$copy[1]{'fake-closer'} = 1;
1890
1891	1001					1842	DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1892	1126					1631	unshift @closers, \@copy;
1893							}
1894	1123					1924	return @closers;
1895							}
1896
1897							#--------------------------------------------------------------------------
1898
1899							sub _verbatim_format {
1900	171			122		380	my($it, $p) = @_;
1901
1902	1076					2724	my $formatting;
1903
1904	1076					2271	for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1905	1340					3211	DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1906	1340					3111	$p->[$i] .= "\n";
1907							# Unlike with simple Verbatim blocks, we don't end up just doing
1908							# a join("\n", ...) on the contents, so we have to append a
1909							# newline to every line, and then nix the last one later.
1910							}
1911
1912	69					89	if( DEBUG > 4 ) {
1913							print STDERR "<<\n";
1914							for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1915							print STDERR "_verbatim_format $i: $p->[$i]";
1916							}
1917							print STDERR ">>\n";
1918							}
1919
1920	69					181	for(my $i = $#$p; $i > 2; $i--) {
1921							# work backwards over the lines, except the first (#2)
1922
1923							#next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1924							# and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1925							# look at a formatty line preceding a nonformatty one
1926	286					320	DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1927	286	100				489	if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1928	17					31	DEBUG > 5 and print STDERR " It's a formatty line. ",
1929							"Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1930
1931	17	100				36	if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1932	7					27	DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1933	7					21	next;
1934							} else {
1935	16					22	DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1936							}
1937							} else {
1938	271					304	DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1939	271					441	next;
1940							}
1941
1942							# A formatty line has to have #: in the first two columns, and uses
1943							# "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1944							# Example:
1945							# What do you want? i like pie. [or whatever]
1946							# #:^^^^^^^^^^^^^^^^^ /////////////
1947
1948
1949	10					20	DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1950
1951	16					57	$formatting = ' ' . $1;
1952	16					49	$formatting =~ s/\s+$//s; # nix trailing whitespace
1953	16	50	33			67	unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1954	6					11	splice @$p,$i,1; # remove this line
1955	6					7	$i--; # don't consider next line
1956	6					16	next;
1957							}
1958
1959	14	100				34	if( length($formatting) >= length($p->[$i-1]) ) {
1960	40					75	$formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1961							} else {
1962	44					85	$formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1963							}
1964							# Make $formatting and the previous line be exactly the same length,
1965							# with $formatting having a " " as the last character.
1966
1967	47					100	DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1968
1969
1970	220					268	my @new_line;
1971	220					552	while( $formatting =~ m{\G(( +)\|(\^+)\|(\/+)\|(\%+))}g ) {
1972							#print STDERR "Format matches $1\n";
1973
1974	91	100				191	if($2) {
1975							#print STDERR "SKIPPING <$2>\n";
1976	69					291	push @new_line,
1977							substr($p->[$i-1], pos($formatting)-length($1), length($1));
1978							} else {
1979							#print STDERR "SNARING $+\n";
1980	195	50				385	push @new_line, [
		100
		100
1981							(
1982							$3 ? 'VerbatimB' :
1983							$4 ? 'VerbatimI' :
1984							$5 ? 'VerbatimBI' : die("Should never get called")
1985							), {},
1986							substr($p->[$i-1], pos($formatting)-length($1), length($1))
1987							];
1988							#print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1989							}
1990							}
1991	183					327	my @nixed =
1992							splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1993	10					17	DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1994
1995	10					12	DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1996							scalar(@new_line), "):",
1997							map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1998	10					31	$i--; # So the next line we scrutinize is the line before the one
1999							# that we just went and formatted
2000							}
2001
2002	65					119	$p->[0] = 'VerbatimFormatted';
2003
2004							# Collapse adjacent text nodes, just for kicks.
2005	65					160	for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2006	173	0	0			195	if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2007	173					290	DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2008	0					0	$p->[$i] .= splice @$p, $i+1, 1; # merge
2009	0					0	--$i; # and back up
2010							}
2011							}
2012
2013							# Now look for the last text token, and remove the terminal newline
2014	65					193	for( my $i = $#$p; $i >= 2; $i-- ) {
2015							# work backwards over the tokens, even the first
2016	65	50				159	if( !ref($p->[$i]) ) {
2017	65	50				307	if($p->[$i] =~ s/\n$//s) {
2018	65					89	DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2019							} else {
2020	0					0	DEBUG > 5 and print STDERR
2021							"No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2022							}
2023	65					130	last; # we only want the next one
2024							}
2025							}
2026
2027	65					131	return;
2028							}
2029
2030
2031							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2032
2033
2034							sub _treelet_from_formatting_codes {
2035							# Given a paragraph, returns a treelet. Full of scary tokenizing code.
2036							# Like [ '~Top', {'start_line' => $start_line},
2037							# "I like ",
2038							# [ 'B', {}, "pie" ],
2039							# "!"
2040							# ]
2041							# This illustrates the general format of a treelet. It is an array:
2042							# [0] is a scalar indicating its type. In the example above, the
2043							# types are '~Top' and 'B'
2044							# [1] is a hash of various flags about it, possibly empty
2045							# [2] - [N] are an ordered list of the subcomponents of the treelet.
2046							# Scalars are literal text, refs are sub-treelets, to
2047							# arbitrary levels. Stringifying a treelet will recursively
2048							# stringify the sub-treelets, concatentating everything
2049							# together to form the exact text of the treelet.
2050
2051	3696			3749		7025	my($self, $para, $start_line, $preserve_space) = @_;
2052
2053	3696					9504	my $treelet = ['~Top', {'start_line' => $start_line},];
2054
2055	3696	100	100			12299	unless ($preserve_space \|\| $self->{'preserve_whitespace'}) {
2056	2829					20248	$para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2057	2829					4691	$para =~ s/ $//;
2058	2829					3356	$para =~ s/^ //;
2059							}
2060
2061							# Only apparent problem the above code is that N<< >> turns into
2062							# N<< >>. But then, word wrapping does that too! So don't do that!
2063
2064
2065							# As a Start-code is encountered, the number of opening bracket '<'
2066							# characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2067							# etc). When closing brackets are found in the text, at least this number
2068							# (plus the 1) will be required to mean the Start-code is terminated. When
2069							# those are found, @stack is popped.
2070	3696					4625	my @stack;
2071
2072	3696					5525	my @lineage = ($treelet);
2073	3696					4571	my $raw = ''; # raw content of L<> fcode before splitting/processing
2074							# XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2075							# into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2076							# the 'collapse and trim all whitespace first' lines just above.
2077	3696					4125	my $inL = 0;
2078
2079	3696					4102	DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2080
2081							# Here begins our frightening tokenizer RE. The following regex matches
2082							# text in four main parts:
2083							#
2084							# * Start-codes. The first alternative matches C< or C<<, the latter
2085							# followed by some whitespace. $1 will hold the entire start code
2086							# (including any space following a multiple-angle-bracket delimiter),
2087							# and $2 will hold only the additional brackets past the first in a
2088							# multiple-bracket delimiter. length($2) + 1 will be the number of
2089							# closing brackets we have to find.
2090							#
2091							# * Closing brackets. Match some amount of whitespace followed by
2092							# multiple close brackets. The logic to see if this closes anything
2093							# is down below. Note that in order to parse C<< >> correctly, we
2094							# have to use look-behind (?<=\s\s), since the match of the starting
2095							# code will have consumed the whitespace.
2096							#
2097							# * A single closing bracket, to close a simple code like C<>.
2098							#
2099							# * Something that isn't a start or end code. We have to be careful
2100							# about accepting whitespace, since perlpodspec says that any whitespace
2101							# before a multiple-bracket closing delimiter should be ignored.
2102							#
2103	3733					33190	while($para =~
2104							m/\G
2105							(?:
2106							# Match starting codes, including the whitespace following a
2107							# multiple-delimiter start code. $1 gets the whole start code and
2108							# $2 gets all but one of the
2109							([A-Z]<(?:(<+)\s+)?)
2110							\|
2111							# Match multiple-bracket end codes. $3 gets the whitespace that
2112							# should be discarded before an end bracket but kept in other cases
2113							# and $4 gets the end brackets themselves. ($3 can be empty if the
2114							# construct is empty, like C<< >>, and all the white-space has been
2115							# gobbled up already, considered to be space after the opening
2116							# bracket. In this case we use look-behind to verify that there are
2117							# at least 2 spaces in a row before the ">".)
2118							(\s+\|(?<=\s\s))(>{2,})
2119							\|
2120							(\s?>) # $5: simple end-codes
2121							\|
2122							( # $6: stuff containing no start-codes or end-codes
2123							(?:
2124							[^A-Z\s>]
2125							\|
2126							(?:
2127							[A-Z](?!<)
2128							)
2129							\|
2130							# whitespace is ok, but we don't want to eat the whitespace before
2131							# a multiple-bracket end code.
2132							# NOTE: we may still have problems with e.g. S<< >>
2133							(?:
2134							\s(?!\s*>{2,})
2135							)
2136							)+
2137							)
2138							)
2139							/xgo
2140							) {
2141	14886					17622	DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2142	14849	100				37605	if(defined $1) {
		100
		100
		50
2143	2990					3287	my $bracket_count; # How many '<<<' in a row this has. Needed for
2144							# Pod::Simple::JustPod
2145	2990	100				4555	if(defined $2) {
2146	114					151	DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2147	151					337	$bracket_count = length($2) + 1;
2148	151					287	push @stack, $bracket_count; # length of the necessary complex
2149							# end-code string
2150							} else {
2151	2913					3211	DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2152	2913					3703	push @stack, 0; # signal that we're looking for simple
2153	2876					3373	$bracket_count = 1;
2154							}
2155	3027					5416	my $code = substr($1,0,1);
2156	3027	100				4571	if ('L' eq $code) {
2157	1180	100				2407	if ($inL) {
2158	748					2090	$raw .= $1;
2159	748					2672	$self->scream( $start_line,
2160							'Nested L<> are illegal. Pretending inner one is '
2161							. 'X<...> so can continue looking for other errors.');
2162	748					3908	$code = "X";
2163							}
2164							else {
2165	1179					2070	$raw = ""; # reset raw content accumulator
2166	1179					1853	$inL = @stack;
2167							}
2168							} else {
2169	3304	100				5012	$raw .= $1 if $inL;
2170							}
2171	3737					7388	push @lineage, [ $code, {}, ]; # new node object
2172
2173							# Tell Pod::Simple::JustPod how many brackets there were, but to save
2174							# space, not in the most usual case of there was just 1. It can be
2175							# inferred by the absence of this element. Similarly, if there is more
2176							# than one bracket, extract the white space between the final bracket
2177							# and the real beginning of the interior. Save that if it isn't just a
2178							# single space
2179	3737	100	100			8021	if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2180	765					985	$lineage[-1][1]{'~bracket_count'} = $bracket_count;
2181	765					985	my $lspacer = substr($1, 1 + $bracket_count);
2182	765	100				6195	$lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2183							}
2184	4023					4636	push @{ $lineage[-2] }, $lineage[-1];
	4023					18656
2185							} elsif(defined $4) {
2186	210					271	DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2187							# This is where it gets messy...
2188	210	100				676	if(! @stack) {
		100
		100
		50
2189							# We saw " >>>>" but needed nothing. This is ALL just stuff then.
2190	2					5	DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2191	2					4	push @{ $lineage[-1] }, $3, $4;
	2					6
2192	88					101	next;
2193							} elsif(!$stack[-1]) {
2194							# We saw " >>>>" but needed only ">". Back pos up.
2195	87					138	DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2196	87					107	push @{ $lineage[-1] }, $3; # That was a for-real space, too.
	88					178
2197	88					145	pos($para) = pos($para) - length($4) + 1;
2198							} elsif($stack[-1] == length($4)) {
2199							# We found " >>>>", and it was exactly what we needed. Commonest case.
2200	114					151	DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2201							} elsif($stack[-1] < length($4)) {
2202							# We saw " >>>>" but needed only " >>". Back pos up.
2203	0					0	DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2204	0					0	pos($para) = pos($para) - length($4) + $stack[-1];
2205							} else {
2206							# We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2207	7					17	DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2208	7					14	push @{ $lineage[-1] }, $3, $4;
	7					25
2209	95					201	next;
2210							}
2211							#print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2212
2213	202	50	66			551	if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2214	91	100				232	if ($3 ne "") {
		100
2215	1					6	$lineage[-1][1]{'~rspacer'} = $3;
2216							}
2217							elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2218
2219							# Here we had something like C<< >> which was a false positive
2220	1					3	delete $lineage[-1][1]{'~lspacer'};
2221							}
2222							else {
2223							$lineage[-1][1]{'~rspacer'}
2224	1					4	= substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2225	89					107	chop $lineage[-1][1]{'~lspacer'};
2226							}
2227							}
2228
2229	202	100				681	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	6					11
	118					298
2230							# Keep the element from being childless
2231
2232	114	100				271	if ($inL == @stack) {
2233	22					46	$lineage[-1][1]{'raw'} = $raw;
2234	22					28	$inL = 0;
2235							}
2236
2237	114					167	pop @stack;
2238	117					169	pop @lineage;
2239
2240	117	100				609	$raw .= $3.$4 if $inL;
2241
2242							} elsif(defined $5) {
2243	2974					3068	DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2244
2245	2974	100	100			8275	if(@stack and ! $stack[-1]) {
2246							# We're indeed expecting a simple end-code
2247	2869					3084	DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2248
2249	2868	50				4930	if(length($5) == 2) { # There was a space there: " >"
		100
2250	0					0	push @{ $lineage[-1] }, ' ';
	0					0
2251	2868					5030	} elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2252	36					61	push @{ $lineage[-1] }, ''; # keep it from being really childless
	36					79
2253							}
2254
2255	2872	100				5529	if ($inL == @stack) {
2256	410					962	$lineage[-1][1]{'raw'} = $raw;
2257	410					525	$inL = 0;
2258							}
2259
2260	2868					3389	pop @stack;
2261	2868					3319	pop @lineage;
2262							} else {
2263	103					277	DEBUG > 4 and print STDERR " It's just stuff.\n";
2264	107					136	push @{ $lineage[-1] }, $5;
	103					259
2265							}
2266
2267	2975	100				18390	$raw .= $5 if $inL;
2268
2269							} elsif(defined $6) {
2270	8770					8988	DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2271	8766					9578	push @{ $lineage[-1] }, $6;
	8766					19988
2272	8770	100				28260	$raw .= $6 if $inL;
2273							# XXX does not capture multiplace whitespaces -- 'raw' ends up with
2274							# at most 1 leading/trailing whitespace, why not all of it?
2275							# Answer, because we deliberately trimmed it above
2276
2277							} else {
2278							# should never ever ever ever happen
2279	4					6	DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2280	4					20	die "SPORK 512512!";
2281							}
2282							}
2283
2284	3781	100				6465	if(@stack) { # Uhoh, some sequences weren't closed.
2285	93					284	my $x= "...";
2286	92					118	while(@stack) {
2287	92	50				208	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	0					0
	8					21
2288							# Hmmmmm!
2289
2290	92					172	my $code = (pop @lineage)->[0];
2291	13					17	my $ender_length = pop @stack;
2292	13	50				37	if($ender_length) {
2293	84					167	--$ender_length;
2294	0					0	$x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2295							} else {
2296	8					30	$x = $code . "<$x>";
2297							}
2298							}
2299	92					128	DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2300	92					135	$self->whine($start_line,
2301							"Unterminated $x sequence",
2302							);
2303							}
2304
2305	3697					9991	return $treelet;
2306							}
2307
2308							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309
2310							sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2311	1			0	0	3	return stringify_lol($_[1]);
2312							}
2313
2314							sub stringify_lol { # function: stringify_lol($lol)
2315	2607			3409	0	3975	my $string_form = '';
2316	2691					5646	_stringify_lol( $_[0] => \$string_form );
2317	3462					8114	return $string_form;
2318							}
2319
2320							sub _stringify_lol { # the real recursor
2321	3842			3037		5496	my($lol, $to) = @_;
2322	3842					8112	for(my $i = 2; $i < @$lol; ++$i) {
2323	4840	100	100			12574	if( ref($lol->[$i] \|\| '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
			66
2324	380					669	_stringify_lol( $lol->[$i], $to); # recurse!
2325							} else {
2326	3604					7916	$$to .= $lol->[$i];
2327							}
2328							}
2329	3733					5786	return;
2330							}
2331
2332							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2333
2334							sub _dump_curr_open { # return a string representation of the stack
2335	0			32		0	my $curr_open = $_[0]{'curr_open'};
2336
2337	0	50				0	return '[empty]' unless @$curr_open;
2338							return join '; ',
2339	0					0	map {;
2340							($_->[0] eq '=for')
2341							? ( ($_->[1]{'~really'} \|\| '=over')
2342	0	50	50			0	. ' ' . $_->[1]{'target'})
2343							: $_->[0]
2344							}
2345							@$curr_open
2346							;
2347							}
2348
2349							###########################################################################
2350							my %pretty_form = (
2351							"\a" => '\a', # ding!
2352							"\b" => '\b', # BS
2353							"\e" => '\e', # ESC
2354							"\f" => '\f', # FF
2355							"\t" => '\t', # tab
2356							"\cm" => '\cm',
2357							"\cj" => '\cj',
2358							"\n" => '\n', # probably overrides one of either \cm or \cj
2359							'"' => '\"',
2360							'\\' => '\\\\',
2361							'$' => '\\$',
2362							'@' => '\\@',
2363							'%' => '\\%',
2364							'#' => '\\#',
2365							);
2366
2367							sub pretty { # adopted from Class::Classless
2368							# Not the most brilliant routine, but passable.
2369							# Don't give it a cyclic data structure!
2370	502			1494	0	2044	my @stuff = @_; # copy
2371	502					620	my $x;
2372							my $out =
2373							# join ",\n" .
2374							join ", ",
2375	502					634	map {;
2376	592	50	100			3394	if(!defined($_)) {
		100	66
		50	33
		100
		100
		100
2377	0					0	"undef";
2378							} elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2379	58					101	$x = "[ " . pretty(@$_) . " ]" ;
2380	58					111	$x;
2381							} elsif(ref($_) eq 'SCALAR') {
2382	0					0	$x = "\\" . pretty($$_) ;
2383	0					0	$x;
2384							} elsif(ref($_) eq 'HASH') {
2385	795					2444	my $hr = $_;
2386							$x = "{" . join(", ",
2387	48					500	map(pretty($_) . '=>' . pretty($hr->{$_}),
2388							sort keys %$hr ) ) . "}" ;
2389	48					149	$x;
2390	2					3	} elsif(!length($_)) { q{''} # empty string
2391							} elsif(
2392							$_ eq '0' # very common case
2393							or(
2394							m/^-?(?:[123456789]\d*\|0)(?:\.\d+)?$/s
2395							and $_ ne '-0' # the strange case that RE lets thru
2396							)
2397	26					68	) { $_;
2398							} else {
2399							# Yes, explicitly name every character desired. There are shorcuts one
2400							# could make, but I (Karl Williamson) was afraid that some Perl
2401							# releases would have bugs in some of them. For example [A-Z] works
2402							# even on EBCDIC platforms to match exactly the 26 uppercase English
2403							# letters, but I don't know if it has always worked without bugs. It
2404							# seemed safest just to list the characters.
2405							# s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2406	458	0				883	s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{\|}~])>
	0					0
2407							<$pretty_form{$1} \|\| '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2408	458					1457	#<$pretty_form{$1} \|\| '\\x'.(unpack("H2",$1))>eg;
2409							qq{"$_"};
2410							}
2411							} @stuff;
2412	502					1903	# $out =~ s/\n */ /g if length($out) < 75;
2413							return $out;
2414							}
2415
2416							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2417
2418							# A rather unsubtle method of blowing away all the state information
2419							# from a parser object so it can be reused. Provided as a utility for
2420							# backward compatibility in Pod::Man, etc. but not recommended for
2421							# general use.
2422
2423	5			9	0	6926	sub reinit {
2424	5					19	my $self = shift;
2425							foreach (qw(source_dead source_filename doc_has_started
2426							start_of_pod_block content_seen last_was_blank paras curr_open
2427							line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2428							Title)) {
2429	83					123
2430							delete $self->{$_};
2431							}
2432							}
2433
2434							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2435							1;
2436